#### Advanced retrieval
In this file we:
* Test BM25
* Test Hybrid retrieval with BM25 and an embedding model
* Evaluate different rerankers

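Results are appended to *-embedding-evaluation.csv* (BM25 baseline), *-hybrid-evaluation.csv*, *-rerank-evaluation.csv* and *-embedding-evaluation-expert.csv* (expert dataset).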

In [4]:
from datasets import load_dataset, concatenate_datasets
import csv
import numpy as np

# Load both splits; the retrieval corpus is the union of train and test,
# while only the test split provides the evaluation queries.
test_dataset = load_dataset("json", data_files="../2_datasets/RAG_test_dataset.json", split="train")
train_dataset = load_dataset("json", data_files="../2_datasets/RAG_train_dataset.json", split="train")
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

# cid => document text
corpus = {
    cid: document
    for cid, document in zip(corpus_dataset["id"], corpus_dataset["positive"])
}

# qid => question text
queries = {
    qid: question
    for qid, question in zip(test_dataset["id"], test_dataset["anchor"])
}

# qid => [relevant cids]. Every query has exactly one relevant document,
# and that document shares the query's id.
relevant_docs = {q_id: [q_id] for q_id in queries}

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
def compute_metrics(predicted_ranks, relevant_docs, k):
    """Compute hit rate and recall at cutoff ``k``, averaged over all queries.

    Args:
        predicted_ranks: Mapping of query id to its ranked list of document
            ids (best first).
        relevant_docs: Mapping of query id to an iterable of relevant
            document ids. Duplicate ids are counted once.
        k: Number of top-ranked documents considered per query.

    Returns:
        A ``(hit_rate, recall)`` tuple. ``hit_rate`` is the fraction of
        queries with at least one relevant document in the top ``k``;
        ``recall`` is the mean fraction of a query's relevant documents found
        in the top ``k``. Both are ``0.0`` when ``relevant_docs`` is empty.
        A query with no relevant documents contributes ``0`` to both.

    Raises:
        KeyError: If a query with relevant documents is missing from
            ``predicted_ranks``.
    """
    if not relevant_docs:
        return 0.0, 0.0

    hit_count = 0
    recall_sum = 0.0

    for query_id, relevant_doc_ids in relevant_docs.items():
        # De-duplicate so a repeated id cannot deflate recall: the numerator
        # below is a set intersection, so the denominator must be a set too.
        relevant = set(relevant_doc_ids)
        if not relevant:
            continue

        found = relevant.intersection(predicted_ranks[query_id][:k])
        if found:
            hit_count += 1
        recall_sum += len(found) / len(relevant)

    query_count = len(relevant_docs)
    return hit_count / query_count, recall_sum / query_count

def compute_mrr(predicted_ranks, relevant_docs, k):
    """Compute the mean reciprocal rank (MRR) at cutoff ``k``.

    Args:
        predicted_ranks: Mapping of query id to its ranked list of document
            ids (best first).
        relevant_docs: Mapping of query id to an iterable of relevant
            document ids.
        k: Only the top ``k`` ranked documents are inspected per query.

    Returns:
        The mean over all queries of ``1 / rank`` of the first relevant
        document within the top ``k`` (``0`` for a query with none).
        Returns ``0.0`` when ``relevant_docs`` is empty, matching
        ``compute_ndcg`` instead of producing NaN.

    Raises:
        KeyError: If a query in ``relevant_docs`` is missing from
            ``predicted_ranks``.
    """
    if not relevant_docs:
        return 0.0

    reciprocal_ranks = []

    for query_id, relevant_doc_ids in relevant_docs.items():
        relevant = set(relevant_doc_ids)
        # Rank (1-based) of the first relevant document, or None if absent.
        first_rank = next(
            (
                rank
                for rank, doc_id in enumerate(predicted_ranks[query_id][:k], start=1)
                if doc_id in relevant
            ),
            None,
        )
        reciprocal_ranks.append(0 if first_rank is None else 1 / first_rank)

    return np.mean(reciprocal_ranks)

def writeToCSV_Embedding(model_name, results):
    """Append one evaluation row to ``-embedding-evaluation.csv``.

    A header row is written first when the file is new or empty, so the
    columns are labeled; an existing non-empty file only gets the data row.

    Args:
        model_name: Label stored in the ``model`` column.
        results: Mapping providing ``recall@1``, ``recall@3``, ``recall@5``
            and ``mrr@10`` (written to the ``mrr`` column).

    Raises:
        KeyError: If ``results`` lacks one of the required keys.
    """
    fields = ['model', 'recall@1', 'recall@3', 'recall@5', 'mrr']
    row = {
        'model': model_name,
        'recall@1': results['recall@1'],
        'recall@3': results['recall@3'],
        'recall@5': results['recall@5'],
        'mrr': results['mrr@10'],
    }

    with open('-embedding-evaluation.csv', 'a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fields)
        # Append mode starts at the end of the file, so position 0 means the
        # file has no content yet and still needs its header.
        if file.tell() == 0:
            writer.writeheader()
        writer.writerow(row)

In [4]:
from rank_bm25 import BM25Okapi

# BM25 baseline: lexical retrieval over whitespace-tokenized documents
tokenized_corpus = [doc.split() for doc in corpus.values()]
bm25 = BM25Okapi(tokenized_corpus)  # Initialize BM25

# Materialize the ids once. Position i matches tokenized_corpus[i] because
# both follow the dict's iteration order. Rebuilding this list for every
# ranked document made the ranking step quadratic in the corpus size.
corpus_ids = list(corpus.keys())

predicted_ranks = {}

for q_id, query in queries.items():
    tokenized_query = query.split()  # Tokenize the query
    doc_scores = bm25.get_scores(tokenized_query)  # One BM25 score per document
    ranked_doc_ids = [corpus_ids[i] for i in np.argsort(doc_scores)[::-1]]  # Best first
    predicted_ranks[q_id] = ranked_doc_ids

# Calculate metrics at various cutoffs
k_values = [1, 3, 5]
results = {}

for k in k_values:
    hit_rate, recall = compute_metrics(predicted_ranks, relevant_docs, k)
    results[f'recall@{k}'] = recall

# MRR does not depend on k_values, so compute it once outside the loop
results['mrr@10'] = compute_mrr(predicted_ranks, relevant_docs, k=10)

writeToCSV_Embedding("BM25", results)

In [7]:
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Document texts in corpus order; shared by the lexical and the dense index
corpus_texts = list(corpus.values())

# Lexical side: BM25 over whitespace-split documents
tokenized_corpus = [text.split() for text in corpus_texts]
bm25 = BM25Okapi(tokenized_corpus)

# Dense side: embed every document once with the local e5-small checkpoint
embedding_model = SentenceTransformer("./models/RAG-multilingual-e5-small")
corpus_embeddings = embedding_model.encode(corpus_texts, convert_to_tensor=True)

# Hybrid retrieval
def _min_max_scale(scores):
    """Rescale ``scores`` linearly to [0, 1]; constant input maps to zeros."""
    if scores.size == 0:
        return scores
    lowest = scores.min()
    span = scores.max() - lowest
    if span == 0:
        return np.zeros_like(scores)
    return (scores - lowest) / span


def hybrid_score(bm25_scores, embedding_scores, lambda_weight, normalize=True):
    """Blend lexical and dense scores into one score per document.

    BM25 scores are unbounded while cosine similarities are at most 1, so
    mixing the raw values lets BM25 dominate even for small weights. By
    default both score vectors are therefore min-max scaled to [0, 1] before
    blending, so ``lambda_weight`` acts as a real mixing ratio.

    Args:
        bm25_scores: Array-like of BM25 scores, one per document.
        embedding_scores: Array-like of embedding similarities, aligned with
            ``bm25_scores``.
        lambda_weight: Weight of the BM25 part; ``1 - lambda_weight`` goes to
            the embedding part.
        normalize: Set to ``False`` to blend the raw, unscaled scores.

    Returns:
        A float NumPy array holding
        ``lambda_weight * bm25 + (1 - lambda_weight) * embedding``.
    """
    bm25_scores = np.asarray(bm25_scores, dtype=float)
    embedding_scores = np.asarray(embedding_scores, dtype=float)

    if normalize:
        bm25_scores = _min_max_scale(bm25_scores)
        embedding_scores = _min_max_scale(embedding_scores)

    return lambda_weight * bm25_scores + (1 - lambda_weight) * embedding_scores

def writeToCSV_Hybrid(model_name, results):
    """Append one hybrid-retrieval evaluation row to ``-hybrid-evaluation.csv``.

    A header row is written first when the file is new or empty, so the
    columns are labeled; an existing non-empty file only gets the data row.

    Args:
        model_name: Label stored in the ``model`` column.
        results: Mapping providing ``lambda`` (written to ``lambda_weight``),
            ``recall@1``, ``recall@3``, ``recall@5`` and ``mrr@10`` (written
            to ``mrr``).

    Raises:
        KeyError: If ``results`` lacks one of the required keys.
    """
    fields = ['model', 'lambda_weight', 'recall@1', 'recall@3', 'recall@5', 'mrr']
    row = {
        'model': model_name,
        'lambda_weight': results['lambda'],
        'recall@1': results['recall@1'],
        'recall@3': results['recall@3'],
        'recall@5': results['recall@5'],
        'mrr': results['mrr@10'],
    }

    with open('-hybrid-evaluation.csv', 'a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fields)
        # Append mode starts at the end of the file, so position 0 means the
        # file has no content yet and still needs its header.
        if file.tell() == 0:
            writer.writeheader()
        writer.writerow(row)

# Sweep the BM25 weight from pure embedding (0.0) to pure BM25 (1.0)
lambda_values = [0.0, 0.1, 0.2, 0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9, 1.0]  # Test with different ratios of BM25 and embeddings

# Work that does not depend on lambda is done once up front:
# - the corpus id list (position i matches the BM25 / embedding row i)
# - the query embeddings, which were otherwise re-encoded for every lambda
corpus_ids = list(corpus.keys())
query_ids = list(queries.keys())
query_embeddings = embedding_model.encode(list(queries.values()), convert_to_tensor=True)

for lambda_weight in lambda_values:
    predicted_ranks = {}

    # Iterate over each query
    for query_idx, q_id in enumerate(query_ids):
        # BM25 scores
        tokenized_query = queries[q_id].split()
        bm25_scores = bm25.get_scores(tokenized_query)

        # Embedding-based scores. Tensor.numpy() only works on CPU tensors,
        # so move the similarities to the CPU first in case the model runs
        # on a GPU.
        embedding_scores = (
            cos_sim(query_embeddings[query_idx], corpus_embeddings).cpu().numpy().flatten()
        )

        # Combine BM25 and embedding scores using the hybrid score
        hybrid_scores = hybrid_score(bm25_scores, embedding_scores, lambda_weight)

        # Rank documents by the hybrid score, best first
        ranked_doc_ids = [corpus_ids[i] for i in np.argsort(hybrid_scores)[::-1]]
        predicted_ranks[q_id] = ranked_doc_ids

    # Calculate metrics
    k_values = [1, 3, 5]
    results = {}

    for k in k_values:
        hit_rate, recall = compute_metrics(predicted_ranks, relevant_docs, k)
        results[f'recall@{k}'] = recall

    results['mrr@10'] = compute_mrr(predicted_ranks, relevant_docs, k=10)
    results['lambda'] = lambda_weight

    writeToCSV_Hybrid("Hybrid-BM25-Embedding", results)

In [1]:
# Bi-encoder checkpoints: the public hub model and the local checkpoint
# stored under ./models (note the leading underscore on the local one).
embedding_model = "intfloat/multilingual-e5-small"
_embedding_model = "./models/RAG-multilingual-e5-small"

# Cross-encoder model ids used for reranking
reranker_model_1 = "BAAI/bge-reranker-v2-m3"
reranker_model_2 = "svalabs/cross-electra-ms-marco-german-uncased"

In [2]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util

def writeToCSV_Rerank(model_name, results):
    """Append one reranker evaluation row to ``-rerank-evaluation.csv``.

    A header row is written first when the file is new or empty, so the
    columns are labeled; an existing non-empty file only gets the data row.

    Args:
        model_name: Label stored in the ``model`` column.
        results: Mapping providing ``recall@1``, ``recall@3``, ``recall@5``
            and ``mrr@10`` (written to the ``mrr`` column).

    Raises:
        KeyError: If ``results`` lacks one of the required keys.
    """
    fields = ['model', 'recall@1', 'recall@3', 'recall@5', 'mrr']
    row = {
        'model': model_name,
        'recall@1': results['recall@1'],
        'recall@3': results['recall@3'],
        'recall@5': results['recall@5'],
        'mrr': results['mrr@10'],
    }

    with open('-rerank-evaluation.csv', 'a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fields)
        # Append mode starts at the end of the file, so position 0 means the
        # file has no content yet and still needs its header.
        if file.tell() == 0:
            writer.writeheader()
        writer.writerow(row)

def evaluate(emb_model, ranker_model, top_k=10):
    """Evaluate a retrieve-then-rerank pipeline and append the scores to CSV.

    The bi-encoder retrieves ``top_k`` candidates per query from the
    module-level ``corpus``; the cross-encoder then re-scores and re-orders
    those candidates. Recall@1/3/5 and MRR@10 are computed against the
    module-level ``relevant_docs`` and written via ``writeToCSV_Rerank``.

    Args:
        emb_model: Model name or path passed to ``SentenceTransformer``.
        ranker_model: Model name or path passed to ``CrossEncoder``; also
            used as the ``model`` label of the CSV row.
        top_k: Number of bi-encoder candidates handed to the cross-encoder
            for each query. The metrics can only see these candidates, so
            keep it at 10 or above for MRR@10 to be meaningful.
    """
    # Load the bi-encoder and cross-encoder models
    bi_encoder = SentenceTransformer(emb_model)
    cross_encoder = CrossEncoder(ranker_model)

    # Build the id list once; semantic_search reports positions into it
    corpus_ids = list(corpus.keys())

    # Encode corpus and queries using the bi-encoder
    corpus_embeddings = bi_encoder.encode(list(corpus.values()), convert_to_tensor=True)
    query_embeddings = bi_encoder.encode(list(queries.values()), convert_to_tensor=True)

    hits = util.semantic_search(query_embeddings, corpus_embeddings, top_k=top_k)

    # query id => reranked candidate document ids (best first)
    predicted_ranks = {}

    # hits[i] belongs to the i-th query in dict order, the same order used
    # to encode the queries above
    for query_idx, (query_id, query_text) in enumerate(queries.items()):
        doc_ids = [corpus_ids[hit['corpus_id']] for hit in hits[query_idx]]

        # Score every (query, candidate) pair with the cross-encoder
        pairs = [(query_text, corpus[doc_id]) for doc_id in doc_ids]
        scores = cross_encoder.predict(pairs)

        # Re-rank documents based on cross-encoder scores
        reranked_results = sorted(zip(doc_ids, scores), key=lambda x: x[1], reverse=True)
        predicted_ranks[query_id] = [doc_id for doc_id, _ in reranked_results]

    # Calculate metrics
    results = {}
    for k in [1, 3, 5]:
        _hit_rate, recall = compute_metrics(predicted_ranks, relevant_docs, k)
        results[f'recall@{k}'] = recall
    results['mrr@10'] = compute_mrr(predicted_ranks, relevant_docs, k=10)

    writeToCSV_Rerank(ranker_model, results)


  from tqdm.autonotebook import tqdm, trange





In [6]:
# Evaluate each reranker on top of the local bi-encoder checkpoint
for reranker_model in (reranker_model_1, reranker_model_2):
    evaluate(_embedding_model, reranker_model)

config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/381 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/276k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/528k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [2]:
import json
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers.util import cos_sim
import csv

def load_jsonl_with_ids(file_path):
    """Load a JSON Lines file into a ``{doc_id: page_content}`` dict.

    Each non-blank line must be a JSON object with a ``page_content`` key.
    The document id is the 1-based line number, so ids are unchanged for
    files without blank lines. Blank lines are skipped instead of crashing
    the JSON parser.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        Dict mapping line number to that line's ``page_content`` value.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object has no ``page_content`` key.
    """
    documents = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                continue
            documents[line_number] = json.loads(line)["page_content"]
    return documents

# Corpus of pre-processed chunks: chunk id => chunk text
file_path = '../1_preproc/_chunks.jsonl'
corpus = load_jsonl_with_ids(file_path)

# Load the expert-annotated question / context dataset
with open('../2_datasets/expert-dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Reverse index: chunk text => id of its first occurrence in the corpus.
# Built once so each context lookup is O(1) instead of rebuilding and
# scanning the key and value lists. setdefault keeps the first id for a
# repeated text, which is what list.index() returned before.
content_to_id = {}
for chunk_id, chunk_text in corpus.items():
    content_to_id.setdefault(chunk_text, chunk_id)

queries = {}  # qid => question text
relevant_docs = {}  # qid => list of relevant chunk ids (no duplicates)

for item in data:
    contexts = item['context']  # List of context objects for this question

    if not contexts:
        continue

    query_id = item["Id"]
    queries[query_id] = item['question']

    for context in contexts:
        content = context['page_content']  # The actual context text

        context_id = content_to_id.get(content)
        if context_id is None:
            # Report and skip: this context has no matching chunk id, and
            # continuing into the lookup used to raise ValueError here.
            print(f"Error: {content} not in corpus")
            continue

        # Map question to relevant chunk ids, storing each id only once
        query_relevant = relevant_docs.setdefault(query_id, [])
        if context_id not in query_relevant:
            query_relevant.append(context_id)

  from tqdm.autonotebook import tqdm, trange





In [3]:
from sklearn.metrics import ndcg_score

def compute_ndcg(predicted_ranks, relevant_docs, k):
    """Compute mean NDCG@k with binary relevance.

    For each query, DCG sums ``1 / log2(rank + 1)`` over the relevant
    documents in the top ``k``. The ideal DCG assumes all relevant documents
    (at most ``k``) are ranked first, so relevant documents that were not
    retrieved lower the score. This is computed directly rather than with
    ``sklearn.metrics.ndcg_score``, so NDCG@1 is a real value instead of
    always being 0 for single-item rankings.

    Args:
        predicted_ranks: Mapping of query id to its ranked list of document
            ids (best first).
        relevant_docs: Mapping of query id to an iterable of relevant
            document ids. Duplicate ids are counted once.
        k: Rank cutoff.

    Returns:
        Mean NDCG@k over all queries in ``relevant_docs``; ``0.0`` when
        there are no queries. A query with no relevant documents, or with
        ``k`` below 1, contributes ``0.0``.

    Raises:
        KeyError: If a query in ``relevant_docs`` is missing from
            ``predicted_ranks``.
    """
    if not relevant_docs:
        return 0.0  # Return 0 NDCG if no valid data

    # discounts[i] is the weight of rank i + 1
    discounts = 1.0 / np.log2(np.arange(2, max(k, 0) + 2))

    ndcg_values = []
    for query_id, relevant_doc_ids in relevant_docs.items():
        relevant = set(relevant_doc_ids)
        top_k_results = predicted_ranks[query_id][:k]

        ideal_hits = min(len(relevant), len(discounts))
        if ideal_hits == 0:
            ndcg_values.append(0.0)
            continue

        gains = np.array([1.0 if doc in relevant else 0.0 for doc in top_k_results])
        dcg = float(np.sum(gains * discounts[:len(gains)]))
        ideal_dcg = float(np.sum(discounts[:ideal_hits]))
        ndcg_values.append(dcg / ideal_dcg)

    return np.mean(ndcg_values)

def writeToCSV_expert(model_name, results):
    """Append one expert-dataset row to ``-embedding-evaluation-expert.csv``.

    Columns are ``model`` followed by ``hit-rate``, ``recall``, ``mrr`` and
    ``ndcg``, each at cutoffs 1, 3, 5, 7 and 10. A header row is written
    first when the file is new or empty.

    Args:
        model_name: Label stored in the ``model`` column.
        results: Mapping with one ``<metric>@<k>`` entry per column.
            ``ndcg@1`` is taken from ``results`` when present and falls back
            to ``0.0`` otherwise; it was previously always written as 0.0.

    Raises:
        KeyError: If ``results`` lacks any column other than ``ndcg@1``.
    """
    cutoffs = [1, 3, 5, 7, 10]
    metric_names = ['hit-rate', 'recall', 'mrr', 'ndcg']
    metric_fields = [f"{metric}@{k}" for metric in metric_names for k in cutoffs]
    fields = ['model'] + metric_fields

    row = {'model': model_name}
    for field in metric_fields:
        if field == 'ndcg@1':
            row[field] = results.get(field, 0.0)
        else:
            row[field] = results[field]

    with open('-embedding-evaluation-expert.csv', 'a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fields)
        # Append mode starts at the end of the file, so position 0 means the
        # file has no content yet and still needs its header.
        if file.tell() == 0:
            writer.writeheader()
        writer.writerow(row)

In [60]:
from rank_bm25 import BM25Okapi

# BM25 on the expert dataset: lexical retrieval over whitespace-tokenized chunks
tokenized_corpus = [doc.split() for doc in corpus.values()]
bm25 = BM25Okapi(tokenized_corpus)  # Initialize BM25

# Materialize the ids once. Position i matches tokenized_corpus[i] because
# both follow the dict's iteration order. Rebuilding this list for every
# ranked document made the ranking step quadratic in the corpus size.
corpus_ids = list(corpus.keys())

predicted_ranks = {}

for q_id, query in queries.items():
    tokenized_query = query.split()  # Tokenize the query
    doc_scores = bm25.get_scores(tokenized_query)  # One BM25 score per chunk
    ranked_doc_ids = [corpus_ids[i] for i in np.argsort(doc_scores)[::-1]]  # Best first
    predicted_ranks[q_id] = ranked_doc_ids

# Calculate metrics at various cutoffs
k_values = [1, 3, 5, 7, 10]
results = {}

for k in k_values:
    hit_rate, recall = compute_metrics(predicted_ranks, relevant_docs, k)
    results[f'hit-rate@{k}'] = hit_rate
    results[f'recall@{k}'] = recall
    results[f'ndcg@{k}'] = compute_ndcg(predicted_ranks, relevant_docs, k=k)
    results[f'mrr@{k}'] = compute_mrr(predicted_ranks, relevant_docs, k=k)

writeToCSV_expert("BM25", results)

In [13]:
from sentence_transformers import SentenceTransformer
import numpy as np

# `util` is otherwise only imported in other cells; import it here so this
# cell also runs on its own.
from sentence_transformers import util

# Load the bi-encoder named by `embedding_model` (no cross-encoder is used
# in this cell)
bi_encoder = SentenceTransformer(embedding_model)

# Build the id list once; semantic_search reports positions into it
corpus_ids = list(corpus.keys())

# Encode corpus and queries using the bi-encoder
corpus_embeddings = bi_encoder.encode(list(corpus.values()), convert_to_tensor=True)
query_embeddings = bi_encoder.encode(list(queries.values()), convert_to_tensor=True)

top_k = 10  # Number of top-k documents to retrieve
hits = util.semantic_search(query_embeddings, corpus_embeddings, top_k=top_k)

# query id => retrieved document ids (best first)
predicted_ranks = {}

# hits[i] belongs to the i-th query in dict order, the same order used to
# encode the queries above
for query_idx, query_id in enumerate(queries.keys()):
    top_hits = hits[query_idx]
    doc_ids = [corpus_ids[hit['corpus_id']] for hit in top_hits]

    predicted_ranks[query_id] = doc_ids


# Calculate metrics
k_values = [1, 3, 5, 7, 10]
results = {}

for k in k_values:
    hit_rate, recall = compute_metrics(predicted_ranks, relevant_docs, k)
    results[f'hit-rate@{k}'] = hit_rate
    results[f'recall@{k}'] = recall
    results[f'ndcg@{k}'] = compute_ndcg(predicted_ranks, relevant_docs, k=k)
    results[f'mrr@{k}'] = compute_mrr(predicted_ranks, relevant_docs, k=k)

writeToCSV_expert("Base-embedding", results)

In [11]:
from sentence_transformers import SentenceTransformer
import numpy as np

# `util` is otherwise only imported in other cells; import it here so this
# cell also runs on its own.
from sentence_transformers import util

# Load the bi-encoder named by `_embedding_model` (no cross-encoder is used
# in this cell)
bi_encoder = SentenceTransformer(_embedding_model)

# Build the id list once; semantic_search reports positions into it
corpus_ids = list(corpus.keys())

# Encode corpus and queries using the bi-encoder
corpus_embeddings = bi_encoder.encode(list(corpus.values()), convert_to_tensor=True)
query_embeddings = bi_encoder.encode(list(queries.values()), convert_to_tensor=True)

top_k = 10  # Number of top-k documents to retrieve
hits = util.semantic_search(query_embeddings, corpus_embeddings, top_k=top_k)

# query id => retrieved document ids (best first)
predicted_ranks = {}

# hits[i] belongs to the i-th query in dict order, the same order used to
# encode the queries above
for query_idx, query_id in enumerate(queries.keys()):
    top_hits = hits[query_idx]
    doc_ids = [corpus_ids[hit['corpus_id']] for hit in top_hits]

    predicted_ranks[query_id] = doc_ids


# Calculate metrics
k_values = [1, 3, 5, 7, 10]
results = {}

for k in k_values:
    hit_rate, recall = compute_metrics(predicted_ranks, relevant_docs, k)
    results[f'hit-rate@{k}'] = hit_rate
    results[f'recall@{k}'] = recall
    results[f'ndcg@{k}'] = compute_ndcg(predicted_ranks, relevant_docs, k=k)
    results[f'mrr@{k}'] = compute_mrr(predicted_ranks, relevant_docs, k=k)

writeToCSV_expert("Fine-tune", results)

In [14]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util

# Load the bi-encoder and cross-encoder models
bi_encoder = SentenceTransformer(_embedding_model)
cross_encoder = CrossEncoder(reranker_model_2)

# Build the id list once; semantic_search reports positions into it
corpus_ids = list(corpus.keys())

# Encode corpus and queries using the bi-encoder
corpus_embeddings = bi_encoder.encode(list(corpus.values()), convert_to_tensor=True)
query_embeddings = bi_encoder.encode(list(queries.values()), convert_to_tensor=True)

top_k = 10  # Number of candidates handed to the cross-encoder per query
hits = util.semantic_search(query_embeddings, corpus_embeddings, top_k=top_k)

# query id => reranked candidate document ids (best first)
predicted_ranks = {}

# Re-rank using cross-encoder. hits[i] belongs to the i-th query in dict
# order, the same order used to encode the queries above.
for query_idx, query_id in enumerate(queries.keys()):
    query_text = queries[query_id]

    # Retrieve top-k hits using bi-encoder
    top_hits = hits[query_idx]
    doc_ids = [corpus_ids[hit['corpus_id']] for hit in top_hits]

    # Create query-document pairs for cross-encoder scoring
    pairs = [(query_text, corpus[doc_id]) for doc_id in doc_ids]
    scores = cross_encoder.predict(pairs)

    # Re-rank documents based on cross-encoder scores
    reranked_results = sorted(zip(doc_ids, scores), key=lambda x: x[1], reverse=True)
    reranked_doc_ids = [doc_id for doc_id, _ in reranked_results]

    predicted_ranks[query_id] = reranked_doc_ids


# Calculate metrics
k_values = [1, 3, 5, 7, 10]
results = {}

for k in k_values:
    hit_rate, recall = compute_metrics(predicted_ranks, relevant_docs, k)
    results[f'hit-rate@{k}'] = hit_rate
    results[f'recall@{k}'] = recall
    results[f'ndcg@{k}'] = compute_ndcg(predicted_ranks, relevant_docs, k=k)
    results[f'mrr@{k}'] = compute_mrr(predicted_ranks, relevant_docs, k=k)

writeToCSV_expert("Rerank", results)