In [1]:
!pip install ir_datasets
!pip install rank_bm25
!pip install sentence_transformers
!pip install pytrec_eval



In [2]:
from tqdm import tqdm
import json
import ir_datasets
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
from transformers import pipeline
import torch
from huggingface_hub import login
import pytrec_eval

api_key = "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"
login(token=api_key)

# Check GPU availability
def get_device():
    if torch.cuda.is_available():
        device = "cuda"
        gpu_properties = torch.cuda.get_device_properties(torch.cuda.current_device())
        print(f"Using GPU: {gpu_properties.name}")
        print(f"CUDA Cores: {gpu_properties.multi_processor_count}")
        print(f"Total Memory: {gpu_properties.total_memory / 1e9:.2f} GB")
        print(f"Compute Capability: {gpu_properties.major}.{gpu_properties.minor}")
    else:
        device = "cpu"
        print("Using CPU")
    return device

device = get_device()


Using GPU: Tesla T4
CUDA Cores: 40
Total Memory: 15.84 GB
Compute Capability: 7.5


# Section 1: Dataset loading and preparation

In [3]:
# Load dataset
print("Loading the trec covid dataset...")
dataset = ir_datasets.load("cord19/trec-covid")

# Prepare documents and queries
print("Preparing documents and queries...")

all_docs = [{"doc_id": doc.doc_id, "abstract": doc.abstract} for doc in dataset.docs_iter()]
all_queries = [{"query_id": query.query_id, "title": query.title} for query in dataset.queries_iter()]

# Print dataset size information
print(f"Summary: {len(all_docs)} documents and {len(all_queries)} queries are available in the dataset.")

tokenized_docs = [doc['abstract'].split() for doc in all_docs]
qrels = dataset.qrels

[INFO] [starting] building docstore


Loading the trec covid dataset...
Preparing documents and queries...


[INFO] If you have a local copy of https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/80d664e496b8b7e50a39c6f6bb92e0ef
[INFO] [starting] https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv
docs_iter:   0%|                                    | 0/192509 [00:00<?, ?doc/s]
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 0.0%| 0.00/269M [00:00<?, ?B/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 0.1%| 180k/269M [00:00<03:11, 1.41MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 0.5%| 1.22M/269M [00:00<00:51, 5.22MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 3.0%| 8.16M/269M [00:00<00:10, 24.0MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazona

Summary: 192509 documents and 50 queries are available in the dataset.


In [4]:
# convert qrels to a dictionary
qrels_dict = {}
for qrel in qrels:
    if qrel.query_id not in qrels_dict:
        qrels_dict[qrel.query_id] = {}
    qrels_dict[qrel.query_id][qrel.doc_id] = qrel.relevance

[INFO] [starting] https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt
[INFO] [finished] https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt: [00:00] [1.14MB] [3.66MB/s]


# Section 2: Embeddings generation

In [5]:
# Load or generate embeddings
def generate_embeddings():
    if os.path.exists("trec_covid_doc_embeddings.csv") and os.path.exists("trec_covid_query_embeddings.csv"):
        print("Loading precomputed embeddings...")
        doc_embeddings = pd.read_csv("trec_covid_doc_embeddings.csv").values
        query_embeddings = pd.read_csv("trec_covid_query_embeddings.csv").values
    else:
        print("No precomputed embeddings found.")
        print("Generating new embeddings using SentenceTransformer model 'sentence-transformers/all-mpnet-base-v2'.")
        model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)
        doc_embeddings = model.encode(all_docs, batch_size=32, show_progress_bar=True, normalize_embeddings=True)
        query_embeddings = model.encode(all_queries, batch_size=32, show_progress_bar=True, normalize_embeddings=True)

        # Save embeddings for future use
        pd.DataFrame(doc_embeddings).to_csv("trec_covid_doc_embeddings.csv", index=False)
        pd.DataFrame(query_embeddings).to_csv("trec_covid_query_embeddings.csv", index=False)

    return doc_embeddings, query_embeddings

doc_embeddings, query_embeddings = generate_embeddings()

No precomputed embeddings found.
Generating new embeddings using SentenceTransformer model 'sentence-transformers/all-mpnet-base-v2'.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/6016 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

# Section 3: Retrieval implementation

### Evaluation metrics
The following functions are used to evaluate the quality of document retrieval methods based on the ranked list of documents returned for a given query.

In [6]:
# Function to prepare run data for pytrec_eval
def prepare_run_data(results):
    """
    Prepares the run data in the format expected by pytrec_eval.
    Converts numpy scores to native Python float for compatibility.
    """
    run = {}
    for query_results in results:
        query_id = query_results['query']['query_id']
        run[query_id] = {}
        for doc_id, score in zip(query_results['results'], query_results['scores']):
            run[query_id][doc_id] = float(score)  # Convert numpy type to float
    return run

### Document Retrieval Methods

1. **BM25 Sparse Retrieval**:
   - The **BM25 algorithm** is used to perform sparse retrieval on tokenized documents by calculating a relevance score for each document based on the query. It then returns the indices and relevance scores of the top-k most relevant documents.

2. **Dense Retrieval**:
   - **Dense retrieval** is performed by calculating the cosine similarity between the query embedding and the document embeddings. The top-k documents with the highest similarity scores are returned.

3. **Rank Fusion Retrieval**:
   - Results from both **BM25** and **dense retrieval** are combined using a **rank fusion** technique. Scores from both methods are normalized, weighted by a parameter `alpha`, and the top-k documents are returned based on the combined scores.

4. **Cascading Retrieval**:
   - Initially, a set of documents is retrieved using **BM25**. These documents are then re-ranked using dense retrieval, with a similarity threshold applied to filter documents. The top-k documents are returned based on the final ranking.

In [7]:
# BM25 Sparse Retrieval
def bm25_retrieve(query, bm25, top_k=5):
    """
    Perform sparse retrieval using BM25 on the tokenized documents.
    Returns the indices and scores of the top-k documents.
    """
    tokenized_query = query.split()                                             # Tokenize the query into words
    scores = bm25.get_scores(tokenized_query)                                   # Get BM25 scores for all documents
    top_k_indices = np.argsort(scores)[-top_k:][::-1]                           # Get indices of top-k documents based on BM25 score
    return top_k_indices, scores[top_k_indices]

# Dense Retrieval
def dense_retrieve(query_embedding, doc_embeddings, top_k=5):
    """
    Perform dense retrieval using cosine similarity between query and document embeddings.
    Returns the indices and similarities of the top-k documents.
    """
    similarities = cosine_similarity([query_embedding], doc_embeddings)[0]      # Compute cosine similarity
    top_k_indices = np.argsort(similarities)[-top_k:][::-1]                     # Get top-k indices based on similarity
    return top_k_indices, similarities[top_k_indices]

# Rank Fusion Retrieval
def fusion_retrieve(dense_query_embedding, doc_embeddings, query, top_k=5, alpha=0.25):
    """
    Implementa il rank fusion riutilizzando le funzioni esistenti di retrieval.
    """

    # Perform BM25 retrieval and dense retrieval
    sparse_indices, sparse_scores = bm25_retrieve(query, bm25, top_k=len(doc_embeddings))
    dense_indices, dense_scores = dense_retrieve(dense_query_embedding, doc_embeddings, top_k=len(doc_embeddings))

    # Initialize score arrays
    all_sparse_scores = np.zeros(len(doc_embeddings))
    all_dense_scores = np.zeros(len(doc_embeddings))

    # Fill score arrays with BM25 and dense scores
    all_sparse_scores[sparse_indices] = sparse_scores
    all_dense_scores[dense_indices] = dense_scores

    if np.min(all_sparse_scores) == np.max(all_sparse_scores):
        print("All sparse scores are the same.")

    if np.min(all_dense_scores) == np.max(all_dense_scores):
        print("All dense scores are the same.")

    # Normalize scores
    all_sparse_scores = (all_sparse_scores - all_sparse_scores.min()) / (all_sparse_scores.max() - all_sparse_scores.min())
    all_dense_scores = (all_dense_scores - all_dense_scores.min()) / (all_dense_scores.max() - all_dense_scores.min())

    print("BM25 Scores:", all_sparse_scores)
    print("Dense Scores:", all_dense_scores)

    # Combine scores using the alpha parameter
    combined_scores = alpha * all_dense_scores + (1 - alpha) * all_sparse_scores

    # Retrieve the top-k results based on combined scores
    top_k_indices = np.argsort(combined_scores)[-top_k:][::-1]
    return top_k_indices, combined_scores[top_k_indices]

# Cascading Retrieval
def cascade_retrieve(dense_query_embedding, doc_embeddings, query, initial_k=100, final_k=5, dense_threshold=0.7):
    """
    Perform cascading retrieval: sparse retrieval followed by dense re-ranking.
    Filters documents based on a similarity threshold and returns the top-k results.
    """
    # Stage 1: BM25 to get initial candidates
    initial_indices, _ = bm25_retrieve(query, bm25, top_k=initial_k)

    # Stage 2: Dense re-ranking of candidate documents
    candidate_embeddings = doc_embeddings[initial_indices]
    _, dense_scores = dense_retrieve(dense_query_embedding, candidate_embeddings, top_k=len(initial_indices))

    # Filter candidates by similarity threshold
    qualified_mask = dense_scores >= dense_threshold
    if np.sum(qualified_mask) >= final_k:
        # Select top-k qualified candidates
        qualified_indices = np.where(qualified_mask)[0]
        top_indices = qualified_indices[np.argsort(dense_scores[qualified_indices])[-final_k:][::-1]]
    else:
        # If there are not enough qualified candidates, select top-k by overall scores
        top_indices = np.argsort(dense_scores)[-final_k:][::-1]

    # Map filtered indices to original document IDs
    final_indices = initial_indices[top_indices]
    final_scores = dense_scores[top_indices]

    return final_indices, final_scores



In [8]:
# Initialize BM25 model
print("Initializing BM25 model.")
bm25 = BM25Okapi(tokenized_docs)

Initializing BM25 model.


This section of code performs several retrieval experiments using the four different Document Retrieval Methods described earlier.

In [None]:
# Run retrieval experiments
def run_retrieval_experiments():
    """
    Execute sparse, dense, rank fusion, and cascading retrieval for all queries.
    Save the results to a JSON file for further analysis.
    """
    results = {"sparse": [], "dense": [], "rank_fusion": [], "cascade": []}

    print("Running retrieval experiments on all queries.")

    # Iterate over each query and its embedding
    for query, query_embedding in tqdm(zip(all_queries, query_embeddings), total=len(all_queries)):
        # Extract the query ID and text for the current query
        query_id = query['query_id']
        query_text = query['title'] if isinstance(query, dict) else query

        print("Query n.1. ID:", query_id, "Title: ", query_text)

        # Sparse Retrieval using BM25
        sparse_indices, sparse_scores = bm25_retrieve(query_text, bm25)                 # Retrieve the top-k BM25 documents and their scores
        sparse_docs = [all_docs[idx]['doc_id'] for idx in sparse_indices]               # Get document IDs from the indices
        results["sparse"].append({"query": query, "results": sparse_docs, "scores": sparse_scores}) # Store the BM25 results for the current query

        print("Sparse Retrieval Results:", sparse_docs)
        print("Sparse Retrieval Scores:", sparse_scores)

        # Dense Retrieval using cosine similarity
        dense_indices, dense_scores = dense_retrieve(query_embedding, doc_embeddings)   # Retrieve the top-k documents based on cosine similarity of embeddings
        dense_docs = [all_docs[idx]['doc_id'] for idx in dense_indices]
        results["dense"].append({"query": query, "results": dense_docs, "scores": dense_scores})

        print("Dense Retrieval Results:", dense_docs)
        print("Dense Retrieval Scores:", dense_scores)

        # Rank Fusion Retrieval by combining sparse (BM25) and dense result
        fusion_indices, fusion_scores = fusion_retrieve(                                # Combine BM25 and cosine similarity results
            query_embedding, doc_embeddings, query_text
        )
        fusion_docs = [all_docs[idx]['doc_id'] for idx in fusion_indices]
        results["rank_fusion"].append({"query": query, "results": fusion_docs, "scores": fusion_scores})

        # Cascade Retrieval: First use BM25, then re-rank using dense retrieval
        cascade_indices, cascade_scores = cascade_retrieve(                             # Perform cascading retrieval
            query_embedding, doc_embeddings, query_text
        )
        cascade_docs = [all_docs[idx]['doc_id'] for idx in cascade_indices]
        results["cascade"].append({"query": query, "results": cascade_docs, "scores": cascade_scores})

    return results

results = run_retrieval_experiments()


Running retrieval experiments on all queries.


  0%|          | 0/50 [00:00<?, ?it/s]

Query n.1. ID: 1 Title:  coronavirus origin
Sparse Retrieval Results: ['8ccl9aui', 'u65mey2z', 'hewbl5yu', 'dv9m19yk', 'es7q6c90']
Sparse Retrieval Scores: [10.11986678 10.09531025 10.05892626  9.75917639  9.63448018]
Dense Retrieval Results: ['1dilcbl1', '1ycinsg1', '1tpiki91', '1enteev7', '1wrsuoy7']
Dense Retrieval Scores: [0.60095847 0.5944924  0.5645776  0.55734575 0.5525621 ]


  2%|▏         | 1/50 [00:02<02:17,  2.81s/it]

BM25 Scores: [0. 0. 0. ... 0. 0. 0.]
Dense Scores: [0.47634723 0.44860223 0.47473124 ... 0.47595882 0.35878082 0.50278847]
Query n.1. ID: 2 Title:  coronavirus response to weather changes
Sparse Retrieval Results: ['526elsrf', '9svrz0vj', 'aiwxlxzt', '0mikqjpj', 'kftchnhz']
Sparse Retrieval Scores: [19.2632898  19.22108577 19.22108577 18.36905419 18.08809198]
Dense Retrieval Results: ['2oudure1', '2woitk33', '2yozikdd', '2yeulry4', '2oinvver']
Dense Retrieval Scores: [0.56454515 0.5598252  0.5515532  0.54386413 0.54240143]
BM25 Scores: [0.19907016 0.27906757 0.46560455 ... 0.27452545 0.         0.        ]
Dense Scores: [0.38507093 0.53639674 0.53346221 ... 0.44442074 0.32828043 0.64916499]


  4%|▍         | 2/50 [00:05<02:00,  2.52s/it]

Query n.1. ID: 3 Title:  coronavirus immunity
Sparse Retrieval Results: ['t9u7d029', '3i466i1y', '73oe5as9', '3ol5ozz6', 'wh9vvgv2']
Sparse Retrieval Scores: [10.45374692 10.06667735  9.91835967  9.89903255  9.44797959]
Dense Retrieval Results: ['3onttonl', '3afiiitd', '3yesshxm', '3yyhmprk', '3proetih']
Dense Retrieval Scores: [0.5992446  0.58512497 0.5786346  0.5620555  0.5584568 ]


  6%|▌         | 3/50 [00:06<01:39,  2.11s/it]

BM25 Scores: [0. 0. 0. ... 0. 0. 0.]
Dense Scores: [0.34336684 0.45704275 0.38863602 ... 0.48172579 0.27235244 0.43614415]
Query n.1. ID: 4 Title:  how do people die from the coronavirus
Sparse Retrieval Results: ['n95j94ck', 'c0pzjq4a', '3pklqjbx', 'ktl1x03p', 'mpvkalu4']
Sparse Retrieval Scores: [21.00495924 21.00495924 21.00495924 20.33982549 19.92623384]
Dense Retrieval Results: ['4wwo9tho', '4elorryp', '4deurtps', '2yeulry4', '4fioyrgh']
Dense Retrieval Scores: [0.56137174 0.54275244 0.54269516 0.5407479  0.52900386]
BM25 Scores: [0.28850449 0.29313731 0.30355803 ... 0.33300053 0.         0.        ]
Dense Scores: [0.39585886 0.55576135 0.42648542 ... 0.4795151  0.34949636 0.47298438]


  8%|▊         | 4/50 [00:09<01:47,  2.33s/it]

Query n.1. ID: 5 Title:  animal models of COVID-19
Sparse Retrieval Results: ['oa8vzf02', '3sepefqa', 'vobslprh', '5vu27b0p', 'y1b1vf1b']
Sparse Retrieval Scores: [20.44605516 20.44605516 20.17362148 19.94815441 19.55986288]
Dense Retrieval Results: ['5deitsem', '5nnlyavp', '5tauphen', '5doeanau', '7yeszyew']
Dense Retrieval Scores: [0.5541383  0.5482321  0.54706    0.54130363 0.5406431 ]
BM25 Scores: [0.25299528 0.30687034 0.29731714 ... 0.2947856  0.         0.        ]
Dense Scores: [0.37899098 0.41173729 0.38222371 ... 0.39516285 0.39416062 0.40577413]


 10%|█         | 5/50 [00:11<01:40,  2.23s/it]

Query n.1. ID: 6 Title:  coronavirus test rapid testing
Sparse Retrieval Results: ['sw23wf4b', 'n5du3bqt', '1dr4r3n4', 'zij2wbzs', 'giy00lt5']
Sparse Retrieval Scores: [17.27772247 15.82293092 15.63405586 14.92818926 14.79807079]
Dense Retrieval Results: ['7yeszyew', '6coookt6', '6ssdamhp', '6hadssmh', '6ogoktah']
Dense Retrieval Scores: [0.6549027  0.5944474  0.5693915  0.55968815 0.5581009 ]
BM25 Scores: [0. 0. 0. ... 0. 0. 0.]
Dense Scores: [0.40466084 0.43359292 0.38283956 ... 0.39102853 0.34351398 0.39152105]


 12%|█▏        | 6/50 [00:13<01:38,  2.25s/it]

Query n.1. ID: 7 Title:  serological tests for coronavirus
Sparse Retrieval Results: ['upwn9o2m', 'px4fe7mn', '1dbeh8q7', 'r1yf75bo', 'qjma4rsp']
Sparse Retrieval Scores: [22.12290899 21.17947728 21.17947728 19.93635115 19.92442083]
Dense Retrieval Results: ['7yeszyew', '7juumedy', '7eyttfre', '7eyoybsl', '7vizzvz7']
Dense Retrieval Scores: [0.7336451  0.5786698  0.57697725 0.5648861  0.5598773 ]
BM25 Scores: [0.09097561 0.         0.         ... 0.         0.         0.        ]
Dense Scores: [0.39003338 0.3521906  0.32255855 ... 0.34468472 0.27940001 0.33124195]


 14%|█▍        | 7/50 [00:16<01:44,  2.44s/it]

Query n.1. ID: 8 Title:  coronavirus under reporting
Sparse Retrieval Results: ['02iicrsa', 'c5l57vvu', 'c5l57vvu', 'qrtdjtxj', 'e5q27vpw']
Sparse Retrieval Scores: [10.74364541 10.22450603 10.22450603 10.21176617  9.82375569]
Dense Retrieval Results: ['7yeszyew', '8eeophww', '8sleups8', '8eeepgoh', '8ijbesl8']
Dense Retrieval Scores: [0.64830494 0.6354172  0.6302739  0.6301479  0.6300478 ]
BM25 Scores: [0. 0. 0. ... 0. 0. 0.]
Dense Scores: [0.40021389 0.4262297  0.39541374 ... 0.29982825 0.32362381 0.4283834 ]


 16%|█▌        | 8/50 [00:19<01:44,  2.50s/it]

Query n.1. ID: 9 Title:  coronavirus in Canada
Sparse Retrieval Results: ['jkzxjk54', '4v3d86h3', 'w6scpc65', 'qkr6jydj', 'mh23h29f']
Sparse Retrieval Scores: [19.42752432 17.53008377 16.80564654 16.34366365 15.69130755]
Dense Retrieval Results: ['9iuesnxo', '9eaahowb', '9eeyegvt', '9fuiind8', '9rugpout']
Dense Retrieval Scores: [0.6400808 0.6339346 0.6292525 0.6239834 0.622988 ]
BM25 Scores: [0.22257518 0.20511057 0.23511453 ... 0.21860294 0.         0.        ]
Dense Scores: [0.43482066 0.39470362 0.3773975  ... 0.36433443 0.33884002 0.43319337]


 18%|█▊        | 9/50 [00:21<01:44,  2.56s/it]

Query n.1. ID: 10 Title:  coronavirus social distancing impact
Sparse Retrieval Results: ['q5xc4m3j', 'o9ii9fj3', 'po2c65nb', 'km4qijqj', 'le2eifv8']
Sparse Retrieval Scores: [20.00936328 19.18621723 19.18621723 19.18621723 18.39850069]
Dense Retrieval Results: ['10ucu0lt', '10vhnirm', '10nlfooh', '10xmdxzl', '10eour3m']
Dense Retrieval Scores: [0.5511129  0.5432372  0.54154325 0.5407272  0.54060984]
BM25 Scores: [0. 0. 0. ... 0. 0. 0.]
Dense Scores: [0.35172386 0.43433491 0.42045498 ... 0.36203019 0.48802163 0.40668878]


 20%|██        | 10/50 [00:24<01:39,  2.50s/it]

Query n.1. ID: 11 Title:  coronavirus hospital rationing
Sparse Retrieval Results: ['szhkvizb', 'od8k0utb', '8dico3zc', 'sp7brt83', 't55od92g']
Sparse Retrieval Scores: [14.11932886 13.11170562 13.11170562 12.58403697 12.26830956]
Dense Retrieval Results: ['11abmmyl', '11ogothr', '11halnby', '11fmszeu', '11divoy4']
Dense Retrieval Scores: [0.6452064  0.62472564 0.59251565 0.5904081  0.5776899 ]
BM25 Scores: [0. 0. 0. ... 0. 0. 0.]
Dense Scores: [0.32824273 0.43465233 0.39317322 ... 0.3636019  0.38034138 0.35786534]


 22%|██▏       | 11/50 [00:26<01:28,  2.28s/it]

Query n.1. ID: 12 Title:  coronavirus quarantine
Sparse Retrieval Results: ['nyan7jnt', 'bpu3hpbn', 's0zdqd6d', '9hrrkqgi', 'kjnnh00e']
Sparse Retrieval Scores: [11.67022013 11.67022013 11.11431961 11.10539155 11.06989104]
Dense Retrieval Results: ['12yzvfcu', '12sakknb', '12godkoq', '12eajtcw', '12hoppys']
Dense Retrieval Scores: [0.5986586  0.5957926  0.5915795  0.58525676 0.5722784 ]


 24%|██▍       | 12/50 [00:27<01:18,  2.08s/it]

BM25 Scores: [0. 0. 0. ... 0. 0. 0.]
Dense Scores: [0.31566153 0.48391119 0.38717989 ... 0.41485617 0.34648201 0.45035393]
Query n.1. ID: 13 Title:  how does coronavirus spread
Sparse Retrieval Results: ['8wpccy2y', '4hph547d', '91j5ozws', '4067srwc', '8s8ma1q1']
Sparse Retrieval Scores: [12.28832665 11.68456566 11.68456566 11.68456566 11.68456566]
Dense Retrieval Results: ['13dsznnt', '9yyoryfa', '13ppjswf', '13cotyfb', '13llqigd']
Dense Retrieval Scores: [0.57191586 0.5699467  0.5630795  0.5547023  0.55105144]
BM25 Scores: [0. 0. 0. ... 0. 0. 0.]
Dense Scores: [0.37081738 0.41461727 0.36445338 ... 0.48017567 0.30269768 0.41987212]


 26%|██▌       | 13/50 [00:29<01:16,  2.06s/it]

Query n.1. ID: 14 Title:  coronavirus super spreaders
Sparse Retrieval Results: ['na3vrf5q', '93l22ign', 'p48bw6s4', 'c48gl27o', 'axns3ukm']
Sparse Retrieval Scores: [28.50607337 28.50607337 28.50607337 24.04516868 17.82424783]
Dense Retrieval Results: ['14obiyyd', '14baohhm', '14otwglm', '9yor16tf', '9yyoryfa']
Dense Retrieval Scores: [0.60960984 0.5830667  0.55143404 0.5439932  0.5381864 ]
BM25 Scores: [0. 0. 0. ... 0. 0. 0.]
Dense Scores: [0.37309861 0.34007422 0.35152266 ... 0.43982164 0.30126289 0.38873971]


 28%|██▊       | 14/50 [00:31<01:11,  1.99s/it]

Query n.1. ID: 15 Title:  coronavirus outside body
Sparse Retrieval Results: ['2w3bx6p8', 'bbg11u3w', '959w9sln', '5dzyx6pw', 'j2l61p76']
Sparse Retrieval Scores: [11.10572087 10.83028752 10.76519478 10.49909064 10.49909064]
Dense Retrieval Results: ['15pcopre', '15utzbyj', '15zfvsur', '15vvvxsa', '15rounhl']
Dense Retrieval Scores: [0.5812427  0.54851806 0.5354036  0.5352009  0.53013897]
BM25 Scores: [0. 0. 0. ... 0. 0. 0.]
Dense Scores: [0.38465041 0.2727829  0.38111765 ... 0.35166591 0.34869569 0.35222306]


 30%|███       | 15/50 [00:33<01:08,  1.95s/it]

Query n.1. ID: 16 Title:  how long does coronavirus survive on surfaces
Sparse Retrieval Results: ['pdmfxssd', 'ou7w3zkv', 'ou7w3zkv', '959w9sln', 'tjplc5j6']
Sparse Retrieval Scores: [22.66137236 21.39671883 21.39671883 20.18090081 19.72178981]
Dense Retrieval Results: ['16yw8yyb', '16eytnxy', '18nnwoav', '8yo8dw0h', '9yor16tf']
Dense Retrieval Scores: [0.6593798 0.6537776 0.5929102 0.5871397 0.5867139]
BM25 Scores: [0.         0.03172885 0.01779236 ... 0.03381601 0.         0.        ]
Dense Scores: [0.39460242 0.39811889 0.42641787 ... 0.3590084  0.35039453 0.39629059]


 32%|███▏      | 16/50 [00:37<01:23,  2.47s/it]

Query n.1. ID: 17 Title:  coronavirus clinical trials
Sparse Retrieval Results: ['i4fz2c49', 'zed7d315', '7izjhyyk', 'mgrz9jfd', 'm23mvaf7']
Sparse Retrieval Scores: [12.89437155 12.74032107 12.74032107 12.63610973 12.63610973]


In [None]:
run_sparse = prepare_run_data(results["sparse"])
run_dense = prepare_run_data(results["dense"])
run_rank_fusion = prepare_run_data(results["rank_fusion"])
run_cascade = prepare_run_data(results["cascade"])

# Evaluate results with pytrec_eval
evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, {'recall.5', 'ndcg_cut.5'})
eval_results_sparse = evaluator.evaluate(run_sparse)
eval_results_dense = evaluator.evaluate(run_dense)
eval_results_rank_fusion = evaluator.evaluate(run_rank_fusion)
eval_results_cascade = evaluator.evaluate(run_cascade)

# Aggregate metrics for overall performance
aggregated_results = {
    "sparse": {
        metric: sum([res[metric] for res in eval_results_sparse.values()]) / len(eval_results_sparse)
        for metric in eval_results_sparse[next(iter(eval_results_sparse))]
    },
    "dense": {
        metric: sum([res[metric] for res in eval_results_dense.values()]) / len(eval_results_dense)
        for metric in eval_results_dense[next(iter(eval_results_dense))]
    },
    "rank_fusion": {
        metric: sum([res[metric] for res in eval_results_rank_fusion.values()]) / len(eval_results_rank_fusion)
        for metric in eval_results_rank_fusion[next(iter(eval_results_rank_fusion))]
    },
    "cascade": {
        metric: sum([res[metric] for res in eval_results_cascade.values()]) / len(eval_results_cascade)
        for metric in eval_results_cascade[next(iter(eval_results_cascade))]
    }
}

print("Aggregated results:", json.dumps(aggregated_results, indent=4))
print("Retrieval results and metrics saved to files.")

# Section 4: QA with Language Model

In [None]:
# QA for the first query
QUERY_INDEX = 3                                                     # Index of the query to be used for retrieval
query = all_queries[QUERY_INDEX - 1]                                # Select the query from the list based on the index
query_text = query['title'] if isinstance(query, dict) else query   # Get the query text

# Retrieval calls:

# Perform dense retrieval using query embedding and document embeddings
dense_top_k_indices, dense_top_k_scores = dense_retrieve(query_embeddings[QUERY_INDEX], doc_embeddings)
# Perform sparse retrieval using BM25 on the query text
sparse_top_k_indices, sparse_top_k_scores = bm25_retrieve(query_text, bm25)
# Perform rank fusion retrieval by combining BM25 and dense retrieval results
rank_top_k_indices, rank_top_k_scores = fusion_retrieve(
    query_embeddings[QUERY_INDEX],
    doc_embeddings,
    query_text
)
# Perform cascading retrieval: first BM25, then re-rank with dense retrieval
cascading_top_k_indices, cascading_top_k_scores = cascade_retrieve(
    query_embeddings[QUERY_INDEX],
    doc_embeddings,
    query_text
)

# Get retrieved documents for each method
dense_retrieved_docs = [f"Document {i+1}: {all_docs[idx]['abstract']}" for i, idx in enumerate(dense_top_k_indices)]
sparse_retrieved_docs = [f"Document {i+1}: {all_docs[idx]['abstract']}" for i, idx in enumerate(sparse_top_k_indices)]
rank_retrieved_docs = [f"Document {i+1}: {all_docs[idx]['abstract']}" for i, idx in enumerate(rank_top_k_indices)]
cascading_retrieved_docs = [f"Document {i+1}: {all_docs[idx]['abstract']}" for i, idx in enumerate(cascading_top_k_indices)]

# Definition of the model that will be used to generate the various responses.
lm_pipeline = pipeline("text-generation",
                      model="meta-llama/Llama-3.2-1B",
                      device=0 if device == "cuda" else -1)

#### Question-answering using DENSE RETRIEVAL

In [None]:
print("------------------ DENSE RETRIEVAL ----------------------\n")
context = "\n".join(dense_retrieved_docs)
prompt = f"Context:\n{context}\n\nQuestion:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"

print(f"----------------- Length of the prompt -----------------\n{len(prompt.split())} words")
print(f"------------------------ Prompt ------------------------\n{prompt}")

# Generate response
response = lm_pipeline(prompt,
                      max_new_tokens=150,
                      temperature=0.7,
                      truncation=False)[0]["generated_text"]
response = response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()

print(f"------------------ Response ------------------\n{response}")

#### Question-answering using SPARSE RETRIEVAL

In [None]:
print("------------------ SPARSE RETRIEVAL ----------------------\n")
context = "\n".join(sparse_retrieved_docs)
prompt = f"Context:\n{context}\n\nQuestion:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"

print(f"----------------- Length of the prompt -----------------\n{len(prompt.split())} words")
print(f"------------------------ Prompt ------------------------\n{prompt}")

# Generate response
response = lm_pipeline(prompt,
                      max_new_tokens=150,
                      temperature=0.7,
                      truncation=False)[0]["generated_text"]

response = response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()
print(f"------------------ Response ------------------\n{response}")

#### Question-answering using RANK FUSION

In [None]:
print("------------------ RANK FUSION ----------------------\n")
context = "\n".join(rank_retrieved_docs)
prompt = f"Context:\n{context}\n\nQuestion:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"

print(f"----------------- Length of the prompt -----------------\n{len(prompt.split())} words")
print(f"------------------------ Prompt ------------------------\n{prompt}")

# Generate response
response = lm_pipeline(prompt,
                      max_new_tokens=150,
                      temperature=0.7,
                      truncation=False)[0]["generated_text"]

response = response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()
print(f"------------------ Response ------------------\n{response}")

#### Question-answering using CASCADING RETRIEVAL

In [None]:
print("------------------ CASCADING RETRIEVAL ----------------------\n")
context = "\n".join(cascading_retrieved_docs)
prompt = f"Context:\n{context}\n\nQuestion:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"

print(f"----------------- Length of the prompt -----------------\n{len(prompt.split())} words")
print(f"------------------------ Prompt ------------------------\n{prompt}")

# Generate response
response = lm_pipeline(prompt,
                      max_new_tokens=150,
                      temperature=0.7,
                      truncation=False)[0]["generated_text"]

response = response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()
print(f"------------------ Response ------------------\n{response}")

#### Question-answering WITH NO CONTEXT PROVIDED WITH RAG

In [None]:

print("------------------ RESPONSE WITHOUT RAG ----------------------\n")
prompt = f"""Question:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"""

print(f"----------------- Length of the prompt -----------------\n{len(prompt.split())} words")
print(f"------------------------ Prompt ------------------------\n{prompt}")

response = lm_pipeline(prompt,
                      max_new_tokens=150,
                      temperature=0.7,
                      truncation=False)[0]["generated_text"]

response = response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()
print(f"------------------ Response ------------------\n{response}")

In [None]:
import random

picked_queries = random.sample(all_queries, 5)

for q in picked_queries:

    # For each query, retrieve and rank documents independently
    query_text = q['title']
    cascading_top_k_indices, cascading_top_k_scores = cascade_retrieve(query_embeddings[QUERY_INDEX], doc_embeddings, query_text)

    # Use the top-k documents for that specific query
    cascading_retrieved_docs = [f"Document {i+1}: {all_docs[idx]['abstract']}" for i, idx in enumerate(cascading_top_k_indices)]
    cascading_context = "\n".join(cascading_retrieved_docs)

    # Repeat the process for rank fusion
    rank_top_k_indices, rank_top_k_scores = fusion_retrieve(query_embeddings[QUERY_INDEX], doc_embeddings, query_text)
    rank_retrieved_docs = [f"Document {i+1}: {all_docs[idx]['abstract']}" for i, idx in enumerate(rank_top_k_indices)]
    rank_fusion_context = "\n".join(rank_retrieved_docs)


    cascading_prompt = f"Context:\n{cascading_context}\n\nQuestion:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"
    rank_fusion_prompt = f"Context:\n{rank_fusion_context}\n\nQuestion:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"

    # Generate response using language model
    cascading_response = lm_pipeline(cascading_prompt,
                           max_new_tokens=150,
                           temperature=0.7,
                           truncation=False)[0]["generated_text"]

    rank_fusion_response = lm_pipeline(prompt,
                      max_new_tokens=150,
                      temperature=0.1,
                      truncation=False)[0]["generated_text"]

    # Extract the answer from the response
    cascading_response = cascading_response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()
    rank_fusion_response = rank_fusion_response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()

    # Print the results
    print(f"\nQuery: {query_text}")
    print(f"Cascading Response: {cascading_response}")
    print(f"Rank Fusion Response: {rank_fusion_response}")
    print("------------------------------\n")
