# Project: Question-Answering using Retrieval Augmented Generation
by L.Arduini, D.N.Ghaneh, L.Menchini, C.Petruzzella

## Description
This project implements a QA chatbot leveraging language models hosted on a scalable server infrastructure. It provides embeddings to facilitate query-answering capabilities with advanced retrieval mechanisms.

## Instructions to Run

### Prerequisites
1. Python 3.10 or above.
2. Access to a runtime environment with GPU support (e.g., NVIDIA T4 on Google Colab) for optimal performance.

### Running the project
- Switch the runtime to GPU (e.g., NVIDIA T4) for enhanced performance

In [32]:
!pip install ir_datasets
!pip install rank_bm25
!pip install sentence_transformers
!pip install pytrec_eval
!pip install PyStemmer



In [33]:
from tqdm import tqdm
import json
import ir_datasets
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
from transformers import pipeline
import torch
from huggingface_hub import login
import pytrec_eval
import collections
import itertools
import heapq

api_key = "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"
login(token=api_key)

# Check GPU availability
def get_device():
    if torch.cuda.is_available():
        device = "cuda"
        gpu_properties = torch.cuda.get_device_properties(torch.cuda.current_device())
        print(f"Using GPU: {gpu_properties.name}")
        print(f"CUDA Cores: {gpu_properties.multi_processor_count}")
        print(f"Total Memory: {gpu_properties.total_memory / 1e9:.2f} GB")
        print(f"Compute Capability: {gpu_properties.major}.{gpu_properties.minor}")
    elif torch.backends.mps.is_available():
        device = "mps"
        print("Using MPS (Metal Performance Shaders)")
    else:
        device = "cpu"
        print("Using CPU")
    return device

device = get_device()


Using GPU: NVIDIA A100-SXM4-40GB
CUDA Cores: 108
Total Memory: 42.48 GB
Compute Capability: 8.0


# Section 1: Dataset loading and preparation

In [34]:
from functools import lru_cache
import re
import string
import Stemmer
import nltk
nltk.download("stopwords", quiet=True)

# ------- Pre Initialization -------
# 1. Compile regex patterns once globally
# 2. Preload stopwords set
# 3. Initialize stemmer

ACRONYM_REGEX = re.compile(r"(?<!\w)\.(?!\d)")
PUNCTUATION_TRANS = str.maketrans("", "", string.punctuation)
STOPWORDS = set(nltk.corpus.stopwords.words('english'))
STEMMER = Stemmer.Stemmer('english')

# Define a cached function to stem individual words
@lru_cache(maxsize=1000)
def stem(word):
    return STEMMER.stemWord(word)

# ----------------------------------

def preprocess(s):
    """
    Preprocess a string for indexing or querying.

    Args:
        s: The input string.

    Returns:
        A list of preprocessed tokens.
    """

    s = s.lower()
    s = s.replace("&", " and ")
    # normalize quotes and dashes
    s = s.translate(str.maketrans("‘’´“”–-", "'''\"\"--"))
    # remove unnecessary dots in acronyms (but not decimals)
    s = ACRONYM_REGEX.sub("", s)
    # remove punctuation
    s = s.translate(PUNCTUATION_TRANS)
    # strip and remove extra spaces
    s = " ".join(s.split())

    tokens = s.split()
    tokens = [t for t in tokens if t not in STOPWORDS]
    tokens = STEMMER.stemWords(tokens)
    return tokens

In [35]:
# Load dataset
print("Loading the trec covid dataset...")
dataset = ir_datasets.load("cord19/trec-covid")

# Prepare documents and queries
print("Preparing documents and queries...")

# put all documents and queries in a list of dictionaries
all_docs = []
for doc in dataset.docs_iter():
    if doc.abstract:  # Controlla se default_text è presente
        abstract = f"Title: {doc.title} Text: {doc.abstract}"
    else:
        abstract = f"Title: {doc.title}"  # Usa solo il titolo se il testo non è disponibile
    all_docs.append({"doc_id": doc.doc_id, "abstract": abstract})

all_queries = []
for query in dataset.queries_iter():
    query_text = f"Title: {query.title}\nDescription: {query.description}\nNarrative: {query.narrative}"
    all_queries.append({"query_id": query.query_id, "title": query_text})

# all_docs = [{"doc_id": doc.doc_id, "abstract": doc.title + " " + doc.default_text()} for doc in dataset.docs_iter()]
# all_queries = [{"query_id": query.query_id, "title": query.title + " " + query.description + " " + query.narrative} for query in dataset.queries_iter()]

# Print dataset size information
print(f"Summary: {len(all_docs)} documents and {len(all_queries)} queries are available in the dataset.")

# Tokenize documents
tokenized_docs = [preprocess(doc) for doc in [docs["abstract"] for docs in all_docs]]
tokenized_queries = [preprocess(query) for query in [queries["title"] for queries in all_queries]]
print("Tokenization of documents is done.")

bm25 = BM25Okapi(tokenized_docs)

Loading the trec covid dataset...
Preparing documents and queries...
Summary: 192509 documents and 50 queries are available in the dataset.
Tokenization of documents is done.


In [36]:
# convert qrels to a dictionary
qrels_dict = collections.defaultdict(dict)
for qrel in dataset.qrels_iter():
    qrels_dict[qrel.query_id][qrel.doc_id] = int(qrel.relevance)

# Section 2: Embeddings generation

In [37]:
# Load or generate embeddings
force_generate = False

def generate_embeddings():
    if not force_generate and os.path.exists("trec_covid_doc_embeddings.csv") and os.path.exists("trec_covid_query_embeddings.csv"):
        print("Loading precomputed embeddings...")
        doc_embeddings = pd.read_csv("trec_covid_doc_embeddings.csv").values
        query_embeddings = pd.read_csv("trec_covid_query_embeddings.csv").values
    else:
        print("No precomputed embeddings found.")
        print("Generating new embeddings using SentenceTransformer model 'sentence-transformers/all-MiniLM-L6-v2'.")
        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)
        doc_embeddings = model.encode([doc["abstract"] for doc in all_docs], batch_size=32, show_progress_bar=True, normalize_embeddings=True)
        query_embeddings = model.encode([query['title'] for query in all_queries], batch_size=32, show_progress_bar=True, normalize_embeddings=True)

        # Save embeddings for future use
        pd.DataFrame(doc_embeddings).to_csv("trec_covid_doc_embeddings.csv", index=False)
        pd.DataFrame(query_embeddings).to_csv("trec_covid_query_embeddings.csv", index=False)

    return doc_embeddings, query_embeddings

doc_embeddings, query_embeddings = generate_embeddings()

Loading precomputed embeddings...


# Section 3: Retrieval implementation

In [38]:
# Function to prepare run data for pytrec_eval
def prepare_run_data(results):
    """
    Prepares the run data in the format expected by pytrec_eval.
    Converts numpy scores to native Python float for compatibility.
    """
    run = {}
    for query_results in results:
        query_id = query_results['query']['query_id']
        run[query_id] = {}
        for doc_id, score in zip(query_results['results'], query_results['scores']):
            run[query_id][doc_id] = float(score)  # Convert numpy type to float
    return run

### Document Retrieval Methods

1. **BM25 Sparse Retrieval**:
   - The **BM25 algorithm** is used to perform sparse retrieval on tokenized documents by calculating a relevance score for each document based on the query. It then returns the indices and relevance scores of the top-k most relevant documents.

2. **Dense Retrieval**:
   - **Dense retrieval** is performed by calculating the cosine similarity between the query embedding and the document embeddings. The top-k documents with the highest similarity scores are returned.

3. **Rank Fusion Retrieval**:
   - Results from both **BM25** and **dense retrieval** are combined using a **rank fusion** technique. Scores from both methods are normalized, weighted by a parameter `alpha`, and the top-k documents are returned based on the combined scores.

4. **Cascading Retrieval**:
   - Initially, a set of documents is retrieved using **BM25**. These documents are then re-ranked using dense retrieval, with a similarity threshold applied to filter documents. The top-k documents are returned based on the final ranking.

In [47]:
from scipy.stats import zscore

# BM25 Sparse Retrieval
def bm25_retrieve(query, bm25, top_k=5):
    """
    Perform sparse retrieval using BM25 on the tokenized documents.
    Returns the indices and scores of the top-k documents.
    """
    tokenized_query = preprocess(query)                                     # Tokenize the query into words
    scores = bm25.get_scores(tokenized_query)                                   # Get BM25 scores for all documents
    top_k_indices = np.argsort(scores)[-top_k:][::-1]                           # Get indices of top-k documents based on BM25 score
    return top_k_indices, scores[top_k_indices]

# Dense Retrieval
def dense_retrieve(query_embedding, doc_embeddings, top_k=5):
    """
    Perform dense retrieval using cosine similarity between query and document embeddings.
    Returns the indices and similarities of the top-k documents.
    """
    similarities = cosine_similarity([query_embedding], doc_embeddings)[0]      # Compute cosine similarity
    top_k_indices = np.argsort(similarities)[-top_k:][::-1]                     # Get top-k indices based on similarity
    return top_k_indices, similarities[top_k_indices]

def combsum_fusion(query_id, dense_indices, dense_scores, sparse_indices, sparse_scores, top_k=5):
    # Combine scores using CombSUM
    all_doc_ids = np.concatenate((sparse_indices, dense_indices))
    all_scores = np.concatenate((sparse_scores, dense_scores))
    combined_scores = collections.defaultdict(float)
    for doc_id, score in zip(all_doc_ids, all_scores):
        combined_scores[doc_id] += score

    # Retrieve top-k documents based on combined scores
    top_docs = heapq.nlargest(top_k, combined_scores.items(), key=lambda x: x[1])

    # return top k indices and scores
    return [doc[0] for doc in top_docs], [doc[1] for doc in top_docs]

# Cascading Retrieval
def cascade_retrieve(dense_query_embedding, doc_embeddings, query, initial_k=1000, final_k=5, dense_threshold=0.7):
    """
    Perform cascading retrieval: sparse retrieval followed by dense re-ranking.
    Filters documents based on a similarity threshold and returns the top-k results.
    """
    # Stage 1: BM25 to get initial candidates
    initial_indices, _ = bm25_retrieve(query, bm25, top_k=initial_k)

    # Stage 2: Dense re-ranking of candidate documents
    candidate_embeddings = doc_embeddings[initial_indices]
    _, dense_scores = dense_retrieve(dense_query_embedding, candidate_embeddings, top_k=len(initial_indices))

    # Filter candidates by similarity threshold
    qualified_mask = dense_scores >= dense_threshold
    if np.sum(qualified_mask) >= final_k:
        # Select top-k qualified candidates
        qualified_indices = np.where(qualified_mask)[0]
        top_indices = qualified_indices[np.argsort(dense_scores[qualified_indices])[-final_k:][::-1]]
    else:
        # If there are not enough qualified candidates, select top-k by overall scores
        top_indices = np.argsort(dense_scores)[-final_k:][::-1]

    # Map filtered indices to original document IDs
    final_indices = initial_indices[top_indices]
    final_scores = dense_scores[top_indices]

    return final_indices, final_scores



This section of code performs several retrieval experiments using the four different Document Retrieval Methods described earlier.

In [48]:
# Run retrieval experiments
def run_retrieval_experiments():
    """
    Execute sparse, dense, rank fusion, and cascading retrieval for all queries.
    Save the results to a JSON file for further analysis.
    """
    results = {"sparse": [], "dense": [], "rank_fusion": [], "cascade": []}

    print("Running retrieval experiments on all queries.")

    # Iterate over each query and its embedding
    for query, query_embedding in tqdm(zip(all_queries, query_embeddings), total=len(all_queries)):
        # Extract the query ID and text for the current query
        query_id = query['query_id']
        query_text = query['title']

        # Sparse Retrieval using BM25
        sparse_indices, sparse_scores = bm25_retrieve(query_text, bm25)                 # Retrieve the top-k BM25 documents and their scores
        sparse_docs = [all_docs[idx]['doc_id'] for idx in sparse_indices]               # Get document IDs from the indices

        # Dense Retrieval using cosine similarity
        dense_indices, dense_scores = dense_retrieve(query_embedding, doc_embeddings)   # Retrieve the top-k documents based on cosine similarity of embeddings
        dense_docs = [all_docs[idx]['doc_id'] for idx in dense_indices]

        # Normalize scores
        sparse_scores = zscore(sparse_scores)
        dense_scores = zscore(dense_scores)
        results["sparse"].append({"query": query, "results": sparse_docs, "scores": sparse_scores}) # Store the BM25 results for the current query
        results["dense"].append({"query": query, "results": dense_docs, "scores": dense_scores})

        print(sparse_indices, dense_indices)
        fusion_indices, fusion_scores = combsum_fusion(query_id, dense_indices, dense_scores, sparse_indices, sparse_scores)
        print(fusion_indices, fusion_scores)
        fusion_docs = [all_docs[idx]['doc_id'] for idx in fusion_indices]
        results["rank_fusion"].append({"query": query, "results": fusion_docs, "scores": fusion_scores})
        # Rank Fusion Retrieval by combining sparse (BM25) and dense result
        # fusion_indices, fusion_scores = fusion_retrieve(                                # Combine BM25 and cosine similarity results
        #     query_embedding, doc_embeddings, query_text
        # )
        # fusion_docs = [all_docs[idx]['doc_id'] for idx in fusion_indices]
        # results["rank_fusion"].append({"query": query, "results": fusion_docs, "scores": fusion_scores})

        # Cascade Retrieval: First use BM25, then re-rank using dense retrieval
        cascade_indices, cascade_scores = cascade_retrieve(                             # Perform cascading retrieval
            query_embedding, doc_embeddings, query_text
        )
        cascade_docs = [all_docs[idx]['doc_id'] for idx in cascade_indices]
        results["cascade"].append({"query": query, "results": cascade_docs, "scores": cascade_scores})

    return results

results = run_retrieval_experiments()


Running retrieval experiments on all queries.


  0%|          | 0/50 [00:00<?, ?it/s]

[171232  96819 113751 163782 118377] [ 83277 159595 162667 128664 169722]
[83277, 171232, 96819, 113751, 159595] [1.8129179657310328, 1.0137063005060618, 1.0137063005060618, 0.322235573161346, 0.13075101228649974]


  2%|▏         | 1/50 [00:04<03:16,  4.02s/it]

[190981  97988 114172 177067  74696] [124988 124987 124986 134539 176603]
[190981, 97988, 124988, 124987, 124986] [1.5547674586356315, 0.8228845261916485, 0.6558325162425469, 0.6558325162425469, 0.6558325162425469]


  4%|▍         | 2/50 [00:09<03:50,  4.79s/it]

[177580 188839 179261 135589  97732] [112009 112008  49085 119779 119778]
[177580, 112009, 112008, 49085, 188839] [1.9831543001301857, 0.8164965809277084, 0.8164965809277084, 0.8164965809277084, -0.30636577030555073]


  6%|▌         | 3/50 [00:14<03:54,  4.98s/it]

[108427 152540  88890 109489 109488] [175123 144067 119240  76337 120318]
[175123, 108427, 152540, 144067, 119240] [1.7900161983757648, 1.2743885469333576, 1.1651004475402145, 0.049646853940860824, 0.049646853940860824]


  8%|▊         | 4/50 [00:17<03:13,  4.21s/it]

[ 75973  72253  80384 180990  81903] [110689 110688 149313 116335 116336]
[75973, 110689, 110688, 149313, 72253] [1.7995728793199433, 0.8208789614537716, 0.8208789614537716, 0.8077034720081534, 0.3849732323515608]


 10%|█         | 5/50 [00:23<03:40,  4.90s/it]

[ 59511 182089  99662 109740 109741] [ 44665 107284 107285 162605  65060]
[59511, 44665, 107284, 107285, 182089] [1.9139944011064942, 0.8842058667941491, 0.7216316462963016, 0.7216316462963016, 0.04088988274871325]


 12%|█▏        | 6/50 [00:27<03:19,  4.53s/it]

[110864  47930 161541  90059  60591] [ 91397 168247 122213 192502  40838]
[91397, 110864, 47930, 168247, 161541] [1.6086997448975642, 0.9209620646207547, 0.9209620646207547, 0.7464146356266144, -0.01220469298666555]


 14%|█▍        | 7/50 [00:32<03:16,  4.58s/it]

[128074  63907 166142 143437  74888] [169269 129379 129380  66985  44665]
[128074, 63907, 169269, 129379, 129380] [1.1239399532772838, 1.1239399532772838, 0.8164718066790388, 0.8164718066790388, 0.8164718066790388]


 16%|█▌        | 8/50 [00:37<03:15,  4.65s/it]

[ 71741  74369 121096  57256  84566] [ 76782  76123 100066 138021 100067]
[71741, 76782, 76123, 74369, 121096] [1.9597326237359007, 1.8401401883441382, 0.29859288002206485, -0.14489456562518147, -0.5227120551160076]


 18%|█▊        | 9/50 [00:41<03:05,  4.54s/it]

[134656  34042 132564  95734 132563] [142157 169075 123859  59878 100526]
[134656, 142157, 169075, 34042, 132564] [1.9988883700709201, 1.7711228608763985, 0.45349006701449535, -0.4351662736821994, -0.521240698796237]


 20%|██        | 10/50 [00:46<03:07,  4.68s/it]

[75701 83572 95117 95119 95118] [73594 75632 89152 69027 95164]
[73594, 75701, 83572, 75632, 89152] [1.9982841843745727, 1.9928901147353248, -0.3350824702115195, -0.42577693333372524, -0.49692047021893104]


 22%|██▏       | 11/50 [00:51<03:04,  4.72s/it]

[ 69862 160538 130155 149344  75490] [ 91051  23699 133351  33969 111252]
[69862, 91051, 23699, 133351, 160538] [1.3802204507736096, 0.9272699283576785, 0.7823023832882275, 0.7213229671566134, 0.2852890235972231]


 24%|██▍       | 12/50 [00:55<02:54,  4.59s/it]

[126533 126534  62463  76422  74777] [103710 103711  41389 112307 112306]
[103710, 103711, 126533, 126534, 62463] [0.9480169138988198, 0.9480169138988198, 0.8152137481360494, 0.8152137481360494, 0.8152137481360494]


 26%|██▌       | 13/50 [00:58<02:37,  4.26s/it]

[ 63823 127975 127976 176578 124921] [ 63823 127976 127975 129850 152550]
[63823, 127975, 127976, 129850, 152550] [1.7948523718574343, 1.5470390260012792, 1.5470390260012792, -1.2197203405383998, -1.2197203405383998]


 28%|██▊       | 14/50 [01:04<02:49,  4.71s/it]

[ 15571 178906 133396 183338  89338] [133396 178906 157636  76516  94626]
[178906, 133396, 15571, 157636, 183338] [2.1008752716389685, 1.6294043036535766, 0.9832292088669622, -0.6385290096354259, -0.657362905126163]


 30%|███       | 15/50 [01:10<02:54,  4.98s/it]

[ 72624 109235 138731    686 157636] [157636  76516 177857 113048 130692]
[72624, 76516, 109235, 138731, 686] [1.1985236403390078, 1.1264895405043687, 0.5705194383491641, 0.3180859109250407, -0.3511714353225218]


 32%|███▏      | 16/50 [01:15<02:52,  5.07s/it]

[113040 149126 132158  67758 132159] [174759 127553  90139  28617 132989]
[113040, 149126, 174759, 127553, 90139] [1.2247448713915865, 1.2247448713915865, 0.9792394124626624, 0.9792387807760277, 0.08614091447308135]


 34%|███▍      | 17/50 [01:19<02:38,  4.80s/it]

[177453 187630 143501 109591  46796] [ 62094 126208  33196 138108  94878]
[177453, 62094, 187630, 126208, 33196] [1.2387772982745087, 1.232156893697772, 1.2102247947488016, 0.5563778454750697, 0.2550490060862854]


 36%|███▌      | 18/50 [01:23<02:21,  4.43s/it]

[125732 156229 130033 130032 130034] [132368 155496 130033 130032 130034]
[125732, 132368, 155496, 156229, 130033] [1.6346947277173263, 1.2247448713915892, 1.2247448713915892, 0.7070266924879437, -1.597070387662817]


 38%|███▊      | 19/50 [01:27<02:15,  4.37s/it]

[ 85901 120340 121425 101155  82223] [103985 103986 107836 114930 146983]
[85901, 120340, 103985, 103986, 107836] [1.182140109425385, 1.182140109425385, 1.0460385668495182, 1.0460385668495182, 0.18440491476653448]


 40%|████      | 20/50 [01:33<02:24,  4.81s/it]

[ 70776  76194  71175 142676  96592] [ 70407 165621 108654 177176  76879]
[70776, 70407, 165621, 108654, 76194] [1.869937733933687, 0.9501478792222919, 0.8751384394937671, 0.5839055640934511, 0.11341407371308973]


 42%|████▏     | 21/50 [01:38<02:19,  4.81s/it]

[130470 162637 118401 184585 185788] [ 36686  59013  97524 114898 130470]
[36686, 162637, 130470, 59013, 118401] [1.8217781988271178, 0.9662352639811456, 0.49544025721893425, 0.34039748254135127, -0.30879906269343205]


 44%|████▍     | 22/50 [01:42<02:08,  4.61s/it]

[ 94041 114966 140117 170641  60124] [ 29570  91109 131463 170030  36686]
[29570, 94041, 114966, 140117, 91109] [1.0028532341290066, 0.8072778345821708, 0.8011142338124553, 0.7299063044625255, 0.33676923440406165]


 46%|████▌     | 23/50 [01:46<02:02,  4.54s/it]

[101619  27442 186323 185135 108550] [190718  27442  60068 110788 138403]
[101619, 27442, 190718, 186323, 60068] [1.3623517670399088, 1.2940888003593392, 1.19834213434471, 0.47762343665956786, 0.2818794592655626]


 48%|████▊     | 24/50 [01:51<01:57,  4.50s/it]

[  5631  42836 105270  46508 179657] [118251 146042 163360 119500 119501]
[5631, 118251, 146042, 42836, 105270] [1.9999999999999982, 1.924355217675723, 0.04645591780023721, -0.500000000000002, -0.500000000000002]


 50%|█████     | 25/50 [01:56<01:57,  4.70s/it]

[187272  95514 100801 100802 174212] [116018 116017  52673  70227 151132]
[187272, 95514, 116018, 116017, 52673] [1.2245211381335213, 1.2245211381335213, 1.2195045278744927, 1.2195045278744927, -0.6442908607285626]


 52%|█████▏    | 26/50 [01:59<01:45,  4.41s/it]

[ 75875 175845 116010  74228 114754] [ 11977  44380 106970  38162 185762]
[11977, 44380, 106970, 75875, 175845] [0.8154279363359512, 0.8154279363359512, 0.8154279363359512, 0.5643430044296882, 0.4781810342046631]


 54%|█████▍    | 27/50 [02:03<01:34,  4.09s/it]

[ 92481  30954  77095 177171  92104] [158910 107772 164717  48875  26496]
[92481, 30954, 158910, 107772, 77095] [1.1582937864286302, 1.1582937864286302, 1.1399581570892465, 1.1399581570892465, -0.242601669446577]


 56%|█████▌    | 28/50 [02:07<01:29,  4.06s/it]

[ 26598 143548 100797  97198 183094] [100572 191657  38761 121418 187114]
[26598, 100572, 191657, 143548, 38761] [1.7295314746697688, 1.609406656180062, 0.5674449346415067, 0.5124562985140529, -0.14665086875039918]


 58%|█████▊    | 29/50 [02:14<01:46,  5.09s/it]

[ 41572 103912 132956 147251  59847] [69806 71887 29974 94833 91515]
[69806, 41572, 103912, 132956, 71887] [1.99164629567943, 1.0915936835581659, 1.0915936835581659, -0.17823931470300783, -0.3426711979307875]


 60%|██████    | 30/50 [02:18<01:33,  4.66s/it]

[ 87457  78609  40461    298 157269] [147518 125051 130235  65924 110819]
[87457, 147518, 125051, 78609, 40461] [1.322214456043391, 1.2037373003553449, 1.1876903257727651, 0.6572618082001531, 0.28650354255793076]


 62%|██████▏   | 31/50 [02:23<01:31,  4.83s/it]

[  5546  48815  75945  24322 137522] [192411 159928  49085 112008 112009]
[192411, 5546, 48815, 75945, 159928] [1.9525461931876926, 1.2600177207263905, 0.5543582161469982, 0.5504964362404515, -0.06880296598869275]


 64%|██████▍   | 32/50 [02:27<01:21,  4.52s/it]

[ 77746 135188  34077   3979 185191] [ 50934 162538  65924 130235  99278]
[77746, 50934, 162538, 135188, 34077] [1.7277074380935638, 1.644544087494254, 0.6615873866135493, 0.1764138543378974, 0.030518394336924257]


 66%|██████▌   | 33/50 [02:33<01:26,  5.06s/it]

[150097 114279 143566  70036 186301] [ 34803  96525  96526  96527 135588]
[150097, 34803, 114279, 143566, 96525] [1.5916587823701784, 1.313803817631766, 0.26313359318993124, 0.17883200736264562, 0.15823849234517437]


 68%|██████▊   | 34/50 [02:38<01:19,  4.96s/it]

[ 72705  90561 165927 125969 127078] [190724  89707  68382  40489 102714]
[72705, 190724, 89707, 90561, 165927] [1.6354066611904399, 1.2848724145416028, 0.9632232719243583, 0.09844591423712706, 0.001762000746187557]


 70%|███████   | 35/50 [02:45<01:22,  5.48s/it]

[188308  76309 158588 172170  77358] [192114  71753  72130  61143 168736]
[192114, 188308, 76309, 71753, 72130] [1.9414402461510438, 1.553995950000475, 0.7100878856271458, -0.06087293591988388, -0.4514573222398731]


 72%|███████▏  | 36/50 [02:49<01:10,  5.04s/it]

[113074 113073 174447  77505 181422] [ 95006  95007 150827 174447 113074]
[95006, 95007, 113073, 150827, 174447] [0.8164969999987154, 0.8164969999987154, 0.8164965809277245, 0.8164957427854653, -0.40824803578462654]


 74%|███████▍  | 37/50 [02:53<01:03,  4.85s/it]

[105443 150444 105442  54416 117989] [99620 37680 37103 98990 98989]
[99620, 37680, 105443, 150444, 105442] [1.3646937846007896, 1.0643460092140429, 0.8164965809277219, 0.8164965809277219, 0.8164965809277219]


 76%|███████▌  | 38/50 [02:59<01:02,  5.18s/it]

[102401  40247 102402  90260  90259] [ 97022 151462 129127 129126 105004]
[97022, 102401, 40247, 102402, 151462] [1.8608103654448716, 1.2056330189154003, 0.5906485371328112, 0.5906485371328112, 0.03502892854250543]


 78%|███████▊  | 39/50 [03:04<00:57,  5.23s/it]

[105068 156067  73493  74351  75487] [145013 159928 135889 113728  73493]
[145013, 105068, 156067, 159928, 135889] [1.435943462561242, 1.2126932594685793, 1.2126932594685793, 0.9351357951277997, -0.5643793948320014]


 80%|████████  | 40/50 [03:11<00:55,  5.57s/it]

[168442 129328  43554 106082 106083] [152284 120975 120974 137575  46826]
[168442, 129328, 152284, 120975, 120974] [1.2247448713915834, 1.2247448713915834, 0.8112569032937226, 0.8112569032937226, 0.8112569032937226]


 82%|████████▏ | 41/50 [03:19<00:56,  6.25s/it]

[112397 141714  95384  95383   5648] [149566  75227 132762 154307 125491]
[112397, 149566, 75227, 141714, 95384] [1.7955434769616214, 1.607160075115966, 0.6194742731181511, -0.16456950672530585, -0.16456950672530585]


 84%|████████▍ | 42/50 [03:24<00:48,  6.00s/it]

[ 95776  95775  34080 156839  68476] [95776 95775 34080 11699 93262]
[95776, 95775, 34080, 11699, 156839] [1.5974007467598876, 1.5974007467598876, 1.5244678685927924, -0.5451751779886697, -1.1677951249643108]


 86%|████████▌ | 43/50 [03:29<00:39,  5.63s/it]

[ 14395  10357 147810  69510 143501] [102044 179146  97755 153645  56944]
[102044, 14395, 10357, 179146, 97755] [1.7437237431109334, 1.5194372440269732, 0.8690593117845777, 0.11255838349288622, -0.20341142939484702]


 88%|████████▊ | 44/50 [03:34<00:32,  5.48s/it]

[ 99116 188302  39265 101315 101316] [110477 177106  38652  76225 173818]
[110477, 99116, 188302, 177106, 38652] [1.292017781039793, 1.2247448713915954, 1.2247448713915954, 0.5345508356387978, 0.2293754066168278]


 90%|█████████ | 45/50 [03:39<00:26,  5.22s/it]

[174094  92766  92767  97922 178098] [184248 105178  42731  71503 118458]
[184248, 174094, 92766, 92767, 105178] [1.860153008084353, 0.8164965809277144, 0.8164965809277144, 0.8164965809277144, 0.18846610211367104]


 92%|█████████▏| 46/50 [03:44<00:21,  5.25s/it]

[117930 154212  68486 131698 172947] [177904  90261 157214 120130 120131]
[177904, 90261, 117930, 154212, 68486] [1.224744871391592, 1.224744871391592, 1.0782584969615774, 1.0782584969615774, 0.14610775527313577]


 94%|█████████▍| 47/50 [03:50<00:16,  5.40s/it]

[ 73761 119642  55962 119643  70016] [183284 113977  47725 110625 112979]
[73761, 183284, 113977, 119642, 55962] [1.9687337102699853, 1.415885199816272, 0.9953079110686763, -0.37849178592281574, -0.37849178592281574]


 96%|█████████▌| 48/50 [03:57<00:12,  6.10s/it]

[ 71709 140965 188043 134029 135964] [191751 114294 114293 190406 111696]
[71709, 191751, 114294, 114293, 140965] [1.7156891364013245, 0.8164965809277234, 0.8164965809277234, 0.8164965809277234, 0.21851815852676962]


 98%|█████████▊| 49/50 [04:04<00:06,  6.34s/it]

[  3979   4490  69958   4743 160772] [ 90862 138356 140858 129323 129322]
[3979, 90862, 138356, 4490, 69958] [1.4470102390603647, 1.22474487139159, 1.22474487139159, 0.48758019193991486, 0.327009760291939]


100%|██████████| 50/50 [04:09<00:00,  5.00s/it]


In [49]:
run_sparse = prepare_run_data(results["sparse"])
run_dense = prepare_run_data(results["dense"])
run_rank_fusion = prepare_run_data(results["rank_fusion"])
run_cascade = prepare_run_data(results["cascade"])

# Evaluate results with pytrec_eval
evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, {'recall.5', 'ndcg_cut.5'})
eval_results_sparse = evaluator.evaluate(run_sparse)
eval_results_dense = evaluator.evaluate(run_dense)
eval_results_rank_fusion = evaluator.evaluate(run_rank_fusion)
eval_results_cascade = evaluator.evaluate(run_cascade)

# Aggregate metrics for overall performance
aggregated_results = {
    "sparse": {
        metric: sum([res[metric] for res in eval_results_sparse.values()]) / len(eval_results_sparse)
        for metric in eval_results_sparse[next(iter(eval_results_sparse))]
    },
    "dense": {
        metric: sum([res[metric] for res in eval_results_dense.values()]) / len(eval_results_dense)
        for metric in eval_results_dense[next(iter(eval_results_dense))]
    },
    "rank_fusion": {
        metric: sum([res[metric] for res in eval_results_rank_fusion.values()]) / len(eval_results_rank_fusion)
        for metric in eval_results_rank_fusion[next(iter(eval_results_rank_fusion))]
    },
    "cascade": {
        metric: sum([res[metric] for res in eval_results_cascade.values()]) / len(eval_results_cascade)
        for metric in eval_results_cascade[next(iter(eval_results_cascade))]
    }
}

print("Aggregated results:", json.dumps(aggregated_results, indent=4))
print("Retrieval results and metrics saved to files.")

Aggregated results: {
    "sparse": {
        "recall_5": 0.008603751543369476,
        "ndcg_cut_5": 0.7017622426368895
    },
    "dense": {
        "recall_5": 0.008045358532929462,
        "ndcg_cut_5": 0.5830060175423573
    },
    "rank_fusion": {
        "recall_5": 0.008561802785762561,
        "ndcg_cut_5": 0.6735944159049838
    },
    "cascade": {
        "recall_5": 0.008402741442359375,
        "ndcg_cut_5": 0.6966364988247945
    }
}
Retrieval results and metrics saved to files.


# Section 4: QA with Language Model

In [None]:
# QA for the first query
QUERY_INDEX = 3                                                     # Index of the query to be used for retrieval
query = all_queries[QUERY_INDEX - 1]                                # Select the query from the list based on the index
query_text = query['title'] if isinstance(query, dict) else query   # Get the query text

# Retrieval calls:

# Perform dense retrieval using query embedding and document embeddings
dense_top_k_indices, dense_top_k_scores = dense_retrieve(query_embeddings[QUERY_INDEX], doc_embeddings)
# Perform sparse retrieval using BM25 on the query text
sparse_top_k_indices, sparse_top_k_scores = bm25_retrieve(query_text, bm25)
# Perform rank fusion retrieval by combining BM25 and dense retrieval results
rank_top_k_indices, rank_top_k_scores = fusion_retrieve(
    query_embeddings[QUERY_INDEX],
    doc_embeddings,
    query_text
)
# Perform cascading retrieval: first BM25, then re-rank with dense retrieval
cascading_top_k_indices, cascading_top_k_scores = cascade_retrieve(
    query_embeddings[QUERY_INDEX],
    doc_embeddings,
    query_text
)

# Get retrieved documents for each method
dense_retrieved_docs = [f"Document {i+1}: {all_docs[idx]['abstract']}" for i, idx in enumerate(dense_top_k_indices)]
sparse_retrieved_docs = [f"Document {i+1}: {all_docs[idx]['abstract']}" for i, idx in enumerate(sparse_top_k_indices)]
rank_retrieved_docs = [f"Document {i+1}: {all_docs[idx]['abstract']}" for i, idx in enumerate(rank_top_k_indices)]
cascading_retrieved_docs = [f"Document {i+1}: {all_docs[idx]['abstract']}" for i, idx in enumerate(cascading_top_k_indices)]

# Definition of the model that will be used to generate the various responses.
lm_pipeline = pipeline("text-generation",
                      model="meta-llama/Llama-3.2-1B",
                      device=0 if device == "cuda" else -1)

#### Question-answering using DENSE RETRIEVAL

In [None]:
print("------------------ DENSE RETRIEVAL ----------------------\n")
context = "\n".join(dense_retrieved_docs)
prompt = f"Context:\n{context}\n\nQuestion:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"

print(f"----------------- Length of the prompt -----------------\n{len(prompt.split())} words")
print(f"------------------------ Prompt ------------------------\n{prompt}")

# Generate response
response = lm_pipeline(prompt,
                      max_new_tokens=150,
                      temperature=0.7,
                      truncation=False)[0]["generated_text"]
response = response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()

print(f"------------------ Response ------------------\n{response}")

#### Question-answering using SPARSE RETRIEVAL

In [None]:
print("------------------ SPARSE RETRIEVAL ----------------------\n")
context = "\n".join(sparse_retrieved_docs)
prompt = f"Context:\n{context}\n\nQuestion:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"

print(f"----------------- Length of the prompt -----------------\n{len(prompt.split())} words")
print(f"------------------------ Prompt ------------------------\n{prompt}")

# Generate response
response = lm_pipeline(prompt,
                      max_new_tokens=150,
                      temperature=0.7,
                      truncation=False)[0]["generated_text"]

response = response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()
print(f"------------------ Response ------------------\n{response}")

#### Question-answering using RANK FUSION

In [None]:
print("------------------ RANK FUSION ----------------------\n")
context = "\n".join(rank_retrieved_docs)
prompt = f"Context:\n{context}\n\nQuestion:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"

print(f"----------------- Length of the prompt -----------------\n{len(prompt.split())} words")
print(f"------------------------ Prompt ------------------------\n{prompt}")

# Generate response
response = lm_pipeline(prompt,
                      max_new_tokens=150,
                      temperature=0.7,
                      truncation=False)[0]["generated_text"]

response = response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()
print(f"------------------ Response ------------------\n{response}")

#### Question-answering using CASCADING RETRIEVAL

In [None]:
print("------------------ CASCADING RETRIEVAL ----------------------\n")
context = "\n".join(cascading_retrieved_docs)
prompt = f"Context:\n{context}\n\nQuestion:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"

print(f"----------------- Length of the prompt -----------------\n{len(prompt.split())} words")
print(f"------------------------ Prompt ------------------------\n{prompt}")

# Generate response
response = lm_pipeline(prompt,
                      max_new_tokens=150,
                      temperature=0.7,
                      truncation=False)[0]["generated_text"]

response = response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()
print(f"------------------ Response ------------------\n{response}")

#### Question-answering WITH NO CONTEXT PROVIDED WITH RAG

In [None]:

print("------------------ RESPONSE WITHOUT RAG ----------------------\n")
prompt = f"""Question:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"""

print(f"----------------- Length of the prompt -----------------\n{len(prompt.split())} words")
print(f"------------------------ Prompt ------------------------\n{prompt}")

response = lm_pipeline(prompt,
                      max_new_tokens=150,
                      temperature=0.7,
                      truncation=False)[0]["generated_text"]

response = response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()
print(f"------------------ Response ------------------\n{response}")

In [None]:
import random

picked_queries = random.sample(all_queries, 5)

for q in picked_queries:

    # For each query, retrieve and rank documents independently
    query_text = q['title']
    cascading_top_k_indices, cascading_top_k_scores = cascade_retrieve(query_embeddings[QUERY_INDEX], doc_embeddings, query_text)

    # Use the top-k documents for that specific query
    cascading_retrieved_docs = [f"Document {i+1}: {all_docs[idx]['abstract']}" for i, idx in enumerate(cascading_top_k_indices)]
    cascading_context = "\n".join(cascading_retrieved_docs)

    # Repeat the process for rank fusion
    rank_top_k_indices, rank_top_k_scores = fusion_retrieve(query_embeddings[QUERY_INDEX], doc_embeddings, query_text)
    rank_retrieved_docs = [f"Document {i+1}: {all_docs[idx]['abstract']}" for i, idx in enumerate(rank_top_k_indices)]
    rank_fusion_context = "\n".join(rank_retrieved_docs)


    cascading_prompt = f"Context:\n{cascading_context}\n\nQuestion:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"
    rank_fusion_prompt = f"Context:\n{rank_fusion_context}\n\nQuestion:\n{query_text}\n\nAnswer in a concise and clear manner without repetition (if no direct answer, provide a general summary):"

    # Generate response using language model
    cascading_response = lm_pipeline(cascading_prompt,
                           max_new_tokens=150,
                           temperature=0.7,
                           truncation=False)[0]["generated_text"]

    rank_fusion_response = lm_pipeline(prompt,
                      max_new_tokens=150,
                      temperature=0.1,
                      truncation=False)[0]["generated_text"]

    # Extract the answer from the response
    cascading_response = cascading_response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()
    rank_fusion_response = rank_fusion_response.split("Answer in a concise and clear manner without repetition (if no direct answer, provide a general summary):")[1].strip()

    # Print the results
    print(f"\nQuery: {query_text}")
    print(f"Cascading Response: {cascading_response}")
    print(f"Rank Fusion Response: {rank_fusion_response}")
    print("------------------------------\n")
