In [1]:
# --- QUESTION 4: RAG Pipeline Setup ---

print("---  Setting up RAG Environment ---")

# 1. Install RAG Libraries
# rank_bm25: The search engine algorithm
# sentence-transformers: For vector retrieval (Task 4b)
# accelerate: Helps load FLAN-T5 efficiently
!pip install rank_bm25 sentence-transformers transformers datasets torch scikit-learn accelerate > /dev/null

# 2. Download Spacy (for text processing)
!python -m spacy download en_core_web_sm > /dev/null

import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"\n✅ Setup Complete.")
print(f"Device: {device}")

---  Setting up RAG Environment ---
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
cudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
bigframes 2.12.0 requires rich<14,>=12.4.4, but you have rich 14.2.0 which is incompatible.
libcugraph-cu12 25.6.0 requires libraft-cu12==25.6.*, but you have libraft-cu12 25.2.0 which is incompatible.
cudf-polars-cu12 25.6.0 requires pylibcudf-cu12==25.6.*, but you have pylibcudf-cu12 25.2.2 which is incompatible.
pylibcugraph-cu12 25.6.0 requires pylibraft-cu12==25.6.*, but you have pylibraft-cu12 25.2.0 which

In [2]:
# --- STEP 2 & 3 : Load Data, Build KB, and Initialize Retriever ---
from datasets import load_dataset
from rank_bm25 import BM25Okapi
import time

#  Load HotpotQA
print("1. Loading HotpotQA Dataset (this takes a minute)...")
dataset = load_dataset("hotpot_qa", "distractor", split="train[:2000]") 
print(f"    Loaded {len(dataset)} samples.")

#  Build Knowledge Base (Corpus) & Ground Truths
print("2. Structuring Knowledge Base & Ground Truths...")

corpus = []       
corpus_ids = []   
queries = []      
answers = []      
ground_truth_titles = [] 

for item in dataset:
    queries.append(item['question'])
    answers.append(item['answer'])
    
    # Extract Ground Truth Titles 
    supp_titles = set(item['supporting_facts']['title'])
    ground_truth_titles.append(supp_titles)
    
    # Extract Paragraphs for Search
    titles = item['context']['title']
    sentences = item['context']['sentences']
    
    for title, sent_list in zip(titles, sentences):
        text = " ".join(sent_list)
        if text not in corpus:
            corpus.append(text)
            corpus_ids.append(title)

print(f"   Knowledge Base Built. Documents: {len(corpus)}")
print(f"   Ground Truths Ready: {len(ground_truth_titles)}")

#  Build Retriever (BM25)
print("3. Tokenizing corpus for BM25...")
tokenized_corpus = [doc.split(" ") for doc in corpus]

print("   Indexing corpus...")
start_time = time.time()
bm25 = BM25Okapi(tokenized_corpus)
end_time = time.time()
print(f"   ✅ Retriever Ready (took {end_time - start_time:.2f}s).")

#  Retrieval Helper Function
def retrieve_documents(query, k=3):
    tokenized_query = query.split(" ")
    return bm25.get_top_n(tokenized_query, corpus, n=k)

# Test it
print("\n--- Test Retrieval ---")
test_q = queries[0]
docs = retrieve_documents(test_q)
print(f"Q: {test_q}")
print(f"Top Result: {docs[0][:150]}...")

1. Loading HotpotQA Dataset (this takes a minute)...


README.md: 0.00B [00:00, ?B/s]

distractor/train-00000-of-00002.parquet:   0%|          | 0.00/166M [00:00<?, ?B/s]

distractor/train-00001-of-00002.parquet:   0%|          | 0.00/166M [00:00<?, ?B/s]

distractor/validation-00000-of-00001.par(…):   0%|          | 0.00/27.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/90447 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7405 [00:00<?, ? examples/s]

    Loaded 2000 samples.
2. Structuring Knowledge Base & Ground Truths...
   Knowledge Base Built. Documents: 19214
   Ground Truths Ready: 2000
3. Tokenizing corpus for BM25...
   Indexing corpus...
   ✅ Retriever Ready (took 0.69s).

--- Test Retrieval ---
Q: Which magazine was started first Arthur's Magazine or First for Women?
Top Result: First for Women is a woman's magazine published by Bauer Media Group in the USA.  The magazine was started in 1989.  It is based in Englewood Cliffs, ...


In [3]:
# --- STEP 4 : The Generator (FLAN-T5) ---
import os
# Fix for the "AttributeError" noise
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import random

# Define Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load FLAN-T5
# If the download looks "stuck" at 100%, IT IS WORKING. Just wait 1-2 mins for it to verify the file.
model_name = "google/flan-t5-base"
print(f"Loading {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
print("✅ Generator Loaded.")

# Define Generation Function
def generate_answer(question, context_docs):
    context_text = " ".join(context_docs)
    input_text = f"question: {question} context: {context_text}"
    
    input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024).input_ids.to(device)
    
    outputs = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# The Full RAG Pipeline
def run_rag(question):
    retrieved_docs = retrieve_documents(question, k=3)
    answer = generate_answer(question, retrieved_docs)
    return answer, retrieved_docs

# --- TEST THE SYSTEM (5 Random Examples) ---
print("\n---  RAG System Test (5 Random Samples) ---")

test_indices = random.sample(range(len(queries)), 5)

for i, idx in enumerate(test_indices):
    question = queries[idx]
    true_answer = answers[idx]
    
    print(f"\n[Example {i+1}]")
    print(f"Q: {question}")
    
    pred_ans, docs = run_rag(question)
    
    print(f"True Answer: {true_answer}")
    print(f"RAG Answer:  {pred_ans}")
    print("-" * 40)

Using device: cuda
Loading google/flan-t5-base...


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

2025-11-28 12:59:38.126663: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764334778.280052     117 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764334778.322830     117 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



✅ Generator Loaded.

---  RAG System Test (5 Random Samples) ---

[Example 1]
Q: Kerrygold Irish Cream Liqueur is owned by a company that mainly sells what type or product?
True Answer: dairy products
RAG Answer:  liqueur
----------------------------------------

[Example 2]
Q: Were Lee Ji-hye and Shim Eun-jin in the same band?
True Answer: no
RAG Answer:  no
----------------------------------------

[Example 3]
Q: Where is the Peace Palace Library located?
True Answer: It is located in The Hague, Netherlands, and was established to support the Permanent Court of Justice
RAG Answer:  The Hague
----------------------------------------

[Example 4]
Q: Scrabble and Sentinels of the Multiverse, is which form of entertainment?
True Answer: game
RAG Answer:  board game
----------------------------------------

[Example 5]
Q: A 2001 Indian epic sports-drama film starred an actor who was also know for his role in what 1999 movie?
True Answer: Sarfarosh
RAG Answer:  The Secret Life of Girls
---

In [4]:
# --- STEP 5 : Full RAG Evaluation ---

#  Install Metric Libraries
print("Installing evaluation libraries...")
!pip install torchmetrics bert_score > /dev/null

from torchmetrics.text.rouge import ROUGEScore
from torchmetrics.text.bleu import BLEUScore
from torchmetrics.text.bert import BERTScore
import torch
import numpy as np

#  Initialize Metrics
print("Loading Metrics (this may take a moment)...")
rouge = ROUGEScore()
bleu = BLEUScore()
# Use distilbert for speed
bert_scorer = BERTScore(model_name_or_path='distilbert-base-uncased', device=device)
print(" Metrics Ready.")

#  Helper: Retrieve Indices
def retrieve_indices(query, k=3):
    tokenized_query = query.split(" ")
    scores = bm25.get_scores(tokenized_query)
    top_n_indices = np.argsort(scores)[::-1][:k]
    return top_n_indices

#  Evaluation Loop
def evaluate_rag(num_samples=50):
    print(f"\n---  Evaluating RAG Pipeline ({num_samples} samples) ---")
    
    retrieval_hits = 0
    total_gold_docs = 0
    
    preds = []
    targets = []
    
    for i in range(num_samples):
        if i % 10 == 0: print(".", end="", flush=True)
        
        query = queries[i]
        gold_answer = answers[i]
        gold_titles = ground_truth_titles[i]
        
        # A. Retrieve
        top_indices = retrieve_indices(query, k=3)
        retrieved_titles = [corpus_ids[idx] for idx in top_indices]
        retrieved_texts = [corpus[idx] for idx in top_indices]
        
        # Calculate Recall
        hits = sum([1 for t in retrieved_titles if t in gold_titles])
        retrieval_hits += hits
        total_gold_docs += len(gold_titles)
        
        # B. Generate
        pred_ans = generate_answer(query, retrieved_texts)
        
        preds.append(pred_ans)
        targets.append(gold_answer)

    print(" Done!")
    
    #  Calculate Scores
    
    # Retrieval Score
    recall = retrieval_hits / total_gold_docs if total_gold_docs > 0 else 0
    print(f"\n Retrieval Recall@3: {recall:.2%}")
    
    # Generation Scores
    print(" Calculating Generation Metrics...")
    
    # BLEU
    bleu_targets = [[t] for t in targets]
    b_score = bleu(preds, bleu_targets).item() * 100
    
    # ROUGE
    r_score = rouge(preds, targets)
    
    # BERTScore
    bert_vals = bert_scorer(preds, targets)
    bert_f1 = bert_vals['f1'].mean().item() * 100
    
    print(f" BLEU:       {b_score:.2f}")
    print(f" ROUGE-L:    {r_score['rougeL_fmeasure'].item()*100:.2f}")
    print(f" BERTScore:  {bert_f1:.2f}")
    
    return preds, targets

# Run Evaluation on 50 samples
generated_answers, true_answers = evaluate_rag(num_samples=50)

# --- Qualitative Analysis (Task 4d) ---
print("\n---  Qualitative Analysis (5 Examples) ---")
for i in range(5): 
    print(f"\n[Example {i+1}]")
    print(f"Q: {queries[i]}")
    print(f"True: {true_answers[i]}")
    print(f"RAG:  {generated_answers[i]}")

Installing evaluation libraries...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading Metrics (this may take a moment)...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

 Metrics Ready.

---  Evaluating RAG Pipeline (50 samples) ---
..... Done!

 Retrieval Recall@3: 44.00%
 Calculating Generation Metrics...




model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

The following layers were not sharded: embeddings.position_embeddings.weight, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.ffn.lin*.bias, embeddings.LayerNorm.bias, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.attention.k_lin.bias, embeddings.word_embeddings.weight, transformer.layer.*.attention.out_lin.bias, transformer.layer.*.output_layer_norm.weight, transformer.layer.*.attention.k_lin.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.q_lin.weight, embeddings.LayerNorm.weight


 BLEU:       0.00
 ROUGE-L:    40.60
 BERTScore:  46.24

---  Qualitative Analysis (5 Examples) ---

[Example 1]
Q: Which magazine was started first Arthur's Magazine or First for Women?
True: Arthur's Magazine
RAG:  First for Women

[Example 2]
Q: The Oberoi family is part of a hotel company that has a head office in what city?
True: Delhi
RAG:  Delhi

[Example 3]
Q: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
True: President Richard Nixon
RAG:  President Richard Nixon

[Example 4]
Q:  What nationality was James Henry Miller's wife?
True: American
RAG:  Jamaican

[Example 5]
Q: Cadmium Chloride is slightly soluble in this chemical, it is also called what?
True: alcohol
RAG:  Benzamide


In [5]:
# --- STEP 6: Build Dense Retriever (Sentence-BERT) ---
from sentence_transformers import SentenceTransformer, util
import torch
import time

# Load Model
print("Loading Sentence-BERT model...")
dense_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Encode Corpus (The Knowledge Base)
print("Encoding Corpus ...")
start_time = time.time()

# We encode the corpus list we built in Step 2
corpus_embeddings = dense_model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

end_time = time.time()
print(f" Corpus Encoded in {end_time - start_time:.2f} seconds.")
print(f" Embedding Shape: {corpus_embeddings.shape}")

# Define Retrieval Function
def retrieve_dense(query, k=3):
    # Encode query
    query_embedding = dense_model.encode(query, convert_to_tensor=True)
    
    # Semantic Search (Cos Sim)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=k)
    
    # hits is a list of lists (one per query). We only have 1 query.
    top_hits = hits[0]
    
    # Return indices
    return [hit['corpus_id'] for hit in top_hits]

# Test
print("\n--- Dense Retrieval Test ---")
test_q = "Who directed the movie 'Silence of the Lambs'?" # Out of domain test
docs_idx = retrieve_dense(test_q, k=1)
print(f"Q: {test_q}")
print(f"Top Result: {corpus[docs_idx[0]][:200]}...")

Loading Sentence-BERT model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

The following layers were not sharded: encoder.layer.*.attention.self.query.bias, embeddings.position_embeddings.weight, encoder.layer.*.output.dense.weight, encoder.layer.*.attention.output.LayerNorm.weight, pooler.dense.weight, encoder.layer.*.attention.output.LayerNorm.bias, encoder.layer.*.output.dense.bias, encoder.layer.*.attention.output.dense.bias, encoder.layer.*.attention.self.query.weight, encoder.layer.*.output.LayerNorm.bias, pooler.dense.bias, encoder.layer.*.attention.output.dense.weight, embeddings.LayerNorm.bias, embeddings.token_type_embeddings.weight, embeddings.word_embeddings.weight, encoder.layer.*.attention.self.value.bias, encoder.layer.*.attention.self.key.weight, encoder.layer.*.attention.self.value.weight, encoder.layer.*.attention.self.key.bias, encoder.layer.*.intermediate.dense.bias, embeddings.LayerNorm.weight, encoder.layer.*.output.LayerNorm.weight, encoder.layer.*.intermediate.dense.weight


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding Corpus ...


Batches:   0%|          | 0/601 [00:00<?, ?it/s]

 Corpus Encoded in 23.15 seconds.
 Embedding Shape: torch.Size([19214, 384])

--- Dense Retrieval Test ---
Q: Who directed the movie 'Silence of the Lambs'?
Top Result: The Silence of the Lambs is a 1991 American horror-thriller film directed by Jonathan Demme and starring Jodie Foster, Anthony Hopkins, and Scott Glenn.  Adapted by Ted Tally from the 1988 novel of th...


In [6]:
# --- STEP 7 : Final Comparison (BM25 vs Dense) ---
import numpy as np
import pandas as pd

# Helper: Calculate Precision/Recall for Retrieval
def calculate_retrieval_metrics(retrieved_titles, gold_titles):
    if len(retrieved_titles) == 0: return 0.0, 0.0
    
    # Hits (Is the gold title in our retrieved list?)
    hits = sum([1 for t in retrieved_titles if t in gold_titles])
    
    # Precision = Hits / Retrieved
    precision = hits / len(retrieved_titles)
    
    # Recall = Hits / Total Relevant
    recall = hits / len(gold_titles) if len(gold_titles) > 0 else 0.0
    
    return precision, recall

def evaluate_pipeline(name, retrieve_fn, num_samples=50):
    print(f"\n--- Evaluating {name} ({num_samples} samples) ---")
    
    # Metric Accumulators
    total_precision = 0
    total_recall = 0
    preds = []
    targets = []
    
    print("Running...", end="", flush=True)
    for i in range(num_samples):
        if i % 10 == 0: print(".", end="", flush=True)
        
        query = queries[i]
        gold_ans = answers[i]
        gold_set = ground_truth_titles[i]
        
        # RETRIEVE
        if name == "BM25":
            top_indices = retrieve_indices(query, k=3)
        else:
            top_indices = retrieve_fn(query, k=3)
            
        # Get titles and text
        ret_titles = [corpus_ids[idx] for idx in top_indices]
        ret_texts = [corpus[idx] for idx in top_indices]
        
        # Calculate P/R
        p, r = calculate_retrieval_metrics(ret_titles, gold_set)
        total_precision += p
        total_recall += r
        
        # GENERATE
        pred_ans = generate_answer(query, ret_texts)
        preds.append(pred_ans)
        targets.append(gold_ans)
        
    print(" Done!")
    
    # Averages
    avg_p = total_precision / num_samples
    avg_r = total_recall / num_samples
    
    # Generation Metrics
    bleu_targets = [[t] for t in targets]
    
    b_score = bleu(preds, bleu_targets).item() * 100
    r_score = rouge(preds, targets)['rougeL_fmeasure'].item() * 100
    bert_f1 = bert_scorer(preds, targets)['f1'].mean().item() * 100
    
    return {
        "Precision@3": avg_p * 100,
        "Recall@3": avg_r * 100,
        "BLEU": b_score,
        "ROUGE-L": r_score,
        "BERTScore": bert_f1
    }

# --- RUN COMPARISON ---
results_table = {}

# 1. Evaluate BM25
results_table["BM25 + FLAN-T5"] = evaluate_pipeline("BM25", None, num_samples=50)

# 2. Evaluate Dense
results_table["Dense + FLAN-T5"] = evaluate_pipeline("Dense", retrieve_dense, num_samples=50)

# Print Final Table
print("\n\n===  FINAL RAG SCOREBOARD ===")
df = pd.DataFrame(results_table).T
print(df.round(2))


--- Evaluating BM25 (50 samples) ---
Running........ Done!


The following layers were not sharded: embeddings.position_embeddings.weight, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.ffn.lin*.bias, embeddings.LayerNorm.bias, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.attention.k_lin.bias, embeddings.word_embeddings.weight, transformer.layer.*.attention.out_lin.bias, transformer.layer.*.output_layer_norm.weight, transformer.layer.*.attention.k_lin.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.q_lin.weight, embeddings.LayerNorm.weight



--- Evaluating Dense (50 samples) ---
Running........ Done!


The following layers were not sharded: embeddings.position_embeddings.weight, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.ffn.lin*.bias, embeddings.LayerNorm.bias, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.attention.k_lin.bias, embeddings.word_embeddings.weight, transformer.layer.*.attention.out_lin.bias, transformer.layer.*.output_layer_norm.weight, transformer.layer.*.attention.k_lin.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.q_lin.weight, embeddings.LayerNorm.weight




===  FINAL RAG SCOREBOARD ===
                 Precision@3  Recall@3  BLEU  ROUGE-L  BERTScore
BM25 + FLAN-T5         29.33      44.0   0.0    40.60      46.24
Dense + FLAN-T5        44.00      66.0   0.0    49.67      48.76
