In [1]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from datasets import load_dataset
import pandas as pd
import faiss
import numpy as np
from tqdm import tqdm

class DPRRetriever(nn.Module):
    def __init__(self, model_name="bert-base-uncased", proj_dim=512):
        super(DPRRetriever, self).__init__()

        self.query_encoder = BertModel.from_pretrained(model_name)
        self.passage_encoder = BertModel.from_pretrained(model_name)
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

        # Freeze BERT parameters
        for param in self.query_encoder.parameters():
            param.requires_grad = False
        for param in self.passage_encoder.parameters():
            param.requires_grad = False

        # Add trainable projection layers: 768 → 512
        self.query_proj = nn.Linear(self.query_encoder.config.hidden_size, proj_dim)
        self.passage_proj = nn.Linear(self.passage_encoder.config.hidden_size, proj_dim)

    def encode_query(self, texts, device):
        with torch.no_grad():
            encoding = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            outputs = self.query_encoder(input_ids=input_ids, attention_mask=attention_mask)
            cls_token = outputs.last_hidden_state[:, 0]  # CLS token
        return self.query_proj(cls_token)  # Trainable layer

    def encode_passage(self, texts, device):
        with torch.no_grad():
            encoding = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            outputs = self.passage_encoder(input_ids=input_ids, attention_mask=attention_mask)
            cls_token = outputs.last_hidden_state[:, 0]
        return self.passage_proj(cls_token)  # Trainable layer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

model = DPRRetriever()
model.load_state_dict(torch.load("/Users/likhit/Desktop/Projects/RAG/model/retriever/testmodel/dpr_model.pt", map_location=device))
model.to(device)
model.eval()

  model.load_state_dict(torch.load("/Users/likhit/Desktop/Projects/RAG/model/retriever/testmodel/dpr_model.pt", map_location=device))


DPRRetriever(
  (query_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [3]:
# from datasets import load_dataset
# import torch
# import numpy as np
# import pandas as pd
# import faiss
# import os
# from tqdm import tqdm

# # Load test data
# test_data = load_dataset("neural-bridge/rag-dataset-12000", split="test")
# test_contexts = test_data["context"]

# # Filter out any None or empty context entries
# test_contexts = [c for c in test_contexts if c]

# # Use your already-loaded fine-tuned DPR model
# # Make sure `model = DPRRetriever()` is already defined and loaded

# # Create storage directory
# store_dir = "dpr_faiss_store_512"
# os.makedirs(store_dir, exist_ok=True)

# # Encode all test contexts using DPR model
# context_embeddings = []
# batch_size = 32

# print("Encoding test contexts with DPR model...")
# model.eval()
# with torch.no_grad():
#     for i in tqdm(range(0, len(test_contexts), batch_size)):
#         batch = test_contexts[i:i+batch_size]
#         embs = model.encode_passage(batch, device).cpu().numpy()
#         context_embeddings.append(embs)

# context_embeddings = np.vstack(context_embeddings)

# # Save FAISS index
# dimension = context_embeddings.shape[1]  # should match model projection (e.g. 512)
# index = faiss.IndexFlatL2(dimension)
# index.add(context_embeddings)
# faiss.write_index(index, f"{store_dir}/context_index.faiss")

# # Save context mapping
# pd.DataFrame({"context": test_contexts}).to_csv(f"{store_dir}/context_mapping.csv", index=False)

# print(f"✅ DPR FAISS index and mapping saved to '{store_dir}/'")

In [4]:
# Load test set
test_data = load_dataset("neural-bridge/rag-dataset-12000", split="test")
test_questions = test_data["question"]
test_contexts = test_data["context"]

# Filter out any invalid (None or empty) question-context pairs
filtered = [(q, c) for q, c in zip(test_questions, test_contexts) if isinstance(q, str) and q.strip() and isinstance(c, str) and c.strip()]
test_questions, test_contexts = zip(*filtered)


# Load FAISS index and context mapping
index = faiss.read_index("dpr_faiss_store_512/context_index.faiss")
context_df = pd.read_csv("dpr_faiss_store_512/context_mapping.csv")

# Evaluation
def compute_metrics(k=3):
    correct = 0
    total = len(test_questions)
    precision_scores = []
    recall_scores = []

    for question, true_context in tqdm(zip(test_questions, test_contexts), total=total):
        with torch.no_grad():
            query_vec = model.encode_query([question], device).cpu().numpy()
        # print("query_vec shape:", query_vec.shape)
        _, indices = index.search(query_vec, k)
        retrieved_contexts = [context_df.iloc[i]["context"] for i in indices[0]]

        match = [1 if true_context.strip() == r.strip() else 0 for r in retrieved_contexts]

        precision = sum(match) / k
        recall = 1.0 if any(match) else 0.0
        accuracy = 1 if any(match) else 0

        precision_scores.append(precision)
        recall_scores.append(recall)
        correct += accuracy

    avg_precision = np.mean(precision_scores)
    avg_recall = np.mean(recall_scores)
    accuracy = correct / total

    return {
        "Precision@%d" % k: round(avg_precision, 4),
        "Recall@%d" % k: round(avg_recall, 4),
        "Accuracy@%d" % k: round(accuracy, 4)
    }

# Run
metrics = compute_metrics(k=5)
print(metrics)

100%|██████████| 2399/2399 [00:37<00:00, 63.47it/s]

{'Precision@5': 0.0003, 'Recall@5': 0.0017, 'Accuracy@5': 0.0017}





In [16]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import faiss
from tqdm import tqdm
import torch
from sentence_transformers import SentenceTransformer

# Load fine-tuned model
model = SentenceTransformer("/Users/likhit/Desktop/Projects/RAG/Projection_head")

# Load test set
test_data = load_dataset("neural-bridge/rag-dataset-12000", split="test")
test_questions = test_data["question"]
test_contexts = test_data["context"]

# Filter invalid samples
filtered = [(q, c) for q, c in zip(test_questions, test_contexts) if isinstance(q, str) and q.strip() and isinstance(c, str) and c.strip()]
test_questions, test_contexts = zip(*filtered)

# Load FAISS index + mapping
index = faiss.read_index("/Users/likhit/Desktop/Projects/RAG/model/retriever/testmodel/dpr_faiss_store_512/context_index.faiss")
context_df = pd.read_csv("/Users/likhit/Desktop/Projects/RAG/model/retriever/testmodel/dpr_faiss_store_512/context_mapping.csv")

# Evaluate
def compute_metrics(k=5):
    correct = 0
    total = len(test_questions)
    precision_scores = []
    recall_scores = []

    for question, true_context in tqdm(zip(test_questions, test_contexts), total=total):
        query_vec = model.encode([question], convert_to_numpy=True)
        query_vec = np.expand_dims(query_vec, axis=0) if query_vec.ndim == 1 else query_vec

        _, indices = index.search(query_vec, k)
        retrieved_contexts = [context_df.iloc[i]["context"] for i in indices[0]]

        match = [1 if true_context.strip() == r.strip() else 0 for r in retrieved_contexts]

        precision = sum(match) / k
        recall = 1.0 if any(match) else 0.0
        accuracy = 1 if any(match) else 0

        precision_scores.append(precision)
        recall_scores.append(recall)
        correct += accuracy

    return {
        f"Precision@{k}": round(np.mean(precision_scores), 4),
        f"Recall@{k}": round(np.mean(recall_scores), 4),
        f"Accuracy@{k}": round(correct / total, 4)
    }

# Run
metrics = compute_metrics(k=3)
print(metrics)

100%|██████████| 2399/2399 [00:27<00:00, 86.19it/s]

{'Precision@3': 0.0003, 'Recall@3': 0.0008, 'Accuracy@3': 0.0008}





In [5]:
# {'Precision@5': 0.0037, 'Recall@5': 0.0183, 'Accuracy@5': 0.0183}

# {'Precision@5': 0.0004, 'Recall@5': 0.0021, 'Accuracy@5': 0.0021}

# {'Precision@5': 0.0003, 'Recall@5': 0.0017, 'Accuracy@5': 0.0017}

# {'Precision@5': 0.0005, 'Recall@5': 0.0025, 'Accuracy@5': 0.0025}