<!-- Sentence Transformer performance -->

In [2]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd
import faiss
import os
from tqdm import tqdm

# Load test data
test_data = load_dataset("neural-bridge/rag-dataset-12000", split="test")
test_contexts = test_data["context"]

# Load pretrained sentence transformer
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Create storage directory
os.makedirs("test_faiss_store", exist_ok=True)

# Encode all test contexts
context_embeddings = []
batch_size = 32

print("Encoding contexts...")
for i in tqdm(range(0, len(test_contexts), batch_size)):
    batch = test_contexts[i:i+batch_size]
    emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
    context_embeddings.append(emb)

context_embeddings = np.vstack(context_embeddings)

# Save FAISS index
dimension = context_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(context_embeddings)
faiss.write_index(index, "test_faiss_store/context_index.faiss")

# Save mapping to retrieve text later
pd.DataFrame({"context": test_contexts}).to_csv("test_faiss_store/context_mapping.csv", index=False)

print("✅ FAISS index and context mapping saved to 'test_faiss_store/'")

In [4]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import pandas as pd
import faiss
import numpy as np
import torch
from tqdm import tqdm

# Load model and device
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

# Load test data
test_data = load_dataset("neural-bridge/rag-dataset-12000", split="test")

filtered_pairs = [(q, c) for q, c in zip(test_data["question"], test_data["context"]) if q and c]
test_questions, test_contexts = zip(*filtered_pairs)

# Load FAISS index + mapping (built using test_contexts)
index = faiss.read_index("test_faiss_store/context_index.faiss")
context_df = pd.read_csv("test_faiss_store/context_mapping.csv")

# Evaluation
def compute_metrics(model, k=3):
    correct = 0
    precision_scores = []
    recall_scores = []
    total = len(test_questions)

    for q, true_ctx in tqdm(zip(test_questions, test_contexts), total=total):
        # Get query embedding
        query_vec = model.encode(q, convert_to_numpy=True)
        query_vec = np.expand_dims(query_vec, axis=0)

        # Search FAISS
        _, indices = index.search(query_vec, k)
        retrieved = [context_df.iloc[i]["context"] for i in indices[0]]

        # Binary relevance
        match = [1 if r.strip() == true_ctx.strip() else 0 for r in retrieved]

        precision = sum(match) / k
        recall = 1.0 if any(match) else 0.0
        accuracy = 1 if any(match) else 0

        precision_scores.append(precision)
        recall_scores.append(recall)
        correct += accuracy

    return {
        f"Precision@{k}": round(np.mean(precision_scores), 4),
        f"Recall@{k}": round(np.mean(recall_scores), 4),
        f"Accuracy@{k}": round(correct / total, 4)
    }

# Run evaluation
metrics = compute_metrics(model, k=5)
print(metrics)

100%|██████████| 2399/2399 [00:27<00:00, 86.17it/s]

{'Precision@5': 0.1801, 'Recall@5': 0.9004, 'Accuracy@5': 0.9004}



