In [16]:
import faiss
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import numpy as np
import nltk
import torch

In [17]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

retriever = SentenceTransformer("/Users/likhit/Desktop/Projects/RAG/1fineeeeminilm_proj512_only_dense")


# Load FAISS index + mapping
index = faiss.read_index("/Users/likhit/Desktop/Projects/RAG/model/retriever/testmodel/chunks_test_faiss_store/context_index.faiss")
context_df = pd.read_csv("/Users/likhit/Desktop/Projects/RAG/model/retriever/testmodel/chunks_test_faiss_store/context_mapping.csv")

retriever.eval()
retriever.to(device)


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [18]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
generator = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
generator.to(device)
test_data = load_dataset("neural-bridge/rag-dataset-12000", split="test")
test_questions = test_data["question"]
test_answers = test_data["answer"]

In [19]:
def retrieve_top_k(question, k=5):
    query_vec = retriever.encode([question], convert_to_numpy=True)
    query_vec = np.expand_dims(query_vec, axis=0) if query_vec.ndim == 1 else query_vec

    _, indices = index.search(query_vec, k)
    
    retrieved_contexts = [context_df.iloc[i]["chunk"] for i in indices[0]]
    return retrieved_contexts


def build_prompt(question, contexts):
    prompt = f"Answer the following question using complete sentences based only on the given context.\n"
    prompt += "\n".join(contexts)
    prompt += f"\n\nQuestion: {question}\nAnswer:"
    return prompt


def generate_answer(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
    outputs = generator.generate(**inputs, max_length=512)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [20]:
def evaluate_model(k=5, max_samples=500):
    ems, f1s, rouges, bleus, inclusives = [], [], [], [], []
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smooth_fn = SmoothingFunction().method1

    for i in tqdm(range(min(len(test_questions), max_samples))):
        q, gt = test_questions[i], test_answers[i].strip()
        ctxs = retrieve_top_k(q, k)
        pred = generate_answer(build_prompt(q, ctxs)).strip()

        # Normalize
        gt_lower = gt.lower()
        pred_lower = pred.lower()

        # EM
        em = int(gt_lower == pred_lower)

        # F1
        gt_tokens = gt_lower.split()
        pred_tokens = pred_lower.split()
        common = set(gt_tokens) & set(pred_tokens)
        f1 = 2 * len(common) / (len(gt_tokens) + len(pred_tokens)) if (gt_tokens and pred_tokens) else 0.0

        # BLEU
        bleu = sentence_bleu([gt_tokens], pred_tokens, smoothing_function=smooth_fn)

        # ROUGE-L
        rouge = scorer.score(gt_lower, pred_lower)["rougeL"].fmeasure

        # Inclusive accuracy (substring match)
        inclusive = int(gt_lower in pred_lower or pred_lower in gt_lower)

        ems.append(em)
        f1s.append(f1)
        rouges.append(rouge)
        bleus.append(bleu)
        inclusives.append(inclusive)

    return {
        "Exact Match": round(np.mean(ems), 4),
        "F1 Score": round(np.mean(f1s), 4),
        "BLEU": round(np.mean(bleus), 4),
        "ROUGE-L": round(np.mean(rouges), 4),
        "Inclusive Accuracy": round(np.mean(inclusives), 4),
    }

# 7. Run evaluation
metrics = evaluate_model(k=5, max_samples=500)
print("📊 Final Evaluation Metrics:\n", metrics)

100%|██████████| 500/500 [10:57<00:00,  1.32s/it] 

📊 Final Evaluation Metrics:
 {'Exact Match': 0.01, 'F1 Score': 0.2409, 'BLEU': 0.0872, 'ROUGE-L': 0.2826, 'Inclusive Accuracy': 0.354}





In [22]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
generator = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Generate answer from prompt
def generate_answer1(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = generator.generate(**inputs, max_length=64)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Use LLM to compare answer with ground truth
def verify_equivalence(predicted, ground_truth):
    check_prompt = (
    f"Is the following short answer correct based on the full answer?\n"
    f"Short answer: {predicted}\n"
    f"Full answer: {ground_truth}\n"
    f"Reply with 'Yes' or 'No'."
    )
    result = generate_answer1(check_prompt).lower()
    return 1 if "yes" in result else 0

pred = "Fabien Gabel"
gt = "The music director of the Quebec Symphony Orchestra is Fabien Gabel."
is_equivalent = verify_equivalence(pred, gt)

print(f"Are they semantically same? {'✅ Yes' if is_equivalent else '❌ No'}")

Are they semantically same? ✅ Yes


In [23]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import torch
from tqdm import tqdm

# ✅ Setup model and device
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
generator = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# ✅ Helper: Generate LLM response
def generate_answer1(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = generator.generate(**inputs, max_length=64)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# ✅ Helper: Use LLM to check semantic equivalence
def verify_equivalence(predicted, ground_truth):
    check_prompt = (
        f"Is the following short answer correct based on the full answer?\n"
        f"Short answer: {predicted}\n"
        f"Full answer: {ground_truth}\n"
        f"Reply with 'Yes' or 'No'."
    )
    result = generate_answer1(check_prompt).lower()
    return 1 if "yes" in result else 0

# ✅ Load test data (you can replace with your own predictions if available)
dataset = load_dataset("neural-bridge/rag-dataset-12000", split="test")
questions = dataset["question"]
ground_truths = dataset["answer"]

# ⛔️ Example dummy predictions — replace with your generated answers!
predictions = ["Fabien Gabel"] * len(ground_truths)  # Replace this with actual model predictions

# ✅ Loop and evaluate semantic accuracy
correct = 0
total = min(len(predictions), len(ground_truths))

for pred, gt in tqdm(zip(predictions, ground_truths), total=total):
    correct += verify_equivalence(pred, gt)

semantic_accuracy = correct / total
print(f"\n📊 Semantic Accuracy (via LLM judgment): {semantic_accuracy:.4f}")

100%|██████████| 2400/2400 [10:49<00:00,  3.70it/s] 


📊 Semantic Accuracy (via LLM judgment): 0.0013



