In [6]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sacrebleu import sentence_bleu as sacre_sentence_bleu
from evaluate import load
from nltk.translate.meteor_score import meteor_score
from bert_score import score
from rouge_score import rouge_scorer, scoring

import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Load the Rouge scorer
import evaluate

# This will now load from Hugging Face, not your local dir
rouge = evaluate.load("rouge")

def calculate_nltk_bleu(references, hypotheses):
    scores = []
    smooth = SmoothingFunction().method1
    for ref, hyp in zip(references, hypotheses):
        score_val = sentence_bleu([ref.split()], hyp.split(), smoothing_function=smooth)
        scores.append(score_val * 100)
    return scores

def calculate_sacrebleu(references, hypotheses):
    return [sacre_sentence_bleu(hyp, [ref]).score for ref, hyp in zip(references, hypotheses)]

def calculate_rouge(references, hypotheses):
    results = rouge.compute(
        predictions=hypotheses,
        references=references,
        tokenizer=lambda x: x.split(),  # default whitespace tokenizer for English
        use_aggregator=False
    )
    return results['rouge1'], results['rouge2'], results['rougeL']

def calculate_meteor(references, hypotheses):
    return [
        meteor_score([ref.split()], hyp.split())
        for ref, hyp in zip(references, hypotheses)
    ]

def calculate_bert_score(references, hypotheses):
    # Use English language BERTScore
    P, R, F1 = score(hypotheses, references, lang="en", model_type="roberta-large")
    return P.tolist(), R.tolist(), F1.tolist()

def calculate_exact_match(references, hypotheses):
    return [1 if ref.strip() == hyp.strip() else 0 for ref, hyp in zip(references, hypotheses)]


FileNotFoundError: Couldn't find a module script at c:\hammad workings\Thesis\Multihop_Project\MultiHop_Query_Handling_in_RAG\MultiHop for English\code\evaluation_scripts\rouge\rouge.py. Module 'rouge' doesn't exist on the Hugging Face Hub either.

In [1]:
import pandas as pd
# --- Load Your CSVs --
vanilla_df = pd.read_csv("../../../Dataset_code_csvs/hotpotQA/hotpotQA_dataset_versions/5884paras_598queries/English/598_QnAs.csv")
simple_rag_df=pd.read_csv("../../results/pipeline results/5884paras_598qna/llama3.1_res/simple_rag_qna_results_GPU_version.csv")
lqr_df = pd.read_csv("../../results/pipeline results/5884paras_598qna/llama3.1_res/LQR_processed_results_en.csv")
mod_lqr_df = pd.read_csv("../../results/pipeline results/5884paras_598qna/llama3.1_res/modLQR_processed_results_en.csv")

# --- Ground Truth ---
references = vanilla_df['answer'].astype(str).tolist()

# --- Predictions ---
vanilla_preds = simple_rag_df['final_answer'].astype(str).tolist()
lqr_preds = lqr_df['final_answer'].astype(str).tolist()
mod_lqr_preds = mod_lqr_df['final_answer'].astype(str).tolist()

In [13]:
# import pandas as pd

# # Load the CSV
# simple_rag_df = pd.read_csv("../../results/pipeline results/5884paras_598qna/simple_rag_qna_results_GPU_version.csv")

# # Check for duplicated index (if index is not set explicitly, use row number)
# duplicates = simple_rag_df[simple_rag_df.duplicated(keep=False)]

# print(f"🔍 Total duplicated rows: {duplicates.shape[0]}")
# print(duplicates)


🔍 Total duplicated rows: 0
Empty DataFrame
Columns: [level, question, answer, context, retrieved_context, final_answer, retriever_time, generator_time, total_time]
Index: []


In [2]:
def evaluate_pipeline(name, references, predictions):
    print(f"\n🚀 Evaluating pipeline: {name}")

    # Calculate all scores
    nltk_bleu = calculate_nltk_bleu(references, predictions)
    sacrebleu_scores = calculate_sacrebleu(references, predictions)
    rouge1, rouge2, rougeL = calculate_rouge(references, predictions)
    meteor = calculate_meteor(references, predictions)  # ✅ Updated: no tokenizer needed
    exact_match = calculate_exact_match(references, predictions)
    bert_p, bert_r, bert_f1 = calculate_bert_score(references, predictions)

    print(f"✅ All scores calculated for: {name}")

    # Create and return DataFrame
    return pd.DataFrame({
        'question': vanilla_df['question'],  # Aligning with each row
        f'{name}_nltk_bleu': nltk_bleu,
        f'{name}_sacrebleu': sacrebleu_scores,
        f'{name}_rouge1': rouge1,
        f'{name}_rouge2': rouge2,
        f'{name}_rougeL': rougeL,
        f'{name}_meteor': meteor,
        f'{name}_exact_match': exact_match,
        f'{name}_bert_precision': bert_p,
        f'{name}_bert_recall': bert_r,
        f'{name}_bert_f1': bert_f1
    })


In [3]:
vanilla_scores = evaluate_pipeline("Vanilla_RAG", references, vanilla_preds)
lqr_scores = evaluate_pipeline("LQR_RAG", references, lqr_preds)
mod_lqr_scores = evaluate_pipeline("modLQR_RAG", references, mod_lqr_preds)



🚀 Evaluating pipeline: Vanilla_RAG


NameError: name 'calculate_nltk_bleu' is not defined

In [28]:
import pandas as pd
from scipy import stats

# Slice to exclude first 98 records (keep 99-end)
vanilla_subset = vanilla_scores.iloc[98:]
lqr_subset = lqr_scores.iloc[98:]
mod_lqr_subset = mod_lqr_scores.iloc[98:]

# List of all metrics
metrics = [
    'nltk_bleu', 'sacrebleu', 'rouge1', 'rouge2', 'rougeL',
    'meteor', 'exact_match', 'bert_precision', 'bert_recall', 'bert_f1'
]

# Create summary DataFrame
results = pd.DataFrame(index=metrics, columns=['Vanilla', 'LQR', 'modLQR'])

# Calculate means for each pipeline (excluding first 98)
for metric in metrics:
    results.loc[metric, 'Vanilla'] = vanilla_subset[f'Vanilla_RAG_{metric}'].mean()
    results.loc[metric, 'LQR'] = lqr_subset[f'LQR_RAG_{metric}'].mean()
    results.loc[metric, 'modLQR'] = mod_lqr_subset[f'modLQR_RAG_{metric}'].mean()

# Statistical testing on the subset
p_values = []
for metric in metrics:
    _, p_val = stats.ttest_rel(
        lqr_subset[f'LQR_RAG_{metric}'],
        mod_lqr_subset[f'modLQR_RAG_{metric}']
    )
    p_values.append(f"{p_val:.4f}{'*' if p_val < 0.05 else ''}")

results['LQR_vs_modLQR_p'] = p_values

# Format and display
pd.set_option('display.float_format', lambda x: f"{x:.4f}")
print("\n📊 Mean Scores (Records 99-598 Only):")
print(results)
print(f"\nAnalyzing {len(vanilla_subset)} records (99-598)")


📊 Mean Scores (Records 99-598 Only):
               Vanilla     LQR  modLQR LQR_vs_modLQR_p
nltk_bleu       8.5736  6.9289  5.8009         0.0414*
sacrebleu      23.5059 20.6129 18.5155          0.1596
rouge1          0.2699  0.2365  0.2250          0.4578
rouge2          0.1271  0.1059  0.0893          0.1524
rougeL          0.2698  0.2356  0.2240          0.4556
meteor          0.2437  0.2151  0.2200          0.7196
exact_match     0.1660  0.1380  0.1040         0.0321*
bert_precision  0.5911  0.5297  0.5028         0.0241*
bert_recall     0.6307  0.6144  0.6223          0.4541
bert_f1         0.6044  0.5597  0.5451          0.1820

Analyzing 500 records (99-598)
