In [68]:
!pip install pandas nltk sacrebleu evaluate bert-score urduhack torch transformers




In [69]:
import pandas as pd
import nltk
import sacrebleu
from evaluate import load
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import torch
import stanza
import re

In [None]:
#Urdu Text Normalization
def normalize_urdu_text(text):
    text = text.strip()
    text = re.sub(r"[\u0610-\u061A\u064B-\u065F\u06D6-\u06ED]", "", text)
    text = re.sub(r"[\u06D4\u060C\u066B\u066C]", "", text)  
    text = re.sub(r"\s+", " ", text) 
   
    synonym_map = {
        "کیا": "کیا تھا",
        "ہے": "تھا",
        "ہیں": "تھے"
    }
    for k, v in synonym_map.items():
        text = text.replace(k, v)
    return text


In [None]:
# Urdu Tokenizer
def urdu_tokenizer(text):
    text = normalize_urdu_text(text)
    doc = nlp(text)
    return [word.text for sent in doc.sentences for word in sent.words]

rouge = load("rouge")

In [None]:
# Load Data
csv_file = 'Final_Answers_Generated_llama3.1dual1.csv'
df = pd.read_csv(csv_file, usecols=[
    'modified_rag_refined_answer', 'traditional_rag_refined_answer', 'answer'
])
df = df.fillna("").astype(str)

references = [normalize_urdu_text(ref) for ref in df['answer'].tolist()]
mod_rag_preds = [normalize_urdu_text(pred) for pred in df['modified_rag_refined_answer'].tolist()]
trad_rag_preds = [normalize_urdu_text(pred) for pred in df['traditional_rag_refined_answer'].tolist()]


In [None]:
# Metric Calculation Functions
def calculate_nltk_bleu(references, hypotheses):
    scores = []
    smooth = SmoothingFunction().method4
    for ref, hyp in zip(references, hypotheses):
        score = sentence_bleu([urdu_tokenizer(ref)], urdu_tokenizer(hyp), smoothing_function=smooth)
        scores.append(score * 100)
    return scores

def calculate_sacrebleu(references, hypotheses):
    return [sacrebleu.sentence_bleu(hyp, [ref]).score for ref, hyp in zip(references, hypotheses)]

def calculate_rouge(references, hypotheses):
    results = rouge.compute(
        predictions=hypotheses,
        references=references,
        tokenizer=lambda x: urdu_tokenizer(x),
        use_aggregator=False
    )
    return results['rouge1'], results['rouge2'], results['rougeL']

def calculate_meteor(references, hypotheses):
    ref_tok = [urdu_tokenizer(ref) for ref in references]
    hyp_tok = [urdu_tokenizer(hyp) for hyp in hypotheses]
    return [meteor_score([r], h) for r, h in zip(ref_tok, hyp_tok)]

def calculate_bert_score(references, hypotheses):
    P, R, F1 = bert_score(
        cands=hypotheses,
        refs=references,
        lang="ur",
        model_type="xlm-roberta-base",
        rescale_with_baseline=True,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        verbose=True,
    )
    return P.tolist(), R.tolist(), F1.tolist()

def calculate_exact_match(references, hypotheses):
    return [1 if ref.strip() == hyp.strip() else 0 for ref, hyp in zip(references, hypotheses)]


In [None]:
# Evaluation Function
def evaluate_pipeline(name, references, predictions):
    print(f"\n🔍 Evaluating: {name}")

    nltk_bleu = calculate_nltk_bleu(references, predictions)
    sacrebleu_scores = calculate_sacrebleu(references, predictions)
    rouge1, rouge2, rougeL = calculate_rouge(references, predictions)
    meteor = calculate_meteor(references, predictions)
    exact_match = calculate_exact_match(references, predictions)
    bert_p, bert_r, bert_f1 = calculate_bert_score(references, predictions)

    print(f"✅ Scores calculated for: {name}")

    return pd.DataFrame({
        'Pipeline': [name] * len(references),
        'Ground Truth Answer': references,
        'Generated Answer': predictions,
        'BLEU': nltk_bleu,
        'SacreBLEU': sacrebleu_scores,
        'ROUGE-1': rouge1,
        'ROUGE-2': rouge2,
        'ROUGE-L': rougeL,
        'METEOR': meteor,
        'Exact Match': exact_match,
        'BERT Precision': bert_p,
        'BERT Recall': bert_r,
        'BERT F1': bert_f1
    })


In [None]:

# Evaluate Pipelines
mod_results = evaluate_pipeline("Modified RAG", references, mod_rag_preds)
trad_results = evaluate_pipeline("Traditional RAG", references, trad_rag_preds)

final_results_df = pd.concat([mod_results, trad_results], ignore_index=True)



🔍 Evaluating: Modified RAG
calculating scores...
computing bert embedding.


100%|██████████| 21/21 [00:10<00:00,  1.93it/s]


computing greedy matching.


100%|██████████| 15/15 [00:00<00:00, 90.06it/s]


done in 11.05 seconds, 81.39 sentences/sec
✅ Scores calculated for: Modified RAG

🔍 Evaluating: Traditional RAG
calculating scores...
computing bert embedding.


100%|██████████| 21/21 [00:11<00:00,  1.90it/s]


computing greedy matching.


100%|██████████| 15/15 [00:00<00:00, 117.32it/s]


done in 11.19 seconds, 80.36 sentences/sec
✅ Scores calculated for: Traditional RAG


In [None]:
# Summary Function
def summarize_metrics(df, label):
    print(f"\n📈 === Average Scores for {label} ===")
    print(f"BLEU: {df['BLEU'].mean():.4f}")
    print(f"SacreBLEU: {df['SacreBLEU'].mean():.4f}")
    print(f"ROUGE-1: {df['ROUGE-1'].mean():.4f}")
    print(f"ROUGE-2: {df['ROUGE-2'].mean():.4f}")
    print(f"ROUGE-L: {df['ROUGE-L'].mean():.4f}")
    print(f"METEOR: {df['METEOR'].mean():.4f}")
    print(f"Exact Match: {df['Exact Match'].mean():.4f}")
    print(f"BERTScore F1: {df['BERT F1'].mean():.4f}")

summarize_metrics(mod_results, "Modified RAG")
summarize_metrics(trad_results, "Traditional RAG")


📈 === Average Scores for Modified RAG ===
BLEU: 0.6452
SacreBLEU: 1.2095
ROUGE-1: 0.0373
ROUGE-2: 0.0071
ROUGE-L: 0.0362
METEOR: 0.0370
Exact Match: 0.0000
BERTScore F1: 0.7899

📈 === Average Scores for Traditional RAG ===
BLEU: 1.5997
SacreBLEU: 2.5384
ROUGE-1: 0.0674
ROUGE-2: 0.0185
ROUGE-L: 0.0663
METEOR: 0.0815
Exact Match: 0.0067
BERTScore F1: 0.7949


In [None]:
# Combine & Save Both Pipelines into One CSV
final_results_df = pd.concat([mod_results, trad_results], ignore_index=True)

final_results_df.to_csv("Evaluation_Results_Modified_vs_Traditional.csv", index=False)
print("✅ Results saved to: Evaluation_Results_Modified_vs_Traditional.csv")


✅ Results saved to: Evaluation_Results_Modified_vs_Traditional.csv
