In [1]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import sacrebleu
from evaluate import load
from bert_score import score
from nltk.translate.meteor_score import meteor_score

In [2]:
import stanza
stanza.download("ur")
nlp = stanza.Pipeline("ur")

def urdu_tokenizer(text):
    doc = nlp(text)
    return [word.text for sent in doc.sentences for word in sent.words]


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-16 03:06:08 INFO: Downloaded file to C:\Users\hamma\stanza_resources\resources.json
2025-05-16 03:06:08 INFO: Downloading default packages for language: ur (Urdu) ...
2025-05-16 03:06:10 INFO: File exists: C:\Users\hamma\stanza_resources\ur\default.zip
2025-05-16 03:06:16 INFO: Finished downloading models and saved to C:\Users\hamma\stanza_resources
2025-05-16 03:06:16 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-16 03:06:16 INFO: Downloaded file to C:\Users\hamma\stanza_resources\resources.json
2025-05-16 03:06:18 INFO: Loading these models for language: ur (Urdu):
| Processor | Package       |
-----------------------------
| tokenize  | udtb          |
| pos       | udtb_nocharlm |
| lemma     | udtb_nocharlm |
| depparse  | udtb_nocharlm |

2025-05-16 03:06:19 INFO: Using device: cuda
2025-05-16 03:06:19 INFO: Loading: tokenize
2025-05-16 03:06:20 INFO: Loading: pos
2025-05-16 03:06:22 INFO: Loading: lemma
2025-05-16 03:06:22 INFO: Loading: depparse
2025-05-16 03:06:22 INFO: Done loading processors!


In [3]:
rouge = load("rouge")

In [4]:
# --- Load Your CSVs ---
vanilla_df = pd.read_csv("E:\\Study\\Thesis work\\Multihop for Urdu\\code\\evaluation\\processed results\\simple_rag_qna_results.csv")
lqr_df = pd.read_csv("E:\\Study\\Thesis work\\Multihop for Urdu\\code\\evaluation\\processed results\\LQR_processed_results_with_Alif.csv")
mod_lqr_df = pd.read_csv("E:\\Study\\Thesis work\\Multihop for Urdu\\code\\evaluation\\processed results\\modLQR_processed_results_with_Alif.csv")

# --- Ground Truth ---
references = vanilla_df['translated_answer'].astype(str).tolist()

# --- Predictions ---
vanilla_preds = vanilla_df['final_answer'].astype(str).tolist()
lqr_preds = lqr_df['final_answer'].astype(str).tolist()
mod_lqr_preds = mod_lqr_df['final_answer'].astype(str).tolist()

In [5]:
# --- Define Metric Functions ---
def calculate_nltk_bleu(references, hypotheses):
    scores = []
    smooth = SmoothingFunction().method1
    for ref, hyp in zip(references, hypotheses):
        score = sentence_bleu([ref.split()], hyp.split(), smoothing_function=smooth)
        scores.append(score * 100)
    return scores

def calculate_sacrebleu(references, hypotheses):
    return [sacrebleu.sentence_bleu(hyp, [ref]).score for ref, hyp in zip(references, hypotheses)]

def calculate_rouge(references, hypotheses):
    results = rouge.compute(
        predictions=hypotheses,
        references=references,
        tokenizer=lambda x: x.split(),
        use_aggregator=False
    )
    return results['rouge1'], results['rouge2'], results['rougeL']

def calculate_meteor(references, hypotheses, urdu_tokenizer):
    tokenized_references = [urdu_tokenizer(ref) for ref in references]
    tokenized_hypotheses = [urdu_tokenizer(hyp) for hyp in hypotheses]
    return [meteor_score([ref], hyp) for ref, hyp in zip(tokenized_references, tokenized_hypotheses)]

def calculate_bert_score(references, hypotheses):
    P, R, F1 = score(hypotheses, references, lang="ur", model_type="xlm-roberta-large")
    return P.tolist(), R.tolist(), F1.tolist()

def calculate_exact_match(references, hypotheses):
    return [1 if ref.strip() == hyp.strip() else 0 for ref, hyp in zip(references, hypotheses)]


In [6]:
def evaluate_pipeline(name, references, predictions):
    print(f"\n🚀 Evaluating pipeline: {name}")

    # --- Metric Calculations ---
    nltk_bleu = calculate_nltk_bleu(references, predictions)
    sacrebleu_scores = calculate_sacrebleu(references, predictions)
    rouge1, rouge2, rougeL = calculate_rouge(references, predictions)
    meteor = calculate_meteor(references, predictions, urdu_tokenizer)
    exact_match = calculate_exact_match(references, predictions)
    bert_p, bert_r, bert_f1 = calculate_bert_score(references, predictions)

    print(f"✅ All scores calculated for: {name}")

    # --- Build DataFrame ---
    data = {
        'pipeline': [name] * len(references),
        'reference': references,
        'prediction': predictions,
        'nltk_bleu': nltk_bleu,
        'sacrebleu': sacrebleu_scores,
        'rouge1': rouge1,
        'rouge2': rouge2,
        'rougeL': rougeL,
        'meteor': meteor,
        'exact_match': exact_match,
        'bert_precision': bert_p,
        'bert_recall': bert_r,
        'bert_f1': bert_f1
    }

    return pd.DataFrame(data)


In [None]:
# Run evaluation for all 3 pipelines
vanilla_results = evaluate_pipeline("Vanilla RAG", references, vanilla_preds)
lqr_results = evaluate_pipeline("LQR", references, lqr_preds)
mod_lqr_results = evaluate_pipeline("Modified LQR", references, mod_lqr_preds)

# Combine all detailed results
all_results_df = pd.concat([vanilla_results, lqr_results, mod_lqr_results], ignore_index=True)

# Preview or save
print("\n📋 Sample of evaluation results:")
print(all_results_df.head())

# Optional: Save to CSV
all_results_df.to_csv("all_rag_nlp_evaluation_results_with_alif.csv", index=False)
