In [2]:
import sys
!{sys.executable} -m pip install nltk --upgrade --quiet


You should consider upgrading via the '/Users/mac/Documents/UR_RAG_Code/venv/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
import nltk
print("✅ NLTK version:", nltk.__version__)


✅ NLTK version: 3.9.2


In [7]:
!pip install -q pandas sacrebleu evaluate bert-score urduhack torch transformers stanza

You should consider upgrading via the '/Users/mac/Documents/UR_RAG_Code/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [10]:
# =============================================
# LangChain MultiVector Retriever - NLP Metrics Evaluation (Urdu, XLM-Roberta)
# =============================================



import pandas as pd
import nltk
import sacrebleu
from evaluate import load
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import torch
import stanza
import re

# ---------------------------------------------
# Urdu NLP Setup
# ---------------------------------------------
try:
    nlp = stanza.Pipeline('ur', processors='tokenize', use_gpu=torch.cuda.is_available(), verbose=False)
except:
    stanza.download('ur')
    nlp = stanza.Pipeline('ur', processors='tokenize', use_gpu=torch.cuda.is_available(), verbose=False)

# ---------------------------------------------
# Urdu Text Normalization
# ---------------------------------------------
def normalize_urdu_text(text):
    text = text.strip()
    # Remove diacritics and unwanted punctuation
    text = re.sub(r"[\u0610-\u061A\u064B-\u065F\u06D6-\u06ED]", "", text)
    text = re.sub(r"[\u06D4\u060C\u066B\u066C]", "", text)
    text = re.sub(r"\s+", " ", text)
    # Basic synonym normalization for tense consistency
    synonym_map = {"کیا": "کیا تھا", "ہے": "تھا", "ہیں": "تھے"}
    for k, v in synonym_map.items():
        text = text.replace(k, v)
    return text

# Urdu Tokenizer
def urdu_tokenizer(text):
    text = normalize_urdu_text(text)
    doc = nlp(text)
    return [word.text for sent in doc.sentences for word in sent.words]

# ---------------------------------------------
# Load Evaluation Metrics
# ---------------------------------------------
rouge = load("rouge")

# ---------------------------------------------
# Load LangChain MultiVector Results CSV
# ---------------------------------------------
csv_file = 'langchain_multivector_complete_results.csv'  # your generated file
df = pd.read_csv(csv_file, usecols=['langchain_refined_answer', 'answer'])
df = df.fillna("").astype(str)

references = [normalize_urdu_text(ref) for ref in df['answer'].tolist()]
multi_preds = [normalize_urdu_text(pred) for pred in df['langchain_refined_answer'].tolist()]

print(f"📂 Loaded {len(references)} Urdu samples for evaluation.")

# ---------------------------------------------
# Metric Calculation Functions
# ---------------------------------------------
def calculate_nltk_bleu(references, hypotheses):
    scores = []
    smooth = SmoothingFunction().method4
    for ref, hyp in zip(references, hypotheses):
        score = sentence_bleu([urdu_tokenizer(ref)], urdu_tokenizer(hyp), smoothing_function=smooth)
        scores.append(score * 100)
    return scores

def calculate_sacrebleu(references, hypotheses):
    return [sacrebleu.sentence_bleu(hyp, [ref]).score for ref, hyp in zip(references, hypotheses)]

def calculate_rouge(references, hypotheses):
    results = rouge.compute(
        predictions=hypotheses,
        references=references,
        tokenizer=lambda x: urdu_tokenizer(x),
        use_aggregator=False
    )
    return results['rouge1'], results['rouge2'], results['rougeL']

def calculate_meteor(references, hypotheses):
    ref_tok = [urdu_tokenizer(ref) for ref in references]
    hyp_tok = [urdu_tokenizer(hyp) for hyp in hypotheses]
    return [meteor_score([r], h) for r, h in zip(ref_tok, hyp_tok)]

def calculate_bert_score(references, hypotheses):
    # ✅ Using XLM-Roberta-Base for token-level similarity
    P, R, F1 = bert_score(
        cands=hypotheses,
        refs=references,
        lang="ur",
        model_type="xlm-roberta-base",
        rescale_with_baseline=True,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        verbose=True,
    )
    return P.tolist(), R.tolist(), F1.tolist()

def calculate_exact_match(references, hypotheses):
    return [1 if ref.strip() == hyp.strip() else 0 for ref, hyp in zip(references, hypotheses)]

# ---------------------------------------------
# Evaluation Function
# ---------------------------------------------
def evaluate_pipeline(name, references, predictions):
    print(f"\n🔍 Evaluating: {name}")

    nltk_bleu = calculate_nltk_bleu(references, predictions)
    sacrebleu_scores = calculate_sacrebleu(references, predictions)
    rouge1, rouge2, rougeL = calculate_rouge(references, predictions)
    meteor = calculate_meteor(references, predictions)
    exact_match = calculate_exact_match(references, predictions)
    bert_p, bert_r, bert_f1 = calculate_bert_score(references, predictions)

    print(f"✅ Scores calculated for: {name}")

    return pd.DataFrame({
        'Pipeline': [name] * len(references),
        'Ground Truth Answer': references,
        'Generated Answer': predictions,
        'BLEU': nltk_bleu,
        'SacreBLEU': sacrebleu_scores,
        'ROUGE-1': rouge1,
        'ROUGE-2': rouge2,
        'ROUGE-L': rougeL,
        'METEOR': meteor,
        'Exact Match': exact_match,
        'BERT Precision': bert_p,
        'BERT Recall': bert_r,
        'BERT F1': bert_f1
    })

# ---------------------------------------------
# Run Evaluation for LangChain MultiVector Retriever
# ---------------------------------------------
multi_results = evaluate_pipeline("LangChain MultiVector Retriever", references, multi_preds)

# ---------------------------------------------
# Summary Function
# ---------------------------------------------
def summarize_metrics(df, label):
    print(f"\n📈 === Average Scores for {label} ===")
    print(f"BLEU: {df['BLEU'].mean():.4f}")
    print(f"SacreBLEU: {df['SacreBLEU'].mean():.4f}")
    print(f"ROUGE-1: {df['ROUGE-1'].mean():.4f}")
    print(f"ROUGE-2: {df['ROUGE-2'].mean():.4f}")
    print(f"ROUGE-L: {df['ROUGE-L'].mean():.4f}")
    print(f"METEOR: {df['METEOR'].mean():.4f}")
    print(f"Exact Match: {df['Exact Match'].mean():.4f}")
    print(f"BERTScore F1: {df['BERT F1'].mean():.4f}")

# ---------------------------------------------
# Display Results & Save
# ---------------------------------------------
summarize_metrics(multi_results, "LangChain MultiVector Retriever")

output_csv = "LangChain_MultiVector_NLP_Evaluation_XLMR.csv"
multi_results.to_csv(output_csv, index=False, encoding="utf-8-sig")
print(f"\n✅ Results saved to: {output_csv}")


📂 Loaded 899 Urdu samples for evaluation.

🔍 Evaluating: LangChain MultiVector Retriever
calculating scores...
computing bert embedding.


100%|██████████| 18/18 [00:09<00:00,  1.88it/s]


computing greedy matching.


100%|██████████| 15/15 [00:00<00:00, 148.16it/s]


done in 9.67 seconds, 92.98 sentences/sec
✅ Scores calculated for: LangChain MultiVector Retriever

📈 === Average Scores for LangChain MultiVector Retriever ===
BLEU: 4.7063
SacreBLEU: 5.8257
ROUGE-1: 0.1569
ROUGE-2: 0.0890
ROUGE-L: 0.1549
METEOR: 0.2555
Exact Match: 0.0011
BERTScore F1: 0.8057

✅ Results saved to: LangChain_MultiVector_NLP_Evaluation_XLMR.csv


In [None]:
# =============================================
# LangChain MultiVector Retriever - NLP Metrics Evaluation (English, RoBERTa)
# =============================================

import pandas as pd
import nltk
import sacrebleu
from evaluate import load
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import torch
import re

# ---------------------------------------------
# English NLP Setup

import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)


def normalize_english_text(text):
    """Basic normalization: lowercase, strip punctuation, and extra spaces"""
    text = str(text).lower().strip()
    text = re.sub(r"[^a-z0-9\s.,!?']", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

def english_tokenizer(text):
    """Tokenize English text using NLTK"""
    return nltk.word_tokenize(text)

# ---------------------------------------------
# Load Evaluation Metrics
# ---------------------------------------------
rouge = load("rouge")

# ---------------------------------------------
# Load LangChain MultiVector Results CSV
# ---------------------------------------------
csv_file = 'langchain_multivector_english_results.csv'  # your generated file
df = pd.read_csv(csv_file, usecols=['langchain_refined_answer', 'answers'])
df = df.fillna("").astype(str)

references = [normalize_english_text(ref) for ref in df['answers'].tolist()]
multi_preds = [normalize_english_text(pred) for pred in df['langchain_refined_answer'].tolist()]

print(f"📂 Loaded {len(references)} English samples for evaluation.")

# ---------------------------------------------
# Metric Calculation Functions
# ---------------------------------------------
def calculate_nltk_bleu(references, hypotheses):
    scores = []
    smooth = SmoothingFunction().method4
    for ref, hyp in zip(references, hypotheses):
        ref_tokens = english_tokenizer(ref)
        hyp_tokens = english_tokenizer(hyp)
        score = sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smooth)
        scores.append(score * 100)
    return scores

def calculate_sacrebleu(references, hypotheses):
    return [sacrebleu.sentence_bleu(hyp, [ref]).score for ref, hyp in zip(references, hypotheses)]

def calculate_rouge(references, hypotheses):
    results = rouge.compute(
        predictions=hypotheses,
        references=references,
        tokenizer=lambda x: english_tokenizer(x),
        use_aggregator=False
    )
    return results['rouge1'], results['rouge2'], results['rougeL']

def calculate_meteor(references, hypotheses):
    ref_tok = [english_tokenizer(ref) for ref in references]
    hyp_tok = [english_tokenizer(hyp) for hyp in hypotheses]
    return [meteor_score([r], h) for r, h in zip(ref_tok, hyp_tok)]

def calculate_bert_score(references, hypotheses):
    # ✅ Using RoBERTa-Large for English semantic similarity
    P, R, F1 = bert_score(
        cands=hypotheses,
        refs=references,
        lang="en",
        model_type="roberta-large",
        rescale_with_baseline=True,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        verbose=True,
    )
    return P.tolist(), R.tolist(), F1.tolist()

def calculate_exact_match(references, hypotheses):
    return [1 if ref.strip() == hyp.strip() else 0 for ref, hyp in zip(references, hypotheses)]

# ---------------------------------------------
# Evaluation Function
# ---------------------------------------------
def evaluate_pipeline(name, references, predictions):
    print(f"\n🔍 Evaluating: {name}")

    nltk_bleu = calculate_nltk_bleu(references, predictions)
    sacrebleu_scores = calculate_sacrebleu(references, predictions)
    rouge1, rouge2, rougeL = calculate_rouge(references, predictions)
    meteor = calculate_meteor(references, predictions)
    exact_match = calculate_exact_match(references, predictions)
    bert_p, bert_r, bert_f1 = calculate_bert_score(references, predictions)

    print(f"✅ Scores calculated for: {name}")

    return pd.DataFrame({
        'Pipeline': [name] * len(references),
        'Ground Truth Answer': references,
        'Generated Answer': predictions,
        'BLEU': nltk_bleu,
        'SacreBLEU': sacrebleu_scores,
        'ROUGE-1': rouge1,
        'ROUGE-2': rouge2,
        'ROUGE-L': rougeL,
        'METEOR': meteor,
        'Exact Match': exact_match,
        'BERT Precision': bert_p,
        'BERT Recall': bert_r,
        'BERT F1': bert_f1
    })

# ---------------------------------------------
# Run Evaluation for LangChain MultiVector Retriever
# ---------------------------------------------
multi_results = evaluate_pipeline("LangChain MultiVector Retriever (English)", references, multi_preds)

# ---------------------------------------------
# Summary Function
# ---------------------------------------------
def summarize_metrics(df, label):
    print(f"\n📈 === Average Scores for {label} ===")
    print(f"BLEU: {df['BLEU'].mean():.4f}")
    print(f"SacreBLEU: {df['SacreBLEU'].mean():.4f}")
    print(f"ROUGE-1: {df['ROUGE-1'].mean():.4f}")
    print(f"ROUGE-2: {df['ROUGE-2'].mean():.4f}")
    print(f"ROUGE-L: {df['ROUGE-L'].mean():.4f}")
    print(f"METEOR: {df['METEOR'].mean():.4f}")
    print(f"Exact Match: {df['Exact Match'].mean():.4f}")
    print(f"BERTScore F1: {df['BERT F1'].mean():.4f}")

# ---------------------------------------------
# Display Results & Save
# ---------------------------------------------
summarize_metrics(multi_results, "LangChain MultiVector Retriever (English)")

output_csv = "LangChain_MultiVector_NLP_Evaluation_English.csv"
multi_results.to_csv(output_csv, index=False, encoding="utf-8-sig")
print(f"\n✅ Results saved to: {output_csv}")


📂 Loaded 1205 English samples for evaluation.

🔍 Evaluating: LangChain MultiVector Retriever (English)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 34/34 [00:53<00:00,  1.56s/it]


computing greedy matching.


100%|██████████| 19/19 [00:00<00:00, 95.89it/s]


done in 53.46 seconds, 22.54 sentences/sec
✅ Scores calculated for: LangChain MultiVector Retriever (English)

📈 === Average Scores for LangChain MultiVector Retriever (English) ===
BLEU: 11.3382
SacreBLEU: 15.0434
ROUGE-1: 0.3072
ROUGE-2: 0.1960
ROUGE-L: 0.3060
METEOR: 0.4747
Exact Match: 0.0498
BERTScore F1: 0.2160

✅ Results saved to: LangChain_MultiVector_NLP_Evaluation_English.csv


In [15]:
# =============================================
# LangChain MultiVector Retriever - NLP Metrics Evaluation (English, DeBERTa)
# =============================================

import pandas as pd
import nltk
import sacrebleu
from evaluate import load
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import torch
import re

# ---------------------------------------------
# English NLP Setup
# ---------------------------------------------
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

# ---------------------------------------------
# English Text Normalization
# ---------------------------------------------
def normalize_english_text(text):
    """Normalize English text by lowercasing, removing punctuation, and trimming spaces."""
    text = str(text).lower().strip()
    text = re.sub(r"[^a-z0-9\s.,!?']", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

def english_tokenizer(text):
    """Tokenize English text using NLTK."""
    return nltk.word_tokenize(text)

# ---------------------------------------------
# Load Evaluation Metrics
# ---------------------------------------------
rouge = load("rouge")

# ---------------------------------------------
# Load LangChain MultiVector Results CSV
# ---------------------------------------------
csv_file = 'langchain_multivector_english_results.csv'  # your results file
df = pd.read_csv(csv_file, usecols=['langchain_refined_answer', 'answers'])
df = df.fillna("").astype(str)

references = [normalize_english_text(ref) for ref in df['answers'].tolist()]
multi_preds = [normalize_english_text(pred) for pred in df['langchain_refined_answer'].tolist()]

print(f"📂 Loaded {len(references)} English samples for evaluation.")

# ---------------------------------------------
# Metric Calculation Functions
# ---------------------------------------------
def calculate_nltk_bleu(references, hypotheses):
    scores = []
    smooth = SmoothingFunction().method4
    for ref, hyp in zip(references, hypotheses):
        ref_tokens = english_tokenizer(ref)
        hyp_tokens = english_tokenizer(hyp)
        if len(hyp_tokens) == 0 or len(ref_tokens) == 0:
            scores.append(0)
            continue
        score = sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smooth)
        scores.append(score)  # scaled 0–1
    return scores

def calculate_sacrebleu(references, hypotheses):
    """Return SacreBLEU normalized between 0–1"""
    return [sacrebleu.sentence_bleu(hyp, [ref]).score / 100 for ref, hyp in zip(references, hypotheses)]

def calculate_rouge(references, hypotheses):
    """ROUGE scores computed using HF evaluate."""
    results = rouge.compute(
        predictions=hypotheses,
        references=references,
        use_aggregator=False
    )
    return results['rouge1'], results['rouge2'], results['rougeL']

def calculate_meteor(references, hypotheses):
    ref_tok = [english_tokenizer(ref) for ref in references]
    hyp_tok = [english_tokenizer(hyp) for hyp in hypotheses]
    scores = []
    for r, h in zip(ref_tok, hyp_tok):
        try:
            scores.append(meteor_score([r], h))
        except:
            scores.append(0)
    return scores

def calculate_bert_score(references, hypotheses):
    """Semantic similarity using DeBERTa-Xlarge-MNLI for English."""
    P, R, F1 = bert_score(
        cands=hypotheses,
        refs=references,
        lang="en",
        model_type="microsoft/deberta-xlarge-mnli",
        rescale_with_baseline=True,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        verbose=True,
    )
    return P.tolist(), R.tolist(), F1.tolist()

def calculate_exact_match(references, hypotheses):
    return [1 if ref.strip() == hyp.strip() else 0 for ref, hyp in zip(references, hypotheses)]

# ---------------------------------------------
# Evaluation Function
# ---------------------------------------------
def evaluate_pipeline(name, references, predictions):
    print(f"\n🔍 Evaluating: {name}")

    nltk_bleu = calculate_nltk_bleu(references, predictions)
    sacrebleu_scores = calculate_sacrebleu(references, predictions)
    rouge1, rouge2, rougeL = calculate_rouge(references, predictions)
    meteor = calculate_meteor(references, predictions)
    exact_match = calculate_exact_match(references, predictions)
    bert_p, bert_r, bert_f1 = calculate_bert_score(references, predictions)

    print(f"✅ Scores calculated for: {name}")

    return pd.DataFrame({
        'Pipeline': [name] * len(references),
        'Ground Truth Answer': references,
        'Generated Answer': predictions,
        'BLEU': nltk_bleu,
        'SacreBLEU': sacrebleu_scores,
        'ROUGE-1': rouge1,
        'ROUGE-2': rouge2,
        'ROUGE-L': rougeL,
        'METEOR': meteor,
        'Exact Match': exact_match,
        'BERT Precision': bert_p,
        'BERT Recall': bert_r,
        'BERT F1': bert_f1
    })

# ---------------------------------------------
# Run Evaluation for LangChain MultiVector Retriever
# ---------------------------------------------
multi_results = evaluate_pipeline("LangChain MultiVector Retriever (English)", references, multi_preds)

# ---------------------------------------------
# Summary Function
# ---------------------------------------------
def summarize_metrics(df, label):
    print(f"\n📈 === Average Scores for {label} ===")
    print(f"BLEU: {df['BLEU'].mean():.4f}")
    print(f"SacreBLEU: {df['SacreBLEU'].mean():.4f}")
    print(f"ROUGE-1: {df['ROUGE-1'].mean():.4f}")
    print(f"ROUGE-2: {df['ROUGE-2'].mean():.4f}")
    print(f"ROUGE-L: {df['ROUGE-L'].mean():.4f}")
    print(f"METEOR: {df['METEOR'].mean():.4f}")
    print(f"Exact Match: {df['Exact Match'].mean():.4f}")
    print(f"BERTScore F1: {df['BERT F1'].mean():.4f}")

# ---------------------------------------------
# Display Results & Save
# ---------------------------------------------
summarize_metrics(multi_results, "LangChain MultiVector Retriever (English)")

output_csv = "LangChain_MultiVector_NLP_Evaluation_English1.csv"
multi_results.to_csv(output_csv, index=False, encoding="utf-8-sig")
print(f"\n✅ Results saved to: {output_csv}")


📂 Loaded 1205 English samples for evaluation.

🔍 Evaluating: LangChain MultiVector Retriever (English)
calculating scores...
computing bert embedding.


100%|██████████| 34/34 [03:05<00:00,  5.47s/it]


computing greedy matching.


100%|██████████| 19/19 [00:00<00:00, 35.36it/s]


done in 186.51 seconds, 6.46 sentences/sec
✅ Scores calculated for: LangChain MultiVector Retriever (English)

📈 === Average Scores for LangChain MultiVector Retriever (English) ===
BLEU: 0.1134
SacreBLEU: 0.1504
ROUGE-1: 0.3262
ROUGE-2: 0.2115
ROUGE-L: 0.3249
METEOR: 0.4747
Exact Match: 0.0498
BERTScore F1: 0.1903

✅ Results saved to: LangChain_MultiVector_NLP_Evaluation_English1.csv
