# ROUGE score with rouge_score library

## Model 1 ROUGE score

In [1]:
import pandas as pd
import re
from rouge_score import rouge_scorer, scoring

# Text Cleaning Function
def clean_text(text, remove_punctuation=False):
    """Normalize a text value before it is passed to the ROUGE scorer.

    The value is converted to ``str``, lower-cased, optionally stripped of
    punctuation, and every run of whitespace is collapsed to one space with
    no leading or trailing whitespace left over.

    Args:
        text: Value to clean. ``None`` and float NaN are treated as missing
            and yield an empty string.
        remove_punctuation: When true, drop every character that is neither
            a word character nor whitespace.

    Returns:
        The cleaned string.
    """
    # Missing values would otherwise be stringified to "none"/"nan" and
    # scored as if they were real words.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    if remove_punctuation:
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation if needed
    # Collapse whitespace last: deleting punctuation can leave doubled or
    # trailing spaces behind (e.g. "a , b !" -> "a  b ").
    return re.sub(r'\s+', ' ', text).strip()

# Load data
df = pd.read_csv(r"C:\Users\LENOVO\Desktop\Sample.csv")

# Clean and rename columns (headers are stripped first so the keys below match)
df.columns = df.columns.str.strip()
df = df.rename(columns={
    'Expected Answer': 'reference',
    'Initial Answer from Base Model (Mistral 7B)': 'base_model',
    'Initial Answer from LLM (gpt-4o-mini)': 'gpt4omini',
    'Finetuned Mistral Model 1 (100 Tokens)': 'fine_tuned'
})

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Bootstrap aggregators, one per system; each receives the scores of every scored row
aggregator_fine = scoring.BootstrapAggregator()
aggregator_base = scoring.BootstrapAggregator()
aggregator_gpt = scoring.BootstrapAggregator()

# Counters: rows where the fine-tuned ROUGE-L F1 is strictly higher, and rows scored
fine_tuned_better_base = 0
fine_tuned_better_gpt = 0
total_samples = 0

with open("rouge_scores_model_1.txt", "w", encoding="utf-8") as f:
    for i, row in df.iterrows():
        # Clean the text fields; missing cells become empty strings
        reference = clean_text(row['reference']) if pd.notna(row['reference']) else ""
        base_model = clean_text(row['base_model']) if pd.notna(row['base_model']) else ""
        gpt4o = clean_text(row['gpt4omini']) if pd.notna(row['gpt4omini']) else ""
        fine_tuned = clean_text(row['fine_tuned']) if pd.notna(row['fine_tuned']) else ""

        # Skip rows with no reference or with no answer from any system.
        # A row where only some answers are missing is still scored, with the
        # empty answers scored against the reference as empty strings.
        if reference == "" or (base_model == "" and gpt4o == "" and fine_tuned == ""):
            continue

        # Score each model
        score_base = scorer.score(reference, base_model)
        score_gpt = scorer.score(reference, gpt4o)
        score_fine = scorer.score(reference, fine_tuned)

        aggregator_base.add_scores(score_base)
        aggregator_gpt.add_scores(score_gpt)
        aggregator_fine.add_scores(score_fine)

        # Compare ROUGE-L F1 (ties do not count as a win)
        if score_fine['rougeL'].fmeasure > score_base['rougeL'].fmeasure:
            fine_tuned_better_base += 1
        if score_fine['rougeL'].fmeasure > score_gpt['rougeL'].fmeasure:
            fine_tuned_better_gpt += 1

        total_samples += 1

        # Write sample results; the label is the DataFrame index value plus one,
        # so skipped rows leave gaps in the numbering
        f.write(f"Sample {i+1}:\n")
        f.write(f"fine_tuned: ROUGE-L = {score_fine['rougeL'].fmeasure:.4f}\n")
        f.write(f"base_model: ROUGE-L = {score_base['rougeL'].fmeasure:.4f}\n")
        f.write(f"gpt4omini: ROUGE-L = {score_gpt['rougeL'].fmeasure:.4f}\n\n")

    f.write("="*60 + "\n")
    if total_samples == 0:
        # Nothing was scored: the percentages below would divide by zero and
        # the aggregated results would have no 'rouge1'/'rouge2'/'rougeL' keys.
        f.write("No valid samples were scored.\n")
    else:
        # Aggregate results
        results_fine = aggregator_fine.aggregate()
        results_base = aggregator_base.aggregate()
        results_gpt = aggregator_gpt.aggregate()

        f.write(f"Fine-tuned > Base Model: {fine_tuned_better_base}/{total_samples} ({fine_tuned_better_base/total_samples*100:.2f}%)\n")
        f.write(f"Fine-tuned > GPT-4o-mini: {fine_tuned_better_gpt}/{total_samples} ({fine_tuned_better_gpt/total_samples*100:.2f}%)\n\n")

        def write_agg_scores(name, scores):
            """Write the mid-point F1 of each aggregated ROUGE type for one system."""
            f.write(f"{name}:\n")
            f.write(f"  ROUGE-1: {scores['rouge1'].mid.fmeasure:.4f}\n")
            f.write(f"  ROUGE-2: {scores['rouge2'].mid.fmeasure:.4f}\n")
            f.write(f"  ROUGE-L: {scores['rougeL'].mid.fmeasure:.4f}\n\n")

        f.write("Corpus-level ROUGE Scores:\n")
        write_agg_scores("fine_tuned", results_fine)
        write_agg_scores("base_model", results_base)
        write_agg_scores("gpt4omini", results_gpt)

print("ROUGE scores saved to rouge_scores_model_1.txt")


ROUGE scores saved to rouge_scores_model_1.txt


## Model 2 ROUGE score

In [2]:
import pandas as pd
import re
from rouge_score import rouge_scorer, scoring

# Text Cleaning Function
def clean_text(text, remove_punctuation=False):
    """Normalize a text value before it is passed to the ROUGE scorer.

    The value is converted to ``str``, lower-cased, optionally stripped of
    punctuation, and every run of whitespace is collapsed to one space with
    no leading or trailing whitespace left over.

    Args:
        text: Value to clean. ``None`` and float NaN are treated as missing
            and yield an empty string.
        remove_punctuation: When true, drop every character that is neither
            a word character nor whitespace.

    Returns:
        The cleaned string.
    """
    # Missing values would otherwise be stringified to "none"/"nan" and
    # scored as if they were real words.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    if remove_punctuation:
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation if needed
    # Collapse whitespace last: deleting punctuation can leave doubled or
    # trailing spaces behind (e.g. "a , b !" -> "a  b ").
    return re.sub(r'\s+', ' ', text).strip()

# Load data
df = pd.read_csv(r"C:\Users\LENOVO\Desktop\Sample.csv")

# Clean and rename columns (headers are stripped first so the keys below match)
df.columns = df.columns.str.strip()
df = df.rename(columns={
    'Expected Answer': 'reference',
    'Initial Answer from Base Model (Mistral 7B)': 'base_model',
    'Initial Answer from LLM (gpt-4o-mini)': 'gpt4omini',
    'Finetuned Mistral Model 2 (100 Tokens)': 'fine_tuned'
})

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Bootstrap aggregators, one per system; each receives the scores of every scored row
aggregator_fine = scoring.BootstrapAggregator()
aggregator_base = scoring.BootstrapAggregator()
aggregator_gpt = scoring.BootstrapAggregator()

# Counters: rows where the fine-tuned ROUGE-L F1 is strictly higher, and rows scored
fine_tuned_better_base = 0
fine_tuned_better_gpt = 0
total_samples = 0

with open("rouge_scores_model_2.txt", "w", encoding="utf-8") as f:
    for i, row in df.iterrows():
        # Clean the text fields; missing cells become empty strings
        reference = clean_text(row['reference']) if pd.notna(row['reference']) else ""
        base_model = clean_text(row['base_model']) if pd.notna(row['base_model']) else ""
        gpt4o = clean_text(row['gpt4omini']) if pd.notna(row['gpt4omini']) else ""
        fine_tuned = clean_text(row['fine_tuned']) if pd.notna(row['fine_tuned']) else ""

        # Skip rows with no reference or with no answer from any system.
        # A row where only some answers are missing is still scored, with the
        # empty answers scored against the reference as empty strings.
        if reference == "" or (base_model == "" and gpt4o == "" and fine_tuned == ""):
            continue

        # Score each model
        score_base = scorer.score(reference, base_model)
        score_gpt = scorer.score(reference, gpt4o)
        score_fine = scorer.score(reference, fine_tuned)

        aggregator_base.add_scores(score_base)
        aggregator_gpt.add_scores(score_gpt)
        aggregator_fine.add_scores(score_fine)

        # Compare ROUGE-L F1 (ties do not count as a win)
        if score_fine['rougeL'].fmeasure > score_base['rougeL'].fmeasure:
            fine_tuned_better_base += 1
        if score_fine['rougeL'].fmeasure > score_gpt['rougeL'].fmeasure:
            fine_tuned_better_gpt += 1

        total_samples += 1

        # Write sample results; the label is the DataFrame index value plus one,
        # so skipped rows leave gaps in the numbering
        f.write(f"Sample {i+1}:\n")
        f.write(f"fine_tuned: ROUGE-L = {score_fine['rougeL'].fmeasure:.4f}\n")
        f.write(f"base_model: ROUGE-L = {score_base['rougeL'].fmeasure:.4f}\n")
        f.write(f"gpt4omini: ROUGE-L = {score_gpt['rougeL'].fmeasure:.4f}\n\n")

    f.write("="*60 + "\n")
    if total_samples == 0:
        # Nothing was scored: the percentages below would divide by zero and
        # the aggregated results would have no 'rouge1'/'rouge2'/'rougeL' keys.
        f.write("No valid samples were scored.\n")
    else:
        # Aggregate results
        results_fine = aggregator_fine.aggregate()
        results_base = aggregator_base.aggregate()
        results_gpt = aggregator_gpt.aggregate()

        f.write(f"Fine-tuned > Base Model: {fine_tuned_better_base}/{total_samples} ({fine_tuned_better_base/total_samples*100:.2f}%)\n")
        f.write(f"Fine-tuned > GPT-4o-mini: {fine_tuned_better_gpt}/{total_samples} ({fine_tuned_better_gpt/total_samples*100:.2f}%)\n\n")

        def write_agg_scores(name, scores):
            """Write the mid-point F1 of each aggregated ROUGE type for one system."""
            f.write(f"{name}:\n")
            f.write(f"  ROUGE-1: {scores['rouge1'].mid.fmeasure:.4f}\n")
            f.write(f"  ROUGE-2: {scores['rouge2'].mid.fmeasure:.4f}\n")
            f.write(f"  ROUGE-L: {scores['rougeL'].mid.fmeasure:.4f}\n\n")

        f.write("Corpus-level ROUGE Scores:\n")
        write_agg_scores("fine_tuned", results_fine)
        write_agg_scores("base_model", results_base)
        write_agg_scores("gpt4omini", results_gpt)

print("ROUGE scores saved to rouge_scores_model_2.txt")


ROUGE scores saved to rouge_scores_model_2.txt


# ROUGE score with Hugging Face evaluate library

## Model 1 ROUGE score

In [1]:
import pandas as pd
import re
import evaluate

# Text Cleaning Function
def clean_text(text, remove_punctuation=False):
    """Normalize a text value before it is passed to the ROUGE metric.

    The value is converted to ``str``, lower-cased, optionally stripped of
    punctuation, and every run of whitespace is collapsed to one space with
    no leading or trailing whitespace left over.

    Args:
        text: Value to clean. ``None`` and float NaN are treated as missing
            and yield an empty string.
        remove_punctuation: When true, drop every character that is neither
            a word character nor whitespace.

    Returns:
        The cleaned string.
    """
    # Missing values would otherwise be stringified to "none"/"nan" and
    # scored as if they were real words.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    if remove_punctuation:
        text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace last: deleting punctuation can leave doubled or
    # trailing spaces behind (e.g. "a , b !" -> "a  b ").
    return re.sub(r'\s+', ' ', text).strip()

# Load ROUGE once outside the loop
rouge = evaluate.load("rouge")

# Load and clean data (headers are stripped first so the keys below match)
df = pd.read_csv(r"C:\Users\LENOVO\Desktop\Sample.csv")
df.columns = df.columns.str.strip()
df = df.rename(columns={
    'Expected Answer': 'reference',
    'Initial Answer from Base Model (Mistral 7B)': 'base_model',
    'Initial Answer from LLM (gpt-4o-mini)': 'gpt4omini',
    'Finetuned Mistral Model 1 (100 Tokens)': 'fine_tuned'
})

# Limit to first 60 valid rows (rows missing any of the four texts are dropped)
df = df.dropna(subset=['reference', 'base_model', 'gpt4omini', 'fine_tuned'])
df = df.head(60)

# Initialize counters and lists; the lists collect cleaned texts for the corpus-level pass
fine_tuned_better_base = 0
fine_tuned_better_gpt = 0
total_samples = 0
references, base_preds, gpt_preds, fine_preds = [], [], [], []

# NOTE(review): compute() is called without use_stemmer below, while the
# rouge_score cells in this file build their scorer with use_stemmer=True —
# confirm whether the two sets of scores are meant to be comparable.
with open("rouge_with_huggingface_model_1.txt", "w", encoding="utf-8") as f:
    for i, row in df.iterrows():
        reference = clean_text(row['reference'])
        base_model = clean_text(row['base_model'])
        gpt4o = clean_text(row['gpt4omini'])
        fine_tuned = clean_text(row['fine_tuned'])

        references.append(reference)
        base_preds.append(base_model)
        gpt_preds.append(gpt4o)
        fine_preds.append(fine_tuned)

        # Compute individual ROUGE-L scores
        fine_score = rouge.compute(predictions=[fine_tuned], references=[reference])
        base_score = rouge.compute(predictions=[base_model], references=[reference])
        gpt_score = rouge.compute(predictions=[gpt4o], references=[reference])

        # Ties do not count as a win for the fine-tuned model
        if fine_score['rougeL'] > base_score['rougeL']:
            fine_tuned_better_base += 1
        if fine_score['rougeL'] > gpt_score['rougeL']:
            fine_tuned_better_gpt += 1

        total_samples += 1

        # The label is the DataFrame index value plus one, so dropped rows
        # leave gaps in the numbering
        f.write(f"Sample {i+1}:\n")
        f.write(f"fine_tuned: ROUGE-L = {fine_score['rougeL']:.4f}\n")
        f.write(f"base_model: ROUGE-L = {base_score['rougeL']:.4f}\n")
        f.write(f"gpt4omini: ROUGE-L = {gpt_score['rougeL']:.4f}\n\n")

    f.write("="*60 + "\n")
    if total_samples == 0:
        # No row survived dropna(): the percentages below would divide by
        # zero and there is nothing to pass to the corpus-level compute().
        f.write("No valid samples were scored.\n")
    else:
        # Corpus-level scores
        score_fine = rouge.compute(predictions=fine_preds, references=references)
        score_base = rouge.compute(predictions=base_preds, references=references)
        score_gpt = rouge.compute(predictions=gpt_preds, references=references)

        f.write(f"Fine-tuned > Base Model: {fine_tuned_better_base}/{total_samples} ({fine_tuned_better_base/total_samples*100:.2f}%)\n")
        f.write(f"Fine-tuned > GPT-4o-mini: {fine_tuned_better_gpt}/{total_samples} ({fine_tuned_better_gpt/total_samples*100:.2f}%)\n\n")

        def write_agg_scores(name, scores):
            """Write the ROUGE-1, ROUGE-2 and ROUGE-L values of one system."""
            f.write(f"{name}:\n")
            f.write(f"  ROUGE-1: {scores['rouge1']:.4f}\n")
            f.write(f"  ROUGE-2: {scores['rouge2']:.4f}\n")
            f.write(f"  ROUGE-L: {scores['rougeL']:.4f}\n\n")

        f.write("Corpus-level ROUGE Scores:\n")
        write_agg_scores("fine_tuned", score_fine)
        write_agg_scores("base_model", score_base)
        write_agg_scores("gpt4omini", score_gpt)

print("ROUGE scores saved to rouge_with_huggingface_model_1.txt")


ROUGE scores saved to rouge_with_huggingface_model_1.txt


## Model 2 ROUGE score

In [2]:
import pandas as pd
import re
import evaluate

# Text Cleaning Function
def clean_text(text, remove_punctuation=False):
    """Normalize a text value before it is passed to the ROUGE metric.

    The value is converted to ``str``, lower-cased, optionally stripped of
    punctuation, and every run of whitespace is collapsed to one space with
    no leading or trailing whitespace left over.

    Args:
        text: Value to clean. ``None`` and float NaN are treated as missing
            and yield an empty string.
        remove_punctuation: When true, drop every character that is neither
            a word character nor whitespace.

    Returns:
        The cleaned string.
    """
    # Missing values would otherwise be stringified to "none"/"nan" and
    # scored as if they were real words.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    if remove_punctuation:
        text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace last: deleting punctuation can leave doubled or
    # trailing spaces behind (e.g. "a , b !" -> "a  b ").
    return re.sub(r'\s+', ' ', text).strip()

# Load ROUGE once outside the loop
rouge = evaluate.load("rouge")

# Load and clean data (headers are stripped first so the keys below match)
df = pd.read_csv(r"C:\Users\LENOVO\Desktop\Sample.csv")
df.columns = df.columns.str.strip()
df = df.rename(columns={
    'Expected Answer': 'reference',
    'Initial Answer from Base Model (Mistral 7B)': 'base_model',
    'Initial Answer from LLM (gpt-4o-mini)': 'gpt4omini',
    'Finetuned Mistral Model 2 (100 Tokens)': 'fine_tuned'
})

# Limit to first 60 valid rows (rows missing any of the four texts are dropped)
df = df.dropna(subset=['reference', 'base_model', 'gpt4omini', 'fine_tuned'])
df = df.head(60)

# Initialize counters and lists; the lists collect cleaned texts for the corpus-level pass
fine_tuned_better_base = 0
fine_tuned_better_gpt = 0
total_samples = 0
references, base_preds, gpt_preds, fine_preds = [], [], [], []

# NOTE(review): compute() is called without use_stemmer below, while the
# rouge_score cells in this file build their scorer with use_stemmer=True —
# confirm whether the two sets of scores are meant to be comparable.
with open("rouge_with_huggingface_model_2.txt", "w", encoding="utf-8") as f:
    for i, row in df.iterrows():
        reference = clean_text(row['reference'])
        base_model = clean_text(row['base_model'])
        gpt4o = clean_text(row['gpt4omini'])
        fine_tuned = clean_text(row['fine_tuned'])

        references.append(reference)
        base_preds.append(base_model)
        gpt_preds.append(gpt4o)
        fine_preds.append(fine_tuned)

        # Compute individual ROUGE-L scores
        fine_score = rouge.compute(predictions=[fine_tuned], references=[reference])
        base_score = rouge.compute(predictions=[base_model], references=[reference])
        gpt_score = rouge.compute(predictions=[gpt4o], references=[reference])

        # Ties do not count as a win for the fine-tuned model
        if fine_score['rougeL'] > base_score['rougeL']:
            fine_tuned_better_base += 1
        if fine_score['rougeL'] > gpt_score['rougeL']:
            fine_tuned_better_gpt += 1

        total_samples += 1

        # The label is the DataFrame index value plus one, so dropped rows
        # leave gaps in the numbering
        f.write(f"Sample {i+1}:\n")
        f.write(f"fine_tuned: ROUGE-L = {fine_score['rougeL']:.4f}\n")
        f.write(f"base_model: ROUGE-L = {base_score['rougeL']:.4f}\n")
        f.write(f"gpt4omini: ROUGE-L = {gpt_score['rougeL']:.4f}\n\n")

    f.write("="*60 + "\n")
    if total_samples == 0:
        # No row survived dropna(): the percentages below would divide by
        # zero and there is nothing to pass to the corpus-level compute().
        f.write("No valid samples were scored.\n")
    else:
        # Corpus-level scores
        score_fine = rouge.compute(predictions=fine_preds, references=references)
        score_base = rouge.compute(predictions=base_preds, references=references)
        score_gpt = rouge.compute(predictions=gpt_preds, references=references)

        f.write(f"Fine-tuned > Base Model: {fine_tuned_better_base}/{total_samples} ({fine_tuned_better_base/total_samples*100:.2f}%)\n")
        f.write(f"Fine-tuned > GPT-4o-mini: {fine_tuned_better_gpt}/{total_samples} ({fine_tuned_better_gpt/total_samples*100:.2f}%)\n\n")

        def write_agg_scores(name, scores):
            """Write the ROUGE-1, ROUGE-2 and ROUGE-L values of one system."""
            f.write(f"{name}:\n")
            f.write(f"  ROUGE-1: {scores['rouge1']:.4f}\n")
            f.write(f"  ROUGE-2: {scores['rouge2']:.4f}\n")
            f.write(f"  ROUGE-L: {scores['rougeL']:.4f}\n\n")

        f.write("Corpus-level ROUGE Scores:\n")
        write_agg_scores("fine_tuned", score_fine)
        write_agg_scores("base_model", score_base)
        write_agg_scores("gpt4omini", score_gpt)

print("ROUGE scores saved to rouge_with_huggingface_model_2.txt")


ROUGE scores saved to rouge_with_huggingface_model_2.txt
