In [30]:
import pandas as pd

In [31]:
# Load the evaluation sheet into a DataFrame.
# NOTE(review): this is an absolute path on one local machine; it must be
# adjusted before the notebook can run anywhere else.
df = pd.read_csv(r"C:\Users\LENOVO\Desktop\Sample.csv")

In [32]:
# Display the raw column headers; the later cells rename several of them.
df.columns

Index(['#', 'Question', 'Quality of the question (Low, Medium, High)',
       'Expected Answer', 'Initial Answer from Base Model (Mistral 7B)',
       'Initial Answer from LLM (gpt-4o-mini) ',
       'Finetuned Mistral Model 1 (250 Tokens)',
       'Finetuned Mistral Model 1 (100 Tokens)',
       'BLUE Score  (Base Mistral 7B  vs  gpt-4o-mini  vs  Finetuned Mistral Model 1)',
       'Finetuned Mistral Model 2 (250 Tokens)',
       'Finetuned Mistral Model 2 (100 Tokens)',
       'BLUE Score  (Base Mistral 7B  vs  gpt-4o-mini  vs  Finetuned Mistral Model 2)'],
      dtype='object')

# BLEU score with sacrebleu library

In [35]:
import sacrebleu

# Minimal sanity check of sacrebleu.corpus_bleu on a single sentence pair.
# The hypothesis differs from the reference only by the missing final period.
references = [["The cat is on the mat."]]  # list of reference lists (here: one list holding one sentence)
hypotheses = ["The cat is on the mat"]        # list of system outputs (here: one sentence)

# Hypotheses are passed first, references second.
score = sacrebleu.corpus_bleu(hypotheses, references)
print(score.score)  # Output: BLEU score as float


84.64817248906144


## Model 1 BLEU score

In [42]:
import pandas as pd
import sacrebleu
import numpy as np
import re

# Text Cleaning Function
def clean_text(text, remove_punctuation=False):
    """Normalise a cell value so it can be compared with BLEU.

    Args:
        text: Value to normalise. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as missing.
        remove_punctuation: When true, drop every character that is neither a
            word character nor whitespace.

    Returns:
        The stripped, lower-cased text with each whitespace run collapsed to a
        single space, or ``""`` when the value is missing.
    """
    # str(NaN) would produce the literal text "nan", which callers cannot
    # distinguish from a real answer; report missing values as empty instead.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).strip().lower()
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    if remove_punctuation:
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation if needed
    return text

# Load dataset
df = pd.read_csv(r"C:\Users\LENOVO\Desktop\Sample.csv")

# Clean and rename columns. Headers are stripped first because at least one of
# them ('Initial Answer from LLM (gpt-4o-mini) ') carries trailing whitespace
# and would otherwise not match the rename mapping below.
df.columns = df.columns.str.strip()
df = df.rename(columns={
    'Expected Answer': 'reference',
    'Initial Answer from Base Model (Mistral 7B)': 'base_model',
    'Initial Answer from LLM (gpt-4o-mini)': 'gpt4omini',
    'Finetuned Mistral Model 1 (100 Tokens)': 'fine_tuned'
})

# Fail early with a clear message if the rename did not produce every column
# the scoring loop reads, instead of silently treating all rows as empty.
required_columns = ('reference', 'base_model', 'gpt4omini', 'fine_tuned')
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise KeyError(f"Missing expected column(s) after renaming: {missing_columns}")

# Limit to first 60 samples
df = df.head(60)

# Cleaned strings of the rows that were scored, kept for corpus-level BLEU
references = []
fine_tuned_candidates = []
base_model_candidates = []
gpt4o_candidates = []

# Output file
with open("bleu_scores_sacrebleu_model_1.txt", "w", encoding="utf-8") as f:

    # Per-sample win counters for the fine-tuned model
    fine_tuned_better_base = 0
    fine_tuned_better_gpt = 0
    total_samples = 0

    # Sentence-level scores, averaged in the summary
    fine_tuned_scores = []
    base_model_scores = []
    gpt4o_scores = []

    for i, row in df.iterrows():
        # Empty CSV cells are NaN; map them to "" explicitly so they are
        # caught by the skip check below rather than being stringified to the
        # literal text "nan" and scored.
        reference, base_model, gpt4o, fine_tuned = (
            clean_text(row[name]) if pd.notna(row[name]) else ""
            for name in required_columns
        )

        # Skip rows without a reference or without any model answer
        if reference == "" or (base_model == "" and gpt4o == "" and fine_tuned == ""):
            continue

        # Compute sentence-level BLEU, rescaled from sacrebleu's score to 0–1
        bleu_fine = sacrebleu.sentence_bleu(fine_tuned, [reference]).score / 100
        bleu_base = sacrebleu.sentence_bleu(base_model, [reference]).score / 100
        bleu_gpt = sacrebleu.sentence_bleu(gpt4o, [reference]).score / 100

        fine_tuned_scores.append(bleu_fine)
        base_model_scores.append(bleu_base)
        gpt4o_scores.append(bleu_gpt)

        # Strict comparison: ties do not count as a win for the fine-tuned model
        if bleu_fine > bleu_base:
            fine_tuned_better_base += 1
        if bleu_fine > bleu_gpt:
            fine_tuned_better_gpt += 1

        total_samples += 1

        # Add for corpus-level BLEU
        references.append(reference)
        fine_tuned_candidates.append(fine_tuned)
        base_model_candidates.append(base_model)
        gpt4o_candidates.append(gpt4o)

        # Write per-sample BLEU scores (labelled with the DataFrame index + 1)
        f.write(f"Sample {i+1}:\n")
        f.write(f"fine_tuned: BLEU = {bleu_fine:.4f}\n")
        f.write(f"base_model: BLEU = {bleu_base:.4f}\n")
        f.write(f"gpt4omini: BLEU = {bleu_gpt:.4f}\n\n")

    if total_samples == 0:
        # Every row was filtered out: the percentages below would divide by
        # zero and there is nothing to aggregate, so say so in the report.
        f.write("No valid samples found; BLEU scores were not computed.\n")
    else:
        # Corpus-level BLEU over all scored rows (single reference per row)
        bleu_fine_total = sacrebleu.corpus_bleu(fine_tuned_candidates, [references]).score / 100
        bleu_base_total = sacrebleu.corpus_bleu(base_model_candidates, [references]).score / 100
        bleu_gpt_total = sacrebleu.corpus_bleu(gpt4o_candidates, [references]).score / 100

        # Write summary results
        f.write("="*60 + "\n")
        f.write(f"Fine-tuned > Base Model (Mistral 7B): {fine_tuned_better_base}/{total_samples} ({fine_tuned_better_base/total_samples*100:.2f}%)\n")
        f.write(f"Fine-tuned > GPT-4o-mini: {fine_tuned_better_gpt}/{total_samples} ({fine_tuned_better_gpt/total_samples*100:.2f}%)\n\n")

        f.write("Average Sentence BLEU Scores (0–1):\n")
        f.write(f"fine_tuned: {np.mean(fine_tuned_scores):.4f}\n")
        f.write(f"base_model: {np.mean(base_model_scores):.4f}\n")
        f.write(f"gpt4omini: {np.mean(gpt4o_scores):.4f}\n\n")

        f.write("Corpus BLEU Scores (0–1):\n")
        f.write(f"fine_tuned: {bleu_fine_total:.4f}\n")
        f.write(f"base_model: {bleu_base_total:.4f}\n")
        f.write(f"gpt4omini: {bleu_gpt_total:.4f}\n")

print("BLEU scores (0–1) for first 60 samples saved to bleu_scores_sacrebleu_model_1.txt")


BLEU scores (0–1) for first 60 samples saved to bleu_scores_sacrebleu_model_1.txt


## Model 2 BLEU score

In [47]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
import numpy as np
import re

# Read the evaluation sheet, trim surrounding whitespace from every header,
# then map the long spreadsheet headers onto the short names used below.
# NOTE(review): the heading above this cell says "Model 2", but the mapping
# selects the 'Finetuned Mistral Model 1 (100 Tokens)' column — confirm intent.
df = (
    pd.read_csv(r"C:\Users\LENOVO\Desktop\Sample.csv")
    .rename(columns=str.strip)
    .rename(columns={
        'Expected Answer': 'reference',
        'Initial Answer from Base Model (Mistral 7B)': 'base_model',
        'Initial Answer from LLM (gpt-4o-mini)': 'gpt4omini',
        'Finetuned Mistral Model 1 (100 Tokens)': 'fine_tuned'
    })
)

# Text Cleaning Function
def clean_text(text, remove_punctuation=False):
    """Normalise a value for BLEU comparison.

    The value is converted with ``str``, stripped, lower-cased, and every run
    of whitespace is collapsed to a single space. With
    ``remove_punctuation=True`` all characters that are neither word
    characters nor whitespace are removed afterwards.
    """
    normalized = re.sub(r'\s+', ' ', str(text).strip().lower())
    if not remove_punctuation:
        return normalized
    return re.sub(r'[^\w\s]', '', normalized)

# NOTE(review): this cell is a verbatim copy of the nltk "Model 1" cell further
# down and writes the same bleu_scores_nltk_model_1.txt, although the headings
# around it place it under sacrebleu / Model 2 — confirm whether a sacrebleu
# Model 2 evaluation was intended here.

# Prepare smoothing function to avoid BLEU=0 for short sentences
smooth = SmoothingFunction().method1

# Token lists accumulated for the corpus-level BLEU computed after the loop
corpus_references = []
corpus_base_model = []
corpus_gpt4o = []
corpus_fine_tuned = []

# Open the report file with an explicit encoding so the output does not depend
# on the platform's default code page
with open("bleu_scores_nltk_model_1.txt", "w", encoding="utf-8") as f:

    # Per-sample win counters for the fine-tuned model
    fine_tuned_better_base = 0
    fine_tuned_better_gpt = 0
    total_samples = 0

    # Sentence-level scores, averaged in the summary
    fine_tuned_scores = []
    base_model_scores = []
    gpt4o_scores = []

    for i, row in df.iterrows():
        # Missing cells become "" (not the string "nan") before cleaning
        reference = clean_text(row['reference']) if pd.notna(row['reference']) else ""
        base_model = clean_text(row['base_model']) if pd.notna(row['base_model']) else ""
        gpt4o = clean_text(row['gpt4omini']) if pd.notna(row['gpt4omini']) else ""
        fine_tuned = clean_text(row['fine_tuned']) if pd.notna(row['fine_tuned']) else ""

        # Skip if reference or all models are empty
        if reference == "" or (base_model == "" and gpt4o == "" and fine_tuned == ""):
            continue

        # Tokenize for BLEU calculation (simple whitespace tokenizer)
        ref_tokens = [reference.split()]  # List of references (only one here)
        base_tokens = base_model.split()
        gpt4o_tokens = gpt4o.split()
        fine_tuned_tokens = fine_tuned.split()

        # Calculate sentence-level BLEU scores (0-1 scale)
        bleu_base = sentence_bleu(ref_tokens, base_tokens, smoothing_function=smooth)
        bleu_gpt4o = sentence_bleu(ref_tokens, gpt4o_tokens, smoothing_function=smooth)
        bleu_fine_tuned = sentence_bleu(ref_tokens, fine_tuned_tokens, smoothing_function=smooth)

        # Save sentence-level scores for summary
        base_model_scores.append(bleu_base)
        gpt4o_scores.append(bleu_gpt4o)
        fine_tuned_scores.append(bleu_fine_tuned)

        # Count improvements (strict comparison: ties are not wins)
        if bleu_fine_tuned > bleu_base:
            fine_tuned_better_base += 1
        if bleu_fine_tuned > bleu_gpt4o:
            fine_tuned_better_gpt += 1

        total_samples += 1

        # Append to corpus lists
        corpus_references.append(ref_tokens)
        corpus_base_model.append(base_tokens)
        corpus_gpt4o.append(gpt4o_tokens)
        corpus_fine_tuned.append(fine_tuned_tokens)

        # Write per-sample results (labelled with the DataFrame index + 1)
        f.write(f"Sample {i+1}:\n")
        f.write(f"fine_tuned: BLEU = {bleu_fine_tuned:.4f}\n")
        f.write(f"base_model: BLEU = {bleu_base:.4f}\n")
        f.write(f"gpt4omini: BLEU = {bleu_gpt4o:.4f}\n\n")

    if total_samples == 0:
        # Every row was filtered out: the percentages below would divide by
        # zero and there is nothing to aggregate, so say so in the report.
        f.write("No valid samples found; BLEU scores were not computed.\n")
    else:
        # Calculate corpus-level BLEU scores
        corpus_bleu_base = corpus_bleu(corpus_references, corpus_base_model, smoothing_function=smooth)
        corpus_bleu_gpt4o = corpus_bleu(corpus_references, corpus_gpt4o, smoothing_function=smooth)
        corpus_bleu_fine_tuned = corpus_bleu(corpus_references, corpus_fine_tuned, smoothing_function=smooth)

        # Write summary
        f.write("="*60 + "\n")
        f.write(f"Fine-tuned > Base Model (Mistral 7B): {fine_tuned_better_base}/{total_samples} ({fine_tuned_better_base/total_samples*100:.2f}%)\n")
        f.write(f"Fine-tuned > GPT-4o-mini: {fine_tuned_better_gpt}/{total_samples} ({fine_tuned_better_gpt/total_samples*100:.2f}%)\n\n")

        f.write("Average Sentence-level BLEU Scores:\n")
        f.write(f"fine_tuned: {np.mean(fine_tuned_scores):.4f}\n")
        f.write(f"base_model: {np.mean(base_model_scores):.4f}\n")
        f.write(f"gpt4omini: {np.mean(gpt4o_scores):.4f}\n\n")

        f.write("Corpus-level BLEU Scores:\n")
        f.write(f"fine_tuned: {corpus_bleu_fine_tuned:.4f}\n")
        f.write(f"base_model: {corpus_bleu_base:.4f}\n")
        f.write(f"gpt4omini: {corpus_bleu_gpt4o:.4f}\n")

print("BLEU scores saved to bleu_scores_nltk_model_1.txt")


BLEU scores saved to bleu_scores_nltk_model_1.txt


## BLEU score with nltk library

## Model 1 BLEU score

In [49]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
import numpy as np
import re

# Read the evaluation sheet, trim surrounding whitespace from every header,
# then map the long spreadsheet headers onto the short names used below
# (the fine-tuned column is Model 1, 100 tokens).
df = (
    pd.read_csv(r"C:\Users\LENOVO\Desktop\Sample.csv")
    .rename(columns=str.strip)
    .rename(columns={
        'Expected Answer': 'reference',
        'Initial Answer from Base Model (Mistral 7B)': 'base_model',
        'Initial Answer from LLM (gpt-4o-mini)': 'gpt4omini',
        'Finetuned Mistral Model 1 (100 Tokens)': 'fine_tuned'
    })
)

# Text Cleaning Function
def clean_text(text, remove_punctuation=False):
    """Return ``text`` as a normalised string for BLEU scoring.

    Steps: ``str`` conversion, strip, lower-case, collapse each whitespace run
    to one space; optionally remove everything that is not a word character
    or whitespace when ``remove_punctuation`` is true.
    """
    lowered = str(text).strip().lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return re.sub(r'[^\w\s]', '', collapsed) if remove_punctuation else collapsed

# Prepare smoothing function to avoid BLEU=0 for short sentences
smooth = SmoothingFunction().method1

# Token lists accumulated for the corpus-level BLEU computed after the loop
corpus_references = []
corpus_base_model = []
corpus_gpt4o = []
corpus_fine_tuned = []

# Open the report file with an explicit encoding so the output does not depend
# on the platform's default code page
with open("bleu_scores_nltk_model_1.txt", "w", encoding="utf-8") as f:

    # Per-sample win counters for the fine-tuned model
    fine_tuned_better_base = 0
    fine_tuned_better_gpt = 0
    total_samples = 0

    # Sentence-level scores, averaged in the summary
    fine_tuned_scores = []
    base_model_scores = []
    gpt4o_scores = []

    for i, row in df.iterrows():
        # Missing cells become "" (not the string "nan") before cleaning
        reference = clean_text(row['reference']) if pd.notna(row['reference']) else ""
        base_model = clean_text(row['base_model']) if pd.notna(row['base_model']) else ""
        gpt4o = clean_text(row['gpt4omini']) if pd.notna(row['gpt4omini']) else ""
        fine_tuned = clean_text(row['fine_tuned']) if pd.notna(row['fine_tuned']) else ""

        # Skip if reference or all models are empty
        if reference == "" or (base_model == "" and gpt4o == "" and fine_tuned == ""):
            continue

        # Tokenize for BLEU calculation (simple whitespace tokenizer)
        ref_tokens = [reference.split()]  # List of references (only one here)
        base_tokens = base_model.split()
        gpt4o_tokens = gpt4o.split()
        fine_tuned_tokens = fine_tuned.split()

        # Calculate sentence-level BLEU scores (0-1 scale)
        bleu_base = sentence_bleu(ref_tokens, base_tokens, smoothing_function=smooth)
        bleu_gpt4o = sentence_bleu(ref_tokens, gpt4o_tokens, smoothing_function=smooth)
        bleu_fine_tuned = sentence_bleu(ref_tokens, fine_tuned_tokens, smoothing_function=smooth)

        # Save sentence-level scores for summary
        base_model_scores.append(bleu_base)
        gpt4o_scores.append(bleu_gpt4o)
        fine_tuned_scores.append(bleu_fine_tuned)

        # Count improvements (strict comparison: ties are not wins)
        if bleu_fine_tuned > bleu_base:
            fine_tuned_better_base += 1
        if bleu_fine_tuned > bleu_gpt4o:
            fine_tuned_better_gpt += 1

        total_samples += 1

        # Append to corpus lists
        corpus_references.append(ref_tokens)
        corpus_base_model.append(base_tokens)
        corpus_gpt4o.append(gpt4o_tokens)
        corpus_fine_tuned.append(fine_tuned_tokens)

        # Write per-sample results (labelled with the DataFrame index + 1)
        f.write(f"Sample {i+1}:\n")
        f.write(f"fine_tuned: BLEU = {bleu_fine_tuned:.4f}\n")
        f.write(f"base_model: BLEU = {bleu_base:.4f}\n")
        f.write(f"gpt4omini: BLEU = {bleu_gpt4o:.4f}\n\n")

    if total_samples == 0:
        # Every row was filtered out: the percentages below would divide by
        # zero and there is nothing to aggregate, so say so in the report.
        f.write("No valid samples found; BLEU scores were not computed.\n")
    else:
        # Calculate corpus-level BLEU scores
        corpus_bleu_base = corpus_bleu(corpus_references, corpus_base_model, smoothing_function=smooth)
        corpus_bleu_gpt4o = corpus_bleu(corpus_references, corpus_gpt4o, smoothing_function=smooth)
        corpus_bleu_fine_tuned = corpus_bleu(corpus_references, corpus_fine_tuned, smoothing_function=smooth)

        # Write summary
        f.write("="*60 + "\n")
        f.write(f"Fine-tuned > Base Model (Mistral 7B): {fine_tuned_better_base}/{total_samples} ({fine_tuned_better_base/total_samples*100:.2f}%)\n")
        f.write(f"Fine-tuned > GPT-4o-mini: {fine_tuned_better_gpt}/{total_samples} ({fine_tuned_better_gpt/total_samples*100:.2f}%)\n\n")

        f.write("Average Sentence-level BLEU Scores:\n")
        f.write(f"fine_tuned: {np.mean(fine_tuned_scores):.4f}\n")
        f.write(f"base_model: {np.mean(base_model_scores):.4f}\n")
        f.write(f"gpt4omini: {np.mean(gpt4o_scores):.4f}\n\n")

        f.write("Corpus-level BLEU Scores:\n")
        f.write(f"fine_tuned: {corpus_bleu_fine_tuned:.4f}\n")
        f.write(f"base_model: {corpus_bleu_base:.4f}\n")
        f.write(f"gpt4omini: {corpus_bleu_gpt4o:.4f}\n")

print("BLEU scores saved to bleu_scores_nltk_model_1.txt")


BLEU scores saved to bleu_scores_nltk_model_1.txt


## Model 2 BLEU score

In [50]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
import numpy as np
import re

# Read the evaluation sheet, trim surrounding whitespace from every header,
# then map the long spreadsheet headers onto the short names used below
# (the fine-tuned column is Model 2, 100 tokens).
df = (
    pd.read_csv(r"C:\Users\LENOVO\Desktop\Sample.csv")
    .rename(columns=str.strip)
    .rename(columns={
        'Expected Answer': 'reference',
        'Initial Answer from Base Model (Mistral 7B)': 'base_model',
        'Initial Answer from LLM (gpt-4o-mini)': 'gpt4omini',
        'Finetuned Mistral Model 2 (100 Tokens)': 'fine_tuned'
    })
)

# Text Cleaning Function
def clean_text(text, remove_punctuation=False):
    """Prepare a value for BLEU scoring.

    Converts the value with ``str``, trims it, lower-cases it and squeezes
    every whitespace run into a single space. When ``remove_punctuation`` is
    true, characters other than word characters and whitespace are stripped.
    """
    result = str(text).strip().lower()
    # Apply the whitespace pass always, the punctuation pass only on request
    passes = [(r'\s+', ' ')]
    if remove_punctuation:
        passes.append((r'[^\w\s]', ''))
    for pattern, replacement in passes:
        result = re.sub(pattern, replacement, result)
    return result

# Prepare smoothing function to avoid BLEU=0 for short sentences
smooth = SmoothingFunction().method1

# Token lists accumulated for the corpus-level BLEU computed after the loop
corpus_references = []
corpus_base_model = []
corpus_gpt4o = []
corpus_fine_tuned = []

# Open the report file with an explicit encoding so the output does not depend
# on the platform's default code page
with open("bleu_scores_nltk_model_2.txt", "w", encoding="utf-8") as f:

    # Per-sample win counters for the fine-tuned model
    fine_tuned_better_base = 0
    fine_tuned_better_gpt = 0
    total_samples = 0

    # Sentence-level scores, averaged in the summary
    fine_tuned_scores = []
    base_model_scores = []
    gpt4o_scores = []

    for i, row in df.iterrows():
        # Missing cells become "" (not the string "nan") before cleaning
        reference = clean_text(row['reference']) if pd.notna(row['reference']) else ""
        base_model = clean_text(row['base_model']) if pd.notna(row['base_model']) else ""
        gpt4o = clean_text(row['gpt4omini']) if pd.notna(row['gpt4omini']) else ""
        fine_tuned = clean_text(row['fine_tuned']) if pd.notna(row['fine_tuned']) else ""

        # Skip if reference or all models are empty
        if reference == "" or (base_model == "" and gpt4o == "" and fine_tuned == ""):
            continue

        # Tokenize for BLEU calculation (simple whitespace tokenizer)
        ref_tokens = [reference.split()]  # List of references (only one here)
        base_tokens = base_model.split()
        gpt4o_tokens = gpt4o.split()
        fine_tuned_tokens = fine_tuned.split()

        # Calculate sentence-level BLEU scores (0-1 scale)
        bleu_base = sentence_bleu(ref_tokens, base_tokens, smoothing_function=smooth)
        bleu_gpt4o = sentence_bleu(ref_tokens, gpt4o_tokens, smoothing_function=smooth)
        bleu_fine_tuned = sentence_bleu(ref_tokens, fine_tuned_tokens, smoothing_function=smooth)

        # Save sentence-level scores for summary
        base_model_scores.append(bleu_base)
        gpt4o_scores.append(bleu_gpt4o)
        fine_tuned_scores.append(bleu_fine_tuned)

        # Count improvements (strict comparison: ties are not wins)
        if bleu_fine_tuned > bleu_base:
            fine_tuned_better_base += 1
        if bleu_fine_tuned > bleu_gpt4o:
            fine_tuned_better_gpt += 1

        total_samples += 1

        # Append to corpus lists
        corpus_references.append(ref_tokens)
        corpus_base_model.append(base_tokens)
        corpus_gpt4o.append(gpt4o_tokens)
        corpus_fine_tuned.append(fine_tuned_tokens)

        # Write per-sample results (labelled with the DataFrame index + 1)
        f.write(f"Sample {i+1}:\n")
        f.write(f"fine_tuned: BLEU = {bleu_fine_tuned:.4f}\n")
        f.write(f"base_model: BLEU = {bleu_base:.4f}\n")
        f.write(f"gpt4omini: BLEU = {bleu_gpt4o:.4f}\n\n")

    if total_samples == 0:
        # Every row was filtered out: the percentages below would divide by
        # zero and there is nothing to aggregate, so say so in the report.
        f.write("No valid samples found; BLEU scores were not computed.\n")
    else:
        # Calculate corpus-level BLEU scores
        corpus_bleu_base = corpus_bleu(corpus_references, corpus_base_model, smoothing_function=smooth)
        corpus_bleu_gpt4o = corpus_bleu(corpus_references, corpus_gpt4o, smoothing_function=smooth)
        corpus_bleu_fine_tuned = corpus_bleu(corpus_references, corpus_fine_tuned, smoothing_function=smooth)

        # Write summary
        f.write("="*60 + "\n")
        f.write(f"Fine-tuned > Base Model (Mistral 7B): {fine_tuned_better_base}/{total_samples} ({fine_tuned_better_base/total_samples*100:.2f}%)\n")
        f.write(f"Fine-tuned > GPT-4o-mini: {fine_tuned_better_gpt}/{total_samples} ({fine_tuned_better_gpt/total_samples*100:.2f}%)\n\n")

        f.write("Average Sentence-level BLEU Scores:\n")
        f.write(f"fine_tuned: {np.mean(fine_tuned_scores):.4f}\n")
        f.write(f"base_model: {np.mean(base_model_scores):.4f}\n")
        f.write(f"gpt4omini: {np.mean(gpt4o_scores):.4f}\n\n")

        f.write("Corpus-level BLEU Scores:\n")
        f.write(f"fine_tuned: {corpus_bleu_fine_tuned:.4f}\n")
        f.write(f"base_model: {corpus_bleu_base:.4f}\n")
        f.write(f"gpt4omini: {corpus_bleu_gpt4o:.4f}\n")

print("BLEU scores saved to bleu_scores_nltk_model_2.txt")


BLEU scores saved to bleu_scores_nltk_model_2.txt
