In [1]:
import logging
# Raise the threshold of the "transformers.modeling_utils" logger to ERROR so
# that lower-severity records from that logger are not emitted.
_modeling_logger = logging.getLogger("transformers.modeling_utils")
_modeling_logger.setLevel(logging.ERROR)


In [1]:
import pandas as pd
import re
import torch
from bert_score import score

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Text Cleaning Function
def clean_text(text, remove_punctuation=False):
    """Normalize a raw cell value into a lowercase, single-spaced string.

    Args:
        text: Value to normalize. Non-string values are converted with
            ``str()``. ``None`` and float NaN are treated as missing.
        remove_punctuation: When True, every character that is neither a word
            character nor whitespace is removed.

    Returns:
        The lowercased text with each whitespace run collapsed to one space and
        no leading/trailing whitespace, or ``""`` when ``text`` is missing.
    """
    # str(None) / str(float("nan")) would produce the literal words "none" /
    # "nan", which would then be treated downstream as if they were real text.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    if remove_punctuation:
        # Strip punctuation BEFORE collapsing whitespace; otherwise removing a
        # free-standing symbol ("a - b") leaves a double or trailing space.
        text = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# --- Configuration ----------------------------------------------------------
# NOTE(review): machine-specific absolute path, kept unchanged so the cell
# still runs where it was written; point it at your copy of the data elsewhere.
CSV_PATH = r"C:\Users\LENOVO\Desktop\Sample.csv"
OUTPUT_PATH = "bertscore_scores_model_1.txt"
MAX_ROWS = 60  # evaluate at most this many valid rows
BERT_MODEL = 'distilbert-base-uncased'
# Spreadsheet header -> short column name used in the rest of the cell.
COLUMN_MAP = {
    'Expected Answer': 'reference',
    'Initial Answer from Base Model (Mistral 7B)': 'base_model',
    'Initial Answer from LLM (gpt-4o-mini)': 'gpt4omini',
    'Finetuned Mistral Model 1 (100 Tokens)': 'fine_tuned',
}
PREDICTION_COLUMNS = ['base_model', 'gpt4omini', 'fine_tuned']
TEXT_COLUMNS = ['reference'] + PREDICTION_COLUMNS

# --- Load data --------------------------------------------------------------
df = pd.read_csv(CSV_PATH)

# Headers may carry stray whitespace; strip it before renaming.
df.columns = df.columns.str.strip()
df = df.rename(columns=COLUMN_MAP)

# rename() silently ignores headers it cannot find, so fail here with a
# message that names the missing columns instead of a bare KeyError later.
missing_columns = [name for name in TEXT_COLUMNS if name not in df.columns]
if missing_columns:
    raise KeyError(
        f"{CSV_PATH} is missing expected column(s) {missing_columns}; "
        f"found {list(df.columns)}"
    )

# --- Clean the texts --------------------------------------------------------
# Missing cells are mapped to "" *before* clean_text sees them: clean_text
# stringifies its input, so a NaN passed through it could no longer be
# recognised as missing by the filter below.
for column in TEXT_COLUMNS:
    df[column] = df[column].apply(
        lambda value: "" if pd.isna(value) else clean_text(value)
    )

# --- Filter valid rows ------------------------------------------------------
# A row is valid when it has a reference and at least one model answer.
has_reference = df['reference'] != ""
has_prediction = (df[PREDICTION_COLUMNS] != "").any(axis=1)

# .copy() makes the result an independent frame, so the score columns added
# below do not trigger SettingWithCopyWarning. The original row labels are
# kept on purpose: they are used for the "Sample N" numbering in the report.
df = df[has_reference & has_prediction].head(MAX_ROWS).copy()

total = len(df)
if total == 0:
    # Without this the percentage computation below would divide by zero.
    raise ValueError(f"No valid rows to score in {CSV_PATH}")

# --- Prepare lists ----------------------------------------------------------
references = df['reference'].tolist()
fine_preds = df['fine_tuned'].tolist()
base_preds = df['base_model'].tolist()
gpt_preds = df['gpt4omini'].tolist()

# --- Compute BERTScores (batch) ---------------------------------------------
# A kept row may still have an empty answer for one of the models.
# NOTE(review): bert_score is expected to score an empty candidate as 0 and
# print a warning -- confirm against the installed version.
score_kwargs = dict(lang='en', device=device, model_type=BERT_MODEL)
P_fine, R_fine, F1_fine = score(fine_preds, references, **score_kwargs)
P_base, R_base, F1_base = score(base_preds, references, **score_kwargs)
P_gpt, R_gpt, F1_gpt = score(gpt_preds, references, **score_kwargs)

# --- Add to DataFrame -------------------------------------------------------
df['f1_fine'] = F1_fine.tolist()
df['f1_base'] = F1_base.tolist()
df['f1_gpt'] = F1_gpt.tolist()

# --- Count comparisons ------------------------------------------------------
# Strict ">": ties are not counted as wins for the fine-tuned model.
fine_better_base = (df['f1_fine'] > df['f1_base']).sum()
fine_better_gpt = (df['f1_fine'] > df['f1_gpt']).sum()

# --- Save detailed results --------------------------------------------------
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    # `i` is the row label from the loaded CSV, so "Sample N" is the 1-based
    # data row in the file even when earlier rows were filtered out.
    for i, row in df.iterrows():
        f.write(f"Sample {i+1}:\n")
        f.write(f"fine_tuned: BERTScore F1 = {row['f1_fine']:.4f}\n")
        f.write(f"base_model: BERTScore F1 = {row['f1_base']:.4f}\n")
        f.write(f"gpt4omini: BERTScore F1 = {row['f1_gpt']:.4f}\n\n")

    f.write("="*60 + "\n")
    f.write(f"Fine-tuned > Base Model: {fine_better_base}/{total} ({fine_better_base/total*100:.2f}%)\n")
    f.write(f"Fine-tuned > GPT-4o-mini: {fine_better_gpt}/{total} ({fine_better_gpt/total*100:.2f}%)\n\n")

    f.write("Corpus-level BERTScore (F1 average):\n")
    f.write(f"fine_tuned: {F1_fine.mean().item():.4f}\n")
    f.write(f"base_model: {F1_base.mean().item():.4f}\n")
    f.write(f"gpt4omini: {F1_gpt.mean().item():.4f}\n")

print(f"BERTScore saved to {OUTPUT_PATH}")


Using device: cuda
BERTScore saved to bertscore_scores_model_1.txt


In [2]:
import pandas as pd
import re
import torch
from bert_score import score

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Text Cleaning Function
def clean_text(text, remove_punctuation=False):
    """Normalize a raw cell value into a lowercase, single-spaced string.

    Args:
        text: Value to normalize. Non-string values are converted with
            ``str()``. ``None`` and float NaN are treated as missing.
        remove_punctuation: When True, every character that is neither a word
            character nor whitespace is removed.

    Returns:
        The lowercased text with each whitespace run collapsed to one space and
        no leading/trailing whitespace, or ``""`` when ``text`` is missing.
    """
    # str(None) / str(float("nan")) would produce the literal words "none" /
    # "nan", which would then be treated downstream as if they were real text.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    if remove_punctuation:
        # Strip punctuation BEFORE collapsing whitespace; otherwise removing a
        # free-standing symbol ("a - b") leaves a double or trailing space.
        text = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# --- Configuration ----------------------------------------------------------
# NOTE(review): machine-specific absolute path, kept unchanged so the cell
# still runs where it was written; point it at your copy of the data elsewhere.
CSV_PATH = r"C:\Users\LENOVO\Desktop\Sample.csv"
OUTPUT_PATH = "bertscore_scores_model_2.txt"
MAX_ROWS = 60  # evaluate at most this many valid rows
BERT_MODEL = 'distilbert-base-uncased'
# Spreadsheet header -> short column name used in the rest of the cell.
COLUMN_MAP = {
    'Expected Answer': 'reference',
    'Initial Answer from Base Model (Mistral 7B)': 'base_model',
    'Initial Answer from LLM (gpt-4o-mini)': 'gpt4omini',
    'Finetuned Mistral Model 2 (100 Tokens)': 'fine_tuned',
}
PREDICTION_COLUMNS = ['base_model', 'gpt4omini', 'fine_tuned']
TEXT_COLUMNS = ['reference'] + PREDICTION_COLUMNS

# --- Load data --------------------------------------------------------------
df = pd.read_csv(CSV_PATH)

# Headers may carry stray whitespace; strip it before renaming.
df.columns = df.columns.str.strip()
df = df.rename(columns=COLUMN_MAP)

# rename() silently ignores headers it cannot find, so fail here with a
# message that names the missing columns instead of a bare KeyError later.
missing_columns = [name for name in TEXT_COLUMNS if name not in df.columns]
if missing_columns:
    raise KeyError(
        f"{CSV_PATH} is missing expected column(s) {missing_columns}; "
        f"found {list(df.columns)}"
    )

# --- Clean the texts --------------------------------------------------------
# Missing cells are mapped to "" *before* clean_text sees them: clean_text
# stringifies its input, so a NaN passed through it could no longer be
# recognised as missing by the filter below.
for column in TEXT_COLUMNS:
    df[column] = df[column].apply(
        lambda value: "" if pd.isna(value) else clean_text(value)
    )

# --- Filter valid rows ------------------------------------------------------
# A row is valid when it has a reference and at least one model answer.
has_reference = df['reference'] != ""
has_prediction = (df[PREDICTION_COLUMNS] != "").any(axis=1)

# .copy() makes the result an independent frame, so the score columns added
# below do not trigger SettingWithCopyWarning. The original row labels are
# kept on purpose: they are used for the "Sample N" numbering in the report.
df = df[has_reference & has_prediction].head(MAX_ROWS).copy()

total = len(df)
if total == 0:
    # Without this the percentage computation below would divide by zero.
    raise ValueError(f"No valid rows to score in {CSV_PATH}")

# --- Prepare lists ----------------------------------------------------------
references = df['reference'].tolist()
fine_preds = df['fine_tuned'].tolist()
base_preds = df['base_model'].tolist()
gpt_preds = df['gpt4omini'].tolist()

# --- Compute BERTScores (batch) ---------------------------------------------
# A kept row may still have an empty answer for one of the models.
# NOTE(review): bert_score is expected to score an empty candidate as 0 and
# print a warning -- confirm against the installed version.
score_kwargs = dict(lang='en', device=device, model_type=BERT_MODEL)
P_fine, R_fine, F1_fine = score(fine_preds, references, **score_kwargs)
P_base, R_base, F1_base = score(base_preds, references, **score_kwargs)
P_gpt, R_gpt, F1_gpt = score(gpt_preds, references, **score_kwargs)

# --- Add to DataFrame -------------------------------------------------------
df['f1_fine'] = F1_fine.tolist()
df['f1_base'] = F1_base.tolist()
df['f1_gpt'] = F1_gpt.tolist()

# --- Count comparisons ------------------------------------------------------
# Strict ">": ties are not counted as wins for the fine-tuned model.
fine_better_base = (df['f1_fine'] > df['f1_base']).sum()
fine_better_gpt = (df['f1_fine'] > df['f1_gpt']).sum()

# --- Save detailed results --------------------------------------------------
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    # `i` is the row label from the loaded CSV, so "Sample N" is the 1-based
    # data row in the file even when earlier rows were filtered out.
    for i, row in df.iterrows():
        f.write(f"Sample {i+1}:\n")
        f.write(f"fine_tuned: BERTScore F1 = {row['f1_fine']:.4f}\n")
        f.write(f"base_model: BERTScore F1 = {row['f1_base']:.4f}\n")
        f.write(f"gpt4omini: BERTScore F1 = {row['f1_gpt']:.4f}\n\n")

    f.write("="*60 + "\n")
    f.write(f"Fine-tuned > Base Model: {fine_better_base}/{total} ({fine_better_base/total*100:.2f}%)\n")
    f.write(f"Fine-tuned > GPT-4o-mini: {fine_better_gpt}/{total} ({fine_better_gpt/total*100:.2f}%)\n\n")

    f.write("Corpus-level BERTScore (F1 average):\n")
    f.write(f"fine_tuned: {F1_fine.mean().item():.4f}\n")
    f.write(f"base_model: {F1_base.mean().item():.4f}\n")
    f.write(f"gpt4omini: {F1_gpt.mean().item():.4f}\n")

# The message is built from OUTPUT_PATH so it always names the file that was
# actually written (the old literal had a dangling "for " and a double
# underscore in the file name).
print(f"BERTScore saved to {OUTPUT_PATH}")


Using device: cuda
BERTScore for  saved to bertscore_scores__model_2.txt
