In [1]:
import pandas as pd
# Load the three curated prediction CSVs (small, base, large) in that order,
# binding each resulting DataFrame to its own module-level name.
df_small, df_base, df_large = map(
    pd.read_csv,
    (
        "/content/curated_small_pred.csv",
        "/content/curated_base_pred.csv",
        "/content/curated_large_pred.csv",
    ),
)

# Metric calculation function: Precision, Recall, and F1

In [2]:
def calculate_metrics(ground_truth, input_sentence, generated_sentence):
    """Score how many erroneous input items the model restored to the ground truth.

    The three sequences are compared position by position. An item is "wrong"
    when the input differs from the ground truth at that position, and it is
    "corrected" when the generated item at that same position equals the
    ground truth. Comparison stops at the end of the shortest sequence.

    Args:
        ground_truth: Reference items (pandas Series, NumPy array, or any
            other iterable).
        input_sentence: Items fed to the model, aligned with ``ground_truth``.
        generated_sentence: Model outputs, aligned with ``ground_truth``.

    Returns:
        dict with the keys:
            "Total Wrong Words": number of positions where input != ground truth.
            "Corrected by Model": wrong positions where generated == ground truth.
            "Precision": corrected / total wrong (0 when nothing is wrong).
            "Recall": corrected / number of ground-truth items (0 when empty).
            "F1 Score": harmonic mean of the two ratios (0 when both are 0).

    NOTE(review): "Precision" and "Recall" follow this notebook's own
    definitions above, not the usual TP/FP/FN formulas; the formulas are kept
    unchanged so previously reported numbers stay comparable.
    """
    # Accept pandas/NumPy containers (via .tolist()) as well as plain iterables.
    ground_truth, input_sentence, generated_sentence = (
        seq.tolist() if hasattr(seq, "tolist") else list(seq)
        for seq in (ground_truth, input_sentence, generated_sentence)
    )

    total_wrong_words = 0
    corrected_by_model = 0
    # Walk the three sequences in lockstep so every row is checked against the
    # generated item at ITS OWN position. Looking the position up with
    # list.index() would always return the first occurrence of a repeated
    # input value and therefore score duplicates against the wrong row.
    for gt_word, input_word, generated_word in zip(
        ground_truth, input_sentence, generated_sentence
    ):
        if gt_word != input_word:
            total_wrong_words += 1
            if generated_word == gt_word:
                corrected_by_model += 1

    # Guard every ratio against a zero denominator (empty or error-free input).
    precision = corrected_by_model / total_wrong_words if total_wrong_words > 0 else 0
    recall = corrected_by_model / len(ground_truth) if ground_truth else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return {
        "Total Wrong Words": total_wrong_words,
        "Corrected by Model": corrected_by_model,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1_score
    }

# Small

In [3]:
# Pull the three aligned columns out of df_small in one step.
ground_truth, input_sentence, generated_sentence = (
    df_small[column]
    for column in ("Ground Truth", "Input Sentence", "Generated Sentence")
)

# Score the df_small predictions and print the resulting metrics dictionary.
metrics = calculate_metrics(ground_truth, input_sentence, generated_sentence)
print(metrics)

{'Total Wrong Words': 3000, 'Corrected by Model': 2025, 'Precision': 0.675, 'Recall': 0.675, 'F1 Score': 0.675}


# Base

In [4]:
# Pull the three aligned columns out of df_base in one step.
ground_truth, input_sentence, generated_sentence = (
    df_base[column]
    for column in ("Ground Truth", "Input Sentence", "Generated Sentence")
)

# Score the df_base predictions and print the resulting metrics dictionary.
metrics = calculate_metrics(ground_truth, input_sentence, generated_sentence)
print(metrics)

{'Total Wrong Words': 3000, 'Corrected by Model': 2745, 'Precision': 0.915, 'Recall': 0.915, 'F1 Score': 0.915}


# Large

In [5]:
# Pull the three aligned columns out of df_large in one step.
ground_truth, input_sentence, generated_sentence = (
    df_large[column]
    for column in ("Ground Truth", "Input Sentence", "Generated Sentence")
)

# Score the df_large predictions and print the resulting metrics dictionary.
metrics = calculate_metrics(ground_truth, input_sentence, generated_sentence)
print(metrics)

{'Total Wrong Words': 3000, 'Corrected by Model': 1977, 'Precision': 0.659, 'Recall': 0.659, 'F1 Score': 0.659}
