In [None]:
import pandas as pd
import evaluate

# Input CSVs; each is read below for its "Prediction" and "Reference" columns.
baseline_file = "final_baseline_predictions.csv"
finetuned_file = "final_finetuned_predictions.csv"


def _read_text_csv(path):
    """Read a CSV keeping every cell as the literal text stored in the file.

    With pandas' defaults, empty cells and values such as "NA", "null" or
    "NaN" are converted to float NaN, and a column made only of numbers is
    parsed as numeric. The text metrics used further down operate on strings,
    so either conversion would break or distort the evaluation.

    Args:
        path: Location of the CSV file.

    Returns:
        A DataFrame in which every value is a ``str`` (empty cells become "").
    """
    return pd.read_csv(path, dtype=str, keep_default_na=False)


baseline_data = _read_text_csv(baseline_file)
finetuned_data = _read_text_csv(finetuned_file)

baseline_predictions = baseline_data["Prediction"].tolist()
baseline_references = baseline_data["Reference"].tolist()

finetuned_predictions = finetuned_data["Prediction"].tolist()
finetuned_references = finetuned_data["Reference"].tolist()

# Metric objects are loaded once here and shared by every evaluation call.
meteor_metric = evaluate.load("meteor")
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
exactmatch_metric = evaluate.load("exact_match")

def format_data(predictions, references, no_answer_probability=0.0):
    """Bundle predictions and references into one dict of fresh lists.

    Args:
        predictions: Iterable of predictions; items are copied as-is.
        references: Iterable of references; items are copied as-is.
        no_answer_probability: Accepted for call compatibility; this function
            does not read it.

    Returns:
        A dict with the keys 'predictions' and 'references', each mapping to a
        new list holding the corresponding input items in their original order.
    """
    return {
        'predictions': list(predictions),
        'references': list(references),
    }



def evaluate_metrics(predictions, references):
    """Score predictions against references with the four module-level metrics.

    Args:
        predictions: Predictions handed unchanged to every metric.
        references: References, one per prediction. They are handed unchanged
            to ROUGE, METEOR and exact match; for BLEU each reference is first
            wrapped in its own single-item list.

    Returns:
        A dict with the keys "bleu", "rouge", "meteor" and "exact_match" (in
        that order), each holding whatever the matching ``compute`` call
        returned.
    """
    # BLEU is the only call that receives a list of references per prediction.
    wrapped_references = [[single_ref] for single_ref in references]

    return {
        "bleu": bleu_metric.compute(predictions=predictions, references=wrapped_references),
        "rouge": rouge_metric.compute(predictions=predictions, references=references),
        "meteor": meteor_metric.compute(predictions=predictions, references=references),
        "exact_match": exactmatch_metric.compute(predictions=predictions, references=references),
    }

# Put both runs into the {'predictions': ..., 'references': ...} layout.
baseline_data_formatted = format_data(baseline_predictions, baseline_references)
finetuned_data_formatted = format_data(finetuned_predictions, finetuned_references)

# Baseline: compute all metrics, then emit the header followed by one
# "name: value" line per metric.
baseline_results = evaluate_metrics(
    baseline_data_formatted['predictions'],
    baseline_data_formatted['references'],
)
print(
    "Baseline Results:",
    *(f"{metric}: {result}" for metric, result in baseline_results.items()),
    sep="\n",
)

# Fine-tuned: same report; the header starts with a newline so a blank line
# separates it from the baseline section.
finetuned_results = evaluate_metrics(
    finetuned_data_formatted['predictions'],
    finetuned_data_formatted['references'],
)
print(
    "\nFine-tuned Results:",
    *(f"{metric}: {result}" for metric, result in finetuned_results.items()),
    sep="\n",
)



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Baseline Results:
bleu: {'bleu': 0.08248386085408173, 'precisions': [0.10254185381788485, 0.0851403729310706, 0.0769912217121688, 0.06886491247385225], 'brevity_penalty': 1.0, 'length_ratio': 6.404707420725727, 'translation_length': 39184, 'reference_length': 6118}
rouge: {'rouge1': 0.41447380297519804, 'rouge2': 0.3713187815919462, 'rougeL': 0.41363571143814465, 'rougeLsum': 0.414381400160198}
meteor: {'meteor': 0.4002828854229779}
exact_match: {'exact_match': 0.391}

Fine-tuned Results:
bleu: {'bleu': 0.44941060895141816, 'precisions': [0.5334352311807413, 0.5245630609352858, 0.5321387940841866, 0.5383575532274417], 'brevity_penalty': 0.8445970053945707, 'length_ratio': 0.8555083360575352, 'translation_length': 5234, 'reference_length': 6118}
rouge: {'rouge1': 0.5170069249174545, 'rouge2': 0.3576785714285714, 'rougeL': 0.5163161189655757, 'rougeLsum': 0.5170885547474191}
meteor: {'meteor': 0.42789877241665897}
exact_match: {'exact_match': 0.456}


In [6]:
import pandas as pd
import evaluate


# CSV files read below for their "Prediction" and "Reference" columns.
# NOTE(review): "finetuneed" looks like a typo, but the name has to match the
# file on disk — confirm before renaming.
baseline_file, finetuned_file = "prednrefs_baseline.csv", "prednrefs_finetuneed.csv"

baseline_data = pd.read_csv(baseline_file)
finetuned_data = pd.read_csv(finetuned_file)

# Pull both columns out of each frame as plain Python lists.
baseline_predictions, baseline_references = (
    baseline_data[column].tolist() for column in ("Prediction", "Reference")
)
finetuned_predictions, finetuned_references = (
    finetuned_data[column].tolist() for column in ("Prediction", "Reference")
)




In [33]:
import ast
import evaluate

# Metric object used for the SQuAD v2 scoring below.
squad_metric = evaluate.load("squad_v2")

# Each baseline entry is evaluated as a Python literal to get the record back
# (assumes the CSV cells hold the string form of dicts — TODO confirm).
references = list(map(ast.literal_eval, baseline_references))
predictions = list(map(ast.literal_eval, baseline_predictions))


def _as_squad_reference(parsed_ref):
    """Return the reference record for one parsed entry.

    Every answer text found under parsed_ref["answers"]["text"] is paired with
    a fixed "answer_start" of 0; the entry's id is carried over unchanged.
    """
    question_id = parsed_ref["id"]
    gold_answers = [
        {"text": answer_text, "answer_start": 0}
        for answer_text in parsed_ref["answers"]["text"]
    ]
    return {"id": question_id, "answers": gold_answers}


def _as_squad_prediction(parsed_pred):
    """Return the prediction record for one parsed entry.

    The id and predicted text are carried over; "no_answer_probability" is
    always set to the default value 0.
    """
    return {
        "id": parsed_pred["id"],
        "prediction_text": parsed_pred["prediction_text"],
        "no_answer_probability": 0,
    }


formatted_references = [_as_squad_reference(entry) for entry in references]
formatted_predictions = [_as_squad_prediction(entry) for entry in predictions]

# Score the baseline predictions against the references.
results = squad_metric.compute(
    predictions=formatted_predictions,
    references=formatted_references,
)

# Report the two headline numbers, rounded to two decimals.
print("SQuAD v2 Metrics:")
print(f"Exact Match (EM): {results['exact']:.2f}")
print(f"F1 Score: {results['f1']:.2f}")


SQuAD v2 Metrics:
Exact Match (EM): 0.90
F1 Score: 3.71


In [43]:
# Dump the complete `results` dict held in the kernel, i.e. every key the
# metric returned rather than only 'exact' and 'f1'.
print(results)

{'exact': 0.9, 'f1': 3.7102392792679186, 'total': 2000, 'HasAns_exact': 0.3988035892323031, 'HasAns_f1': 6.0024711451005315, 'HasAns_total': 1003, 'NoAns_exact': 1.4042126379137412, 'NoAns_f1': 1.4042126379137412, 'NoAns_total': 997, 'best_exact': 49.85, 'best_exact_thresh': 0.0, 'best_f1': 49.86, 'best_f1_thresh': 0.0}


In [30]:
import ast
import evaluate

squad_metric = evaluate.load("squad_v2")

# Evaluate each fine-tuned entry as a Python literal to recover the record
# (assumes the CSV cells hold the string form of dicts — TODO confirm).
references = list(map(ast.literal_eval, finetuned_references))
predictions = list(map(ast.literal_eval, finetuned_predictions))

# One record per reference: its id plus every answer text, each paired with a
# fixed "answer_start" of 0.
formatted_references = [
    {
        "id": gold_entry["id"],
        "answers": [
            {"text": answer_text, "answer_start": 0}
            for answer_text in gold_entry["answers"]["text"]
        ],
    }
    for gold_entry in references
]

# One record per prediction: id, predicted text, and a constant
# "no_answer_probability" of 0.
formatted_predictions = [
    {
        "id": model_entry["id"],
        "prediction_text": model_entry["prediction_text"],
        "no_answer_probability": 0,
    }
    for model_entry in predictions
]

# Score the fine-tuned predictions against the references.
results = squad_metric.compute(
    predictions=formatted_predictions,
    references=formatted_references,
)

# Report the two headline numbers, rounded to two decimals.
print("SQuAD v2 Metrics:")
print(f"Exact Match (EM): {results['exact']:.2f}")
print(f"F1 Score: {results['f1']:.2f}")


SQuAD v2 Metrics:
Exact Match (EM): 0.95
F1 Score: 4.51
