In [None]:
import os
from itertools import combinations
from pathlib import Path

import pandas as pd
from scipy.stats import pearsonr, spearmanr, kendalltau
from scipy.stats import f_oneway, ttest_ind

# The metrics file lives in the `data` directory next to the current working
# directory's parent, so the notebook must be run from its own folder.
file_path = Path.cwd().parent / 'data/summarization_evaluation_metrics.csv'

# Parser settings: the file is tab-separated despite its .csv extension, and
# quoting=1 is the numeric value of csv.QUOTE_ALL.
read_options = dict(
    sep='\t',
    quoting=1,
    quotechar='"',
    escapechar='\\',
    doublequote=True,
    engine='python',
)
existing_df = pd.read_csv(file_path, **read_options)

# Derive ROUGE-AVG as the row-wise mean of the three ROUGE F-scores on a new
# frame, leaving `existing_df` exactly as it was loaded.
rouge_columns = ['rouge-1_f', 'rouge-2_f', 'rouge-l_f']
final_df = existing_df.assign(**{'ROUGE-AVG': existing_df[rouge_columns].mean(axis=1)})

# Metrics whose agreement with ROUGE-AVG is reported below.
metrics = ['bert_f1', 'meteor']

for metric in metrics:
    # Keep only rows where both the metric and ROUGE-AVG are present.
    paired = final_df[[metric, 'ROUGE-AVG']].dropna()
    metric_scores = paired[metric]
    rouge_scores = paired['ROUGE-AVG']

    # Each SciPy call returns (statistic, p-value); only the statistic is reported.
    pearson_corr = pearsonr(metric_scores, rouge_scores)[0]
    spearman_corr = spearmanr(metric_scores, rouge_scores)[0]
    kendall_corr = kendalltau(metric_scores, rouge_scores)[0]

    report_lines = [
        f"\nCorrelations between {metric} and ROUGE-AVG:",
        f"  Pearson correlation:  {pearson_corr:.4f}",
        f"  Spearman correlation: {spearman_corr:.4f}",
        f"  Kendall's Tau:        {kendall_corr:.4f}",
    ]
    print("\n".join(report_lines))



Correlations between bert_f1 and ROUGE-AVG:
  Pearson correlation:  0.4308
  Spearman correlation: 0.3238
  Kendall's Tau:        0.2209

Correlations between meteor and ROUGE-AVG:
  Pearson correlation:  0.9461
  Spearman correlation: 0.8620
  Kendall's Tau:        0.6920


In [None]:
# One-way ANOVA on ROUGE-AVG across the model_name groups of final_df.
MIN_GROUP_SIZE = 5  # a group is kept only if it has more than this many scored rows
ALPHA = 0.05        # significance threshold for the F-test

# Drop missing ROUGE-AVG values *before* applying the size filter, so the filter
# counts the observations that are actually handed to f_oneway. Counting raw
# rows could let an empty or near-empty sample through and make the result NaN.
scored = final_df.dropna(subset=['ROUGE-AVG'])
groups = [
    scores
    for _, scores in scored.groupby('model_name')['ROUGE-AVG']
    if len(scores) > MIN_GROUP_SIZE
]

print("ANOVA test across model_name groups (ROUGE-AVG):")

if len(groups) < 2:
    # f_oneway requires at least two samples; report the situation instead of raising.
    print("  => Skipped: fewer than two model_name groups have enough ROUGE-AVG scores.")
else:
    f_stat, p_val = f_oneway(*groups)

    print(f"  F-statistic: {f_stat:.4f}")
    # General format keeps very small p-values visible instead of rounding them to 0.0000.
    print(f"  p-value:     {p_val:.4g}")

    if p_val < ALPHA:
        print("=> At least one model differs significantly.")
    else:
        print("=> No significant differences among models.")


In [None]:
# Pairwise Welch t-tests (equal_var=False) on ROUGE-AVG between the selected
# checkpoints, judged against a Bonferroni-corrected significance threshold.
checkpoints = [
    'facebook/bart-large-cnn',
    'google-t5/t5-base',
    'google/pegasus-x-large',
    'human-centered-summarization/financial-summarization-pegasus',
]

ALPHA = 0.05  # family-wise significance level before correction

# ROUGE-AVG scores per checkpoint with missing values removed; same order as `checkpoints`.
models = [
    final_df.loc[final_df['model_name'] == checkpoint, 'ROUGE-AVG'].dropna()
    for checkpoint in checkpoints
]

# Every unordered pair of (checkpoint name, scores), in the same order the
# nested index loops would visit them.
pairs = list(combinations(zip(checkpoints, models), 2))

# Running several tests at ALPHA each inflates the chance of a false positive,
# so each individual test is compared against ALPHA / number_of_tests.
corrected_alpha = ALPHA / max(len(pairs), 1)

print("Pairwise t-tests on ROUGE-AVG scores between models:\n")
print(f"Bonferroni-corrected threshold: {corrected_alpha:.4g} ({len(pairs)} comparisons)\n")

for (name_1, model_1), (name_2, model_2) in pairs:
    print(f"{name_1} vs {name_2}:")

    # With fewer than two scores on either side the test statistic is undefined
    # (NaN); say so rather than reporting it as "no significant difference".
    if len(model_1) < 2 or len(model_2) < 2:
        print("  => Skipped: fewer than two ROUGE-AVG scores for at least one model\n")
        continue

    t_stat, p_val = ttest_ind(model_1, model_2, equal_var=False)

    print(f"  t-statistic: {t_stat:.4f}")
    # General format keeps very small p-values visible instead of rounding them to 0.0000.
    print(f"  p-value:     {p_val:.4g}")
    if p_val < corrected_alpha:
        print("  => Statistically significant difference\n")
    else:
        print("  => No significant difference\n")


Pairwise t-tests on ROUGE-AVG scores between models:

facebook/bart-large-cnn vs google-t5/t5-base:
  t-statistic: 3.8370
  p-value:     0.0001
  => Statistically significant difference

facebook/bart-large-cnn vs google/pegasus-x-large:
  t-statistic: -1.2734
  p-value:     0.2033
  => No significant difference

facebook/bart-large-cnn vs human-centered-summarization/financial-summarization-pegasus:
  t-statistic: 5.0744
  p-value:     0.0000
  => Statistically significant difference

google-t5/t5-base vs google/pegasus-x-large:
  t-statistic: -5.2379
  p-value:     0.0000
  => Statistically significant difference

google-t5/t5-base vs human-centered-summarization/financial-summarization-pegasus:
  t-statistic: 0.4209
  p-value:     0.6740
  => No significant difference

google/pegasus-x-large vs human-centered-summarization/financial-summarization-pegasus:
  t-statistic: 6.8359
  p-value:     0.0000
  => Statistically significant difference

