In [1]:
import numpy as np
from scipy.stats import ttest_rel
import pickle

In [8]:
def load_scores(metric, experiment='exp2'):
    """Unpickle the score file for one metric.

    The file name is built as ``'<metric>_scores-<experiment>.pkl'`` and is
    resolved relative to the current working directory.

    Parameters
    ----------
    metric : str
        Metric prefix of the file name, e.g. ``'bleu'``.
    experiment : str, optional
        Experiment tag used as the file-name suffix (default ``'exp2'``).

    Returns
    -------
    object
        Whatever object was pickled into the file. ``ttest`` in this notebook
        consumes it as ``scores[lang][model]``.
    """
    # CAUTION: pickle.load can execute arbitrary code while deserialising.
    # Only load files that were produced by this project.
    with open(f'{metric}_scores-{experiment}.pkl', 'rb') as f:
        return pickle.load(f)


bleu_scores = load_scores('bleu')
meteor_scores = load_scores('meteor')
chrf_scores = load_scores('chrf')
bertscore_scores = load_scores('bertscore')

In [9]:
def ttest(metric_scores, alpha=0.05):
    """Print per-language mean scores and pairwise paired t-tests between models.

    For every language key in ``metric_scores`` the function prints the overall
    mean score of each model, then runs ``scipy.stats.ttest_rel`` on every
    unordered pair of models and prints the t statistic, the p-value and a
    significance verdict.

    Parameters
    ----------
    metric_scores : mapping
        ``metric_scores[lang]`` is a mapping ``model -> scores``. Each
        ``scores`` value must be convertible to a 2-D array, because it is
        averaged along axis 1 before the test is run.
        NOTE(review): presumably rows are runs and columns are per-sample
        scores -- confirm against the code that produced the pickles.
    alpha : float, optional
        Threshold compared with each raw p-value (default 0.05). No
        multiple-comparison correction is applied across the model pairs.

    Returns
    -------
    None
        Results are only printed to stdout.
    """
    for lang in metric_scores:
        print(f"\n### Language: {lang} ###")

        scores = metric_scores[lang]
        model_names = list(scores.keys())

        # Convert every model's scores to an array once; both the overall mean
        # and the per-row means used by the t-test are derived from it.
        score_arrays = {model: np.asarray(scores[model]) for model in model_names}

        # Overall mean score of each model (over all entries).
        print("\nPunteggi medi:")
        for model in model_names:
            print(f" - {model}: {np.mean(score_arrays[model]):.4f}")

        # One mean per row: these are the paired observations fed to the t-test.
        model_data = {model: np.mean(score_arrays[model], axis=1) for model in model_names}

        # Compare every unordered pair of models.
        print("\nT-test risultati:")
        for i, model_a in enumerate(model_names):
            for model_b in model_names[i + 1:]:
                t_stat, p_val = ttest_rel(model_data[model_a], model_data[model_b])

                print(f" - {model_a} vs {model_b}: t = {t_stat:.4f}, p = {p_val:.4f}")

                # Interpret the p-value against the chosen threshold.
                if p_val < alpha:
                    print(f"   → Differenza significativa tra {model_a} e {model_b}.")
                else:
                    print(f"   → Nessuna differenza significativa tra {model_a} e {model_b}.")

        print("\n" + "#" * 40)  # Separator between languages

In [10]:
# Run the pairwise significance report once per metric. Consecutive reports
# are separated by a blank line; nothing is printed after the last one.
_metric_reports = (
    ('BLEU', bleu_scores),
    ('METEOR', meteor_scores),
    ('CHRF', chrf_scores),
    ('BERTScore', bertscore_scores),
)
for _position, (_metric_label, _metric_table) in enumerate(_metric_reports):
    if _position > 0:
        print()
    print(_metric_label)
    ttest(_metric_table)

BLEU

### Language: en ###

Punteggi medi:
 - Llama-3.1-8B-Instruct-en: 0.6402
 - Mistral-Nemo-Instruct-2407-en: 0.6362
 - Qwen2.5-7B-Instruct-en: 0.6014

T-test risultati:
 - Llama-3.1-8B-Instruct-en vs Mistral-Nemo-Instruct-2407-en: t = 4.9523, p = 0.0384
   → Differenza significativa tra Llama-3.1-8B-Instruct-en e Mistral-Nemo-Instruct-2407-en.
 - Llama-3.1-8B-Instruct-en vs Qwen2.5-7B-Instruct-en: t = 20.4292, p = 0.0024
   → Differenza significativa tra Llama-3.1-8B-Instruct-en e Qwen2.5-7B-Instruct-en.
 - Mistral-Nemo-Instruct-2407-en vs Qwen2.5-7B-Instruct-en: t = 12.9994, p = 0.0059
   → Differenza significativa tra Mistral-Nemo-Instruct-2407-en e Qwen2.5-7B-Instruct-en.

########################################

### Language: it ###

Punteggi medi:
 - Llama-3.1-8B-Instruct-it: 0.6177
 - Mistral-Nemo-Instruct-2407-it: 0.6297
 - Qwen2.5-7B-Instruct-it: 0.5627

T-test risultati:
 - Llama-3.1-8B-Instruct-it vs Mistral-Nemo-Instruct-2407-it: t = -8.8831, p = 0.0124
   → Differenza 