In [1]:
import numpy as np
import pickle
from scipy.stats import wilcoxon
from itertools import combinations

In [2]:
def _read_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# One pickle per evaluation metric, all from experiment 1.
bleu_scores = _read_pickle('bleu_scores-exp1.pkl')
meteor_scores = _read_pickle('meteor_scores-exp1.pkl')
chrf_scores = _read_pickle('chrf_scores-exp1.pkl')
bertscore_scores = _read_pickle('bertscore_scores-exp1.pkl')

In [3]:
def averageScores(scores):
    """Average the repeated runs of every model, sample by sample.

    Parameters
    ----------
    scores : mapping
        ``scores[lang][model]`` is a collection of runs addressed by position
        (``[0]``, ``[1]``, ...); each run is a sequence of per-sample scores.

    Returns
    -------
    dict
        A new ``{lang: {model: [mean_0, mean_1, ...]}}`` mapping where
        ``mean_i`` is the mean of sample ``i`` across all runs. The input
        mapping is left untouched, so the raw per-run data stays available.

    Raises
    ------
    ValueError
        If a model has no runs, or if its runs do not all have the same
        number of samples (element-wise averaging would be ill-defined).
    """
    averaged = {}
    for lang, models in scores.items():
        averaged[lang] = {}
        for model, runs in models.items():
            n_runs = len(runs)
            if n_runs == 0:
                raise ValueError(f"No runs to average for {lang!r}/{model!r}.")

            # Address runs by position, exactly like the original [0]/[1]/[2] access.
            run_lists = [runs[k] for k in range(n_runs)]
            n_samples = len(run_lists[0])
            if any(len(run) != n_samples for run in run_lists):
                raise ValueError(
                    f"Runs for {lang!r}/{model!r} have different lengths: "
                    f"{[len(run) for run in run_lists]}."
                )

            # Left-to-right sum divided by the run count; for three runs this is
            # the same arithmetic as (run0[i] + run1[i] + run2[i]) / 3.
            averaged[lang][model] = [
                sum(run[i] for run in run_lists) / n_runs
                for i in range(n_samples)
            ]
    return averaged

# Collapse the repeated runs of each metric into per-sample means, rebinding
# the original names to the averaged result (same call order as before).
# NOTE(review): after this cell the names no longer hold per-run lists, so
# re-running it without reloading the pickles presumably fails - confirm.
bleu_scores, meteor_scores, chrf_scores, bertscore_scores = [
    averageScores(metric)
    for metric in (bleu_scores, meteor_scores, chrf_scores, bertscore_scores)
]

In [None]:
def statistical_tests(metric_scores, alpha=0.05):
    """Print per-language model means and pairwise Wilcoxon signed-rank tests.

    Parameters
    ----------
    metric_scores : mapping
        ``metric_scores[lang]`` maps each model name to a sequence of scores.
        The Wilcoxon test is paired, so sample ``i`` of every model within a
        language is compared position by position.
    alpha : float, optional
        Significance threshold applied to each pairwise p-value (default 0.05).
        No multiple-comparison correction is applied here; pass an adjusted
        threshold (e.g. ``0.05 / number_of_pairs``) if one is wanted.

    Returns
    -------
    None
        The report is written to stdout.

    Raises
    ------
    ValueError
        If two models of the same language have differently shaped samples.
    """
    print("\n===== STATISTICAL TESTS =====")

    for lang in metric_scores:
        print(f"\n### Language: {lang} ###")
        scores = metric_scores[lang]
        model_names = list(scores.keys())
        model_data = {model: np.asarray(scores[model]) for model in model_names}

        # Mean score of each model
        for model in model_names:
            print(f" - {model}: {model_data[model].mean():.4f}")

        # Pairwise Wilcoxon signed-rank tests over every pair of models
        for model_a, model_b in combinations(model_names, 2):
            sample_a, sample_b = model_data[model_a], model_data[model_b]
            if sample_a.shape != sample_b.shape:
                raise ValueError(
                    f"Cannot pair {model_a} {sample_a.shape} with "
                    f"{model_b} {sample_b.shape}: the samples must align one-to-one."
                )

            # With no non-zero difference the test is undefined: depending on the
            # SciPy version wilcoxon() raises or returns NaN, so report it instead.
            differences = sample_a - sample_b
            if not np.any(differences != 0):
                print(f" - {model_a} vs {model_b}: Wilcoxon test skipped (no non-zero paired differences)")
                print(f"   → Nessuna differenza significativa tra {model_a} e {model_b}.\n")
                continue

            stat, p_val = wilcoxon(sample_a, sample_b)
            print(f" - {model_a} vs {model_b}: Wilcoxon statistic={stat:.4f}, p-value={p_val:.4f}")

            if p_val < alpha:
                print(f"   → Differenza significativa tra {model_a} e {model_b}.")
            else:
                print(f"   → Nessuna differenza significativa tra {model_a} e {model_b}.\n")

        print("\n" + "#" * 40)  # Separator between languages


In [5]:
# Report every metric in turn, with one blank line between consecutive reports.
metric_reports = (
    ('BLEU', bleu_scores),
    ('METEOR', meteor_scores),
    ('CHRF', chrf_scores),
    ('BERTScore', bertscore_scores),
)

for position, (label, metric) in enumerate(metric_reports):
    if position > 0:
        print()  # separator printed after every report except the last
    print(label)
    statistical_tests(metric)

BLEU

===== STATISTICAL TESTS =====

### Language: en ###
 - Llama-3.1-8B-Instruct-en: 0.5485
 - Mistral-Nemo-Instruct-2407-en: 0.5416
 - Qwen2.5-7B-Instruct-en: 0.5207
 - Llama-3.1-8B-Instruct-en vs Mistral-Nemo-Instruct-2407-en: Wilcoxon statistic=489287.0000, p-value=0.0412
   → Differenza significativa tra Llama-3.1-8B-Instruct-en e Mistral-Nemo-Instruct-2407-en.
 - Llama-3.1-8B-Instruct-en vs Qwen2.5-7B-Instruct-en: Wilcoxon statistic=543470.5000, p-value=0.0000
   → Differenza significativa tra Llama-3.1-8B-Instruct-en e Qwen2.5-7B-Instruct-en.
 - Mistral-Nemo-Instruct-2407-en vs Qwen2.5-7B-Instruct-en: Wilcoxon statistic=548284.0000, p-value=0.0000
   → Differenza significativa tra Mistral-Nemo-Instruct-2407-en e Qwen2.5-7B-Instruct-en.

########################################

### Language: it ###
 - Llama-3.1-8B-Instruct-it: 0.5180
 - Mistral-Nemo-Instruct-2407-it: 0.5430
 - Qwen2.5-7B-Instruct-it: 0.4693
 - Llama-3.1-8B-Instruct-it vs Mistral-Nemo-Instruct-2407-it: Wilcoxon 