In [None]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
from scipy.stats import ttest_rel

# evaluation 
from evaluate import load

In [None]:
def open_files(directory):
    """Return the names of the entries in `directory`, sorted alphabetically.

    Only the entry names are listed (via os.listdir); no file is opened or read.
    """
    return sorted(os.listdir(directory))

In [None]:
# Load every generation CSV into models_generations[lang][model][gen].
# A file is loaded only if its name ends in ".csv", contains "gen<N>", and
# contains a "-en-" or "-it-" language tag. The model key is the file name with
# the "fine-tuned-" prefix and the "-decoding-gen<N>.csv" suffix stripped.
models_generations = {
    'en': {},
    'it': {},
}

files = open_files('generations')
for file in files:
    gen_match = re.search(r"gen\d+", file)
    if gen_match is None or not file.endswith('.csv'):
        # Stray directory entries (e.g. .DS_Store, .ipynb_checkpoints) are not
        # generation files; calling .group() on a failed match would raise.
        continue

    # Resolve the language before reading, so untagged files are never parsed.
    # 'en' is checked before 'it', matching the dict order above.
    lang = next((code for code in models_generations if f'-{code}-' in file), None)
    if lang is None:
        continue

    # The dot is escaped and the suffix anchored so only a real ".csv" ending is stripped.
    model = re.sub(r"^fine-tuned-|-decoding-gen\d+\.csv$", "", file)
    gen = gen_match.group()

    model_generations = pd.read_csv(os.path.join('generations', file))
    # The 'actual' column is stored as the string repr of a Python literal; parse it back.
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)

    models_generations[lang].setdefault(model, {})[gen] = model_generations
    print(f"model: {model} - gen: {gen}, lang: {lang}")


## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/bertscore">BERTScore</a>

In [None]:
# Load the "bertscore" metric through the `evaluate` library's load();
# the resulting object is used below via bertscore.compute(...).
bertscore = load("bertscore")

In [None]:
from tqdm import tqdm
import numpy as np

# Compute BERTScore F1 for every (language, model, generation) triple.
# bertscore_scores[lang][model] is a list with one entry per generation, in
# numeric gen order; each entry is the list of per-sample F1 values returned
# by the metric (results['f1']).
bertscore_scores = {'en': {}, 'it': {}}

for lang in models_generations:
    for model in models_generations[lang]:
        model_bertscore_scores = []
        print(f'Language: {lang}, Model: {model}')

        # Sort by the numeric suffix so that e.g. gen10 comes after gen2
        # (a plain string sort would put it before).
        gens = sorted(models_generations[lang][model], key=lambda g: int(g[3:]))
        for gen in tqdm(gens, desc=f'Processing {model} ({lang})'):
            generations_df = models_generations[lang][model][gen]
            # Hand the metric plain Python lists rather than pandas Series, so the
            # result does not depend on the DataFrame's index labels.
            references = generations_df['actual'].tolist()
            predictions = generations_df['prediction'].tolist()

            results = bertscore.compute(predictions=predictions, references=references, lang=lang)
            model_bertscore_scores.append(results['f1'])  # per-sample F1 for this generation

        # Keep the raw per-sample scores of every generation (not just their mean).
        bertscore_scores[lang][model] = model_bertscore_scores

        # Print one mean per generation instead of dumping every per-sample score.
        # Averaging the pooled values also works when generations differ in length.
        gen_means = [round(float(np.mean(f1_values)), 4) for f1_values in model_bertscore_scores]
        pooled_f1 = [value for f1_values in model_bertscore_scores for value in f1_values]
        print(f'bertscore scores for {model} ({lang}): {gen_means} - Average: {np.mean(pooled_f1)}')
        print()


In [None]:
# Save the BERTScore results to disk with pickle.
import pickle

# NOTE(review): the dump below is commented out, presumably so that re-running the
# notebook does not overwrite a previously saved results file; confirm before re-enabling.
#with open('bertscore_scores.pkl', 'wb') as f:
#    pickle.dump(bertscore_scores, f)

In [None]:
# Load the scores stored in 'bertscore_scores-exp1.pkl' (presumably experiment 1).
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('bertscore_scores-exp1.pkl', 'rb') as f:
    bertscore_scores_exp1 = pickle.load(f)

# Load the scores stored in 'bertscore_scores-exp3-parziale.pkl' ("parziale" = partial).
# NOTE(review): the variable is named *_exp2 although the file name says exp3;
# confirm which experiment these results belong to.
with open('bertscore_scores-exp3-parziale.pkl', 'rb') as f:
    bertscore_scores_exp2 = pickle.load(f)

In [None]:
# Merge the Italian ('it') per-model scores of both loaded result sets into one
# dict. The second set is applied last, so it wins when a model appears in both.
# Each model name is printed as it is copied.
bertscore_scores = {'it': {}}
for experiment_scores in (bertscore_scores_exp1, bertscore_scores_exp2):
    if 'it' not in experiment_scores:
        continue
    for model in experiment_scores['it']:
        bertscore_scores['it'][model] = experiment_scores['it'][model]
        print(model)

In [None]:
# Write the current bertscore_scores dict to 'bertscore_scores-exp3.pkl'.
# The file is opened in 'wb' mode, so an existing file of that name is overwritten.
with open('bertscore_scores-exp3.pkl', 'wb') as f:
    pickle.dump(bertscore_scores, f)

In [None]:
# Sanity check: print how many scores are stored for each of the first three
# entries of one Italian model.
anita_scores = bertscore_scores['it']['LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it']
for generation_index in range(3):
    print(len(anita_scores[generation_index]))

In [None]:
# Paired t-test between every pair of models on their per-sample BERTScore values.
#
# Each model's entry holds one list of per-sample scores per generation. Passing
# that nested structure straight to ttest_rel runs the test along axis 0 and
# returns one statistic per sample, which cannot be formatted with ':.4f'.
# The generations are therefore pooled into a single 1-D vector per model first.
# NOTE(review): a paired test assumes both models were scored on the same samples
# in the same order; confirm this holds for the loaded files.
for lang in bertscore_scores:
    models = list(bertscore_scores[lang].keys())
    print(f"\nT-test results for {lang.upper()} models:\n")

    for i in range(len(models)):
        for j in range(i+1, len(models)):
            model_1, model_2 = models[i], models[j]
            # atleast_1d lets this also work if an entry is a single scalar score.
            bertscore_1 = np.concatenate([np.atleast_1d(g) for g in bertscore_scores[lang][model_1]])
            bertscore_2 = np.concatenate([np.atleast_1d(g) for g in bertscore_scores[lang][model_2]])

            # ttest_rel needs the same number of observations on both sides.
            if bertscore_1.shape != bertscore_2.shape:
                print(f"Skipping T-test between {model_1} and {model_2}: Mismatched shapes {bertscore_1.shape} vs {bertscore_2.shape}")
                continue

            t_stat, p_value = ttest_rel(bertscore_1, bertscore_2)
            print(f"T-test between {model_1} and {model_2}: t-stat={float(t_stat):.4f}, p-value={float(p_value):.4f}")

In [None]:
from scipy.stats import ttest_rel
import numpy as np

# Paired t-test for every pair of models, run on the scores averaged over
# axis 0 of each model's entry (i.e. across its generations).
for lang, scores_by_model in bertscore_scores.items():
    models = list(scores_by_model.keys())
    print(f"\nT-test results for {lang.upper()} models:\n")

    for position, model_1 in enumerate(models):
        for model_2 in models[position + 1:]:
            # Collapse the generations of each model into one averaged vector.
            bertscore_1 = np.mean(scores_by_model[model_1], axis=0)
            bertscore_2 = np.mean(scores_by_model[model_2], axis=0)

            # A paired test is only defined for equally shaped inputs.
            if bertscore_1.shape != bertscore_2.shape:
                print(f"Skipping T-test between {model_1} and {model_2}: Mismatched shapes {bertscore_1.shape} vs {bertscore_2.shape}")
                continue

            # Nearly identical vectors would make the test numerically meaningless.
            if np.allclose(bertscore_1, bertscore_2):
                print(f"T-test between {model_1} and {model_2}: Skipped (scores are nearly identical)")
                continue

            t_stat, p_value = ttest_rel(bertscore_1, bertscore_2)

            # Cast to float so the values format as plain scalars.
            print(f"T-test between {model_1} and {model_2}: t-stat={float(t_stat):.4f}, p-value={float(p_value):.4f}")


In [None]:
import pandas as pd
from scipy.stats import ttest_rel
import numpy as np

# Rank the models of each language by mean BERTScore and run a paired t-test
# between each model and the next one in the ranking. The table is printed and
# written to "bertscore_t_test_<lang>.csv".
#
# Each model's entry is a list of per-generation score lists. Feeding that nested
# structure to ttest_rel yields arrays (one statistic per sample), which breaks
# both the `p_value < 0.05` check and the rounding. The generations are therefore
# pooled into one 1-D vector per model before any statistic is computed.

# Results list; not populated in this cell, kept in case other cells rely on the name.
t_test_results = []

for lang in bertscore_scores:
    models = list(bertscore_scores[lang].keys())

    # One pooled 1-D vector of scores per model.
    flat_scores = {
        m: np.concatenate([np.atleast_1d(g) for g in bertscore_scores[lang][m]])
        for m in models
    }

    # Highest mean BERTScore first.
    sorted_models = sorted(models, key=lambda m: np.mean(flat_scores[m]), reverse=True)

    results_table = []

    print(f"\n📊 **bertscore Score Ranking for {lang.upper()}**\n")

    for i, model_1 in enumerate(sorted_models):
        bertscore_1 = flat_scores[model_1]
        mean_bertscore_1 = float(np.mean(bertscore_1))

        # Placeholders for the last model, or when the test cannot be run.
        model_2, t_stat_cell, p_value_cell, significant = "-", "-", "-", "-"

        if i < len(sorted_models) - 1:
            model_2 = sorted_models[i + 1]
            bertscore_2 = flat_scores[model_2]

            if bertscore_1.shape != bertscore_2.shape:
                # A paired test needs the same number of observations on both sides.
                print(f"Skipping {model_1} vs {model_2}: Mismatched shapes {bertscore_1.shape} vs {bertscore_2.shape}")
            else:
                t_stat, p_value = ttest_rel(bertscore_1, bertscore_2)
                t_stat_cell = round(float(t_stat), 4)
                p_value_cell = round(float(p_value), 4)
                # Significance at the 5% level.
                significant = "✅ Yes" if p_value < 0.05 else "❌ No"

        results_table.append({
            "Model": model_1,
            "bertscore Score": round(mean_bertscore_1, 4),
            "Compared with": model_2,
            "T-Statistic": t_stat_cell,
            "P-Value": p_value_cell,
            "Significant?": significant
        })

    df_results = pd.DataFrame(results_table)

    print(df_results)

    # Persist the table for this language.
    df_results.to_csv(f"bertscore_t_test_{lang}.csv", index=False)


In [None]:
import pandas as pd
from scipy.stats import ttest_rel
import numpy as np

# Rank the models of each language by mean BERTScore and compare each model with
# the next one in the ranking using a paired t-test on the flattened scores.
# The last model has no successor and therefore gets no row of its own.

# Results list (not populated in this cell).
t_test_results = []

for lang, scores_by_model in bertscore_scores.items():
    models = list(scores_by_model.keys())

    # Highest mean BERTScore first.
    sorted_models = sorted(models, key=lambda m: np.mean(scores_by_model[m]), reverse=True)

    # Rows of the final table.
    results_table = []

    print(f"\n📊 **BERTScore Ranking for {lang.upper()}**\n")

    # Walk over consecutive (model, next model) pairs of the ranking.
    for model_1, model_2 in zip(sorted_models, sorted_models[1:]):
        # Flatten every generation's scores into a single 1-D vector per model.
        bertscore_1 = np.array(scores_by_model[model_1]).flatten()
        bertscore_2 = np.array(scores_by_model[model_2]).flatten()

        mean_bertscore_1 = np.mean(bertscore_1)
        mean_bertscore_2 = np.mean(bertscore_2)

        # The paired test requires vectors of equal length.
        if bertscore_1.shape != bertscore_2.shape:
            print(f"Skipping {model_1} vs {model_2}: Mismatched shapes {bertscore_1.shape} vs {bertscore_2.shape}")
            continue

        if np.allclose(bertscore_1, bertscore_2):
            # Nearly identical data: skip the test and record placeholders.
            print(f"T-test skipped for {model_1} vs {model_2}: Scores are nearly identical")
            t_stat, p_value, significant = "-", "-", "❌ No (Identical Scores)"
            t_stat_cell, p_value_cell = "-", "-"
        else:
            t_stat, p_value = ttest_rel(bertscore_1, bertscore_2)

            # Significance at the 5% level.
            if p_value < 0.05:
                significant = "✅ Yes"
            else:
                significant = "❌ No"
            t_stat_cell = round(float(t_stat), 4)
            p_value_cell = round(float(p_value), 4)

        results_table.append({
            "Model": model_1,
            "BERTScore": round(mean_bertscore_1, 4),
            "Compared with": model_2,
            "T-Statistic": t_stat_cell,
            "P-Value": p_value_cell,
            "Significant?": significant
        })

    # Build the final table for this language.
    df_results = pd.DataFrame(results_table)

    print(df_results)

    # Save the table as CSV.
    df_results.to_csv(f"bertscore_t_test_{lang}.csv", index=False)
