In [1]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
from scipy.stats import ttest_rel

# evaluation 
from evaluate import load

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
models_generations = {
    'it': {},
}

files = open_files('generations')
for file in files:
    model = re.sub(r"^fine-tuned-|-decoding-gen\d+.csv", "", file)
    gen = re.search(r"gen\d+", file).group()

    model_generations = pd.read_csv('generations/' + file)
    # convert string to list
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)
    
    # if containt -en-
    if '-it-' in file:
        if model not in models_generations['it']:
            models_generations['it'][model] = {}
        models_generations['it'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: it")


model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen0, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen1, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen2, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it-sga - gen: gen0, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it-sga - gen: gen1, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it-sga - gen: gen2, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen2, lang: it
model: Llama-3.1-8B-Instruct-it-sga - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it-sga - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it-sga - gen: gen2, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen0, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen1, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen2, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen0, lang: it
model: Mistral-Nemo-

## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/bertscore">Bertscore</a>

In [2]:
bertscore = load("bertscore")

In [3]:
predictions = ["Il Monumento all'11° Fanteria del Mississippi, in Pennsylvania, è classificato come proprietà contribuente. La contea in cui si trova il monumento, Adams, si distingue da altre come ad esempio la contea di Carroll, situata in Maryland, a sud-est di Adams in Pennsylvania."]
references = [
    ["Il monumento dell'11° Fanteria del Mississippi, che rientra nella categoria delle proprietà contribuenti, si trova nella Contea di Adams, in Pennsylvania, negli Stati Uniti. A sud-est della Contea di Adams, Pennsylvania, si trova la Contea di Carroll, Maryland"],
]
results = bertscore.compute(predictions=predictions, references=references, lang='it')
print(results)

{'precision': [0.8031600117683411], 'recall': [0.8289094567298889], 'f1': [0.8158316016197205], 'hashcode': 'bert-base-multilingual-cased_L9_no-idf_version=0.3.12(hug_trans=4.41.0)'}


In [5]:
from tqdm import tqdm
import numpy as np

bertscore_scores = {'it': {}}

for lang in models_generations:
    for model in models_generations[lang]:
        model_bertscore_scores = []
        print(f'Language: {lang}, Model: {model}')
        
        gens = sorted(models_generations[lang][model])  # Ensure order (gen0, gen1, gen2)
        for gen in tqdm(gens, desc=f'Processing {model} ({lang})'):
            references = models_generations[lang][model][gen]['actual']
            predictions = models_generations[lang][model][gen]['prediction']

            results = bertscore.compute(predictions=predictions, references=references, lang=lang)
            model_bertscore_scores.append(results['f1'])  # Store bertscore for each generation
        
        # Store the three bertscore scores instead of their mean
        bertscore_scores[lang][model] = model_bertscore_scores
        print(f'bertscore scores for {model} ({lang}): {model_bertscore_scores} - Average: {np.mean(model_bertscore_scores)}')
        print()


Language: it, Model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it


Processing LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it (it):   0%|          | 0/3 [02:15<?, ?it/s]


KeyboardInterrupt: 

In [None]:
#save bertscore scores
import pickle

with open('bertscore_scores-exp4.pkl', 'wb') as f:
    pickle.dump(bertscore_scores, f)

In [None]:
# open bertscore_scores-exp1.pkl
with open('bertscore_scores-exp1.pkl', 'rb') as f:
    bertscore_scores_exp1 = pickle.load(f)

# open bertscore_scores-exp2.pkl
with open('bertscore_scores-exp3-parziale.pkl', 'rb') as f:
    bertscore_scores_exp2 = pickle.load(f)

In [5]:
bertscore_scores = {'it': {}}
for lang in bertscore_scores_exp1:
    if lang == 'it':
        for model in bertscore_scores_exp1[lang]:
            bertscore_scores[lang][model] = bertscore_scores_exp1[lang][model]
            print(model)

for lang in bertscore_scores_exp2:
    if lang == 'it':
        for model in bertscore_scores_exp2[lang]:
            bertscore_scores[lang][model] = bertscore_scores_exp2[lang][model]
            print(model)

Llama-3.1-8B-Instruct-it
Mistral-Nemo-Instruct-2407-it
Qwen2.5-7B-Instruct-it
LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it
Minerva-7B-instruct-v1.0-it


In [6]:
#save
with open('bertscore_scores-exp3.pkl', 'wb') as f:
    pickle.dump(bertscore_scores, f)

In [11]:
print(len(bertscore_scores['it']['LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it'][0]))
print(len(bertscore_scores['it']['LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it'][1]))
print(len(bertscore_scores['it']['LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it'][2]))

1779
1779
1779


In [9]:
# Perform t-test between models using actual bertscore scores
for lang in bertscore_scores:
    models = list(bertscore_scores[lang].keys())
    print(f"\nT-test results for {lang.upper()} models:\n")
    
    for i in range(len(models)):
        for j in range(i+1, len(models)):
            model_1, model_2 = models[i], models[j]
            bertscore_1 = bertscore_scores[lang][model_1]  # Three bertscore scores from gen0, gen1, gen2
            bertscore_2 = bertscore_scores[lang][model_2]  # Same for second model
            
            t_stat, p_value = ttest_rel(bertscore_1, bertscore_2)  # Use real bertscore values           
            print(f"T-test between {model_1} and {model_2}: t-stat={t_stat:.4f}, p-value={p_value:.4f}")


T-test results for EN models:



  return hypotest_fun_in(*args, **kwds)


TypeError: unsupported format string passed to numpy.ndarray.__format__

In [13]:
from scipy.stats import ttest_rel
import numpy as np

# Perform t-test between models using the average BERTScore
for lang in bertscore_scores:
    models = list(bertscore_scores[lang].keys())
    print(f"\nT-test results for {lang.upper()} models:\n")
    
    for i in range(len(models)):
        for j in range(i+1, len(models)):
            model_1, model_2 = models[i], models[j]
            bertscore_1 = np.mean(bertscore_scores[lang][model_1], axis=0)  # Compute average across generations
            bertscore_2 = np.mean(bertscore_scores[lang][model_2], axis=0)  # Compute average across generations
            
            # Ensure both arrays have the same length
            if bertscore_1.shape != bertscore_2.shape:
                print(f"Skipping T-test between {model_1} and {model_2}: Mismatched shapes {bertscore_1.shape} vs {bertscore_2.shape}")
                continue

            # Check if the values are too similar to avoid precision loss
            if np.allclose(bertscore_1, bertscore_2):
                print(f"T-test between {model_1} and {model_2}: Skipped (scores are nearly identical)")
                continue

            # Perform the t-test
            t_stat, p_value = ttest_rel(bertscore_1, bertscore_2)

            # Print results, ensuring they are scalars
            print(f"T-test between {model_1} and {model_2}: t-stat={float(t_stat):.4f}, p-value={float(p_value):.4f}")



T-test results for IT models:

T-test between Llama-3.1-8B-Instruct-it and Mistral-Nemo-Instruct-2407-it: t-stat=-4.1938, p-value=0.0000
T-test between Llama-3.1-8B-Instruct-it and Qwen2.5-7B-Instruct-it: t-stat=9.0535, p-value=0.0000
T-test between Llama-3.1-8B-Instruct-it and LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it: t-stat=1.0574, p-value=0.2905
T-test between Llama-3.1-8B-Instruct-it and Minerva-7B-instruct-v1.0-it: t-stat=14.3407, p-value=0.0000
T-test between Mistral-Nemo-Instruct-2407-it and Qwen2.5-7B-Instruct-it: t-stat=12.4694, p-value=0.0000
T-test between Mistral-Nemo-Instruct-2407-it and LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it: t-stat=4.8145, p-value=0.0000
T-test between Mistral-Nemo-Instruct-2407-it and Minerva-7B-instruct-v1.0-it: t-stat=17.3447, p-value=0.0000
T-test between Qwen2.5-7B-Instruct-it and LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it: t-stat=-8.2494, p-value=0.0000
T-test between Qwen2.5-7B-Instruct-it and Minerva-7B-instruct-v1.0-it: t-stat=5.8944, p-value=0.0000
T-t

In [29]:
import pandas as pd
from scipy.stats import ttest_rel
import numpy as np

# Lista per salvare i risultati
t_test_results = []

# Analisi per ogni lingua
for lang in bertscore_scores:
    models = list(bertscore_scores[lang].keys())

    # Ordina i modelli dal bertscore più alto al più basso
    sorted_models = sorted(models, key=lambda m: np.mean(bertscore_scores[lang][m]), reverse=True)

    # Creiamo una lista per la tabella finale
    results_table = []

    print(f"\n📊 **bertscore Score Ranking for {lang.upper()}**\n")
    
    # Confrontiamo ogni modello con quello successivo nella lista ordinata
    for i in range(len(sorted_models)):
        model_1 = sorted_models[i]
        bertscore_1 = bertscore_scores[lang][model_1]  # bertscore scores per il primo modello
        mean_bertscore_1 = np.mean(bertscore_1)  # bertscore medio del primo modello

        if i < len(sorted_models) - 1:
            model_2 = sorted_models[i + 1]
            bertscore_2 = bertscore_scores[lang][model_2]  # bertscore scores per il secondo modello
            mean_bertscore_2 = np.mean(bertscore_2)  # bertscore medio del secondo modello

            # Calcoliamo il t-test
            t_stat, p_value = ttest_rel(bertscore_1, bertscore_2)

            # Verifica se la differenza è statisticamente significativa
            significant = "✅ Yes" if p_value < 0.05 else "❌ No"
        else:
            model_2, t_stat, p_value, significant = "-", "-", "-", "-"

        # Aggiungiamo i risultati alla tabella
        results_table.append({
            "Model": model_1,
            "bertscore Score": round(mean_bertscore_1, 4),
            "Compared with": model_2,
            "T-Statistic": round(t_stat, 4) if t_stat != "-" else "-",
            "P-Value": round(p_value, 4) if p_value != "-" else "-",
            "Significant?": significant
        })

    # Creiamo un DataFrame per la tabella finale
    df_results = pd.DataFrame(results_table)

    # Stampiamo la tabella
    print(df_results)

    # Se vuoi salvare la tabella
    df_results.to_csv(f"bertscore_t_test_{lang}.csv", index=False)



📊 **bertscore Score Ranking for EN**



ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [14]:
import pandas as pd
from scipy.stats import ttest_rel
import numpy as np

# Lista per salvare i risultati
t_test_results = []

# Analisi per ogni lingua
for lang in bertscore_scores:
    models = list(bertscore_scores[lang].keys())

    # Ordina i modelli dal BERTScore medio più alto al più basso
    sorted_models = sorted(models, key=lambda m: np.mean(bertscore_scores[lang][m]), reverse=True)

    # Lista per la tabella finale
    results_table = []

    print(f"\n📊 **BERTScore Ranking for {lang.upper()}**\n")
    
    # Confrontiamo ogni modello con quello successivo nella lista ordinata
    for i in range(len(sorted_models) - 1):  # L'ultimo modello non ha successivo con cui confrontarsi
        model_1 = sorted_models[i]
        model_2 = sorted_models[i + 1]

        # Prendiamo i punteggi e calcoliamo la media
        bertscore_1 = np.array(bertscore_scores[lang][model_1]).flatten()
        bertscore_2 = np.array(bertscore_scores[lang][model_2]).flatten()

        mean_bertscore_1 = np.mean(bertscore_1)
        mean_bertscore_2 = np.mean(bertscore_2)

        # Verifica che le lunghezze siano uguali
        if bertscore_1.shape != bertscore_2.shape:
            print(f"Skipping {model_1} vs {model_2}: Mismatched shapes {bertscore_1.shape} vs {bertscore_2.shape}")
            continue

        # Se i dati sono quasi identici, evitiamo il t-test
        if np.allclose(bertscore_1, bertscore_2):
            print(f"T-test skipped for {model_1} vs {model_2}: Scores are nearly identical")
            t_stat, p_value, significant = "-", "-", "❌ No (Identical Scores)"
        else:
            # Calcoliamo il t-test
            t_stat, p_value = ttest_rel(bertscore_1, bertscore_2)

            # Verifica se la differenza è statisticamente significativa
            significant = "✅ Yes" if p_value < 0.05 else "❌ No"

        # Aggiungiamo i risultati alla tabella
        results_table.append({
            "Model": model_1,
            "BERTScore": round(mean_bertscore_1, 4),
            "Compared with": model_2,
            "T-Statistic": round(float(t_stat), 4) if t_stat != "-" else "-",
            "P-Value": round(float(p_value), 4) if p_value != "-" else "-",
            "Significant?": significant
        })

    # Crea un DataFrame per la tabella finale
    df_results = pd.DataFrame(results_table)

    # Stampiamo la tabella
    print(df_results)

    # Salviamo la tabella come CSV
    df_results.to_csv(f"bertscore_t_test_{lang}.csv", index=False)



📊 **BERTScore Ranking for IT**

                                   Model  BERTScore  \
0          Mistral-Nemo-Instruct-2407-it     0.9071   
1               Llama-3.1-8B-Instruct-it     0.9036   
2  LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it     0.9028   
3                 Qwen2.5-7B-Instruct-it     0.8948   

                           Compared with  T-Statistic  P-Value Significant?  
0               Llama-3.1-8B-Instruct-it       6.9481   0.0000        ✅ Yes  
1  LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it       1.6389   0.1013         ❌ No  
2                 Qwen2.5-7B-Instruct-it      13.5671   0.0000        ✅ Yes  
3            Minerva-7B-instruct-v1.0-it       9.5978   0.0000        ✅ Yes  
