In [4]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
from scipy.stats import ttest_rel
import pickle

# evaluation 
from evaluate import load

In [5]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [6]:
models_generations = {
    'en': {},
    'it': {},
}

files = open_files('generations')
for file in files:
    model = re.sub(r"^fine-tuned-|-decoding-gen\d+.csv", "", file)
    gen = re.search(r"gen\d+", file).group()

    model_generations = pd.read_csv('generations/' + file)
    # convert string to list
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)
    
    # if containt -en-
    if '-en-' in file:
        if model not in models_generations['en']:
            models_generations['en'][model] = {}
        models_generations['en'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: en")
    elif '-it-' in file:
        if model not in models_generations['it']:
            models_generations['it'][model] = {}
        models_generations['it'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: it")

model: Llama-3.1-8B-Instruct-en - gen: gen0, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen1, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen2, lang: en
model: Llama-3.1-8B-Instruct-it - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen2, lang: it
model: Mistral-Nemo-Instruct-2407-en - gen: gen0, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen1, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen2, lang: en
model: Mistral-Nemo-Instruct-2407-it - gen: gen0, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen1, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen2, lang: it
model: Qwen2.5-7B-Instruct-en - gen: gen0, lang: en
model: Qwen2.5-7B-Instruct-en - gen: gen1, lang: en
model: Qwen2.5-7B-Instruct-en - gen: gen2, lang: en
model: Qwen2.5-7B-Instruct-it - gen: gen0, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen1, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen2, lang: it


## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/chrf">chrF++</a>

In [7]:
chrf = load("chrf")

Using the latest cached version of the module from C:\Users\OliverioM\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--chrf\d244bab9383988714085a8dacc4871986d9f025398581c33d6b2ee22836b4069 (last modified on Wed Sep  4 18:15:30 2024) since it couldn't be found locally at evaluate-metric--chrf, or remotely on the Hugging Face Hub.


In [8]:
chrf_scores = {'en': {}, 'it': {}}

for lang in models_generations:
    for model in models_generations[lang]:
        model_chrf_scores = []
        print(f'Language: {lang}, Model: {model}')
        
        for gen in sorted(models_generations[lang][model]):  # Ensure order (gen0, gen1, gen2)
            references = models_generations[lang][model][gen]['actual']
            predictions = models_generations[lang][model][gen]['prediction']

            gen_chrf_scores = []
            for i in tqdm(range(len(references)), desc="Calcolo punteggi CHRF"):
                reference_scores = []
                reference = references[i]
                for j in range(len(reference)):
                    actual = reference[j]
                    result = chrf.compute(predictions=[predictions[i]], references=[actual], word_order=2)
                    reference_scores.append(result['score'])
                
                gen_chrf_scores.append(np.mean(reference_scores))
            
            model_chrf_scores.append(gen_chrf_scores)

        # Store the three chrf scores instead of their mean
        chrf_scores[lang][model] = model_chrf_scores
        print(f'chrf scores for {model} ({lang}): {model_chrf_scores}')
        print(f'Average: {np.mean(model_chrf_scores)}')
        print()

Language: en, Model: Llama-3.1-8B-Instruct-en


Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:36<00:00, 48.74it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:40<00:00, 43.48it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:42<00:00, 41.56it/s]


chrf scores for Llama-3.1-8B-Instruct-en (en): [[87.14281457395852, 76.97127850712057, 65.64137854504732, 37.22362031253879, 66.3814898221009, 51.553709935897274, 69.91912114426437, 70.95801779034475, 66.90170327610434, 57.732896927723374, 69.18482562978961, 59.059271193086296, 74.47169584819586, 71.96107966645202, 54.16540663341322, 69.93858481284474, 59.617572827327784, 56.29805894988825, 64.16782019957469, 74.940182014986, 73.51300385767352, 59.015853178340045, 60.62883676988654, 76.25899847238416, 69.82575309358954, 67.0908788383676, 56.624088632651116, 74.20026122932437, 61.130084091159254, 76.43950906069443, 80.26971127332739, 64.8969377327549, 59.26492880747364, 49.52241149703658, 66.3762052004711, 45.08319032074439, 48.540117766774536, 87.09524452891215, 60.656826072519074, 45.59149497383522, 56.124843690360315, 70.31843886227512, 62.30940301611458, 62.38260042909305, 76.25286981018344, 58.635946632754724, 69.82848943960089, 47.19285946401376, 63.936077386968485, 73.29139661187

Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:44<00:00, 40.20it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:45<00:00, 39.39it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:48<00:00, 36.48it/s]


chrf scores for Mistral-Nemo-Instruct-2407-en (en): [[87.14281457395852, 79.73732143399165, 68.68510788871913, 65.18747933069015, 75.09411153739872, 55.50706434271651, 73.67696336702197, 64.17855804289995, 60.4044836228598, 60.313546070135594, 72.49339052548181, 69.48531500648052, 74.47169584819586, 69.60337304303606, 51.82872153936399, 69.93858481284474, 62.215920222896635, 59.18067937180307, 64.16782019957469, 76.31549155173298, 69.76021727889956, 60.141570280007926, 50.09814482428276, 77.88269966040934, 69.3930594032209, 65.1283455603841, 56.736799148664964, 67.52566906113812, 57.136570483278206, 76.43950906069443, 80.26971127332739, 63.946402638056036, 54.583361752446216, 51.2540893840291, 65.95977840603804, 49.57048135735034, 58.28249665609233, 89.84106624972578, 60.656826072519074, 47.61217126223437, 52.76023685171458, 71.89186907275821, 70.68301827309556, 66.98267296826668, 76.25286981018344, 58.635946632754724, 70.43605713037692, 54.660986596582234, 65.40898677088673, 70.169542

Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:52<00:00, 33.92it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:59<00:00, 29.97it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:56<00:00, 31.68it/s]


chrf scores for Qwen2.5-7B-Instruct-en (en): [[88.65855144975302, 80.20089559833433, 64.05317677852618, 65.18747933069015, 73.94506623156015, 55.79633128130615, 61.3773924821453, 68.81427430908116, 60.21958092163529, 64.03668729497365, 68.90741269418105, 63.28809155435035, 83.4365519787077, 66.11525880290745, 55.62797485266284, 64.43079034032753, 64.00635243048846, 53.85724518397779, 62.065457419815175, 74.940182014986, 58.166681005271194, 63.02104089253985, 57.135921871120615, 57.63603428015009, 69.3930594032209, 66.80198306640703, 58.03736036028411, 57.79565938924142, 63.66785745872077, 78.79215936823492, 80.26971127332739, 62.59011592423633, 67.07346002963351, 52.00541726741815, 63.74920342538561, 43.49629094898983, 59.28570090219824, 89.84106624972578, 54.17560648441454, 55.25540037895109, 63.32709200496794, 70.24314851683799, 65.22216659621627, 60.4581671026915, 62.91111931349722, 68.77547604425027, 73.99112871163486, 55.9048212506922, 56.54511422628108, 71.30794556636708, 70.8562

Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [01:02<00:00, 28.41it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:57<00:00, 30.91it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [01:00<00:00, 29.42it/s]


chrf scores for Llama-3.1-8B-Instruct-it (it): [[67.66929718973815, 67.62948722357508, 68.95884946501913, 37.272691722813015, 80.37584123905536, 50.55437546044103, 59.45617638979045, 71.56916071668964, 58.75386334234681, 73.78629276453584, 71.99320494354089, 66.93441921383419, 91.29725792844361, 67.03043878374106, 50.871692598082525, 67.51777804560574, 56.55675460488872, 56.466962240259875, 65.51574525020887, 76.68428786041822, 56.09945005312113, 57.1941609188116, 71.87410516389782, 70.9501933023739, 65.74275774125702, 51.897388455692955, 60.00611045338755, 63.32992942813858, 64.35877426811638, 91.39719431757946, 46.17978390531792, 68.00327268974, 69.41034461581866, 51.74558105514598, 69.94853024845251, 42.28680423040603, 75.7050946431939, 94.21217526686412, 60.20550319875531, 40.061665091472186, 50.618801389362694, 74.77590415704213, 62.48079428055676, 57.95642349042898, 64.7350688056473, 83.19351227414022, 72.08485134330125, 59.48859710791095, 53.65539218610409, 73.31835832786614, 63

Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [01:01<00:00, 28.91it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:55<00:00, 31.96it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:47<00:00, 37.21it/s]


chrf scores for Mistral-Nemo-Instruct-2407-it (it): [[59.2665103181129, 78.97285262462599, 65.83029983633257, 37.272691722813015, 80.37584123905536, 50.93405388224574, 63.01400169669078, 59.64396188425036, 58.63410021894578, 73.81374750333455, 71.99320494354089, 73.2192559633544, 91.29725792844361, 80.81564068932931, 51.679561239006915, 67.51777804560574, 58.36269130401181, 45.96371862922731, 70.83246526700542, 78.36148197973138, 56.604347533534586, 53.723437133294794, 73.20490126507653, 75.54013037985236, 72.09104828052496, 67.02416997209338, 60.00611045338755, 45.4182088557244, 65.70294962635269, 91.39719431757946, 84.12214751717633, 70.53372015200019, 71.69600202792158, 57.34158284644081, 68.29995323718077, 48.34446894473927, 60.11699541157898, 94.21217526686412, 64.67126611301582, 52.260503556931475, 57.95997238999746, 72.53163031510861, 61.57687080167276, 62.397182525070605, 64.7350688056473, 83.19351227414022, 70.54586726207727, 71.09849209035393, 62.5972890669676, 73.25577438120

Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:38<00:00, 46.26it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:44<00:00, 40.38it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:44<00:00, 39.83it/s]

chrf scores for Qwen2.5-7B-Instruct-it (it): [[57.669114148909465, 61.09576607480836, 62.97115411990476, 50.22164657254099, 76.79271689299114, 47.15214068041828, 61.45987650812908, 67.16366533945362, 58.63410021894578, 73.51084210339086, 58.800158003842974, 60.050749853508, 91.29725792844361, 62.3111621642345, 57.117815841111586, 53.74620284305874, 64.42940384126048, 48.54043724005255, 65.51574525020887, 81.08806078496919, 63.234755662380245, 57.23983731179353, 71.87410516389782, 66.81412106052296, 67.50025771146717, 73.22136494498363, 62.417694137620224, 71.00731084136999, 52.818031430177456, 95.12483181546104, 41.71330492744724, 65.78921868253973, 70.80426735352957, 57.28117682238095, 67.60461939590697, 48.95822061807337, 47.788333535015404, 68.92576601267452, 56.03925510968307, 49.601676228909206, 47.30820911237532, 69.92035289586444, 50.57171100505962, 58.11662804196086, 84.54176903372793, 83.19351227414022, 71.45198402174127, 71.09849209035393, 56.68406833744436, 72.0087470170289,




In [9]:
# Save chrF++ scores
with open('chrf_scores-exp1.pkl', 'wb') as f:
    pickle.dump(chrf_scores, f)