In [5]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
from scipy.stats import ttest_rel

# evaluation 
from evaluate import load

In [6]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [7]:
models_generations = {
    'en': {},
    'it': {},
    'ru': {},
    'ge': {}
}

files = open_files('generations')
for file in files:
    model = re.sub(r"^fine-tuned-|-exp2-gen\d+.csv", "", file)
    gen = re.search(r"gen\d+", file).group()

    model_generations = pd.read_csv('generations/' + file)
    # convert string to list
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)
    
    # if containt -en-
    if '-en-' in file:
        if model not in models_generations['en']:
            models_generations['en'][model] = {}
        models_generations['en'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: en")
    elif '-it-' in file:
        if model not in models_generations['it']:
            models_generations['it'][model] = {}
        models_generations['it'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: it")
    elif '-ge-' in file:
        if model not in models_generations['ge']:
            models_generations['ge'][model] = {}
        models_generations['ge'][model][gen] = model_generations
        print(f"model: {model} - ge: {gen}, lang: ge")
    elif '-ru-' in file:
        if model not in models_generations['ru']:
            models_generations['ru'][model] = {}
        models_generations['ru'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: ru")

model: Llama-3.1-8B-Instruct-en - gen: gen0, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen1, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen2, lang: en
model: Llama-3.1-8B-Instruct-ge - ge: gen0, lang: ge
model: Llama-3.1-8B-Instruct-ge - ge: gen1, lang: ge
model: Llama-3.1-8B-Instruct-ge - ge: gen2, lang: ge
model: Llama-3.1-8B-Instruct-it - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen2, lang: it
model: Llama-3.1-8B-Instruct-ru - gen: gen0, lang: ru
model: Llama-3.1-8B-Instruct-ru - gen: gen1, lang: ru
model: Llama-3.1-8B-Instruct-ru - gen: gen2, lang: ru
model: Mistral-Nemo-Instruct-2407-en - gen: gen0, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen1, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen2, lang: en
model: Mistral-Nemo-Instruct-2407-ge - ge: gen0, lang: ge
model: Mistral-Nemo-Instruct-2407-ge - ge: gen1, lang: ge
model: Mistral-Nemo-Instruct-2407-ge - ge: gen2, lang: ge
mode

## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/chrf">chrF++</a>

In [8]:
chrf = load("chrf")

Using the latest cached version of the module from C:\Users\OliverioM\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--chrf\d244bab9383988714085a8dacc4871986d9f025398581c33d6b2ee22836b4069 (last modified on Wed Sep  4 18:15:30 2024) since it couldn't be found locally at evaluate-metric--chrf, or remotely on the Hugging Face Hub.


In [9]:
chrf_scores = {'en': {}, 'it': {}, 'ge': {}, 'ru': {}}

for lang in models_generations:
    for model in models_generations[lang]:
        model_chrf_scores = []
        print(f'Language: {lang}, Model: {model}')
        
        for gen in sorted(models_generations[lang][model]):  # Ensure order (gen0, gen1, gen2)
            references = models_generations[lang][model][gen]['actual']
            predictions = models_generations[lang][model][gen]['prediction']

            gen_chrf_scores = []
            for i in tqdm(range(len(references)), desc="Calcolo punteggi CHRF"):
                reference_scores = []
                reference = references[i]
                for j in range(len(reference)):
                    actual = reference[j]
                    result = chrf.compute(predictions=[predictions[i]], references=[actual], word_order=2)
                    reference_scores.append(result['score'])
                
                gen_chrf_scores.append(np.mean(reference_scores))
            
            model_chrf_scores.append(gen_chrf_scores)

        # Store the three chrf scores instead of their mean
        chrf_scores[lang][model] = model_chrf_scores
        print(f'chrf scores for {model} ({lang}): {model_chrf_scores}')
        print(f'Average: {np.mean(model_chrf_scores)}')
        print()

Language: en, Model: Llama-3.1-8B-Instruct-en


Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 45.33it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:06<00:00, 55.74it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:07<00:00, 48.28it/s]


chrf scores for Llama-3.1-8B-Instruct-en (en): [[87.17181709025928, 78.93906724984905, 36.02861577538228, 60.08120395338609, 50.060800844227785, 69.47318157856837, 59.465087470690875, 54.66564061488424, 88.37126587816316, 76.89061603620279, 67.5773304806614, 62.89789277026777, 86.00472185077967, 77.12689871891617, 77.71215551645194, 66.12298560474608, 74.12903610779017, 64.60444886017625, 67.80152117726146, 84.79206641167995, 66.44696180942172, 74.25903036988984, 59.02034335031275, 86.10156813500487, 83.87423052011248, 70.953839607417, 83.27156861678628, 59.05698801844636, 71.86643013289985, 48.81388195570608, 76.79048235864246, 79.04694982064677, 80.19801630138143, 56.721757636055315, 69.87441324186756, 73.70975369339567, 66.98911873551886, 67.78768784068583, 95.37703512463582, 80.2281348281053, 65.9098715357725, 73.32762767074003, 67.52057170525717, 63.36980939735961, 77.20184524543548, 70.40289366581709, 80.61808151136682, 75.25040647803462, 74.95570456152299, 73.4908871202142, 86.7

Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 42.59it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 47.28it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:09<00:00, 39.11it/s]


chrf scores for Mistral-Nemo-Instruct-2407-en (en): [[87.17181709025928, 78.93906724984905, 36.02861577538228, 61.41300393636724, 49.076174534382204, 72.42871118802584, 58.718437582861156, 57.82209080952136, 88.37126587816316, 76.89061603620279, 66.84439454162246, 67.77524843732421, 86.00472185077967, 56.60728390379876, 77.71215551645194, 66.12298560474608, 74.12903610779017, 64.60444886017625, 52.880232024903044, 83.39053033875025, 61.69563981022037, 75.41056430567656, 67.96588698801779, 87.21847037644379, 75.16186787094922, 70.953839607417, 83.27156861678628, 64.0766698228926, 68.06033885876751, 48.81388195570608, 60.04149705540827, 76.54154608185866, 80.13127735348887, 56.04278337987936, 69.87441324186756, 70.88450132283306, 67.09425722747162, 79.70431231526958, 91.78865289137383, 75.0578142341912, 66.1206458012178, 73.32762767074003, 63.682275969409545, 57.04653356722225, 73.29212504941073, 72.06503477243666, 80.61808151136682, 80.98627048608215, 74.95570456152299, 73.4908871202142

Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 42.92it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 43.06it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:06<00:00, 60.92it/s]


chrf scores for Qwen2.5-7B-Instruct-en (en): [[87.17181709025928, 78.93906724984905, 49.65634220208317, 60.58456746424455, 53.05251886446963, 68.05097966368116, 58.5436161418311, 46.754656558545825, 88.37126587816316, 76.89061603620279, 69.86690398361517, 59.68128465210027, 86.00472185077967, 70.7802692728674, 77.71215551645194, 70.37199170713774, 72.22507385274251, 58.84046885734847, 47.75792182297898, 84.79206641167995, 62.97422817887394, 76.37974842150408, 69.35331705916938, 87.21847037644379, 68.43594871273116, 90.3152964925444, 83.27156861678628, 64.0766698228926, 70.02576803016643, 56.658712977046555, 78.6790872810713, 80.71919644818874, 79.45964610929492, 55.67719655461162, 67.89798159388513, 63.17978536168624, 60.290321027643664, 76.66813672475598, 93.82587401235745, 66.33084135623314, 68.04901781572117, 71.81779520683426, 72.17006429444636, 54.9641089199956, 77.20184524543548, 73.41898906070648, 80.61808151136682, 75.25040647803462, 74.95570456152299, 73.4908871202142, 86.7078

Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:06<00:00, 59.02it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:09<00:00, 40.03it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 43.07it/s]


chrf scores for Llama-3.1-8B-Instruct-it (it): [[91.37353617441485, 95.17078801725748, 40.23090664781859, 63.588783990363886, 53.694729602113384, 64.16406949207476, 59.23468224601969, 57.131650250629264, 92.8949234139254, 75.0629161515184, 61.26680450716873, 64.82672912192227, 95.18232594034023, 68.15332795278653, 98.14706961897507, 52.6004979266717, 77.04018566134182, 54.349216547346025, 58.028518062802156, 88.83697524452106, 65.02018596846251, 72.73276776476278, 67.5259017724, 75.70650629613513, 74.45353010666727, 100.0, 97.39823240299577, 52.796348858581496, 74.81789319059425, 45.05822771987704, 86.80076745123338, 74.3333320948934, 78.29057890970573, 58.464843332841234, 83.60711400836782, 72.61368493057947, 67.7575605458913, 68.8150756612718, 93.34354559271316, 76.71959341215944, 69.11208604308202, 59.48914486059514, 64.13013329402821, 48.879327792361494, 77.87645172411668, 78.90249196492871, 62.332354687471586, 77.42997774831916, 64.86230344011004, 82.01295833760206, 83.62707954810

Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:09<00:00, 42.21it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:09<00:00, 42.52it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 46.50it/s]


chrf scores for Mistral-Nemo-Instruct-2407-it (it): [[91.37353617441485, 95.17078801725748, 50.209891713094066, 60.303178410358235, 52.21504723781728, 64.15433421682066, 60.37421519694039, 65.8677778061455, 86.12438228677422, 72.70286012873653, 51.28760545498423, 62.97664285058363, 95.18232594034023, 84.81748497953073, 98.14706961897507, 52.6004979266717, 77.04018566134182, 54.349216547346025, 60.68817047869845, 90.21422670317334, 69.83897224138515, 78.56592800180398, 58.63690827663401, 82.71789255464577, 73.21095891080223, 100.0, 97.39823240299577, 49.52061878415691, 63.35442721364435, 45.05822771987704, 87.6286569452319, 74.042142843342, 80.22241054701105, 49.96404452019565, 75.95260302429024, 72.61368493057947, 67.32387973352382, 70.79606761291274, 95.03282052594153, 69.7864015984133, 69.55412430994615, 61.663872041659, 63.16496406434127, 47.46144114154476, 77.87645172411668, 79.55328496814515, 92.15446082207497, 58.70757113953746, 64.86230344011004, 76.2155710560465, 83.62707954810

Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 42.66it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 42.61it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 46.47it/s]


chrf scores for Qwen2.5-7B-Instruct-it (it): [[64.88260195572462, 76.45973334459707, 48.330671459981275, 54.79234557012146, 49.286714582772035, 64.31847616370887, 61.44246800074386, 55.976327224085196, 92.8949234139254, 72.70286012873653, 53.91374628365598, 58.018474383855995, 95.18232594034023, 79.16698800581689, 98.14706961897507, 47.826242761817156, 62.23321591967569, 57.71041439183413, 48.799743164536714, 88.83697524452106, 65.99537858451153, 71.72728915907699, 71.6886500500994, 76.2196812966325, 68.97942802266321, 94.11925068640586, 92.05822222413566, 51.66077134494295, 66.12391511244546, 45.05822771987704, 73.73821637632258, 74.49769773797385, 77.66711565447065, 54.209002187857095, 74.18357174816249, 51.90326067452561, 66.3759418959714, 50.81318823844837, 92.10928299574562, 71.34416273490575, 69.55412430994615, 59.48914486059514, 64.08577840069867, 47.17686869199292, 80.74527775544003, 78.29554731412051, 63.44201366680031, 77.42997774831916, 58.95285171625514, 78.9591940388969, 8

Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 45.60it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 45.70it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 44.69it/s]


chrf scores for Llama-3.1-8B-Instruct-ru (ru): [[77.08266729328675, 91.20786483914902, 47.46292675948204, 60.67376555643292, 47.85751674210309, 58.799887183946, 48.321826865009285, 27.77162390715141, 82.43546402010625, 67.4238503032664, 57.85614558086533, 67.56282164660568, 100.0, 67.77012327797244, 89.84955003932633, 62.33218856812027, 41.53888701449431, 43.62272149396602, 36.59633118039272, 97.66489544744898, 54.65431119356859, 71.54784444536705, 58.484824366887295, 82.09574335033227, 55.320784035792144, 71.90999831255328, 77.84118399329415, 43.06735902646579, 60.13876291140977, 44.439862770850056, 79.66706637239602, 63.21256912115869, 66.36287063113086, 52.5920961025105, 57.2511902612789, 45.385965066959, 56.53065658370222, 32.51440190406854, 94.5228748919539, 73.35447003626608, 52.61416351759405, 78.65664787421491, 55.839466730365814, 41.52589095430062, 65.7407351858566, 68.45754296122868, 57.47014863261759, 79.46906913019183, 61.80839093345127, 72.16598372624641, 81.13290210556035

Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:09<00:00, 42.55it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 42.57it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:11<00:00, 34.49it/s]


chrf scores for Mistral-Nemo-Instruct-2407-ru (ru): [[66.33221342153149, 90.50855114472239, 47.19401935451871, 66.95668604451988, 60.72563215473352, 52.473648092231706, 42.77173816914657, 38.06355710655025, 82.43546402010625, 65.25017641243416, 61.32260643453576, 61.5846930278123, 100.0, 61.01715700376743, 89.84955003932633, 34.10349385056199, 64.98406764120935, 58.172878101939695, 39.26937577553789, 97.66489544744898, 55.792129179702094, 71.75974881835047, 46.58365876385728, 78.08048212944705, 60.62446938163735, 75.32110812644095, 77.84118399329415, 33.08966112877047, 58.81400339042765, 63.20330938407032, 79.66706637239602, 65.09592427435862, 46.099570808986726, 56.02756807197944, 57.2511902612789, 54.40775345969996, 51.10083205740559, 49.22175859485659, 94.5228748919539, 73.35447003626608, 46.32807818133921, 80.08791873205519, 60.583959881644205, 46.507720041976505, 70.42345209811002, 75.93208922616677, 72.88169920357097, 68.11156626972429, 61.40238887607975, 69.02467619638286, 81.13

Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:09<00:00, 40.13it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:10<00:00, 36.68it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:07<00:00, 47.91it/s]


chrf scores for Qwen2.5-7B-Instruct-ru (ru): [[72.36582117267784, 91.20786483914902, 42.656356920843436, 59.62992575053004, 54.172821730239264, 67.1059022985488, 50.137104035887326, 35.83174374390254, 82.43546402010625, 66.54237821769355, 61.04577223484002, 63.98044640123002, 100.0, 33.032542178793484, 89.84955003932633, 42.98837398004418, 31.2048916006819, 40.49215674042793, 31.004478521467394, 97.66489544744898, 54.4574323993347, 54.75188138350632, 62.07379885318952, 78.08048212944705, 67.6060012537481, 75.32110812644095, 77.84118399329415, 51.69628663516725, 60.49796709013225, 36.90709260949804, 79.66706637239602, 62.02473329050298, 70.06193814955049, 60.03568427547075, 66.99224716568055, 34.179424862986515, 60.07420219926718, 71.26640382562876, 77.15021082238593, 67.63940503329951, 56.10881688385394, 80.08791873205519, 58.67207564081965, 39.30421088071969, 67.79011760342439, 67.92441366093763, 72.88169920357097, 39.87816266232638, 61.40238887607975, 69.02467619638286, 81.1329021055

Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:07<00:00, 48.08it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:07<00:00, 51.60it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:07<00:00, 54.30it/s]


chrf scores for Llama-3.1-8B-Instruct-ge (ge): [[87.9246067447163, 88.6545944358612, 37.417414348695274, 54.051645520282854, 40.5155226303797, 64.15921147219392, 61.131469393048675, 52.5149725744244, 48.57190703677453, 82.54351295769524, 57.90245213761702, 67.0107531401945, 89.643746666902, 81.69056932382743, 64.77740910315778, 27.3200482620299, 36.63105352234357, 72.48654045790052, 65.74015902817095, 78.16863709193937, 66.34509985630432, 75.72838180692456, 48.07778029020009, 69.25285044561909, 71.33923319955638, 82.93251531012726, 76.73613181339508, 41.74090479446002, 66.4677224289715, 45.42898098990588, 57.18225084914874, 74.4952998491126, 77.74559075046191, 57.31135561444029, 55.73612655464569, 64.51512086190051, 52.92769045501708, 73.70236228847754, 100.0, 75.49324360127562, 51.919419716563475, 76.84462161171494, 49.44662800401769, 40.94491286238631, 73.4533078176732, 70.98907120857575, 71.45708491815289, 80.09178643791296, 76.32795848054126, 66.3890007956863, 85.4381187501099, 68.

Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 47.64it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:08<00:00, 46.39it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:09<00:00, 41.61it/s]


chrf scores for Mistral-Nemo-Instruct-2407-ge (ge): [[87.9246067447163, 88.6545944358612, 33.58604218006884, 54.051645520282854, 45.120904411422245, 61.67477022403838, 61.47433132199469, 42.0221549082136, 46.728046154468174, 84.42765026541316, 54.89750008781506, 64.87993817793071, 75.14119793797914, 49.86867522967188, 64.77740910315778, 27.3200482620299, 36.63105352234357, 59.27297313491564, 68.90224727084144, 82.45050600621092, 62.692001293405035, 79.63569695739726, 44.58436526378904, 77.09310912267154, 63.233332486138686, 65.10154873147896, 76.73613181339508, 45.807315398436764, 64.12887678913303, 49.98644468706207, 57.18225084914874, 72.43270311477627, 77.81312262925945, 54.62335891039544, 65.53725127276094, 57.52876174854072, 55.563564123519086, 67.07868503737127, 100.0, 75.50550692736144, 51.63930220379004, 71.44560483459848, 58.22871752560299, 50.17956013222417, 70.81014181486488, 57.53596463492189, 67.56094406166928, 48.64973190572536, 76.32795848054126, 75.74485632685195, 85.43

Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:10<00:00, 35.08it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:09<00:00, 39.44it/s]
Calcolo punteggi CHRF: 100%|██████████| 383/383 [00:07<00:00, 50.19it/s]

chrf scores for Qwen2.5-7B-Instruct-ge (ge): [[87.9246067447163, 94.33553775579813, 37.03080000020097, 54.882463454143625, 48.11331795572613, 57.331692853829644, 53.42708505598988, 43.18805043521502, 48.57190703677453, 78.9900415137656, 55.93678537422488, 64.85233153901841, 92.71817503317129, 61.4072876275497, 45.52376799374142, 24.565232841986614, 37.32089125092825, 72.44105222606844, 54.00870026944364, 82.45050600621092, 58.79143881495069, 67.51633968666538, 47.056512942237084, 57.35941870560414, 51.44277911537788, 82.93251531012726, 76.73613181339508, 38.86803573996972, 61.29807212779094, 49.98644468706207, 46.67304753870799, 75.1722755768692, 77.642869576872, 53.29106503294245, 69.87074798231951, 49.0667518040181, 44.902213308611806, 62.623323341223845, 100.0, 80.25609523060079, 53.876887302848296, 76.38000352813087, 56.681003251641044, 41.467058161117556, 73.4533078176732, 57.973075868515416, 67.56094406166928, 49.38855690737707, 76.32795848054126, 64.50565239952492, 46.1897509014




In [11]:
import pickle

# Save chrF++ scores
with open('chrf_scores-exp2.pkl', 'wb') as f:
    pickle.dump(chrf_scores, f)