In [10]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
from scipy.stats import ttest_rel
import pickle

# evaluation 
from evaluate import load

In [11]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [12]:
models_generations = {
    'it': {},
}

files = open_files('generations')
for file in files:
    model = re.sub(r"^fine-tuned-|-decoding-gen\d+.csv", "", file)
    gen = re.search(r"gen\d+", file).group()

    model_generations = pd.read_csv('generations/' + file)
    # convert string to list
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)
    
    # if containt -en-
    if '-it-' in file:
        if model not in models_generations['it']:
            models_generations['it'][model] = {}
        models_generations['it'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: it")

model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen0, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen1, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen2, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen2, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen0, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen1, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen2, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen0, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen1, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen2, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen0, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen1, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen2, lang: it


## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/chrf">chrF++</a>

In [13]:
chrf = load("chrf")

Using the latest cached version of the module from C:\Users\OliverioM\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--chrf\d244bab9383988714085a8dacc4871986d9f025398581c33d6b2ee22836b4069 (last modified on Wed Sep  4 18:15:30 2024) since it couldn't be found locally at evaluate-metric--chrf, or remotely on the Hugging Face Hub.


In [14]:
chrf_scores = {'it': {}}

for lang in models_generations:
    for model in models_generations[lang]:
        model_chrf_scores = []
        print(f'Language: {lang}, Model: {model}')
        
        for gen in sorted(models_generations[lang][model]):  # Ensure order (gen0, gen1, gen2)
            references = models_generations[lang][model][gen]['actual']
            predictions = models_generations[lang][model][gen]['prediction']

            gen_chrf_scores = []
            for i in tqdm(range(len(references)), desc="Calcolo punteggi CHRF"):
                reference_scores = []
                reference = references[i]
                for j in range(len(reference)):
                    actual = reference[j]
                    result = chrf.compute(predictions=[predictions[i]], references=[actual], word_order=2)
                    reference_scores.append(result['score'])
                
                gen_chrf_scores.append(np.mean(reference_scores))
            
            model_chrf_scores.append(gen_chrf_scores)

        # Store the three chrf scores instead of their mean
        chrf_scores[lang][model] = model_chrf_scores
        print(f'chrf scores for {model} ({lang}): {model_chrf_scores}')
        print(f'Average: {np.mean(model_chrf_scores)}')
        print()

Language: it, Model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it


Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:41<00:00, 42.62it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:43<00:00, 40.55it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:43<00:00, 40.76it/s]


chrf scores for LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it (it): [[77.07756815151951, 67.92305210937597, 67.51055866738012, 37.272691722813015, 80.37584123905536, 58.899331959469514, 57.183875505675466, 72.49343033139611, 56.93797281541595, 73.78629276453584, 77.14337763012337, 66.94585512862157, 91.29725792844361, 62.353166588001095, 51.90402664057462, 56.808934806806995, 53.6640819010022, 48.45879293189389, 70.83246526700542, 76.68428786041822, 56.09945005312113, 46.896426114502084, 71.87410516389782, 62.73827333623972, 72.09104828052496, 70.62486302940796, 59.420370803397056, 75.02314087179622, 56.26279759731916, 91.39719431757946, 53.91320815147207, 71.42557360283958, 70.80426735352957, 51.63391208205897, 64.54435657499253, 46.93310650367702, 51.97021451741103, 94.21217526686412, 60.20550319875531, 44.356941691306154, 47.8514281422167, 71.24165403535208, 58.57749203594421, 58.65278373122897, 64.7350688056473, 83.19351227414022, 70.24399682579003, 59.48859710791095, 50.75580489764428, 73

Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:45<00:00, 39.41it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:45<00:00, 38.89it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:45<00:00, 38.88it/s]


chrf scores for Llama-3.1-8B-Instruct-it (it): [[67.66929718973815, 67.62948722357508, 68.95884946501913, 37.272691722813015, 80.37584123905536, 50.55437546044103, 59.45617638979045, 71.56916071668964, 58.75386334234681, 73.78629276453584, 71.99320494354089, 66.93441921383419, 91.29725792844361, 67.03043878374106, 50.871692598082525, 67.51777804560574, 56.55675460488872, 56.466962240259875, 65.51574525020887, 76.68428786041822, 56.09945005312113, 57.1941609188116, 71.87410516389782, 70.9501933023739, 65.74275774125702, 51.897388455692955, 60.00611045338755, 63.32992942813858, 64.35877426811638, 91.39719431757946, 46.17978390531792, 68.00327268974, 69.41034461581866, 51.74558105514598, 69.94853024845251, 42.28680423040603, 75.7050946431939, 94.21217526686412, 60.20550319875531, 40.061665091472186, 50.618801389362694, 74.77590415704213, 62.48079428055676, 57.95642349042898, 64.7350688056473, 83.19351227414022, 72.08485134330125, 59.48859710791095, 53.65539218610409, 73.31835832786614, 63

Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:44<00:00, 39.89it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:48<00:00, 37.05it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:45<00:00, 39.28it/s]


chrf scores for Minerva-7B-instruct-v1.0-it (it): [[50.880823007668916, 60.11131710306042, 59.389941698508494, 49.07080334667665, 80.37584123905536, 50.03628992769566, 61.20796031469169, 69.58525007458472, 61.619227316779664, 73.78629276453584, 59.26752309852927, 55.64707932705977, 52.181514891449524, 56.42660513138147, 54.156043235448486, 62.419637303241835, 64.38339190955888, 64.5844505555812, 68.3491353743708, 75.85414434151052, 63.46637317049842, 64.58651232926076, 53.42705609597622, 62.81731263394494, 65.78309702310936, 67.16752760769656, 69.78834514092263, 66.62922853007876, 55.27614992295597, 84.68961231363103, 42.2354565982441, 55.16536298409892, 68.35459948508014, 51.17655272160155, 68.13170619397535, 40.807370483444664, 48.37982045917777, 76.9785026984423, 64.67126611301582, 52.44136285993665, 56.152161322465595, 69.61861749218959, 61.57211481955209, 62.84698875007547, 69.74323133262226, 81.81533493604839, 69.46568064908917, 68.4263955203553, 51.12740446966916, 69.57777630932

Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:39<00:00, 44.53it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:35<00:00, 50.17it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:35<00:00, 49.61it/s]


chrf scores for Mistral-Nemo-Instruct-2407-it (it): [[59.2665103181129, 78.97285262462599, 65.83029983633257, 37.272691722813015, 80.37584123905536, 50.93405388224574, 63.01400169669078, 59.64396188425036, 58.63410021894578, 73.81374750333455, 71.99320494354089, 73.2192559633544, 91.29725792844361, 80.81564068932931, 51.679561239006915, 67.51777804560574, 58.36269130401181, 45.96371862922731, 70.83246526700542, 78.36148197973138, 56.604347533534586, 53.723437133294794, 73.20490126507653, 75.54013037985236, 72.09104828052496, 67.02416997209338, 60.00611045338755, 45.4182088557244, 65.70294962635269, 91.39719431757946, 84.12214751717633, 70.53372015200019, 71.69600202792158, 57.34158284644081, 68.29995323718077, 48.34446894473927, 60.11699541157898, 94.21217526686412, 64.67126611301582, 52.260503556931475, 57.95997238999746, 72.53163031510861, 61.57687080167276, 62.397182525070605, 64.7350688056473, 83.19351227414022, 70.54586726207727, 71.09849209035393, 62.5972890669676, 73.25577438120

Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:39<00:00, 44.54it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:43<00:00, 40.64it/s]
Calcolo punteggi CHRF: 100%|██████████| 1779/1779 [00:40<00:00, 43.92it/s]

chrf scores for Qwen2.5-7B-Instruct-it (it): [[57.669114148909465, 61.09576607480836, 62.97115411990476, 50.22164657254099, 76.79271689299114, 47.15214068041828, 61.45987650812908, 67.16366533945362, 58.63410021894578, 73.51084210339086, 58.800158003842974, 60.050749853508, 91.29725792844361, 62.3111621642345, 57.117815841111586, 53.74620284305874, 64.42940384126048, 48.54043724005255, 65.51574525020887, 81.08806078496919, 63.234755662380245, 57.23983731179353, 71.87410516389782, 66.81412106052296, 67.50025771146717, 73.22136494498363, 62.417694137620224, 71.00731084136999, 52.818031430177456, 95.12483181546104, 41.71330492744724, 65.78921868253973, 70.80426735352957, 57.28117682238095, 67.60461939590697, 48.95822061807337, 47.788333535015404, 68.92576601267452, 56.03925510968307, 49.601676228909206, 47.30820911237532, 69.92035289586444, 50.57171100505962, 58.11662804196086, 84.54176903372793, 83.19351227414022, 71.45198402174127, 71.09849209035393, 56.68406833744436, 72.0087470170289,




In [15]:
# Save chrF++ scores
with open('chrf_scores-exp3.pkl', 'wb') as f:
    pickle.dump(chrf_scores, f)