In [7]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
from scipy.stats import ttest_rel
import pickle

# evaluation 
from evaluate import load

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
models_generations = {
    'en': {},
    'it': {},
}

files = open_files('generations')
for file in files:
    model = re.sub(r"^fine-tuned-|-decoding-gen\d+.csv", "", file)
    gen = re.search(r"gen\d+", file).group()

    model_generations = pd.read_csv('generations/' + file)
    # convert string to list
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)
    
    # if containt -en-
    if '-en-' in file:
        if model not in models_generations['en']:
            models_generations['en'][model] = {}
        models_generations['en'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: en")
    elif '-it-' in file:
        if model not in models_generations['it']:
            models_generations['it'][model] = {}
        models_generations['it'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: it")

model: Llama-3.1-8B-Instruct-en - gen: gen0, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen1, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen2, lang: en
model: Llama-3.1-8B-Instruct-it - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen2, lang: it
model: Mistral-Nemo-Instruct-2407-en - gen: gen0, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen1, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen2, lang: en
model: Mistral-Nemo-Instruct-2407-it - gen: gen0, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen1, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen2, lang: it
model: Qwen2.5-7B-Instruct-en - gen: gen0, lang: en
model: Qwen2.5-7B-Instruct-en - gen: gen1, lang: en
model: Qwen2.5-7B-Instruct-en - gen: gen2, lang: en
model: Qwen2.5-7B-Instruct-it - gen: gen0, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen1, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen2, lang: it


## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/bleu">Bleu</a>

In [4]:
bleu = load("bleu")

Using the latest cached version of the module from C:\Users\OliverioM\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--bleu\9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Wed Sep  4 17:21:14 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


In [5]:
bleu_scores = {'en': {}, 'it': {}}

for lang in models_generations:
    for model in models_generations[lang]:
        model_bleu_scores = []
        print(f'Language: {lang}, Model: {model}')
        
        for gen in sorted(models_generations[lang][model]):  # Ensure order (gen0, gen1, gen2)
            references = models_generations[lang][model][gen]['actual']
            predictions = models_generations[lang][model][gen]['prediction']

            gen_bleu_scores = []
            for i in tqdm(range(len(references)), desc="Calcolo punteggi BLEU"):
                results = bleu.compute(predictions=[predictions[i]], references=[references[i]])
                gen_bleu_scores.append(results["bleu"])

            model_bleu_scores.append(gen_bleu_scores)  # Store BLEU for each generation
        
        # Store the three BLEU scores instead of their mean
        bleu_scores[lang][model] = model_bleu_scores
        print(f'BLEU scores for {model} ({lang}): {model_bleu_scores}')
        print(f'Average: {np.mean(model_bleu_scores)}')
        print()

Language: en, Model: Llama-3.1-8B-Instruct-en


Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:13<00:00, 136.84it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:14<00:00, 123.21it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 113.16it/s]


BLEU scores for Llama-3.1-8B-Instruct-en (en): [[1.0, 0.7344112539787184, 0.4903047069202663, 0.0, 0.5183282721440023, 0.4080359375751397, 0.4418102787384536, 0.6227485588070553, 0.637180485789211, 0.6027071865868198, 0.6291819397799013, 0.4444418079960644, 0.8668778997501817, 0.47200948741386867, 0.3175476568503661, 1.0, 0.5130024752447759, 0.2601278440403792, 0.7825422900366437, 0.5923033072023248, 1.0, 0.32407945953245476, 0.4457732753604864, 0.6504011927452344, 0.6891557807535084, 0.5971226353234798, 0.5008718428920987, 1.0, 0.5671276403520978, 0.7315254958027825, 0.7348889200874658, 0.46925651197632257, 0.4487293333733679, 0.37131566862918736, 0.593259812060215, 0.2821764980997448, 0.48834994094164574, 0.9234732618882052, 0.6865890479690393, 0.21483820122696787, 0.28253893006668057, 0.6068427282456142, 0.5599808085945802, 0.590188116664391, 1.0, 0.39278394849128806, 0.5991757076502909, 0.6319145618915731, 0.6204032394013997, 0.6569465907969158, 0.38246981732095403, 0.4469338322542

Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 112.60it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:16<00:00, 108.67it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:14<00:00, 124.00it/s]


BLEU scores for Mistral-Nemo-Instruct-2407-en (en): [[1.0, 0.7001575310229897, 0.6888365053466561, 1.0, 0.8938651488499076, 0.5344927803752876, 0.5391855437500148, 0.6229000917607898, 0.0, 0.40228686126014285, 0.7621991222319221, 1.0, 0.8668778997501817, 0.46741105375181624, 0.23207142194487027, 1.0, 0.5216948600244291, 0.5396717798766962, 0.7825422900366437, 0.7611606003349892, 0.7677331684336531, 0.360468900403241, 0.27935069213733144, 0.6848075777090852, 0.7172835948406505, 0.5247218664551436, 0.42718025135819776, 0.5946035575013605, 0.49292958425543404, 0.7315254958027825, 0.7348889200874658, 0.4600806381950465, 0.24968109263613555, 0.40643629337266673, 0.6368173523660625, 0.42235547655244177, 0.5397106933365019, 0.9036020036098449, 0.6865890479690393, 0.3299895472527792, 0.23758597346269164, 0.6569720574991903, 0.6136915527964187, 0.7271648621286583, 1.0, 0.39278394849128806, 0.5561415810451998, 0.6452027037854574, 0.5008718428920987, 0.7411540972423474, 0.374521031761603, 0.90483

Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:19<00:00, 89.86it/s] 
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:14<00:00, 121.10it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 116.70it/s]


BLEU scores for Qwen2.5-7B-Instruct-en (en): [[0.7348889200874658, 0.7473097602214631, 0.5378109430133184, 1.0, 0.6814839425935316, 0.39318423195723434, 0.4291887210820193, 0.8209512402593354, 0.0, 0.4934494673001857, 0.5740531638330801, 0.5607900239988007, 1.0, 0.32783631284396253, 0.34092789717196414, 0.37531192687516973, 0.515047708390275, 0.2460137257692754, 0.6865890479690392, 0.5923033072023248, 0.3220491428474935, 0.40075796796278235, 0.2556088041865078, 0.5367088830805397, 0.7172835948406505, 0.610550692688897, 0.6389431042462724, 0.3816330911371337, 0.6304575164812234, 0.6989307622784944, 0.7348889200874658, 0.5171569013307548, 0.6450001140844255, 0.28372511658292066, 0.5113708994733462, 0.25452894419055505, 0.5397106933365019, 0.9036020036098449, 0.392814650900513, 0.363353641549119, 0.26230609318615344, 0.6102039735107802, 0.6507307602149268, 0.38538569180303145, 0.0, 0.7354490792324542, 0.7606626379586936, 0.767279645960659, 0.30289764018096393, 0.7135989341462954, 0.445786

Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:16<00:00, 110.92it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:13<00:00, 130.43it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:13<00:00, 127.84it/s]


BLEU scores for Llama-3.1-8B-Instruct-it (it): [[0.5491004867761125, 0.4600545023284529, 0.7938047857077988, 0.0, 1.0, 0.4076344059470743, 0.4261528449554083, 0.7536324264832722, 0.5784107053631243, 0.7748677442328399, 0.7243199659627639, 0.7067791893538207, 1.0, 0.3873569518606464, 0.4375613045812791, 0.5969491792019646, 0.40707225369512123, 0.3784481137591871, 0.5844356470407898, 0.7611606003349892, 0.5509785767132415, 0.33743632179678623, 0.576091046275884, 0.3297075509975986, 0.4608662469973654, 0.35072719288216747, 0.0, 0.0, 0.5246837953108363, 1.0, 0.0, 0.47087130600152305, 0.6676520339910388, 0.35424829149328463, 0.7504254072415915, 0.24867715063212503, 0.8070557274927981, 1.0, 0.7016879391277371, 0.2517269569783253, 0.0, 0.6966594911767172, 0.4214036984491675, 0.46264979608994905, 0.6703200460356393, 0.9009325445966684, 0.6654201286679353, 0.4630777161991027, 0.3072441646649103, 0.6901573050811814, 0.3161270060445738, 0.5623413251903491, 0.7447819789879647, 0.6546229854401499, 

Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:13<00:00, 135.50it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:13<00:00, 130.94it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:14<00:00, 119.70it/s]


BLEU scores for Mistral-Nemo-Instruct-2407-it (it): [[0.3816330911371337, 0.5129497107827519, 0.6459139467348547, 0.0, 1.0, 0.40066361284662694, 0.3642077792766302, 0.579354373763178, 0.5146906560381768, 0.6774689751374905, 0.7243199659627639, 0.6608372989865366, 1.0, 0.6730544160441616, 0.41238552613852936, 0.5969491792019646, 0.37323924780738715, 0.2691109110344471, 0.8070557274927981, 0.5974178044844201, 0.5642647028042946, 0.42563402761425845, 0.549696364469696, 0.7839067979033139, 0.7509071535037981, 0.4238181339038442, 0.0, 0.0, 0.4944796026651536, 1.0, 0.7016879391277371, 0.5151534384532293, 0.5867411578622623, 0.5040075403056274, 0.7493299113738129, 0.33609001613404255, 0.5452469119630863, 1.0, 0.7311104457090247, 0.43303913492362117, 0.25510012742866267, 0.7036493374404218, 0.45632474251342453, 0.45479124441660884, 0.6703200460356393, 0.9009325445966684, 0.5983229240828449, 1.0, 0.6053287453371726, 0.698366134932572, 0.4286942324014855, 0.9234732618882052, 0.8307018474412792, 

Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:13<00:00, 127.97it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 117.53it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:13<00:00, 129.81it/s]


BLEU scores for Qwen2.5-7B-Instruct-it (it): [[0.3448444257953326, 0.5367532631666345, 0.708587641194393, 0.5946035575013605, 0.630190855592386, 0.31167368916782756, 0.4509821189657739, 0.5235057729325346, 0.5146906560381768, 0.5873949094699213, 0.460342709344403, 0.4416051649800462, 1.0, 0.34053306396940136, 0.37126974646706873, 0.0, 0.5501824970004027, 0.3267534141240997, 0.5844356470407898, 0.7611606003349892, 0.5491004867761125, 0.3988210462407746, 0.576091046275884, 0.7881929718099911, 0.6413307450708073, 0.6333040538193904, 0.3784481137591871, 0.488923022434901, 0.20824096843600085, 1.0, 0.0, 0.37805079235713684, 0.7161899903426547, 0.3313738362696476, 0.6964925945603451, 0.38858224922670836, 0.42268392163412416, 0.5484498092204758, 0.0, 0.5309354663044072, 0.0, 0.5916707673134813, 0.25127004242367584, 0.45591791610313354, 1.0, 0.9009325445966684, 0.6720051516997396, 1.0, 0.32263864160302524, 0.7123550983177815, 0.30385576155210053, 0.0, 0.7447819789879647, 0.5566562786713232, 0.

In [8]:
# Save BLEU scores
with open('bleu_scores-exp1.pkl', 'wb') as f:
    pickle.dump(bleu_scores, f)