In [1]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
from scipy.stats import ttest_rel
import pickle

# evaluation 
from evaluate import load

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
models_generations = {
    'it': {},
}

files = open_files('generations')
for file in files:
    model = re.sub(r"^fine-tuned-|-decoding-gen\d+.csv", "", file)
    gen = re.search(r"gen\d+", file).group()

    model_generations = pd.read_csv('generations/' + file)
    # convert string to list
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)
    
    # if containt -en-
    if '-it-' in file:
        if model not in models_generations['it']:
            models_generations['it'][model] = {}
        models_generations['it'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: it")

model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen0, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen1, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen2, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen2, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen0, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen1, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen2, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen0, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen1, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen2, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen0, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen1, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen2, lang: it


## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/bleu">Bleu</a>

In [4]:
bleu = load("bleu")

Using the latest cached version of the module from C:\Users\OliverioM\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--bleu\9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Wed Sep  4 17:21:14 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


In [5]:
bleu_scores = {'it': {}}

for lang in models_generations:
    for model in models_generations[lang]:
        model_bleu_scores = []
        print(f'Language: {lang}, Model: {model}')
        
        for gen in sorted(models_generations[lang][model]):  # Ensure order (gen0, gen1, gen2)
            references = models_generations[lang][model][gen]['actual']
            predictions = models_generations[lang][model][gen]['prediction']

            gen_bleu_scores = []
            for i in tqdm(range(len(references)), desc="Calcolo punteggi BLEU"):
                results = bleu.compute(predictions=[predictions[i]], references=[references[i]])
                gen_bleu_scores.append(results["bleu"])

            model_bleu_scores.append(gen_bleu_scores)  # Store BLEU for each generation
        
        # Store the three BLEU scores instead of their mean
        bleu_scores[lang][model] = model_bleu_scores
        print(f'BLEU scores for {model} ({lang}): {model_bleu_scores}')
        print(f'Average: {np.mean(model_bleu_scores)}')
        print()

Language: it, Model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it


Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:16<00:00, 106.83it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 114.54it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 111.43it/s]


BLEU scores for LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it (it): [[0.691441569283882, 0.4386913376508308, 0.6477832615546546, 0.0, 1.0, 0.4814233453445703, 0.3393116530148065, 0.7783744323988415, 0.48865475181405904, 0.7748677442328399, 0.7041168335414218, 0.8553583862778322, 1.0, 0.35720753476336553, 0.3188719246336876, 0.0, 0.40421542981370134, 0.2816609358824751, 0.8070557274927981, 0.7611606003349892, 0.5509785767132415, 0.2637873622593391, 0.576091046275884, 0.27560016787236363, 0.7509071535037981, 0.5950246949262603, 0.0, 1.0, 0.4151891241704209, 1.0, 0.0, 0.5909997718347788, 0.7161899903426547, 0.4006911765751864, 0.6971993357988346, 0.31491021038952444, 0.4832697830906221, 1.0, 0.7016879391277371, 0.5295361149563323, 0.0, 0.6443617124213459, 0.4847081271306397, 0.5534146990752098, 0.6703200460356393, 0.9009325445966684, 0.597721230644098, 0.4630777161991027, 0.0, 0.6825300868225764, 0.4072789825177894, 0.9234732618882052, 0.7447819789879647, 0.49039303364724773, 0.31491369418461007,

Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 116.10it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 115.35it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 117.11it/s]


BLEU scores for Llama-3.1-8B-Instruct-it (it): [[0.5491004867761125, 0.4600545023284529, 0.7938047857077988, 0.0, 1.0, 0.4076344059470743, 0.4261528449554083, 0.7536324264832722, 0.5784107053631243, 0.7748677442328399, 0.7243199659627639, 0.7067791893538207, 1.0, 0.3873569518606464, 0.4375613045812791, 0.5969491792019646, 0.40707225369512123, 0.3784481137591871, 0.5844356470407898, 0.7611606003349892, 0.5509785767132415, 0.33743632179678623, 0.576091046275884, 0.3297075509975986, 0.4608662469973654, 0.35072719288216747, 0.0, 0.0, 0.5246837953108363, 1.0, 0.0, 0.47087130600152305, 0.6676520339910388, 0.35424829149328463, 0.7504254072415915, 0.24867715063212503, 0.8070557274927981, 1.0, 0.7016879391277371, 0.2517269569783253, 0.0, 0.6966594911767172, 0.4214036984491675, 0.46264979608994905, 0.6703200460356393, 0.9009325445966684, 0.6654201286679353, 0.4630777161991027, 0.3072441646649103, 0.6901573050811814, 0.3161270060445738, 0.5623413251903491, 0.7447819789879647, 0.6546229854401499, 

Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 114.15it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 112.17it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 111.30it/s]


BLEU scores for Minerva-7B-instruct-v1.0-it (it): [[0.43361890903486755, 0.3033668865762665, 0.5506083283927117, 0.5946035575013605, 1.0, 0.23553192603376089, 0.22915295704820032, 0.8800612169700752, 0.6334323002440795, 0.7748677442328399, 0.44179790959844856, 0.6510803637373397, 0.0, 0.21053765276783554, 0.3179552183248196, 0.0, 0.5827259740336067, 0.40980949787910764, 0.8070557274927981, 0.5445178846139404, 0.6636154805687886, 0.6413190292883517, 0.26699675743024226, 0.24202875575621302, 0.3924259174695316, 0.5754889691772622, 0.7348889200874658, 0.488923022434901, 0.3325026294099889, 0.8666415730847504, 0.0, 0.33687759320176464, 0.5700405643085635, 0.3550813671406284, 0.816360100090376, 0.25978699792631493, 0.0, 0.5484498092204758, 0.7311104457090247, 0.325962298278404, 0.0, 0.6115107593524637, 0.42213340455572723, 0.37906376148607823, 0.7071067811865475, 0.8028236549339718, 0.544422796414192, 0.743344673640789, 0.0, 0.5688956740991937, 0.2263602347764041, 0.28646290158800986, 0.529

Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:14<00:00, 118.70it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 113.81it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:14<00:00, 120.47it/s]


BLEU scores for Mistral-Nemo-Instruct-2407-it (it): [[0.3816330911371337, 0.5129497107827519, 0.6459139467348547, 0.0, 1.0, 0.40066361284662694, 0.3642077792766302, 0.579354373763178, 0.5146906560381768, 0.6774689751374905, 0.7243199659627639, 0.6608372989865366, 1.0, 0.6730544160441616, 0.41238552613852936, 0.5969491792019646, 0.37323924780738715, 0.2691109110344471, 0.8070557274927981, 0.5974178044844201, 0.5642647028042946, 0.42563402761425845, 0.549696364469696, 0.7839067979033139, 0.7509071535037981, 0.4238181339038442, 0.0, 0.0, 0.4944796026651536, 1.0, 0.7016879391277371, 0.5151534384532293, 0.5867411578622623, 0.5040075403056274, 0.7493299113738129, 0.33609001613404255, 0.5452469119630863, 1.0, 0.7311104457090247, 0.43303913492362117, 0.25510012742866267, 0.7036493374404218, 0.45632474251342453, 0.45479124441660884, 0.6703200460356393, 0.9009325445966684, 0.5983229240828449, 1.0, 0.6053287453371726, 0.698366134932572, 0.4286942324014855, 0.9234732618882052, 0.8307018474412792, 

Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 115.93it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:15<00:00, 111.58it/s]
Calcolo punteggi BLEU: 100%|██████████| 1779/1779 [00:14<00:00, 119.45it/s]

BLEU scores for Qwen2.5-7B-Instruct-it (it): [[0.3448444257953326, 0.5367532631666345, 0.708587641194393, 0.5946035575013605, 0.630190855592386, 0.31167368916782756, 0.4509821189657739, 0.5235057729325346, 0.5146906560381768, 0.5873949094699213, 0.460342709344403, 0.4416051649800462, 1.0, 0.34053306396940136, 0.37126974646706873, 0.0, 0.5501824970004027, 0.3267534141240997, 0.5844356470407898, 0.7611606003349892, 0.5491004867761125, 0.3988210462407746, 0.576091046275884, 0.7881929718099911, 0.6413307450708073, 0.6333040538193904, 0.3784481137591871, 0.488923022434901, 0.20824096843600085, 1.0, 0.0, 0.37805079235713684, 0.7161899903426547, 0.3313738362696476, 0.6964925945603451, 0.38858224922670836, 0.42268392163412416, 0.5484498092204758, 0.0, 0.5309354663044072, 0.0, 0.5916707673134813, 0.25127004242367584, 0.45591791610313354, 1.0, 0.9009325445966684, 0.6720051516997396, 1.0, 0.32263864160302524, 0.7123550983177815, 0.30385576155210053, 0.0, 0.7447819789879647, 0.5566562786713232, 0.




In [6]:
# Save BLEU scores
with open('bleu_scores-exp3.pkl', 'wb') as f:
    pickle.dump(bleu_scores, f)