In [2]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
import pickle

# evaluation 
from evaluate import load

In [3]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [4]:
models_generations = {
    'en': {},
    'it': {},
}

files = open_files('generations')
for file in files:
    model = re.sub(r"^fine-tuned-|-decoding-gen\d+.csv", "", file)
    gen = re.search(r"gen\d+", file).group()

    model_generations = pd.read_csv('generations/' + file)
    # convert string to list
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)
    
    # if containt -en-
    if '-en-' in file:
        if model not in models_generations['en']:
            models_generations['en'][model] = {}
        models_generations['en'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: en")
    elif '-it-' in file:
        if model not in models_generations['it']:
            models_generations['it'][model] = {}
        models_generations['it'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: it")

model: Llama-3.1-8B-Instruct-en - gen: gen0, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen1, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen2, lang: en
model: Llama-3.1-8B-Instruct-it - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen2, lang: it
model: Mistral-Nemo-Instruct-2407-en - gen: gen0, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen1, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen2, lang: en
model: Mistral-Nemo-Instruct-2407-it - gen: gen0, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen1, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen2, lang: it
model: Qwen2.5-7B-Instruct-en - gen: gen0, lang: en
model: Qwen2.5-7B-Instruct-en - gen: gen1, lang: en
model: Qwen2.5-7B-Instruct-en - gen: gen2, lang: en
model: Qwen2.5-7B-Instruct-it - gen: gen0, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen1, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen2, lang: it


## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/meteor">METEOR</a>

In [5]:
meteor = load('meteor')

Using the latest cached version of the module from C:\Users\OliverioM\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--meteor\e7ed321a1b44c34fa4679192809db2cee7e3bd4bba0fe8b76061d807706c2374 (last modified on Thu Oct 10 16:06:59 2024) since it couldn't be found locally at evaluate-metric--meteor, or remotely on the Hugging Face Hub.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OliverioM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\OliverioM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\OliverioM\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
meteor_scores = {'en': {}, 'it': {}}

for lang in models_generations:
    for model in models_generations[lang]:
        model_meteor_scores = []
        print(f'Language: {lang}, Model: {model}')
        
        for gen in sorted(models_generations[lang][model]):  # Ensure order (gen0, gen1, gen2)
            references = models_generations[lang][model][gen]['actual']
            predictions = models_generations[lang][model][gen]['prediction']

            gen_meteor_scores = []
            for i in tqdm(range(len(references)), desc="Calcolo punteggi METEOR"):
                results = meteor.compute(predictions=[predictions[i]], references=[references[i]])
                gen_meteor_scores.append(results["meteor"])

            model_meteor_scores.append(gen_meteor_scores)  # Store BLEU for each generation
        
        # Store the three METEOR scores instead of their mean
        meteor_scores[lang][model] = model_meteor_scores
        print(f'BLEU scores for {model} ({lang}): {model_meteor_scores}')
        print(f'Average: {np.mean(model_meteor_scores)}')
        print()

Language: en, Model: Llama-3.1-8B-Instruct-en


Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:25<00:00, 69.22it/s] 
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:25<00:00, 69.46it/s]
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:26<00:00, 67.96it/s]


BLEU scores for Llama-3.1-8B-Instruct-en (en): [[0.9993141289437586, 0.8361026414060091, 0.825414364640884, 0.5575, 0.7687074829931974, 0.5701627296587927, 0.7500000000000001, 0.8557101449275363, 0.7196994880659531, 0.7398101619207148, 0.7585420531849103, 0.7009680134680135, 0.8757427021441488, 0.7638811079987552, 0.5942356617443726, 0.9993141289437586, 0.6696913761277968, 0.6496644295302012, 0.896818181818182, 0.7601351351351352, 0.9997724169321802, 0.672514619883041, 0.8186789287267755, 0.8928571428571429, 0.8810389727011493, 0.7263761879146495, 0.9101941747572816, 0.9990234375, 0.680640799708313, 0.8653846153846154, 0.9139118457300276, 0.6079985917270607, 0.6035275974236357, 0.5962228834436107, 0.6807402985074626, 0.4947710032691149, 0.6442577030812325, 0.9280792420327304, 0.7707031250000002, 0.6397867611894112, 0.6701150089579841, 0.6984046962544019, 0.6170498201109824, 0.6525355871886123, 0.9985422740524781, 0.6915960021979002, 0.6436626857672353, 0.5306603773584906, 0.84362139917

Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:28<00:00, 62.28it/s]
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:26<00:00, 66.11it/s]
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:26<00:00, 66.67it/s]


BLEU scores for Mistral-Nemo-Instruct-2407-en (en): [[0.9993141289437586, 0.8319839091823342, 0.825414364640884, 0.9990234375, 0.8736842105263158, 0.5395845334101382, 0.7301575291133404, 0.6907996176144073, 0.6237762237762239, 0.8353960396039604, 0.7949161894252174, 0.9998518518518519, 0.8757427021441488, 0.7582432631030475, 0.573171199754369, 0.9993141289437586, 0.7492051168293404, 0.6829350011168193, 0.896818181818182, 0.7957957957957958, 0.9280792420327304, 0.7445423358802432, 0.5118499012508231, 0.8714352800546448, 0.8393063583815028, 0.7320350848957914, 0.9307692307692308, 0.864795918367347, 0.7690713101160863, 0.8653846153846154, 0.9139118457300276, 0.609168273386078, 0.6893327067669173, 0.6492989417989419, 0.6756756756756757, 0.5669606114050558, 0.9007523148148148, 0.9997724169321802, 0.7707031250000002, 0.6320784869582136, 0.6608720433171844, 0.7230009398073114, 0.7886640634244468, 0.7808346867584146, 0.9985422740524781, 0.6915960021979002, 0.6694059236432118, 0.777344645723458

Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:27<00:00, 64.13it/s]
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:29<00:00, 60.45it/s]
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:31<00:00, 57.07it/s]


BLEU scores for Qwen2.5-7B-Instruct-en (en): [[0.9624319660161953, 0.8965341199160473, 0.7471655328798186, 0.9990234375, 0.8905157630919404, 0.5269186712485681, 0.6429195194798523, 0.8756712102437009, 0.6380670611439841, 0.8110389062368133, 0.8006540508354731, 0.7990138067061144, 0.9990234375, 0.6942093740374076, 0.6006566664520698, 0.824175824175824, 0.735876959479444, 0.5986577181208053, 0.8887387387387389, 0.7601351351351352, 0.7622047244094489, 0.8011594917972124, 0.6843771525459651, 0.6003623188405797, 0.8393063583815028, 0.6501336715933814, 0.892, 0.6896975657732971, 0.7701327752922499, 0.9059829059829061, 0.9139118457300276, 0.7044393536451409, 0.7745835602759622, 0.5917857142857144, 0.7111583344019639, 0.5106685368731015, 0.9007523148148148, 0.9997724169321802, 0.6775541946145621, 0.7045983379501386, 0.8344205570367008, 0.6714103265827404, 0.7149799982020856, 0.7110253387167952, 0.6281690140845071, 0.7993412297045515, 0.7524314822281528, 0.8918539325842695, 0.8067053982086663, 

Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:27<00:00, 65.68it/s]
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:27<00:00, 64.32it/s]
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:27<00:00, 63.71it/s]


BLEU scores for Llama-3.1-8B-Instruct-it (it): [[0.866013071895425, 0.8150145871497387, 0.7864900346103353, 0.4445422535211268, 0.9997724169321802, 0.4697875845066774, 0.6660470478509806, 0.9120879120879121, 0.7178859447004607, 0.9338235294117647, 0.770858934169279, 0.8179012345679012, 0.9990234375, 0.675461570795745, 0.6160564361977379, 0.8819444444444444, 0.5635887028581092, 0.7083426401608219, 0.8152173913043477, 0.8613109512390087, 0.6533297287555195, 0.715042372881356, 0.7761507081280788, 0.8031613357057145, 0.7943045660621763, 0.6288842159521564, 0.7893805309734513, 0.8412698412698414, 0.7653061224489797, 0.9998518518518519, 0.6552706552706553, 0.7058017005652198, 0.8518945613617277, 0.5860465737685207, 0.7176562806192437, 0.5397576096211154, 0.9880128061946244, 0.9998518518518519, 0.7521527129370266, 0.6244244244244244, 0.7450331125827815, 0.7776105260394249, 0.7766272189349114, 0.702904761904762, 0.711764705882353, 0.9945130315500685, 0.7441294195136634, 0.6465763875643715, 0.6

Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:29<00:00, 59.86it/s]
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:25<00:00, 69.91it/s]
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:32<00:00, 55.33it/s]


BLEU scores for Mistral-Nemo-Instruct-2407-it (it): [[0.7095046854082998, 0.8647939404783206, 0.7690546632854326, 0.4445422535211268, 0.9997724169321802, 0.5375231819082087, 0.5970495095388894, 0.7577614379084968, 0.6883947608894305, 0.9213952850877193, 0.770858934169279, 0.8887898192429278, 0.9990234375, 0.8406240103153416, 0.6048387096774194, 0.8819444444444444, 0.5578572923205692, 0.5814147480814148, 0.9073691460055096, 0.8440643218655903, 0.6545430672268908, 0.7291810841983852, 0.7900753662460833, 0.9772230320699709, 0.8880284139799711, 0.7066724197063737, 0.7893805309734513, 0.5357142857142857, 0.7560483870967744, 0.9998518518518519, 0.9861386138613862, 0.7491666666666666, 0.9043259259259259, 0.6233062330623307, 0.8302942848903799, 0.585238594942577, 0.7017173423423423, 0.9998518518518519, 0.8211575489862951, 0.725954619224056, 0.7357706396167933, 0.7778542968963204, 0.6190366998132013, 0.7076023391812867, 0.711764705882353, 0.9945130315500685, 0.716576622931138, 0.9995, 0.8892857

Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:25<00:00, 70.86it/s]
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:28<00:00, 62.55it/s]
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:27<00:00, 64.24it/s]

BLEU scores for Qwen2.5-7B-Instruct-it (it): [[0.701058201058201, 0.7916857086896912, 0.6641321577174558, 0.8412698412698414, 0.8934494195688226, 0.43900314031862747, 0.6538575141776938, 0.8202560324696987, 0.6883947608894305, 0.88, 0.5769986979735575, 0.7779374775422208, 0.9990234375, 0.5903043478260869, 0.6506309148264985, 0.7618884316415181, 0.7691790591096147, 0.5853699504493155, 0.8152173913043477, 0.920940170940171, 0.7833402699328141, 0.7524784954074939, 0.7761507081280788, 0.9335490187731471, 0.7723404255319148, 0.8161573086946221, 0.9417137886037409, 0.8412698412698414, 0.6107958477508649, 0.9998177842565598, 0.44834307992202727, 0.6339367703004066, 0.8569865116477394, 0.6092736551205058, 0.7134446585313637, 0.5278122189405116, 0.634712600636585, 0.7957957957957958, 0.6443520642201835, 0.7448052841612769, 0.5738636363636365, 0.7216538556485, 0.6063954244118028, 0.6738825385119633, 0.9990234375, 0.9945130315500685, 0.7156885228578301, 0.9995, 0.7559322033898305, 0.8166601151027




In [8]:
# Save meteor scores
with open('meteor_scores-exp1.pkl', 'wb') as f:
    pickle.dump(meteor_scores, f)