In [1]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
import pickle

# evaluation 
from evaluate import load

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
models_generations = {
    'it': {},
}

files = open_files('generations')
for file in files:
    model = re.sub(r"^fine-tuned-|-decoding-gen\d+.csv", "", file)
    gen = re.search(r"gen\d+", file).group()

    model_generations = pd.read_csv('generations/' + file)
    # convert string to list
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)
    
    # if containt -en-
    if '-it-' in file:
        if model not in models_generations['it']:
            models_generations['it'][model] = {}
        models_generations['it'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: it")

model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen0, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen1, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen2, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen2, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen0, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen1, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen2, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen0, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen1, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen2, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen0, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen1, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen2, lang: it


## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/meteor">METEOR</a>

In [4]:
meteor = load('meteor')

Using the latest cached version of the module from C:\Users\OliverioM\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--meteor\e7ed321a1b44c34fa4679192809db2cee7e3bd4bba0fe8b76061d807706c2374 (last modified on Thu Oct 10 16:06:59 2024) since it couldn't be found locally at evaluate-metric--meteor, or remotely on the Hugging Face Hub.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OliverioM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\OliverioM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\OliverioM\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
meteor_scores = {'it': {}}

for lang in models_generations:
    for model in models_generations[lang]:
        model_meteor_scores = []
        print(f'Language: {lang}, Model: {model}')
        
        for gen in sorted(models_generations[lang][model]):  # Ensure order (gen0, gen1, gen2)
            references = models_generations[lang][model][gen]['actual']
            predictions = models_generations[lang][model][gen]['prediction']

            gen_meteor_scores = []
            for i in tqdm(range(len(references)), desc="Calcolo punteggi METEOR"):
                results = meteor.compute(predictions=[predictions[i]], references=[references[i]])
                gen_meteor_scores.append(results["meteor"])

            model_meteor_scores.append(gen_meteor_scores)  # Store BLEU for each generation
        
        # Store the three METEOR scores instead of their mean
        meteor_scores[lang][model] = model_meteor_scores
        print(f'BLEU scores for {model} ({lang}): {model_meteor_scores}')
        print(f'Average: {np.mean(model_meteor_scores)}')
        print()

Language: it, Model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it


Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:20<00:00, 87.62it/s] 
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:17<00:00, 99.45it/s] 
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:17<00:00, 101.36it/s]


BLEU scores for LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it (it): [[0.864795918367347, 0.8399364909781577, 0.7819174181300427, 0.4445422535211268, 0.9997724169321802, 0.5592966316501944, 0.605, 0.9455308167408111, 0.6373458153021873, 0.9338235294117647, 0.8172089734086514, 0.9933426414941622, 0.9990234375, 0.5935202688510816, 0.5828256393784457, 0.6392045454545455, 0.47473363774733646, 0.5991586538461539, 0.9073691460055096, 0.8613109512390087, 0.6533297287555195, 0.5894273127753304, 0.7761507081280788, 0.7117914096147131, 0.8880284139799711, 0.7621672796735843, 0.696969696969697, 0.9985422740524781, 0.6702149298257509, 0.9998518518518519, 0.6284569902727446, 0.6904026970791458, 0.8569865116477394, 0.5986577181208053, 0.7135997833807713, 0.5367629462479029, 0.7080965909090909, 0.9998518518518519, 0.7521527129370266, 0.5670203921952174, 0.6552706552706553, 0.7436105755433486, 0.6055319650199414, 0.649131335874933, 0.711764705882353, 0.9945130315500685, 0.718597412109375, 0.6465763875643715, 0

Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:17<00:00, 99.84it/s] 
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:18<00:00, 96.05it/s] 
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:19<00:00, 93.47it/s] 


BLEU scores for Llama-3.1-8B-Instruct-it (it): [[0.866013071895425, 0.8150145871497387, 0.7864900346103353, 0.4445422535211268, 0.9997724169321802, 0.4697875845066774, 0.6660470478509806, 0.9120879120879121, 0.7178859447004607, 0.9338235294117647, 0.770858934169279, 0.8179012345679012, 0.9990234375, 0.675461570795745, 0.6160564361977379, 0.8819444444444444, 0.5635887028581092, 0.7083426401608219, 0.8152173913043477, 0.8613109512390087, 0.6533297287555195, 0.715042372881356, 0.7761507081280788, 0.8031613357057145, 0.7943045660621763, 0.6288842159521564, 0.7893805309734513, 0.8412698412698414, 0.7653061224489797, 0.9998518518518519, 0.6552706552706553, 0.7058017005652198, 0.8518945613617277, 0.5860465737685207, 0.7176562806192437, 0.5397576096211154, 0.9880128061946244, 0.9998518518518519, 0.7521527129370266, 0.6244244244244244, 0.7450331125827815, 0.7776105260394249, 0.7766272189349114, 0.702904761904762, 0.711764705882353, 0.9945130315500685, 0.7441294195136634, 0.6465763875643715, 0.6

Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:18<00:00, 97.81it/s] 
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:18<00:00, 96.61it/s] 
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:18<00:00, 93.77it/s] 


BLEU scores for Minerva-7B-instruct-v1.0-it (it): [[0.701058201058201, 0.755176262315271, 0.594555177442878, 0.6281690140845071, 0.9997724169321802, 0.43457943925233644, 0.6332636689433276, 0.9325970149253732, 0.7877929687500002, 0.9338235294117647, 0.5662393162393162, 0.7014492753623188, 0.3211009174311927, 0.6036306542197937, 0.616801572949855, 0.6320224719101123, 0.7232470588235296, 0.715702479338843, 0.920940170940171, 0.826140873015873, 0.7458847736625513, 0.8367497691597415, 0.5807522123893806, 0.7098765432098765, 0.6932463707132336, 0.7281553398058254, 0.9880128061946244, 0.8412698412698414, 0.5756522368873681, 0.9319727891156462, 0.761888431641518, 0.5486586349064427, 0.8556036724927173, 0.5859753077443892, 0.7748538011695907, 0.42511154598825834, 0.6496644295302012, 0.8044763513513514, 0.8211575489862951, 0.5717948717948718, 0.7120253164556962, 0.7006171857601883, 0.6578271841280892, 0.7406182386994913, 0.864795918367347, 0.9404304121972612, 0.7453219814241486, 0.9041027559546

Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:19<00:00, 90.77it/s] 
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:18<00:00, 96.28it/s] 
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:18<00:00, 94.87it/s] 


BLEU scores for Mistral-Nemo-Instruct-2407-it (it): [[0.7095046854082998, 0.8647939404783206, 0.7690546632854326, 0.4445422535211268, 0.9997724169321802, 0.5375231819082087, 0.5970495095388894, 0.7577614379084968, 0.6883947608894305, 0.9213952850877193, 0.770858934169279, 0.8887898192429278, 0.9990234375, 0.8406240103153416, 0.6048387096774194, 0.8819444444444444, 0.5578572923205692, 0.5814147480814148, 0.9073691460055096, 0.8440643218655903, 0.6545430672268908, 0.7291810841983852, 0.7900753662460833, 0.9772230320699709, 0.8880284139799711, 0.7066724197063737, 0.7893805309734513, 0.5357142857142857, 0.7560483870967744, 0.9998518518518519, 0.9861386138613862, 0.7491666666666666, 0.9043259259259259, 0.6233062330623307, 0.8302942848903799, 0.585238594942577, 0.7017173423423423, 0.9998518518518519, 0.8211575489862951, 0.725954619224056, 0.7357706396167933, 0.7778542968963204, 0.6190366998132013, 0.7076023391812867, 0.711764705882353, 0.9945130315500685, 0.716576622931138, 0.9995, 0.8892857

Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:22<00:00, 77.36it/s]
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:20<00:00, 88.82it/s] 
Calcolo punteggi METEOR: 100%|██████████| 1779/1779 [00:18<00:00, 95.57it/s] 

BLEU scores for Qwen2.5-7B-Instruct-it (it): [[0.701058201058201, 0.7916857086896912, 0.6641321577174558, 0.8412698412698414, 0.8934494195688226, 0.43900314031862747, 0.6538575141776938, 0.8202560324696987, 0.6883947608894305, 0.88, 0.5769986979735575, 0.7779374775422208, 0.9990234375, 0.5903043478260869, 0.6506309148264985, 0.7618884316415181, 0.7691790591096147, 0.5853699504493155, 0.8152173913043477, 0.920940170940171, 0.7833402699328141, 0.7524784954074939, 0.7761507081280788, 0.9335490187731471, 0.7723404255319148, 0.8161573086946221, 0.9417137886037409, 0.8412698412698414, 0.6107958477508649, 0.9998177842565598, 0.44834307992202727, 0.6339367703004066, 0.8569865116477394, 0.6092736551205058, 0.7134446585313637, 0.5278122189405116, 0.634712600636585, 0.7957957957957958, 0.6443520642201835, 0.7448052841612769, 0.5738636363636365, 0.7216538556485, 0.6063954244118028, 0.6738825385119633, 0.9990234375, 0.9945130315500685, 0.7156885228578301, 0.9995, 0.7559322033898305, 0.8166601151027




In [6]:
# Save meteor scores
with open('meteor_scores-exp3.pkl', 'wb') as f:
    pickle.dump(meteor_scores, f)