In [1]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
import pickle

# evaluation 
from evaluate import load

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
models_generations = {
    'en': {},
    'it': {},
    'ru': {},
    'ge': {}
}

files = open_files('generations')
for file in files:
    model = re.sub(r"^fine-tuned-|-exp2-gen\d+.csv", "", file)
    gen = re.search(r"gen\d+", file).group()

    model_generations = pd.read_csv('generations/' + file)
    # convert string to list
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)
    
    # if containt -en-
    if '-en-' in file:
        if model not in models_generations['en']:
            models_generations['en'][model] = {}
        models_generations['en'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: en")
    elif '-it-' in file:
        if model not in models_generations['it']:
            models_generations['it'][model] = {}
        models_generations['it'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: it")
    elif '-ge-' in file:
        if model not in models_generations['ge']:
            models_generations['ge'][model] = {}
        models_generations['ge'][model][gen] = model_generations
        print(f"model: {model} - ge: {gen}, lang: ge")
    elif '-ru-' in file:
        if model not in models_generations['ru']:
            models_generations['ru'][model] = {}
        models_generations['ru'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: ru")

model: Llama-3.1-8B-Instruct-en - gen: gen0, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen1, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen2, lang: en
model: Llama-3.1-8B-Instruct-ge - ge: gen0, lang: ge
model: Llama-3.1-8B-Instruct-ge - ge: gen1, lang: ge
model: Llama-3.1-8B-Instruct-ge - ge: gen2, lang: ge
model: Llama-3.1-8B-Instruct-it - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen2, lang: it
model: Llama-3.1-8B-Instruct-ru - gen: gen0, lang: ru
model: Llama-3.1-8B-Instruct-ru - gen: gen1, lang: ru
model: Llama-3.1-8B-Instruct-ru - gen: gen2, lang: ru
model: Mistral-Nemo-Instruct-2407-en - gen: gen0, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen1, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen2, lang: en
model: Mistral-Nemo-Instruct-2407-ge - ge: gen0, lang: ge
model: Mistral-Nemo-Instruct-2407-ge - ge: gen1, lang: ge
model: Mistral-Nemo-Instruct-2407-ge - ge: gen2, lang: ge
mode

## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/meteor">METEOR</a>

In [4]:
meteor = load('meteor')

Using the latest cached version of the module from C:\Users\OliverioM\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--meteor\e7ed321a1b44c34fa4679192809db2cee7e3bd4bba0fe8b76061d807706c2374 (last modified on Thu Oct 10 16:06:59 2024) since it couldn't be found locally at evaluate-metric--meteor, or remotely on the Hugging Face Hub.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OliverioM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\OliverioM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\OliverioM\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
meteor_scores = {'en': {}, 'it': {}, 'ge': {}, 'ru': {}}

for lang in models_generations:
    for model in models_generations[lang]:
        model_meteor_scores = []
        print(f'Language: {lang}, Model: {model}')
        
        for gen in sorted(models_generations[lang][model]):  # Ensure order (gen0, gen1, gen2)
            references = models_generations[lang][model][gen]['actual']
            predictions = models_generations[lang][model][gen]['prediction']

            gen_meteor_scores = []
            for i in tqdm(range(len(references)), desc="Calcolo punteggi METEOR"):
                results = meteor.compute(predictions=[predictions[i]], references=[references[i]])
                gen_meteor_scores.append(results["meteor"])

            model_meteor_scores.append(gen_meteor_scores)  # Store BLEU for each generation
        
        # Store the three METEOR scores instead of their mean
        meteor_scores[lang][model] = model_meteor_scores
        print(f'BLEU scores for {model} ({lang}): {model_meteor_scores}')
        print(f'Average: {np.mean(model_meteor_scores)}')
        print()

Language: en, Model: Llama-3.1-8B-Instruct-en


Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:09<00:00, 39.47it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:04<00:00, 83.08it/s] 
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 104.88it/s]


BLEU scores for Llama-3.1-8B-Instruct-en (en): [[0.9990234375, 0.9353193773483629, 0.5681818181818182, 0.6457115009746588, 0.5365350099079751, 0.8780549256739733, 0.75625, 0.5971471857759483, 0.9990234375, 0.9905133928571428, 0.6755884901104169, 0.7455234159779615, 0.9996243425995492, 0.8979985955056179, 0.8950617283950617, 0.9990234375, 0.9490196078431373, 0.9985422740524781, 0.9283625730994152, 0.9835843169176502, 0.8313775831653225, 0.789243781019589, 0.6782293923155853, 0.9997106481481481, 0.985735843633228, 0.8066666666666668, 0.9997724169321802, 0.6499975701025417, 0.8786823033100167, 0.5353535353535354, 0.8950617283950617, 0.857836285547282, 0.84162772810599, 0.7819174181300428, 0.8300434512725017, 0.9993141289437586, 0.8227909482758621, 0.7089466816235891, 0.9999375, 0.9526488919667591, 0.7883417917431522, 0.889690170940171, 0.7229026760276759, 0.7199882041953272, 0.9366457015045212, 0.7702460567823343, 0.8745874587458746, 0.9995, 0.9985422740524781, 0.8610146186134028, 0.9995,

Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:04<00:00, 95.67it/s] 
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 105.44it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 98.62it/s] 


BLEU scores for Mistral-Nemo-Instruct-2407-en (en): [[0.9990234375, 0.9353193773483629, 0.5681818181818182, 0.7305194805194806, 0.5513044491031913, 0.9101196803950482, 0.6918799458187781, 0.5336896663427275, 0.9990234375, 0.9905133928571428, 0.8027522935779816, 0.8894491207600632, 0.9996243425995492, 0.738954922628392, 0.8950617283950617, 0.9990234375, 0.9490196078431373, 0.9985422740524781, 0.7258064516129032, 0.9993141289437586, 0.786574074074074, 0.8964143426294822, 0.7773446457234581, 0.9268808114961962, 0.9114583333333334, 0.8066666666666668, 0.9997724169321802, 0.6898601398601399, 0.8816137566137566, 0.5353535353535354, 0.6918367346938775, 0.8140409370652161, 0.7029435016148292, 0.6701150089579841, 0.8300434512725017, 0.9728931830381106, 0.7592654503773045, 0.9537627551020408, 0.9641009852216748, 0.8743523316062176, 0.7958612611440098, 0.889690170940171, 0.5105819397993311, 0.6898846495119786, 0.8416446163240382, 0.8082429582065591, 0.8745874587458746, 0.9799019607843138, 0.99854

Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 108.05it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 108.48it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 106.43it/s]


BLEU scores for Qwen2.5-7B-Instruct-en (en): [[0.9990234375, 0.9353193773483629, 0.8745874587458746, 0.6971619192477876, 0.722902676027676, 0.8823575258760444, 0.7301424501424502, 0.4774982908602277, 0.9990234375, 0.9905133928571428, 0.7910002543077673, 0.7382726108718426, 0.9996243425995492, 0.854119425547997, 0.8950617283950617, 0.9455976122642791, 0.9584158415841585, 0.9067055393586005, 0.6433945105820107, 0.9835843169176502, 0.6871466654834929, 0.9342766826941932, 0.8256704980842913, 0.9268808114961962, 0.8565647482014387, 0.9976851851851852, 0.9997724169321802, 0.6898601398601399, 0.7806978798586572, 0.7150900900900901, 0.9995, 0.8596744203695974, 0.8664225865209471, 0.7053231814270042, 0.8848012889366272, 0.8819444444444444, 0.7836004273504275, 0.9422222222222222, 0.9896039603960397, 0.728322440087146, 0.8037238799902431, 0.8333333333333334, 0.8330609318996415, 0.6814273846094399, 0.9366457015045212, 0.8082429582065591, 0.8745874587458746, 0.9995, 0.9985422740524781, 0.8610146186

Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 104.17it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 102.96it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 107.97it/s]


BLEU scores for Llama-3.1-8B-Instruct-it (it): [[0.9995, 0.9995, 0.5625, 0.7418154761904762, 0.6399317406143346, 0.7562196601941749, 0.6055818852027383, 0.7474959935897436, 0.9993141289437586, 0.9347587719298245, 0.687235004364634, 0.6809160305343512, 0.9997106481481481, 0.8819444444444444, 0.9998518518518519, 0.9985422740524781, 0.9996243425995492, 0.9976851851851852, 0.8353403473190518, 0.9996243425995492, 0.7887965527854724, 0.8754577595684604, 0.7667908276102898, 0.9997106481481481, 0.9844782983615982, 0.9976851851851852, 0.9998779296875, 0.555496878373938, 0.8341611087460384, 0.3381818181818182, 0.9993141289437586, 0.7215909090909091, 0.7571992249956062, 0.7375661375661378, 0.9999530428249437, 0.9993141289437586, 0.8858816964285713, 0.8102976640391606, 0.9429245283018868, 0.9085607496454559, 0.8683000163731898, 0.7657150012521914, 0.7290519031141869, 0.5465298406697272, 0.9408635492895633, 0.9161745962354303, 0.7309228039041704, 0.9996243425995492, 0.7454289732770746, 0.9998982291

Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 101.08it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 120.37it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 100.16it/s]


BLEU scores for Mistral-Nemo-Instruct-2407-it (it): [[0.9995, 0.9995, 0.7053228091479459, 0.7453051643192489, 0.5991482465562402, 0.7711943069306932, 0.6601058201058201, 0.6968383208205561, 0.9976851851851852, 0.9347587719298245, 0.55529714259873, 0.7102272727272727, 0.9997106481481481, 0.9993141289437586, 0.9998518518518519, 0.9985422740524781, 0.9996243425995492, 0.9976851851851852, 0.8030303030303031, 0.9997724169321802, 0.8660247971339777, 0.9073691460055096, 0.6585034013605443, 0.9997106481481481, 0.8187416978625771, 0.9976851851851852, 0.9998779296875, 0.5609630470958029, 0.7490758620689654, 0.3381818181818182, 0.9993141289437586, 0.6815843621399177, 0.839578947368421, 0.5409331661057323, 0.8636178810134946, 0.9993141289437586, 0.8596071518509192, 0.8872446705033639, 0.9999271030762502, 0.7626904539800996, 0.8335382345709369, 0.8587786259541985, 0.7290519031141869, 0.4444444444444444, 0.9408635492895633, 0.8621018500858287, 0.9995, 0.6756756756756757, 0.7454289732770746, 0.850007

Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 110.87it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 106.31it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 107.30it/s]


BLEU scores for Qwen2.5-7B-Instruct-it (it): [[0.6716861501882306, 0.7059558517284465, 0.6592637054821929, 0.5957630903204197, 0.5629372270998637, 0.7750310945273633, 0.6878153515834675, 0.5952380952380952, 0.9993141289437586, 0.9347587719298245, 0.5453928571428571, 0.5852417302798982, 0.9997106481481481, 0.9993141289437586, 0.9998518518518519, 0.8412698412698414, 0.7080965909090909, 0.9985422740524781, 0.5178125, 0.9996243425995492, 0.8294209702660407, 0.923391106960925, 0.7903073286052009, 0.9063588552694488, 0.868875893437297, 0.8300000000000002, 0.9422781271837876, 0.6011795343137255, 0.7712456641501735, 0.3381818181818182, 0.8819444444444444, 0.7282110091743119, 0.7710982209907554, 0.6229987191802753, 0.8674934559963404, 0.7515909589642309, 0.8213540785839112, 0.6499975701025417, 0.9368569958847738, 0.883890601225079, 0.8335382345709369, 0.7657150012521914, 0.5323642526562806, 0.48135212172307196, 0.9999375, 0.8881470491262776, 0.7636335784313727, 0.9996243425995492, 0.72702331961

Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 107.55it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 96.52it/s] 
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 105.86it/s]


BLEU scores for Llama-3.1-8B-Instruct-ru (ru): [[0.9990234375, 0.9995, 0.5616605616605616, 0.6987577639751552, 0.4379834125430994, 0.6859756097560975, 0.2382060831781502, 0.3166852678571429, 0.9976851851851852, 0.8593596927779301, 0.534860784462484, 0.9471552681429224, 0.9993141289437586, 0.7934426229508196, 0.9993141289437586, 0.9985422740524781, 0.6226379440665155, 0.7211538461538461, 0.5524157801418439, 0.9993141289437586, 0.8075404248243755, 0.9395238095238095, 0.8787827461607949, 0.9995, 0.5603642086330936, 0.7937500000000002, 0.9997106481481481, 0.31250000000000006, 0.7136000000000001, 0.5111111111111111, 0.9993141289437586, 0.5984042553191489, 0.7171494132779744, 0.5803050559066603, 0.7943507870938931, 0.5208333333333334, 0.6882225433526011, 0.4152960526315789, 0.999898229187869, 0.9036041288021672, 0.5394624510583179, 0.9444568535477627, 0.4855213994565217, 0.3884171195652174, 0.6987577639751552, 0.7062107616124237, 0.6861724281549355, 0.9993141289437586, 0.9976851851851852, 0.

Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:04<00:00, 85.41it/s] 
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:04<00:00, 81.41it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 101.73it/s]


BLEU scores for Mistral-Nemo-Instruct-2407-ru (ru): [[0.7361111111111112, 0.9995, 0.7552083333333334, 0.7630214040470452, 0.6843273180751175, 0.54461634630411, 0.2592388306674021, 0.4868872817590766, 0.9976851851851852, 0.7311294765840222, 0.7647753233628033, 0.7153712548849326, 0.9993141289437586, 0.7446153846153847, 0.9993141289437586, 0.9490196078431373, 0.7349537037037036, 0.8036036036036037, 0.49460827464788737, 0.9993141289437586, 0.7973978467805629, 0.8947766040636721, 0.6377551020408164, 0.9995, 0.6160910087719298, 0.7500000000000001, 0.9997106481481481, 0.23557397176181705, 0.7125920907782198, 0.7886904761904763, 0.9993141289437586, 0.5694117647058824, 0.4302855264393726, 0.6546991148983864, 0.7943507870938931, 0.6722222222222223, 0.587202380952381, 0.613203125, 0.999898229187869, 0.9036041288021672, 0.5584219898634766, 0.9530428249436514, 0.45909090909090905, 0.5877370913956798, 0.9228433402346445, 0.8515317329955868, 0.7773446457234581, 0.9142661179698217, 0.8066666666666668

Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 109.09it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 103.84it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 103.88it/s]


BLEU scores for Qwen2.5-7B-Instruct-ru (ru): [[0.7332282110091743, 0.9995, 0.4043478260869566, 0.6794643791191547, 0.5447821409359871, 0.7838736007462688, 0.5408583186360963, 0.43512195121951225, 0.9976851851851852, 0.7928623988226638, 0.6625514403292181, 0.7392265193370167, 0.9993141289437586, 0.3896604938271605, 0.9993141289437586, 0.7352941176470589, 0.2298850574712644, 0.6305084745762711, 0.2772207148755456, 0.9993141289437586, 0.6973944954128442, 0.5571663802978236, 0.8454412156758198, 0.9995, 0.7155126140633389, 0.7500000000000001, 0.9997106481481481, 0.5207959037010421, 0.7433333333333335, 0.33513513513513515, 0.9993141289437586, 0.5256980781456305, 0.7795759300776023, 0.6848169942139685, 0.918196137515409, 0.41471048513302033, 0.7543103448275862, 0.9413684247862946, 0.7817784256559768, 0.8029976696560976, 0.5448841842347834, 0.9530428249436514, 0.5615413232600733, 0.5185185185185186, 0.754206552113681, 0.7191249553828832, 0.7773446457234581, 0.2869318181818182, 0.80666666666666

Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 100.06it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:04<00:00, 93.22it/s] 
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 101.66it/s]


BLEU scores for Llama-3.1-8B-Instruct-ge (ge): [[0.9985422740524781, 0.9084673899488716, 0.6724489795918367, 0.6048387096774194, 0.3048955463728191, 0.7212094907407408, 0.8057510753589187, 0.4309622368387636, 0.6098360655737705, 0.9998779296875, 0.6676557863501484, 0.920940170940171, 0.9054545454545455, 0.8979985955056179, 0.9993141289437586, 0.5563218390804598, 0.5318681318681319, 0.9997106481481481, 0.860948667966212, 0.854119425547997, 0.876565060635857, 0.9438134228050196, 0.6490131578947369, 0.8103975535168196, 0.9733514001806685, 0.9976851851851852, 0.9997106481481481, 0.4508928571428571, 0.7987079037800686, 0.48478260869565226, 0.618131868131868, 0.7670454545454546, 0.8389171075837741, 0.6489948311237375, 0.6713681849551414, 0.9993141289437586, 0.7051990346311535, 0.9438513106090165, 0.9999460101500918, 0.9402268760907505, 0.5258550572795324, 0.9259259259259259, 0.3880070546737213, 0.4920764867551605, 0.8810643991712706, 0.8039867109634551, 0.855940934065934, 0.8775114984265312,

Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 101.70it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 113.73it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 103.05it/s]


BLEU scores for Mistral-Nemo-Instruct-2407-ge (ge): [[0.9985422740524781, 0.9084673899488716, 0.42216721672167207, 0.6048387096774194, 0.4838423295454545, 0.8141321044546851, 0.8270058369608231, 0.43835978835978845, 0.43314500941619577, 0.9998779296875, 0.726739537646199, 0.8186789287267755, 0.6918367346938775, 0.6691919191919192, 0.9993141289437586, 0.5563218390804598, 0.5318681318681319, 0.9498207885304659, 0.9213952850877193, 0.9995, 0.7932930086680797, 0.9349145063430778, 0.6410596026490065, 0.8030303030303031, 0.8151159690749132, 0.8066666666666668, 0.9997106481481481, 0.6136363636363636, 0.643598615916955, 0.6400966183574879, 0.618131868131868, 0.798826530612245, 0.8522727272727272, 0.5600841869314777, 0.7725591715976331, 0.9728931830381106, 0.7496253576131874, 0.7816606293060586, 0.9999460101500918, 0.9467186484730344, 0.577453276728639, 0.889690170940171, 0.4993288590604027, 0.6639462775381941, 0.8147445157389055, 0.5957054539450753, 0.7471655328798186, 0.6861724281549355, 0.99

Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 103.71it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 101.15it/s]
Calcolo punteggi METEOR: 100%|██████████| 383/383 [00:03<00:00, 104.62it/s]

BLEU scores for Qwen2.5-7B-Instruct-ge (ge): [[0.9985422740524781, 0.9995, 0.372, 0.6104651162790697, 0.5828578838174273, 0.7186411149825784, 0.6753535353535353, 0.44002112629563606, 0.6098360655737705, 0.9259259259259259, 0.5813517639852629, 0.8248810418231906, 0.9861386138613862, 0.8070175438596492, 0.6545430672268908, 0.3703703703703703, 0.6400966183574879, 0.8493645392041114, 0.6944444444444445, 0.9995, 0.7720588235294117, 0.7324630386914124, 0.7016690973910226, 0.6756756756756757, 0.6229323308270678, 0.9976851851851852, 0.9997106481481481, 0.4479817757361248, 0.7036124567474049, 0.6400966183574879, 0.66167290886392, 0.8130612244897959, 0.8090413716507011, 0.5495925320564471, 0.8848012889366272, 0.7687074829931974, 0.6463952282157674, 0.6553603516640182, 0.9999460101500918, 0.9941842757900539, 0.7526014358312495, 0.9641356881521297, 0.4905987933062648, 0.46073264986745877, 0.8810643991712706, 0.6571162667021205, 0.7471655328798186, 0.7425742574257427, 0.9985422740524781, 0.74957983




In [None]:
# Save meteor scores
with open('meteor_scores-exp2.pkl', 'wb') as f:
    pickle.dump(meteor_scores, f)