In [1]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
from scipy.stats import ttest_rel

# evaluation 
from evaluate import load

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
models_generations = {
    'en': {},
    'it': {},
}

files = open_files('generations')
for file in files:
    model = re.sub(r"^fine-tuned-|-decoding-gen\d+.csv", "", file)
    gen = re.search(r"gen\d+", file).group()

    model_generations = pd.read_csv('generations/' + file)
    # convert string to list
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)
    
    # if containt -en-
    if '-en-' in file:
        if model not in models_generations['en']:
            models_generations['en'][model] = {}
        models_generations['en'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: en")
    elif '-it-' in file:
        if model not in models_generations['it']:
            models_generations['it'][model] = {}
        models_generations['it'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: it")

models_generations['it']['Llama-3.1-8B-Instruct-it']['gen0']

model: Llama-3.1-8B-Instruct-en - gen: gen0, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen1, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen2, lang: en
model: Llama-3.1-8B-Instruct-it - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen2, lang: it
model: Mistral-Nemo-Instruct-2407-en - gen: gen0, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen1, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen2, lang: en
model: Mistral-Nemo-Instruct-2407-it - gen: gen0, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen1, lang: it
model: Mistral-Nemo-Instruct-2407-it - gen: gen2, lang: it
model: Qwen2.5-7B-Instruct-en - gen: gen0, lang: en
model: Qwen2.5-7B-Instruct-en - gen: gen1, lang: en
model: Qwen2.5-7B-Instruct-en - gen: gen2, lang: en
model: Qwen2.5-7B-Instruct-it - gen: gen0, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen1, lang: it
model: Qwen2.5-7B-Instruct-it - gen: gen2, lang: it


Unnamed: 0,eids,input,prediction,actual,generation
0,Id1193,Brandon_Carter knownFor Doomsday_argument,Brandon Carter è noto per il suo ruolo nel Do...,[Brandon Carter è noto per il suo lavoro con l...,<s> [INST] Given the following triples in (TRI...
1,Id427,"Piotr_Hallmann birthPlace Gdynia,_Poland Gdyni...","Piotr Hallmann è nato a Gdynia, in Polonia, c...","[Piotr Hallmann, che pesa 70,308 kg, è nato a ...",<s> [INST] Given the following triples in (TRI...
2,Id1475,Darinka_Dentcheva influencedBy Andrzej_Piotr_R...,"Darinka Dentcheva, che lavora nel campo dell'...","[Influenzata da Andrzej Piotr Ruszczyński, Dar...",<s> [INST] Given the following triples in (TRI...
3,Id766,Istanbul utcOffset +2,Istanbul ha un offset UTC di +2.,"[Istanbul si trova nel fuso orario +2 UTC., L'...",<s> [INST] Given the following triples in (TRI...
4,Id486,Nie_Haisheng birthDate 1964-10-13 Nie_Haisheng...,Nie Haisheng è nato il 13 ottobre 1964 nella ...,[Nie Haisheng è nato il 13 ottobre 1964 ed è o...,<s> [INST] Given the following triples in (TRI...
...,...,...,...,...,...
1774,Id1131,It's_Great_to_Be_Young_(1956_film) editing Max...,"Il film ""It's Great to Be Young"" del 1956, sc...",[Cecil Parker ha recitato nel film del 1956 It...,<s> [INST] Given the following triples in (TRI...
1775,Id1295,Karlsruhe elevationAboveTheSeaLevel 115.0,"Karlsruhe si trova a 115,0 sul livello del ma...",[Karlsruhe ha un'altitudine di 115 metri sul l...,<s> [INST] Given the following triples in (TRI...
1776,Id861,Bananaman broadcastedBy BBC BBC foundedBy John...,"Bananaman è trasmesso dalla BBC, fondata da J...","[John Reith, 1° Barone Reith, fonda la BBC, ch...",<s> [INST] Given the following triples in (TRI...
1777,Id1460,Shenzhou_10 cosparId 2013-029A,Il cosparId del Shenzhou 10 è 2013-029A.,[L'ID COSPAR di Shenzhou 10 è 2013-029A.],<s> [INST] Given the following triples in (TRI...


## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/bertscore">Bertscore</a>

In [4]:
bertscore = load("bertscore")

Using the latest cached version of the module from C:\Users\OliverioM\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--bertscore\cf4907b18f8f741f202232c0f8009a3bd49ff98802c245abcb6ea51a37a8c05b (last modified on Mon Mar  4 15:55:30 2024) since it couldn't be found locally at evaluate-metric--bertscore, or remotely on the Hugging Face Hub.


In [5]:
from tqdm import tqdm
import numpy as np

bertscore_scores = {'en': {}, 'it': {}}

for lang in models_generations:
    for model in models_generations[lang]:
        model_bertscore_scores = []
        print(f'Language: {lang}, Model: {model}')
        
        gens = sorted(models_generations[lang][model])  # Ensure order (gen0, gen1, gen2)
        for gen in tqdm(gens, desc=f'Processing {model} ({lang})'):
            references = models_generations[lang][model][gen]['actual']
            predictions = models_generations[lang][model][gen]['prediction']

            results = bertscore.compute(predictions=predictions, references=references, lang=lang)
            model_bertscore_scores.append(results['f1'])  # Store bertscore for each generation
        
        # Store the three bertscore scores instead of their mean
        bertscore_scores[lang][model] = model_bertscore_scores
        print(f'bertscore scores for {model} ({lang}): {model_bertscore_scores} - Average: {np.mean(model_bertscore_scores)}')
        print()


Language: en, Model: Llama-3.1-8B-Instruct-en


Processing Llama-3.1-8B-Instruct-en (en):   0%|          | 0/3 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing Llama-3.1-8B-Instruct-en (en): 100%|██████████| 3/3 [1:11:35<00:00, 1431.98s/it]


bertscore scores for Llama-3.1-8B-Instruct-en (en): [[0.9999999403953552, 0.9755687117576599, 0.9691643118858337, 0.9398939609527588, 0.9598414301872253, 0.9239080548286438, 0.9761472940444946, 0.9670397639274597, 0.9656186103820801, 0.9533048868179321, 0.958212673664093, 0.9681646823883057, 0.987339198589325, 0.9641438722610474, 0.9146397113800049, 0.9999999403953552, 0.963658332824707, 0.9258201122283936, 0.9832453727722168, 0.9492051005363464, 1.0, 0.9378374814987183, 0.9563964009284973, 0.9746262431144714, 0.9415963292121887, 0.9500493407249451, 0.9736338257789612, 1.0, 0.9480230808258057, 0.9903646111488342, 0.9942805767059326, 0.9582239389419556, 0.9625673294067383, 0.9289659261703491, 0.926780104637146, 0.9214136600494385, 0.9455626010894775, 0.9858351349830627, 0.9584633111953735, 0.9317739009857178, 0.9389771819114685, 0.9591579437255859, 0.9445483088493347, 0.9437512755393982, 1.0, 0.980296790599823, 0.9579530358314514, 0.9654635190963745, 0.981255292892456, 0.953448593616485

Processing Mistral-Nemo-Instruct-2407-en (en): 100%|██████████| 3/3 [1:19:46<00:00, 1595.46s/it]


bertscore scores for Mistral-Nemo-Instruct-2407-en (en): [[0.9999999403953552, 0.9829363822937012, 0.9737188816070557, 0.9999999403953552, 0.9862887263298035, 0.9243161678314209, 0.9683665633201599, 0.9550752639770508, 0.9467968344688416, 0.9532947540283203, 0.9558698534965515, 1.0, 0.987339198589325, 0.9635984301567078, 0.920369565486908, 0.9999999403953552, 0.9634261131286621, 0.9497230052947998, 0.9832453727722168, 0.9827113151550293, 0.9752463698387146, 0.9478526711463928, 0.9380329847335815, 0.9763203859329224, 0.9443202614784241, 0.9566362500190735, 0.9612124562263489, 0.9828670620918274, 0.9607272744178772, 0.9903646111488342, 0.9942805767059326, 0.9501579403877258, 0.9652373194694519, 0.9337447881698608, 0.9214510917663574, 0.9374306201934814, 0.9511074423789978, 0.9975405335426331, 0.9584633111953735, 0.9315940141677856, 0.9313762187957764, 0.9621526598930359, 0.9579222202301025, 0.961509644985199, 1.0, 0.980296790599823, 0.9579752683639526, 0.972524881362915, 0.97537595033645

Processing Qwen2.5-7B-Instruct-en (en): 100%|██████████| 3/3 [1:17:04<00:00, 1541.39s/it]


bertscore scores for Qwen2.5-7B-Instruct-en (en): [[0.9928882122039795, 0.9729917049407959, 0.9686226844787598, 0.9999999403953552, 0.9783089756965637, 0.9163023233413696, 0.9541284441947937, 0.9884920716285706, 0.9457038044929504, 0.9498216509819031, 0.9609667658805847, 0.9683293104171753, 1.0, 0.9679377675056458, 0.9180595874786377, 0.9485225677490234, 0.9645930528640747, 0.9192357063293457, 0.9689984917640686, 0.9492051005363464, 0.9297413229942322, 0.9447482824325562, 0.9195305109024048, 0.9517964124679565, 0.9443202614784241, 0.9489766955375671, 0.9742627739906311, 0.9676289558410645, 0.9564720988273621, 0.9904076457023621, 0.9942805767059326, 0.9576908946037292, 0.9707698225975037, 0.9302405714988708, 0.9297313094139099, 0.924543559551239, 0.9584185481071472, 0.9975405335426331, 0.9520887136459351, 0.9550122618675232, 0.952795684337616, 0.9588478207588196, 0.9491767287254333, 0.9330609440803528, 0.9801568984985352, 0.9785155653953552, 0.9672107100486755, 0.9903160333633423, 0.973

Processing Llama-3.1-8B-Instruct-it (it): 100%|██████████| 3/3 [22:53<00:00, 457.70s/it]


bertscore scores for Llama-3.1-8B-Instruct-it (it): [[0.9124559164047241, 0.8709936738014221, 0.9343350529670715, 0.8514431118965149, 1.0, 0.8036054372787476, 0.919719398021698, 0.9745334386825562, 0.913061797618866, 0.9674572944641113, 0.9160212874412537, 0.918339729309082, 0.9999999403953552, 0.901322066783905, 0.8533045053482056, 0.9753714799880981, 0.8723885416984558, 0.8915979266166687, 0.9239615797996521, 0.9488560557365417, 0.9440426826477051, 0.8525288105010986, 0.9381900429725647, 0.935106635093689, 0.8654909133911133, 0.8706619143486023, 0.9473466277122498, 0.9871997237205505, 0.8804678916931152, 1.0, 0.8174940347671509, 0.8675440549850464, 0.9230707287788391, 0.8343014717102051, 0.8812362551689148, 0.8277773857116699, 0.9825567603111267, 1.0, 0.9527955651283264, 0.8125050067901611, 0.8640748262405396, 0.9123405814170837, 0.8798090219497681, 0.8863799571990967, 0.9085203409194946, 0.9878652691841125, 0.8834203481674194, 0.8907316327095032, 0.8692023754119873, 0.89546567201614

Processing Mistral-Nemo-Instruct-2407-it (it): 100%|██████████| 3/3 [21:35<00:00, 431.71s/it]


bertscore scores for Mistral-Nemo-Instruct-2407-it (it): [[0.8829940557479858, 0.897627592086792, 0.9159049391746521, 0.8514431118965149, 1.0, 0.7991232872009277, 0.882602870464325, 0.8850091099739075, 0.9037794470787048, 0.9567463397979736, 0.9160212874412537, 0.9402575492858887, 0.9999999403953552, 0.9595977067947388, 0.828165590763092, 0.9753714799880981, 0.868842601776123, 0.8509210348129272, 0.9731349945068359, 0.9785830974578857, 0.9456759095191956, 0.8896697759628296, 0.924561083316803, 0.9792613387107849, 0.908886194229126, 0.8767868876457214, 0.9473466277122498, 0.9168255925178528, 0.8977922797203064, 1.0, 0.9809784889221191, 0.8945854902267456, 0.912769615650177, 0.8487603068351746, 0.9294781684875488, 0.8348931670188904, 0.9225883483886719, 1.0, 0.9582328200340271, 0.8292344808578491, 0.8654000163078308, 0.9026670455932617, 0.9119797945022583, 0.8648281693458557, 0.9085203409194946, 0.9878652691841125, 0.8782879114151001, 1.000000238418579, 0.9288711547851562, 0.906770765781

Processing Qwen2.5-7B-Instruct-it (it): 100%|██████████| 3/3 [21:58<00:00, 439.38s/it]

bertscore scores for Qwen2.5-7B-Instruct-it (it): [[0.8443081378936768, 0.8524596095085144, 0.8902831673622131, 0.9797024726867676, 0.9488788843154907, 0.8008025884628296, 0.9084104299545288, 0.9206162095069885, 0.9037794470787048, 0.92048180103302, 0.9061306118965149, 0.937617301940918, 0.9999999403953552, 0.8546064496040344, 0.8255196213722229, 0.9141436219215393, 0.8742873668670654, 0.8846025466918945, 0.9239615797996521, 0.9961159825325012, 0.9137716889381409, 0.8443050384521484, 0.9381900429725647, 0.9879658818244934, 0.8909716010093689, 0.9250983595848083, 0.9269139766693115, 0.9657539129257202, 0.8426083326339722, 0.9999999403953552, 0.7764139175415039, 0.8556170463562012, 0.9348837733268738, 0.809691309928894, 0.901153028011322, 0.8494065999984741, 0.8668110966682434, 0.8819257020950317, 0.9324167966842651, 0.9396616816520691, 0.8426828980445862, 0.9047004580497742, 0.833283007144928, 0.879375696182251, 1.0000001192092896, 0.9878652691841125, 0.8731351494789124, 1.0000002384185




In [None]:
#save bertscore scores
import pickle

with open('bertscore_scores-exp1.pkl', 'wb') as f:
    pickle.dump(bertscore_scores, f)