In [1]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
from scipy.stats import ttest_rel

# evaluation 
from evaluate import load

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
models_generations = {
    'en': {},
    'it': {},
    'ru': {},
    'ge': {}
}

files = open_files('generations')
for file in files:
    model = re.sub(r"^fine-tuned-|-exp2-gen\d+.csv", "", file)
    gen = re.search(r"gen\d+", file).group()

    model_generations = pd.read_csv('generations/' + file)
    # convert string to list
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)
    
    # if containt -en-
    if '-en-' in file:
        if model not in models_generations['en']:
            models_generations['en'][model] = {}
        models_generations['en'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: en")
    elif '-it-' in file:
        if model not in models_generations['it']:
            models_generations['it'][model] = {}
        models_generations['it'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: it")
    elif '-ge-' in file:
        if model not in models_generations['ge']:
            models_generations['ge'][model] = {}
        models_generations['ge'][model][gen] = model_generations
        print(f"model: {model} - ge: {gen}, lang: ge")
    elif '-ru-' in file:
        if model not in models_generations['ru']:
            models_generations['ru'][model] = {}
        models_generations['ru'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: ru")

models_generations['it']['Llama-3.1-8B-Instruct-it']['gen0']

model: Llama-3.1-8B-Instruct-en - gen: gen0, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen1, lang: en
model: Llama-3.1-8B-Instruct-en - gen: gen2, lang: en
model: Llama-3.1-8B-Instruct-ge - ge: gen0, lang: ge
model: Llama-3.1-8B-Instruct-ge - ge: gen1, lang: ge
model: Llama-3.1-8B-Instruct-ge - ge: gen2, lang: ge
model: Llama-3.1-8B-Instruct-it - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen2, lang: it
model: Llama-3.1-8B-Instruct-ru - gen: gen0, lang: ru
model: Llama-3.1-8B-Instruct-ru - gen: gen1, lang: ru
model: Llama-3.1-8B-Instruct-ru - gen: gen2, lang: ru
model: Mistral-Nemo-Instruct-2407-en - gen: gen0, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen1, lang: en
model: Mistral-Nemo-Instruct-2407-en - gen: gen2, lang: en
model: Mistral-Nemo-Instruct-2407-ge - ge: gen0, lang: ge
model: Mistral-Nemo-Instruct-2407-ge - ge: gen1, lang: ge
model: Mistral-Nemo-Instruct-2407-ge - ge: gen2, lang: ge
mode

Unnamed: 0,eids,input,prediction,actual,generation
0,Id3,103_Colmore_Row completionDate 1976,Il 103 Colmore Row è stato completato nel 1976.,[Il 103 Colmore Row è stato completato nel 197...,<s> [INST] Given the following triples in (TRI...
1,Id10,AWH_Engineering_College state Kerala,L'AWH Engineering College si trova nello stat...,[L'AWH Engineering College si trova nello stat...,<s> [INST] Given the following triples in (TRI...
2,Id148,Bakso country Chinese_cuisine Bakso ingredient...,Il bakso è un piatto cinese che contiene tapi...,"[La tapioca è un ingrediente del bakso, prodot...",<s> [INST] Given the following triples in (TRI...
3,Id72,Elliot_See birthPlace Dallas Elliot_See deathP...,"Elliot See, nato a Dallas, Stati Uniti, è mor...",[Elliot See era un cittadino statunitense nato...,<s> [INST] Given the following triples in (TRI...
4,Id216,Batchoy country Philippines Philippines ethnic...,Il batchoy è un piatto originario delle Filip...,[Le Filippine sono il Paese da cui proviene Ba...,<s> [INST] Given the following triples in (TRI...
...,...,...,...,...,...
378,Id133,Appleton_International_Airport cityServed Appl...,L'aeroporto internazionale di Appleton serve ...,"[Greenville, dove si trova l'aeroporto interna...",<s> [INST] Given the following triples in (TRI...
379,Id18,Serie_A champions Juventus_F.C. A.S._Roma full...,Il nome completo dell'A.S. Roma è Associazion...,"[L'""Associazione Sportiva Roma S.p.A."" è il no...",<s> [INST] Given the following triples in (TRI...
380,Id22,Serie_B champions Carpi_F.C._1909 A.C._Cesena ...,Il Carpi FC 1909 è stato campione della Serie...,[L'A.C. Cesena gioca allo Stadio Dino Manuzzi ...,<s> [INST] Given the following triples in (TRI...
381,Id12,Bakewell_tart ingredient Frangipane,Il frangipane è un ingrediente della torta Ba...,[Un ingrediente della torta bakewell è il fran...,<s> [INST] Given the following triples in (TRI...


## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/bertscore">Bertscore</a>

In [4]:
bertscore = load("bertscore")

Using the latest cached version of the module from C:\Users\OliverioM\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--bertscore\cf4907b18f8f741f202232c0f8009a3bd49ff98802c245abcb6ea51a37a8c05b (last modified on Mon Mar  4 15:55:30 2024) since it couldn't be found locally at evaluate-metric--bertscore, or remotely on the Hugging Face Hub.


In [6]:
from tqdm import tqdm
import numpy as np

bertscore_scores = {'en': {}, 'it': {}, 'ge': {}, 'ru': {}}

for lang in models_generations:
    for model in models_generations[lang]:
        model_bertscore_scores = []
        print(f'Language: {lang}, Model: {model}')
        
        gens = sorted(models_generations[lang][model])  # Ensure order (gen0, gen1, gen2)
        for gen in tqdm(gens, desc=f'Processing {model} ({lang})'):
            references = models_generations[lang][model][gen]['actual']
            predictions = models_generations[lang][model][gen]['prediction']

            results = bertscore.compute(predictions=predictions, references=references, lang=lang)
            model_bertscore_scores.append(results['f1'])  # Store bertscore for each generation
        
        # Store the three bertscore scores instead of their mean
        bertscore_scores[lang][model] = model_bertscore_scores
        print(f'bertscore scores for {model} ({lang}): {model_bertscore_scores} - Average: {np.mean(model_bertscore_scores)}')
        print()


Language: en, Model: Llama-3.1-8B-Instruct-en


Processing Llama-3.1-8B-Instruct-en (en): 100%|██████████| 3/3 [16:00<00:00, 320.10s/it]


bertscore scores for Llama-3.1-8B-Instruct-en (en): [[1.0000001192092896, 0.9871123433113098, 0.9489569664001465, 0.949309229850769, 0.9049603939056396, 0.976327121257782, 0.9657802581787109, 0.9675465226173401, 0.9999999403953552, 0.9780132174491882, 0.9613801836967468, 0.9532809853553772, 1.0, 0.988953173160553, 0.9921119213104248, 0.9999999403953552, 0.990121066570282, 1.0, 0.9850550293922424, 0.9911262392997742, 0.9662472605705261, 0.9776471257209778, 0.9608689546585083, 1.0, 0.9726154208183289, 0.9822422862052917, 1.0000001192092896, 0.9539879560470581, 0.9713131189346313, 0.9198744893074036, 0.9528504610061646, 0.96565181016922, 0.983439028263092, 0.9717096090316772, 0.9779887795448303, 0.9999998807907104, 0.977752685546875, 0.934906542301178, 1.0, 0.9887784719467163, 0.9535112380981445, 0.9738200306892395, 0.933668851852417, 0.961065948009491, 0.9879341721534729, 0.9722996950149536, 0.9699904918670654, 1.0, 0.9999999403953552, 0.985289454460144, 1.0, 0.9921659231185913, 1.000000

Processing Mistral-Nemo-Instruct-2407-en (en): 100%|██████████| 3/3 [1:17:27<00:00, 1549.17s/it]


bertscore scores for Mistral-Nemo-Instruct-2407-en (en): [[1.0000001192092896, 0.9871123433113098, 0.9489569664001465, 0.966205358505249, 0.9112358689308167, 0.986506462097168, 0.937029242515564, 0.9354651570320129, 0.9999999403953552, 0.9780132174491882, 0.9569023251533508, 0.9733424186706543, 1.0, 0.9536904096603394, 0.9921119213104248, 0.9999999403953552, 0.990121066570282, 1.0, 0.9315239191055298, 0.9999998807907104, 0.9567088484764099, 0.9784883856773376, 0.9624418020248413, 0.9789490103721619, 0.9601083397865295, 0.9822422862052917, 1.0000001192092896, 0.9624094367027283, 0.9777973294258118, 0.9198744893074036, 0.9397350549697876, 0.9613179564476013, 0.9803296327590942, 0.9563753008842468, 0.9779887795448303, 0.9765360355377197, 0.9801866412162781, 0.9705907106399536, 0.9921748042106628, 0.9814532399177551, 0.9527116417884827, 0.9738200306892395, 0.9256959557533264, 0.9631209969520569, 0.9745307564735413, 0.9674184322357178, 0.9699904918670654, 0.9733095765113831, 0.9999999403953

Processing Qwen2.5-7B-Instruct-en (en): 100%|██████████| 3/3 [16:03<00:00, 321.22s/it]


bertscore scores for Qwen2.5-7B-Instruct-en (en): [[1.0000001192092896, 0.9871123433113098, 0.972503662109375, 0.9549265503883362, 0.8998623490333557, 0.9539329409599304, 0.9320996403694153, 0.9356433749198914, 0.9999999403953552, 0.9780132174491882, 0.9720425009727478, 0.9621866941452026, 1.0, 0.9793853163719177, 0.9921119213104248, 0.9674044251441956, 0.9893003702163696, 0.968032717704773, 0.9700347781181335, 0.9911262392997742, 0.9560850262641907, 0.9737284779548645, 0.9609466195106506, 0.9789490103721619, 0.9582708477973938, 0.9999999403953552, 1.0, 0.9624094367027283, 0.9694182276725769, 0.9351182579994202, 1.0, 0.9690403938293457, 0.9816384315490723, 0.9510841965675354, 0.965675413608551, 0.9769505262374878, 0.9453490376472473, 0.9870471358299255, 0.9955793023109436, 0.9754318594932556, 0.9657133221626282, 0.9777485132217407, 0.9434863328933716, 0.948113203048706, 0.9879341721534729, 0.9743580222129822, 0.9699904918670654, 1.0, 0.9999999403953552, 0.985289454460144, 1.0, 0.989405

Processing Llama-3.1-8B-Instruct-it (it): 100%|██████████| 3/3 [05:02<00:00, 100.67s/it]


bertscore scores for Llama-3.1-8B-Instruct-it (it): [[1.0, 1.0, 0.877234935760498, 0.8897830843925476, 0.8552973866462708, 0.9143869876861572, 0.9199389219284058, 0.9037865996360779, 1.0, 0.9793652892112732, 0.8738012909889221, 0.8644979000091553, 1.0, 0.9528720378875732, 1.0000001192092896, 1.0, 0.9999999403953552, 1.0, 0.9451808333396912, 1.0, 0.9331319332122803, 0.8695903420448303, 0.9433524012565613, 0.9887174963951111, 0.960894763469696, 0.9999999403953552, 0.9999999403953552, 0.8389389514923096, 0.9009085297584534, 0.8063920140266418, 0.990601658821106, 0.9229763150215149, 0.9439373016357422, 0.9050946235656738, 1.0, 1.0, 0.9351850152015686, 0.9627248048782349, 0.9849638938903809, 0.9740467071533203, 0.9269565939903259, 0.9036545157432556, 0.8229038715362549, 0.8468814492225647, 0.984818696975708, 0.9786685705184937, 0.9046666026115417, 1.0, 0.9685795307159424, 1.0, 0.9999999403953552, 0.9999999403953552, 1.0, 0.9781917929649353, 0.87298983335495, 0.8240690231323242, 1.0, 0.96795

Processing Mistral-Nemo-Instruct-2407-it (it): 100%|██████████| 3/3 [04:51<00:00, 97.04s/it]


bertscore scores for Mistral-Nemo-Instruct-2407-it (it): [[1.0, 1.0, 0.8741648197174072, 0.9070236086845398, 0.8422218561172485, 0.9343938231468201, 0.9344915747642517, 0.9050819873809814, 0.9999999403953552, 0.98087078332901, 0.8540971875190735, 0.9144261479377747, 1.0, 1.0, 1.0000001192092896, 1.0, 0.9999999403953552, 1.0, 0.895247220993042, 0.9948587417602539, 0.9473604559898376, 0.9719410538673401, 0.8729146718978882, 0.9999998807907104, 0.9392833113670349, 0.9999999403953552, 0.9999999403953552, 0.8614650368690491, 0.8613175749778748, 0.8063920140266418, 1.0, 0.9326342344284058, 0.963715672492981, 0.8885130286216736, 0.9612290263175964, 1.0, 0.9659641981124878, 0.9321322441101074, 1.0, 0.966069221496582, 0.940201461315155, 0.9130942225456238, 0.8261061310768127, 0.8407790660858154, 0.984818696975708, 0.9565443992614746, 1.0, 0.8767295479774475, 0.9685795307159424, 0.9600616097450256, 0.9999999403953552, 0.9527283906936646, 1.0, 0.9781917929649353, 0.9296999573707581, 0.89079976081

Processing Qwen2.5-7B-Instruct-it (it): 100%|██████████| 3/3 [04:53<00:00, 97.89s/it]


bertscore scores for Qwen2.5-7B-Instruct-it (it): [[0.8969510197639465, 0.9684614539146423, 0.8959146738052368, 0.8490006923675537, 0.8180546164512634, 0.9389036297798157, 0.9485598802566528, 0.9283688068389893, 1.0, 0.98087078332901, 0.8640781044960022, 0.872469961643219, 1.0, 0.9893158078193665, 1.0000001192092896, 0.9569677710533142, 0.8531904220581055, 1.0, 0.8564832210540771, 1.0, 0.9239198565483093, 0.9615366458892822, 0.9485995173454285, 0.9783062934875488, 0.954024076461792, 0.950196385383606, 0.9935871362686157, 0.8974255323410034, 0.8799638152122498, 0.8063920140266418, 0.9661005139350891, 0.9242497086524963, 0.9445816874504089, 0.8956776857376099, 0.9603971242904663, 0.9076198935508728, 0.9559271931648254, 0.8702625632286072, 0.9831711649894714, 0.9599068760871887, 0.940201461315155, 0.9036545157432556, 0.8104299902915955, 0.8539888858795166, 1.0, 0.9731451869010925, 0.943945050239563, 1.0, 0.9309206008911133, 0.9553375840187073, 0.9999999403953552, 0.9431496858596802, 1.0, 

Processing Llama-3.1-8B-Instruct-ru (ru): 100%|██████████| 3/3 [06:32<00:00, 130.92s/it]


bertscore scores for Llama-3.1-8B-Instruct-ru (ru): [[1.0, 1.0, 0.9222251772880554, 0.9056653380393982, 0.796830952167511, 0.9166111350059509, 0.8818433880805969, 0.7983737587928772, 1.0, 0.9667124152183533, 0.8573322892189026, 0.9903314113616943, 1.0, 0.9003586769104004, 1.0000001192092896, 1.0, 0.9085084795951843, 0.9369510412216187, 0.8570681810379028, 1.0, 0.9160333871841431, 0.9430478811264038, 0.9194055199623108, 1.0000001192092896, 0.9109877347946167, 0.9650757312774658, 1.0, 0.8406030535697937, 0.8746486306190491, 0.8613681197166443, 0.96954745054245, 0.9130955338478088, 0.9033578634262085, 0.9105108380317688, 0.9282261729240417, 0.9148672223091125, 0.9173343777656555, 0.8494848012924194, 1.0, 0.9677373766899109, 0.8578542470932007, 0.9336409568786621, 0.8330811262130737, 0.8631961941719055, 0.9707268476486206, 0.9347119331359863, 0.9036728143692017, 1.0, 1.0, 0.9611036777496338, 1.0, 0.9967030882835388, 0.9387104511260986, 0.8541139364242554, 0.9287853837013245, 0.862918436527

Processing Mistral-Nemo-Instruct-2407-ru (ru): 100%|██████████| 3/3 [09:03<00:00, 181.12s/it]


bertscore scores for Mistral-Nemo-Instruct-2407-ru (ru): [[0.9856544137001038, 1.0, 0.8979520201683044, 0.9284616112709045, 0.8658062219619751, 0.8939957022666931, 0.8871180415153503, 0.8282546997070312, 1.0, 0.9552603960037231, 0.9127666354179382, 0.9376690983772278, 1.0, 0.8658220171928406, 1.0000001192092896, 0.9290857315063477, 0.9698155522346497, 0.8721498847007751, 0.8671272397041321, 1.0, 0.9159930348396301, 0.926899254322052, 0.8390597701072693, 0.9738274216651917, 0.9021127820014954, 0.8735650777816772, 1.0, 0.7880542278289795, 0.8925028443336487, 0.9122926592826843, 0.96954745054245, 0.9128056764602661, 0.841288685798645, 0.9015567302703857, 0.9282261729240417, 0.9474279284477234, 0.894352376461029, 0.8739290237426758, 1.0, 0.9677373766899109, 0.8465543985366821, 0.9656915068626404, 0.8468296527862549, 0.8685593008995056, 0.9833618998527527, 0.9488506317138672, 0.9147276282310486, 0.9615138173103333, 0.993110716342926, 0.9639475345611572, 1.0, 0.9621880054473877, 1.0000001192

Processing Qwen2.5-7B-Instruct-ru (ru): 100%|██████████| 3/3 [17:13<00:00, 344.38s/it]


bertscore scores for Qwen2.5-7B-Instruct-ru (ru): [[0.9092591404914856, 1.0, 0.8417307138442993, 0.8894250988960266, 0.835926353931427, 0.9416521191596985, 0.9219732284545898, 0.8243953585624695, 1.0, 0.9586988091468811, 0.8896303772926331, 0.8966179490089417, 1.0, 0.8443834185600281, 1.0000001192092896, 0.9029198288917542, 0.8302111029624939, 0.8580486178398132, 0.8318787813186646, 1.0, 0.876952588558197, 0.8422152400016785, 0.9293829202651978, 0.9738274216651917, 0.9106355309486389, 0.8735650777816772, 1.0, 0.8173012137413025, 0.8875362277030945, 0.7894052863121033, 0.96954745054245, 0.9260250329971313, 0.9078205823898315, 0.9287090301513672, 0.9242652058601379, 0.8883264660835266, 0.9350778460502625, 0.9662245512008667, 0.9314563274383545, 0.953459620475769, 0.855683445930481, 0.9656915068626404, 0.8711008429527283, 0.8541305065155029, 0.9149023294448853, 0.9370373487472534, 0.9147276282310486, 0.8402191400527954, 0.993110716342926, 0.9639475345611572, 1.0, 0.9157904386520386, 1.000

Processing Llama-3.1-8B-Instruct-ge (ge): 100%|██████████| 3/3 [04:28<00:00, 89.41s/it] 


bertscore scores for Llama-3.1-8B-Instruct-ge (ge): [[1.0, 0.9810638427734375, 0.8608179092407227, 0.8769885301589966, 0.7498611807823181, 0.881640613079071, 0.9337469339370728, 0.8588432669639587, 0.9036869406700134, 1.0, 0.865149974822998, 0.9839729070663452, 0.9905035495758057, 0.9868208169937134, 0.9951050281524658, 0.835066556930542, 0.9100574851036072, 1.0000001192092896, 0.9335072040557861, 0.9690958857536316, 0.9335634112358093, 0.9531164765357971, 0.8416513204574585, 0.9548292756080627, 0.9531960487365723, 1.0, 1.0, 0.7739312052726746, 0.8789063692092896, 0.8559494614601135, 0.896772563457489, 0.93929523229599, 0.9566065669059753, 0.9007622003555298, 0.8823297023773193, 1.0, 0.9207784533500671, 0.933404803276062, 0.9999998807907104, 0.9890676736831665, 0.8244577050209045, 0.9602530002593994, 0.8000123500823975, 0.8258044123649597, 0.9689891338348389, 0.9480341076850891, 0.9564406871795654, 0.954404890537262, 1.000000238418579, 0.9456437826156616, 1.0, 0.9322201013565063, 0.999

Processing Mistral-Nemo-Instruct-2407-ge (ge): 100%|██████████| 3/3 [03:38<00:00, 72.84s/it]


bertscore scores for Mistral-Nemo-Instruct-2407-ge (ge): [[1.0, 0.9810638427734375, 0.8443567156791687, 0.8769885301589966, 0.8128446936607361, 0.8821711540222168, 0.9386039972305298, 0.8408973217010498, 0.8987153768539429, 0.9999999403953552, 0.8745347857475281, 0.9215226769447327, 0.9345927238464355, 0.9284707307815552, 0.9951050281524658, 0.835066556930542, 0.9100574851036072, 0.921557605266571, 0.9393867254257202, 1.0000001192092896, 0.8869792819023132, 0.9615763425827026, 0.8493564128875732, 0.9761090278625488, 0.9057002067565918, 0.928288459777832, 1.0, 0.8465237617492676, 0.8439702987670898, 0.8343063592910767, 0.896772563457489, 0.92987459897995, 0.9611860513687134, 0.896506667137146, 0.9112072587013245, 0.9291199445724487, 0.9211730360984802, 0.9058517217636108, 0.9999998807907104, 0.9961762428283691, 0.8564631342887878, 0.9379209876060486, 0.8249948620796204, 0.8711720705032349, 0.9208897352218628, 0.86881422996521, 0.9419012665748596, 0.8448786735534668, 1.000000238418579, 0

Processing Qwen2.5-7B-Instruct-ge (ge): 100%|██████████| 3/3 [03:29<00:00, 69.78s/it]

bertscore scores for Qwen2.5-7B-Instruct-ge (ge): [[1.0, 1.0, 0.8160256743431091, 0.8775259256362915, 0.8085162043571472, 0.9146373867988586, 0.8860414028167725, 0.8231433033943176, 0.9036869406700134, 0.9230839014053345, 0.8658782839775085, 0.9257379770278931, 0.989852786064148, 0.9264320731163025, 0.9198285341262817, 0.8348734974861145, 0.8698859214782715, 0.9064706563949585, 0.8399171829223633, 1.0000001192092896, 0.8641795516014099, 0.9064930081367493, 0.8491594195365906, 0.9055044651031494, 0.8735807538032532, 1.0, 1.0, 0.8262041807174683, 0.8497624397277832, 0.8343063592910767, 0.9322935938835144, 0.9466245770454407, 0.944487452507019, 0.8847506046295166, 0.9342438578605652, 0.9612171053886414, 0.925819993019104, 0.8447194695472717, 0.9999998807907104, 0.9927763342857361, 0.8489528298377991, 0.9589816927909851, 0.8308433294296265, 0.8068790435791016, 0.9689891338348389, 0.8834384679794312, 0.9419012665748596, 0.8411844968795776, 1.000000238418579, 0.9426584243774414, 0.8870362639




In [None]:
import pickle

with open('bertscore_scores-exp2.pkl', 'wb') as f:
    pickle.dump(bertscore_scores, f)