In [1]:
#utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm

#bleu 
from evaluate import load
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

#ter
from torchmetrics.text import TranslationEditRate

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
datasets = {}

files = open_files('generations')
for file in files:
    dataset = pd.read_csv('generations/' + file)
    datasets[file] = dataset

## Valutazione automatica

### Bleu

In [4]:
sacrebleu = load("sacrebleu")

def blue_evaluation(df):
    blue_scores = []
    
    for i in range(len(df)):
        reference = df['actuals'][i].lower()
        candidate = df['predictions'][i].lower()

        blue_score = sacrebleu.compute(predictions=[candidate], references=[reference])
        blue_scores.append(float(blue_score['score']))
        
    return blue_scores

In [5]:
print('BLEU SCORES: \n')

for dataset_name in datasets:
    print(f'- {dataset_name}:')

    datasets[dataset_name]['bleu_score'] = blue_evaluation(datasets[dataset_name])
    print(f'    {datasets[dataset_name]['bleu_score'].mean()}')
    print('\n')

BLEU SCORES: 

- llama2_7b_ge_2epoch_decoding.csv:
    35.8782874601763


- llama2_7b_it_2epoch_decoding.csv:
    45.092876173069776


- meta-llama2_7b_en_2epoch_decoding.csv:
    43.051376982304724


- meta-llama2_7b_ru_2epoch_decoding.csv:
    28.55205813777609


- t5-large_en_2epoch_decoding.csv:
    42.964119405688045


- t5-large_ge_2epoch_decoding.csv:
    35.55978151377656


- t5-large_it_2epoch_decoding.csv:
    42.66809957085583


- t5-large_ru_2epoch_decoding.csv:
    9.021369548545128




### BERTScore

In [6]:
bertscore = load("bertscore")

def bertscore_evaluation(actual, prediction, lang):
    predictions = []
    references = []

    for i in range(len(actual)):
        predictions.append(prediction.iloc[i].lower())
        references.append(actual.iloc[i].lower())

    bertscore_scores = bertscore.compute(predictions=predictions, references=references, lang=lang)

    return bertscore_scores

In [7]:
print('BERTSCORE SCORES: \n')

for dataset_name in datasets:
    #check if dataset contain substring en
    if 'en' in dataset_name:
        lang = 'en'
    elif 'it' in dataset_name:
        lang = 'it'
    elif 'ge' in dataset_name:
        lang = 'de'
    else:
        lang = 'ru'

    print(f'- {dataset_name} ({lang}):')

    score = bertscore_evaluation(datasets[dataset_name]['actuals'], datasets[dataset_name]['predictions'], lang)
    datasets[dataset_name]['bertscore_f1'] = score['f1']
    datasets[dataset_name]['bertscore_precision'] = score['precision']
    datasets[dataset_name]['bertscore_recall'] = score['recall']
    print(score)
    
    print(f'-F1: ', np.mean(score['f1']))
    print(f'-P: ', np.mean(score['precision']))
    print(f'-R: ', np.mean(score['recall']))
    print('\n')



BERTSCORE SCORES: 

- llama2_7b_en_2epoch_decoding.csv (en):


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.9729059934616089, 0.9751385450363159, 0.9327673316001892, 0.9312354922294617, 0.959779679775238, 0.9923622608184814, 0.9498351812362671, 0.9212179183959961, 0.9235183596611023, 0.9302449226379395, 0.9678229093551636, 0.9405224323272705, 0.9601513743400574, 0.9131309986114502, 0.9881763458251953, 0.97807776927948, 0.9811967015266418, 0.9655203819274902, 0.9769431948661804, 0.9606181383132935, 0.9555200338363647, 0.8917943239212036, 0.915739893913269, 0.9903479814529419, 0.9729192852973938, 0.9644076228141785, 0.9667905569076538, 0.9604920148849487, 0.9340745210647583, 0.929125189781189, 0.9121882915496826, 0.9252443909645081, 0.973579466342926, 1.0, 0.87945157289505, 0.8915907144546509, 1.0, 0.9652556777000427, 0.8970138430595398, 0.931464672088623, 0.945489227771759, 0.9045913815498352, 0.9241700768470764, 0.9818558692932129, 0.9921329617500305, 0.9856300950050354, 0.9913486242294312, 0.9443972706794739, 0.9491870999336243, 0.9264247417449951, 0.9467135071754456, 0.967



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

{'precision': [0.8782235980033875, 0.8706378936767578, 0.8567282557487488, 0.8345656991004944, 0.9222155809402466, 0.8987988233566284, 0.8247069120407104, 0.7383654713630676, 0.8715047836303711, 0.8720943927764893, 0.9275824427604675, 0.8231143951416016, 1.0000001192092896, 0.9030656218528748, 1.0000001192092896, 0.9686693549156189, 0.8410263061523438, 0.9333924651145935, 0.9168727397918701, 0.8394834995269775, 0.8641015291213989, 0.7648269534111023, 0.7984208464622498, 0.8843298554420471, 0.8133538365364075, 0.9450134038925171, 0.8237912654876709, 0.8869729042053223, 0.9073262214660645, 0.9573355317115784, 0.8077120184898376, 0.8957401514053345, 0.9124603271484375, 0.9473829865455627, 0.8424268364906311, 0.7671169638633728, 1.0000001192092896, 0.8975526094436646, 0.794845700263977, 0.8601500988006592, 0.8489413261413574, 0.7850196957588196, 0.856897234916687, 0.9249387383460999, 0.9282903075218201, 0.859119176864624, 0.9182567000389099, 0.9417770504951477, 0.7746158838272095, 0.813312

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.953035295009613, 0.9751385450363159, 0.9327673316001892, 0.9362486600875854, 0.9570546746253967, 0.9923622608184814, 0.9433082938194275, 0.9433962106704712, 0.9218342304229736, 0.9088730216026306, 0.9678229093551636, 0.9339120388031006, 0.9621224403381348, 0.9075775742530823, 0.9912322759628296, 0.97807776927948, 0.9675539135932922, 0.9679776430130005, 0.9857344031333923, 0.9606181383132935, 0.9606384634971619, 0.9392310380935669, 0.9182304739952087, 0.9903479814529419, 0.8959831595420837, 0.9765452146530151, 0.9732834696769714, 0.9604920148849487, 0.9351090788841248, 0.9478276371955872, 0.9060231447219849, 0.9354431629180908, 0.9936785697937012, 0.9876233339309692, 0.9051495790481567, 0.9118991494178772, 1.0, 0.9652556777000427, 0.9126935005187988, 0.9544192552566528, 0.9507805109024048, 0.9213085770606995, 0.9497047662734985, 0.9772617816925049, 0.9921329617500305, 0.9787706136703491, 0.9913486242294312, 0.9443973898887634, 0.9317074418067932, 0.9307563304901123, 0.9



{'precision': [0.8552197217941284, 0.8057533502578735, 0.8686326146125793, 0.848466157913208, 0.9639512300491333, 0.8112519383430481, 0.850721001625061, 0.7317187190055847, 0.8706487417221069, 0.8191921710968018, 0.9275824427604675, 0.8032580614089966, 1.0, 0.8557270169258118, 1.0000001192092896, 0.7969148755073547, 0.8089537620544434, 0.9178246259689331, 0.7985261678695679, 0.834928035736084, 0.8264189958572388, 0.7660163640975952, 0.7967055439949036, 0.9141320586204529, 0.8133538365364075, 0.8969235420227051, 0.8373432159423828, 0.877509593963623, 0.8500226736068726, 0.9462310671806335, 0.8637629747390747, 0.8623035550117493, 0.9601131677627563, 1.0, 0.8657375574111938, 0.7671607732772827, 1.0000001192092896, 0.8975526094436646, 0.8253505229949951, 0.8675915598869324, 0.8430807590484619, 0.7521789073944092, 0.8769962787628174, 0.8441171646118164, 0.9400598406791687, 0.8359234929084778, 0.9043546915054321, 0.9240821599960327, 0.8002580404281616, 0.8133124113082886, 0.9111802577972412,

### TER

In [8]:
ter = TranslationEditRate()

def ter_evaluation(actual, prediction):
    ter_scores = []

    for i in range(len(actual)):
        reference = actual.iloc[i].lower()
        candidate = prediction.iloc[i].lower()

        ter_score = ter(candidate, [reference])
        ter_scores.append(ter_score)

    return ter_scores

In [9]:
print('TER SCORES: \n')

for dataset_name in datasets:
    print(f'- {dataset_name}:')
    datasets[dataset_name]['ter'] = ter_evaluation(datasets[dataset_name]['actuals'], datasets[dataset_name]['predictions'])
    print(f'    {datasets[dataset_name]['ter'].mean()}')
    print('\n')

TER SCORES: 

- llama2_7b_en_2epoch_decoding.csv:
    0.5167704467773437


- llama2_7b_ge_2epoch_decoding.csv:
    0.591541748046875


- llama2_7b_it_2epoch_decoding.csv:
    0.5171953735351562


- llama2_7b_ru_2epoch_decoding.csv:
    0.7748748168945313


- t5-large_en_2epoch_decoding.csv:
    0.5067647094726563


- t5-large_ge_2epoch_decoding.csv:
    0.5874913330078125


- t5-large_it_2epoch_decoding.csv:
    0.5387223510742187


- t5-large_ru_2epoch_decoding.csv:
    0.9479175415039063




### CHRF

In [10]:
chrf = load("chrf")

def chrf_evaluation(actual, prediction):
    chrf_scores = []

    # Utilizza tqdm per monitorare lo stato di avanzamento
    for i in tqdm(range(len(actual)), desc="Calcolo CHRF"):
        reference = actual.iloc[i].lower()
        candidate = prediction.iloc[i].lower()
        chrf_score = chrf.compute(predictions=[candidate], references=[reference])['score']

        chrf_scores.append(chrf_score)

    return chrf_scores

In [11]:
print('CHRF SCORES: \n')

for dataset_name in datasets:
    print(f'- {dataset_name}:')
    datasets[dataset_name]['chrf'] = chrf_evaluation(datasets[dataset_name]['actuals'], datasets[dataset_name]['predictions'])
    print(f'    {datasets[dataset_name]['chrf'].mean()}')
    print('\n')

CHRF SCORES: 

- llama2_7b_en_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:06<00:00, 152.03it/s]


    72.54250721088415


- llama2_7b_ge_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:05<00:00, 173.47it/s]


    65.87369713633603


- llama2_7b_it_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:05<00:00, 168.21it/s]


    72.69516525831362


- llama2_7b_ru_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:05<00:00, 189.66it/s]


    56.13009186763659


- t5-large_en_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:05<00:00, 177.32it/s]


    72.33452508663039


- t5-large_ge_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:06<00:00, 146.52it/s]


    65.83374677162277


- t5-large_it_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:10<00:00, 97.78it/s]


    71.43777279684457


- t5-large_ru_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:07<00:00, 140.16it/s]


    31.332860613706167




In [12]:
datasets['llama2_7b_en_2epoch_decoding.csv']

Unnamed: 0,triples,predictions,actuals,bleu_score,bertscore_f1,bertscore_precision,bertscore_recall,ter,chrf
0,Peter_Stöger club FC_Admira_Wacker_Mödling,Peter Stöger is in the FC Admira Wacker Mödlin...,Peter Stöger's club is the FC Admira Wacker Mö...,40.637983,0.973361,0.972906,0.973816,tensor(0.5556),77.474297
1,Arem-arem ingredient Banana_leaf,Banana leaf is an ingredient in Arem-arem.,Banana leaf is an ingredient in Arem arem.,62.401954,0.979676,0.975139,0.984257,tensor(0.2500),91.446465
2,Weymouth_Sands followedBy Maiden_Castle_(novel...,A Glastonbury Romance was followed by Weymouth...,'A Glastonbury Romance' was followed by 'Weymo...,9.442399,0.915396,0.932767,0.898660,tensor(0.7333),61.371397
3,Christian_Panucci club Genoa_C.F.C. Christian_...,Christian Panucci is a member of the Italy nat...,A.S. Livorno Calcio are managed by Christian P...,46.227871,0.935675,0.931235,0.940158,tensor(0.9000),69.648347
4,Chicharrón region Andalusia,Chicharrón is a dish from Andalusia.,Chicharrón is found in the Andalusia region.,13.540372,0.956181,0.959780,0.952610,tensor(0.7143),50.191925
...,...,...,...,...,...,...,...,...,...
995,300_North_LaSalle floorCount 60,300 North LaSalle has 60 floors.,There are 60 floors at 300 North LaSalle.,24.177237,0.939056,0.949612,0.928732,tensor(0.7500),60.882521
996,United_States leader Joe_Biden,Joe Biden is the leader of the United States.,Joe Biden is the United States leader.,39.281465,0.966363,0.963028,0.969721,tensor(0.7143),83.625011
997,United_States leaderTitle President_of_the_Uni...,The leader of the United States is called the ...,The leader of the United States is known as th...,74.870525,0.980670,0.982693,0.978656,tensor(0.1333),85.242958
998,Chicago country United_States Chicago isPartOf...,"300 North LaSalle is located in Chicago, DuPag...","300 North LaSalle is located in Chicago, Dupag...",75.763744,0.983055,0.982672,0.983438,tensor(0.1818),92.712601
