In [1]:
#utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm

#bleu 
from evaluate import load
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

#ter
from torchmetrics.text import TranslationEditRate

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
datasets = {}

files = open_files('generations')
for file in files:
    dataset = pd.read_csv('generations/' + file)
    datasets[file] = dataset

In [4]:
files

['meta-llama2_7b_en_2epoch_decoding.csv',
 'meta-llama2_7b_it_2epoch_decoding.csv',
 't5-large_en_2epoch_decoding.csv',
 't5-large_it_2epoch_decoding.csv']

## Valutazione automatica

### Bleu

In [5]:
sacrebleu = load("sacrebleu")

def blue_evaluation(df):
    blue_scores = []
    
    for i in range(len(df)):
        reference = df['actuals'][i].lower()
        candidate = df['predictions'][i].lower()

        blue_score = sacrebleu.compute(predictions=[candidate], references=[reference])
        blue_scores.append(float(blue_score['score']))
        
    return blue_scores

In [6]:
print('BLEU SCORES: \n')

for dataset_name in datasets:
    print(f'- {dataset_name}:')

    datasets[dataset_name]['bleu_score'] = blue_evaluation(datasets[dataset_name])
    print(f'    {datasets[dataset_name]['bleu_score'].mean()}')
    print('\n')

BLEU SCORES: 

- meta-llama2_7b_en_2epoch_decoding.csv:
    33.89323112544573


- meta-llama2_7b_it_2epoch_decoding.csv:
    31.891374150401013


- t5-large_en_2epoch_decoding.csv:
    32.70961019679847


- t5-large_it_2epoch_decoding.csv:
    29.922332667730085




### BERTScore

In [7]:
bertscore = load("bertscore")

def bertscore_evaluation(actual, prediction, lang):
    predictions = []
    references = []

    for i in range(len(actual)):
        predictions.append(prediction.iloc[i].lower())
        references.append(actual.iloc[i].lower())

    bertscore_scores = bertscore.compute(predictions=predictions, references=references, lang=lang)

    return bertscore_scores

In [8]:
print('BERTSCORE SCORES: \n')

for dataset_name in datasets:
    #check if dataset contain substring en
    if 'en' in dataset_name:
        lang = 'en'
    else:
        lang = 'it'

    print(f'- {dataset_name} ({lang}):')

    score = bertscore_evaluation(datasets[dataset_name]['actuals'], datasets[dataset_name]['predictions'], lang)
    datasets[dataset_name]['bertscore_f1'] = score['f1']
    datasets[dataset_name]['bertscore_precision'] = score['precision']
    datasets[dataset_name]['bertscore_recall'] = score['recall']
    #print(score)
    
    print(f'-F1: ', np.mean(score['f1']))
    print(f'-P: ', np.mean(score['precision']))
    print(f'-R: ', np.mean(score['recall']))
    print('\n')



BERTSCORE SCORES: 

- meta-llama2_7b_en_2epoch_decoding.csv (en):


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


-F1:  0.9430238373875618
-P:  0.9452727970480919
-R:  0.9408732368350029


- meta-llama2_7b_it_2epoch_decoding.csv (it):




-F1:  0.8600391951203347
-P:  0.8634315873980523
-R:  0.857119910299778


- t5-large_en_2epoch_decoding.csv (en):


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


-F1:  0.9407918038964271
-P:  0.9440240755677223
-R:  0.9376756454706192


- t5-large_it_2epoch_decoding.csv (it):




-F1:  0.8564814774990082
-P:  0.8602060106992722
-R:  0.8532467696666718




### TER

In [9]:
ter = TranslationEditRate()

def ter_evaluation(actual, prediction):
    ter_scores = []

    for i in range(len(actual)):
        reference = actual.iloc[i].lower()
        candidate = prediction.iloc[i].lower()

        ter_score = ter(candidate, [reference])
        ter_scores.append(ter_score)

    return ter_scores

In [10]:
print('TER SCORES: \n')

for dataset_name in datasets:
    print(f'- {dataset_name}:')
    datasets[dataset_name]['ter'] = ter_evaluation(datasets[dataset_name]['actuals'], datasets[dataset_name]['predictions'])
    print(f'    {datasets[dataset_name]['ter'].mean()}')
    print('\n')

TER SCORES: 

- meta-llama2_7b_en_2epoch_decoding.csv:
    0.5982618408203125


- meta-llama2_7b_it_2epoch_decoding.csv:
    0.696801513671875


- t5-large_en_2epoch_decoding.csv:
    0.6065496215820313


- t5-large_it_2epoch_decoding.csv:
    0.6736116943359375




### CHRF

In [11]:
chrf = load("chrf")

def chrf_evaluation(actual, prediction):
    chrf_scores = []

    # Utilizza tqdm per monitorare lo stato di avanzamento
    for i in tqdm(range(len(actual)), desc="Calcolo CHRF"):
        reference = actual.iloc[i].lower()
        candidate = prediction.iloc[i].lower()
        chrf_score = chrf.compute(predictions=[candidate], references=[reference])['score']

        chrf_scores.append(chrf_score)

    return chrf_scores

In [12]:
print('CHRF SCORES: \n')

for dataset_name in datasets:
    print(f'- {dataset_name}:')
    datasets[dataset_name]['chrf'] = chrf_evaluation(datasets[dataset_name]['actuals'], datasets[dataset_name]['predictions'])
    print(f'    {datasets[dataset_name]['chrf'].mean()}')
    print('\n')

CHRF SCORES: 

- meta-llama2_7b_en_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:10<00:00, 97.22it/s]


    65.7996098209086


- meta-llama2_7b_it_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:10<00:00, 91.76it/s] 


    62.61171393131402


- t5-large_en_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:09<00:00, 105.84it/s]


    64.57947403011968


- t5-large_it_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:10<00:00, 98.68it/s]


    61.121502896977965




In [31]:
datasets['llama2_7b_en_2epoch_decoding.csv']

Unnamed: 0,triples,predictions,actuals,bleu_score,bertscore_f1,bertscore_precision,bertscore_recall,ter,chrf
0,11th_Mississippi_Infantry_Monument category Co...,The 11th Mississippi Infantry Monument is loca...,"A monument to the 11th Mississippi Infantry, w...",31.735182,0.941381,0.942412,0.940351,tensor(0.6571),71.089044
1,Bananaman broadcastedBy BBC BBC city Broadcast...,"Bananaman, starring Bill Oddie, was broadcast ...",Bill Oddie stars in a BBC programme called Ban...,7.363124,0.895899,0.896028,0.895769,tensor(0.8889),45.632178
2,Karlsruhe postalCode 76131–76229,The postal codes for Karlsruhe are 76131–76229.,The postal codes of Karlsruhe are 76131–76229.,50.000000,0.993597,0.993597,0.993597,tensor(0.1429),88.130745
3,Turkish_people religion Irreligion,The Turkish people are Irreligious.,Some Turkish people are irreligious.,75.983569,0.998647,0.998647,0.998647,tensor(0.2000),90.411086
4,ENAIRE city Madrid Adolfo_Suárez_Madrid–Baraja...,Adolfo Suarez Madrid-Barajas Airport is locate...,The Adolfo Suarez Madrid-Barajas airport is op...,37.514537,0.953546,0.958903,0.948248,tensor(0.5484),69.168244
...,...,...,...,...,...,...,...,...,...
995,"ALCO_RS-3 buildDate ""May 1950 - August 1956"" A...",The ALCO RS-3 was built between May 1950 and A...,The ALCO RS-3 produced between May 1950 and Au...,51.795975,0.975178,0.972941,0.977425,tensor(0.2632),76.764512
996,Super_Capers budget 2000000.0 Super_Capers dir...,"Super Capers, starring Justin Whalin, was dist...",Super Capers is a 98 minute film that was dist...,35.282778,0.941008,0.943822,0.938210,tensor(0.5882),68.502208
997,It's_Great_to_Be_Young_(1956_film) editing Max...,"The film ""It's Great to Be Young"" (1956) was e...",It's Great to Be Young was edited by Max Bened...,46.606087,0.906294,0.893222,0.919754,tensor(0.5000),86.557185
998,Lady_Anne_Monson deathDate 1776-02-18 Lady_Ann...,Lady Anne Monson was a national of the Kingdom...,Lady Anne Monson married George Monson was a B...,13.330451,0.944976,0.944121,0.945832,tensor(0.7600),54.476607
