In [1]:
#utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm

#bleu 
from evaluate import load
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

#ter
from torchmetrics.text import TranslationEditRate

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
datasets = {}

files = open_files('generations')
for file in files:
    dataset = pd.read_csv('generations/' + file)
    datasets[file] = dataset

In [4]:
files

['it5-large_it_2epoch_decoding.csv',
 'llamantino2_7b_it_2epoch_decoding.csv',
 'minerva_3B_it_2epoch_decoding.csv',
 't5-large_it_2epoch_decoding.csv']

## Valutazione automatica

### Bleu

In [5]:
sacrebleu = load("sacrebleu")

def blue_evaluation(df):
    blue_scores = []
    
    for i in range(len(df)):
        reference = df['actuals'][i].lower()
        candidate = df['predictions'][i].lower()

        blue_score = sacrebleu.compute(predictions=[candidate], references=[reference])
        blue_scores.append(float(blue_score['score']))
        
    return blue_scores

In [34]:
print('BLEU SCORES: \n')

for dataset_name in datasets:
    print(f'- {dataset_name}:')

    datasets[dataset_name]['bleu_score'] = blue_evaluation(datasets[dataset_name])
    print(f"    {datasets[dataset_name]['bleu_score'].mean()}")
    print('\n')


BLEU SCORES: 

- it5-large_it_2epoch_decoding.csv:
    18.298273228528913


- llamantino2_7b_it_2epoch_decoding.csv:
    32.92150384048398


- minerva_3B_it_2epoch_decoding.csv:
    25.72137021914339


- t5-large_it_2epoch_decoding.csv:
    29.922332667730085




In [38]:
#copy
dt = datasets['llamantino2_7b_it_2epoch_decoding.csv'].copy()
#rename actuals and blue in actuals_llamantino and blue_llamantino
dt.rename(columns={'predictions': 'predictions_llamantino', 'bleu_score': 'bleu_llamantino'}, inplace=True)
dt['predictions_t5'] = datasets['t5-large_it_2epoch_decoding.csv']['predictions']
dt['bleu_t5'] = datasets['t5-large_it_2epoch_decoding.csv']['bleu_score']


In [41]:
# rendilo excel
dt.to_excel('llamantino_t5.xlsx', index=False)

### BERTScore

In [20]:
bertscore = load("bertscore")

def bertscore_evaluation(actual, prediction, lang):
    predictions = []
    references = []

    for i in range(len(actual)):
        predictions.append(prediction.iloc[i].lower())
        references.append(actual.iloc[i].lower())

    bertscore_scores = bertscore.compute(predictions=predictions, references=references, lang=lang)

    return bertscore_scores

In [21]:
print('BERTSCORE SCORES: \n')

for dataset_name in datasets:

    print(f'- {dataset_name} (it):')

    score = bertscore_evaluation(datasets[dataset_name]['actuals'], datasets[dataset_name]['predictions'], 'it')
    datasets[dataset_name]['bertscore_f1'] = score['f1']
    datasets[dataset_name]['bertscore_precision'] = score['precision']
    datasets[dataset_name]['bertscore_recall'] = score['recall']
    print(score)
    
    print(f'-F1: ', np.mean(score['f1']))
    print(f'-P: ', np.mean(score['precision']))
    print(f'-R: ', np.mean(score['recall']))
    print('\n')



BERTSCORE SCORES: 

- it5-large_it_2epoch_decoding.csv (it):




{'precision': [0.8903883695602417, 0.8024725914001465, 0.8433958292007446, 0.8381259441375732, 0.815991222858429, 0.821508526802063, 0.8094969987869263, 0.816057562828064, 0.8268133997917175, 0.7656431794166565, 0.7416132688522339, 0.9256267547607422, 0.7860872149467468, 0.8728875517845154, 0.7971066832542419, 0.744569718837738, 0.7233704924583435, 0.8235538601875305, 0.8532174229621887, 0.8989826440811157, 0.9545911550521851, 0.803907036781311, 0.8227558732032776, 0.7777860164642334, 0.9047048091888428, 0.8490905165672302, 0.8302837014198303, 0.8146888017654419, 0.7957568168640137, 0.7761846780776978, 0.7126418352127075, 0.7635639905929565, 0.7801066040992737, 0.8995725512504578, 0.9337978363037109, 0.7999980449676514, 0.7657080292701721, 0.7744979858398438, 0.8953322172164917, 0.8813364505767822, 0.8811492919921875, 0.8565411567687988, 0.8777878880500793, 0.8130094408988953, 0.8146255612373352, 0.8405922651290894, 0.792359471321106, 0.7713502645492554, 0.797931969165802, 0.8722907304

### TER

In [22]:
ter = TranslationEditRate()

def ter_evaluation(actual, prediction):
    ter_scores = []

    for i in range(len(actual)):
        reference = actual.iloc[i].lower()
        candidate = prediction.iloc[i].lower()

        ter_score = ter(candidate, [reference])
        ter_scores.append(ter_score)

    return ter_scores

In [23]:
print('TER SCORES: \n')

for dataset_name in datasets:
    print(f'- {dataset_name}:')
    datasets[dataset_name]['ter'] = ter_evaluation(datasets[dataset_name]['actuals'], datasets[dataset_name]['predictions'])
    print(f"    {datasets[dataset_name]['ter'].mean()}")
    print('\n')

TER SCORES: 

- it5-large_it_2epoch_decoding.csv:
    0.790024658203125


- llamantino2_7b_it_2epoch_decoding.csv:
    0.6816765747070312


- minerva_3B_it_2epoch_decoding.csv:
    0.7444573364257813




### CHRF

In [24]:
chrf = load("chrf")

def chrf_evaluation(actual, prediction):
    chrf_scores = []

    # Utilizza tqdm per monitorare lo stato di avanzamento
    for i in tqdm(range(len(actual)), desc="Calcolo CHRF"):
        reference = actual.iloc[i].lower()
        candidate = prediction.iloc[i].lower()
        chrf_score = chrf.compute(predictions=[candidate], references=[reference])['score']

        chrf_scores.append(chrf_score)

    return chrf_scores

In [26]:
print('CHRF SCORES: \n')

for dataset_name in datasets:
    print(f'- {dataset_name}:')
    datasets[dataset_name]['chrf'] = chrf_evaluation(datasets[dataset_name]['actuals'], datasets[dataset_name]['predictions'])
    print(f"    {datasets[dataset_name]['chrf'].mean()}")
    print('\n')

CHRF SCORES: 

- it5-large_it_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:10<00:00, 93.87it/s]


    45.3875835753282


- llamantino2_7b_it_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:07<00:00, 131.96it/s]


    63.580743422030515


- minerva_3B_it_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:08<00:00, 119.00it/s]


    56.88715954254813


