In [None]:
#utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm

#bleu 
from evaluate import load
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

#ter
from torchmetrics.text import TranslationEditRate

In [None]:
def open_files(directory):
    """Return the names of all entries in ``directory``, sorted ascending.

    Nothing is opened or read here: the result is the list of bare entry
    names reported by ``os.listdir`` (not full paths), so callers have to
    combine each name with ``directory`` themselves.
    """
    return sorted(os.listdir(directory))

In [None]:
# Directory that holds one CSV of model generations per evaluated model.
GENERATIONS_DIR = 'generations'

datasets = {}

# Keep only CSV files: the directory can also contain entries such as
# `.ipynb_checkpoints` or `.DS_Store`, which pd.read_csv cannot parse.
files = [name for name in open_files(GENERATIONS_DIR) if name.lower().endswith('.csv')]
for file in files:
    # os.path.join builds the path portably instead of hard-coding '/'.
    dataset = pd.read_csv(os.path.join(GENERATIONS_DIR, file))
    # Datasets are keyed by file name; later cells look them up that way.
    datasets[file] = dataset

In [None]:
# Display the list of generation file names that were loaded above.
files

## Valutazione automatica

### BLEU

In [None]:
# Load the "sacrebleu" metric with `load` (imported from `evaluate`); the
# resulting object is shared by every call to `blue_evaluation`.
sacrebleu = load("sacrebleu")

def blue_evaluation(df):
    """Compute a SacreBLEU score for every row of ``df``.

    Each row is scored on its own (one prediction against one reference).
    Both texts are lower-cased first, so the comparison is case-insensitive.

    Args:
        df: DataFrame with string columns ``'actuals'`` (reference text) and
            ``'predictions'`` (generated text).

    Returns:
        list[float]: one value per row, in row order, read from the
        ``'score'`` entry of the metric result.
    """
    actuals = df['actuals']
    predictions = df['predictions']

    blue_scores = []

    for i in range(len(df)):
        # Select rows by position (.iloc), like the other metric helpers do.
        # Label-based ``df[col][i]`` only works when the index is exactly
        # 0..n-1 and fails (or picks the wrong row) on a filtered, shuffled
        # or otherwise re-indexed frame.
        reference = actuals.iloc[i].lower()
        candidate = predictions.iloc[i].lower()

        blue_score = sacrebleu.compute(predictions=[candidate], references=[reference])
        blue_scores.append(float(blue_score['score']))

    return blue_scores

In [None]:
print('BLEU SCORES: \n')

# For each loaded dataset: add a per-row 'bleu_score' column, then print the
# dataset name followed by the mean of that column.
for dataset_name, frame in datasets.items():
    print(f'- {dataset_name}:')

    frame['bleu_score'] = blue_evaluation(frame)
    mean_bleu = frame['bleu_score'].mean()
    print(f"    {mean_bleu}")
    print('\n')


In [None]:
# Build a side-by-side comparison table of the two models.
llamantino_df = datasets['llamantino2_7b_it_2epoch_decoding.csv']
t5_df = datasets['t5-large_it_2epoch_decoding.csv']

# Start from a renamed copy of the LLaMAntino results: its 'predictions' and
# 'bleu_score' columns get a model suffix; all other columns keep their names.
dt = llamantino_df.rename(
    columns={
        'predictions': 'predictions_llamantino',
        'bleu_score': 'bleu_llamantino',
    }
)

# Append the matching T5 columns (pandas aligns them on the row index).
dt['predictions_t5'] = t5_df['predictions']
dt['bleu_t5'] = t5_df['bleu_score']


In [None]:
# Export the comparison table to an Excel file, without the index column.
dt.to_excel('llamantino_t5.xlsx', index=False)

### BERTScore

In [None]:
# Load the "bertscore" metric with `load` (imported from `evaluate`); the
# resulting object is used by `bertscore_evaluation`.
bertscore = load("bertscore")

def bertscore_evaluation(actual, prediction, lang):
    """Run BERTScore over two aligned series of texts in a single call.

    Args:
        actual: Series of reference texts (accessed positionally via ``.iloc``).
        prediction: Series of generated texts, same length and order as
            ``actual``.
        lang: language code forwarded unchanged to ``bertscore.compute``.

    Returns:
        Whatever ``bertscore.compute`` returns for the lower-cased inputs.
    """
    n_rows = len(actual)

    # Lower-case both sides so the comparison is case-insensitive.
    lowered_predictions = [prediction.iloc[row].lower() for row in range(n_rows)]
    lowered_references = [actual.iloc[row].lower() for row in range(n_rows)]

    return bertscore.compute(
        predictions=lowered_predictions,
        references=lowered_references,
        lang=lang,
    )

In [None]:
print('BERTSCORE SCORES: \n')

for dataset_name, frame in datasets.items():

    print(f'- {dataset_name} (it):')

    score = bertscore_evaluation(frame['actuals'], frame['predictions'], 'it')

    # Store the per-row values as 'bertscore_f1', 'bertscore_precision' and
    # 'bertscore_recall' columns on the dataset.
    for component in ('f1', 'precision', 'recall'):
        frame['bertscore_' + component] = score[component]
    print(score)

    # Dataset-level averages of the three components.
    for label, component in (('-F1: ', 'f1'), ('-P: ', 'precision'), ('-R: ', 'recall')):
        print(label, np.mean(score[component]))
    print('\n')



### TER

In [None]:
# Instantiate the torchmetrics TER metric with its default arguments; this
# single instance is called once per sentence pair by `ter_evaluation`.
ter = TranslationEditRate()

def ter_evaluation(actual, prediction):
    """Compute a Translation Edit Rate score for every reference/prediction pair.

    Both texts are lower-cased before scoring, so the comparison is
    case-insensitive.

    Args:
        actual: Series of reference texts (accessed positionally via ``.iloc``).
        prediction: Series of generated texts, same length and order as
            ``actual``.

    Returns:
        list[float]: one TER value per row, in row order.
    """
    ter_scores = []

    for i in range(len(actual)):
        reference = actual.iloc[i].lower()
        candidate = prediction.iloc[i].lower()

        # The metric is given one hypothesis and a one-element reference list.
        ter_score = ter(candidate, [reference])
        # Convert the metric output (a scalar tensor for torchmetrics) to a
        # plain float, as `blue_evaluation` does, so the values form a numeric
        # DataFrame column that supports `.mean()` and file export.
        ter_scores.append(float(ter_score))

    return ter_scores

In [None]:
print('TER SCORES: \n')

# For each loaded dataset: add a per-row 'ter' column, then print the dataset
# name followed by the mean of that column.
for dataset_name, frame in datasets.items():
    print(f'- {dataset_name}:')
    frame['ter'] = ter_evaluation(frame['actuals'], frame['predictions'])
    mean_ter = frame['ter'].mean()
    print(f"    {mean_ter}")
    print('\n')

### CHRF

In [None]:
# Load the "chrf" metric with `load` (imported from `evaluate`); the resulting
# object is used by `chrf_evaluation`.
chrf = load("chrf")

def chrf_evaluation(actual, prediction):
    """Compute a chrF score for every reference/prediction pair.

    Each pair is scored on its own, after lower-casing both texts.

    Args:
        actual: Series of reference texts (accessed positionally via ``.iloc``).
        prediction: Series of generated texts, same length and order as
            ``actual``.

    Returns:
        list: the ``'score'`` entry of the metric result for each row, in
        row order.
    """
    def score_row(row):
        # Score a single lower-cased prediction against its single reference.
        reference = actual.iloc[row].lower()
        hypothesis = prediction.iloc[row].lower()
        result = chrf.compute(predictions=[hypothesis], references=[reference])
        return result['score']

    # tqdm wraps the row indices to show a progress bar while scoring.
    return [score_row(row) for row in tqdm(range(len(actual)), desc="Calcolo CHRF")]

In [None]:
print('CHRF SCORES: \n')

# For each loaded dataset: add a per-row 'chrf' column, then print the dataset
# name followed by the mean of that column.
for dataset_name, frame in datasets.items():
    print(f'- {dataset_name}:')
    frame['chrf'] = chrf_evaluation(frame['actuals'], frame['predictions'])
    mean_chrf = frame['chrf'].mean()
    print(f"    {mean_chrf}")
    print('\n')