In [1]:
#utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm

#bleu 
from evaluate import load
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

#ter
from torchmetrics.text import TranslationEditRate

In [2]:
def open_files(directory):
    """Return the sorted names of the regular files found in ``directory``.

    Despite its name, the function does not open anything: it only lists
    entry names (not full paths). Sub-directories (for example Jupyter's
    ``.ipynb_checkpoints``) are skipped so that every returned name can be
    handed to a file reader.

    Args:
        directory: Path of the directory to list.

    Returns:
        list[str]: File names in ascending lexicographic order; an empty
        list when the directory contains no regular files.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``directory`` cannot be
            listed (e.g. it does not exist).
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )

In [3]:
# Directory holding one CSV of model generations per model/language pair.
GENERATIONS_DIR = 'generations'

# Maps each CSV file name to the DataFrame read from it.
datasets = {}

files = open_files(GENERATIONS_DIR)
for file in files:
    # Entries that are not CSV files (e.g. .DS_Store) would make pd.read_csv
    # fail or produce a meaningless frame, so they are skipped.
    if not file.lower().endswith('.csv'):
        continue
    dataset = pd.read_csv(os.path.join(GENERATIONS_DIR, file))
    datasets[file] = dataset

## Valutazione automatica

### BLEU

In [4]:
# Load the SacreBLEU metric through `evaluate.load` once at module level;
# blue_evaluation reuses this object on every call.
sacrebleu = load("sacrebleu")

def blue_evaluation(df):
    """Compute a sentence-level SacreBLEU score for every row of ``df``.

    Each prediction is scored on its own against its single reference, after
    lower-casing both strings, using the module-level ``sacrebleu`` metric.

    Args:
        df: DataFrame with string columns ``'actuals'`` (references) and
            ``'predictions'`` (system outputs).

    Returns:
        list[float]: One score per row, in row order, taken from the
        ``'score'`` entry of the metric result.
    """
    blue_scores = []

    # Iterate positionally instead of using ``df[col][i]``: a label lookup
    # raises KeyError (or selects the wrong row) whenever the frame's index
    # is not exactly 0..n-1, e.g. after filtering or sampling.
    for reference, candidate in zip(df['actuals'], df['predictions']):
        blue_score = sacrebleu.compute(
            predictions=[candidate.lower()],
            references=[reference.lower()],
        )
        blue_scores.append(float(blue_score['score']))

    return blue_scores

In [5]:
print('BLEU SCORES: \n')

# Score every loaded dataset, store the per-row values in a new 'bleu_score'
# column of the same frame, and print the mean for each file.
for dataset_name, frame in datasets.items():
    print(f'- {dataset_name}:')

    frame['bleu_score'] = blue_evaluation(frame)
    # The mean is computed outside the f-string: reusing single quotes inside
    # a single-quoted f-string is a SyntaxError before Python 3.12.
    mean_bleu = frame['bleu_score'].mean()
    print(f'    {mean_bleu}')
    print('\n')

BLEU SCORES: 

- meta-llama2_7b_en_2epoch_decoding.csv:
    43.051376982304724


- meta-llama2_7b_ge_2epoch_decoding.csv:
    34.71901042740331


- meta-llama2_7b_it_2epoch_decoding.csv:
    43.887828384297286


- meta-llama2_7b_ru_2epoch_decoding.csv:
    28.55205813777609


- t5-large_en_2epoch_decoding.csv:
    42.964119405688045


- t5-large_ge_2epoch_decoding.csv:
    35.55978151377656


- t5-large_it_2epoch_decoding.csv:
    42.66809957085583


- t5-large_ru_2epoch_decoding.csv:
    9.021369548545128




### BERTScore

In [10]:
# Load the BERTScore metric through `evaluate.load` once at module level;
# bertscore_evaluation uses this object.
bertscore = load("bertscore")

def bertscore_evaluation(actual, prediction, lang):
    """Run BERTScore on lower-cased predictions against lower-cased references.

    Args:
        actual: Reference texts, read positionally through ``.iloc``.
        prediction: Candidate texts aligned with ``actual``, also read
            through ``.iloc``.
        lang: Language code forwarded unchanged to ``bertscore.compute``.

    Returns:
        The object returned by ``bertscore.compute`` for the whole batch.
    """
    # Lower-case each (candidate, reference) pair row by row.
    lowered_pairs = [
        (prediction.iloc[pos].lower(), actual.iloc[pos].lower())
        for pos in range(len(actual))
    ]
    hyps = [hyp for hyp, _ in lowered_pairs]
    refs = [ref for _, ref in lowered_pairs]

    return bertscore.compute(predictions=hyps, references=refs, lang=lang)

In [11]:
print('BERTSCORE SCORES: \n')

# Language tag used in the file names -> language code passed to BERTScore.
LANG_BY_TAG = {'en': 'en', 'it': 'it', 'ge': 'de', 'ru': 'ru'}

for dataset_name, frame in datasets.items():
    # Prefer an exact '_'-delimited tag (e.g. "..._7b_ge_2epoch_...") so that
    # a model or run name that merely contains "en" or "it" is not
    # misclassified by a plain substring test.
    name_tokens = os.path.splitext(dataset_name)[0].split('_')
    lang = next((LANG_BY_TAG[tok] for tok in name_tokens if tok in LANG_BY_TAG), None)

    if lang is None:
        # No delimited tag found: fall back to the substring heuristic,
        # which defaults to Russian when nothing matches.
        if 'en' in dataset_name:
            lang = 'en'
        elif 'it' in dataset_name:
            lang = 'it'
        elif 'ge' in dataset_name:
            lang = 'de'
        else:
            lang = 'ru'

    print(f'- {dataset_name} ({lang}):')

    score = bertscore_evaluation(frame['actuals'], frame['predictions'], lang)
    # Keep the per-row values next to the texts they were computed from.
    frame['bertscore_f1'] = score['f1']
    frame['bertscore_precision'] = score['precision']
    frame['bertscore_recall'] = score['recall']

    print('-F1: ', np.mean(score['f1']))
    print('-P: ', np.mean(score['precision']))
    print('-R: ', np.mean(score['recall']))
    print('\n')



BERTSCORE SCORES: 

- meta-llama2_7b_en_2epoch_decoding.csv (en):


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


-F1:  0.9520620909929276
-P:  0.9533244151473045
-R:  0.950865815103054


- meta-llama2_7b_ge_2epoch_decoding.csv (de):




-F1:  0.86166808116436
-P:  0.8629700642824173
-R:  0.8610310202240944


- meta-llama2_7b_it_2epoch_decoding.csv (it):


KeyboardInterrupt: 

### TER

In [6]:
# Shared TranslationEditRate instance, built with its default arguments;
# ter_evaluation calls it once per sentence pair.
ter = TranslationEditRate()

def ter_evaluation(actual, prediction):
    """Compute one TER value per aligned (reference, candidate) pair.

    Both texts are lower-cased, then the module-level ``ter`` metric is called
    with the candidate and a one-element list holding its reference.

    Args:
        actual: Reference texts, read positionally through ``.iloc``.
        prediction: Candidate texts aligned with ``actual``.

    Returns:
        list: The raw values returned by ``ter``, in row order.
    """
    def _score_row(pos):
        # Score a single row; the reference is wrapped in a list for ``ter``.
        ref_text = actual.iloc[pos].lower()
        hyp_text = prediction.iloc[pos].lower()
        return ter(hyp_text, [ref_text])

    return [_score_row(pos) for pos in range(len(actual))]

In [7]:
print('TER SCORES: \n')

# Score every loaded dataset, store the per-row values in a new 'ter' column
# of the same frame, and print the mean for each file.
for dataset_name, frame in datasets.items():
    print(f'- {dataset_name}:')
    frame['ter'] = ter_evaluation(frame['actuals'], frame['predictions'])
    # The mean is computed outside the f-string: reusing single quotes inside
    # a single-quoted f-string is a SyntaxError before Python 3.12.
    mean_ter = frame['ter'].mean()
    print(f'    {mean_ter}')
    print('\n')

TER SCORES: 

- meta-llama2_7b_en_2epoch_decoding.csv:
    0.5132417602539062


- meta-llama2_7b_ge_2epoch_decoding.csv:
    0.6277633666992187


- meta-llama2_7b_it_2epoch_decoding.csv:


KeyboardInterrupt: 

### CHRF

In [8]:
# Load the chrF metric through `evaluate.load` once at module level;
# chrf_evaluation reuses this object on every call.
chrf = load("chrf")

def chrf_evaluation(actual, prediction):
    """Compute one chrF score per aligned (reference, candidate) pair.

    Both texts are lower-cased and each pair is scored on its own with the
    module-level ``chrf`` metric.

    Args:
        actual: Reference texts, read positionally through ``.iloc``.
        prediction: Candidate texts aligned with ``actual``.

    Returns:
        list: The ``'score'`` entry of each metric result, in row order.
    """
    def _row_score(pos):
        ref_text = actual.iloc[pos].lower()
        hyp_text = prediction.iloc[pos].lower()
        result = chrf.compute(predictions=[hyp_text], references=[ref_text])
        return result['score']

    # tqdm wraps the index range to show progress while the rows are scored.
    row_positions = tqdm(range(len(actual)), desc="Calcolo CHRF")
    return [_row_score(pos) for pos in row_positions]

In [9]:
print('CHRF SCORES: \n')

# Score every loaded dataset, store the per-row values in a new 'chrf' column
# of the same frame, and print the mean for each file.
for dataset_name, frame in datasets.items():
    print(f'- {dataset_name}:')
    frame['chrf'] = chrf_evaluation(frame['actuals'], frame['predictions'])
    # The mean is computed outside the f-string: reusing single quotes inside
    # a single-quoted f-string is a SyntaxError before Python 3.12.
    mean_chrf = frame['chrf'].mean()
    print(f'    {mean_chrf}')
    print('\n')

CHRF SCORES: 

- meta-llama2_7b_en_2epoch_decoding.csv:


Calcolo CHRF:   0%|          | 0/1000 [00:00<?, ?it/s]

Calcolo CHRF: 100%|██████████| 1000/1000 [00:10<00:00, 97.15it/s]


    72.37754511824546


- meta-llama2_7b_ge_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:10<00:00, 96.38it/s]


    65.32267320482006


- meta-llama2_7b_it_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:11<00:00, 83.57it/s]


    71.99261290330436


- meta-llama2_7b_ru_2epoch_decoding.csv:


Calcolo CHRF:   0%|          | 3/1000 [00:00<00:12, 80.16it/s]


KeyboardInterrupt: 