In [1]:
#utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm

#bleu 
from evaluate import load
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

#ter
from torchmetrics.text import TranslationEditRate

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
datasets = {}

files = open_files('generations')
for file in files:
    dataset = pd.read_csv('generations/' + file)
    datasets[file] = dataset

In [4]:
files

['llama2_7b_it_2epoch_decoding.csv',
 'llama2_7b_it_sa_1shot_2epoch_decoding.csv',
 'llama2_7b_it_sa_2epoch_decoding.csv',
 'llamantino2_7b_it_2epoch_decoding.csv',
 'llamantino2_7b_it_sa_2epoch_decoding.csv']

In [7]:
datasets['llama2_7b_it_sa_1shot_2epoch_decoding.csv']['actuals'] =  datasets['llama2_7b_it_2epoch_decoding.csv']['actuals']
datasets['llama2_7b_it_sa_2epoch_decoding.csv']['actuals'] =  datasets['llama2_7b_it_2epoch_decoding.csv']['actuals']
datasets['llamantino2_7b_it_sa_2epoch_decoding.csv']['actuals'] =  datasets['llama2_7b_it_2epoch_decoding.csv']['actuals']

## Valutazione automatica

### Bleu

In [8]:
sacrebleu = load("sacrebleu")

def blue_evaluation(df):
    blue_scores = []
    
    for i in range(len(df)):
        reference = df['actuals'][i].lower()
        candidate = df['predictions'][i].lower()

        blue_score = sacrebleu.compute(predictions=[candidate], references=[reference])
        blue_scores.append(float(blue_score['score']))
        
    return blue_scores

In [9]:
print('BLEU SCORES: \n')

for dataset_name in datasets:
    print(f'- {dataset_name}:')

    datasets[dataset_name]['bleu_score'] = blue_evaluation(datasets[dataset_name])
    print(f"    {datasets[dataset_name]['bleu_score'].mean()}")
    print('\n')


BLEU SCORES: 

- llama2_7b_it_2epoch_decoding.csv:
    30.947234161431346


- llama2_7b_it_sa_1shot_2epoch_decoding.csv:
    28.540514495575252


- llama2_7b_it_sa_2epoch_decoding.csv:
    28.553936917262842


- llamantino2_7b_it_2epoch_decoding.csv:
    32.92150384048398


- llamantino2_7b_it_sa_2epoch_decoding.csv:
    31.792868267825543




### BERTScore

In [10]:
bertscore = load("bertscore")

def bertscore_evaluation(actual, prediction, lang):
    predictions = []
    references = []

    for i in range(len(actual)):
        predictions.append(prediction.iloc[i].lower())
        references.append(actual.iloc[i].lower())

    bertscore_scores = bertscore.compute(predictions=predictions, references=references, lang=lang)

    return bertscore_scores

In [12]:
print('BERTSCORE SCORES: \n')

for dataset_name in datasets:

    print(f'- {dataset_name} (it):')

    score = bertscore_evaluation(datasets[dataset_name]['actuals'], datasets[dataset_name]['predictions'], 'it')
    datasets[dataset_name]['bertscore_f1'] = score['f1']
    datasets[dataset_name]['bertscore_precision'] = score['precision']
    datasets[dataset_name]['bertscore_recall'] = score['recall']
    #print(score)
    
    print(f'-F1: ', np.mean(score['f1']))
    print(f'-P: ', np.mean(score['precision']))
    print(f'-R: ', np.mean(score['recall']))
    print('\n')



BERTSCORE SCORES: 

- llama2_7b_it_2epoch_decoding.csv (it):
-F1:  0.8596069986224174
-P:  0.8620031984448433
-R:  0.8576335867643357


- llama2_7b_it_sa_1shot_2epoch_decoding.csv (it):
-F1:  0.849708055794239
-P:  0.8578462609648705
-R:  0.8427860285043717


- llama2_7b_it_sa_2epoch_decoding.csv (it):
-F1:  0.8480962705612183
-P:  0.8542063003778457
-R:  0.8431392735242844


- llamantino2_7b_it_2epoch_decoding.csv (it):
-F1:  0.8632007264494896
-P:  0.8663134371042251
-R:  0.8605485225915909


- llamantino2_7b_it_sa_2epoch_decoding.csv (it):
-F1:  0.8580915243625641
-P:  0.8621243567466735
-R:  0.8546362680196762




### TER

In [17]:
ter = TranslationEditRate()

def ter_evaluation(actual, prediction):
    ter_scores = []

    for i in range(len(actual)):
        reference = actual.iloc[i].lower()
        candidate = prediction.iloc[i].lower()

        ter_score = ter(candidate, [reference])
        ter_scores.append(ter_score)

    return ter_scores

In [18]:
print('TER SCORES: \n')

for dataset_name in datasets:
    print(f'- {dataset_name}:')
    datasets[dataset_name]['ter'] = ter_evaluation(datasets[dataset_name]['actuals'], datasets[dataset_name]['predictions'])
    print(f"    {datasets[dataset_name]['ter'].mean()}")
    print('\n')

TER SCORES: 

- llama2_7b_it_2epoch_decoding.csv:
    0.7037606201171875


- llama2_7b_it_sa_1shot_2epoch_decoding.csv:
    0.690429443359375


- llama2_7b_it_sa_2epoch_decoding.csv:
    0.7054981689453125


- llamantino2_7b_it_2epoch_decoding.csv:
    0.6816765747070312


- llamantino2_7b_it_sa_2epoch_decoding.csv:
    0.6700052490234375




### CHRF

In [19]:
chrf = load("chrf")

def chrf_evaluation(actual, prediction):
    chrf_scores = []

    # Utilizza tqdm per monitorare lo stato di avanzamento
    for i in tqdm(range(len(actual)), desc="Calcolo CHRF"):
        reference = actual.iloc[i].lower()
        candidate = prediction.iloc[i].lower()
        chrf_score = chrf.compute(predictions=[candidate], references=[reference])['score']

        chrf_scores.append(chrf_score)

    return chrf_scores

In [20]:
print('CHRF SCORES: \n')

for dataset_name in datasets:
    print(f'- {dataset_name}:')
    datasets[dataset_name]['chrf'] = chrf_evaluation(datasets[dataset_name]['actuals'], datasets[dataset_name]['predictions'])
    print(f"    {datasets[dataset_name]['chrf'].mean()}")
    print('\n')

CHRF SCORES: 

- llama2_7b_it_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:20<00:00, 47.94it/s]


    62.517357889450736


- llama2_7b_it_sa_1shot_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:19<00:00, 50.44it/s]


    59.2446339810021


- llama2_7b_it_sa_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:18<00:00, 52.86it/s]


    59.50446659679372


- llamantino2_7b_it_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:21<00:00, 46.43it/s]


    63.580743422030515


- llamantino2_7b_it_sa_2epoch_decoding.csv:


Calcolo CHRF: 100%|██████████| 1000/1000 [00:20<00:00, 48.77it/s]


    62.98687214051879


