In [15]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast

# evaluation 
from evaluate import load

In [37]:
def open_files(directory):
    """Return the entry names found in ``directory``, sorted in ascending order.

    The names are the bare entries produced by ``os.listdir`` (no directory
    prefix), so callers have to join them with ``directory`` themselves.
    """
    return sorted(os.listdir(directory))

In [38]:
# Directory that holds the generation CSVs (single source of truth for the path).
GENERATIONS_DIR = 'generations'

# Keep only CSV files: any other directory entry (e.g. `.DS_Store` or
# `.ipynb_checkpoints`) would otherwise be handed to `pd.read_csv`.
files = [name for name in open_files(GENERATIONS_DIR) if name.lower().endswith('.csv')]

# Map each file name to the DataFrame read from it.
datasets = {
    name: pd.read_csv(os.path.join(GENERATIONS_DIR, name))
    for name in files
}
files

['de_meta-llama2_7b_2epoch_decoding.csv',
 'de_t5-large_2epoch_decoding.csv',
 'en_meta-llama2_7b_2epoch_decoding.csv',
 'en_t5-large_2epoch_decoding.csv',
 'it_meta-llama2_7b_2epoch_decoding.csv',
 'it_t5-large_2epoch_decoding.csv',
 'ru_meta-llama2_7b_2epoch_decoding.csv',
 'ru_t5-large_2epoch_decoding.csv']

In [39]:
def _load_data_points(path):
    """Read a data-points CSV and parse its 'data points' column.

    Missing cells are replaced with the string '[]' and every cell is then
    parsed with ``ast.literal_eval``, so a missing cell becomes an empty list.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The frame read from ``path`` with the 'data points' column parsed.
    """
    frame = pd.read_csv(path)
    frame['data points'] = frame['data points'].fillna('[]').apply(ast.literal_eval)
    return frame


# One reference frame per language; all four are loaded the same way.
en_data_points = _load_data_points('en_data_points.csv')  # EN
it_data_points = _load_data_points('it_data_points.csv')  # IT
de_data_points = _load_data_points('de_data_points.csv')  # DE
ru_data_points = _load_data_points('ru_data_points.csv')  # RU

Incrocia i quattro data frame riga per riga e mantieni solo le righe valide, cioè quelle in cui tutte e quattro le lingue hanno una lista di data point non vuota.

In [40]:
# Cross-check the four language frames row by row and keep only the rows for
# which every language has a non-empty 'data points' list.
#
# NOTE: this cell rebinds the *_data_points names to the filtered frames, so
# running it a second time finds nothing to drop and resets `indexes` to [].
# The metric cells use `indexes` to filter the predictions; reload the data
# points before re-running this cell.
frames_by_lang = {
    "it": it_data_points,
    "en": en_data_points,
    "de": de_data_points,
    "ru": ru_data_points,
}

# The comparison is positional, so the frames must have the same number of rows.
row_counts = {lang: len(frame) for lang, frame in frames_by_lang.items()}
if len(set(row_counts.values())) != 1:
    raise ValueError(f"data-point frames are not row-aligned: {row_counts}")

# invalid[k] is True when row k has an empty list in at least one language.
invalid = np.zeros(len(it_data_points), dtype=bool)
for frame in frames_by_lang.values():
    invalid |= np.array([points == [] for points in frame['data points']], dtype=bool)

# Row numbers of the discarded rows, as plain Python ints.
indexes = np.flatnonzero(invalid).tolist()

# Drop the invalid rows from every frame in one pass and renumber the index.
keep = ~invalid
it_data_points = it_data_points.loc[keep].reset_index(drop=True)
en_data_points = en_data_points.loc[keep].reset_index(drop=True)
de_data_points = de_data_points.loc[keep].reset_index(drop=True)
ru_data_points = ru_data_points.loc[keep].reset_index(drop=True)

In [41]:
# Number of rows discarded because at least one language had an empty data-point list.
len(indexes)

250

## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/bleu">Bleu</a>

In [42]:
# Load the BLEU metric through `evaluate.load`.
bleu = load("bleu")

In [43]:
# Reference frames keyed by language code.
data_points_by_lang = {
    "en": en_data_points,
    "it": it_data_points,
    "de": de_data_points,
    "ru": ru_data_points,
}

for dataset_name in datasets:
    # Keep only the generations whose row was not discarded from the references.
    generations = datasets[dataset_name]
    predictions = generations.loc[~generations.index.isin(indexes), 'predictions'].tolist()

    # Identify the language from a whole "_"-separated token of the file name.
    # A substring test is wrong here: every "*_decoding.csv" name contains
    # "de", so Russian files would be scored against the German references.
    name_tokens = os.path.splitext(dataset_name)[0].split("_")
    lang = next((token for token in name_tokens if token in data_points_by_lang), None)
    if lang is None:
        raise ValueError("cannot infer the language from file name: " + dataset_name)
    data_points = data_points_by_lang[lang]

    # One list of reference strings per prediction.
    references = data_points['data points'].tolist()

    results = bleu.compute(predictions=predictions, references=references)
    print(dataset_name + " BLEU: " + str(results))

de_meta-llama2_7b_2epoch_decoding.csv BLEU: {'bleu': 0.5549915645915656, 'precisions': [0.8336517467248908, 0.6226089457788005, 0.4811492854971116, 0.3798968241173626], 'brevity_penalty': 1.0, 'length_ratio': 1.1169029111415942, 'translation_length': 14656, 'reference_length': 13122}
de_t5-large_2epoch_decoding.csv BLEU: {'bleu': 0.5952482311928546, 'precisions': [0.868700744703926, 0.6616466630991514, 0.5211256183602303, 0.4191347897418185], 'brevity_penalty': 1.0, 'length_ratio': 1.0540313976527969, 'translation_length': 13831, 'reference_length': 13122}
en_meta-llama2_7b_2epoch_decoding.csv BLEU: {'bleu': 0.6359652521938579, 'precisions': [0.8881840879951414, 0.7159002061269457, 0.5697875215857047, 0.45150767761954014], 'brevity_penalty': 1.0, 'length_ratio': 1.0992507974185892, 'translation_length': 14819, 'reference_length': 13481}
en_t5-large_2epoch_decoding.csv BLEU: {'bleu': 0.6508101803483489, 'precisions': [0.9032592900145319, 0.7287789212466244, 0.5830437804030577, 0.4674207

### <a href="https://huggingface.co/spaces/evaluate-metric/bertscore">Bertscore</a>

In [44]:
# Load the BERTScore metric through `evaluate.load`.
bertscore = load("bertscore")

In [48]:
# Reference frames keyed by language code; the key is also passed to BERTScore as `lang`.
data_points_by_lang = {
    "en": en_data_points,
    "it": it_data_points,
    "de": de_data_points,
    "ru": ru_data_points,
}

for dataset_name in datasets:
    # Keep only the generations whose row was not discarded from the references.
    generations = datasets[dataset_name]
    predictions = generations.loc[~generations.index.isin(indexes), 'predictions'].tolist()

    # Identify the language from a whole "_"-separated token of the file name.
    # A substring test is wrong here: every "*_decoding.csv" name contains
    # "de", so Russian files would be scored as German (wrong references and
    # wrong `lang` argument).
    name_tokens = os.path.splitext(dataset_name)[0].split("_")
    lang = next((token for token in name_tokens if token in data_points_by_lang), None)
    if lang is None:
        raise ValueError("cannot infer the language from file name: " + dataset_name)
    data_points = data_points_by_lang[lang]

    # One list of reference strings per prediction.
    references = data_points['data points'].tolist()

    results = bertscore.compute(predictions=predictions, references=references, lang=lang)
    # BERTScore returns one value per prediction; report the means.
    average_precision = np.mean(results['precision'])
    average_recall = np.mean(results['recall'])
    average_f1 = np.mean(results['f1'])

    print(dataset_name + " Bert precision: " + str(average_precision) + " Bert recall: " + str(average_recall) + " Bert f1: " + str(average_f1))

de_meta-llama2_7b_2epoch_decoding.csv Bert precision: 0.9064151884714763 Bert recall: 0.9063240411281586 Bert f1: 0.9052164448102316
de_t5-large_2epoch_decoding.csv Bert precision: 0.9161613484223684 Bert recall: 0.9096777300834655 Bert f1: 0.9118376518090566


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


en_meta-llama2_7b_2epoch_decoding.csv Bert precision: 0.9704108312924703 Bert recall: 0.9687355315685272 Bert f1: 0.9691636436780294
en_t5-large_2epoch_decoding.csv Bert precision: 0.9711603197256724 Bert recall: 0.9685412779649099 Bert f1: 0.9694004866282145
it_meta-llama2_7b_2epoch_decoding.csv Bert precision: 0.9293031032085419 Bert recall: 0.9267670160134633 Bert f1: 0.9271565098762512
it_t5-large_2epoch_decoding.csv Bert precision: 0.9268437720139822 Bert recall: 0.9234380162556967 Bert f1: 0.9242996752262116


### <a href="https://huggingface.co/spaces/evaluate-metric/chrf">chrF++</a>

In [None]:
# Load the chrF metric through `evaluate.load`.
# NOTE(review): in the saved run this cell was never executed (no execution
# count) and the chrF++ loop that follows failed with
# "NameError: name 'chrf' is not defined" — run this cell before the loop.
chrf = load("chrf")

In [46]:
# Reference frames keyed by language code.
data_points_by_lang = {
    "en": en_data_points,
    "it": it_data_points,
    "de": de_data_points,
    "ru": ru_data_points,
}

for dataset_name in datasets:
    # Keep only the generations whose row was not discarded from the references.
    generations = datasets[dataset_name]
    predictions = generations.loc[~generations.index.isin(indexes), 'predictions'].tolist()

    # Identify the language from a whole "_"-separated token of the file name.
    # A substring test is wrong here: every "*_decoding.csv" name contains
    # "de", so Russian files would be scored against the German references.
    name_tokens = os.path.splitext(dataset_name)[0].split("_")
    lang = next((token for token in name_tokens if token in data_points_by_lang), None)
    if lang is None:
        raise ValueError("cannot infer the language from file name: " + dataset_name)
    data_points = data_points_by_lang[lang]

    # Only the first reference of each row is used, wrapped in a one-element list.
    references = [[points[0]] for points in data_points['data points']]

    # word_order=2 turns chrF into chrF++.
    results = chrf.compute(predictions=predictions, references=references, word_order=2)
    print(dataset_name + " chrF++: " + str(results))

NameError: name 'chrf' is not defined

### <a href="https://huggingface.co/spaces/evaluate-metric/meteor">METEOR</a>

In [None]:
# Load the METEOR metric through `evaluate.load`.
meteor = load('meteor')

In [None]:
# Reference frames keyed by language code.
data_points_by_lang = {
    "en": en_data_points,
    "it": it_data_points,
    "de": de_data_points,
    "ru": ru_data_points,
}

for dataset_name in datasets:
    # Keep only the generations whose row was not discarded from the references.
    generations = datasets[dataset_name]
    predictions = generations.loc[~generations.index.isin(indexes), 'predictions'].tolist()

    # Identify the language from a whole "_"-separated token of the file name.
    # A substring test is wrong here: every "*_decoding.csv" name contains
    # "de", so Russian files would be scored against the German references.
    # Token matching works wherever the language code sits in the name.
    name_tokens = os.path.splitext(dataset_name)[0].split("_")
    lang = next((token for token in name_tokens if token in data_points_by_lang), None)
    if lang is None:
        raise ValueError("cannot infer the language from file name: " + dataset_name)
    data_points = data_points_by_lang[lang]

    # One list of reference strings per prediction.
    references = data_points['data points'].tolist()

    results = meteor.compute(predictions=predictions, references=references)
    print(dataset_name + " METEOR: " + str(results))

it5-large_it_2epoch_decoding.csv METEOR: {'meteor': 0.462847709014826}
llamantino2_7b_it_2epoch_decoding.csv METEOR: {'meteor': 0.6760152549862166}
meta-llama2_7b_it_2epoch_decoding.csv METEOR: {'meteor': 0.6661720143184371}
minerva_3B_it_2epoch_decoding.csv METEOR: {'meteor': 0.5947401975344415}
t5-large_it_2epoch_decoding.csv METEOR: {'meteor': 0.6432971686995566}
