In [1]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast

# evaluation 
from evaluate import load

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
datasets = {}

files = open_files('generations')
for file in files:
    dataset = pd.read_csv('generations/' + file)
    datasets[file] = dataset
files

['meta-llama2_7b_en_2epoch_decoding.csv',
 'meta-llama2_7b_it_2epoch_decoding.csv',
 't5-large_en_2epoch_decoding.csv',
 't5-large_it_2epoch_decoding.csv']

In [4]:
en_data_points = pd.read_csv('en_data_points.csv')
en_data_points['data points'] = en_data_points['data points'].apply(ast.literal_eval)
en_data_points

Unnamed: 0,data points
0,"[A monument to the 11th Mississippi Infantry, ..."
1,[Bill Oddie stars in a BBC programme called Ba...
2,[The postal codes of Karlsruhe are 76131–76229.]
3,[The religion of some Turkish people could be ...
4,[The Adolfo Suarez Madrid-Barajas airport is o...
...,...
995,[The ALCO RS-3 was produced between May 1950 a...
996,[Super Capers is a 98 minute film that was dis...
997,[It's Great to Be Young was edited by Max Bene...
998,[Lady Anne Monson married George Monson was a ...


In [5]:
it_data_points = pd.read_csv('it_data_points.csv')
it_data_points['data points'] = it_data_points['data points'].apply(ast.literal_eval)
it_data_points

Unnamed: 0,data points
0,[Un monumento all'11° Fanteria del Mississippi...
1,[Bill Oddie è protagonista di un programma del...
2,[I codici postali di Karlsruhe sono 76131-76229.]
3,[La religione di alcuni turchi potrebbe essere...
4,[L'aeroporto Adolfo Suarez Madrid-Barajas è ge...
...,...
995,[L'ALCO RS-3 è stato prodotto tra il maggio 19...
996,[Super Capers è un film di 98 minuti distribui...
997,[It's Great to Be Young è stato curato da Max ...
998,"[Lady Anne Monson, sposata con George Monson, ..."


## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/bleu">Bleu</a>

In [7]:
bleu = load("bleu")

In [8]:
for dataset_name in datasets:
    predictions = []
    for index, row in datasets[dataset_name].iterrows():
        predictions.append(row['predictions'])

    # check if dataset_name contains "en"
    if "en" in dataset_name:
        data_points = en_data_points
    else:
        data_points = it_data_points

    references = []
    for index, row in data_points.iterrows():
        references.append(row['data points'])

    results = bleu.compute(predictions=predictions, references=references)
    print(dataset_name + " BLEU: " + str(results))

meta-llama2_7b_en_2epoch_decoding.csv BLEU: {'bleu': 0.4843774663252121, 'precisions': [0.8232991687515132, 0.5760238836094526, 0.4068123957510315, 0.28532733449637315], 'brevity_penalty': 1.0, 'length_ratio': 1.0479976318349051, 'translation_length': 24782, 'reference_length': 23647}
meta-llama2_7b_it_2epoch_decoding.csv BLEU: {'bleu': 0.41177672647440994, 'precisions': [0.694989346851811, 0.48428800244069864, 0.34640393307430023, 0.24659400544959129], 'brevity_penalty': 1.0, 'length_ratio': 1.1403795400276486, 'translation_length': 27222, 'reference_length': 23871}
t5-large_en_2epoch_decoding.csv BLEU: {'bleu': 0.4857048076613918, 'precisions': [0.8232565983641117, 0.5757108900431376, 0.4074960861957823, 0.2881552273385462], 'brevity_penalty': 1.0, 'length_ratio': 1.003002495031082, 'translation_length': 23718, 'reference_length': 23647}
t5-large_it_2epoch_decoding.csv BLEU: {'bleu': 0.4236152578739689, 'precisions': [0.7299880999603332, 0.5006195786864932, 0.35269280482550625, 0.249

### <a href="https://huggingface.co/spaces/evaluate-metric/bertscore">Bertscore</a>

In [9]:
bertscore = load("bertscore")

In [11]:
for dataset_name in datasets:
    predictions = []
    for index, row in datasets[dataset_name].iterrows():
        predictions.append(row['predictions'])

    # check if dataset_name contains "en"
    if "en" in dataset_name:
        lang = "en"
        data_points = en_data_points
    else:
        lang = "it"
        data_points = it_data_points



    references = []
    for index, row in data_points.iterrows():
        references.append(row['data points'])

    results = bertscore.compute(predictions=predictions, references=references, lang=lang)
    average_precision = np.mean(results['precision'])
    average_recall = np.mean(results['recall'])
    average_f1 = np.mean(results['f1'])

    print(dataset_name + " Bert precision: " + str(average_precision) + " Bert recall: " + str(average_recall) + " Bert f1: " + str(average_f1))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


meta-llama2_7b_en_2epoch_decoding.csv Bert precision: 0.9550041739940643 Bert recall: 0.9520425716638565 Bert f1: 0.9531879550814628
meta-llama2_7b_it_2epoch_decoding.csv Bert precision: 0.8838927245140076 Bert recall: 0.8800082238316536 Bert f1: 0.8810591713786126


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


t5-large_en_2epoch_decoding.csv Bert precision: 0.953766506254673 Bert recall: 0.9484262043833732 Bert f1: 0.9506630224585533
t5-large_it_2epoch_decoding.csv Bert precision: 0.8795806723833084 Bert recall: 0.8745500195026398 Bert f1: 0.8761314076781272


### <a href="https://huggingface.co/spaces/evaluate-metric/chrf">chrF++</a>

In [6]:
chrf = load("chrf")

In [7]:
for dataset_name in datasets:
    predictions = []
    for index, row in datasets[dataset_name].iterrows():
        predictions.append(row['predictions'])

    # check if dataset_name contains "en"
    if "en" in dataset_name:
        data_points = en_data_points
    else:
        data_points = it_data_points

    references = []
    for index, row in data_points.iterrows():
        references.append([row['data points'][0]])

    results = chrf.compute(predictions=predictions, references=references, word_order=2) # word order 2 per calcolare chrF++
    print(dataset_name + " chrF++: " + str(results))

meta-llama2_7b_en_2epoch_decoding.csv chrF++: {'score': 61.592537873211086, 'char_order': 6, 'word_order': 2, 'beta': 2}
meta-llama2_7b_it_2epoch_decoding.csv chrF++: {'score': 57.37054255146911, 'char_order': 6, 'word_order': 2, 'beta': 2}
t5-large_en_2epoch_decoding.csv chrF++: {'score': 60.11124922239467, 'char_order': 6, 'word_order': 2, 'beta': 2}
t5-large_it_2epoch_decoding.csv chrF++: {'score': 56.407829934847356, 'char_order': 6, 'word_order': 2, 'beta': 2}


### <a href="https://huggingface.co/spaces/evaluate-metric/meteor">METEOR</a>

In [8]:
meteor = load('meteor')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
for dataset_name in datasets:
    predictions = []
    for index, row in datasets[dataset_name].iterrows():
        predictions.append(row['predictions'])

    # check if dataset_name contains "en"
    if "en" in dataset_name:
        data_points = en_data_points
    else:
        data_points = it_data_points

    references = []
    for index, row in data_points.iterrows():
        references.append(row['data points'])

    results = meteor.compute(predictions=predictions, references=references)
    print(dataset_name + " METEOR: " + str(results))

meta-llama2_7b_en_2epoch_decoding.csv METEOR: {'meteor': 0.7199273993164659}
meta-llama2_7b_it_2epoch_decoding.csv METEOR: {'meteor': 0.6661720143184371}
t5-large_en_2epoch_decoding.csv METEOR: {'meteor': 0.7053381616756408}
t5-large_it_2epoch_decoding.csv METEOR: {'meteor': 0.6432971686995566}
