In [1]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast

# evaluation 
from evaluate import load

In [2]:
# list the entries of a directory (used for the generations folder)
def open_files(directory):
    """Return the names of the entries in `directory`, sorted ascending.

    Despite its name, this function opens nothing: it only lists the
    directory and returns bare entry names (not full paths).
    """
    return sorted(os.listdir(directory))

In [12]:
# Load every generation file into a DataFrame, keyed by its file name.
# The directory is named once and joined with os.path.join instead of being
# repeated as a literal and glued to the file name by string concatenation.
generations_dir = 'generations'

files = open_files(generations_dir)
datasets = {
    file: pd.read_csv(os.path.join(generations_dir, file))
    for file in files
}
files

['llama2_7b_it_2epoch_decoding.csv',
 'llama2_7b_it_sa_2epoch_decoding.csv',
 'llamantino2_7b_it_2epoch_decoding.csv',
 'llamantino2_7b_it_sa_2epoch_decoding.csv']

In [4]:
# Read the reference table, then turn each 'data points' cell from its
# textual form back into a Python object with ast.literal_eval
# (the displayed values are bracketed lists of reference sentences).
data_points = pd.read_csv('data_points.csv')
data_points['data points'] = data_points['data points'].map(ast.literal_eval)
data_points

Unnamed: 0,data points
0,[Un monumento all'11° Fanteria del Mississippi...
1,[Bill Oddie è protagonista di un programma del...
2,[I codici postali di Karlsruhe sono 76131-76229.]
3,[La religione di alcuni turchi potrebbe essere...
4,[L'aeroporto Adolfo Suarez Madrid-Barajas è ge...
...,...
995,[L'ALCO RS-3 è stato prodotto tra il maggio 19...
996,[Super Capers è un film di 98 minuti distribui...
997,[It's Great to Be Young è stato curato da Max ...
998,"[Lady Anne Monson, sposata con George Monson, ..."


## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/bleu">BLEU</a>

In [5]:
# Load the BLEU metric through `evaluate.load`; used by the BLEU cells below.
bleu = load("bleu")

In [15]:
# Corpus-level BLEU for every loaded generation file.
# The references are the same for every file, so they are built once here
# rather than on each loop iteration. Each entry is the full list of
# reference sentences for one data point.
references = data_points['data points'].tolist()

for dataset_name, generations in datasets.items():
    # Predictions are paired with references purely by row position.
    predictions = generations['predictions'].tolist()

    # Fail early, naming the file, if the two sides are not aligned.
    if len(predictions) != len(references):
        raise ValueError(
            f"{dataset_name}: {len(predictions)} predictions but "
            f"{len(references)} reference entries"
        )

    results = bleu.compute(predictions=predictions, references=references)
    print(dataset_name + " BLEU: " + str(results))

llama2_7b_it_2epoch_decoding.csv BLEU: {'bleu': 0.4146736218750292, 'precisions': [0.7111454641429756, 0.4906747120131651, 0.3478509093874888, 0.24360173454638212], 'brevity_penalty': 1.0, 'length_ratio': 1.111055255330736, 'translation_length': 26522, 'reference_length': 23871}
llama2_7b_it_sa_2epoch_decoding.csv BLEU: {'bleu': 0.4068222298130507, 'precisions': [0.7233789411064843, 0.48717736939913275, 0.3365496446263192, 0.2309492730791736], 'brevity_penalty': 1.0, 'length_ratio': 1.0563026266180722, 'translation_length': 25215, 'reference_length': 23871}
llamantino2_7b_it_2epoch_decoding.csv BLEU: {'bleu': 0.4278339941515406, 'precisions': [0.7136303093088648, 0.5000384201629015, 0.3617148793351446, 0.259572165806559], 'brevity_penalty': 1.0, 'length_ratio': 1.1322525239830756, 'translation_length': 27028, 'reference_length': 23871}
llamantino2_7b_it_sa_2epoch_decoding.csv BLEU: {'bleu': 0.44364521134171725, 'precisions': [0.7423611381417617, 0.5217285650641934, 0.37457262251487905,

In [10]:
# NOTE(review): this cell repeats the BLEU evaluation of the earlier BLEU
# cell; only the recorded output differs. Consider keeping a single copy.
#
# The references are the same for every file, so they are built once here
# rather than on each loop iteration. Each entry is the full list of
# reference sentences for one data point.
references = data_points['data points'].tolist()

for dataset_name, generations in datasets.items():
    # Predictions are paired with references purely by row position.
    predictions = generations['predictions'].tolist()

    # Fail early, naming the file, if the two sides are not aligned.
    if len(predictions) != len(references):
        raise ValueError(
            f"{dataset_name}: {len(predictions)} predictions but "
            f"{len(references)} reference entries"
        )

    results = bleu.compute(predictions=predictions, references=references)
    print(dataset_name + " BLEU: " + str(results))

llama2_7b_it_2epoch_decoding.csv BLEU: {'bleu': 0.4146736218750292, 'precisions': [0.7111454641429756, 0.4906747120131651, 0.3478509093874888, 0.24360173454638212], 'brevity_penalty': 1.0, 'length_ratio': 1.111055255330736, 'translation_length': 26522, 'reference_length': 23871}
llama2_7b_it_sa_2epoch_decoding.csv BLEU: {'bleu': 0.41349389542955556, 'precisions': [0.7288240641269317, 0.49383673821222307, 0.3429562241037651, 0.23682722216985577], 'brevity_penalty': 1.0, 'length_ratio': 1.0138661974781116, 'translation_length': 24202, 'reference_length': 23871}
llamantino2_7b_it_2epoch_decoding.csv BLEU: {'bleu': 0.4278339941515406, 'precisions': [0.7136303093088648, 0.5000384201629015, 0.3617148793351446, 0.259572165806559], 'brevity_penalty': 1.0, 'length_ratio': 1.1322525239830756, 'translation_length': 27028, 'reference_length': 23871}
llamantino2_7b_it_sa_2epoch_decoding.csv BLEU: {'bleu': 0.44364521134171725, 'precisions': [0.7423611381417617, 0.5217285650641934, 0.3745726225148790

### <a href="https://huggingface.co/spaces/evaluate-metric/bertscore">BERTScore</a>

In [9]:
# Load the BERTScore metric through `evaluate.load`; used by the next cell.
bertscore = load("bertscore")

In [10]:
# BERTScore (Italian, lang="it") for every loaded generation file, reported
# as the mean precision / recall / F1 over all rows.
# The references are the same for every file, so they are built once here
# rather than on each loop iteration. Each entry is the full list of
# reference sentences for one data point.
references = data_points['data points'].tolist()

for dataset_name, generations in datasets.items():
    # Predictions are paired with references purely by row position.
    predictions = generations['predictions'].tolist()

    # Fail early, naming the file, if the two sides are not aligned.
    if len(predictions) != len(references):
        raise ValueError(
            f"{dataset_name}: {len(predictions)} predictions but "
            f"{len(references)} reference entries"
        )

    results = bertscore.compute(predictions=predictions, references=references, lang="it")

    # The metric returns one score per prediction; average them per file.
    average_precision = np.mean(results['precision'])
    average_recall = np.mean(results['recall'])
    average_f1 = np.mean(results['f1'])

    print(dataset_name + " Bert precision: " + str(average_precision) + " Bert recall: " + str(average_recall) + " Bert f1: " + str(average_f1))

llama2_7b_it_2epoch_decoding.csv Bert precision: 0.8833115850687027 Bert recall: 0.8808526495099067 Bert f1: 0.8812159091830254
llama2_7b_it_sa_1shot_2epoch_decoding.csv Bert precision: 0.8772746261358261 Bert recall: 0.864955677986145 Bert f1: 0.8699512004852294
llama2_7b_it_sa_2epoch_decoding.csv Bert precision: 0.8731846770644188 Bert recall: 0.8642561140656472 Bert f1: 0.8675015712976456
llamantino2_7b_it_2epoch_decoding.csv Bert precision: 0.8864110988378525 Bert recall: 0.882094022333622 Bert f1: 0.883266131401062
llamantino2_7b_it_sa_2epoch_decoding.csv Bert precision: 0.8819521244168281 Bert recall: 0.875715501010418 Bert f1: 0.8778355576992035


### <a href="https://huggingface.co/spaces/evaluate-metric/chrf">chrF++</a>

In [11]:
# Load the chrF metric through `evaluate.load`; used by the next cell.
chrf = load("chrf")

In [12]:
# chrF++ for every loaded generation file.
# Unlike the other metric cells, only the FIRST reference of each data point
# is used here, so these scores are single-reference.
# NOTE(review): presumably done because the chrF implementation wants the same
# number of references for every prediction - confirm before comparing these
# scores with the multi-reference metrics.
# The references are the same for every file, so they are built once here
# rather than on each loop iteration.
references = [[candidates[0]] for candidates in data_points['data points']]

for dataset_name, generations in datasets.items():
    # Predictions are paired with references purely by row position.
    predictions = generations['predictions'].tolist()

    # Fail early, naming the file, if the two sides are not aligned.
    if len(predictions) != len(references):
        raise ValueError(
            f"{dataset_name}: {len(predictions)} predictions but "
            f"{len(references)} reference entries"
        )

    results = chrf.compute(predictions=predictions, references=references, word_order=2)  # word_order=2 selects chrF++
    print(dataset_name + " chrF++: " + str(results))

llama2_7b_it_2epoch_decoding.csv chrF++: {'score': 57.452247991643155, 'char_order': 6, 'word_order': 2, 'beta': 2}
llama2_7b_it_sa_1shot_2epoch_decoding.csv chrF++: {'score': 53.432035355745896, 'char_order': 6, 'word_order': 2, 'beta': 2}
llama2_7b_it_sa_2epoch_decoding.csv chrF++: {'score': 54.06440295862962, 'char_order': 6, 'word_order': 2, 'beta': 2}
llamantino2_7b_it_2epoch_decoding.csv chrF++: {'score': 58.239203118023816, 'char_order': 6, 'word_order': 2, 'beta': 2}
llamantino2_7b_it_sa_2epoch_decoding.csv chrF++: {'score': 58.09260870276559, 'char_order': 6, 'word_order': 2, 'beta': 2}


### <a href="https://huggingface.co/spaces/evaluate-metric/meteor">METEOR</a>

In [13]:
# Load the METEOR metric through `evaluate.load`; the recorded output shows
# NLTK checking/downloading the wordnet, punkt and omw-1.4 packages.
meteor = load('meteor')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OliverioM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OliverioM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\OliverioM\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [14]:
# METEOR for every loaded generation file.
# The references are the same for every file, so they are built once here
# rather than on each loop iteration. Each entry is the full list of
# reference sentences for one data point.
references = data_points['data points'].tolist()

for dataset_name, generations in datasets.items():
    # Predictions are paired with references purely by row position.
    predictions = generations['predictions'].tolist()

    # Fail early, naming the file, if the two sides are not aligned.
    if len(predictions) != len(references):
        raise ValueError(
            f"{dataset_name}: {len(predictions)} predictions but "
            f"{len(references)} reference entries"
        )

    results = meteor.compute(predictions=predictions, references=references)
    print(dataset_name + " METEOR: " + str(results))

llama2_7b_it_2epoch_decoding.csv METEOR: {'meteor': 0.6651442457847393}
llama2_7b_it_sa_1shot_2epoch_decoding.csv METEOR: {'meteor': 0.6252816799181122}
llama2_7b_it_sa_2epoch_decoding.csv METEOR: {'meteor': 0.6283713666832174}
llamantino2_7b_it_2epoch_decoding.csv METEOR: {'meteor': 0.6760152549862166}
llamantino2_7b_it_sa_2epoch_decoding.csv METEOR: {'meteor': 0.6655893150817916}
