In [1]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
from scipy.stats import ttest_rel
import pickle

# evaluation 
from evaluate import load

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
models_generations = {
    'it': {},
}

files = open_files('generations')
for file in files:
    model = re.sub(r"^fine-tuned-|-decoding-gen\d+.csv", "", file)
    gen = re.search(r"gen\d+", file).group()

    model_generations = pd.read_csv('generations/' + file)
    # convert string to list
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)
    
    # if containt -en-
    if '-it-' in file:
        if model not in models_generations['it']:
            models_generations['it'][model] = {}
        models_generations['it'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: it")

model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen0, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen1, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen2, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it-sga - gen: gen0, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it-sga - gen: gen1, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it-sga - gen: gen2, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen2, lang: it
model: Llama-3.1-8B-Instruct-it-sga - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it-sga - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it-sga - gen: gen2, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen0, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen1, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen2, lang: it
model: Minerva-7B-instruct-v1.0-it-sga - gen: gen0, lang: it
model: Minerva-7B-

## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/bleu">Bleu</a>

In [3]:
bleu = load("bleu")

In [5]:
predictions = ["Il Monumento all'11° Fanteria del Mississippi, in Pennsylvania, è classificato come proprietà contribuente. La contea in cui si trova il monumento, Adams, si distingue da altre come ad esempio la contea di Carroll, situata in Maryland, a sud-est di Adams in Pennsylvania."]
references = [
    ["Il monumento dell'11° Fanteria del Mississippi, che rientra nella categoria delle proprietà contribuenti, si trova nella Contea di Adams, in Pennsylvania, negli Stati Uniti. A sud-est della Contea di Adams, Pennsylvania, si trova la Contea di Carroll, Maryland"],
]
results = bleu.compute(predictions=predictions, references=references)
print(results)

{'bleu': 0.15560013076555723, 'precisions': [0.54, 0.24489795918367346, 0.10416666666666667, 0.0425531914893617], 'brevity_penalty': 1.0, 'length_ratio': 1.0869565217391304, 'translation_length': 50, 'reference_length': 46}


In [6]:
bleu_scores = {'it': {}}

In [5]:
# open chrf_scores-exp3.pkl
with open('bleu_scores-exp4.pkl', 'rb') as f:
    bleu_scores = pickle.load(f)

In [6]:
for lang in models_generations:
    for model in models_generations[lang]:
        if not model in bleu_scores[lang]:
            model_bleu_scores = []
            print(f'Language: {lang}, Model: {model}')
            
            for gen in sorted(models_generations[lang][model]):  # Ensure order (gen0, gen1, gen2)
                references = models_generations[lang][model][gen]['actual']
                predictions = models_generations[lang][model][gen]['prediction']

                gen_bleu_scores = []
                for i in tqdm(range(len(references)), desc="Calcolo punteggi BLEU"):
                    results = bleu.compute(predictions=[predictions[i]], references=[references[i]])
                    gen_bleu_scores.append(results["bleu"])

                model_bleu_scores.append(gen_bleu_scores)  # Store BLEU for each generation
            
            # Store the three BLEU scores instead of their mean
            bleu_scores[lang][model] = model_bleu_scores
            print(f'BLEU scores for {model} ({lang}): {model_bleu_scores}')
            print(f'Average: {np.mean(model_bleu_scores)}')
            print()

Language: it, Model: Minerva-7B-instruct-v1.0-it-sga


Calcolo punteggi BLEU:   0%|          | 0/710 [00:00<?, ?it/s]

Calcolo punteggi BLEU: 100%|██████████| 710/710 [00:06<00:00, 105.27it/s]
Calcolo punteggi BLEU: 100%|██████████| 710/710 [00:04<00:00, 151.29it/s]
Calcolo punteggi BLEU: 100%|██████████| 710/710 [00:04<00:00, 154.76it/s]

BLEU scores for Minerva-7B-instruct-v1.0-it-sga (it): [[0.1999535753402219, 0.24463338185200256, 0.12250546203586002, 0.5738732779664825, 0.0, 0.5014201260890806, 0.33132225126113696, 0.0, 0.583485438484936, 0.6119406318088602, 0.21966971312635195, 0.4126749386397027, 0.24084875116214222, 0.2294289967352424, 0.19796195387950594, 0.628154771972103, 0.0, 0.27141077330282, 0.5630828900113662, 0.5071393334426324, 0.195323434722534, 0.46398359087474533, 0.2421824578106704, 0.21492353915888088, 0.25666145410765273, 0.7082522876775705, 0.2984419248027257, 0.167045425494737, 0.223611603064259, 0.25395415605121635, 0.5836635892521324, 0.3884776649966734, 0.6716441932309151, 0.49123068782837886, 0.34141338730348464, 0.518917341258404, 0.4609351508201726, 0.4053747214370277, 0.42563169897761305, 0.0, 0.4110740664660996, 0.08444387874576265, 0.3154861612573669, 0.0, 0.5278926992638239, 0.316227766016838, 0.24452556479589543, 0.0, 0.684209154112886, 0.41302016317250884, 0.3963323090565257, 0.204103




In [7]:
# Save BLEU scores
with open('bleu_scores-exp4.pkl', 'wb') as f:
    pickle.dump(bleu_scores, f)