In [1]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
import pickle

# evaluation 
from evaluate import load

In [2]:
# open files in generations directory
def open_files(directory):
    files = os.listdir(directory)
    files.sort()
    return files

In [3]:
models_generations = {
    'it': {},
}

files = open_files('generations')
for file in files:
    model = re.sub(r"^fine-tuned-|-decoding-gen\d+.csv", "", file)
    gen = re.search(r"gen\d+", file).group()

    model_generations = pd.read_csv('generations/' + file)
    # convert string to list
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)
    
    # if containt -en-
    if '-it-' in file:
        if model not in models_generations['it']:
            models_generations['it'][model] = {}
        models_generations['it'][model][gen] = model_generations
        print(f"model: {model} - gen: {gen}, lang: it")

model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen0, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen1, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it - gen: gen2, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it-sga - gen: gen0, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it-sga - gen: gen1, lang: it
model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA-it-sga - gen: gen2, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it - gen: gen2, lang: it
model: Llama-3.1-8B-Instruct-it-sga - gen: gen0, lang: it
model: Llama-3.1-8B-Instruct-it-sga - gen: gen1, lang: it
model: Llama-3.1-8B-Instruct-it-sga - gen: gen2, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen0, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen1, lang: it
model: Minerva-7B-instruct-v1.0-it - gen: gen2, lang: it
model: Minerva-7B-instruct-v1.0-it-sga - gen: gen0, lang: it
model: Minerva-7B-

## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/chrf">chrF++</a>

In [4]:
chrf = load("chrf")

Using the latest cached version of the module from C:\Users\OliverioM\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--chrf\d244bab9383988714085a8dacc4871986d9f025398581c33d6b2ee22836b4069 (last modified on Wed Sep  4 18:15:30 2024) since it couldn't be found locally at evaluate-metric--chrf, or remotely on the Hugging Face Hub.


In [5]:
chrf_scores = {'it': {}}

In [5]:
# open chrf_scores-exp3.pkl
with open('chrf_scores-exp4.pkl', 'rb') as f:
    chrf_scores = pickle.load(f)

In [6]:
for lang in models_generations:
    for model in models_generations[lang]:
        if model not in chrf_scores[lang]:
            model_chrf_scores = []
            print(f'Language: {lang}, Model: {model}')
            
            for gen in sorted(models_generations[lang][model]):  # Ensure order (gen0, gen1, gen2)
                references = models_generations[lang][model][gen]['actual']
                predictions = models_generations[lang][model][gen]['prediction']

                gen_chrf_scores = []
                for i in tqdm(range(len(references)), desc="Calcolo punteggi CHRF"):
                    reference_scores = []
                    reference = references[i]
                    for j in range(len(reference)):
                        actual = reference[j]
                        result = chrf.compute(predictions=[predictions[i]], references=[actual], word_order=2)
                        reference_scores.append(result['score'])
                    
                    gen_chrf_scores.append(np.mean(reference_scores))
                
                model_chrf_scores.append(gen_chrf_scores)

            # Store the three chrf scores instead of their mean
            chrf_scores[lang][model] = model_chrf_scores
            print(f'chrf scores for {model} ({lang}): {model_chrf_scores}')
            print(f'Average: {np.mean(model_chrf_scores)}')
            print()

Language: it, Model: Minerva-7B-instruct-v1.0-it-sga


Calcolo punteggi CHRF: 100%|██████████| 710/710 [00:13<00:00, 51.12it/s]
Calcolo punteggi CHRF: 100%|██████████| 710/710 [00:15<00:00, 46.36it/s]
Calcolo punteggi CHRF: 100%|██████████| 710/710 [00:13<00:00, 51.35it/s]

chrf scores for Minerva-7B-instruct-v1.0-it-sga (it): [[44.363387152179655, 50.32894045232846, 23.73077986823019, 65.95495091206804, 45.45929306135131, 53.52737509014684, 59.43623244222889, 32.99447909796942, 64.51853741661715, 69.88479702523223, 37.725065864396925, 58.32385778520784, 39.652496971644005, 30.422162595618165, 35.80060667357092, 71.98731871142128, 50.36871105588627, 46.91780597767241, 71.01959177191858, 61.44629347544586, 55.22730679371638, 52.69979981159793, 27.815162513094375, 47.19641241203579, 57.151802534633, 75.86832580759788, 57.05667934126654, 43.63004691084845, 44.51153255251595, 33.122749257727, 73.61461600805183, 65.27829038709113, 70.4453002436939, 65.83596194358873, 42.5237881836157, 59.45775685493337, 58.71288525141838, 45.710739048096734, 56.01055228829269, 36.93185754504342, 55.00992890223427, 36.06490369280555, 49.91868959899091, 31.080950080736777, 52.970409030084824, 50.86688287881838, 53.22918569471247, 46.500070407450664, 72.97422760380233, 50.5981353




In [8]:
# Save chrF++ scores
with open('chrf_scores-exp4.pkl', 'wb') as f:
    pickle.dump(chrf_scores, f)