In [None]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
import pickle

# evaluation 
from evaluate import load

In [None]:
# list the entries of a directory in alphabetical order
def open_files(directory):
    """Return the names of all entries in `directory`, sorted alphabetically.

    Despite the name, nothing is opened: the result is the list of entry
    names reported by `os.listdir` (no path prefix, no filtering), sorted.
    """
    return sorted(os.listdir(directory))

In [None]:
# Directory holding one CSV of model generations per (model, generation run).
GENERATIONS_DIR = 'generations'

# Loaded generations, indexed as models_generations[lang][model][gen] -> DataFrame.
models_generations = {
    'it': {},
}

files = open_files(GENERATIONS_DIR)
for file in files:
    # Only Italian runs (file names containing "-it-") are kept. Checking this
    # first avoids reading and parsing CSVs that would be discarded anyway.
    if '-it-' not in file:
        continue

    # Entries without a "gen<N>" tag (e.g. .DS_Store, .ipynb_checkpoints) are not
    # generation files: skip them instead of failing on `None.group()`.
    gen_match = re.search(r"gen\d+", file)
    if gen_match is None:
        print(f"skipping unrecognised file: {file}")
        continue
    gen = gen_match.group()

    # Model name = file name without the "fine-tuned-" prefix and the
    # "-decoding-gen<N>.csv" suffix (dot escaped so that it matches literally).
    model = re.sub(r"^fine-tuned-|-decoding-gen\d+\.csv", "", file)

    model_generations = pd.read_csv(os.path.join(GENERATIONS_DIR, file))
    # The 'actual' column holds the string form of a list; convert it back to a list.
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)

    models_generations['it'].setdefault(model, {})[gen] = model_generations
    print(f"model: {model} - gen: {gen}, lang: it")

## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/chrf">chrF++</a>

In [None]:
# Load the chrF metric through the `evaluate` library; chrF++ is obtained by
# passing word_order=2 when the metric is computed.
chrf = load("chrf")

In [None]:
# Fresh chrF++ score store, keyed by language and then by model name.
chrf_scores = {'it': {}}

In [None]:
# Resume from previously computed chrF++ scores when a cache file exists.
# On a first run there is no cache yet: the empty `chrf_scores` structure
# initialised earlier is then kept and every model is scored from scratch.
CHRF_CACHE_PATH = 'chrf_scores-exp4.pkl'

if os.path.exists(CHRF_CACHE_PATH):
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were written by this notebook.
    with open(CHRF_CACHE_PATH, 'rb') as f:
        chrf_scores = pickle.load(f)
else:
    print(f'No cached chrF++ scores at {CHRF_CACHE_PATH}; starting from scratch')

In [None]:
def _gen_index(gen):
    """Return the numeric part of a 'gen<N>' key so that gen10 sorts after gen2."""
    return int(gen[len('gen'):])


# Compute chrF++ for every (language, model) pair that is not already scored.
for lang in models_generations:
    # A cache loaded from disk may not contain this language yet; create its slot.
    lang_scores = chrf_scores.setdefault(lang, {})

    for model in models_generations[lang]:
        # Models already present in the (possibly cached) scores are not recomputed.
        if model in lang_scores:
            continue

        model_chrf_scores = []
        print(f'Language: {lang}, Model: {model}')

        # Numeric order (gen0, gen1, ..., gen10) instead of lexicographic order.
        for gen in sorted(models_generations[lang][model], key=_gen_index):
            generations = models_generations[lang][model][gen]
            references = generations['actual']
            predictions = generations['prediction']

            gen_chrf_scores = []
            # Pair rows positionally so the loop does not depend on index labels.
            for prediction, reference in tqdm(
                zip(predictions, references),
                total=len(references),
                desc="Calcolo punteggi CHRF",
            ):
                # Score the prediction against each reference separately
                # (word_order=2 selects chrF++), then average over the references.
                reference_scores = [
                    chrf.compute(predictions=[prediction], references=[actual], word_order=2)['score']
                    for actual in reference
                ]
                gen_chrf_scores.append(np.mean(reference_scores))

            model_chrf_scores.append(gen_chrf_scores)

        # Store one list of per-example scores for each generation run.
        lang_scores[model] = model_chrf_scores
        # NOTE: this prints every per-example score, which can be very long.
        print(f'chrf scores for {model} ({lang}): {model_chrf_scores}')
        print(f'Average: {np.mean(model_chrf_scores)}')
        print()

In [None]:
# Persist the chrF++ scores to disk so that later runs can reuse them
scores_path = 'chrf_scores-exp4.pkl'
with open(scores_path, 'wb') as scores_file:
    pickle.dump(chrf_scores, scores_file)