In [None]:
# utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import ast
import re
import pickle

# evaluation 
from evaluate import load

In [None]:
# List the entries of a directory in sorted order
def open_files(directory):
    """Return the entry names found in `directory`, sorted ascending.

    Despite the name, nothing is opened: the result is the list of bare
    names (not full paths) reported by `os.listdir`, ordered with the
    default string sort.
    """
    return sorted(os.listdir(directory))

In [None]:
# Directory scanned for generation CSVs.
generations_dir = 'generations'

# Generations grouped as: language -> model name -> generation id -> DataFrame.
models_generations = {
    'it': {},
}

files = open_files(generations_dir)
for file in files:
    # Skip stray directory entries (e.g. hidden files, checkpoint folders)
    # that are not CSVs or carry no "gen<N>" tag, instead of crashing on
    # `.group()` of a failed match.
    gen_match = re.search(r"gen\d+", file)
    if gen_match is None or not file.endswith('.csv'):
        print(f"skipping unrecognised file: {file}")
        continue

    # Only files tagged "-it-" are collected; test this before the
    # read/parse so other files cost nothing and cannot raise.
    if '-it-' not in file:
        continue

    gen = gen_match.group()
    # Model name = file name minus the "fine-tuned-" prefix and the
    # "-decoding-gen<N>.csv" suffix (dot escaped, suffix anchored at the end).
    model = re.sub(r"^fine-tuned-|-decoding-gen\d+\.csv$", "", file)

    model_generations = pd.read_csv(os.path.join(generations_dir, file))
    # The 'actual' column holds a Python literal serialized as text
    # (a list, per the original note); parse it back into an object.
    model_generations['actual'] = model_generations['actual'].apply(ast.literal_eval)

    models_generations['it'].setdefault(model, {})[gen] = model_generations
    print(f"model: {model} - gen: {gen}, lang: it")

## Valutazione automatica

### <a href="https://huggingface.co/spaces/evaluate-metric/meteor">METEOR</a>

In [None]:
# Load the METEOR metric via `load` from the `evaluate` library; its
# `compute` method is what the scoring loop further down calls.
# NOTE(review): loading may fetch metric resources on first use — confirm
# behavior in offline environments.
meteor = load('meteor')

In [None]:
# Scores grouped as: language -> model -> list (one item per generation, in
# numeric generation order) of per-sample METEOR scores.
# Built from the languages actually present so that adding a language to
# `models_generations` does not raise KeyError here.
meteor_scores = {lang: {} for lang in models_generations}

for lang in models_generations:
    for model in models_generations[lang]:
        model_meteor_scores = []
        print(f'Language: {lang}, Model: {model}')

        # Sort generation ids numerically ("gen2" before "gen10"); a plain
        # string sort would misorder runs once there are ten or more.
        ordered_gens = sorted(
            models_generations[lang][model],
            key=lambda gen_id: int(gen_id[len('gen'):]),
        )
        for gen in ordered_gens:
            generation_df = models_generations[lang][model][gen]
            references = generation_df['actual']
            predictions = generation_df['prediction']

            # Score each sample on its own; zip pairs rows positionally, so
            # the result does not depend on the DataFrame index labels.
            gen_meteor_scores = []
            for prediction, reference in tqdm(
                zip(predictions, references),
                total=len(references),
                desc="Calcolo punteggi METEOR",
            ):
                results = meteor.compute(predictions=[prediction], references=[reference])
                gen_meteor_scores.append(results["meteor"])

            model_meteor_scores.append(gen_meteor_scores)  # Per-sample METEOR scores for this generation

        # Keep the per-sample scores of every generation (not just their mean)
        meteor_scores[lang][model] = model_meteor_scores

        # Report compact summaries rather than dumping every per-sample score.
        # The overall average is taken over the flattened scores so it also
        # works when generations contain different numbers of samples.
        gen_means = [float(np.mean(scores)) for scores in model_meteor_scores]
        all_scores = [score for scores in model_meteor_scores for score in scores]
        print(f'METEOR mean per generation for {model} ({lang}): {gen_means}')
        print(f'Average: {np.mean(all_scores)}')
        print()

In [None]:
# Serialize the `meteor_scores` dict to a pickle file in the working directory
with open('meteor_scores-exp3.pkl', mode='wb') as pickle_file:
    pickle.dump(meteor_scores, pickle_file)