In [1]:
import pandas as pd
import os
from glob import glob
from collections import defaultdict
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import re

In [2]:
def open_files(directory):
    """Return the sorted names of the regular files found in ``directory``.

    Parameters
    ----------
    directory : str or path-like
        Directory to list.

    Returns
    -------
    list of str
        Bare file names (not joined with ``directory``), sorted ascending.
        Sub-directories such as Jupyter's ``.ipynb_checkpoints`` are skipped,
        because the returned names are meant to be opened as files.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``directory`` cannot be listed.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )

In [3]:
def round_sig(x, sig=3):
    """Round a number to ``sig`` significant digits.

    ``float`` and ``int`` values (``bool`` included, since it subclasses
    ``int``) are returned as a ``float``; any other value is handed back
    unchanged.
    """
    if not isinstance(x, (float, int)):
        return x
    # The "g" presentation type rounds to significant digits; parsing the
    # resulting text gives the rounded value back as a float.
    return float(format(x, f".{sig}g"))

def format_report(report_dict):
    """Return a copy of a classification report with its metrics rounded.

    Parameters
    ----------
    report_dict : dict
        Mapping of label -> metrics dict, plus scalar entries such as
        ``'accuracy'``.

    Returns
    -------
    dict
        A new dict with the same keys. Every metric is passed through
        ``round_sig`` except ``'support'``: it is a sample count, so rounding
        it to three significant digits would misreport it (12345 -> 12300).
        ``report_dict`` itself is not modified.
    """
    rounded = {}
    for label, metrics in report_dict.items():
        if isinstance(metrics, dict):
            rounded[label] = {
                name: value if name == 'support' else round_sig(value)
                for name, value in metrics.items()
            }
        else:
            # Scalar entries such as 'accuracy'.
            rounded[label] = round_sig(metrics)
    return rounded

def print_formatted_report(report_dict):
    """Print a classification report as a fixed-width text table.

    One row is printed per class label (in the dict's own order), followed by
    the ``'macro avg'`` and ``'weighted avg'`` rows and a final ``Accuracy``
    line whose value sits in the last column.

    Parameters
    ----------
    report_dict : dict
        Must contain ``'accuracy'``, ``'macro avg'`` and ``'weighted avg'``;
        every row entry needs ``'precision'``, ``'recall'``, ``'f1-score'``
        and ``'support'``.
    """
    summary_labels = ['macro avg', 'weighted avg']
    non_class_keys = ('accuracy', 'macro avg', 'weighted avg')

    print("\n📈 Report di classificazione (precision, recall, f1-score, support):")

    # Everything that is not a summary entry is treated as a class label.
    class_labels = []
    for key in report_dict:
        if key in non_class_keys:
            continue
        class_labels.append(key)

    header = f"{'Label':<20} {'Prec':>8} {'Rec':>8} {'F1':>8} {'Support':>8}"
    print(header)
    print(len(header) * "-")

    for name in class_labels + summary_labels:
        metrics = report_dict[name]
        print(f"{name:<20} {metrics['precision']:>8.3f} {metrics['recall']:>8.3f} {metrics['f1-score']:>8.3f} {metrics['support']:>8.0f}")

    accuracy = report_dict['accuracy']
    print(f"{'Accuracy':<20} {'':>8} {'':>8} {'':>8} {accuracy:>8.3f}")

def calculate_metrics(df, print_confusion=False):
    """Build a scikit-learn classification report from a merged frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``prediction`` and an ``actual`` column. Two
        string-typed helper columns, ``extracted_prediction`` and
        ``rhetorical_figure``, are added to it in place.
    print_confusion : bool, default False
        When true, print an actual-vs-predicted crosstab before the report is
        computed.

    Returns
    -------
    dict
        The output of ``classification_report(..., output_dict=True,
        zero_division=0)``.

    Notes
    -----
    Rows whose ``prediction`` or ``actual`` is missing are left out of both
    the crosstab and the report; otherwise ``astype(str)`` would turn them
    into a spurious ``'nan'`` class. A warning reports how many rows were
    excluded so the reduced support does not go unnoticed.
    """
    import warnings

    df['extracted_prediction'] = df['prediction'].astype(str)
    df['rhetorical_figure'] = df['actual'].astype(str)

    # Missingness must be read from the source columns: after astype(str) a
    # NaN is just the text 'nan' and can no longer be detected.
    complete = df['prediction'].notna() & df['actual'].notna()
    n_missing = int((~complete).sum())
    if n_missing:
        warnings.warn(
            f"calculate_metrics: excluded {n_missing} row(s) with a missing "
            "prediction or actual value"
        )
    df_clean = df[complete]

    if print_confusion:
        print("\n🧩 Matrice di confusione:")
        confusion = pd.crosstab(df_clean['rhetorical_figure'], df_clean['extracted_prediction'],
                                rownames=['Actual'], colnames=['Predicted'])
        print(confusion)

    report = classification_report(df_clean['rhetorical_figure'], df_clean['extracted_prediction'], output_dict=True, zero_division=0)
    return report

def average_reports(reports):
    """Average several classification-report dicts entry by entry.

    Parameters
    ----------
    reports : list of dict
        Report dicts: label -> metrics dict, plus scalar entries such as
        ``'accuracy'``.

    Returns
    -------
    dict
        Same layout as a single report, each value being the ``np.mean`` of
        the corresponding values. An empty ``reports`` list yields ``{}``.

    Notes
    -----
    * Keys are the union over all reports in first-seen order, so a label that
      is missing from the first report is still averaged.
    * A label row is averaged only over the reports that contain that label;
      a metric missing inside such a row counts as 0.0.
    * A scalar entry missing from a report counts as 0.0.
    """
    if not reports:
        return {}

    # dict.fromkeys de-duplicates while preserving first-seen order.
    keys = dict.fromkeys(key for report in reports for key in report)

    avg_report = {}
    for key in keys:
        present = [report[key] for report in reports if key in report]
        if isinstance(present[0], dict):
            metrics = dict.fromkeys(name for entry in present for name in entry)
            avg_report[key] = {
                metric: np.mean([entry.get(metric, 0.0) for entry in present])
                for metric in metrics
            }
        else:  # scalar entry, e.g. accuracy
            avg_report[key] = np.mean([report.get(key, 0.0) for report in reports])

    return avg_report


In [7]:
model_files = open_files('models_generations')
actual_files = open_files('actuals')

# model name -> list of raw report dicts (one per actuals file), kept so the
# reports can be aggregated later, e.g. with average_reports().
models = {}
for model_file in model_files:
    # Derive the model name by stripping an optional "fine-tuned-" prefix and
    # a trailing "-decoding-<n>.csv" suffix from the file name.
    model_name = re.sub(r"^(fine-tuned-)?|-decoding-\d+\.csv$", "", model_file)
    print(f"Processing model: {model_name}")

    if 'qwen' in model_name.lower():
        print(f"Skipping model '{model_name}' as it contains 'qwen'.")
        continue

    model = pd.read_csv(os.path.join('models_generations', model_file))

    reports = []
    for actual_file in actual_files:
        actual = pd.read_csv(os.path.join('actuals', actual_file))

        # Left-join the gold labels with the predictions: the "index" column
        # of `actual` is matched against the row index of `model`.
        merged = pd.merge(actual, model, left_on='index', right_index=True, how='left')

        # Snapshot of the joined frame for inspection.
        # NOTE(review): the path is the same on every iteration, so only the
        # last (model, actuals) pair survives on disk — confirm this is intended.
        merged_file = "merged_output.csv"
        merged.to_csv(merged_file, index=False)

        print(f"\n📊 Report per il modello '{model_name}' con file '{actual_file}':")
        report = calculate_metrics(merged, print_confusion=False)
        reports.append(report)
        formatted_report = format_report(report)
        print_formatted_report(formatted_report)

    # Several files can map to the same model name once the prefix/suffix are
    # stripped, so accumulate instead of overwriting.
    models.setdefault(model_name, []).extend(reports)

Processing model: LLaMAntino-3-ANITA-8B-Inst-DPO-ITA_predictions.csv

📊 Report per il modello 'LLaMAntino-3-ANITA-8B-Inst-DPO-ITA_predictions.csv' con file 'female_actual_labels.csv':

📈 Report di classificazione (precision, recall, f1-score, support):
Label                    Prec      Rec       F1  Support
--------------------------------------------------------
ANALOGY                 0.308    0.414    0.353       29
CONTEXT SHIFT           0.356    0.232    0.281       69
EUPHEMISM               0.167    0.111    0.133        9
FALSE ASSERTION         0.143    0.062    0.087       16
HYPERBOLE               0.600    0.214    0.316       14
OTHER                   0.273    0.257    0.265       35
OXYMORON                0.022    0.333    0.041        3
RHETORICAL QUESTION     0.450    0.346    0.391       26
macro avg               0.290    0.246    0.233      201
weighted avg            0.333    0.259    0.279      201
Accuracy                                           0.259

📊 Rep