# Imports

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import torch
from tqdm import tqdm
import re
import gc
import csv


  from .autonotebook import tqdm as notebook_tqdm


# 🔍 Prometheus LLM-as-a-Judge

## Load Datasets

In [None]:
num_samples = 15  # Number of samples to process

# === 1. Load the data ===
# .head(num_samples) limits every file to its first `num_samples` rows.
df_base = pd.read_csv("inputs/dataset_human_eval.csv").head(num_samples)
df_mistral = pd.read_csv("outputs/dataset_with_mistral_translations.csv").head(num_samples)
df_mt5 = pd.read_csv("outputs/dataset_with_mT5_translations.csv").head(num_samples)
df_tinyllama = pd.read_csv("outputs/dataset_with_tinyllama_translations.csv").head(num_samples)
df_nnlb = pd.read_csv("outputs/dataset_with_translation_NNLB.csv").head(num_samples)

# Attach every model's OWN translation and human score to df_base.
# The mt5 and tinyllama columns were previously copied from df_mistral, so the
# judge scored the Mistral translations three times.
for model_key, df_model in (
    ("mistral", df_mistral),
    ("mt5", df_mt5),
    ("tinyllama", df_tinyllama),
    ("nnlb", df_nnlb),
):
    df_base[model_key] = df_model["generated_translation"]
    # NOTE(review): only the mistral and NNLB files are known (from this
    # notebook) to carry "score_human"; when a file lacks it the column is
    # kept but left empty rather than borrowing another model's scores.
    if "score_human" in df_model.columns:
        df_base[f"{model_key}HS"] = df_model["score_human"]
    else:
        df_base[f"{model_key}HS"] = pd.NA

## Load Model

In [None]:
# === 2. Load the Prometheus judge model ===
model_name = "prometheus-eval/prometheus-7b-v2.0"
# Alternative judge checkpoint:
#model_name = "Unbabel/M-Prometheus-3B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Loading options: float16 weights, automatic device placement, and an
# on-disk offload folder for whatever does not fit on the device.
model_load_kwargs = {
    "trust_remote_code": True,
    "device_map": "auto",
    "torch_dtype": torch.float16,
    "offload_folder": "offload_prometheus",
    "offload_buffers": True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

# Put the module in evaluation mode (inference only).
model.eval()

## Prompt

In [None]:

# === 3. Rubric and prompt ===

# Scoring rubric injected verbatim into every judge prompt.
rubric_data = dict(
    criteria="Semantic fidelity of the translation",
    score1="Completely wrong, the meaning is unrecognizable.",
    score2="Severe meaning errors or omissions or explanations.",
    score3="Some inaccuracies, but the general meaning is conveyed.",
    score4="Good fidelity, minor non-substantial differences.",
    score5="Perfectly faithful to the original meaning.",
)


def build_judge_prompt(original, human_translation, model_translation):
    """Build the judge prompt for one translation.

    Args:
        original: Source sentence.
        human_translation: Reference (human) translation.
        model_translation: Candidate translation to be scored.

    Returns:
        The prompt text, which ends with the ``Score (1–5):`` header so the
        judge's continuation starts with the score. The text has no leading
        or trailing newline.
    """
    rubric_lines = [rubric_data["criteria"]]
    for level in range(1, 6):
        description = rubric_data[f"score{level}"]
        rubric_lines.append(f"Score {level}: {description}")

    prompt_lines = [
        "You are a translation evaluator. Your task is to assign a score from 1 to 5 "
        "that reflects how well the model's translation preserves the original meaning.",
        "",
        "Original:",
        f"{original}",
        "",
        "Human:",
        f"{human_translation}",
        "",
        "Model:",
        f"{model_translation}",
        "",
        "### Score Rubrics:",
        *rubric_lines,
        "",
        "Assess the semantic fidelity objectively. Assign the appropriate score based on the rubric.",
        "",
        "Score (1–5):",
    ]
    return "\n".join(prompt_lines)

# "Score (1-5):" header (hyphen or en dash) followed by a single digit in the
# rubric range. The negative lookahead rejects multi-digit answers such as
# "10", which would otherwise be read as a score of 1.
_SCORE_PATTERN = re.compile(r"Score\s*\(1[\-–]5\):\s*([1-5])(?!\d)")


def estrai_punteggio(output):
    """Extract the judge's 1-5 score from the decoded model output.

    Args:
        output: Text containing the ``Score (1–5):`` header followed by the
            judge's answer.

    Returns:
        The score as an int in 1..5, or ``None`` when no header is followed
        by a valid in-range score (e.g. the answer is 0, 7, 10 or text).
    """
    match = _SCORE_PATTERN.search(output)
    return int(match.group(1)) if match else None


Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.54it/s]
Some parameters are on the meta device because they were offloaded to the cpu.


## Run Prometheus

In [None]:
# === 4. Evaluation ===
score_as = []  # judge scores for the "mistral" column
score_bs = []  # judge scores for the "nnlb" column
score_cs = []  # judge scores for the "tinyllama" column
score_ds = []  # judge scores for the "mt5" column
winners = []


def _pick_winner(scores_by_model):
    """Return the label of the strictly best-scoring model for one row.

    Args:
        scores_by_model: Mapping of winner label -> judge score (or None).

    Returns:
        "?" when any score is missing, "=" when the top score is shared,
        otherwise the label of the single model with the highest score.
    """
    values = list(scores_by_model.values())
    # Every model must have a score: comparing None with an int raises
    # TypeError, which the earlier check on only the first two scores allowed.
    if any(value is None for value in values):
        return "?"
    best = max(values)
    leaders = [label for label, value in scores_by_model.items() if value == best]
    return leaders[0] if len(leaders) == 1 else "="


# (winner label, df_base column holding the translation, list collecting scores)
candidates = [
    ("Mistral", "mistral", score_as),
    ("Nnlb", "nnlb", score_bs),
    ("TinyLlama", "tinyllama", score_cs),
    ("Mt5", "mt5", score_ds),
]

for _, row in tqdm(df_base.iterrows(), total=len(df_base)):
    row_scores = {}
    for winner_label, column, score_list in candidates:
        prompt = build_judge_prompt(row["Sentence"], row["HumanEval"], row[column])
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
        with torch.no_grad():
            # The prompt ends with "Score (1–5):", so a couple of greedy
            # tokens are enough to obtain the digit.
            output_ids = model.generate(**inputs, max_new_tokens=2, do_sample=False)
        # The whole sequence is decoded: estrai_punteggio looks for the
        # "Score (1–5):" header, which is part of the prompt.
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        score = estrai_punteggio(output_text)
        score_list.append(score)
        row_scores[winner_label] = score

        # Free memory between generations
        del inputs, output_ids
        torch.cuda.empty_cache()
        gc.collect()

    winners.append(_pick_winner(row_scores))

# === 5. Saving ===
df_base["score_a"] = score_as
df_base["score_b"] = score_bs
df_base["score_c"] = score_cs
df_base["score_d"] = score_ds
df_base["winner"] = winners

full_results_path = "outputs/prometheus_eval.csv"
df_base.to_csv(full_results_path, index=False, quoting=csv.QUOTE_MINIMAL)
print("✅ File salvato in outputs/prometheus_eval.csv")


# Build one frame per model, exposing the same generic column names
# ("translation", "score") for every model.
df_mistral = df_base[["Sentence", "HumanEval", "mistral", "score_a", "winner"]].rename(columns={"mistral": "translation", "score_a": "score"})
df_nnlb    = df_base[["Sentence", "HumanEval", "nnlb", "score_b", "winner"]].rename(columns={"nnlb": "translation", "score_b": "score"})
df_tiny    = df_base[["Sentence", "HumanEval", "tinyllama", "score_c", "winner"]].rename(columns={"tinyllama": "translation", "score_c": "score"})
df_mt5     = df_base[["Sentence", "HumanEval", "mt5", "score_d", "winner"]].rename(columns={"mt5": "translation", "score_d": "score"})

# Destination of every per-model file. Paths are unchanged.
# NOTE(review): only the mistral file goes to outputs/; the others are written
# to the working directory, and the ".jsonl" files contain CSV text, not JSON
# Lines. Confirm the intended location/format before changing them.
per_model_outputs = [
    (df_mistral, "outputs/salmonators-hw2_transl-judge_mistral.jsonl.csv"),
    (df_nnlb, "salmonators-hw2_transl-judge_nnlb.jsonl"),
    (df_tiny, "salmonators-hw2_transl-judge_tinyllama.jsonl"),
    (df_mt5, "salmonators-hw2_transl-judge_mt5.jsonl"),
]

# Write the files
for frame, path in per_model_outputs:
    frame.to_csv(path, index=False, quoting=csv.QUOTE_MINIMAL)

# The summary is generated from the real destinations so it cannot claim a
# directory the files were not written to.
print("✅ File salvati:")
print(f"- {full_results_path} (completo)")
for _, path in per_model_outputs:
    print(f"- {path}")


NameError: name 'tqdm' is not defined

# 📊 Calcolo concordanza

In [8]:
def evaluate_concordance(y_true, y_pred):
    """Print the agreement (Cohen's kappa) between human and judge scores.

    Pairs in which either score is missing (None/NaN, e.g. a judge answer
    that could not be parsed and was saved as an empty CSV cell) are dropped
    before anything is computed, because ``cohen_kappa_score`` rejects NaN.

    Args:
        y_true: Human scores, one per sample.
        y_pred: Judge scores, aligned position by position with ``y_true``.

    Returns:
        None. Results are printed; kappa is skipped when either side has
        fewer than two distinct values.

    Raises:
        ValueError: If the two inputs do not have the same length.
    """
    human_all = list(y_true)
    judge_all = list(y_pred)
    if len(human_all) != len(judge_all):
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"({len(human_all)} != {len(judge_all)})"
        )

    # Keep only the pairs where both raters produced a score.
    kept = [
        (h, j)
        for h, j in zip(human_all, judge_all)
        if not (pd.isna(h) or pd.isna(j))
    ]
    dropped = len(human_all) - len(kept)
    if dropped:
        print(f"⚠️ Ignorate {dropped} coppie con punteggio mancante.")
    human = [h for h, _ in kept]
    judge = [j for _, j in kept]

    print("== Valori unici ==")
    print("Umano:", sorted(set(human)))
    print("Modello:", sorted(set(judge)))

    if len(set(human)) < 2 or len(set(judge)) < 2:
        print("⚠️ Non abbastanza variabilità per calcolare Cohen's Kappa.")
        return

    print("\n== Cohen’s Kappa ==")
    print(f"{cohen_kappa_score(human, judge):.3f}")



In [9]:
# Reload the saved judge results and report human/judge agreement for the
# models that have both a human-score column and a judge-score column.
df_base = pd.read_csv("outputs/prometheus_eval.csv")
for title, human_column, judge_column in (
    ("MISTRAL", "mistralHS", "score_a"),
    ("NNLB", "nnlbHS", "score_b"),
):
    print(title)
    evaluate_concordance(df_base[human_column], df_base[judge_column])


MISTRAL
== Valori unici ==
Umano: [3, 4, 5]
Modello: [3, 5]

== Cohen’s Kappa ==
0.574
NNLB
== Valori unici ==
Umano: [2, 3, 4, 5]
Modello: [1, 3, 5]

== Cohen’s Kappa ==
0.434
