# Imports

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import cohen_kappa_score
import torch
from tqdm import tqdm
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling, logging
import glob
import os


  from .autonotebook import tqdm as notebook_tqdm


# 📥 Caricamento dati

In [2]:
# Path of the CSV file read into `df` in the next cell.
dataset_path = "inputs/dataset.csv"

In [3]:
# Read the dataset and keep the first 20 non-null values of its "Sentence" column.
df = pd.read_csv(dataset_path)
archaic_sentences = df["Sentence"].dropna().tolist()[:20]

# 👨‍🏫 Fine Tuning

## Parametri fine tuning

In [None]:
# 📁 Path to the folder containing the CSV files
cartella_csvs = "fine_tuning/csvs"  
# 📁 Path of the CSV file where the concatenated dataset is written
cartella_dataset_concatenato = "inputs/dataset_concatenato.csv"

# Number of epochs, passed to TrainingArguments as num_train_epochs.
training_epochs=15

# Hugging Face id of the base model; the commented-out lines are alternatives.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
#model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
#model_name = "bigscience/bloomz-3b"

# Generation settings read by traduci(): max_new_tokens and sampling temperature.
nuovi_token_max=100
temperatura=0.7
max_translations=0 # 0 = no limit, >0 = max number of translations to generate

def getPrompt(archaic_sentence):
    """Build the Italian translation prompt for one archaic sentence.

    The prompt has four lines: the translator role, the instruction, the
    sentence to translate, and a trailing "Traduzione moderna:" marker that
    the model is expected to continue.

    Args:
        archaic_sentence: Sentence to embed after "Testo antico: ".

    Returns:
        The prompt as a single newline-separated string.
    """
    prompt_lines = [
        "Sei un traduttore professionista di testi antichi in italiano moderno.",
        "Trasforma la seguente frase antica in italiano moderno, mantenendo il significato.",
        f"Testo antico: {archaic_sentence}",
        "Traduzione moderna:",
    ]
    return "\n".join(prompt_lines)

## Load Dataset

In [None]:

# 🔍 Collect every .csv file in the folder. Sorted so the row order of the
# concatenated dataset does not depend on the filesystem listing order.
csv_files = sorted(glob.glob(os.path.join(cartella_csvs, "*.csv")))

# 📦 Load every file; a file that cannot be read is reported and skipped
dataframes = []
for file in csv_files:
    try:
        df = pd.read_csv(file)
        dataframes.append(df)
    except Exception as e:
        print(f"Errore nel file {file}: {e}")

# pd.concat fails with an opaque "No objects to concatenate" on an empty list,
# so stop here with a message that names the folder that was searched.
if not dataframes:
    raise FileNotFoundError(f"Nessun file CSV leggibile in {cartella_csvs!r}")

# 📚 Vertical concatenation
df_finale = pd.concat(dataframes, ignore_index=True)

# 💾 Save
df_finale.to_csv(cartella_dataset_concatenato, index=False)

dataset = load_dataset("csv", data_files=cartella_dataset_concatenato)["train"]
# Fixed seed: the 90/10 split (and therefore eval_loss) is comparable across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)



## Modello e Tokenizer

In [None]:

# bitsandbytes settings: 4-bit NF4 weights, double quantization enabled,
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)


# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,use_fast=True)

tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as pad token to avoid padding errors
# Base model loaded with the quantization config above; device_map="auto"
# leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
# NOTE(review): the flag is set by hand here; presumably from_pretrained already
# sets it when a 4-bit quantization_config is given — confirm whether this line
# is still needed, and whether peft.prepare_model_for_kbit_training should be
# applied before get_peft_model.
model.is_loaded_in_4bit = True


## Configurazione Lora

In [None]:
# LoRA adapter settings for causal-LM fine-tuning: rank 8, alpha 16,
# dropout 0.1, bias="none".
# NOTE(review): target_modules is not given, so the PEFT default for this
# architecture is presumably used — confirm which layers receive adapters.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    task_type=TaskType.CAUSAL_LM,
    lora_dropout=0.1,
    bias="none"
)

# Wrap the loaded model with the adapter and print the trainable-parameter summary.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


## Preprocessing dataset

In [None]:
def format_prompt(example, max_length=512):
    """Tokenize one dataset row into a causal-LM training example.

    The prompt and its reference translation are tokenized as ONE sequence.
    A causal LM is trained to predict each token of ``input_ids`` from the
    tokens before it, so the translation has to be inside ``input_ids``;
    labels must line up with ``input_ids`` position by position. This also
    keeps the example correct under ``DataCollatorForLanguageModeling``
    with ``mlm=False``, which rebuilds ``labels`` from ``input_ids``.

    Args:
        example: Row with a ``"text"`` field (archaic sentence) and a
            ``"translation"`` field (modern reference translation).
        max_length: Length every sequence is truncated/padded to.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``. Labels
        copy ``input_ids`` with padding positions set to -100 (ignored by
        the loss).

    Raises:
        TypeError: If ``translation`` is missing (not a string).
    """
    prompt = getPrompt(example["text"])
    # EOS closes the translation so the model can learn where to stop.
    # Plain concatenation (not an f-string) so a missing translation fails
    # loudly instead of being trained on as the text "None".
    full_text = prompt + " " + example["translation"] + tokenizer.eos_token
    encoded = tokenizer(
        full_text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    # Mask padding through attention_mask rather than by token id: pad_token
    # is set to eos_token, so an id comparison would also hide the real EOS.
    labels = [
        token_id if keep else -100
        for token_id, keep in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels,
    }

# Tokenize both splits with format_prompt, dropping the raw columns so only
# the fields returned by format_prompt remain.
tokenized_dataset = {
    split: dataset[split].map(
        format_prompt,
        remove_columns=dataset[split].column_names,
    )
    for split in ("train", "test")
}

## Setup Trainer

In [None]:
# Most verbose transformers logging level for the training run.
logging.set_verbosity_debug()
# Training configuration: batch size 1 with 16 gradient-accumulation steps,
# fp16, evaluation and checkpointing once per epoch.
training_args = TrainingArguments(
    output_dir="Models/Mistral/mistral-lora-itmoderno",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=training_epochs,
    learning_rate=2e-4,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=20,
    disable_tqdm=False,        # ✅ keep the tqdm progress bar
    report_to="none",          # avoids warnings from WandB or other integrations
    logging_dir="./logs",      # optional
    save_total_limit=1,
    # After training, reload the checkpoint with the lowest eval_loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# Causal-LM collator (mlm=False): labels are rebuilt from input_ids, with
# padding positions ignored by the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


def compute_metrics(eval_preds):
    """Placeholder metric hook: returns no metrics.

    Kept so existing references still resolve, but deliberately NOT passed to
    the Trainer below. When a compute_metrics callable is set, the Trainer
    collects the full logits of the whole evaluation set to hand them to it
    (vocabulary-sized tensors per token), which costs memory and time for an
    empty result. eval_loss, the metric used for model selection, is computed
    without it.
    """
    return {}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


## Avvia Fine Tuning & Salva

In [None]:
trainer.train()
# Save the LoRA adapter and the tokenizer side by side.
model.save_pretrained("Models/Mistral/mistral-finetuned-itmodern")
tokenizer.save_pretrained("Models/Mistral/mistral-finetuned-itmodern")
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load the base model for merging. It must be the same checkpoint the adapter
# was trained on, so use model_name rather than a second hard-coded id.
# torch_dtype is given explicitly: without it from_pretrained loads the
# weights in float32, doubling the RAM needed here and the size of the merged
# checkpoint on disk.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
)

# Load the LoRA checkpoint that was just saved
lora_model = PeftModel.from_pretrained(base_model, "Models/Mistral/mistral-finetuned-itmodern")

# Merge the LoRA weights into the base model
merged_model = lora_model.merge_and_unload()

# Save the merged model as a standalone checkpoint
merged_model.save_pretrained("Models/Mistral/mistral-lora-merged")



# 🧠 Traduzioni

In [None]:

# 0. Quantization config (bnb4bit)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# 1. Select the device (used by traduci() to place the tokenized inputs)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Load the MERGED model and the tokenizer
model_path = "Models/Mistral/mistral-lora-merged"
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")  # original tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto"
)
# Same pad-token convention as in the fine-tuning cell.
tokenizer.pad_token = tokenizer.eos_token
model.eval()

# 3. Load the dataset
df = pd.read_csv("inputs/dataset.csv")
# Placeholder column; it is overwritten with the generated translations once
# the generation loop has finished.
df["generated_translation"] = ""

# 4. Translation function
def traduci(s):
    """Translate one archaic sentence with the loaded model.

    Builds the prompt with getPrompt, samples a continuation and returns the
    decoded text that follows the last "Traduzione moderna:" marker, with
    surrounding whitespace removed.

    Args:
        s: Archaic sentence to translate.

    Returns:
        The generated translation as a string.
    """
    marker = "Traduzione moderna:"
    encoded = tokenizer(getPrompt(s), return_tensors="pt", truncation=True).to(device)
    sampling = dict(
        max_new_tokens=nuovi_token_max,
        temperature=temperatura,
        top_p=0.9,
        do_sample=True,
        use_cache=True,
    )
    generated = model.generate(**encoded, **sampling)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    # The decoded text still contains the prompt; keep only what follows the marker.
    return decoded.rpartition(marker)[2].strip()

# 5. Generation with a progress bar. max_translations (0 = no limit) caps how
#    many sentences are actually translated; the others are marked "[SKIPPED]".
results = []
for i, s in enumerate(tqdm(df["Sentence"].tolist())):
    if max_translations != 0 and i >= max_translations:
        results.append("[SKIPPED]")
        continue
    try:
        results.append(traduci(s))
    except Exception as e:
        # Exception instead of a bare except, so a keyboard/kernel interrupt
        # still stops the run; the cause is printed instead of being hidden
        # behind the "[ERRORE]" placeholder.
        print(f"Errore nella traduzione della riga {i}: {e}")
        results.append("[ERRORE]")

df["generated_translation"] = results
# Human-score column initialised to 0 — presumably filled in by hand later; confirm.
df["score_human"] = 0
# 6. Save the results. The folder is created first so a long generation run
#    is not lost on a missing directory.
os.makedirs("outputs", exist_ok=True)
df.to_csv("outputs/dataset_with_mistral_translations.csv", index=False)


# 🔍 Prometheus LLM-as-a-Judge

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

# Hugging Face id of the Prometheus judge model.
prometheur_model_name = "prometheus-eval/prometheus-7b-v2.0"

prometheus_tokenizer = AutoTokenizer.from_pretrained(
    prometheur_model_name,
    trust_remote_code=True
)

# float16 weights with automatic device placement; offload_folder names the
# directory used for weights that are offloaded.
prometheus_model = AutoModelForCausalLM.from_pretrained(
    prometheur_model_name,
    trust_remote_code=True, 
    device_map="auto",
    torch_dtype=torch.float16,
    offload_folder="offload_prometheus"
)

# Text-generation pipeline used as the judge.
judge = pipeline("text-generation", model=prometheus_model, tokenizer=prometheus_tokenizer)


Loading checkpoint shards: 100%|██████████| 8/8 [00:08<00:00,  1.12s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0


In [13]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from tqdm import tqdm
import csv

# === 1. Load the files ===
# Optional: only the first n_rows examples of each file are evaluated.
n_rows = 3

df_base = pd.read_csv("inputs/dataset_human_eval.csv").head(n_rows)  # has Sentence and HumanEval
df_mistral = pd.read_csv("outputs/dataset_with_mistral_translations.csv").head(n_rows)  # has generated_translation
df_nnlb = pd.read_csv("outputs/dataset_with_translation_NNLB.csv").head(n_rows)  # has generated_translation

# Add, for each system, its translation column and its human-score column
for prefix, source in (("mistral", df_mistral), ("nnlb", df_nnlb)):
    df_base[prefix] = source["generated_translation"]
    df_base[prefix + "HS"] = source["score_human"]

# === 2. Load Prometheus ===
# NOTE(review): this loads the same checkpoint that an earlier cell already
# loads as `prometheus_model`, and it rebinds the notebook-wide names
# `model_name`, `tokenizer`, `model` and `judge` (the first three are used for
# Mistral in earlier cells). Consider reusing the existing judge instead of
# loading the weights a second time.
model_name = "prometheus-eval/prometheus-7b-v2.0"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
    offload_folder="offload_prometheus",
    offload_buffers=True
)
judge = pipeline("text-generation", model=model, tokenizer=tokenizer)

# === 3. Rubric ===
# Criterion and the description of each score level shown to the judge.
rubric_data = dict(
    criteria="Semantic fidelity of the translation",
    score1="Completely wrong, the meaning is unrecognizable.",
    score2="Severe meaning errors or omissions.",
    score3="Some inaccuracies, but the general meaning is conveyed.",
    score4="Good fidelity, minor non-substantial differences.",
    score5="Perfectly faithful to the original meaning.",
)


def build_judge_prompt(original, human_translation, model_translation):
    """Build the evaluation prompt for one candidate translation.

    The prompt lists the original sentence, the human translation and the
    model translation, then the rubric from ``rubric_data``, and ends with
    the "Score (1–5):" marker that the judge is expected to complete.

    Args:
        original: Source sentence.
        human_translation: Reference translation written by a human.
        model_translation: Candidate translation to be scored.

    Returns:
        The prompt text, with no leading or trailing whitespace.
    """
    rubric_lines = [f"{rubric_data['criteria']}"]
    rubric_lines.extend(
        f"Score {level}: {rubric_data[f'score{level}']}" for level in range(1, 6)
    )
    # Sections are separated by one blank line in the final prompt.
    sections = [
        "You are a translation evaluator.",
        f"Original:\n{original}",
        f"Human:\n{human_translation}",
        f"Model:\n{model_translation}",
        "\n".join(["###Score Rubrics:", *rubric_lines]),
        "Think carefully before assigning a score.\n"
        "Avoid giving the maximum score unless it is perfectly accurate.",
        "Score (1–5):",
    ]
    return "\n\n".join(sections).strip()



def estrai_punteggio(output):
    """Extract the 1-5 score that follows the "Score (1–5):" marker.

    Args:
        output: Text returned by the judge (the prompt plus its continuation).

    Returns:
        The score as an int between 1 and 5, or None when the marker is
        missing or is not followed by a single digit in that range.
    """
    # [1-5] rejects digits outside the rubric scale (0, 6-9); (?!\d) rejects
    # multi-digit numbers such as "10", which would otherwise be read as 1.
    match = re.search(r"Score\s*\(1[–-]5\)\s*:\s*([1-5])(?!\d)", output)
    return int(match.group(1)) if match else None


# === 6. Row-by-row evaluation ===
score_as = []
score_bs = []
winners = []

for _, record in tqdm(df_base.iterrows(), total=len(df_base)):
    # Score the Mistral translation (response A), then the NLLB one (response B)
    row_scores = []
    for column in ("mistral", "nnlb"):
        judge_prompt = build_judge_prompt(record["Sentence"], record["HumanEval"], record[column])
        judge_text = judge(judge_prompt, max_new_tokens=2, do_sample=False)[0]["generated_text"]
        row_scores.append(estrai_punteggio(judge_text))
    score_a, score_b = row_scores

    # Store the scores and the winner ("?" when either score could not be parsed)
    score_as.append(score_a)
    score_bs.append(score_b)

    if score_a is None or score_b is None:
        verdict = "?"
    elif score_a == score_b:
        verdict = "="
    else:
        verdict = "A" if score_a > score_b else "B"
    winners.append(verdict)

# === 7. Save the results ===
df_base["score_a"] = score_as
df_base["score_b"] = score_bs
df_base["winner"] = winners

df_base.to_csv("outputs/prometheus_absolute_scores.csv", index=False, quoting=csv.QUOTE_MINIMAL)
print("✅ File salvato in outputs/prometheus_absolute_scores.csv")


Loading checkpoint shards: 100%|██████████| 8/8 [00:09<00:00,  1.22s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0
  0%|          | 0/3 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 33%|███▎      | 1/3 [00:08<00:16,  8.06s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 67%|██████▋   | 2/3 [00:14<00:06,  6.98s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation f

✅ File salvato in outputs/prometheus_absolute_scores.csv





# 📊 Calcolo concordanza

In [14]:
# Reload the table written by the Prometheus evaluation cell.
df_base = pd.read_csv("outputs/prometheus_absolute_scores.csv")

In [15]:
from sklearn.metrics import cohen_kappa_score, classification_report, confusion_matrix

def evaluate_concordance(y_true, y_pred, weights=None):
    """Print agreement statistics between human scores and model scores.

    Pairs in which either score is missing (None/NaN, e.g. a judge answer
    that could not be parsed) are dropped first: missing values would
    otherwise make the sort below or the scikit-learn metrics fail.

    Args:
        y_true: Human scores.
        y_pred: Model (judge) scores, aligned with ``y_true``.
        weights: Forwarded to ``cohen_kappa_score``: None (default,
            unweighted), "linear" or "quadratic". Weighted kappa takes the
            distance between ordinal scores into account.

    Returns:
        None. Results are printed. When either side has fewer than two
        distinct values, only the unique values and a warning are printed.
    """
    paired = [
        (human, model_score)
        for human, model_score in zip(y_true, y_pred)
        if not (pd.isna(human) or pd.isna(model_score))
    ]
    y_true = [human for human, _ in paired]
    y_pred = [model_score for _, model_score in paired]

    print("== Valori unici ==")
    print("Umano:", sorted(set(y_true)))
    print("Modello:", sorted(set(y_pred)))

    if len(set(y_true)) < 2 or len(set(y_pred)) < 2:
        print("⚠️ Non abbastanza variabilità per calcolare Cohen's Kappa.")
        return

    print("\n== Cohen’s Kappa ==")
    print(f"{cohen_kappa_score(y_true, y_pred, weights=weights):.3f}")

    # Reports are restricted to the rubric scale 1..5.
    print("\n== Classification Report ==")
    print(classification_report(y_true, y_pred, labels=[1,2,3,4,5]))

    print("\n== Confusion Matrix ==")
    print(confusion_matrix(y_true, y_pred, labels=[1,2,3,4,5]))
# Human scores vs. judge scores, one report per translation system.
for title, human_column, judge_column in (
    ("MISTRAL", "mistralHS", "score_a"),
    ("NNLB", "nnlbHS", "score_b"),
):
    print(title)
    evaluate_concordance(df_base[human_column], df_base[judge_column])


MISTRAL
== Valori unici ==
Umano: [0]
Modello: [5]
⚠️ Non abbastanza variabilità per calcolare Cohen's Kappa.
NNLB
== Valori unici ==
Umano: [0]
Modello: [1, 2, 3]
⚠️ Non abbastanza variabilità per calcolare Cohen's Kappa.


# 💾 Salvataggio risultati JSONL