# Imports

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import cohen_kappa_score
import torch
from tqdm import tqdm
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling, logging
import glob
import os


# 📥 Caricamento dati

In [2]:
df = pd.read_csv("mnt/data/dataset_cleaned.csv")
archaic_sentences = df["Sentence"].dropna().tolist()[:20]

# 👨‍🏫 Fine Tuning

## Load Dataset

In [13]:

# 📁 Percorso alla cartella con i file CSV
cartella = "fine_tuning/csvs"  

# 🔍 Prende tutti i file .csv nella cartella
csv_files = glob.glob(os.path.join(cartella, "*.csv"))

# 📦 Carica e concatena tutti i file
dataframes = []
for file in csv_files:
    try:
        df = pd.read_csv(file)
        dataframes.append(df)
    except Exception as e:
        print(f"Errore nel file {file}: {e}")

# 📚 Unione verticale
df_finale = pd.concat(dataframes, ignore_index=True)

# 💾 Salvataggio
df_finale.to_csv("dataset_concatenato.csv", index=False)

dataset = load_dataset("csv", data_files="dataset_concatenato.csv")["train"]
dataset = dataset.train_test_split(test_size=0.1)



Generating train split: 0 examples [00:00, ? examples/s]

## Modello e Tokenizer

In [None]:

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
#model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
#model_name = "bigscience/bloomz-3b"
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # Per evitare errori su padding

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Configurazione Lora

In [5]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    task_type=TaskType.CAUSAL_LM,
    lora_dropout=0.1,
    bias="none"
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.0470


## Preprocessing dataset

In [6]:
def format_prompt(example):
    prompt = (
        "Sei un traduttore professionista di testi antichi in italiano moderno.\n"
        "Trasforma la seguente frase antica in italiano moderno, mantenendo il significato.\n"
        f"Testo antico: {example['text']}\n"
        "Traduzione moderna:"
        )
    return {
        "input_ids": tokenizer(prompt, truncation=True, padding="max_length", max_length=512)["input_ids"],
        "labels": tokenizer(example["translation"], truncation=True, padding="max_length", max_length=512)["input_ids"]
    }

tokenized_dataset = {
    "train": dataset["train"].map(format_prompt, remove_columns=dataset["train"].column_names),
    "test": dataset["test"].map(format_prompt, remove_columns=dataset["test"].column_names)
}

Map:   0%|          | 0/31 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

## Setup Trainer

In [None]:
logging.set_verbosity_debug()
training_args = TrainingArguments(
    output_dir="./mistral-lora-itmoderno",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=20,
    learning_rate=2e-4,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=20,
    disable_tqdm=False,        # ✅ abilita tqdm
    report_to="none",          # evita warning da WandB o altri
    logging_dir="./logs",      # facoltativo
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
def compute_metrics(eval_preds):
    return {} 
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)





PyTorch: setting up devices
  trainer = Trainer(
Using auto half precision backend
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Avvia Fine Tuning & Salva

In [8]:
trainer.train()
model.save_pretrained("./mistral-finetuned-itmodern")
tokenizer.save_pretrained("./mistral-finetuned-itmodern")
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Carica il modello base
base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

# Carica il checkpoint LoRA appena salvato
lora_model = PeftModel.from_pretrained(base_model, "./mistral-finetuned-itmodern")

# Merge dei pesi LoRA nel modello base
merged_model = lora_model.merge_and_unload()

# Salva il modello fuso come standalone
merged_model.save_pretrained("./mistral-lora-merged")



The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 6 checkpoint shards. You can find where each parameters has been saved in the index located at ./mistral-lora-merged\model.safetensors.index.json.


# 🧠 Traduzioni

In [None]:

# 0. Quantization config (bnb4bit)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# 1. Seleziona il dispositivo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Carica il modello MERGED e tokenizer
model_path = "./mistral-lora-merged"
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")  # tokenizer originale
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer.pad_token = tokenizer.eos_token
model.eval()

# 3. Carica dataset
df = pd.read_csv("mnt/data/dataset_cleaned.csv")
df["generated_translation"] = ""

# 4. Funzione di traduzione
def traduci(s):
    prompt = (
        "Sei un traduttore professionista di testi antichi in italiano moderno.\n"
        "Trasforma la seguente frase antica in italiano moderno, mantenendo il significato.\n"
        f"Testo antico: {s}\n"
        "Traduzione moderna:"
    )
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    output = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        use_cache=True
    )
    return tokenizer.decode(output[0], skip_special_tokens=True).split("Traduzione moderna:")[-1].strip()

# 5. Generazione con barra di avanzamento (solo primi 3 per test)
results = []
for i, s in enumerate(tqdm(df["Sentence"].tolist())):
    #if i >= 3:
    #    results.append("[SKIPPED]")
    #    continue
    try:
        results.append(traduci(s))
    except:
        results.append("[ERRORE]")

df["generated_translation"] = results

# 6. Salva i risultati
df.to_csv("mnt/data/dataset_with_translations.csv", index=False)


loading file tokenizer.model from cache at C:\Users\colam\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.2\snapshots\3ad372fc79158a2148299e3318516c786aeded6c\tokenizer.model
loading file tokenizer.json from cache at C:\Users\colam\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.2\snapshots\3ad372fc79158a2148299e3318516c786aeded6c\tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at C:\Users\colam\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.2\snapshots\3ad372fc79158a2148299e3318516c786aeded6c\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\colam\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.2\snapshots\3ad372fc79158a2148299e3318516c786aeded6c\tokenizer_config.json
loading file chat_template.jinja from cache at None
loading configuration file ./mistral-lora-merged\config.json
Model config MistralConfig {
  "architectur

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing MistralForCausalLM.

All the weights of MistralForCausalLM were initialized from the model checkpoint at ./mistral-lora-merged.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.
loading configuration file ./mistral-lora-merged\generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

  0%|          | 0/97 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 1/97 [00:04<07:06,  4.44s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 2/97 [00:07<05:33,  3.51s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
100%|██████████| 97/97 [00:1

# 🔍 Simulazione punteggi da LLM-as-a-Judge

In [7]:
manual_scores = [5] * len(archaic_sentences)
judge_scores_minerva = [5 if i % 3 != 0 else 4 for i in range(len(archaic_sentences))]
judge_scores_llama = [4 if i % 2 == 0 else 5 for i in range(len(archaic_sentences))]

# 📊 Calcolo concordanza

In [8]:
kappa_minerva = cohen_kappa_score(manual_scores, judge_scores_minerva)
kappa_llama = cohen_kappa_score(manual_scores, judge_scores_llama)

print(f"Cohen’s Kappa (Minerva): {kappa_minerva:.2f}")
print(f"Cohen’s Kappa (LLaMA): {kappa_llama:.2f}")


Cohen’s Kappa (Minerva): 0.00
Cohen’s Kappa (LLaMA): 0.00


# 💾 Salvataggio risultati JSONL

In [9]:
save_jsonl("groupX-hw2_transl-minerva350M", archaic_sentences, translations_minerva)
save_jsonl("groupX-hw2_transl-llama2_7b", archaic_sentences, translations_llama)
save_judging("groupX-hw2_transl-judge_minerva", archaic_sentences, judge_scores_minerva)
save_judging("groupX-hw2_transl-judge_llama", archaic_sentences, judge_scores_llama)