# Imports

In [51]:
from transformers import AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import cohen_kappa_score
import torch
from tqdm import tqdm
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling, logging
import glob
import os


# 📥 Caricamento dati

In [2]:
# Input CSV; its "Sentence" column is read in the following cell.
dataset_path = "inputs/dataset.csv"

In [3]:
# Load the input CSV and keep the first 20 non-null values of its "Sentence" column.
df = pd.read_csv(dataset_path)
archaic_sentences = df["Sentence"].dropna().head(20).tolist()

# 👨‍🏫 Fine Tuning

## Parametri fine tuning

In [None]:
# 📁 Path to the folder containing the CSV files
cartella_csvs = "fine_tuning/csvs"  
# 📁 Path of the CSV file the concatenated dataset is written to
cartella_dataset_concatenato = "inputs/dataset_concatenato.csv"

# Passed to TrainingArguments as num_train_epochs
training_epochs=15

# Hugging Face model id used to load both the tokenizer and the base model
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
#model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
#model_name = "bigscience/bloomz-3b"

# Generation settings read by traduci(): max_new_tokens and sampling temperature
nuovi_token_max=50
temperatura=0.7
max_translations=0 # 0 = no limit, >0 = max number of translations to generate

def getPrompt(archaic_sentence):
    """Build the translation prompt for one archaic sentence.

    Args:
        archaic_sentence: Value interpolated after the "Archaic text: " label.

    Returns:
        The instruction text followed by the sentence, ending with
        "Modern translation:" and no trailing newline.
    """
    pieces = [
        "You are a professional translator of ancient texts into modern Italian.\n",
        "Transform the following archaic sentence into modern Italian, preserving its meaning.\n",
        f"Archaic text: {archaic_sentence}\n",
        "Modern translation:",
    ]
    return "".join(pieces)

## Load Dataset

In [None]:

# 🔍 Collect every .csv file in the folder. Sorted so the concatenation order
# does not depend on the order in which the filesystem lists the files.
csv_files = sorted(glob.glob(os.path.join(cartella_csvs, "*.csv")))

# 📦 Load each file; a file that cannot be read is reported and skipped
dataframes = []
for file in csv_files:
    try:
        df = pd.read_csv(file)
        dataframes.append(df)
    except Exception as e:
        print(f"Errore nel file {file}: {e}")

# Fail early with an explicit message: pd.concat on an empty list raises a
# much less helpful "No objects to concatenate" ValueError.
if not dataframes:
    raise ValueError(f"No readable CSV files found in {cartella_csvs!r}")

# 📚 Vertical concatenation of all loaded frames
df_finale = pd.concat(dataframes, ignore_index=True)

# 💾 Save the combined dataset
df_finale.to_csv(cartella_dataset_concatenato, index=False)

# Reload the combined CSV through `datasets` and hold out 10% as the test split
dataset = load_dataset("csv", data_files=cartella_dataset_concatenato)["train"]
dataset = dataset.train_test_split(test_size=0.1)



## Modello e Tokenizer

In [None]:

# 4-bit quantization settings handed to from_pretrained below
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)


# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,use_fast=True)

tokenizer.pad_token = tokenizer.eos_token  # Reuse EOS as the pad token to avoid padding errors
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
# NOTE(review): the flag is set by hand after loading; presumably redundant
# when quantization_config already requests 4-bit loading — confirm.
model.is_loaded_in_4bit = True


## Configurazione LoRA

In [None]:
# LoRA adapter settings for a causal-LM task (r=8, lora_alpha=16, dropout 0.1, bias="none").
# NOTE(review): no target_modules is given, so the library default for this
# architecture is presumably used — confirm it is the intended set of layers.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    task_type=TaskType.CAUSAL_LM,
    lora_dropout=0.1,
    bias="none"
)

# Wrap the loaded model with the LoRA configuration and report its trainable parameters
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


## Preprocessing dataset

In [None]:
def format_prompt(example):
    """Tokenize one dataset row into a causal-LM training example.

    The prompt and the reference translation are encoded as ONE sequence
    (prompt, a space, the translation, then EOS). A causal LM is trained to
    predict each token from the tokens before it, so ``labels`` must be aligned
    position-by-position with ``input_ids``. Encoding the prompt as
    ``input_ids`` and the translation separately as ``labels`` (the previous
    behavior) breaks that alignment and leaves the translation out of the
    model input entirely.

    Args:
        example: Mapping with a ``"text"`` entry (the sentence given to
            ``getPrompt``) and a ``"translation"`` entry (the target text).

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        each padded/truncated to 512 positions.
    """
    prompt = getPrompt(example['text'])
    # Close the sequence with EOS so the end of a translation is part of the data.
    full_text = f"{prompt} {example['translation']}{tokenizer.eos_token}"
    encoded = tokenizer(full_text, truncation=True, padding="max_length", max_length=512)

    # -100 is the label value ignored by the loss. Padding is detected through
    # the attention mask rather than the token id because this file sets
    # pad_token = eos_token, which makes the two indistinguishable by id.
    labels = [
        token_id if is_real_token else -100
        for token_id, is_real_token in zip(encoded["input_ids"], encoded["attention_mask"])
    ]

    # NOTE(review): the DataCollatorForLanguageModeling(mlm=False) used in this
    # file is documented to rebuild labels from input_ids at batch time; the
    # labels returned here keep the example correct with other collators too.
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels,
    }

# Apply format_prompt to both splits, dropping each split's original columns
# so only the fields returned by format_prompt remain.
tokenized_dataset = {
    split_name: dataset[split_name].map(
        format_prompt,
        remove_columns=dataset[split_name].column_names,
    )
    for split_name in ("train", "test")
}

## Setup Trainer

In [None]:
# Verbose transformers logging, then the training hyperparameters.
# Evaluation and checkpointing both run once per epoch; with
# load_best_model_at_end the checkpoint with the lowest eval_loss is selected.
logging.set_verbosity_debug()
training_args = TrainingArguments(
    output_dir="Models/Mistral/mistral-lora-itmoderno",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=training_epochs,
    learning_rate=2e-4,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=20,
    disable_tqdm=False,        # ✅ keep tqdm progress bars enabled
    report_to="none",          # avoids warnings from WandB or other integrations
    logging_dir="./logs",      # optional
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# Batch collator for language modeling with masked-LM disabled (mlm=False)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
def compute_metrics(eval_preds):
    """Return an empty metrics mapping; ``eval_preds`` is not inspected."""
    return dict()
# compute_metrics is deliberately not passed. The compute_metrics defined in
# this file returns an empty dict, yet supplying any callable makes
# Trainer.evaluate gather the logits of every evaluation example before
# calling it, which is costly for a large-vocabulary model. Without it,
# evaluation runs in loss-only mode and still reports eval_loss, the only
# metric that metric_for_best_model relies on.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


## Avvia Fine Tuning & Salva

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Run the fine-tuning, then save the LoRA adapter and the tokenizer
trainer.train()
model.save_pretrained("Models/Mistral/mistral-finetuned-itmodern")
tokenizer.save_pretrained("Models/Mistral/mistral-finetuned-itmodern")

# Load the base model. It must be the same checkpoint the adapter was trained
# on, so use model_name instead of a hard-coded id: otherwise changing
# model_name would merge the adapter into a different architecture.
base_model = AutoModelForCausalLM.from_pretrained(model_name)

# Load the LoRA checkpoint that was just saved
lora_model = PeftModel.from_pretrained(base_model, "Models/Mistral/mistral-finetuned-itmodern")

# Merge the LoRA weights into the base model
merged_model = lora_model.merge_and_unload()

# Save the merged model as a standalone checkpoint
merged_model.save_pretrained("Models/Mistral/mistral-lora-merged")



# 🧠 Traduzioni

In [None]:

# 0. Quantization config (bnb4bit)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# 1. Select the device
# NOTE(review): the model below is placed with device_map="auto"; confirm that
# tensors moved to `device` always end up where the model expects its inputs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Load the MERGED model and the tokenizer
model_path = "Models/Mistral/mistral-lora-merged"
# NOTE(review): the tokenizer id is hard-coded rather than taken from
# model_name — confirm the two are meant to stay in sync.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")  # original tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer.pad_token = tokenizer.eos_token
model.eval()

# 3. Load the dataset and add an (initially empty) column for the generated translations
df = pd.read_csv("inputs/dataset.csv")
df["generated_translation"] = ""

# 4. Funzione di traduzione
def traduci(s):
    """Generate a modern-Italian translation for one sentence.

    The prompt built by ``getPrompt`` is tokenized, the model samples up to
    ``nuovi_token_max`` new tokens, and only the newly generated tokens are
    decoded. The previous version decoded the whole output and split it on
    "Traduzione moderna:", a marker that never occurs in the prompt (which
    ends with "Modern translation:"), so the full prompt was returned together
    with the translation.

    Args:
        s: Sentence to translate.

    Returns:
        The generated continuation as a string, with special tokens removed
        and surrounding whitespace stripped.
    """
    prompt = getPrompt(s)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    output = model.generate(
        **inputs,
        max_new_tokens=nuovi_token_max,
        temperature=temperatura,
        top_p=0.9,
        do_sample=True,
        use_cache=True
    )
    # For a decoder-only model the returned sequence starts with the prompt
    # tokens; drop them so only the continuation is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = output[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# 5. Generate with a progress bar. When max_translations is non-zero, only the
# first max_translations rows are translated and the rest are marked "[SKIPPED]".
results = []
for i, s in enumerate(tqdm(df["Sentence"].tolist())):
    if max_translations != 0 and i >= max_translations:
        results.append("[SKIPPED]")
        continue
    try:
        results.append(traduci(s))
    except Exception as e:
        # Catch Exception rather than using a bare except, so Ctrl+C and
        # SystemExit still stop the run, and report the cause instead of
        # hiding it behind the placeholder.
        print(f"Errore nella traduzione della riga {i}: {e}")
        results.append("[ERRORE]")

df["generated_translation"] = results
df["score_human"] = 0
# 6. Save the results
df.to_csv("outputs/dataset_with_mistral_translations.csv", index=False)
