# Import Libraries

In [None]:
from transformers import AutoTokenizer,BitsAndBytesConfig, MT5Config, AutoModelForCausalLM, pipeline,DataCollatorForSeq2Seq,EarlyStoppingCallback, TrainingArguments,AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments,TrainerCallback, Trainer, DataCollatorForLanguageModeling, logging,MT5Tokenizer, MT5ForConditionalGeneration
import pandas as pd
import numpy as np
from datasets import load_dataset
import json
from sklearn.metrics import cohen_kappa_score
import torch
from tqdm.notebook import tqdm
import os
from huggingface_hub import login
import gc

# Load Datasets

In [None]:
# Fine-tuning corpus: each row provides 'text' (ancient sentence) and
# 'translation' (its modern rendering); both columns are read during preprocessing.
dataset = load_dataset("csv", data_files="./inputs/dataset_concatenato.csv")["train"]

# Hold out 30% of the rows for evaluation. The split is seeded so the same rows
# end up in train/test on every run; otherwise eval losses are not comparable
# between runs even though training itself is seeded.
dataset = dataset.train_test_split(test_size=0.3, seed=42)

# Separate evaluation file used later for inference (described as 100 items).
df = pd.read_csv("./inputs/dataset.csv")
df.head()

In [None]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Loading model Multilingual T5 Base

In [None]:
# Hugging Face Hub identifier of the checkpoint to fine-tune.
model_name = "google/mt5-base"
# Load the pre-trained weights; device_map="auto" lets the loader choose the placement.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
# Slow tokenizer with the legacy behaviour, as in the original setup.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=True)

# The T5/mT5 tokenizer normally defines its own padding token, so EOS is only
# used as a fallback when no pad token exists. Unconditionally aliasing pad to
# EOS makes padding indistinguishable from the real end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Keep the model configuration in sync with the tokenizer's special tokens.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id

# Creating path to save model

In [None]:
# Directory that receives the fine-tuned model, and its "logs" subfolder
# that is passed to the trainer as the logging directory.
path_mt5 = "./Models/MT_5/finetune_ft5"
path_log_mt5 = f"{path_mt5}/logs"

# Create both folders up front; existing folders are left untouched.
os.makedirs(path_mt5, exist_ok=True)
os.makedirs(path_log_mt5, exist_ok=True)


# Preprocessing and tokenizing prompts and target sentences for fine-tuning

In [None]:
# Build the encoder inputs and the loss labels for one (ancient, modern) sentence pair.
def preprocess_tokenization_prompt(example):
    """Tokenize the instruction prompt and its target translation.

    Args:
        example: One dataset row providing the keys ``'text'`` (ancient
            sentence) and ``'translation'`` (modern sentence).

    Returns:
        The tokenizer output for the prompt, padded/truncated to 384 tokens,
        with an extra ``'labels'`` entry holding the target token ids in which
        every padding position is replaced by ``-100``.
    """
    max_tokens = 384

    # Instruction prompt wrapped around the ancient sentence.
    prompt = ("Trasforma la seguente frase antica in italiano moderno, mantenendo il significato ma cambia le parole:\n"
              f"Testo antico: {example['text']}"
              )

    # Encoder input: fixed length of 384 tokens (padded or truncated).
    model_input = tokenizer(prompt, max_length=max_tokens, padding="max_length", truncation=True)

    # Target sentence, tokenized with the same length policy.
    target = tokenizer(example["translation"], max_length=max_tokens, padding="max_length", truncation=True)

    # Padding positions must not contribute to the loss, so they are set to
    # -100 (the label value the seq2seq collator also uses for padding).
    # The attention mask is used instead of comparing against pad_token_id
    # because the pad token may alias EOS, and the real EOS must stay a target.
    model_input["labels"] = [
        token_id if keep else -100
        for token_id, keep in zip(target["input_ids"], target["attention_mask"])
    ]
    return model_input

# Tokenize both splits; the raw columns are dropped so that only the
# tokenizer outputs and the labels remain in each split.
tokenized_dataset = {
    split: dataset[split].map(
        preprocess_tokenization_prompt,
        remove_columns=dataset[split].column_names,
    )
    for split in ("train", "test")
}


# Setup of Training arguments

In [None]:
# Training hyper-parameters for the seq2seq fine-tuning run.
training_args = Seq2SeqTrainingArguments(

    output_dir=path_mt5,    # where checkpoints and the final model are written

    eval_strategy="epoch",  # evaluate at the end of every epoch
    # Checkpoint on the same schedule as evaluation: this is required for
    # load_best_model_at_end, which is what makes metric_for_best_model and
    # greater_is_better below take effect.
    save_strategy="epoch",
    save_total_limit=2,     # cap the number of checkpoints kept on disk
    load_best_model_at_end=True,  # restore the lowest-eval_loss weights after training

    learning_rate=2e-4,

    gradient_accumulation_steps=4,  # accumulate gradients over 4 batches before each update

    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,

    num_train_epochs=18,

    predict_with_generate=True,        # use generation for predictions during evaluation
    metric_for_best_model="eval_loss",  # metric used to rank checkpoints

    greater_is_better=False,   # lower eval_loss is better
    logging_dir=path_log_mt5,  # directory for the training logs
    logging_strategy="epoch",  # log once per epoch

    report_to="none",          # no external experiment tracker

    fp16=False,
    seed=42,                   # seed for reproducibility
    weight_decay=0.05,         # regularization against overfitting
    generation_max_length=384  # maximum length of generated sentences
)

# Collator that pads each batch and uses -100 as the label padding value so
# padded label positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

# Trainer wiring together the model, arguments, tokenized splits, collator and tokenizer.
# NOTE(review): recent transformers releases rename `tokenizer=` to
# `processing_class=` - confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Fine-tuning the model

In [None]:
# Run the fine-tuning loop, then store the model (weights + configuration)
# and the tokenizer (configuration + vocabulary) side by side in path_mt5.
trainer.train()
for artifact in (model, tokenizer):
    artifact.save_pretrained(path_mt5)

# Inference of fine-tuned model

In [None]:
# Reload the fine-tuned tokenizer and model from the directory they were saved to.
tokenizer = AutoTokenizer.from_pretrained(path_mt5, use_fast=False, legacy=True)
model = AutoModelForSeq2SeqLM.from_pretrained(path_mt5, device_map="auto")

# Switch to evaluation mode. The weights were already placed by
# device_map="auto", so the model is not moved again with .to(device): that
# call is redundant and can warn or fail when the weights are dispatched
# across several devices.
model.eval()

# Placeholder column for the generated translations; it is filled after inference.
df["generated_translation"] = ""

# Translate one ancient sentence into modern Italian with the fine-tuned mT5 model.
def traduci_mt5(example):
    """Generate the modern-Italian rendering of ``example``.

    Args:
        example: The ancient sentence to translate; it is interpolated into
            the instruction prompt.

    Returns:
        The decoded first generated sequence, without special tokens and
        stripped of surrounding whitespace.
    """
    instruction = ("Trasforma la seguente frase antica in italiano moderno, mantenendo il significato ma cambia le parole:\n"
                   f"Testo antico: {example}"
                   )

    # Encode the prompt (truncated to 384 tokens) on the model's device.
    encoded = tokenizer(instruction, return_tensors="pt", truncation=True, max_length=384).to(model.device)

    # Deterministic beam search settings.
    generation_options = {
        "max_new_tokens": 150,   # upper bound on generated tokens
        "do_sample": False,      # no sampling
        "num_beams": 4,          # beam search with 4 beams
        "use_cache": True,       # cache past states during decoding
        "early_stopping": True,  # stop once the beams are finished
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }

    # No gradients are needed at inference time.
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_options)

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True).strip()

# Translate every entry of the "Sentence" column, with a progress bar.
results = [traduci_mt5(sentence) for sentence in tqdm(df["Sentence"].tolist())]

# Attach the translations to the dataframe; the input CSV itself is not modified.
df["generated_translation"] = results

# Make sure the output folder exists.
path_translations = "./outputs"
os.makedirs(path_translations, exist_ok=True)

# Write the dataframe, including the new column, to a separate CSV file.
translation_file = path_translations + "/dataset_with_mT5_translations.csv"
df.to_csv(translation_file, index=False)