# Import Libraries

In [1]:
from transformers import AutoTokenizer,BitsAndBytesConfig, MT5Config, AutoModelForCausalLM, pipeline,DataCollatorForSeq2Seq,EarlyStoppingCallback, TrainingArguments,AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments,TrainerCallback, Trainer, DataCollatorForLanguageModeling, logging,MT5Tokenizer, MT5ForConditionalGeneration
import pandas as pd
import numpy as np
from datasets import load_dataset
import json
from sklearn.metrics import cohen_kappa_score
import torch
from tqdm.notebook import tqdm
import os
from huggingface_hub import login
import gc

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

2025-06-07 09:33:42.195633: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749281622.951709   11025 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749281623.262350   11025 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-07 09:33:46.054601: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load Datasets

In [None]:
# Load the fine-tuning CSV and split it 70/30 into train/test subsets.
# The split is seeded (same value as the training seed) so that the train/test
# partition is identical on every Restart & Run All.
dataset = load_dataset("csv", data_files="/kaggle/input/datasetfinetune/dataset_concatenato.csv")["train"]
dataset = dataset.train_test_split(test_size=0.3, seed=42)
# Load the separate test dataset used later for inference.
df = pd.read_csv("/kaggle/input/dataset-test/dataset.csv")
df.head()

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the Multilingual T5 Base model

In [None]:
# Checkpoint to fine-tune.
model_name = "google/mt5-base"
# device_map="auto" delegates weight placement to the library.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=True)

# Reuse EOS as the padding token only when the tokenizer has no pad token of
# its own. Overriding an existing pad token makes padding indistinguishable
# from end-of-sequence (the mT5 vocabulary is expected to define a dedicated
# pad token, in which case this branch is skipped).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Keep the model config aligned with the tokenizer's special-token ids.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id

# Create the output directories for the model and its logs

In [None]:
# Output locations: the fine-tuned model folder and its "logs" subfolder.
path_mt5 = "/kaggle/working/finetune_ft5"
path_log_mt5 = f"{path_mt5}/logs"

# Create both folders; existing folders are left untouched.
os.makedirs(name=path_mt5, exist_ok=True)
os.makedirs(name=path_log_mt5, exist_ok=True)


# Preprocess and tokenize the prompts and target sentences for fine-tuning

In [None]:
def preprocess_tokenization_prompt(example):
    """Tokenize one training example into encoder inputs and decoder labels.

    Args:
        example: A mapping with a ``"text"`` entry (inserted into the prompt)
            and a ``"translation"`` entry (used as the target).

    Returns:
        The tokenized prompt (padded/truncated to 384 tokens) with an extra
        ``"labels"`` entry holding the target token ids, where every padded
        position is replaced by -100.
    """
    prompt = (
        "Trasforma la seguente frase antica in italiano moderno, mantenendo il significato.\n"
        f"Testo antico: {example['text']}"
    )
    model_input = tokenizer(prompt, max_length=384, padding="max_length", truncation=True)
    labels = tokenizer(example["translation"], max_length=384, padding="max_length", truncation=True)
    # Padded label positions must not contribute to the loss: replace them with
    # -100 (the same ignore value the data collator uses for label padding).
    # The attention mask is used instead of comparing against pad_token_id so
    # that the real end-of-sequence token is kept even when the pad token is
    # aliased to EOS.
    model_input["labels"] = [
        token_id if is_real_token else -100
        for token_id, is_real_token in zip(labels["input_ids"], labels["attention_mask"])
    ]
    return model_input


# Tokenize both splits with the same preprocessing function, dropping the
# original columns so only the tokenizer outputs and labels remain.
tokenized_dataset = {
    split_name: dataset[split_name].map(
        preprocess_tokenization_prompt,
        remove_columns=dataset[split_name].column_names,
    )
    for split_name in ("train", "test")
}


# Set up the training arguments

In [None]:
# Training configuration.
# metric_for_best_model / greater_is_better only take effect together with
# load_best_model_at_end, which in turn requires checkpoints to be saved on the
# same schedule as evaluation. Without these, the weights exported after
# trainer.train() are simply those of the last epoch.
training_args = Seq2SeqTrainingArguments(
    output_dir=path_mt5,
    eval_strategy="epoch",
    save_strategy="epoch",          # must match eval_strategy for best-model tracking
    save_total_limit=1,             # keep disk usage bounded (best + most recent checkpoint)
    load_best_model_at_end=True,    # restore the checkpoint with the lowest eval_loss
    learning_rate=2e-4,
    gradient_accumulation_steps=4,  # effective batch size per device: 2 * 4 = 8
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=15,
    predict_with_generate=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,        # lower eval_loss is better
    report_to="none",
    fp16=False,
    seed=42,
    weight_decay=0.05,
    generation_max_length=384,
    generation_num_beams=4
)
# Pads each batch; label padding uses -100 so it is ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Fine-tune the model

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

# Persist the model weights and the tokenizer files side by side in the
# output folder so they can be reloaded together for inference.
model.save_pretrained(save_directory=path_mt5)
tokenizer.save_pretrained(save_directory=path_mt5)

# Run inference with the fine-tuned model

In [None]:
# Reload the fine-tuned tokenizer and model from the output folder.
tokenizer = AutoTokenizer.from_pretrained(path_mt5, use_fast=False, legacy=True)
model = AutoModelForSeq2SeqLM.from_pretrained(path_mt5, device_map="auto")

# Reuse EOS as padding only if the saved tokenizer has no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Keep the model config aligned with the tokenizer's special-token ids.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
# device_map="auto" has already placed the weights, so the model is not moved
# again with .to(device): moving a dispatched model can conflict with that
# placement. Inputs are sent to model.device at generation time.
model.eval()

# Placeholder column; it is overwritten once all translations are generated.
df["generated_translation"] = ""
def traduci_mt5(example):
    """Generate a translation for one input sentence with the loaded model.

    The sentence is inserted into the instruction prompt, tokenized
    (truncated to 384 tokens), and passed to ``model.generate`` with sampling
    enabled on top of 4 beams.

    Args:
        example: The sentence to insert into the prompt.

    Returns:
        The first generated sequence, decoded without special tokens and
        stripped of surrounding whitespace.

    Note:
        ``do_sample=True`` means repeated calls may return different text.
    """
    prompt_text = f"Trasforma la seguente frase antica in italiano moderno, mantenendo il significato.\nTesto antico: {example}"

    # Tokenize, then move the tensors to wherever the model lives.
    encoded = tokenizer(prompt_text, return_tensors="pt", truncation=True, max_length=384)
    encoded = encoded.to(model.device)

    generation_options = dict(
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        num_beams=4,
        use_cache=True,
        early_stopping=True,
    )

    # No gradients are needed for generation.
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_options)

    first_sequence = generated[0]
    decoded = tokenizer.decode(first_sequence, skip_special_tokens=True)
    return decoded.strip()
    
# Translate every sentence of the test set, showing a progress bar.
results = [traduci_mt5(s) for s in tqdm(df["Sentence"].tolist())]

df["generated_translation"] = results

# Save the test set together with the generated translations.
path_translations = "/kaggle/working/translations"
os.makedirs(path_translations, exist_ok=True)
# Join with a path separator so the CSV lands inside the folder created above
# (plain string concatenation produced ".../translationsdataset_with_...csv").
translation_file = os.path.join(path_translations, "dataset_with_translation_ft5.csv")
df.to_csv(translation_file, index=False)