# **Step 1: Installazione, caricamento delle librerie e definizione delle variabili**

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install accelerate -U

#!pip install evaluate
#!pip install bert_score
#!pip install torchmetrics

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
#from evaluate import load
#from torchmetrics.text import TranslationEditRate

In [None]:
# Checkpoint name passed to every `from_pretrained` call and embedded in the
# names of the saved model folder and decoding CSV.
model_name = "t5-large"

# Fraction of each CSV split kept by `DataFrame.sample` (1 keeps every row).
percentuale_dataset = 1

# Not referenced by the other visible cells of this notebook.
dataset_type = ""

# **Step 2: Collegamento a Drive**

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Change these to match your path.
# Both locations are relative to the current working directory.
data_path_data = "./drive/MyDrive/Colab Notebooks/WebNLG-IT/Esperimento 1a/Datasets"
data_path_models = "./drive/MyDrive/Colab Notebooks/WebNLG-IT/Esperimento 1a/Models"

# **Step 3: Importazione dei dataset**

In [None]:
class CustomDataset(Dataset):
    """Pairs of (input text, target text) tokenized on access for seq2seq training.

    Args:
        X: Indexable collection of input texts.
        y: Indexable collection of target texts, aligned with ``X``.
        tokenizer: Callable tokenizer that accepts ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` and returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors of shape (1, max_length).
        max_length: Length every input and target is padded/truncated to.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same length.
    """

    def __init__(self, X, y, tokenizer, max_length=128):
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def _encode(self, text):
        # Fixed-length encoding so every item has the same shape.
        return self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        input_encoding = self._encode(self.X[idx])
        target_encoding = self._encode(self.y[idx])

        # squeeze(0) drops only the batch dimension added by return_tensors='pt'.
        target_ids = target_encoding['input_ids'].squeeze(0)
        target_mask = target_encoding['attention_mask'].squeeze(0)

        # Padded target positions are set to -100 so the loss ignores them;
        # leaving the pad token id there would train the model on padding.
        labels = target_ids.masked_fill(target_mask == 0, -100)

        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': labels
        }

# Load the training and development splits from the dataset folder.
train_csv_path = f'{data_path_data}/WebNLG-IT/train.csv'
dev_csv_path = f'{data_path_data}/WebNLG-IT/dev.csv'

trainset = pd.read_csv(train_csv_path)
devset = pd.read_csv(dev_csv_path)

In [None]:
# Draw the configured fraction of each split; the fixed seed keeps the
# selection (and its shuffled order) identical across runs.
subset_trainset = trainset.sample(frac=percentuale_dataset, random_state=42)
subset_devset = devset.sample(frac=percentuale_dataset, random_state=42)

# Inputs come from the 'triple' column, targets from the 'sentence' column.
X_train, y_train = (
    subset_trainset[column].values.tolist() for column in ('triple', 'sentence')
)
X_val, y_val = (
    subset_devset[column].values.tolist() for column in ('triple', 'sentence')
)

In [None]:
# Preview the first rows of the sampled training split.
subset_trainset.head()

In [None]:
# Number of training inputs after sampling.
len(X_train)

# **Step 4: Caricamento del modello**

In [None]:
# Tokenizer, configuration and pretrained weights are all resolved from the
# same checkpoint name. `config` is not used by the other visible cells.
tokenizer = T5Tokenizer.from_pretrained(model_name)
config = T5Config.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Wrap the training and validation text pairs so they are tokenized on access.
train_dataset = CustomDataset(X=X_train, y=y_train, tokenizer=tokenizer)
val_dataset = CustomDataset(X=X_val, y=y_val, tokenizer=tokenizer)

In [None]:
# Data collator that batches the tokenized examples for the seq2seq model.
# Label padding uses -100, the index the loss ignores; padding labels with the
# tokenizer's pad token id would make padded positions count towards the loss.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
)

# Logging, evaluation and checkpointing share one step interval.
# `load_best_model_at_end` needs checkpoints to be written on evaluation steps,
# so `save_steps` is set explicitly instead of relying on the library default.
steps_per_eval = 500

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # Batch sizes are left at the library defaults; uncomment to override.
    #per_device_train_batch_size=2,
    #per_device_eval_batch_size=2,
    num_train_epochs=2,
    logging_dir="./logs",
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",  # Compute the validation loss every `eval_steps` steps
    save_strategy="steps",  # Write a checkpoint every `save_steps` steps
    save_total_limit=3,  # Cap on the number of checkpoints kept on disk
    load_best_model_at_end=True,  # Reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",  # Metric used to pick the best checkpoint
    greater_is_better=False,  # Lower eval_loss is better
    logging_steps=steps_per_eval,
    eval_steps=steps_per_eval,
    save_steps=steps_per_eval,
    # Add any other training arguments here
)

In [None]:
# Assemble the trainer from the model, its data, and the training configuration.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

In [None]:
# Save the fine-tuned model and its tokenizer side by side in one folder.
finetuned_dir = f'{data_path_models}/{model_name}_it_2epoch'
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

# **Step 5: Generazione delle frasi con il modello creato**

In [None]:
# Reload the fine-tuned model and tokenizer from the saved folder, then echo
# the folder path that was used.
saved_model_dir = f'{data_path_models}/{model_name}_it_2epoch'
model1 = T5ForConditionalGeneration.from_pretrained(saved_model_dir)
tokenizer1 = T5Tokenizer.from_pretrained(saved_model_dir)
print(saved_model_dir)

In [None]:
def generate_text(input_rdf, max_length=128):
    """Generate a sentence for one input with the reloaded model.

    Uses the module-level ``tokenizer1`` and ``model1``.

    Args:
        input_rdf: Input to verbalize; it is converted to ``str`` before tokenization.
        max_length: Upper bound on the generated sequence length. Non-integer
            values (e.g. a length multiplied by a float factor) are truncated
            to ``int`` before being passed to ``generate``.

    Returns:
        The decoded output text with special tokens removed.
    """
    input_text = str(input_rdf)
    encoding = tokenizer1(input_text, return_tensors="pt")

    # Run on whatever device the model lives on, so the function keeps working
    # if the model is moved to an accelerator.
    device = model1.device
    output_ids = model1.generate(
        input_ids=encoding["input_ids"].to(device),
        attention_mask=encoding["attention_mask"].to(device),
        max_length=int(max_length),
    )

    return tokenizer1.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# Load the test split and sample it with the same fraction and seed as the
# other splits.
test_csv_path = f'{data_path_data}/WebNLG-IT/test.csv'
testset = pd.read_csv(test_csv_path)
subset_testset = testset.sample(frac=percentuale_dataset, random_state=42)

# Inputs come from the 'triple' column, references from the 'sentence' column.
X_test, y_test = (
    subset_testset[column].values.tolist() for column in ('triple', 'sentence')
)

# Number of test inputs after sampling (shown as the cell output).
len(X_test)

In [None]:
# Preview the first rows of the sampled test split.
subset_testset.head()

In [None]:
# Decode the test inputs with the fine-tuned model and store, for each one,
# the input, the prediction and the reference sentence.

# Upper bound on how many test examples are decoded in this run.
MAX_DECODED_EXAMPLES = 1010
decoding_csv_path = f'{data_path_models}/{model_name}_it_2epoch_decoding.csv'

triple = []
predicted = []
actuals = []

for i, (tripla, actual) in enumerate(zip(X_test[:MAX_DECODED_EXAMPLES], y_test)):
    # NOTE(review): the generation cap is derived from the reference sentence
    # (1.2x its character count), i.e. the gold output is used at inference
    # time - confirm this is intended. The value is truncated to int because
    # the generation length limit is a token count.
    prediction = generate_text(tripla, int(len(actual) * 1.2))

    triple.append(tripla)
    predicted.append(prediction)
    actuals.append(actual)

    print(f'{i}/{len(X_test)-1}')
    print('tripla: ', tripla)
    print('actual: ', actual)
    print('predicted: ', prediction)
    print('\n')

    # Rewrite the CSV after every example so partial results are already on
    # disk if the session is interrupted.
    df = pd.DataFrame({'triple': triple, 'predicted': predicted, 'actuals': actuals})
    df.to_csv(decoding_csv_path, index=False)

