#**Step 1: Installazione, caricamento delle librerie e definizione delle variabili**

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install accelerate -U

#!pip install evaluate
#!pip install bert_score
#!pip install torchmetrics

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [7]:
import pandas as pd
#from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
#from evaluate import load
#from torchmetrics.text import TranslationEditRate

In [3]:
percentuale_dataset = 1
dataset_type = ""
model_name = "it5-large"

#**Step 2: Collegamento a drive**

In [4]:
from google.colab import drive
drive.mount('/content/drive')

# change this to match your path
data_path_models = "./drive/MyDrive/Colab Notebooks/WebNLG-IT/Esperimento 2a/Models"
data_path_data = "./drive/MyDrive/Colab Notebooks/WebNLG-IT/Esperimento 2a/Datasets"

Mounted at /content/drive


#**Step 3: Importazione dei dataset**

In [8]:
class CustomDataset(Dataset):
    def __init__(self, X, y, tokenizer):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        input_text = self.X[idx]
        target_text = self.y[idx]

        input_encoding = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        target_encoding = self.tokenizer(target_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }

# carico il dataset
trainset = pd.read_csv(f'{data_path_data}/WebNLG-IT/train.csv')
devset = pd.read_csv(f'{data_path_data}/WebNLG-IT/dev.csv')

In [9]:
# ottengo un sottoinsieme del dataset di partenza
subset_trainset = trainset.sample(frac=percentuale_dataset, random_state=42)
subset_devset = devset.sample(frac=percentuale_dataset, random_state=42)

# train
X_train = subset_trainset['triple'].values.tolist()
y_train = subset_trainset['sentence'].values.tolist()

# dev
X_val = subset_devset['triple'].values.tolist()
y_val = subset_devset['sentence'].values.tolist()

In [10]:
subset_trainset.head()

Unnamed: 0,triple,sentence
13581,Michele_Marcolini club Vicenza_Calcio A.C._Lum...,Michele Marcolini ha giocato per il Vicenza Ca...
14695,San_Sebastián_de_los_Reyes country Spain Adolf...,L'aeroporto Adolfo Suarez Madrid-Barajas si tr...
9976,Marriott_International foundationPlace Washing...,Marriott International è un inquilino dell'AC ...
2847,(15788)_1993_SB discoverer Roque_de_los_Muchac...,(15788) 1993 SB è stata scoperta dall'Osservat...
23428,Andrew_Rayel associatedBand/associatedMusicalA...,Andrew Rayel è stato associato ai seguenti art...


In [11]:
len(X_train)

35422

#**Step 4: Caricamento del modello**

In [12]:
# Load the T5 model, tokenizer, and configuration
full_model_name = f'gsarti/{model_name}'
model = AutoModelForSeq2SeqLM.from_pretrained(full_model_name)
tokenizer = AutoTokenizer.from_pretrained(full_model_name)
config = AutoConfig.from_pretrained(full_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

In [13]:
# Create the custom datasets
train_dataset = CustomDataset(X_train, y_train, tokenizer)
val_dataset = CustomDataset(X_val, y_val, tokenizer)

In [14]:
# Set up the data collator, training arguments, and trainer
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=tokenizer.pad_token_id
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    #per_device_train_batch_size=2,
    #per_device_eval_batch_size=2,
    num_train_epochs=2,
    logging_dir="./logs",
    evaluation_strategy="steps",  # Calcolo della validation loss ad ogni epoca
    save_strategy="steps",  # Checkpoint del modello ad ogni epoca
    save_total_limit=3,  # Limita il numero di checkpoint salvati
    load_best_model_at_end=True,  # Carica il modello migliore al termine dell'addestramento
    metric_for_best_model="eval_loss",  # Sceglie la metrica per determinare il miglior modello
    greater_is_better=False,  # Indica se un valore più alto della metrica è migliore o no
    logging_steps=500,
    eval_steps=500,  # Numero di passaggi prima di valutare il modello
    # Aggiungi eventuali altri argomenti di training qui
)

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

Step,Training Loss,Validation Loss
500,5.6888,4.20366
1000,3.6915,2.307497
1500,2.0018,0.82213
2000,1.0265,0.585599
2500,0.768,0.46011
3000,0.6448,0.418532
3500,0.6237,0.386528
4000,0.5539,0.36517
4500,0.4973,0.349404
5000,0.4895,0.334422


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=8856, training_loss=1.0954155478266412, metrics={'train_runtime': 11737.7404, 'train_samples_per_second': 6.036, 'train_steps_per_second': 0.754, 'total_flos': 4.081835232775373e+16, 'train_loss': 1.0954155478266412, 'epoch': 2.0})

In [None]:
# salvataggio del modello
model.save_pretrained(f'{data_path_models}/{model_name}_it_2epoch')
tokenizer.save_pretrained(f'{data_path_models}/{model_name}_it_2epoch')

#**Step 5: generazione frasi del modello creato**

In [None]:
model1 = AutoModelForSeq2SeqLM.from_pretrained(f'{data_path_webnlgit_models}/{model_name}_it_2epoch')
tokenizer1 = AutoTokenizer.from_pretrained(f'{data_path_webnlgit_models}/{model_name}_it_2epoch')
print(f'{data_path_webnlgit_models}/{model_name}_it_2epoch')

In [None]:
def generate_text(input_rdf, max_length=128):
  input_text = f"{input_rdf}"
  input_ids = tokenizer1.encode(input_text, return_tensors="pt")

  # Imposta il parametro max_length per controllare la lunghezza dell'output generato
  output_ids = model1.generate(input_ids, max_length=max_length)

  output_text = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
  return output_text

In [None]:
testset = pd.read_csv(f'{data_path_data}/WebNLG-IT/test.csv')

# ottengo un sottoinsieme del testset di partenza
subset_testset = testset.sample(frac=percentuale_dataset, random_state=42)

# test
X_test = subset_testset['triple'].values.tolist()
y_test = subset_testset['sentence'].values.tolist()

len(X_test)

In [None]:
subset_testset.head()

In [None]:
triple = []
predicted = []
actuals = []

i = 0
for tripla in X_test:
  if i < 1010:
    triple.append(tripla)
    prediction = generate_text(tripla, len(y_test[i]) * 1.2)
    predicted.append(prediction)
    actuals.append(y_test[i])

    print(f'{i}/{len(X_test)-1}')
    print('tripla: ', tripla)
    print('actual: ', y_test[i])
    print('predicted: ', prediction)
    print('\n')

    df = pd.DataFrame(list(zip(triple, predicted, actuals)), columns =['triple', 'predicted', 'actuals'])
    df.to_csv(f'{data_path_models}/{model_name}_it_2epoch_decoding.csv', index=False)

  i += 1

