#**Step 1: Installazione, caricamento delle librerie e definizione delle variabili**

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install accelerate -U

#!pip install evaluate
#!pip install bert_score
#!pip install torchmetrics

Collecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
#from evaluate import load
#from torchmetrics.text import TranslationEditRate

In [3]:
percentuale_dataset = 1
dataset_type = ""
model_name = "t5-large"

#**Step 2: Collegamento a drive**

In [4]:
from google.colab import drive
drive.mount('/content/drive')

# change this to match your path
data_path_models = "./drive/MyDrive/Colab Notebooks/WebNLG-IT/Models"
data_path_data = "./drive/MyDrive/Colab Notebooks/data"

Mounted at /content/drive


#**Step 3: Importazione dei dataset**

In [6]:
class CustomDataset(Dataset):
    def __init__(self, X, y, tokenizer):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        input_text = self.X[idx]
        target_text = self.y[idx]

        input_encoding = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        target_encoding = self.tokenizer(target_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }

# carico il dataset
trainset = pd.read_csv(f'{data_path_data}/WebNLG-IT/train.csv')
devset = pd.read_csv(f'{data_path_data}/WebNLG-IT/dev.csv')

In [7]:
# ottengo un sottoinsieme del dataset di partenza
subset_trainset = trainset.sample(frac=percentuale_dataset, random_state=42)
subset_devset = devset.sample(frac=percentuale_dataset, random_state=42)

# train
X_train = subset_trainset['data_unit'].values.tolist()
y_train = subset_trainset['sentence'].values.tolist()

# dev
X_val = subset_devset['data_unit'].values.tolist()
y_val = subset_devset['sentence'].values.tolist()

In [8]:
subset_trainset.head()

Unnamed: 0,data_unit,sentence
39562,Mexico language Mexican_Spanish,In Messico una delle lingue parlate è lo spagn...
36480,Belgium leader Philippe_of_Belgium,Il leader del Belgio è Filippo del Belgio.
17867,Super_Capers budget 2000000.0 Super_Capers gro...,Adam West ha avuto un ruolo principale nel fil...
14307,Alvah_Sabin activeYearsStartDate 1853-03-04 Al...,Il servizio attivo di Alvah Sabin è iniziato i...
25736,South_Africa leader Jacob_Zuma,Jacob Zuma è un leader in Sudafrica.


In [9]:
len(X_train)

40081

#**Step 4: Caricamento del modello**

In [10]:
# Load the T5 model, tokenizer, and configuration
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)
config = T5Config.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Create the custom datasets
train_dataset = CustomDataset(X_train, y_train, tokenizer)
val_dataset = CustomDataset(X_val, y_val, tokenizer)

In [12]:
# Set up the data collator, training arguments, and trainer
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=tokenizer.pad_token_id
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    #per_device_train_batch_size=2,
    #per_device_eval_batch_size=2,
    num_train_epochs=2,
    logging_dir="./logs",
    evaluation_strategy="steps",  # Calcolo della validation loss ad ogni epoca
    save_strategy="steps",  # Checkpoint del modello ad ogni epoca
    save_total_limit=3,  # Limita il numero di checkpoint salvati
    load_best_model_at_end=True,  # Carica il modello migliore al termine dell'addestramento
    metric_for_best_model="eval_loss",  # Sceglie la metrica per determinare il miglior modello
    greater_is_better=False,  # Indica se un valore più alto della metrica è migliore o no
    logging_steps=500,
    eval_steps=500,  # Numero di passaggi prima di valutare il modello
    # Aggiungi eventuali altri argomenti di training qui
)

In [13]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

Step,Training Loss,Validation Loss
500,0.865,0.323472
1000,0.3568,0.261562
1500,0.3024,0.232108
2000,0.2737,0.215651
2500,0.255,0.20167
3000,0.2406,0.193568
3500,0.2303,0.186847
4000,0.2271,0.179606
4500,0.2167,0.175368
5000,0.2092,0.171288


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=10022, training_loss=0.25083388684633795, metrics={'train_runtime': 7899.8906, 'train_samples_per_second': 10.147, 'train_steps_per_second': 1.269, 'total_flos': 4.3388630335488e+16, 'train_loss': 0.25083388684633795, 'epoch': 2.0})

In [14]:
# salvataggio del modello
model.save_pretrained(f'{data_path_models}/{model_name}_2epoch')
tokenizer.save_pretrained(f'{data_path_models}/{model_name}_2epoch')

('./drive/MyDrive/Colab Notebooks/WebNLG-IT/Models/t5-large_2epoch/tokenizer_config.json',
 './drive/MyDrive/Colab Notebooks/WebNLG-IT/Models/t5-large_2epoch/special_tokens_map.json',
 './drive/MyDrive/Colab Notebooks/WebNLG-IT/Models/t5-large_2epoch/spiece.model',
 './drive/MyDrive/Colab Notebooks/WebNLG-IT/Models/t5-large_2epoch/added_tokens.json')

#**Step 5: generazione frasi del modello creato**

In [15]:
model1 = T5ForConditionalGeneration.from_pretrained(f'{data_path_models}/{model_name}_2epoch')
tokenizer1 = T5Tokenizer.from_pretrained(f'{data_path_models}/{model_name}_2epoch')
print(f'{data_path_models}/{model_name}_2epoch')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


./drive/MyDrive/Colab Notebooks/WebNLG-IT/Models/t5-large_2epoch


In [16]:
def generate_text(input_rdf, max_length=128):
  input_text = f"{input_rdf}"
  input_ids = tokenizer1.encode(input_text, return_tensors="pt")

  # Imposta il parametro max_length per controllare la lunghezza dell'output generato
  output_ids = model1.generate(input_ids, max_length=max_length)

  output_text = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
  return output_text

In [17]:
testset = pd.read_csv(f'{data_path_data}/WebNLG-IT/test.csv')

# ottengo un sottoinsieme del testset di partenza
subset_testset = testset.sample(frac=percentuale_dataset, random_state=42)

# test
X_test = subset_testset['data_unit'].values.tolist()
y_test = subset_testset['sentence'].values.tolist()

len(X_test)

1000

In [18]:
subset_testset.head()

Unnamed: 0,data_unit,sentence
521,Peter_Stöger club FC_Admira_Wacker_Mödling,La squadra di Peter Stöger è l'FC Admira Wacke...
737,Arem-arem ingredient Banana_leaf,La foglia di banano è un ingrediente di Arem a...
740,Weymouth_Sands followedBy Maiden_Castle_(novel...,"A Glastonbury Romance"" fu seguito da ""Weymouth..."
660,Christian_Panucci club Genoa_C.F.C. Christian_...,L'A.S. Livorno Calcio è diretto da Christian P...
411,Chicharrón region Andalusia,Il chicharrón si trova nella regione dell'Anda...


In [19]:
triple = []
predicted = []
actuals = []

i = 0
for tripla in X_test:
  triple.append(tripla)
  prediction = generate_text(tripla)
  predicted.append(prediction)
  actuals.append(y_test[i])

  print(f'{i}/{len(X_test)-1}')
  print('tripla: ', tripla)
  print('actual: ', y_test[i])
  print('predicted: ', prediction)
  print('\n')

  df = pd.DataFrame(list(zip(triple, predicted, actuals)), columns =['triple', 'predicted', 'actuals'])
  df.to_csv(f'{data_path_models}/{model_name}_2epoch_decoding.csv', index=False)

  i += 1



[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m


167/999
tripla:  Andrews_County_Airport runwayLength 1773.0
actual:  La lunghezza della pista dell'aeroporto di Andrews County è di 1773 metri.
predicted:  La lunghezza della pista dell'aeroporto della contea di Andrews è di 1773,0 metri.


168/999
tripla:  Alderney_Airport runwayLength 497.0
actual:  L'aeroporto di Alderney ha una pista di 497,0 metri.
predicted:  La lunghezza della pista dell'aeroporto di Alderney è di 497,0 metri.


169/999
tripla:  A.C._Chievo_Verona ground "Verona, Italy" A.C._Chievo_Verona league Serie_A
actual:  Verona, Italia è la sede dell'A.C. Chievo Verona, che milita nel campionato di Serie A.
predicted:  L'A.C. Chievo Verona gioca in Serie A e il suo campo si trova a Verona, in Italia.


170/999
tripla:  Auron_(comicsCharacter) creator Karl_Kesel Auron_(comicsCharacter) creator Walt_Simonson
actual:  Il personaggio dei fumetti Auron è stato creato sia da Karl Kesel che da Walt Simonson.
pred