In [None]:
!pip install datasets
!pip install evaluate
!pip install transformers
!pip install accelerate

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

In [None]:
import os
import numpy as np
import pandas as pd
import datasets
import evaluate
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModel
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
import torch

In [None]:
# Hugging Face checkpoint shared by the tokenizer and the encoder
model_name = 'dbmdz/bert-base-italian-cased'

# Tokenizer matching the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pretrained model loaded through AutoModel; its pooled output feeds the custom head later on
model = AutoModel.from_pretrained(model_name)

In [None]:
# Load the already-tokenized splits produced by the notebook "bert_fine_tuning_coherence_tedx"
train, val, test = (
    datasets.load_from_disk(split_dir)
    for split_dir in (
        '/content/drive/MyDrive/Colab Notebooks/hf_tokenized_train',
        '/content/drive/MyDrive/Colab Notebooks/hf_tokenized_val',
        '/content/drive/MyDrive/Colab Notebooks/hf_tokenized_test',
    )
)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Features previously extracted with BERT (no fine-tuning); they are used to
# pre-train the classification head before the end-to-end fine-tuning.
# The archive is opened once and closed on exit, instead of being re-opened
# (and left open) for every array.
with np.load('/content/drive/My Drive/feat_ex_data.npz') as feat_ex:
    X_train, y_train = feat_ex['X_train'], feat_ex['y_train']
    X_val, y_val = feat_ex['X_val'], feat_ex['y_val']
    X_test, y_test = feat_ex['X_test'], feat_ex['y_test']

In [None]:
# Sanity check: shapes of the train / validation / test feature matrices
tuple(features.shape for features in (X_train, X_val, X_test))

((7200, 768), (800, 768), (800, 768))

In [None]:
class MLP(torch.nn.Module):
  """
  Custom binary classification head.

  Four Linear -> BatchNorm1d -> ReLU -> Dropout stages (768 -> 128 -> 256 -> 512 -> 128),
  a residual connection from the output of the first stage to the output of the
  fourth one, and a final Linear(128, 1) followed by a sigmoid.

  Input:  float tensor of shape (batch, 768).
  Output: float tensor of shape (batch,) with probabilities in [0, 1].

  Attribute names are kept unchanged so previously saved state_dicts still load.
  """
  def __init__(self):
    super().__init__()

    # dense layers
    self.dense1 = nn.Linear(768, 128)
    self.dense2 = nn.Linear(128, 256)
    self.dense3 = nn.Linear(256, 512)
    self.dense4 = nn.Linear(512, 128)

    # output layer (a single unit: binary classification)
    self.out = nn.Linear(128, 1)

    # dropout shared by all hidden stages (it has no parameters)
    self.dropout = nn.Dropout(p=0.5)

    # one batch-norm per hidden stage
    self.batchnorm1 = nn.BatchNorm1d(128)
    self.batchnorm2 = nn.BatchNorm1d(256)
    self.batchnorm3 = nn.BatchNorm1d(512)
    self.batchnorm4 = nn.BatchNorm1d(128)

  def forward(self, x):
    """Return the positive-class probability for every row of `x`."""
    # first stage; its output is also the source of the residual connection
    x = self.dropout(F.relu(self.batchnorm1(self.dense1(x))))
    # no clone needed: none of the following operations is in-place
    residual = x

    # second, third and fourth stage
    x = self.dropout(F.relu(self.batchnorm2(self.dense2(x))))
    x = self.dropout(F.relu(self.batchnorm3(self.dense3(x))))
    x = self.dropout(F.relu(self.batchnorm4(self.dense4(x))))

    # output layer on top of the residual sum
    x = self.out(x + residual)

    # squeeze only the trailing unit dimension: a bare squeeze() would also drop
    # the batch dimension when the batch contains a single sample
    return torch.sigmoid(x).squeeze(-1)

In [None]:
# Instantiate the classification head and move it to the device selected earlier
# (a hard-coded 'cuda' fails on CPU-only runtimes).
mlp = MLP()
mlp.to(device)

MLP(
  (dense1): Linear(in_features=768, out_features=128, bias=True)
  (dense2): Linear(in_features=128, out_features=256, bias=True)
  (dense25): Linear(in_features=256, out_features=512, bias=True)
  (dense3): Linear(in_features=512, out_features=128, bias=True)
  (out): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (batchnorm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm25): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [None]:
class Net(nn.Module):
    """
    Full model used for fine-tuning: a BERT encoder followed by the custom classification head.

    The input and output of `forward` follow the Hugging Face conventions (keyword
    inputs, dict output with 'loss' and 'logits'), so the Hugging Face Trainer can
    drive the fine-tuning.

    Args:
        bert: encoder whose output exposes `pooler_output`; defaults to the
            notebook-level `model`.
        head: module mapping the pooled output to one probability per sample;
            defaults to the notebook-level `mlp`.

    Note: the value stored under 'logits' is whatever the head returns. Since
    BCELoss is applied to it directly, the head must output probabilities in [0, 1].
    """
    def __init__(self, bert=None, head=None):
        super().__init__()
        # fall back to the notebook globals so that `Net()` keeps working as before
        self.bert = model if bert is None else bert
        self.mlp = mlp if head is None else head

        self.criterion = nn.BCELoss()

    def forward(self, input_ids, labels=None, attention_mask=None):
        """
        Run the encoder and the head.

        Returns a dict with 'logits' and, only when `labels` is given, 'loss'
        (the 'loss' key is omitted rather than set to None when there are no labels).
        """
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output

        # the head output is used as is: re-wrapping it with torch.Tensor(...) is unnecessary
        probs = self.mlp(pooled)

        if labels is None:
            return {'logits': probs}

        loss = self.criterion(probs, labels.float())
        return {'loss': loss, 'logits': probs}

In [None]:
from torch.utils.data import DataLoader, TensorDataset
# Convert the feature-extraction arrays to float32 torch tensors.
# torch.tensor(..., dtype=...) replaces the legacy torch.Tensor(...) constructor
# and makes the dtype explicit, so no extra .float() call is needed.
# The validation split is moved to the device up front because it is evaluated
# in one shot at the end of every epoch; train and test go through DataLoaders.
X_train_t = torch.tensor(X_train, dtype=torch.float32)
X_val_t = torch.tensor(X_val, dtype=torch.float32).to(device)
X_test_t = torch.tensor(X_test, dtype=torch.float32)

y_train_t = torch.tensor(y_train, dtype=torch.float32)
y_val_t = torch.tensor(y_val, dtype=torch.float32).to(device)
y_test_t = torch.tensor(y_test, dtype=torch.float32)

In [None]:
# Wrap the train and test tensors into DataLoaders

# training split: shuffled mini-batches of 256 samples
data_train = TensorDataset(X_train_t, y_train_t)
train_dataloader = DataLoader(
    data_train,
    batch_size=256,
    shuffle=True,
    num_workers=2,
)

# test split: one sample per batch
data_test = TensorDataset(X_test_t, y_test_t)
test_dataloader = DataLoader(
    data_test,
    batch_size=1,
    shuffle=True,
    num_workers=2,
)

In [None]:
def get_accuracy(y_true, y_prob, threshold=0.5):
    """
    Fraction of samples whose thresholded probability matches the true label.

    Args:
        y_true: 1-D tensor of ground-truth labels.
        y_prob: tensor of predicted probabilities, same size as `y_true`.
        threshold: a sample is predicted positive when its probability is
            strictly greater than this value (default 0.5).

    Returns:
        The accuracy as a Python float.
    """
    assert y_true.ndim == 1 and y_true.size() == y_prob.size()
    y_hat = y_prob > threshold
    return (y_true == y_hat).sum().item() / y_true.size(0)

In [None]:
# Per-epoch history of the head pre-training (filled by the training loop)
train_loss_list, train_acc_list = [], []
val_loss_list, val_acc_list = [], []

In [None]:
import time

In [None]:
# Loss function and optimizer (only the parameters of the head are optimized)
loss = torch.nn.BCELoss()
opt = torch.optim.Adam(mlp.parameters(), lr=0.001, weight_decay=0.0001)

n_epochs = 35   # number of epochs
batch_size = 256  # informational: the batch size actually used is the one set on train_dataloader
batches_per_epoch = X_train.shape[0] // batch_size

# Early stopping: epochs without validation-accuracy improvement before stopping.
# NOTE(review): this is larger than n_epochs, so early stopping can never trigger as configured.
max_patience = 100

time_ = time.time()
delta_time = 0

best_acc = 0
patience = 0  # consecutive epochs without improvement


# TRAINING LOOP: pre-training of the classification head
for epoch in range(n_epochs):
  mlp.train()

  running_loss_val = 0.0
  running_acc = 0.0
  for i, data in enumerate(train_dataloader):
    # move the batch to the device
    X = data[0].to(device)
    y = data[1].to(device)

    # reset the gradients
    opt.zero_grad()

    # forward
    y_pred = mlp(X)

    # training loss and accuracy
    loss_val = loss(y_pred, y)
    train_acc = get_accuracy(y, y_pred)

    # backprop and optimization step
    loss_val.backward()
    opt.step()

    # running statistics
    running_loss_val += loss_val.item()
    running_acc += train_acc

  n_batches = len(train_dataloader)
  epoch_train_loss = running_loss_val / n_batches
  epoch_train_acc = running_acc / n_batches

  # Validation at the end of every epoch. The head itself (not the BERT encoder)
  # must be in eval mode, otherwise dropout stays active and the batch-norm
  # running statistics are updated with validation data.
  mlp.eval()
  with torch.no_grad():
    y_pred_val = mlp(X_val_t)
    val_loss_val = loss(y_pred_val, y_val_t)
  val_acc = get_accuracy(y_val_t, y_pred_val)

  delta_time = time.time() - time_
  time_ = time.time()
  print(f'epoca: {epoch+1} | t: {round(delta_time, 2)} sec || train_loss: {epoch_train_loss} | train_acc: {epoch_train_acc} || val_loss: {val_loss_val} | val_acc: {val_acc}' )

  train_loss_list.append(epoch_train_loss)
  train_acc_list.append(epoch_train_acc)
  val_loss_list.append(val_loss_val.item())
  val_acc_list.append(val_acc)

  # early stopping on the validation accuracy
  if val_acc > best_acc:
    best_acc = val_acc
    patience = 0
  else:
    patience += 1
    if patience >= max_patience:
      print('BLOCCATO')
      break



  self.pid = os.fork()


epoca: 1 | t: 0.27 sec || train_loss: 0.7393379889685532 | train_acc: 0.5080818965517241 || val_loss: 0.7283404469490051 | val_acc: 0.515
epoca: 2 | t: 0.26 sec || train_loss: 0.7220353114193884 | train_acc: 0.5119881465517241 || val_loss: 0.6875580549240112 | val_acc: 0.57625
epoca: 3 | t: 0.27 sec || train_loss: 0.7041510179125029 | train_acc: 0.5294989224137931 || val_loss: 0.6984705328941345 | val_acc: 0.5225
epoca: 4 | t: 0.26 sec || train_loss: 0.6973651236501234 | train_acc: 0.5367726293103449 || val_loss: 0.7084214687347412 | val_acc: 0.51
epoca: 5 | t: 0.27 sec || train_loss: 0.6950311003060177 | train_acc: 0.5405441810344828 || val_loss: 0.6818770170211792 | val_acc: 0.55125
epoca: 6 | t: 0.26 sec || train_loss: 0.69486193821348 | train_acc: 0.5387931034482759 || val_loss: 0.6906672716140747 | val_acc: 0.53125
epoca: 7 | t: 0.27 sec || train_loss: 0.6915096969440065 | train_acc: 0.5405441810344828 || val_loss: 0.682951033115387 | val_acc: 0.545
epoca: 8 | t: 0.26 sec || train

In [None]:
# Build the full model (BERT encoder + classification head)
net = Net()

In [None]:

# Hyper-parameters of the end-to-end fine-tuning run
num_epochs = 5

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    # evaluate, log and checkpoint once per epoch
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    # optimization
    num_train_epochs=num_epochs,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)



In [None]:
def acc_metric(eval_pred):
  """
  Accuracy callback for the Hugging Face Trainer (`compute_metrics`).

  Args:
      eval_pred: pair `(predictions, labels)`; predictions are probabilities
          and are turned into class predictions with a `>= 0.5` threshold.

  Returns:
      A dict `{"accuracy": <float>}`.

  The accuracy is computed directly with numpy, so the metric is no longer
  re-loaded through `evaluate.load` at every evaluation.
  """
  predictions, labels = eval_pred
  # flatten both arrays so that a trailing unit dimension cannot trigger broadcasting
  predictions = np.asarray(predictions).reshape(-1) >= 0.5
  labels = np.asarray(labels).reshape(-1)

  accuracy = float((predictions == labels).mean())
  print(accuracy)

  return {"accuracy": accuracy}

In [None]:
# Fine-tune the full model (encoder + head) with the Hugging Face Trainer
trainer = Trainer(
    model=net,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    tokenizer=tokenizer,
    compute_metrics=acc_metric,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6871,0.614953,0.665
2,0.5842,0.597185,0.7025
3,0.4895,0.630314,0.67875
4,0.4153,0.632697,0.68875
5,0.3634,0.656265,0.6825


0.665
0.7025
0.67875
0.68875
0.6825


TrainOutput(global_step=2250, training_loss=0.5078917439778646, metrics={'train_runtime': 1248.3776, 'train_samples_per_second': 28.837, 'train_steps_per_second': 1.802, 'total_flos': 0.0, 'train_loss': 0.5078917439778646, 'epoch': 5.0})

## Valutazione del modello

In questo caso la valutazione è stata fatta nello stesso notebook, perché, modificando la struttura del modello, non è più possibile salvarlo attraverso il trainer, che lo salverebbe come se fosse un BERT normale. Per semplicità è stato quindi valutato qui.

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Evaluation on the test set: threshold the predicted probabilities at 0.5
# and compare them with the labels stored in the tokenized test split
y_pred_prob = trainer.predict(test)
y_pred = y_pred_prob.predictions >= 0.5
y_test = test["label"].tolist()

report = classification_report(y_test, y_pred)
print(report)

0.675
              precision    recall  f1-score   support

           0       0.69      0.64      0.66       400
           1       0.66      0.71      0.69       400

    accuracy                           0.68       800
   macro avg       0.68      0.68      0.67       800
weighted avg       0.68      0.68      0.67       800



In [None]:
# Log records kept on the trainer state, saved to disk in the next cell
log_history = trainer.state.log_history

In [None]:
# Persist the training logs to Drive as JSON
import json
with open('/content/drive/My Drive/finetuned_model_CLS_MLP__logs.json', mode='w') as logs_file:
    json.dump(log_history, fp=logs_file)

In [None]:
# Because of the saving issues described above, this model is persisted through its torch state_dict
torch.save(net.state_dict(), '/content/drive/My Drive/finetuned_model_CLS_MLP')