# 3.5 - Models fine-tuning

In [23]:
#import
import pandas as pd

import torch
from torch.optim import AdamW
from torch.nn import BCEWithLogitsLoss
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [24]:
df_cleaned = pd.read_parquet('../data/processed/cleaned_data.parquet', engine='pyarrow')
for index, row in df_cleaned.iterrows():
    if len(row['topics']) == 0:
        df_cleaned.loc[index, 'flag'] = 1
df_cleaned = df_cleaned[~(df_cleaned['flag'] == 1)]

In [25]:
model_name = "nickprock/sentence-bert-base-italian-xxl-uncased"
device = torch.device(
    "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [26]:
mlb = MultiLabelBinarizer()
topic_encoded = mlb.fit_transform(df_cleaned['topics'])

In [27]:
texts = []
labels = []
for i, article_sentences in enumerate(df_cleaned['text chunked']):
    for sentence in article_sentences:
        texts.append(sentence)
        labels.append(topic_encoded[i])

In [28]:
def encode_data(tokenizer, texts, max_length=128):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
inputs = encode_data(tokenizer, texts)

In [29]:
labels = torch.tensor(labels, dtype=torch.float32)

In [30]:
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
dataloader = DataLoader(dataset, batch_size = 32, shuffle = True)

##### [torch Loss Functions] (https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html#torch.nn.BCEWithLogitsLoss)
##### [BCEWithLogitsLoss] (https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html#torch.nn.BCEWithLogitsLoss)

###### BCELoss: Creates a criterion that measures the Binary Cross Entropy between the target and the input probabilities:

###### BCEWithLogitsLoss: This loss combines a Sigmoid layer and the BCELoss in one single class.

In [31]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = len(mlb.classes_))
model.to(device) # move model to device GPU
optimizer = AdamW(model.parameters(), lr = 1e-3)
loss_fn = BCEWithLogitsLoss()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nickprock/sentence-bert-base-italian-xxl-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


---

In [32]:
class EarlyStopping:
    def __init__(self, patience=3, verbose=False, delta=0):
        """
        Args:
            patience (int): Quante epoche attendere dopo l'ultima volta che si è visto migliorare la loss di validazione.
            verbose (bool): Se True, stampa un messaggio per ogni epoch in cui la loss di validazione migliora.
            delta (float): Minimo cambiamento nella loss di validazione per qualificarsi come miglioramento.
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = float('inf')
        self.delta = delta

    def __call__(self, val_loss, model):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Salva il modello quando la loss di validazione diminuisce."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model...')
        torch.save(model.state_dict(), '../models/checkpoints/checkpoint.pt')
        self.val_loss_min = val_loss


In [33]:
# Divisione del dataset in training e validation
train_dataset, val_dataset = train_test_split(dataset, test_size = 0.2, random_state = 42)

train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = 32, shuffle = False)

def train_model(model, train_loader, val_loader, loss_fn, optimizer, epochs, early_stopping, device):
    model.to(device)
    for epoch in range(epochs):
        step = 0
        model.train()
        total_loss = 0
        for batch in train_loader:
            step += 1
            inputs, masks, labels = batch
            inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs, attention_mask = masks)
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            print(f"Epoch: {epoch + 1} - Batch: {step}, Training Loss: {total_loss / len(train_loader)}")
        val_loss = validate(model, val_loader, loss_fn, device)
        print(f"Epoch {epoch + 1} - Batch {step}, Training Loss: {total_loss / len(train_loader)}, Validation Loss: {val_loss}")

        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break

def validate(model, dataloader, loss_fn, device):
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs, masks, labels = batch
            inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)
            outputs = model(inputs, attention_mask = masks)
            loss = loss_fn(outputs.logits, labels)
            val_loss += loss.item()
    return val_loss / len(dataloader)

# Inizializzazione dell'EarlyStopping
early_stopping = EarlyStopping(patience = 5, verbose = True)

In [34]:
train_model(
    model,
    train_loader,
    val_loader,
    loss_fn,
    optimizer,
    epochs = 10,
    early_stopping = early_stopping,
    device = device
)

Epoch: 1 - Batch: 1, Training Loss: 0.0002931067303045472
Epoch: 1 - Batch: 2, Training Loss: 0.0005330798983771607
Epoch: 1 - Batch: 3, Training Loss: 0.0007044225931167603
Epoch: 1 - Batch: 4, Training Loss: 0.0008180482223456969
Epoch: 1 - Batch: 5, Training Loss: 0.0009122861373187297
Epoch: 1 - Batch: 6, Training Loss: 0.00100462429633188
Epoch: 1 - Batch: 7, Training Loss: 0.0010921313469089678
Epoch: 1 - Batch: 8, Training Loss: 0.0011888883983219045
Epoch: 1 - Batch: 9, Training Loss: 0.0012831542993066323
Epoch: 1 - Batch: 10, Training Loss: 0.0013591723198420175
Epoch: 1 - Batch: 11, Training Loss: 0.0014351644116155741
Epoch: 1 - Batch: 12, Training Loss: 0.0015300974793497405
Epoch: 1 - Batch: 13, Training Loss: 0.0016094569184788028
Epoch: 1 - Batch: 14, Training Loss: 0.0017040945391848708
Epoch: 1 - Batch: 15, Training Loss: 0.0018010287241358464
Epoch: 1 - Batch: 16, Training Loss: 0.0018859861687344698
Epoch: 1 - Batch: 17, Training Loss: 0.0019691622017529078
Epoch: 1

In [35]:
model.save_pretrained('../models/nickprock_fine_tuned')
tokenizer.save_pretrained('../models/nickprock_fine_tuned')

('../models/nickprock_fine_tuned/tokenizer_config.json',
 '../models/nickprock_fine_tuned/special_tokens_map.json',
 '../models/nickprock_fine_tuned/vocab.txt',
 '../models/nickprock_fine_tuned/added_tokens.json',
 '../models/nickprock_fine_tuned/tokenizer.json')

In [10]:
from datasets import load_dataset
dataset = load_dataset("stsb_multi_mt", name="it")

In [11]:
import pandas as pd
datasets = pd.DataFrame()
for _, dataset in dataset.items():
    dataset = pd.DataFrame(dataset)
    datasets = pd.concat([datasets, dataset], axis=0). reset_index(drop=True)

In [14]:
datasets.columns

Index(['sentence1', 'sentence2', 'similarity_score'], dtype='object')

In [26]:
datasets[datasets['sentence1'].apply(lambda x: x.find('gravidanza') != -1)]['sentence2'][7940]#[3077]

'Come è stato detto, il problema del formaggio è il potenziale della Listeria.'