In [None]:
#Test script for fine-tuning RobBERT dutch model
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import RobertaTokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel
from torch import cuda
import torch.nn.functional as F
import os
from tqdm import tqdm

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In this first section, the RobBERT Dutch transformer is fine-tuned on a Dutch word-valence dataset.

In [None]:
# Read the "Means" sheet of the workbook, keep the word and its valence rating, and derive
# the regression target as label = (Valence - 1) / 4.
# This lands in the 0-1 range if the ratings lie on a 1-5 scale — TODO confirm against the data.
data = (
    pd.read_excel('All_Valence.xlsx', sheet_name="Means")[['Word', 'Valence']]
    .assign(label=lambda frame: (frame['Valence'] - 1) / 4)
    .loc[:, ['Word', 'label']]
)

In [None]:
# Preview the first rows of the prepared frame (columns: Word, label).
data.head()

In [None]:
# Tokenisation and optimisation settings used by the cells below.
# Length passed as max_length when SentimentData encodes a text.
MAX_LEN = 256
# Batch sizes for the training and the test DataLoader respectively.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 1e-05
# Tokenizer of the pretrained RobBERT checkpoint that is fine-tuned below.
# NOTE(review): `truncation` and `do_lower_case` are supplied at load time here; truncation is
# normally requested per encode call — confirm these keyword arguments have any effect.
tokenizer = RobertaTokenizer.from_pretrained('DTAI-KULeuven/robbert-2023-dutch-base', truncation=True, do_lower_case=True)

In [None]:
class SentimentData(Dataset):
    """Torch dataset that turns (word, label) rows into tokenized regression examples.

    Args:
        dataframe: Frame with a ``Word`` column (the text) and a ``label`` column
            (the numeric regression target).
        tokenizer: Callable tokenizer returning ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` for a single text.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Word
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the example at position ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors of length
            ``max_len``) and ``targets`` (float scalar tensor).
        """
        # A DataLoader hands out positions 0..len-1, so look rows up by position (.iloc).
        # Label-based access would fail or return the wrong row whenever the frame's
        # index is not a clean 0..n-1 range (e.g. after sampling without reset_index).
        text = str(self.text.iloc[index])
        # Collapse runs of whitespace and strip leading/trailing blanks.
        text = " ".join(text.split())

        # padding='max_length' replaces the deprecated pad_to_max_length=True, and
        # truncation=True guarantees a fixed length even for over-long inputs so that
        # the default collate function can stack the examples into a batch.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Fraction of the rows that goes into the training split; the sample is seeded so the
# split is the same on every run.
train_size = 0.8
train_data = data.sample(frac=train_size, random_state=200)

# Every row whose index label was not drawn for training belongs to the test split.
test_data = data.loc[~data.index.isin(train_data.index)].reset_index(drop=True)

# Renumber the training rows from 0 only after the test split has been derived from them.
train_data = train_data.reset_index(drop=True)

In [None]:
# Report the (rows, columns) shape of the full frame and of both splits.
print(f"FULL Dataset: {data.shape}")
print(f"TRAIN Dataset: {train_data.shape}")
print(f"TEST Dataset: {test_data.shape}")

In [None]:
# Wrap both splits in the tokenizing dataset, sharing one tokenizer and one sequence length.
training_set = SentimentData(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = SentimentData(dataframe=test_data, tokenizer=tokenizer, max_len=MAX_LEN)

In [None]:
# Keyword arguments for the training DataLoader.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)

# Keyword arguments for the test DataLoader.
# NOTE(review): the test loader is shuffled as well; evaluation does not need it —
# consider shuffle=False.
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

In [None]:
# Batch the two datasets using the parameter dictionaries defined above.
training_loader = DataLoader(dataset=training_set, **train_params)
testing_loader = DataLoader(dataset=testing_set, **test_params)

In [None]:
class RobertaClass(torch.nn.Module):
    """Pretrained RobBERT encoder followed by a small feed-forward head with one output."""

    def __init__(self):
        super().__init__()
        # Pretrained encoder that is fine-tuned together with the head.
        self.l1 = RobertaModel.from_pretrained('DTAI-KULeuven/robbert-2023-dutch-base')
        # Head: 768 -> 768 -> 1, with dropout in between.
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        # A single output unit, because the model is used for regression.
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return one prediction per input sequence (last dimension of size 1)."""
        encoder_outputs = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # The first element holds the hidden states; use the vector at sequence position 0.
        first_token = encoder_outputs[0][:, 0]
        features = torch.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(features))

In [None]:
# Build the regression model (its constructor loads the pretrained encoder).
model = RobertaClass()
# Move the model to the selected device; as the cell's last expression the returned
# module is also displayed by the notebook.
model.to(device)

In [None]:
# Mean squared error, because the model predicts a continuous score.
loss_function = torch.nn.MSELoss()
# Adam over all model parameters (encoder and head) with the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def calcuate_accuracy(preds, targets):
    """Count the positions at which ``preds`` is exactly equal to ``targets``."""
    matches = preds == targets
    return matches.sum().item()

In [None]:
def train(epoch):
    """Run one training pass over ``training_loader`` and print the mean per-batch loss.

    Relies on the notebook-level ``model``, ``training_loader``, ``loss_function``,
    ``optimizer`` and ``device``.

    Args:
        epoch: Index of the current epoch. Accepted for the caller's loop; not used
            inside the function.

    Returns:
        None.
    """
    tr_loss = 0
    nb_tr_steps = 0
    model.train()
    # Iterating the loader directly (instead of an enumerate object) lets tqdm read
    # its length and show a real progress bar.
    for batch in tqdm(training_loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        # Drop only the trailing size-1 dimension: (batch, 1) -> (batch,).
        # A bare squeeze() would also remove the batch dimension of a single-sample
        # batch, leaving a 0-d tensor that no longer matches the (1,) targets.
        outputs = model(ids, mask, token_type_ids).squeeze(-1)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        nb_tr_steps += 1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # An empty loader yields NaN instead of a ZeroDivisionError.
    epoch_loss = tr_loss / nb_tr_steps if nb_tr_steps else float('nan')
    print(f"Training Loss Epoch: {epoch_loss}")

    return

In [None]:
# Number of full passes over the training data.
EPOCHS = 5
# Each call runs one pass over the training loader and prints that pass's mean batch loss.
for epoch in range(EPOCHS):
    train(epoch)

In [None]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without updating any weights.

    Relies on the notebook-level ``loss_function`` and ``device``.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        testing_loader: Iterable of batches with the keys ``ids``, ``mask``,
            ``token_type_ids`` and ``targets``.

    Returns:
        The mean of the per-batch losses (NaN when the loader yields no batches).
    """
    model.eval()
    tr_loss = 0
    nb_tr_steps = 0
    with torch.no_grad():
        # Iterating the loader directly lets tqdm read its length for the progress bar.
        for batch in tqdm(testing_loader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            # Drop only the trailing size-1 dimension so a single-sample batch keeps
            # its batch dimension and still matches the shape of the targets.
            outputs = model(ids, mask, token_type_ids).squeeze(-1)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            nb_tr_steps += 1

    # An empty loader yields NaN instead of a ZeroDivisionError.
    epoch_loss = tr_loss / nb_tr_steps if nb_tr_steps else float('nan')
    print(f"Validation Loss Epoch: {epoch_loss}")
    return epoch_loss

In [None]:
# Evaluate the fine-tuned model on the test loader; valid() returns the mean of the per-batch losses.
val_loss = valid(model, testing_loader)

In [None]:
# Report the validation loss stored in val_loss.
print(f"Validation Loss: {val_loss}")

In [None]:
# os.path, os.makedirs and open() do not expand '~'. Without expanduser the files would be
# written to a directory literally named '~' below the current working directory.
save_directory = os.path.expanduser('~/RobBERT/valenceModel')

# Create the target directory (and any missing parents); a no-op when it already exists.
os.makedirs(save_directory, exist_ok=True)

# Save the model's state dictionary
model_save_path = os.path.join(save_directory, 'model.pt')
torch.save(model.state_dict(), model_save_path)

# Save the tokenizer next to the weights so both can be reloaded from one directory
tokenizer.save_pretrained(save_directory)

print(f"Model state dictionary saved to {model_save_path}")
print(f"Tokenizer saved to {save_directory}")