In [None]:
#Test script for fine-tuning
import json
import os

import numpy as np
import pandas as pd
import seaborn as sns
import torch
import transformers
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import RobertaModel, RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

In [None]:
from torch import cuda

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

This first part fine-tunes the RobBERT-2023 Dutch transformer model (`DTAI-KULeuven/robbert-2023-dutch-base`) to predict psychological arousal levels.

In [None]:
# Read the "MeanArousalPerWord" sheet of All_Arousal.xlsx into a DataFrame.
data_arousal = pd.read_excel('All_Arousal.xlsx', sheet_name="MeanArousalPerWord")

In [None]:
# Quick sanity check: (number of rows, number of columns) that were loaded.
data_arousal.shape

In [None]:
# Preview the first rows to confirm the columns were read as expected.
data_arousal.head()

In [None]:
# Unrated words would normally be filtered out here; the filter is left disabled because it is not needed for the Excel sheet with per-word mean values.
#data_arousal = data_arousal[data_arousal['Onbekend Woord'] == 0]

In [None]:
# Summary statistics, for a quick look at the value ranges before training.
data_arousal.describe()

In [None]:
# Keep only the text column and the target column, exposing the target
# under the generic name 'label'.
new_data_arousal = data_arousal[['Word', 'Arousal']].rename(columns={'Arousal': 'label'})

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split reproducible.
train_df_arousal, val_df_arousal = train_test_split(new_data_arousal, test_size=0.2, random_state=42)

In [None]:
# Tokenizer loaded from the same pretrained checkpoint that is fine-tuned below.
tokenizer_arousal = RobertaTokenizer.from_pretrained('DTAI-KULeuven/robbert-2023-dutch-base')

In [None]:
def tokenize_function(examples):
    """Tokenize the 'Word' entry of ``examples`` with ``tokenizer_arousal``.

    The tokenizer is called with ``padding="max_length"`` and truncation
    enabled, and its output is returned unchanged.
    """
    words = examples['Word']
    return tokenizer_arousal(words, truncation=True, padding="max_length")

In [None]:
# Tokenize each split in one batch call; padding=True pads every word to the
# longest sequence within that split.
train_encodings_arousal = tokenizer_arousal(
    train_df_arousal['Word'].tolist(),
    padding=True,
    truncation=True,
)
val_encodings_arousal = tokenizer_arousal(
    val_df_arousal['Word'].tolist(),
    padding=True,
    truncation=True,
)

In [None]:
class ArousalDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with arousal targets.

    Args:
        encodings: Mapping from feature name to a per-example sequence;
            ``encodings[key][idx]`` must be convertible with ``torch.tensor``.
        labels: Sequence of numeric targets, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of example ``idx`` plus its target under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # The model is created with num_labels=1 (regression), so the target
        # must be a float tensor. Without an explicit dtype, integer ratings
        # become int64 and numpy float64 values become float64, both of which
        # fail against the model's float32 output in the regression loss.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

In [None]:
# Wrap each split's encodings together with its targets so the Trainer can index them.
train_dataset_arousal = ArousalDataset(
    train_encodings_arousal,
    train_df_arousal['label'].tolist(),
)
val_dataset_arousal = ArousalDataset(
    val_encodings_arousal,
    val_df_arousal['label'].tolist(),
)

In [None]:
# Load the pretrained encoder with a single-output head (num_labels=1).
model_arousal = RobertaForSequenceClassification.from_pretrained(
    'DTAI-KULeuven/robbert-2023-dutch-base',
    num_labels=1,
)
# Move the weights to the selected device.
model_arousal.to(device)

In [None]:
# Training configuration for the arousal run.
# output_dir is specific to this run: the valence run further down otherwise
# writes its checkpoints into the same folder and overwrites these.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args_arousal = TrainingArguments(
    output_dir='./results/arousal',
    evaluation_strategy="epoch",  # evaluate on the validation split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
# Bundle the arousal model, its training configuration and both data splits
# into a Trainer; eval_dataset is the held-out validation split.
trainer_arousal = Trainer(
    model=model_arousal,
    args=training_args_arousal,
    train_dataset=train_dataset_arousal,
    eval_dataset=val_dataset_arousal,
)

In [None]:
# Run fine-tuning with the settings from training_args_arousal.
trainer_arousal.train()

In [None]:
# Evaluate the fine-tuned model on val_dataset_arousal and display the result.
trainer_arousal.evaluate()

In [None]:
# "~" is only expanded by a shell, not by Python, so resolve the home
# directory explicitly. Otherwise the files end up in a literal "~" folder
# inside the current working directory.
arousal_model_dir = os.path.expanduser('~/RobBERT/models/arousal_model')
arousal_tokenizer_dir = os.path.expanduser('~/RobBERT/models/arousal_tokenizer')

# Persist the fine-tuned weights and the tokenizer files.
model_arousal.save_pretrained(arousal_model_dir)
tokenizer_arousal.save_pretrained(arousal_tokenizer_dir)

This next part fine-tunes the same Dutch RobBERT transformer, this time to predict valence.

In [None]:
# Read the "All" sheet of All_Valence.xlsx into a DataFrame.
data_valence = pd.read_excel('All_Valence.xlsx', sheet_name='All')

In [None]:
# Preview the first rows to confirm the columns were read as expected.
data_valence.head()

In [None]:
# Some words are unrated; keep only the rows whose 'Unknown' flag equals 0.
rated_mask = data_valence['Unknown'] == 0
data_valence = data_valence.loc[rated_mask]

In [None]:
# Summary statistics of the filtered data, for a quick look at the value ranges.
data_valence.describe()

In [None]:
# Keep only the text column and the target column, exposing the target
# under the generic name 'label'.
new_data_valence = data_valence[['Word', 'Valence']].rename(columns={'Valence': 'label'})

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split reproducible.
train_df_valence, val_df_valence = train_test_split(new_data_valence, test_size=0.2, random_state=42)

In [None]:
# Tokenizer for the valence run, loaded from the same pretrained checkpoint as the model below.
tokenizer_valence = RobertaTokenizer.from_pretrained('DTAI-KULeuven/robbert-2023-dutch-base')

In [None]:
def tokenize_function(examples):
    """Tokenize the 'Word' entry of ``examples`` with ``tokenizer_valence``.

    The tokenizer is called with ``padding="max_length"`` and truncation
    enabled, and its output is returned unchanged.

    NOTE: this reuses the name of the helper defined in the arousal section,
    so once this cell has run, ``tokenize_function`` refers to the valence
    tokenizer only.
    """
    words = examples['Word']
    return tokenizer_valence(words, truncation=True, padding="max_length")

In [None]:
# Tokenize each split in one batch call; padding=True pads every word to the
# longest sequence within that split.
train_encodings_valence = tokenizer_valence(
    train_df_valence['Word'].tolist(),
    padding=True,
    truncation=True,
)
val_encodings_valence = tokenizer_valence(
    val_df_valence['Word'].tolist(),
    padding=True,
    truncation=True,
)

In [None]:
class ValenceDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with valence targets.

    Args:
        encodings: Mapping from feature name to a per-example sequence;
            ``encodings[key][idx]`` must be convertible with ``torch.tensor``.
        labels: Sequence of numeric targets, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of example ``idx`` plus its target under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # The model is created with num_labels=1 (regression), so the target
        # must be a float tensor. Without an explicit dtype, integer ratings
        # become int64 and numpy float64 values become float64, both of which
        # fail against the model's float32 output in the regression loss.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

In [None]:
# Wrap each split's encodings together with its targets so the Trainer can index them.
train_dataset_valence = ValenceDataset(
    train_encodings_valence,
    train_df_valence['label'].tolist(),
)
val_dataset_valence = ValenceDataset(
    val_encodings_valence,
    val_df_valence['label'].tolist(),
)

In [None]:
# Load a fresh copy of the pretrained encoder with a single-output head (num_labels=1).
model_valence = RobertaForSequenceClassification.from_pretrained(
    'DTAI-KULeuven/robbert-2023-dutch-base',
    num_labels=1,
)
# Move the weights to the selected device.
model_valence.to(device)

In [None]:
# Training configuration for the valence run.
# output_dir is specific to this run: sharing one folder with the arousal run
# would make this run's checkpoints overwrite the arousal checkpoints.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args_valence = TrainingArguments(
    output_dir='./results/valence',
    evaluation_strategy="epoch",  # evaluate on the validation split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
# Bundle the valence model, its training configuration and both data splits
# into a Trainer; eval_dataset is the held-out validation split.
trainer_valence = Trainer(
    model=model_valence,
    args=training_args_valence,
    train_dataset=train_dataset_valence,
    eval_dataset=val_dataset_valence,
)

In [None]:
# Run fine-tuning with the settings from training_args_valence.
trainer_valence.train()

In [None]:
# Evaluate the fine-tuned model on val_dataset_valence and display the result.
trainer_valence.evaluate()

In [None]:
# "~" is only expanded by a shell, not by Python, so resolve the home
# directory explicitly. Otherwise the files end up in a literal "~" folder
# inside the current working directory.
valence_model_dir = os.path.expanduser('~/RobBERT/models/valence_model')
valence_tokenizer_dir = os.path.expanduser('~/RobBERT/models/valence_tokenizer')

# Persist the fine-tuned weights and the tokenizer files.
model_valence.save_pretrained(valence_model_dir)
tokenizer_valence.save_pretrained(valence_tokenizer_dir)

In [None]:
# Read the "MeanEmotionPerWord" sheet of All_Primary_Emotions.xlsx into a DataFrame.
data_emo = pd.read_excel('All_Primary_Emotions.xlsx', sheet_name="MeanEmotionPerWord")