In [None]:
#Test script for possible performance figures
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from torch import cuda
import os
import matplotlib.pyplot as plt

In [None]:

# Switch off Weights & Biases reporting; the loss curves are drawn locally further down.
os.environ.update({"WANDB_DISABLED": "true"})

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:

# Read the "MeanArousalPerWord" sheet and keep only the word and its arousal rating.
data_arousal = pd.read_excel('~/RobBERT/All_Arousal.xlsx', sheet_name="MeanArousalPerWord")
new_data_arousal = (
    data_arousal[['Word', 'Arousal']]
    # The dataset/Trainer code further down expects the target column to be called 'label'.
    .rename(columns={'Arousal': 'label'})
    # A row without a word or without a rating cannot be tokenised / regressed on;
    # dropping it here avoids a tokenizer error or a NaN loss later.
    .dropna(subset=['Word', 'label'])
    # Cells the Excel reader parsed as numbers or booleans must still reach the tokenizer as text.
    .astype({'Word': str})
    .reset_index(drop=True)
)

# Hold out 20% of the words for validation; the fixed seed keeps the split reproducible.
train_df_arousal, val_df_arousal = train_test_split(new_data_arousal, test_size=0.2, random_state=42)

In [None]:

# Tokenizer belonging to the same RobBERT checkpoint that is fine-tuned below.
tokenizer_arousal = RobertaTokenizer.from_pretrained('DTAI-KULeuven/robbert-2023-dutch-base')

# Encode each split in a single call; padding=True pads every word to the longest
# sequence within that call, truncation=True caps over-long inputs.
train_encodings_arousal = tokenizer_arousal(
    train_df_arousal['Word'].tolist(),
    padding=True,
    truncation=True,
)
val_encodings_arousal = tokenizer_arousal(
    val_df_arousal['Word'].tolist(),
    padding=True,
    truncation=True,
)

In [None]:
class ArousalDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with one numeric target per example.

    Args:
        encodings: Mapping-like object exposing ``.items()`` whose values can be
            indexed by example position (e.g. the tokenizer output for a list of words).
        labels: Sequence of numeric regression targets, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the target under ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force float32 explicitly: without a dtype, an integer rating would become an
        # int64 tensor and a numpy float64 scalar a float64 tensor, neither of which
        # matches the float32 prediction in a regression (MSE) loss.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

In [None]:
# Wrap each split's encodings together with its ratings so the Trainer can index them.
train_dataset_arousal = ArousalDataset(
    encodings=train_encodings_arousal,
    labels=train_df_arousal['label'].tolist(),
)
val_dataset_arousal = ArousalDataset(
    encodings=val_encodings_arousal,
    labels=val_df_arousal['label'].tolist(),
)

In [None]:
# Load the pretrained RobBERT checkpoint with a sequence-classification head that has a single output.
# NOTE(review): with num_labels=1 transformers is documented to treat the task as regression
# (MSE loss on float labels) — confirm for the installed version.
model_arousal = RobertaForSequenceClassification.from_pretrained('DTAI-KULeuven/robbert-2023-dutch-base', num_labels=1)
# Move the weights to the device selected earlier; as the cell's last expression this also
# echoes the model architecture in the cell output.
model_arousal.to(device)

In [None]:
# Hyper-parameters for the arousal fine-tuning run, grouped by purpose.
training_args_arousal = TrainingArguments(
    # --- where artefacts go ---
    output_dir='./results',
    logging_dir='./logs',           # directory for storing logs
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- logging / evaluation cadence ---
    logging_steps=10,               # log every 10 steps
    # NOTE(review): no eval_steps is given, so the evaluation interval is whatever the
    # library defaults to (presumably logging_steps) — confirm. Newer transformers
    # releases rename this argument to `eval_strategy`; check the installed version.
    evaluation_strategy="steps",
)

In [None]:
# Callback that collects the logged losses for local visualisation
class LossHistoryCallback(TrainerCallback):
    """Record training and evaluation losses each time the Trainer emits a log entry.

    Both histories are plain lists in the order the values were logged, so they
    can be passed straight to a plotting call.
    """

    def __init__(self):
        self.train_loss = []  # values logged under the 'loss' key
        self.eval_loss = []   # values logged under the 'eval_loss' key

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append any 'loss' / 'eval_loss' value found in ``logs`` to the histories.

        ``logs`` defaults to None in the signature, so an absent or empty payload is
        ignored instead of raising ``TypeError`` on the membership test.
        """
        if not logs:
            return
        if 'loss' in logs:
            self.train_loss.append(logs['loss'])
        if 'eval_loss' in logs:
            self.eval_loss.append(logs['eval_loss'])

# Single shared instance: handed to the Trainer and read back when plotting.
loss_history = LossHistoryCallback()

In [None]:
# Assemble the Trainer: configuration first, then the model, the two data splits,
# and finally the callback that captures the loss history.
trainer_arousal = Trainer(
    args=training_args_arousal,
    model=model_arousal,
    eval_dataset=val_dataset_arousal,
    train_dataset=train_dataset_arousal,
    callbacks=[loss_history],
)

In [None]:
# Fine-tune the model; loss_history fills up through its on_log hook while this runs.
trainer_arousal.train()
# Final pass over the validation split; as the cell's last expression the returned metrics are displayed.
# NOTE(review): this evaluation presumably also goes through the logging callbacks, which would
# append one extra trailing point to loss_history.eval_loss — confirm before reading the plot.
trainer_arousal.evaluate()

In [None]:
# Persist the fine-tuned model and its tokenizer under the user's home directory.
# The '~' has to be expanded explicitly: save_pretrained() takes the path literally and
# would otherwise create a directory named '~' inside the current working directory.
arousal_dir = os.path.expanduser('~/RobBERT/arousal')
model_arousal.save_pretrained(os.path.join(arousal_dir, 'arousal_model'))
tokenizer_arousal.save_pretrained(os.path.join(arousal_dir, 'arousal_tokenizer'))

In [None]:
# Plot the recorded loss curves and write the figure to disk.
# Each x position is one logged value (in logging order), not a raw optimiser step.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(loss_history.train_loss, label='Training Loss')
ax.plot(loss_history.eval_loss, label='Validation Loss')
ax.set_ylim(0, 0.5)  # Adjust y-axis limit
ax.set_xlabel('Steps')
ax.set_ylabel('Loss')
ax.legend()
ax.set_title('Training and Validation Loss Over Time (Adjusted)')

# Save through the figure handle *before* plt.show(): once the figure has been shown
# (and the cell has finished) pyplot no longer has it as the current figure, so a later
# plt.savefig() would write an empty canvas. The '~' must be expanded by hand because
# matplotlib passes the path to the OS unchanged.
plot_path = os.path.expanduser('~/RobBERT/arousal/arousalTrain.png')
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.savefig(plot_path)

plt.show()