In [None]:
#Some tests for EmotioNL dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from torch import cuda
import os
import matplotlib.pyplot as plt
# Opt out of Weights & Biases reporting for the training runs in this notebook.
os.environ["WANDB_DISABLED"] = "true"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Read the tab-separated captions corpus into a DataFrame.
sentences = pd.read_csv(
    '~/RobBERT/corpus_fulltext_captions.txt',
    sep='\t',
)

In [None]:
# First five rows of the frame as loaded, to check the columns that were parsed.
sentences.head(n=5)

In [None]:
# Reduce the frame to the text column and the two emotion scores, under the
# column names the rest of the notebook reads.
# NOTE(review): 'ID' is treated as the column that holds the text — confirm
# against the file header.
column_names = {'ID': 'Text', 'Valence': 'valence_label', 'Arousal': 'arousal_label'}

# `sentences` is overwritten here, so only transform a frame that has not been
# transformed yet: re-running this cell is then a no-op instead of a KeyError
# on the already-renamed 'ID' column.
if not set(column_names.values()).issubset(sentences.columns):
    sentences = sentences[list(column_names)].rename(columns=column_names)

In [None]:
# First five rows again, to check the selected and renamed columns.
sentences.head(n=5)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
train_df_sentences, val_df_sentences = train_test_split(
    sentences,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Tokenizer loaded from the same hub checkpoint as the model used below.
tokenizer = RobertaTokenizer.from_pretrained(
    'DTAI-KULeuven/robbert-2023-dutch-base'
)

In [None]:
# Tokenize each split in a single call. padding=True pads every text of a
# split to that split's longest sequence; truncation=True cuts over-long texts.
train_encodings_sentences = tokenizer(
    list(train_df_sentences['Text']),
    padding=True,
    truncation=True,
)
val_encodings_sentences = tokenizer(
    list(val_df_sentences['Text']),
    padding=True,
    truncation=True,
)

In [None]:
class SentenceDataset(Dataset):
    def __init__(self, encodings, arousal_labels, valence_labels):
        self.encodings = encodings
        self.arousal_labels = arousal_labels
        self.valence_labels = valence_labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['arousal_labels'] = torch.tensor(self.arousal_labels[idx])
        item['valence_labels'] = torch.tensor(self.valence_labels[idx])
        return item

    def __len__(self):
        return len(self.arousal_labels)

In [None]:
train_dataset_sentences = SentenceDataset(train_encodings_sentences, list(train_df_sentences['arousal_label']), list(train_df_sentences['valence_label']))
val_dataset_sentences = SentenceDataset(val_encodings_sentences, list(val_df_sentences['arousal_label']), list(val_df_sentences['valence_label']))

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',  # directory for storing logs
    logging_steps=10,      # log every 10 steps
)

In [None]:
class LossHistoryCallback(TrainerCallback):
    def __init__(self):
        self.train_loss = []
        self.eval_loss = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if 'loss' in logs:
            self.train_loss.append(logs['loss'])
        if 'eval_loss' in logs:
            self.eval_loss.append(logs['eval_loss'])

# Initialize the callback
loss_history = LossHistoryCallback()

In [None]:
model = RobertaForSequenceClassification.from_pretrained('DTAI-KULeuven/robbert-2023-dutch-base', num_labels=1)
model.to(device)

In [None]:
# Bundle the model, the run configuration, both sentence splits and the loss
# recorder into a single Trainer.
trainer_sentences = Trainer(
    args=training_args,
    model=model,  # the checkpoint loaded in the cell above
    callbacks=[loss_history],  # collects the logged losses for the plot below
    train_dataset=train_dataset_sentences,
    eval_dataset=val_dataset_sentences,
)

In [None]:
# Fine-tune on the training split; logging and intermediate evaluation follow
# the settings in `training_args`.
trainer_sentences.train()
# Final evaluation on the validation split. As the last expression of the cell,
# its return value is what the notebook displays.
trainer_sentences.evaluate()

In [None]:
# Draw both loss curves recorded by `loss_history` on one set of axes.
# Each list is plotted against its own index, so the x positions count logged
# entries rather than raw optimizer steps.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(loss_history.train_loss, label='Training Loss')
ax.plot(loss_history.eval_loss, label='Validation Loss')
ax.set_ylim(0, 1)  # clip the y-axis to the [0, 1] range
ax.set_xlabel('Steps')
ax.set_ylabel('Loss')
ax.legend()
ax.set_title('Training and Validation Loss Over Time (Sentences)')
plt.show()

In [None]:
# Load the tab-separated tweets corpus and keep the text plus the two emotion
# scores.
tweets_df = pd.read_csv('~/RobBERT/corpus_fulltext_tweets.txt', sep='\t')
tweets_df = tweets_df[['Text', 'Valence', 'Arousal']]

# Preview the loaded tweets. The previous bare `sentences_df` expression named
# a variable that is not defined in this notebook and raised a NameError.
tweets_df.head()