The goal of this project is to use the pretrained RoBERTa transformer as a feature extractor with a custom classification head to determine whether text messages are offensive or not.

In [30]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

import seaborn as sns
import matplotlib.pyplot as plt

import wandb

import sys
from pathlib import Path

# Add src/ to path (once, so imports work)
src_dir = str(Path().resolve().parent / "src")
if src_dir not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(src_dir)

%load_ext autoreload
%autoreload 2
from helper_functions import AttentionPooling, HateSpeechDataset
from helper_functions import train_model, test_model, get_class_distribution, oversample_dataset, undersample_dataset
from models import CustomClassifier, LargeCustomClassifier, BaseCNNClassifier

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
from paths import DATA_CLEANED, DATA_PROCESSED

# Show which directories the project-level path constants resolve to
for caption, directory in (
    ("Cleaned data path:", DATA_CLEANED),
    ("Processed data path:", DATA_PROCESSED),
):
    print(caption, directory)

Cleaned data path: /Project/data/cleaned
Processed data path: /Project/data/processed


In [32]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


## Using RoBERTa as a feature extractor with a custom classification head

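A candidate pretrained model found online: cardiffnlp/twitter-roberta-base-sentiment-latest (https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest)

It is already pretrained on Twitter messages. Note that the code cell below currently loads the generic `roberta-base` checkpoint rather than this model.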

Define which pretrained model is used and initialize the tokenizer.

In [33]:
# Name of the pretrained checkpoint. The same name is later passed to the
# classifier constructor and logged in the wandb run config.
model_name = 'roberta-base'

# Tokenizer loaded for the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Load HASOC dataset for training, validation and testing

Define experiment scope:

In [34]:
# Name of the column holding the target labels. Only the labels of the first
# HASOC task are used here, which is a binary classification.
label = "task_1"

In [35]:
# Load training and test data (tab-separated files)
clean_df = pd.read_csv(DATA_CLEANED / "hasoc_2019_en_train_cleaned.tsv", sep='\t')
# test_df = pd.read_csv(DATA_PROCESSED / "hasoc_2019_en_test.tsv", sep='\t')
test_df = pd.read_csv(DATA_CLEANED / "hasoc_2019_en_test_cleaned.tsv", sep='\t')

# Split clean dataset in training and validation set: 70/30, stratified on the
# label column, with a fixed seed so the split is reproducible.
train_df, val_df = train_test_split(clean_df, test_size=0.3, random_state=42, stratify=clean_df[label])

# Automatically map string labels to integers. The mapping is derived from the
# training split only. The comprehension variable is deliberately NOT called
# `label`, so it cannot be confused with the global column name.
label_list = sorted(train_df[label].unique())
label_map = {class_name: idx for idx, class_name in enumerate(label_list)}

# Encode every split into a NEW frame (avoids SettingWithCopyWarning on the
# frames returned by train_test_split) and fail loudly if a split contains a
# label the mapping does not know, instead of silently producing NaN targets.
encoded_splits = {}
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    encoded = split_df[label].map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(split_df.loc[unmapped, label].astype(str).unique())
        raise ValueError(
            f"{split_name} split contains labels missing from the training split: {unknown}"
        )
    encoded_splits[split_name] = split_df.assign(**{label: encoded.astype("int64")})

train_df, val_df, test_df = (encoded_splits[name] for name in ("train", "val", "test"))

In [36]:
# Decide which technique to use to cope with data imbalance:
#   "class_weighting" - dataset is not touched; classes get weighted depending
#                       on the label/class distribution
#   "oversampling"    - the training split is oversampled
#   "undersampling"   - the training split is undersampled
# Only the training split is ever resampled; validation and test data keep
# their original distribution.
# NOTE(review): the class weights are computed and handed to the model further
# down regardless of this setting - confirm that is intended when resampling.
handling_imbalance = "class_weighting"

if handling_imbalance == 'oversampling':
    # Oversample dataset
    train_df = oversample_dataset(train_df, label)
elif handling_imbalance == 'undersampling':
    # Undersample dataset
    train_df = undersample_dataset(train_df, label)
elif handling_imbalance != 'class_weighting':
    # A typo here would otherwise silently fall through and do nothing.
    raise ValueError(
        f"Unknown handling_imbalance option {handling_imbalance!r}; expected "
        "'class_weighting', 'oversampling' or 'undersampling'"
    )

In [37]:
# Wrap each split in a HateSpeechDataset (same tokenizer and label column for all three)
train_dataset, val_dataset, test_dataset = (
    HateSpeechDataset(split_df, tokenizer, label=label)
    for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; the test loader uses a larger batch size
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=32)

# Report how the classes are distributed in the training split
train_class_distribution = get_class_distribution(train_df, label)
print(train_class_distribution)

{1: 2513, 0: 1583}


In [38]:
# Per-class weights computed by scikit-learn's "balanced" heuristic from the
# encoded training labels
train_labels = train_df[label]
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels,
)

# Same weights as a float32 tensor on the compute device
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

## Training and evaluation of model

In [4]:
# Pooling label that gets recorded in the wandb run config.
# NOTE(review): `pooling` is not passed to the model constructor below, so it
# is only a logged tag here - confirm that "max" matches what the chosen model
# class actually does (the earlier options were cls, mean or attention_pooling).
pooling = "max"

# Initialize model
unfrozen_last_layers = 2
model = BaseCNNClassifier(model_name, class_weights_tensor, device, unfrozen_last_layers).to(device)


# Set learning rate
learning_rate = 5e-4

# Optimize every parameter the model leaves trainable. Passing only
# `model.classifier.parameters()` would leave any backbone layers unfrozen via
# `unfrozen_last_layers` (and any trainable module outside `classifier`)
# without updates.
# NOTE(review): assumes the model freezes layers through `requires_grad=False`
# - confirm in models.py. The backbone now shares the head's learning rate;
# consider a separate, smaller rate for it.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)
epochs = 100

NameError: name 'BaseCNNClassifier' is not defined

In [41]:
# Describe how much of the base model is trainable: True when fully frozen,
# otherwise a short text naming the number of unfrozen layers
if unfrozen_last_layers == 0:
    frozen_base = True
else:
    frozen_base = f"last {unfrozen_last_layers} layers unfrozen"

# Hyperparameters and settings recorded with the run
run_config = {
    "model": model_name,
    "frozen_base": frozen_base,
    "pooling": pooling,
    "classifier_head": type(model).__name__,
    "epochs": epochs,
    "lr": learning_rate,
    "handling_imbalance": handling_imbalance,
}

wandb.init(project="roberta-classifier", config=run_config)

In [2]:
# File name handed to train_model; the testing cell loads its weights from the same name
checkpoint_file = 'best_model_variants.pt'
train_model(model, train_loader, val_loader, optimizer, device, epochs, checkpoint_file)

NameError: name 'train_model' is not defined

## Testing of model

In [3]:
# Load best model. `map_location=device` remaps the stored tensors onto the
# current device, so a checkpoint written on a GPU also loads on a CPU-only
# machine; `weights_only=True` restricts unpickling to tensors/plain containers.
best_state = torch.load("best_model_variants.pt", map_location=device, weights_only=True)
model.load_state_dict(best_state)
# Test model on test set
test_model(model, test_loader, device, phase = "test")

NameError: name 'model' is not defined