# Hate Speech Classification with a Long Short-Term Memory Model

The Long Short-Term Memory (LSTM) model is an improvement on the Recurrent Neural Network (RNN) architecture. An RNN processes data sequentially and updates its parameters using a variation of backpropagation known as backpropagation through time (BPTT). BPTT unrolls the time steps, applies backpropagation and rolls the recurrent structure back up. RNNs struggle with long sequences of data as they aim to garner a representation of the input sequence by processing it element-wise. Over time this makes them unstable and inefficient as they commonly suffer from vanishing/exploding gradients, halting the learning process.

LSTMs work differently. Each LSTM module contains a cell state and a hidden state. The cell state allows a representation of the data to run through the model and undergo updates via linear instructions determined by internal gates. There is a forget gate, used to discard information deemed unimportant, an input gate, to add new information and an output gate, finalising the state of the cell and providing input for the next. By maintaining a consistent cell state, gradients flow easily through the network, mitigating the vanishing/exploding gradient problem. Compared to RNNs, LSTMs have significantly better stability and memory. 

## Imports

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch import nn
from torchtext.data.utils import get_tokenizer
from collections import Counter
import torch.optim as optim
import torch.nn.utils.rnn as rnn_utils
import matplotlib.pyplot as plt
import numpy as np
import time
from torchtext.vocab import Vocab
import json
import seaborn as sns
import torchtext
from wordcloud import WordCloud
import re
import random

sns.set()

## Data PreProcessing

All datasets come with their own format. This cell is used to standardise them and strip them of unnecessary columns. They are converted simply to `label` and `text` columns.

In [None]:
"""
Dataset: https://www.kaggle.com/datasets/kazanova/sentiment140
"""

# Sentiment140 has no header row, so column names are assigned by hand and
# everything except the label and the tweet text is discarded.
labelled_text = pd.read_csv('Data/Unprocessed/training.1600000.processed.noemoticon.csv', encoding='latin', header=None)
labelled_text.columns = ['label', 'id', 'date', 'query', 'user_id', 'text']
labelled_text = labelled_text.drop(columns=['id', 'date', 'query', 'user_id'])

# Recode the polarity column: 0 becomes 1 (the positive class of this notebook),
# 2 and 4 become 0. The column is reassigned instead of calling
# `mask(..., inplace=True)` on `labelled_text['label']`, because that chained
# in-place call does not write back to the DataFrame when pandas Copy-on-Write
# is active. Values outside the mapping are left untouched, as before.
sentiment140_labels = {0: 1, 2: 0, 4: 0}
labelled_text['label'] = labelled_text['label'].map(lambda value: sentiment140_labels.get(value, value))

labelled_text.to_csv('Data/Processed/labelled_text.csv')

"""
Dataset: https://hasocfire.github.io/hasoc/2021/dataset.html
"""

# HASOC is tab separated; only the text and the fine-grained `task_2` label
# are kept, with `task_2` renamed to the shared `label` column.
labelled_text_2 = pd.read_csv('Data/Unprocessed/hasoc_english_dataset.tsv', delimiter='\t')
labelled_text_2 = labelled_text_2.drop(columns=['text_id', 'task_1', 'task_3'])
labelled_text_2 = labelled_text_2.rename(columns={'task_2': 'label'})

# HATE / OFFN / PRFN collapse into class 1, NONE into class 0. The column is
# reassigned rather than masked in place through `labelled_text_2['label']`,
# since that chained in-place update is not applied to the DataFrame under
# pandas Copy-on-Write. Unknown tags are passed through unchanged.
hasoc_labels = {'HATE': 1, 'OFFN': 1, 'PRFN': 1, 'NONE': 0}
labelled_text_2['label'] = labelled_text_2['label'].map(lambda value: hasoc_labels.get(value, value))

labelled_text_2.to_csv('Data/Processed/labelled_text_2.csv')

"""
Dataset: https://figshare.com/articles/dataset/Labelled_Hate_Speech_Detection_Dataset_/19686954
"""

# Load the dataset, discard the `Platform` column and bring the two remaining
# columns in line with the shared `text` / `label` naming.
labelled_text_3 = (
    pd.read_csv('Data/Unprocessed/HateSpeechDetection.csv')
    .drop(columns=['Platform'])
    .rename(columns={'Comment': 'text', 'Hateful': 'label'})
)

labelled_text_3.to_csv('Data/Processed/labelled_text_3.csv')

"""
Dataset: https://zenodo.org/record/3706866
"""

# Keep only the tweet text and its label, renamed to the shared column names.
labelled_text_4 = pd.read_csv('Data/Unprocessed/hatespeech_text_label_vote_RESTRICTED_100K.csv')
labelled_text_4 = labelled_text_4.drop(columns=['Votes for the majority label'])
labelled_text_4 = labelled_text_4.rename(columns={'Tweet text': 'text', 'Label': 'label'})

# normal / spam become class 0, abusive / hateful become class 1. The column is
# reassigned because a chained `labelled_text_4['label'].mask(..., inplace=True)`
# does not update the DataFrame under pandas Copy-on-Write. Unknown values are
# passed through unchanged.
vote_labels = {'normal': 0, 'spam': 0, 'abusive': 1, 'hateful': 1}
labelled_text_4['label'] = labelled_text_4['label'].map(lambda value: vote_labels.get(value, value))

# Strip the retweet marker only when it is a word of its own; a plain substring
# replace also deleted "RT" from inside upper-case words such as "START".
labelled_text_4['text'] = labelled_text_4['text'].str.replace(r'\bRT\b', '', regex=True)

labelled_text_4.to_csv('Data/Processed/labelled_text_4.csv')

"""
Dataset: https://www.kaggle.com/datasets/ashwiniyer176/toxic-tweets-dataset
"""

# The file carries a saved index column ("Unnamed: 0") that is discarded; the
# remaining columns are renamed to the shared `label` / `text` names.
labelled_text_5 = pd.read_csv('Data/Unprocessed/FinalBalancedDataset.csv')
labelled_text_5 = labelled_text_5.drop(columns=['Unnamed: 0'])
labelled_text_5 = labelled_text_5.rename(columns={'Toxicity': 'label', 'tweet': 'text'})

# Remove the stray 'ð' characters from the tweets.
cleaned_tweets = labelled_text_5['text'].str.replace('ð', '')
labelled_text_5['text'] = cleaned_tweets

labelled_text_5.to_csv('Data/Processed/labelled_text_5.csv')

"""
Dataset: https://www.kaggle.com/datasets/cosmos98/twitter-and-reddit-sentimental-analysis-dataset
"""

labelled_text_6 = pd.read_csv('Data/Unprocessed/Reddit_Data.csv')
labelled_text_6 = labelled_text_6.rename(columns={'clean_comment': 'text', 'category': 'label'})

# Rows labelled 0 are discarded before the remaining labels are recoded.
labelled_text_6 = labelled_text_6[labelled_text_6['label'] != 0]

# read_csv loads empty fields as NaN, so comparing against '' alone never
# removes them. Drop missing comments first, then whitespace-only ones.
labelled_text_6 = labelled_text_6.dropna(subset=['text'])
labelled_text_6 = labelled_text_6[labelled_text_6['text'].astype(str).str.strip() != ''].copy()

# 1 becomes class 0 and -1 becomes class 1. The column is reassigned because a
# chained `labelled_text_6['label'].mask(..., inplace=True)` is not written back
# to the DataFrame under pandas Copy-on-Write.
reddit_labels = {1: 0, -1: 1}
labelled_text_6['label'] = labelled_text_6['label'].map(lambda value: reddit_labels.get(value, value))

labelled_text_6.to_csv('Data/Processed/labelled_text_6.csv')

"""
Concatenate datasets
"""

# ignore_index gives the combined frame a fresh, unique 0..n-1 index. Each
# source frame starts at 0, so without it the labels repeat and any later
# label-based `drop(...)` removes several rows per label.
processed_data = pd.concat(
    [labelled_text, labelled_text_2, labelled_text_3, labelled_text_4, labelled_text_5, labelled_text_6],
    ignore_index=True,
)

# Remove link and HTML-entity residue. The previous plain substring replaces
# ('t', 'co', 'amp', 'quo') deleted those characters from inside every word
# (e.g. "the" -> "he"), corrupting the text before the embedding lookup.
noise_patterns = [
    r'https?://\S+',                            # complete links
    r'&(?:amp|quot);',                          # HTML entities
    r'(?<!\S)(?:https|t|co|amp|quot?)(?!\S)',   # the same fragments left as stand-alone tokens
]
for pattern in noise_patterns:
    processed_data['text'] = processed_data['text'].str.replace(pattern, '', regex=True)

# save all data
processed_data.to_csv('Data/processed_data.csv')
print(len(processed_data))
processed_data.head()

## Create Datasets

Splitting the total data into training, validation and test sets.

As the dataset is very large (large enough to keep my laptop out of commission for a couple of days), I am splitting it into 4 datasets and training the model periodically until all are done.

In [None]:
# Shuffle once and discard the incoming index, then cut by position.
# Splitting with `drop(sample.index)` is only correct when index labels are
# unique; on a concatenated frame with repeated labels it removes extra rows.
shuffled = processed_data.sample(frac=1).reset_index(drop=True)

midpoint = round(len(shuffled) * 0.5)
first_half = shuffled.iloc[:midpoint]
second_half = shuffled.iloc[midpoint:]

first_cut = round(len(first_half) * 0.5)
second_cut = round(len(second_half) * 0.5)

# Each quarter gets its own 0..n-1 index.
data_1 = first_half.iloc[:first_cut].reset_index(drop=True)
data_2 = first_half.iloc[first_cut:].reset_index(drop=True)
data_3 = second_half.iloc[:second_cut].reset_index(drop=True)
data_4 = second_half.iloc[second_cut:].reset_index(drop=True)

quarters = {'data_1': data_1, 'data_2': data_2, 'data_3': data_3, 'data_4': data_4}

for quarter in quarters.values():
    print(len(quarter))
print()

for quarter_name, quarter in quarters.items():
    quarter.to_csv(f'Data/Split Datasets/{quarter_name}.csv')

for quarter in quarters.values():
    print(quarter.head())

In [None]:
def create_splits(data, random_state=None):
    """Split a DataFrame into training, validation and test partitions.

    20% of the rows are held out for testing; the remaining 80% are divided
    80/20 into training and validation (64% / 16% / 20% overall).

    Rows are shuffled once and then cut by position, so the partitions are
    disjoint and cover every row even when ``data`` has repeated index labels.
    (Removing a sample with ``drop(sample.index)`` deletes every row sharing a
    label with the sample.)

    Args:
        data (pd.DataFrame): Rows to split.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the shuffle reproducible. ``None`` keeps the split random.

    Returns:
        tuple: ``(train_data, val_data, test_data)`` DataFrames that keep the
        index labels of ``data``.
    """
    shuffled = data.sample(frac=1.0, random_state=random_state)
    n_train_val = round(len(shuffled) * 0.8)
    n_train = round(n_train_val * 0.8)

    train_data = shuffled.iloc[:n_train]
    val_data = shuffled.iloc[n_train:n_train_val]
    test_data = shuffled.iloc[n_train_val:]
    return train_data, val_data, test_data

def dataset_details(dataset, data_name, set_name):
    """Print a small table summarising the label balance of a dataset.

    Args:
        dataset (pd.DataFrame): Frame with a ``label`` column; labels ``0`` and
            ``1`` are counted.
        data_name (str): Name of the data chunk, shown in the table header.
        set_name (str): Name of the split, shown in the table header. When it
            equals ``'Testing'`` extra blank lines are printed after the table.

    Returns:
        None. The table is written to standard output.
    """
    num_samples = len(dataset)
    # Count the labels once and reuse the result for both classes.
    label_counts = Counter(dataset['label'].tolist())
    num_label_0 = label_counts[0]
    num_label_1 = label_counts[1]
    # An empty dataset would otherwise raise ZeroDivisionError.
    split_percent = num_label_1 / num_samples * 100 if num_samples else 0.0

    border = '*' + '-' * 19 + '*'
    print(border)
    print(f'|      {data_name:11}  |')
    print(f'|      {set_name:11}  |')
    print('|' + '-' * 19 + '|')
    print(f'| Samples : {num_samples:7} |')
    print(border)
    print(f'| Neutral : {num_label_0:6}  |')
    print(f'| Hate    : {num_label_1:6}  |')
    print(f'| Split   : {split_percent:.2f}%  |')
    print(border)
    if set_name == 'Testing':
        print('\n\n')

In [None]:
from pathlib import Path

# Split every quarter into train/validation/test, print the class balance of
# each part and save the parts to 'Data/Split Datasets/<folder>/'.
#
# The frames are written with index=False. `DataFrame.to_csv` has no
# `index_col` argument (that belongs to `read_csv`), so the previous calls
# raised TypeError. Dropping the index also keeps the files to the label and
# text columns, with no extra unnamed index column when they are read back.
#
# The split frames are named *_df so the cell does not rebind `train`, which is
# also the name of the training function defined elsewhere in this notebook.
for data_name, folder, subset in (
    ('Data 1', 'data_1', data_1),
    ('Data 2', 'data_2', data_2),
    ('Data 3', 'data_3', data_3),
    ('Data 4', 'data_4', data_4),
):
    train_df, val_df, test_df = create_splits(subset)
    dataset_details(train_df, data_name=data_name, set_name='Training')
    dataset_details(val_df, data_name=data_name, set_name='Validation')
    dataset_details(test_df, data_name=data_name, set_name='Testing')

    output_dir = Path('Data/Split Datasets') / folder
    # Create the per-quarter folder on first use.
    output_dir.mkdir(parents=True, exist_ok=True)
    train_df.to_csv(output_dir / 'train.csv', index=False)
    val_df.to_csv(output_dir / 'val.csv', index=False)
    test_df.to_csv(output_dir / 'test.csv', index=False)

## Collate Batches and Initialise DataLoaders

Collates `label`/`text` pairs into tuples, where the text is transformed into its GloVe embedding. Sequences are padded and tensors are moved to the GPU. 

Batches are padded so they are all the same length. 

These functions are called when DataLoaders are initialised to shuffle the data each epoch and process the batches using the pipeline described above.


In [None]:
def collate_batch(batch, embeddings, target_device=None):
    """Turn a batch of ``(label, text)`` pairs into padded embedding tensors.

    Each text is split on whitespace and every word is looked up in
    ``embeddings``; unknown words become a zero vector. Sequences are
    zero-padded to the longest sequence in the batch.

    The true sequence lengths are recorded before padding. Measuring them on
    the padded tensor reports the padded length for every sample.

    Args:
        batch: Iterable of ``(label, text)`` pairs.
        embeddings (dict): Maps a word to its embedding vector.
        target_device: Device the tensors are moved to. Defaults to the
            module-level ``device``.

    Returns:
        tuple: ``(labels, padded_texts, text_lengths)`` where ``labels`` is an
        int64 tensor of shape ``(batch,)``, ``padded_texts`` a float32 tensor of
        shape ``(batch, longest_sequence, vector_dim)`` and ``text_lengths`` an
        int64 tensor with the unpadded length of every sequence.
    """
    if target_device is None:
        target_device = device

    # Size the zero vector from the embedding table; 50 is the previous
    # hard-coded value, kept as the fallback for an empty table.
    vector_dim = len(next(iter(embeddings.values()))) if embeddings else 50
    unknown_word = np.zeros((vector_dim,), dtype=np.float32)

    label_list, text_list = [], []
    for (label, text) in batch:
        label_list.append(int(label))
        # Missing text (e.g. NaN read from a CSV) is treated as an empty text.
        words = text.split() if isinstance(text, str) else []
        # An empty text is represented by one zero vector: torch.stack cannot
        # stack an empty list, and every sequence needs a length of at least 1.
        vectors = [embeddings.get(word, unknown_word) for word in words] or [unknown_word]
        text_list.append(torch.tensor(np.stack(vectors), dtype=torch.float32))

    # Lengths must be taken before pad_sequence equalises them.
    text_lengths = torch.tensor([len(sequence) for sequence in text_list], dtype=torch.int64)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = rnn_utils.pad_sequence(text_list, batch_first=True)

    return label_list.to(target_device), text_list.to(target_device), text_lengths.to(target_device)


def batch_padding(batch_size, embeddings):
    """Build a DataLoader ``collate_fn`` that always yields full-size batches.

    A batch shorter than ``batch_size`` is topped up with copies of its last
    sample and then passed to ``collate_batch`` with ``embeddings``.
    """
    def collate_fn(batch):
        shortfall = batch_size - len(batch)
        # A non-positive shortfall produces an empty filler list.
        filler = [batch[-1]] * shortfall
        return collate_batch(batch + filler, embeddings)

    return collate_fn

## The Model

In [None]:
class LSTM_Model(nn.Module):
    """Two stacked bidirectional LSTM blocks followed by a three-layer classifier head.

    Args:
        vector_dim (int): Size of each input word vector.
        num_hidden_nodes (int): Scales the LSTM width; each direction has
            ``num_hidden_nodes * 25`` hidden units.
        num_layers (int): Number of recurrent layers inside each LSTM block.
            The argument is now honoured; it was previously ignored in favour
            of a hard-coded 3.
    """

    def __init__(self, vector_dim, num_hidden_nodes, num_layers):
        super().__init__()

        hidden_per_direction = num_hidden_nodes * 25
        # A bidirectional LSTM concatenates both directions on the feature axis.
        lstm_features = hidden_per_direction * 2
        # nn.LSTM applies dropout only between stacked layers, so it is disabled
        # for a single layer.
        lstm_dropout = 0.3 if num_layers > 1 else 0.0

        self.lstm_layer_1 = nn.LSTM(vector_dim, hidden_per_direction, num_layers=num_layers, bidirectional=True, dropout=lstm_dropout, batch_first=True)
        self.lstm_layer_2 = nn.LSTM(lstm_features, hidden_per_direction, num_layers=num_layers, bidirectional=True, dropout=lstm_dropout, batch_first=True)

        self.linear_layer_1 = nn.Linear(lstm_features, 24)
        self.linear_layer_2 = nn.Linear(24, 8)
        self.linear_layer_3 = nn.Linear(8, 1)

        self.relu_activation = nn.ReLU()
        self.linear_dropout = nn.Dropout(0.2)

    def forward(self, text, text_lengths):
        """Return one raw logit per sequence.

        Args:
            text (Tensor): Batch-first input of shape ``(batch, seq, vector_dim)``.
            text_lengths (Tensor): Integer length of every sequence; the LSTM
                output at position ``text_lengths - 1`` is fed to the head.

        Returns:
            Tensor: Logits of shape ``(batch, 1)``.
        """
        lstm_out, _ = self.lstm_layer_1(text)
        lstm_out, _ = self.lstm_layer_2(lstm_out)
        # Build both index tensors on the device of the LSTM output.
        batch_index = torch.arange(lstm_out.shape[0], device=lstm_out.device)
        last_step = text_lengths.to(lstm_out.device) - 1
        lstm_out = lstm_out[batch_index, last_step, :]
        x = self.linear_dropout(self.relu_activation(self.linear_layer_1(lstm_out)))
        x = self.linear_dropout(self.relu_activation(self.linear_layer_2(x)))
        x = self.linear_layer_3(x)
        return x

## Initialise Hyperparameters

In [None]:
# Size of each input word vector; passed to LSTM_Model as vector_dim and
# matches the 50-d GloVe file loaded elsewhere in this notebook.
dimensions = 50
# Passed to LSTM_Model as num_hidden_nodes.
hidden_nodes = 2
# Passed to LSTM_Model as num_layers.
hidden_layers = 2

# NOTE: `device` is assigned in the device-selection cell further down the
# notebook; that cell has to be run before this one.
lstm_ = LSTM_Model(vector_dim=dimensions, num_hidden_nodes=hidden_nodes, num_layers=hidden_layers).to(device)

# Compiled wrapper around lstm_. The training cell passes the plain lstm_ to
# model_env, while the optimizer below is built from lstm.parameters().
lstm = torch.compile(lstm_)

# Binary cross-entropy on raw logits (the model's last layer has no sigmoid).
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(lstm.parameters(), lr=0.01, momentum=0.8)
# Multiplies the learning rate by 0.8 after every 5 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)

# Total number of elements across all parameters.
params = sum(p.numel() for p in lstm.parameters())
print(f"LSTM has {params} parameters.")


## Training and Evaluation

In [None]:
def train(dataloader, model, optimizer, criterion, epoch, verbose, lr_scheduler=None):
    """Run one training epoch and step the learning-rate scheduler once.

    Args:
        dataloader: Sized iterable yielding ``(label, text, text_lengths)`` batches.
        model (nn.Module): Called as ``model(text, text_lengths)``.
        optimizer: Optimizer that updates the model parameters.
        criterion: Loss called as ``criterion(prediction, label.float())``.
        epoch (int): Zero-based epoch index, used only in progress messages.
        verbose (bool): Print the running accuracy roughly four times per epoch.
        lr_scheduler: Scheduler stepped once after the epoch. Defaults to the
            module-level ``scheduler``.

    Returns:
        tuple: ``(epoch_loss, epoch_accuracy, epoch_count)``: the mean loss per
        batch, the number of correct predictions and the number of predictions
        made during the epoch.
    """
    model.train()
    # correct predictions / predictions since the last progress message
    window_correct = 0
    window_count = 0
    # correct predictions / predictions over the whole epoch
    epoch_accuracy = 0
    epoch_count = 0
    # sum of the per-batch losses, averaged before returning
    loss_total = 0.0
    num_batches = 0
    # progress is shown every quarter of an epoch; at least 1 so that short
    # dataloaders do not cause a modulo-by-zero
    intervals = max(1, round(len(dataloader) / 4))
    for idx, (label, text, text_lengths) in enumerate(dataloader):
        # make prediction
        prediction = model(text, text_lengths)
        label = label.unsqueeze(1)
        # compare prediction to label to calculate loss
        loss = criterion(prediction, label.float())
        loss_total += loss.item()
        num_batches += 1
        # update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The model outputs raw logits (the notebook trains with
        # BCEWithLogitsLoss), so a probability of 0.5 is a logit of 0.
        correct = ((prediction > 0).long() == label).sum().item()
        seen = label.size(0)
        # Add only this batch to the epoch totals; adding the running window
        # sums counted earlier batches again.
        window_correct += correct
        window_count += seen
        epoch_accuracy += correct
        epoch_count += seen
        if verbose and idx % intervals == 0 and idx > 0:
            print(f'| Epoch {epoch + 1} | {idx:5} / {len(dataloader):5} batches | {(window_correct/window_count)*100:.7f}% accurate |')
            window_correct = 0
            window_count = 0
    if lr_scheduler is None:
        lr_scheduler = scheduler
    lr_scheduler.step()
    # Mean per-batch loss, so datasets of different sizes can be compared.
    epoch_loss = loss_total / num_batches if num_batches else 0.0
    return epoch_loss, epoch_accuracy, epoch_count

def evaluate(dataloader, model, criterion):
    """Score the model on a dataloader without updating its weights.

    Args:
        dataloader: Iterable yielding ``(label, text, text_length)`` batches.
        model (nn.Module): Called as ``model(text, text_length)``.
        criterion: Loss called as ``criterion(prediction, label.float())``.

    Returns:
        tuple: ``(loss, accuracy, count)``: the mean loss per batch, the number
        of correct predictions and the number of predictions made.
    """
    model.eval()
    correct_total = 0
    sample_total = 0
    loss_total = 0.0
    num_batches = 0
    with torch.no_grad():
        for label, text, text_length in dataloader:
            prediction = model(text, text_length)
            label = label.unsqueeze(1)
            # Accumulate every batch; keeping only the latest value reported
            # the loss of the final batch alone.
            loss_total += criterion(prediction, label.float()).item()
            num_batches += 1
            # The model outputs raw logits (the notebook uses
            # BCEWithLogitsLoss), so a probability of 0.5 is a logit of 0.
            correct_total += ((prediction > 0).long() == label).sum().item()
            sample_total += label.size(0)
    mean_loss = loss_total / num_batches if num_batches else 0.0
    return mean_loss, correct_total, sample_total

def model_env(training, validation, testing, model, optimizer, criterion, epochs, verbose=True):
    """
    Wraps the training and evaluation in one method.
    At the end of each epoch, the model assesses the validation set; after the
    last epoch it is scored once on the test set.

    Args:
        training (DataLoader): DataLoader with training data.
        validation (DataLoader): DataLoader with validation data.
        testing (DataLoader): DataLoader with testing data.
        model (nn.Module): The LSTM model being trained.
        optimizer (torch.optim.sgd): Backpropagation method.
        criterion (torch.nn.modules.loss): Loss function.
        epochs (int): Number of epochs the model is trained for.
        verbose (Boolean): Used to display metrics during training (default=True).

    Returns:
        train_accuracy, train_loss, val_accuracy, val_loss 
            (list, list, list, list): Metrics saved during training, one entry
            per epoch. Accuracies are stored as percentages so that the
            training and validation curves share a scale regardless of how
            many samples each set holds.
    """
    def _percent(correct, total):
        # Guard against an empty dataloader.
        return (correct / total) * 100 if total else 0.0

    # save loss and accuracy values during training and evaluation
    train_accuracy = []
    train_loss = []
    val_accuracy = []
    val_loss = []
    # save start time
    start_time = time.time()
    for epoch in range(epochs):
        epoch_start = time.time()
        print('-' * 57)
        print(f'|\t\t     Start of epoch {epoch + 1}     \t\t |')
        print('-' * 57)
        loss, acc, count = train(training, model, optimizer, criterion, epoch, verbose)
        train_loss.append(loss)
        train_accuracy.append(_percent(acc, count))
        loss, acc, count = evaluate(validation, model, criterion)
        val_ratio = _percent(acc, count)
        val_loss.append(loss)
        val_accuracy.append(val_ratio)
        print('-' * 57)
        print(f'| End of epoch {epoch + 1} | Time: {time.time() - epoch_start:.2f}s | Acc: {val_ratio:10}% |')
        print('-' * 57)
        print()
    loss, acc, count = evaluate(testing, model, criterion)
    test_ratio = _percent(acc, count)
    # The elapsed time is measured from start_time; the previous code read an
    # undefined name here and raised NameError before returning.
    total_minutes = round(time.time() - start_time) / 60
    print(f'*\t\t\tTesting Epoch \t\t   *')
    print('*' * 57)
    print(f'* \tTest accuracy: {test_ratio}%\t\t*')
    print('*' * 57)
    print(f'* \t\tTotal time:     {total_minutes} mins\t*')
    print('*' * 57)
    return train_accuracy, train_loss, val_accuracy, val_loss

In [None]:
"""
Train and evaluate model on GPU if possible ("mps" used for M1 Mac)
"""
# Pick the compute device once. Every branch assigns `device`; with the nested
# checks it stayed undefined when MPS was reported available but not built.
# CUDA is also tried before falling back to the CPU.
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    print("Using MPS")
    device = torch.device("mps")
elif torch.cuda.is_available():
    print("Using CUDA")
    device = torch.device("cuda")
else:
    print("Using CPU")
    device = torch.device("cpu")

## Vocabulary

GloVe (Global Vectors for Word Representation) embeddings are vector representations of words with semantically similar words being projected closer together.

Loads the GloVe embeddings file into a dictionary where the key is the word and the value is the vector.


In [None]:
# Word -> float32 vector lookup, filled from the GloVe text file in which each
# line holds a word followed by the components of its vector.
embedding_dict = {}

with open("Data/GloVe/glove.twitter.27B.50d.txt", 'r', encoding="utf-8") as glove_file:
    for entry in glove_file:
        fields = entry.split()
        # first field is the word, the rest are the vector components
        embedding_dict[fields[0]] = np.asarray(fields[1:], "float32")

In [None]:
def init_dataloaders(folder, batch_size):
    """Create the train/validation/test DataLoaders for one data chunk.

    Reads ``train.csv``, ``val.csv`` and ``test.csv`` from
    ``Data/Split Datasets/<folder>/`` and prints the head of each file.

    Only the ``label`` and ``text`` columns are handed to the loaders, in that
    order, because the collate function unpacks every row as ``(label, text)``.
    Any extra column, such as an index column written by ``to_csv``, would
    break that unpacking. Rows with a missing label or text are dropped.

    Args:
        folder (str): Sub-folder of ``Data/Split Datasets`` to read from.
        batch_size (int): Batch size of every loader.

    Returns:
        tuple: ``(train_loader, val_loader, test_loader)``.
    """
    def load_split(split_name):
        frame = pd.read_csv(f'Data/Split Datasets/{folder}/{split_name}.csv')
        print(frame.head())
        frame = frame[['label', 'text']].dropna(subset=['label', 'text'])
        return frame.to_numpy()

    train = load_split('train')
    val = load_split('val')
    test = load_split('test')

    # One collate function is shared by all three loaders.
    collate_fn = batch_padding(batch_size, embedding_dict)

    train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    return train_loader, val_loader, test_loader
    

In [None]:
# Build the three loaders from the 'data_1' split files, 64 samples per batch.
train_loader, val_loader, test_loader = init_dataloaders('data_1', batch_size=64)

#  removed unnamed column, why is it there?
# NOTE(review): presumably the "Unnamed: 0" column is the DataFrame index that
# to_csv writes by default and read_csv then loads as a regular column; confirm
# against how the split files were saved.


In [None]:
# Number of passes over the training loader.
epochs = 10

# using lstm_ model, non-compiled
# model_env returns the per-epoch training/validation accuracy and loss lists.
t_acc, t_loss, v_acc, v_loss = model_env(train_loader, val_loader, test_loader, lstm_, optimizer, criterion, epochs, verbose=True)

## Visualise Metrics

In [None]:
# Helper for drawing a training curve and a validation curve on one figure

def plot_metrics(training_loss, validation_loss, metric, num_epochs):
    """Plot a per-epoch metric for the training (blue) and validation (red) sets.

    Despite the parameter names, any pair of per-epoch series can be passed;
    ``metric`` supplies the axis label and title, ``num_epochs`` the title text.
    """
    heading = metric.title()
    curves = (
        (training_loss, 'blue', 'Training'),
        (validation_loss, 'red', 'Validation'),
    )
    for series, colour, legend_name in curves:
        plt.plot(series, c=colour, label=legend_name)
    plt.ylabel(heading)
    plt.xlabel('Epoch')
    plt.title(f'{heading} over {num_epochs} epochs')
    plt.legend()
    plt.show()

# Plot the histories returned by model_env. They are bound to t_acc / v_acc /
# t_loss / v_loss in the training cell; the names used here before
# (train_acc, val_acc, train_loss, val_loss) were never defined.
plot_metrics(t_acc, v_acc, 'Accuracy', epochs)
plot_metrics(t_loss, v_loss, 'Loss', epochs)