In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
import seaborn as sns
from torch.utils.data import DataLoader, Dataset
import nltk
from transformers import BertTokenizer

In [9]:
tweets = pd.read_csv("data/cleaned/nohashtag_cleaned_lemmatized_english.csv")
tweets.head()

Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type,lang,mentioned_users,hashtags
0,0,word food crapilicious,not_cyberbullying,en,,katandandre mkr
1,1,white,not_cyberbullying,en,,aussietv MKR theblock ImACelebrityAU today sun...
2,2,@username classy whore red velvet cupcake,not_cyberbullying,en,XochitlSuckkks,
3,3,@username meh p thanks head concern angry dude...,not_cyberbullying,en,Jason_Gio,
4,4,@username isi account pretend kurdish account ...,not_cyberbullying,en,RudhoeEnglish,


In [10]:
tweets = tweets.drop('Unnamed: 0', axis = 1)

In [11]:
tweets.shape

(44660, 5)

In [12]:
tweets.isna().sum()

tweet_text               26
cyberbullying_type        0
lang                      0
mentioned_users       28106
hashtags              38160
dtype: int64

In [13]:
tweets['tweet_text'] = tweets['tweet_text'].fillna('<UNK>')
tweets['tweet_text'] = tweets['tweet_text'].replace('nan', '<UNK>')
tweets['hashtags'] = tweets['hashtags'].fillna('<NO_HASHTAG>')
tweets['mentioned_users'] = tweets['mentioned_users'].fillna('<NO_mentioned_users>')

In [14]:
tweets.isna().sum()

tweet_text            0
cyberbullying_type    0
lang                  0
mentioned_users       0
hashtags              0
dtype: int64

In [15]:
tweets.columns

Index(['tweet_text', 'cyberbullying_type', 'lang', 'mentioned_users',
       'hashtags'],
      dtype='object')

In [16]:
tweets.cyberbullying_type.unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [17]:
tweets['cyberbullying_type'] = tweets['cyberbullying_type'].astype('category')

In [32]:
train_data, test_data = train_test_split(tweets, test_size = 0.2, random_state = 20)
train_data, val_data = train_test_split(train_data, test_size = 0.25, random_state = 20)

In [33]:
label_to_int = {
    'not_cyberbullying': 0,
    'gender': 1,
    'religion': 2,
    'other_cyberbullying': 3,
    'age': 4,
    'ethnicity': 5
}

class TweetDataset(Dataset):
    def __init__(self, data, tokenizer, max_len = 128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
    

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        input_text = self.data.iloc[idx]['tweet_text'] 
        label = label_to_int[self.data.iloc[idx]['cyberbullying_type']]
        try:
            encoding = self.tokenizer.encode_plus(input_text, max_length=self.max_len, padding='max_length', truncation=True, return_tensors='pt')
        except Exception as e:
            print(f"Error at idx={idx}, input_text='{input_text}': {e}")
            raise e
        return{
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label':torch.tensor(label, dtype=torch.long)
        }

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = TweetDataset(train_data, tokenizer)
val_dataset = TweetDataset(val_data, tokenizer)
test_dataset = TweetDataset(test_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [34]:
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, n_layers, bidirectional,batch_size, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.num_directions = 2 if bidirectional else 1
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.n_layers = n_layers
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers = n_layers, dropout = dropout, bidirectional = bidirectional, batch_first = True)
        self.fc = nn.Linear(hidden_dim * self.num_directions, num_classes)
        self.softmax = nn.LogSoftmax(dim = 1)
    
    def forward(self, x):
        self.batch_size = x.size(0)
        embedding = self.embedding(x)
        out, hidden = self.rnn(embedded, hidden)
        out = out[:, -1, :]
        out = self.fc(out)
        out = self.softmax(out)
        
        return out, hidden

    def init_hidden(self, batch_size):
        h0 = torch.zeros((self.n_layers * self.num_directions, batch_size, self.hidden_dim)).detach()
        c0 = torch.zeros((self.n_layers * self.num_directions, batch_size, self.hidden_dim)).detach()
        hidden = (h0, c0)
        return hidden
        

In [35]:
#Hyperparameters:
VOCAB_SIZE = len(tokenizer.vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 256
NUM_CLASSES = len(np.unique(tweets['cyberbullying_type']))
N_LAYERS = 3
BIDIRECTIONAL = True
DROPOUT = 0.5
BATCH_SIZE = 32

In [36]:
model = RNNClassifier(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES, N_LAYERS, BIDIRECTIONAL,BATCH_SIZE, DROPOUT)

optimizer = optim.Adam(model.parameters()) #Using Adam Optimizer
criterion = nn.CrossEntropyLoss() #Using CrossEntropyLoss function


def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_accuracy = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad() #Clears the gradients of all optimized model parameters before calculating the next backward pass
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label'].view(-1)
        
        predictions = model(input_ids)
        
        loss = criterion(predictions, labels) 
        accuracy = (predictions.argmax(1) == labels).float().mean()
        
        loss.backward() #Computes the gradients of the loss with respect to model parameters 
        optimizer.step() #updates the parameters using the optimizer.
        
        epoch_loss += loss.item()
        epoch_accuracy += accuracy.item()
        
        return epoch_loss / len(iterator), epoch_accuracy / len(iterator) #Calculates the average loss and accuracy across all batches in the epoch, and returns them as a tuple.
    

In [37]:
def evaluate(model, iterator,criterion):
    epoch_loss = 0
    epoch_accuracy = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['label']
            
            predictions = model(input_ids)
        
            loss = criterion(predictions, labels)
            accuracy = (predictions.argmax(1) == labels).float().mean()
        
            epoch_loss += loss.item()
            epoch_accuracy += accuracy.item()
        
            return epoch_loss / len(iterator), epoch_accuracy / len(iterator) 

In [28]:
EPOCHS = 1001

for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, criterion)
    if epoch % 100 == 0:
        print(f'Epoch: {epoch}')
        print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

Epoch: 0
Train Loss: 0.001 | Train Acc: 0.01%
Test Loss: 0.003 | Test Acc: 0.03%
Epoch: 100
Train Loss: 0.001 | Train Acc: 0.01%
Test Loss: 0.003 | Test Acc: 0.00%
Epoch: 200
Train Loss: 0.001 | Train Acc: 0.01%
Test Loss: 0.003 | Test Acc: 0.01%
Epoch: 300
Train Loss: 0.001 | Train Acc: 0.01%
Test Loss: 0.003 | Test Acc: 0.09%
Epoch: 400
Train Loss: 0.001 | Train Acc: 0.01%
Test Loss: 0.003 | Test Acc: 0.03%
Error at idx=22324, input_text='radical want know christian right find ok jesus would support would': Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).