In [51]:
import pandas as pd
import numpy as np

import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt


In [35]:
# Read the pre-cleaned tweet corpus into a DataFrame and preview its first rows.
csv_path = 'cyber_bully_cleaned.csv'
data = pd.read_csv(csv_path)
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type
0,0,word katandandre food crapilicious mkr,not_cyberbullying
1,1,aussietv white mkr theblock imacelebrityau tod...,not_cyberbullying
2,2,xochitlsuckkks classy whore red velvet cupcake,not_cyberbullying
3,3,meh thanks head concerned another angry dude t...,not_cyberbullying
4,4,rudhoeenglish isi account pretending kurdish a...,not_cyberbullying


In [42]:
# Vocabulary of the corpus: every distinct whitespace-separated token
# that occurs in the "tweet_text" column.
unique_words = set()
for tweet in data["tweet_text"]:
    unique_words.update(tweet.split())

print(len(unique_words))
        

44005


### Word2Vec

In [7]:
from gensim import downloader as api

# Fetch the pretrained Google News word2vec vectors through gensim's downloader.
word2vec_model = api.load(name='word2vec-google-news-300')

In [43]:
# Keep only the tokens that have a pretrained vector: gather the
# out-of-vocabulary tokens first, then remove them from the set in place.
out_of_vocab = [token for token in unique_words if token not in word2vec_model.key_to_index]
unique_words.difference_update(out_of_vocab)

print(len(unique_words))

22276


In [44]:
# Build the embedding matrix and the word -> row lookup used by the networks.
#   * row 0 is an all-zero vector reserved for the padding token (index 0);
#   * row k (k >= 1) is the pretrained word2vec vector of the word mapped to k.
# The networks receive these row indices as input, so the mapping must be stable:
# the words are visited in sorted order because the iteration order of a set of
# strings can change from one interpreter session to the next (hash randomisation),
# which would silently re-number the vocabulary on every kernel restart.
vocabulary = sorted(word for word in unique_words if word in word2vec_model.key_to_index)
word_indices = {word: idx for idx, word in enumerate(vocabulary, start=1)}

# Allocate the matrix once as float32 instead of growing a Python list of rows.
embedding_matrix = np.zeros((len(vocabulary) + 1, word2vec_model.vector_size), dtype=np.float32)
for word, idx in word_indices.items():
    embedding_matrix[idx] = word2vec_model[word]
print(embedding_matrix.shape)

(22277, 300)


### Models

In [52]:
#LSTM 
class LSTMClf(nn.Module):
    """Binary text classifier: frozen pretrained embeddings -> stacked LSTM -> MLP head.

    Args:
        input_size: Feature size fed to the LSTM; must equal the embedding width.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers (dropout 0.3 between layers).
        num_classes: Accepted for signature compatibility; the head always
            emits a single logit.
        embedding_matrix: Array-like of shape (vocab_size, embedding_dim); row 0
            is the padding vector.
        device: Stored on the instance for callers; the forward pass follows the
            device of its input instead.

    The forward pass returns raw logits of shape (batch, 1) (no sigmoid applied).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, embedding_matrix, device):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Copy into a float32 tensor so the layer never aliases the caller's array.
        weights = torch.as_tensor(embedding_matrix, dtype=torch.float32).clone()
        self.embedding = nn.Embedding.from_pretrained(weights, padding_idx=0, freeze=True)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.3)
        self.fc = nn.Linear(hidden_size, 32)
        self.output = nn.Linear(32, 1)
        self.device = device

    def forward(self, x):
        """Classify a batch of index sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len); index 0 is padding and
                is assumed to appear only after the real tokens.

        Returns:
            Tensor of shape (batch, 1) holding one logit per sequence.
        """
        embedded = self.embedding(x)
        # Omitting the initial state makes nn.LSTM start from zeros on the
        # input's own device, so no explicit h0/c0 tensors are needed.
        out, _ = self.lstm(embedded)

        # Read the hidden state at the last *real* token. Taking out[:, -1, :]
        # would return the state after all trailing padding steps instead.
        lengths = (x != 0).sum(dim=1).clamp(min=1)  # all-padding rows fall back to step 0
        rows = torch.arange(x.size(0), device=x.device)
        last_real = out[rows, lengths - 1]

        hidden = self.fc(last_real).relu()
        return self.output(hidden)
    

class RNNClf(nn.Module):
    """Binary text classifier: frozen pretrained embeddings -> stacked RNN -> MLP head.

    Args:
        input_size: Feature size fed to the RNN; must equal the embedding width.
        hidden_size: Size of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
        num_classes: Accepted for signature compatibility; the head always
            emits a single logit.
        embedding_matrix: Array-like of shape (vocab_size, embedding_dim); row 0
            is the padding vector.
        device: Stored on the instance for callers; the forward pass follows the
            device of its input instead.

    The forward pass returns raw logits of shape (batch, 1) (no sigmoid applied).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, embedding_matrix, device):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Frozen pretrained embeddings, built the same way as in LSTMClf.
        weights = torch.as_tensor(embedding_matrix, dtype=torch.float32).clone()
        self.embedding = nn.Embedding.from_pretrained(weights, padding_idx=0, freeze=True)
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 32)
        self.output = nn.Linear(32, 1)
        self.device = device

    def forward(self, x):
        """Classify a batch of index sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len); index 0 is padding and
                is assumed to appear only after the real tokens.

        Returns:
            Tensor of shape (batch, 1) holding one logit per sequence.
        """
        embedded = self.embedding(x)
        # With no initial state given, nn.RNN starts from zeros on the input's device.
        out, _ = self.rnn(embedded)

        # Use the hidden state at the last real token rather than at the final
        # (usually padded) time step.
        lengths = (x != 0).sum(dim=1).clamp(min=1)  # all-padding rows fall back to step 0
        rows = torch.arange(x.size(0), device=x.device)
        last_real = out[rows, lengths - 1]

        hidden = self.fc(last_real).relu()
        return self.output(hidden)
    
class CNNClf(nn.Module):
    """Binary text classifier: frozen pretrained embeddings -> Conv1d -> 3-layer MLP.

    Args:
        input_size: Number of Conv1d input channels. The embedded batch
            (batch, seq_len, embedding_dim) is passed to the convolution as-is,
            so this must equal the (fixed) sequence length.
        hidden_size: Number of convolution output channels.
        embedding_matrix: Array-like of shape (vocab_size, embedding_dim); row 0
            is the padding vector.

    The forward pass returns raw logits of shape (batch, 1) (no sigmoid applied).
    """

    def __init__(self, input_size, hidden_size, embedding_matrix):
        super().__init__()
        weights = torch.as_tensor(embedding_matrix, dtype=torch.float32).clone()
        self.embedding = nn.Embedding.from_pretrained(weights, padding_idx=0, freeze=True)

        kernel_size = 3
        # NOTE(review): token positions act as channels here and the kernel slides
        # along the embedding axis; a conventional text CNN would transpose first.
        # Left unchanged so the layer layout and the meaning of input_size stay the same.
        self.conv1 = nn.Conv1d(input_size, hidden_size, kernel_size=kernel_size)

        # Length of the convolution output along the embedding axis. Derived from
        # the embedding width (300 - 3 + 1 = 298 for the word2vec vectors) rather
        # than hard-coded, so other embedding sizes work too.
        conv_out_len = weights.shape[1] - kernel_size + 1

        self.fc = nn.Linear(hidden_size * conv_out_len, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        """Classify a batch of index sequences.

        Args:
            x: Integer tensor of shape (batch, input_size).

        Returns:
            Tensor of shape (batch, 1) holding one logit per sequence.
        """
        features = self.conv1(self.embedding(x)).relu()
        features = features.flatten(1)
        features = self.fc(features).relu()
        features = self.fc2(features).relu()
        return self.fc3(features)
    
    
class BiLSTMClf(nn.Module):
    """Binary text classifier: frozen pretrained embeddings -> bidirectional LSTM -> MLP head.

    Args:
        input_size: Feature size fed to the LSTM; must equal the embedding width.
        hidden_size: Size of the hidden state of each direction.
        num_layers: Number of stacked bidirectional LSTM layers.
        num_classes: Accepted for signature compatibility; the head always
            emits a single logit.
        embedding_matrix: Array-like of shape (vocab_size, embedding_dim); row 0
            is the padding vector.
        device: Stored on the instance for callers; the forward pass follows the
            device of its input instead.

    The forward pass returns raw logits of shape (batch, 1) (no sigmoid applied).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, embedding_matrix, device):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        weights = torch.as_tensor(embedding_matrix, dtype=torch.float32).clone()
        self.embedding = nn.Embedding.from_pretrained(weights, padding_idx=0, freeze=True)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 32)
        self.output = nn.Linear(32, 1)
        self.device = device

    def forward(self, x):
        """Classify a batch of index sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len); index 0 is padding and
                is assumed to appear only after the real tokens.

        Returns:
            Tensor of shape (batch, 1) holding one logit per sequence.
        """
        embedded = self.embedding(x)

        # Pack the batch so neither direction ever processes padding. Reading
        # out[:, -1, :] instead would pair a forward state taken after the padding
        # with a backward state that has only seen the final position.
        lengths = (x != 0).sum(dim=1).clamp(min=1).cpu()  # pack_padded_sequence wants CPU lengths
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # h_n is ordered (layer, direction); the last two entries are the final
        # forward and backward states of the top layer.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)

        hidden = self.fc(summary).relu()
        return self.output(hidden)

In [49]:
def padding(sequence, max_len, min_tail_len=5):
    """Split a sequence of indices into fixed-length, zero-padded chunks.

    A sequence of at most ``max_len`` items yields exactly one chunk,
    right-padded with zeros. A longer sequence is cut into consecutive chunks
    of ``max_len`` items; the leftover tail is right-padded and kept when it is
    a full chunk or has more than ``min_tail_len`` items, and dropped otherwise.

    Args:
        sequence: List of integer indices (not modified).
        max_len: Length of every returned chunk; must be at least 1.
        min_tail_len: A partial tail with this many items or fewer is discarded.

    Returns:
        A list of lists, each of length ``max_len``.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        # The chunking loop below could never make progress.
        raise ValueError("max_len must be a positive integer")

    chunks = []
    remaining = list(sequence)
    while len(remaining) > max_len:
        chunks.append(remaining[:max_len])
        remaining = remaining[max_len:]

    # `remaining` now holds the final (at most max_len) items. It has to be
    # appended explicitly: the loop above only ever stores full leading chunks.
    is_only_chunk = not chunks
    is_full_chunk = len(remaining) == max_len
    if is_only_chunk or is_full_chunk or len(remaining) > min_tail_len:
        chunks.append(remaining + [0] * (max_len - len(remaining)))

    return chunks

In [50]:
def prepare_data(df, word_indices, max_len, text_col="tweet_text",
                 label_col="cyberbullying_type", negative_label="not_cyberbullying"):
    """Turn a DataFrame of labelled texts into padded index tensors.

    Each text is tokenised on whitespace (or used as-is when it is already a
    list/tuple of tokens), mapped to embedding indices through ``word_indices``
    (unknown tokens are skipped) and split into fixed-length chunks by
    ``padding``. Every chunk inherits the label of the row it came from.

    Args:
        df: DataFrame containing ``text_col`` and ``label_col``.
        word_indices: Mapping from token to embedding row index.
        max_len: Length of every produced sequence.
        text_col: Name of the column holding the text.
        label_col: Name of the column holding the class label.
        negative_label: Value of ``label_col`` encoded as 0.0; every other
            value is encoded as 1.0.

    Returns:
        ``(X, y)``: ``X`` is an int64 tensor of shape (n_sequences, max_len) and
        ``y`` a float32 tensor of shape (n_sequences,).
    """
    X = []
    y = []
    for text, label in zip(df[text_col], df[label_col]):
        if isinstance(text, str):
            tokens = text.split()
        elif isinstance(text, (list, tuple)):
            tokens = text
        else:
            # Missing text (e.g. NaN) is treated like a row without known words.
            tokens = []

        sequence = [word_indices[token] for token in tokens if token in word_indices]
        sequences = padding(sequence, max_len)
        X.extend(sequences)
        y.extend([0.0 if label == negative_label else 1.0] * len(sequences))

    # The reshape keeps X two-dimensional even when no sequence was produced.
    X = torch.tensor(X, dtype=torch.long).reshape(-1, max_len)
    y = torch.tensor(y, dtype=torch.float32)
    return X, y

In [None]:
#prepare the data
batch_size = 32
max_len = 100
test_rows = 406  # number of rows held out for evaluation

# Hold out `test_rows` rows with a shuffled, stratified split. Slicing the tail
# of the frame would only be representative if the rows were already shuffled;
# the preview of the file shows consecutive rows sharing one label, so the frame
# appears to be grouped by class (TODO confirm against the full file).
train_df, test_df = train_test_split(
    data, test_size=test_rows, stratify=data["cyberbullying_type"], random_state=42
)

X_train, y_train = prepare_data(train_df, word_indices, max_len)

#find number of each class in the training set
num_1_class = int(y_train.sum().item())
num_0_class = len(y_train) - num_1_class
print(num_0_class, num_1_class)

X_test, y_test = prepare_data(test_df, word_indices, max_len)

#convert the data into data loaders
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Evaluation does not need shuffling; a fixed order keeps it repeatable.
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

print(len(train_loader), len(test_loader))

# Sanity-check the shape of one training batch.
x_batch, y_batch = next(iter(train_loader))
print(x_batch.shape, y_batch.shape)

In [None]:
def train(train_data, validation_data, criterion, optimizer, device, model, num_epochs, verbose=False):
    """Train a binary classifier and record per-epoch training/validation loss.

    The model is expected to return one raw logit per sample with shape
    (batch, 1), matching a logit-based criterion such as ``BCEWithLogitsLoss``.

    Args:
        train_data: Iterable of ``(x, y)`` batches used for optimisation.
        validation_data: Iterable of ``(x, y)`` batches used for the validation
            loss; may be empty or ``None``.
        criterion: Loss called as ``criterion(output, y)`` with ``y`` of shape (batch, 1).
        optimizer: Optimizer already bound to the model's parameters.
        device: Device the batches are moved to.
        model: The network to train (modified in place).
        num_epochs: Number of passes over ``train_data``.
        verbose: When True, print loss and training accuracy after every epoch.

    Returns:
        ``(train_losses, val_losses)``: two lists with one mean per-batch loss
        per epoch. An epoch without batches contributes ``nan``.
    """
    train_losses = []
    val_losses = []
    val_batches_source = validation_data if validation_data is not None else ()

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        n_train_batches = 0
        train_corr = 0
        total_train = 0

        for x, y in train_data:
            # Send data to the correct device; y becomes (batch, 1) to match the logits
            x = x.to(device)
            y = y.to(device).float().unsqueeze(1)

            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            n_train_batches += 1

            # The output is a logit, so the 0.5 cut-off applies to its sigmoid.
            predicted = (torch.sigmoid(output.detach()) > 0.5).float()
            train_corr += (predicted == y).sum().item()
            total_train += y.size(0)

        train_losses.append(running_loss / n_train_batches if n_train_batches else float("nan"))

        # Validation phase
        model.eval()
        val_loss = 0.0
        n_val_batches = 0
        with torch.no_grad():
            for x, y in val_batches_source:
                x = x.to(device)
                y = y.to(device).float().unsqueeze(1)
                val_loss += criterion(model(x), y).item()
                n_val_batches += 1
        val_losses.append(val_loss / n_val_batches if n_val_batches else float("nan"))

        # Display metrics
        if verbose:
            train_accuracy = 100 * train_corr / total_train if total_train else float("nan")
            print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_losses[-1]:.4f}, "
                  f"Train Accuracy: {train_accuracy:.2f}%, Validation Loss: {val_losses[-1]:.4f}")

    return train_losses, val_losses


def test(test_data, model, device):
    model.eval()
    y_pred = []
    y_true = []
    
    with torch.no_grad():
        for x, y in test_data:
            x, y = x.to(device), y.to(device).float()
            output = model(x).squeeze(1)
            
            # Collect predictions and true labels as lists of integers
            y_pred.extend((output > 0.5).int().cpu().tolist())
            y_true.extend(y.int().cpu().tolist())

    # Ensure that y_true is the first argument in the metric functions
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"Test Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    return accuracy, precision, recall, f1

In [None]:
#hyperparameters
SEED = 42             # fixed seed so weight initialisation and batch shuffling are repeatable
input_size = 300      # width of the word-embedding vectors (embedding_matrix has 300 columns)
hidden_size = 16
num_layers = 2
num_classes = 1       # a single output logit: binary classification
learning_rate = 0.01
num_epochs = 20

torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Operates on raw logits; the models therefore end without a sigmoid.
criterion = torch.nn.BCEWithLogitsLoss()

In [None]:
lstm_clf = LSTMClf(input_size, hidden_size, num_layers, num_classes, embedding_matrix, device).to(device)
rnn_clf = RNNClf(input_size, hidden_size, num_layers, num_classes, embedding_matrix, device).to(device)
# CNNClf uses the sequence length as its number of input channels.
cnn_clf = CNNClf(max_len, hidden_size, embedding_matrix).to(device)
bi_lstm_clf = BiLSTMClf(input_size, hidden_size, num_layers, num_classes, embedding_matrix, device).to(device)

# Train and evaluate every architecture with its own Adam optimizer.
models = [lstm_clf, cnn_clf, rnn_clf, bi_lstm_clf]
dl_results = {}
for model in models:
    model_name = model.__class__.__name__
    # Only parameters that still require gradients are optimised.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(trainable_params, lr=learning_rate)
    print(f"Training {model_name}")
    # NOTE: the test loader doubles as the validation set here, so the
    # "Validation Loss" curve is measured on the same data as the final metrics.
    train_losses, val_losses = train(train_loader, test_loader, criterion, optimizer, device, model, num_epochs)
    accuracy, precision, recall, f1 = test(test_loader, model, device)
    dl_results[model_name] = {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
    print("\n\n")

    #plot the training and validation losses, titled so the four figures can be told apart
    fig, ax = plt.subplots()
    ax.plot(train_losses, label="Training Loss")
    ax.plot(val_losses, label="Validation Loss")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.set_title(model_name)
    ax.legend()

    plt.show()