In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
import seaborn as sns
from torch.utils.data import DataLoader, Dataset
import nltk
from nltk.tokenize import word_tokenize

In [2]:
tweets = pd.read_csv("data/cleaned/cleaned_lemmatized_english.csv")
tweets.head()

Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type,lang,mentioned_users,hashtags
0,0,word #hashtag food crapilicious #hashtag,not_cyberbullying,en,,katandandre mkr
1,1,#hashtag white #hashtag #hashtag #hashtag #has...,not_cyberbullying,en,,aussietv MKR theblock ImACelebrityAU today sun...
2,2,@username classy whore red velvet cupcake,not_cyberbullying,en,XochitlSuckkks,
3,3,@username meh p thanks head concern angry dude...,not_cyberbullying,en,Jason_Gio,
4,4,@username isi account pretend kurdish account ...,not_cyberbullying,en,RudhoeEnglish,


In [4]:

tweets = tweets.drop('Unnamed: 0', axis = 1)

In [9]:
tweets.columns

Index(['tweet_text', 'cyberbullying_type', 'lang', 'mentioned_users',
       'hashtags'],
      dtype='object')

In [5]:
tweets.cyberbullying_type.unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [6]:
tweets['cyberbullying_type'] = tweets['cyberbullying_type'].astype('category')

In [7]:
train_data, test_data = train_test_split(tweets, test_size = 0.2, random_state = 20)
train_data, val_data = train_test_split(train_data, test_size = 0.25, random_state = 20)

In [24]:
def tokenize_and_pad(text, max_length=100):
    text = str(text)
    tokens = word_tokenize(text)
    tokens = tokens[:max_length]
    padded = tokens + ['<pad>'] * (max_length - len(tokens))
    return padded

In [31]:
class TweetDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = word_tokenize
        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(data['cyberbullying_type'])
        self.texts = [tokenize_and_pad(text) for text in data['tweet_text']]
        
        self.word_to_idx = {'<pad>': 0}
        idx = 1
        for text in self.texts:
            for word in text:
                if word not in self.word_to_idx:
                    self.word_to_idx[word] = idx
                    idx += 1

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        input_text = self.texts[idx]
        input_indices = [self.word_to_idx[word] for word in input_text]
        return torch.tensor(input_indices, dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)


train_dataset = TweetDataset(train_data)
val_dataset = TweetDataset(val_data)
test_dataset = TweetDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [32]:
# Create the RNN model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1, dropout=0.5):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        output = self.fc(output[:, -1, :])
        return output

hidden_size = 128
vocab_size = 10000
output_size = len(train_dataset.label_encoder.classes_)
model = RNNModel(vocab_size, hidden_size, output_size)


def train(model, train_loader, val_loader, num_epochs):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    losses = []
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer

def evaluate(model, test_loader):
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    running_loss = 0.0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)

    accuracy = correct_predictions / total_predictions
    avg_loss = running_loss / len(test_loader)

    return accuracy, avg_loss

accuracy, loss = evaluate(model, test_loader)
print(f"Test Accuracy: {accuracy:.2f}, Test Loss: {loss:.4f}")


sns.lineplot(x=range(1, len(losses) + 1), y=losses)



IndexError: index out of range in self