In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
import seaborn as sns
from torch.utils.data import DataLoader, Dataset
import nltk
from transformers import BertTokenizer

In [3]:
tweets = pd.read_csv("data/cleaned/nohashtag_cleaned_lemmatized_english.csv")
tweets.head()

Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type,lang,mentioned_users,hashtags
0,0,word food crapilicious,not_cyberbullying,en,,katandandre mkr
1,1,white,not_cyberbullying,en,,aussietv MKR theblock ImACelebrityAU today sun...
2,2,@username classy whore red velvet cupcake,not_cyberbullying,en,XochitlSuckkks,
3,3,@username meh p thanks head concern angry dude...,not_cyberbullying,en,Jason_Gio,
4,4,@username isi account pretend kurdish account ...,not_cyberbullying,en,RudhoeEnglish,


In [4]:
tweets = tweets.drop('Unnamed: 0', axis = 1)

In [6]:
tweets.shape

(44660, 5)

In [7]:
tweets.columns

Index(['tweet_text', 'cyberbullying_type', 'lang', 'mentioned_users',
       'hashtags'],
      dtype='object')

In [8]:
tweets.cyberbullying_type.unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [9]:
tweets['cyberbullying_type'] = tweets['cyberbullying_type'].astype('category')

In [10]:
train_data, test_data = train_test_split(tweets, test_size = 0.2, random_state = 20)
train_data, val_data = train_test_split(train_data, test_size = 0.25, random_state = 20)

In [33]:
class TweetDataset(Dataset):
    def __init__(self, data, tokenizer, max_len = 128):
        self.data = data
        self.tokenizer = tokenizer
        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(data['cyberbullying_type'])
        self.max_len = max_len
    

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        input_text = self.data.iloc[idx]['tweet_text']
        encoding = self.tokenizer.encode_plus(input_text, max_length = self.max_len, padding = 'max_length', truncation = True, return_tensors = 'pt')
        return{
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label':torch.tensor(self.labels, dtype=torch.long)
        }

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = TweetDataset(train_data, tokenizer)
val_dataset = TweetDataset(val_data, tokenizer)
test_dataset = TweetDataset(test_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [45]:
class RNNClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers = n_layers, bidirectional=bidirectional, dropout = dropout, batch_first = True)
        if bidirectional:
            input_dim = hidden_dim * 2
        else:
            input_dim = hidden_dim
        self.fc = nn.Linear(input_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        output, hidden = self.rnn(embedded)
        
        if self.rnn.bidirectional:
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]
        
        return self.fc(hidden)
        

In [46]:
#Hyperparameters:
INPUT_DIM = len(tokenizer.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = len(np.unique(tweets['cyberbullying_type']))
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

In [62]:
model = RNNClassifier(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()


def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_accuracy = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label'].view(-1)
        
        predictions = model(input_ids)
        
        loss = criterion(predictions, labels)
        accuracy = (predictions.argmax(1) == labels).float().mean()
        
        loss.backward()
        opimizer.step()
        
        epoch_loss += loss.item()
        epoch_accuracy += accuracy.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    

In [63]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_accuracy = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['label']
            
            predictions = model(input_ids, attention_mask)
        
            loss = criterion(prediction, labels)
            accuracy = (prediction.argmax(1) == labels).float().mean()
        
            epoch_loss += loss.item()
            epoch_accuracy += accuracy.item()
        
            return epoch_loss / len(iterator), epoch_acc / len(iterator) 

In [64]:
EPOCHS = 10

for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, optimizer, criterion)

ValueError: Expected input batch_size (16) to match target batch_size (428736).

In [None]:
print