In [2]:
import pandas as pd
import numpy as np 
import torch
import torch.nn as nn
import torch.nn.functional as F
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from sklearn.model_selection import train_test_split
from datetime import datetime
from torch.utils.data import DataLoader
from copy import deepcopy


  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'spacy'

In [299]:
class vocab_builder:
    """Build word<->index lookup tables and frequency counts from tokenized tweets.

    Attributes:
        longest: token count of the longest tweet seen.
        word_idx: surface form (``str(token)``) -> integer index.
        idx_word: integer index -> surface form (always a plain ``str``).
        tracker: ``token.norm`` -> number of occurrences across all tweets.

    The special entries '<PAD>' and '<UNK>' are appended after every corpus
    word, so they occupy the two highest indices.
    """

    def __init__(self, tokenized_df):
        """Scan ``tokenized_df['tweet']`` and populate the lookup tables.

        Args:
            tokenized_df: DataFrame whose 'tweet' column holds iterables of
                tokens; each token must expose ``.norm`` and support ``str()``.
        """
        self.longest = 0
        self.idx_word = {}
        self.word_idx = {}
        self.tracker = {}
        tokenized_df['tweet'].apply(vocab_builder.vocab_gen, args=(self,))
        for special in ('<PAD>', '<UNK>'):
            index = len(self.word_idx)
            self.word_idx[special] = index
            self.idx_word[index] = special

    def vocab_gen(sentence, self):
        """Fold one tokenized tweet into the vocabulary held by ``self``.

        The parameter order is (sentence, self) on purpose: the function is
        invoked through ``Series.apply(vocab_builder.vocab_gen, args=(self,))``,
        which passes the cell value first and the builder second.

        Args:
            sentence: iterable of tokens for a single tweet.
            self: the ``vocab_builder`` instance being populated.
        """
        count = 0
        for word in sentence:
            count += 1
            # Frequencies are tracked per normalised form ...
            self.tracker[word.norm] = self.tracker.get(word.norm, 0) + 1
            # ... but indices are looked up per surface form, so every distinct
            # surface form needs its own entry. Registering only the first
            # surface form seen for a norm would leave later variants without
            # an index. Plain strings are stored (not token objects) so both
            # tables stay in sync and hold one consistent type.
            text = str(word)
            if text not in self.word_idx:
                index = len(self.word_idx)
                self.word_idx[text] = index
                self.idx_word[index] = text
        if self.longest < count:
            self.longest = count
        return

In [300]:
class Data_Creator:
    """Load the tweet CSV, clean the text and tokenize it with spaCy.

    Attributes:
        tokenized_df: the cleaned DataFrame whose 'tweet' column holds the
            objects returned by the spaCy pipeline.
    """

    def __init__(self, filepath):
        """Read ``filepath`` with pandas, then clean and tokenize the 'tweet' column."""
        data = pd.read_csv(filepath)
        data = self.clean(data)
        self.tokenized_df = self.tokenize(data)

    def tokenize(self, df):
        """Return a copy of ``df`` whose 'tweet' column is run through spaCy.

        Loads the "en_core_web_sm" pipeline with the parser disabled and applies
        it to each stripped tweet.
        """
        nlp = spacy.load("en_core_web_sm", disable=["parser"])
        tweet = df['tweet'].apply(lambda x: nlp(x.strip()))
        tokenized = df.assign(tweet=tweet)
        return tokenized

    def clean(self, data):
        """Lower-case the 'tweet' column and apply regex normalisation rules.

        Args:
            data: DataFrame with a 'tweet' column of strings.

        Returns:
            A new DataFrame with the cleaned 'tweet' column and without the
            "Unnamed: 0" column (if that column is present).
        """
        # Raw strings keep the regex escapes intact without relying on Python
        # passing unknown escape sequences through unchanged.
        repl = {
            r'@\w*': ' ',                 # user mentions
            '&amp;': 'and',
            r'\su\s': ' you ',
            r'&#\w*;': ' ',               # HTML numeric entities
            '#': ' ',
            # Keep the surrounding spaces, otherwise the neighbouring words are
            # glued together ("type 2 dia" -> "typetwodia").
            r'\s2\s': ' two ',
            'bihday': "birthday",
            "ð[^ ]*": ' ',                # mis-decoded byte sequences
            "â[^ ]*": ' ',
            "(dont)|(don't)": 'do not',
            "(cant)|(can't)": "can not",
            "(yous)|(you's)": "you is",
            "(youve)|(you've)": "you have",
            "(doesnt)|(doesn't)": 'does not',
            "(wont)|(won't)": 'will not',
            r"[0-9]+\.*[0-9%]+\w*": "<NUMBER>",
            r'\n\.': ' ',
            r'\n': ' ',
            r"\.{2,}": '.',
            "!{2,}": '!',
            r"\?{2,}": '?',
            'ing[^a-z]': ' ',             # strips "ing" when followed by a non-letter
            'ed[^a-z]': ' ',              # strips "ed" when followed by a non-letter
            '_': " ",
            ' +': ' ',                    # collapse runs of spaces
        }

        cleaned_tweet = data['tweet'].str.lower()
        cleaned_tweet = cleaned_tweet.replace(repl, regex=True)
        cleaned = data.assign(tweet=cleaned_tweet)
        # errors="ignore": a CSV written without the saved index column should
        # not make cleaning fail.
        return cleaned.drop(columns="Unnamed: 0", errors="ignore")

In [301]:
class data_processor:
    """Pad/truncate tokenized tweets, map them to vocabulary ids and split the data.

    Attributes:
        vocab: vocabulary object exposing ``tracker`` and ``word_idx`` dicts.
        most: fixed number of tokens every tweet is normalised to.
        threshold: minimum ``tracker`` count a token needs to keep its own id.
        train, validate, test: the three DataFrame splits; each carries the
            normalised 'tweet' column and a 'numerized_tweet' column.
    """

    def __init__(self, tokenized_df, vocab, test_size, train_size, threshold, most):
        """Normalise, numerize and split ``tokenized_df``.

        ``test_size`` is forwarded to the first ``train_test_split`` call; the
        remainder is then split again with ``train_size`` into train/validate.
        """
        self.vocab = vocab
        self.most = most
        self.threshold = threshold
        normalized = tokenized_df['tweet'].apply(data_processor.sentence_normalizer, args=(self,))
        normalized_df = tokenized_df.assign(tweet=normalized)
        numerized = normalized_df['tweet'].apply(data_processor.numerizer, args=(self,))
        numerized_df = normalized_df.assign(numerized_tweet=numerized)
        train_valid, self.test = train_test_split(numerized_df, test_size=test_size)
        self.train, self.validate = train_test_split(train_valid, train_size=train_size)

    def sentence_normalizer(sentence, self):
        """Return the first ``self.most`` tokens of ``sentence``, right-padded with '<PAD>'.

        The parameter order is (sentence, self) because the function is called
        through ``Series.apply(..., args=(self,))``.
        """
        limit = max(self.most, 0)
        # zip against range() stops after `limit` tokens without materialising
        # the whole sentence, and yields nothing when limit == 0.
        final_tok = [token for _, token in zip(range(limit), sentence)]
        final_tok.extend(['<PAD>'] * (limit - len(final_tok)))
        return final_tok

    def numerizer(x, self):
        """Map a normalised token list to a list of vocabulary ids.

        * plain-string entries (the '<PAD>' placeholders) -> id of '<PAD>'
        * tokens whose ``norm`` count reaches ``self.threshold`` -> their own id
        * everything else, including tokens missing from ``word_idx`` -> '<UNK>'
        """
        word_idx = self.vocab.word_idx
        tracker = self.vocab.tracker
        pad_id = word_idx['<PAD>']
        unk_id = word_idx['<UNK>']

        base = []
        for token in x:
            if isinstance(token, str):
                # Padding is detected explicitly instead of through a bare
                # `except`, which also hid lookup failures for real words.
                base.append(pad_id)
                continue
            seen = tracker.get(token.norm)
            if seen is not None and seen >= self.threshold:
                # A frequent norm whose surface form has no index is unknown,
                # not padding.
                base.append(word_idx.get(str(token), unk_id))
            else:
                base.append(unk_id)

        return base

In [302]:
class Dataset:
    """Tensor-backed, index-addressable view of a numerized tweet frame.

    ``features`` stacks the 'numerized_tweet' column into one tensor and
    ``targets`` holds the 'Toxicity' column as a float column vector.
    """

    def __init__(self, df):
        token_ids = np.stack(df['numerized_tweet'])
        labels = np.asarray(df['Toxicity'])
        self.features = torch.tensor(token_ids)
        # unsqueeze(1) turns the flat label vector into shape (N, 1)
        self.targets = torch.tensor(labels).unsqueeze(1).float()

    def __len__(self):
        """Number of rows (size of the first feature dimension)."""
        return self.features.size(0)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.features[idx], self.targets[idx]

In [303]:
class NLP_LSTM(nn.Module):
    """Embedding -> (bi)LSTM -> two linear layers -> sigmoid binary classifier.

    Args:
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size (also the width of the first linear layer).
        vocab_size: number of rows in the embedding table.
        layers: number of stacked LSTM layers.
        dropout: dropout probability used inside the LSTM and before the
            output layer.
        bi: truthy for a bidirectional LSTM.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, layers, dropout, bi):

        super(NLP_LSTM, self).__init__()

        ## Params
        self.hdim = hidden_dim
        self.layers = layers
        self.drop = dropout
        # Number of directions: 2 when bidirectional, otherwise 1.
        self.bi = 2 if bi else 1

        ## layers
        self.embedder = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=layers, batch_first=True, dropout=dropout, bidirectional=bi)
        self.fc1 = nn.Linear(hidden_dim * self.bi, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()
        self.drop_layer = nn.Dropout(p=dropout)

    def forward(self, inputs):
        """Return one probability per input row.

        Args:
            inputs: integer tensor of token ids, indexed as (batch, sequence).

        Returns:
            Tensor of shape (batch, 1) with values in [0, 1].
        """
        embeddings = self.embedder(inputs)
        # No explicit (h_0, c_0): nn.LSTM then starts from zero states on the
        # input's own device. This is the same start the hand-built zero
        # tensors gave, without depending on a notebook-level `layers`
        # variable or on CUDA being present.
        outputs, _ = self.lstm(embeddings)
        # NOTE(review): only the last time step is used; with right-padded
        # input that step is padding — confirm this is intended.
        linear_layer1 = self.relu(self.fc1(outputs[:, -1, :]))
        dropper = self.drop_layer(linear_layer1)
        linear_layer2 = self.fc2(dropper)
        prediction = torch.sigmoid(linear_layer2)
        return prediction
        


In [304]:
class Trainer:
    """Build the data pipeline and model, then train, validate and test it.

    Attributes:
        gpu_avail: result of ``torch.cuda.is_available()``.
        device: torch device every batch, the model and the loss are moved to.
        vocabulary: the ``vocab_builder`` created from the tokenized data.
        train_loader, vlad_loader, test_loader: DataLoaders for the three splits.
        last_epoch: maximum number of epochs.
        early: early-stopping patience used by ``run``.
        bs: batch size.
        model, loss, optimizer, lr_scheduler: the training components.
        best: state dict of the model with the lowest validation loss so far.
        min_loss: lowest validation loss seen so far.
        train_loss, vlad_loss, test_loss: lists of per-epoch mean batch losses.
    """

    def __init__(self, filepath, batch_size, emded_dim, hidden_dim, learning_rate, epochs, gam, train_size, test_size, early, layers, thresh, most, dropout, bi):
        """Load and process the data at ``filepath`` and create the model.

        ``thresh`` and ``most`` are forwarded to ``data_processor`` and ``gam``
        is the decay factor of the exponential learning-rate schedule.
        """
        self.gpu_avail = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.gpu_avail else "cpu")

        loaded_Data = Data_Creator(filepath)
        self.vocabulary = vocab_builder(loaded_Data.tokenized_df)
        processed_data = data_processor(loaded_Data.tokenized_df, self.vocabulary, test_size, train_size, thresh, most)

        ## Data
        train_data = processed_data.train
        vlad_data = processed_data.validate
        test_data = processed_data.test

        ## Data Loaders
        self.train_loader = DataLoader(Dataset(train_data), batch_size=batch_size, shuffle=True)
        # Evaluation loaders are not shuffled: the reported loss is a mean of
        # per-batch means, so a fixed batch composition keeps it repeatable.
        self.vlad_loader = DataLoader(Dataset(vlad_data), batch_size=batch_size, shuffle=False)
        self.test_loader = DataLoader(Dataset(test_data), batch_size=batch_size, shuffle=False)

        ## Params
        self.last_epoch = epochs
        self.early = early
        self.bs = batch_size

        ## model
        # A single code path for CPU and GPU, so the model always receives all
        # six constructor arguments and `self.best` always exists.
        self.loss = nn.BCELoss().to(self.device)
        self.model = NLP_LSTM(emded_dim, hidden_dim, len(self.vocabulary.word_idx), layers, dropout, bi).to(self.device).float()
        self.best = deepcopy(self.model.state_dict())

        ## Optimizer
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        self.lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=gam)

        ## Loss Tracking
        self.min_loss = float('inf')
        self.train_loss = []
        self.vlad_loss = []
        self.test_loss = []

    def run(self):
        """Train for up to ``last_epoch`` epochs with early stopping, then test.

        After every epoch the validation loss is compared with the best so
        far; an improvement snapshots the weights and resets the counter. The
        loop stops once the counter exceeds ``early``. The best snapshot is
        reloaded before the test pass.
        """
        print("Training Commencing")
        # Initialised up front so a first epoch that does not improve on
        # `min_loss` (e.g. a NaN loss) cannot hit an unbound variable.
        count = 0
        for epoch in range(0, self.last_epoch):
            print(F'epoch: {epoch+1}')
            print('Training\n')
            self.model.train(True)
            self.train()
            print('Testing\n')
            self.model.train(False)
            self.validation()
            if self.vlad_loss[-1] < self.min_loss:
                count = 0
                self.min_loss = self.vlad_loss[-1]
                self.best = deepcopy(self.model.state_dict())
            if count > self.early:
                print('Early Stopping\n')
                break
            count += 1
        self.model.load_state_dict(self.best)
        self.model.train(False)
        self.test()

    def _run_epoch(self, loader, training):
        """Run one pass over ``loader``; return ``(mean batch loss, accuracy)``.

        When ``training`` is true the optimizer is stepped after every batch;
        otherwise gradient tracking is switched off for the whole pass.
        """
        batch_losses = []
        correct = 0
        total = 0

        with torch.set_grad_enabled(training):
            for text, tag in loader:
                text = text.to(self.device)
                tag = tag.to(self.device)
                if training:
                    self.optimizer.zero_grad()
                outputs = self.model(text)
                loss = self.loss(outputs, tag)
                if training:
                    loss.backward()
                    self.optimizer.step()
                batch_losses.append(loss.item())
                # view(-1) instead of squeeze(): squeeze() collapses a batch of
                # one sample to a 0-d tensor, which has no size(0).
                predicted = torch.round(outputs.detach()).view(-1)
                correct += (predicted == tag.view(-1)).sum().item()
                total += tag.size(0)

        avg = float(np.mean(batch_losses)) if batch_losses else float('nan')
        acc = correct / total if total else float('nan')
        return avg, acc

    def train(self):
        """Run one optimisation epoch, step the LR schedule and log the metrics."""
        avg, acc = self._run_epoch(self.train_loader, True)
        self.lr_scheduler.step()
        self.train_loss.append(avg)
        print(F"train loss: {avg}")
        print(F"train accuracy: {acc}")

    def validation(self):
        """Evaluate on the validation loader and record the mean loss."""
        avg, acc = self._run_epoch(self.vlad_loader, False)
        self.vlad_loss.append(avg)
        print(F"validation loss: {avg}")
        print(F"validation accuracy: {acc}")

    def test(self):
        """Evaluate on the test loader and record the mean loss."""
        avg, acc = self._run_epoch(self.test_loader, False)
        self.test_loss.append(avg)
        print(F"test_loss: {avg}")
        print(F"test accuracy: {acc}")

        

In [305]:
# NOTE(review): a later cell reads "../data/FinalBalancedDataset.csv" — confirm
# which relative path is correct for the notebook's working directory.
path = "./data/FinalBalancedDataset.csv"

## Reproducibility
# Seed the global NumPy and PyTorch generators before the Trainer is built.
# Without a seed the split, the weight initialisation and the batch order
# change from run to run.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

## HyperParms
bs = 50          # batch size
embed = 200      # embedding dimension
hdim = 100       # LSTM hidden dimension
lr = 0.00005     # Adam learning rate
epochs = 20      # maximum number of epochs
gam = 0.96       # gamma of the exponential LR schedule
tr = 0.9         # train_size for the train/validation split of the non-test data
ts = 0.1         # test_size for the initial split
early = 3        # early-stopping patience
layers = 2       # stacked LSTM layers
thresh = 12      # minimum token count to keep its own id (else <UNK>)
most = 40        # tokens per tweet after truncation/padding
dropout = 0.5    # dropout probability
bi = True        # bidirectional LSTM

h = Trainer(path, bs, embed, hdim, lr, epochs, gam, tr, ts, early, layers, thresh, most, dropout, bi)

In [None]:
# Train with per-epoch validation and early stopping, reload the best weights,
# then report loss and accuracy on the test split.
h.run()

In [5]:
# Load the raw CSV into a DataFrame for ad-hoc inspection.
# NOTE(review): this reads "../data/..." while the training cell uses
# "./data/..." — confirm which relative path is correct.
df =pd.read_csv("../data/FinalBalancedDataset.csv")

In [6]:
# Display only the rows whose 'Toxicity' column equals 1.
df[df['Toxicity']==1]

Unnamed: 0.1,Unnamed: 0,Toxicity,tweet
13,13,1,@user #cnn calls #michigan middle school 'buil...
14,14,1,no comment! in #australia #opkillingbay #se...
17,17,1,retweet if you agree!
23,23,1,@user @user lumpy says i am a . prove it lumpy.
34,34,1,it's unbelievable that in the 21st century we'...
...,...,...,...
56739,56739,1,you're such a retard i hope you get type 2 dia...
56740,56740,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
56741,56741,1,"you've gone and broke the wrong heart baby, an..."
56742,56742,1,young buck wanna eat!!.. dat nigguh like I ain...
