In [2]:
import pandas as pd
import numpy as np 
import torch
import torch.nn as nn
import torch.nn.functional as F
import spacy
from sklearn.model_selection import train_test_split
from datetime import datetime
from torch.utils.data import DataLoader
from copy import deepcopy
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
class vocab_builder:
    """Build word<->index lookup tables and norm-frequency counts from tokenized tweets.

    Attributes:
        longest:  number of tokens in the longest tweet seen.
        word_idx: surface string of a token -> integer id.
        idx_word: integer id -> surface string (inverse of ``word_idx``).
        tracker:  token ``.norm`` value -> number of occurrences over all tweets.

    After every tweet has been scanned, the special entries '<PAD>' and '<UNK>'
    are appended to both lookup tables.
    """

    def __init__(self, tokenized_df):
        """Scan the 'tweet' column of ``tokenized_df`` (one token sequence per row)."""
        self.longest = 0
        self.idx_word = {}
        self.word_idx = {}
        self.tracker = {}
        tokenized_df['tweet'].apply(vocab_builder.vocab_gen, args=(self,))
        for special in ('<PAD>', '<UNK>'):
            self._register(special)

    def _register(self, text):
        """Give ``text`` the next free id in both lookup tables, unless it already has one."""
        if text not in self.word_idx:
            idx = len(self.word_idx)
            self.word_idx[text] = idx
            self.idx_word[idx] = text

    def vocab_gen(sentence, self):
        """Update counts and lookup tables from one tokenized sentence.

        The builder instance is the SECOND argument because this function is
        invoked through ``Series.apply(vocab_builder.vocab_gen, args=(self,))``.
        """
        count = 0
        for word in sentence:
            count += 1
            # Frequencies are tracked per norm ...
            self.tracker[word.norm] = self.tracker.get(word.norm, 0) + 1
            # ... but ids are looked up by surface string, so every distinct
            # surface form needs its own entry. Previously only the first form
            # seen for a norm was registered, and later forms were missing.
            self._register(str(word))
        if self.longest < count:
            self.longest = count
        return

In [11]:
class Data_Creator:
    """Load a tweet CSV, normalise the raw text and tokenize it with spaCy.

    Attributes:
        tokenized_df: the cleaned frame whose 'tweet' column holds the spaCy
            result of tokenizing each cleaned tweet.
    """

    def __init__(self, filepath):
        """Read ``filepath`` with pandas, clean the tweets, then tokenize them."""
        data = pd.read_csv(filepath)
        data = self.clean(data)
        self.tokenized_df = self.tokenize(data)

    def tokenize(self, df):
        """Return a copy of ``df`` with every 'tweet' string replaced by ``nlp(tweet.strip())``."""
        nlp = spacy.load("en_core_web_sm")
        tweet = df['tweet'].apply(lambda x: nlp(x.strip()))
        tokenized = df.assign(tweet=tweet)
        return tokenized

    def clean(self, data):
        """Lower-case the 'tweet' column and apply the regex substitutions below.

        Returns a new frame with the cleaned tweets and without the
        'Unnamed: 0' column, if that column is present.
        """
        # Patterns are raw strings so the regex escapes (\w, \s, \.) are not
        # treated as invalid Python string escapes.
        repl = {
            r'@\w*': ' ',                       # @mentions
            '&amp;': 'and',
            r'\su\s': ' you ',
            r'&#\w*;': ' ',                     # HTML character references
            '#': ' ',
            r'\s2\s': ' two ',                  # keep the surrounding spaces so neighbours are not glued together
            'bihday': "birthday",
            "ð[^ ]*": ' ',                      # presumably mis-decoded emoji bytes - verify against the data
            "â[^ ]*": ' ',
            "(dont)|(don't)": 'do not',
            "(cant)|(can't)": "can not",
            "(yous)|(you's)": "you is",
            "(youve)|(you've)": "you have",
            "(doesnt)|(doesn't)": 'does not',
            "(wont)|(won't)": 'will not',
            r"[0-9]+\.*[0-9%]+\w*": "NUMBER",
            r'\n\.': ' ',
            r'\n': ' ',
            r"\.{2,}": '.',
            "!{2,}": '!',
            r"\?{2,}": '?',
            'ing[^a-z]': ' ',                   # crude suffix stripping
            'ed[^a-z]': ' ',
            '_': " ",
            ' +': ' ',                          # collapse runs of spaces
        }

        cleaned_tweet = data['tweet'].str.lower()
        cleaned_tweet = cleaned_tweet.replace(repl, regex=True)
        cleaned = data.assign(tweet=cleaned_tweet)
        # The index column only exists when the CSV was written with its index;
        # do not fail on files that lack it.
        return cleaned.drop(columns="Unnamed: 0", errors="ignore")

In [13]:
class data_processor:
    """Pad/truncate tokenized tweets, map them to vocabulary ids and split the data.

    Attributes:
        vocab:     vocabulary object exposing ``tracker`` (norm -> count) and
                   ``word_idx`` (surface string -> id, including '<PAD>' and '<UNK>').
        most:      fixed sequence length every tweet is padded/truncated to.
        threshold: minimum norm count for a token to keep its own id.
        train, validate, test: the three frames produced by the two splits; each
                   carries a 'numerized_tweet' column of id lists.
    """

    def __init__(self, tokenized_df, vocab, test_size, train_size, threshold, most, random_state=None):
        """
        Args:
            tokenized_df: frame whose 'tweet' column holds token sequences.
            vocab:        vocabulary object (see class docstring).
            test_size:    forwarded to the first ``train_test_split``.
            train_size:   forwarded to the second ``train_test_split`` (train vs. validation).
            threshold:    minimum norm count; rarer tokens become '<UNK>'.
            most:         target sequence length (expected to be a positive int).
            random_state: optional seed forwarded to both splits; ``None`` keeps
                          the previous unseeded behaviour.
        """
        self.vocab = vocab
        self.most = most
        self.threshold = threshold
        normalized = tokenized_df['tweet'].apply(data_processor.sentence_normalizer, args=(self,))
        normalized_df = tokenized_df.assign(tweet=normalized)
        numerized = normalized_df['tweet'].apply(data_processor.numerizer, args=(self,))
        numerized_df = normalized_df.assign(numerized_tweet=numerized)
        train_valid, self.test = train_test_split(numerized_df, test_size=test_size, random_state=random_state)
        self.train, self.validate = train_test_split(train_valid, train_size=train_size, random_state=random_state)

    def sentence_normalizer(sentence, self):
        """Return the first ``self.most`` tokens of ``sentence``, right-padded with '<PAD>' strings.

        The processor instance is the SECOND argument because the function is
        called through ``Series.apply(..., args=(self,))``.
        """
        final_tok = list(sentence)[:self.most]
        final_tok.extend(['<PAD>'] * (self.most - len(final_tok)))
        return final_tok

    def numerizer(x, self):
        """Translate a normalized token sequence into a list of vocabulary ids.

        * plain strings (the padding placeholders)             -> id of '<PAD>'
        * tokens whose norm count reaches ``self.threshold``    -> their own id
        * every other token (rare, unseen, or surface form
          missing from the vocabulary)                         -> id of '<UNK>'
        """
        word_idx = self.vocab.word_idx
        tracker = self.vocab.tracker
        pad_id = word_idx['<PAD>']
        unk_id = word_idx['<UNK>']

        base = []
        for token in x:
            if isinstance(token, str):
                # Placeholder inserted by sentence_normalizer; it has no ``.norm``.
                base.append(pad_id)
                continue
            seen = tracker.get(token.norm)
            if seen is not None and seen >= self.threshold:
                # A frequent norm whose surface form is absent from the vocabulary
                # is an unknown word, not padding, so fall back to <UNK>.
                base.append(word_idx.get(str(token), unk_id))
            else:
                base.append(unk_id)
        return base

In [14]:
class Dataset:
    """Map-style dataset that pairs each numerized tweet with its toxicity label.

    ``features`` stacks the 'numerized_tweet' column into one tensor;
    ``targets`` holds the 'Toxicity' column as a float column vector.
    """

    def __init__(self, df):
        token_matrix = np.stack(df['numerized_tweet'])
        labels = np.asarray(df['Toxicity'])
        self.features = torch.tensor(token_matrix)
        # Add a trailing axis and cast to float.
        self.targets = torch.tensor(labels).unsqueeze(1).float()

    def __len__(self):
        """Number of samples (rows of ``features``)."""
        return self.features.size(0)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.features[idx], self.targets[idx]

In [15]:
class NLP_LSTM(nn.Module):
    """Embedding -> shared linear layer (applied twice) -> LSTM -> two-layer head -> sigmoid.

    Both ``forward`` and ``tester`` return one probability per input row,
    shape ``(batch, 1)``. They differ only in whether the explicit dropout
    layers are applied.
    """

    def __init__(self, gpu, embedding_dim, hidden_dim, vocab_size, layers, dropout, bi):
        """
        Args:
            gpu:           stored as ``self.gpu`` for callers; the initial LSTM state
                           is now placed on the same device as the inputs instead of
                           being derived from this flag.
            embedding_dim: size of the token embeddings (and of ``fc0``).
            hidden_dim:    LSTM hidden size.
            vocab_size:    number of rows in the embedding table.
            layers:        number of stacked LSTM layers.
            dropout:       dropout probability for the LSTM and for ``drop_layer``.
            bi:            truthy for a bidirectional LSTM.
        """
        super(NLP_LSTM, self).__init__()

        # GPU
        self.gpu = gpu

        ## Params
        self.hdim = hidden_dim
        self.layers = layers
        self.drop = dropout
        self.bi = 2 if bi else 1  # number of LSTM directions

        ## layers
        self.embedder = nn.Embedding(vocab_size, embedding_dim)
        self.fc0 = nn.Linear(embedding_dim, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=layers, batch_first=True, dropout=dropout, bidirectional=bi)
        self.fc1 = nn.Linear(hidden_dim * self.bi, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()
        self.drop_layer = nn.Dropout(p=dropout)

    def _initial_state(self, inputs):
        """Zero (hidden, cell) tensors on the inputs' device, so CPU and GPU both work."""
        shape = (self.layers * self.bi, inputs.shape[0], self.hdim)
        dtype = self.embedder.weight.dtype
        hidden = torch.zeros(shape, dtype=dtype, device=inputs.device)
        cell = torch.zeros(shape, dtype=dtype, device=inputs.device)
        return hidden, cell

    def _predict(self, inputs, apply_dropout):
        """Shared pipeline behind ``forward`` and ``tester``."""
        hidden, cell = self._initial_state(inputs)

        features = self.relu(self.fc0(self.embedder(inputs)))
        # fc0 is applied a second time with the same weights, as before.
        features = self.relu(self.fc0(features))
        if apply_dropout:
            features = self.drop_layer(features)

        outputs, (hidden, cell) = self.lstm(features, (hidden, cell))

        # Only the LSTM output at the last sequence position feeds the head.
        head = self.relu(self.fc1(outputs[:, -1, :]))
        if apply_dropout:
            head = self.drop_layer(head)
        return torch.sigmoid(self.fc2(head))

    def forward(self, inputs):
        """Return probabilities for a batch of token-id rows, with the dropout layers applied."""
        return self._predict(inputs, apply_dropout=True)

    def tester(self, inputs):
        """Same as ``forward`` but skipping the explicit dropout layers."""
        return self._predict(inputs, apply_dropout=False)

In [16]:
class Trainer:
    """Run the baseline and LSTM toxicity classifiers end to end.

    Construction loads and preprocesses the CSV, builds the vocabulary, splits
    the data and sets up the LSTM, loss, optimiser and LR schedule. ``run``
    then evaluates whichever of 'simple', 'svm' and 'lstm' appear in ``models``.
    """

    def __init__(self, filepath, models, batch_size, emded_dim, hidden_dim, learning_rate, epochs, gam, train_size, test_size, early, layers, thresh, most, dropout, bi):
        """
        Args:
            filepath:      CSV file handed to ``Data_Creator``.
            models:        collection of model names to run ('simple', 'svm', 'lstm').
            batch_size:    batch size of all three data loaders.
            emded_dim:     embedding size of the LSTM model.
            hidden_dim:    hidden size of the LSTM model.
            learning_rate: Adam learning rate.
            epochs:        maximum number of training epochs.
            gam:           decay factor of the exponential LR schedule.
            train_size:    train fraction of the train/validation split.
            test_size:     test fraction of the first split.
            early:         early-stopping patience (see ``run_LSTM``).
            layers:        number of LSTM layers.
            thresh:        minimum token frequency passed to ``data_processor``.
            most:          fixed sequence length passed to ``data_processor``.
            dropout:       dropout probability of the LSTM model.
            bi:            truthy for a bidirectional LSTM.
        """
        self.gpu_avail = torch.cuda.is_available()
        loaded_Data = Data_Creator(filepath)
        self.vocabulary = vocab_builder(loaded_Data.tokenized_df)
        processed_data = data_processor(loaded_Data.tokenized_df, self.vocabulary, test_size, train_size, thresh, most)

        ## Data
        self.train_data = processed_data.train
        self.vlad_data = processed_data.validate
        self.test_data = processed_data.test

        ## Data Loaders (only the training data needs shuffling)
        self.train_loader = DataLoader(Dataset(self.train_data), batch_size=batch_size, shuffle=True)
        self.vlad_loader = DataLoader(Dataset(self.vlad_data), batch_size=batch_size, shuffle=False)
        self.test_loader = DataLoader(Dataset(self.test_data), batch_size=batch_size, shuffle=False)

        ## Models
        self.models = models

        ## Params
        self.last_epoch = epochs
        self.early = early
        self.bs = batch_size

        ## model
        self.loss = nn.BCELoss()
        self.model = NLP_LSTM(self.gpu_avail, emded_dim, hidden_dim, len(self.vocabulary.word_idx), layers, dropout, bi)
        if self.gpu_avail:
            self.loss = self.loss.cuda()
            self.model = self.model.cuda().float()
        # Always keep a snapshot so ``run_LSTM`` can restore weights even when no
        # epoch ever improves on the initial (infinite) loss.
        self.best = deepcopy(self.model.state_dict())

        ## Optimizer
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        self.lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=gam)

        ## Stat Tracking
        self.min_loss = float('inf')

        self.train_loss = []
        self.vlad_loss = []

        self.train_acc = []
        self.vlad_acc = []

    def run(self):
        """Run every requested model and print its evaluation metrics."""
        if 'simple' in self.models:
            simple_report = self.Simple_gusser_Model()
            print('The Simple Evaluation Metrics:\n')
            print(simple_report)
            print("\n")
        if 'svm' in self.models:
            # Named *_report so the local does not shadow the sklearn ``svm`` module.
            svm_report = self.svm_model()
            print('The SVM Evaluation Metrics:\n')
            print(svm_report)
            print("\n")
        if 'lstm' in self.models:
            lstm_report = self.run_LSTM()
            print('The LSTM Evaluation Metrics:\n')
            print(lstm_report)
            print("\n")

    def run_LSTM(self):
        """Train with early stopping, restore the best weights, save, plot and test.

        Training stops once the validation loss has failed to improve for more
        than ``self.early`` consecutive epochs. Returns the report from ``test``.
        """
        print("Training Commencing")
        start_time = datetime.now()
        stale_epochs = 0  # epochs since the validation loss last improved
        for epoch in range(0, self.last_epoch):
            print(F'epoch: {epoch+1}')
            print('Training\n')
            self.model.train(True)
            self.train()
            print('Validation\n')
            self.model.train(False)
            self.validation()
            if self.vlad_loss[-1] < self.min_loss:
                stale_epochs = 0
                self.min_loss = self.vlad_loss[-1]
                self.best = deepcopy(self.model.state_dict())
            else:
                stale_epochs += 1
            if stale_epochs > self.early:
                print('Early Stopping\n')
                break
        end_time = datetime.now()
        print(end_time - start_time)
        self.model.load_state_dict(self.best)
        self.model.train(False)
        self.save_model()
        self.plot_stats()
        return self.test()

    def _to_device(self, text, tag):
        """Move a batch to the GPU when one is in use."""
        if self.gpu_avail:
            return text.cuda(), tag.cuda()
        return text, tag

    @staticmethod
    def _score_batch(outputs, tag):
        """Return (predicted labels, true labels, number correct) as flat 1-D data.

        ``reshape(-1)`` is used instead of ``squeeze()`` so that a batch holding a
        single sample still yields 1-D tensors.
        """
        predicted = torch.round(outputs.detach()).reshape(-1)
        truth = tag.reshape(-1)
        return predicted, truth, int((predicted == truth).sum().item())

    def _evaluate(self, loader, predict, collect=False):
        """Score ``loader`` without gradient tracking.

        Returns ``(mean loss, accuracy, predicted labels, true labels)``; the two
        label lists are only filled when ``collect`` is true.
        """
        losses, predicted_all, truth_all = [], [], []
        correct = 0
        total = 0
        with torch.no_grad():
            for text, tag in loader:
                text, tag = self._to_device(text, tag)
                outputs = predict(text)
                losses.append(self.loss(outputs, tag).item())
                predicted, truth, hits = self._score_batch(outputs, tag)
                correct += hits
                total += truth.numel()
                if collect:
                    predicted_all.extend(predicted.cpu().tolist())
                    truth_all.extend(truth.cpu().tolist())
        avg = np.mean(losses) if losses else float('nan')
        acc = correct / total if total else float('nan')
        return avg, acc, predicted_all, truth_all

    def train(self):
        """Run one optimisation pass over the training loader and record loss/accuracy."""
        epoch_loss = []
        correct = 0
        total = 0

        for text, tag in self.train_loader:
            text, tag = self._to_device(text, tag)
            self.optimizer.zero_grad()
            outputs = self.model(text)
            loss = self.loss(outputs, tag)
            loss.backward()
            self.optimizer.step()
            epoch_loss.append(loss.item())
            _, truth, hits = self._score_batch(outputs, tag)
            correct += hits
            total += truth.numel()

        # The learning rate decays once per epoch.
        self.lr_scheduler.step()
        avg = np.mean(epoch_loss) if epoch_loss else float('nan')
        acc = correct / total if total else float('nan')
        self.train_loss.append(avg)
        self.train_acc.append(acc)
        print(F"train loss: {avg}")
        print(F"train accuracy: {acc}")

    def validation(self):
        """Score the validation loader and record loss/accuracy for this epoch."""
        avg, acc, _, _ = self._evaluate(self.vlad_loader, self.model)
        self.vlad_loss.append(avg)
        self.vlad_acc.append(acc)
        print(F"validation loss: {avg}")
        print(F"validation accuracy: {acc}")

    def test(self):
        """Score the test loader with ``model.tester`` and return a classification report dict."""
        avg, acc, pred, actual = self._evaluate(self.test_loader, self.model.tester, collect=True)
        print(F"test_loss: {avg}")
        print(F"test accuracy: {acc}")
        return classification_report(actual, pred, output_dict=True)

    def save_model(self):
        """Write the model and optimiser state dicts to 'latest_model.pt'."""
        model_path = 'latest_model.pt'
        model_dict = self.model.state_dict()
        state_dict = {'model': model_dict, 'optimizer': self.optimizer.state_dict()}
        torch.save(state_dict, model_path)

    def _plot_curves(self, train_values, vlad_values, train_label, vlad_label, filename):
        """Plot one training and one validation curve per epoch, save to ``filename`` and show."""
        x_axis = np.arange(1, len(train_values) + 1, 1)
        plt.figure(figsize=(10,6))
        plt.plot(x_axis, train_values, label=train_label)
        plt.plot(x_axis, vlad_values, label=vlad_label)
        plt.xlabel("Epochs")
        plt.legend(loc='best')
        plt.title("LSTM Sentiment Stats Plot")
        plt.savefig(filename)
        plt.show()

    def plot_stats(self):
        """Save and show the loss curves and the accuracy curves."""
        self._plot_curves(self.train_loss, self.vlad_loss, "Training Loss", "Validation Loss", "Loss_plot.png")
        # The accuracy figure must use the accuracy histories, not the losses.
        self._plot_curves(self.train_acc, self.vlad_acc, "Training Acc", "Validation Acc", "Accuracy_plot.png")

    # SVM model
    def svm_model(self):
        """Fit a linear SVM on TF-IDF features of the training tweets; report on the test set."""
        vectorizer = TfidfVectorizer(min_df = 5,
                                 max_df = 0.8,
                                 stop_words = 'english',
                                 sublinear_tf = True,
                                 use_idf = True)

        def convert(x):
            # Turn a list of vocabulary ids back into a space-separated string.
            return ' '.join(self.vocabulary.idx_word[i] for i in x)

        ## Pre-process
        svm_train = self.train_data['numerized_tweet'].apply(convert)
        svm_test = self.test_data['numerized_tweet'].apply(convert)

        # TF-IDF
        X_train = vectorizer.fit_transform(svm_train)
        X_test = vectorizer.transform(svm_test)

        # Labels
        y_train = self.train_data['Toxicity']
        y_test = self.test_data['Toxicity']

        # SVM
        classifier = svm.LinearSVC(C = 10**-2)

        # SVM Train
        classifier.fit(X_train, y_train)

        # SVM Test
        predictions = classifier.predict(X_test)

        return classification_report(y_test, predictions, output_dict=True)

    # Simple model
    def Simple_gusser_Model(self):
        """Random baseline: label each test row 1 with probability 0.45, else 0."""
        n_rows = len(self.test_data['Toxicity'])
        base = np.random.choice([1,0], size=n_rows, p =[0.45, 0.55])
        return classification_report(self.test_data['Toxicity'], base, output_dict=True)


        

In [17]:
path = "./data/FinalBalancedDataset.csv"

## Reproducibility
# Seed NumPy (used by the random baseline, and by scikit-learn's split when no
# random_state is passed) and PyTorch (weight init, dropout, batch shuffling)
# so a Restart & Run All gives comparable results.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

## Models
# put any of the models in the brackets into the list {svm, simple, lstm}
models = ['svm', 'simple', 'lstm']

## HyperParms
bs = 64          # batch size
embed = 150      # embedding dimension
hdim = 150       # LSTM hidden dimension
lr = 0.00005     # Adam learning rate
epochs = 30      # maximum number of epochs
gam= 0.96        # exponential LR decay factor
tr = 0.9         # train fraction of the train/validation split
ts= 0.1          # test fraction of the full data
early = 4        # early-stopping patience (epochs)
layers = 4       # LSTM layers
thresh = 5       # minimum token frequency to keep its own id
most = 29        # fixed sequence length after padding/truncation
dropout = 0.44
bi = True        # bidirectional LSTM

# NOTE: despite its name, ``model`` is the Trainer that owns the whole pipeline.
model = Trainer(path, models, bs, embed, hdim, lr, epochs, gam, tr, ts, early, layers, thresh, most, dropout, bi)
model.run()

