## <b>Import libraries</b>

In [129]:
import torch
import pandas as pd
import numpy as np
from collections import Counter
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from sklearn.metrics import f1_score
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [3]:
# Drew inspiration from:
#   https://github.com/dmesquita/understanding_pytorch_nn
#   https://github.com/msahamed/yelp_comments_classification_nlp/blob/master/word_embeddings.ipynb
#   https://github.com/nyu-mll/DS-GA-1011-Fall2017/blob/master/week%20eight/Week%20Eight%20Solutions.ipynb
#   https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
#   https://github.com/claravania/lstm-pytorch/blob/master/model.py

## <b>Data Processing</b>

In [18]:
# Load the training CSV; the path is relative to the notebook's working directory.
train = pd.read_csv("../train.csv")

In [19]:
# Label categories. Each one is read from seven numbered columns
# ('<label>1' ... '<label>7') -- presumably one vote per annotator; TODO confirm.
labels = ['CAPS', 'Obscenity', 'Threat', 'hatespeech', 'namecalling', 'negprejudice', 'noneng', 'porn', 'stereotypes']

for label in labels:
    vote_cols = [label + str(n) for n in range(1,8)]
    yes_votes = train[vote_cols].sum(axis = 1)
    # Keep the raw vote total next to the thresholded label.
    train[label + '_num_yes'] = yes_votes
    # The label is treated as present once the summed votes reach 2.
    train[label] = yes_votes >= 2

In [30]:
# Drop rows that have no cleaned tweet text. `.copy()` makes the result an
# independent frame, so adding columns to it afterwards does not trigger
# pandas' SettingWithCopyWarning.
train = train.loc[train['clean_tweet'].notnull()].copy()

In [32]:
# Vocabulary: occurrence count of every lower-cased, single-space-delimited
# token across all tweets. Each tweet is counted exactly once (the counting
# loop used to be duplicated, which doubled every count).
vocab = Counter()

for text in train.clean_tweet:
    vocab.update(word.lower() for word in text.split(' '))

# Number of distinct tokens; this is the width of the bag-of-words vectors.
total_words = len(vocab)

def get_word_2_index(vocab):
    """Map each word in ``vocab`` (lower-cased) to its iteration position.

    If two entries lower-case to the same string, the later position wins.
    """
    return {word.lower(): position for position, word in enumerate(vocab)}

# Token -> column-index lookup built from the vocabulary counter.
word2index = get_word_2_index(vocab)

In [144]:
def _count_tokens(tweet):
    """Return the number of single-space-delimited tokens in ``tweet``."""
    return len(tweet.split(' '))

# Token count per tweet, stored alongside the text.
train['seq_len'] = [_count_tokens(tweet) for tweet in train['clean_tweet']]

In [145]:
def get_batch(df,i,batch_size,x_name,target_name,length_name):
    """Build the ``i``-th mini-batch of token-count vectors, class ids and lengths.

    Rows ``i*batch_size`` up to (not including) ``i*batch_size + batch_size``
    are sliced from the ``x_name``, ``target_name`` and ``length_name`` columns
    of ``df``. Relies on the module-level ``total_words`` and ``word2index``.

    Returns a tuple of three numpy arrays:
      * token counts, one row per text with ``total_words`` columns;
      * class ids: 0 for a target equal to 0, 1 for a target equal to 1,
        and 2 for anything else;
      * the sliced ``length_name`` values.

    Raises ``KeyError`` if a token is missing from ``word2index``.
    """
    rows = slice(i*batch_size, i*batch_size+batch_size)
    texts = df[x_name][rows]
    categories = df[target_name][rows]
    lengths = df[length_name][rows]

    def bag_of_words(text):
        # One slot per vocabulary entry; repeated tokens accumulate.
        counts = np.zeros(total_words,dtype=float)
        for token in text.split(' '):
            counts[word2index[token.lower()]] += 1
        return counts

    def class_id(category):
        if category == 0:
            return 0
        return 1 if category == 1 else 2

    batches = [bag_of_words(text) for text in texts]
    results = [class_id(category) for category in categories]

    return np.array(batches),np.array(results),np.array(lengths)

In [58]:
class OurNet(nn.Module):
    """Feed-forward classifier with two ReLU-activated hidden layers.

    ``forward`` returns the raw output of the final linear layer (one value
    per class); no softmax is applied inside the network.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super(OurNet, self).__init__()
        # input -> hidden
        self.layer_1 = nn.Linear(input_size, hidden_size, bias=True)
        # The same activation module is reused after both hidden layers.
        self.relu = nn.ReLU()
        # hidden -> hidden
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        # hidden -> per-class scores
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)

    def forward(self, x):
        hidden = self.relu(self.layer_1(x))
        hidden = self.relu(self.layer_2(hidden))
        return self.output_layer(hidden)

In [162]:
# Width of each of the two hidden layers in OurNet.
hidden_size = 100 
# Number of output classes (label absent / label present).
num_classes=2
# Step size passed to the Adam optimizer.
learning_rate = 0.001
# Number of passes over the training batches.
num_epochs = 5
# Number of tweets per mini-batch.
batch_size = 32

In [175]:
net = OurNet(total_words, hidden_size,num_classes)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

# Number of full batches per epoch. A trailing partial batch (fewer than
# batch_size rows) is not visited.
total_batch = len(train.clean_tweet) // batch_size

# Train the Model
for epoch in range(num_epochs):
    # Loop over all batches
    for i in range(total_batch):
        # The per-tweet lengths returned by get_batch are not used by this model.
        batch_x, batch_y, _ = get_batch(train,i,batch_size,'clean_tweet','hatespeech','seq_len')
        # Plain tensors are enough: since PyTorch 0.4 the Variable wrapper is a no-op.
        tweets = torch.FloatTensor(batch_x)
        # Named batch_labels so the module-level `labels` list of label names
        # is not overwritten by a tensor.
        batch_labels = torch.LongTensor(batch_y)

        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = net(tweets)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        if (i+1) % 4 == 0:
            # loss is a 0-dim tensor; .item() extracts the Python float
            # (indexing it with [0] raises on PyTorch >= 0.4).
            print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                   %(epoch+1, num_epochs, i+1, total_batch, loss.item()))

Epoch [1/5], Step [4/124], Loss: 0.6760
Epoch [1/5], Step [8/124], Loss: 0.6338
Epoch [1/5], Step [12/124], Loss: 0.5884
Epoch [1/5], Step [16/124], Loss: 0.5210
Epoch [1/5], Step [20/124], Loss: 0.5078
Epoch [1/5], Step [24/124], Loss: 0.4425
Epoch [1/5], Step [28/124], Loss: 0.3274
Epoch [1/5], Step [32/124], Loss: 0.2328
Epoch [1/5], Step [36/124], Loss: 0.2718
Epoch [1/5], Step [40/124], Loss: 0.1794
Epoch [1/5], Step [44/124], Loss: 0.0771
Epoch [1/5], Step [48/124], Loss: 0.1039
Epoch [1/5], Step [52/124], Loss: 0.1014
Epoch [1/5], Step [56/124], Loss: 0.3253
Epoch [1/5], Step [60/124], Loss: 0.1622
Epoch [1/5], Step [64/124], Loss: 0.0638
Epoch [1/5], Step [68/124], Loss: 0.0178
Epoch [1/5], Step [72/124], Loss: 0.2167
Epoch [1/5], Step [76/124], Loss: 0.0536
Epoch [1/5], Step [80/124], Loss: 0.0884
Epoch [1/5], Step [84/124], Loss: 0.0166
Epoch [1/5], Step [88/124], Loss: 0.0264
Epoch [1/5], Step [92/124], Loss: 0.0253
Epoch [1/5], Step [96/124], Loss: 0.1968
Epoch [1/5], Step 

In [178]:
# NOTE: this scores the first 500 rows of the *training* frame, so the number
# printed is a training-set accuracy, not a held-out estimate.
batch_x_test, batch_y_test, _ = get_batch(train,0,500,'clean_tweet','hatespeech','seq_len')
articles = torch.FloatTensor(batch_x_test)
# Named eval_labels so the module-level `labels` list of label names is not
# overwritten by a tensor.
eval_labels = torch.LongTensor(batch_y_test)

# No gradients are needed for scoring.
with torch.no_grad():
    outputs = net(articles)
# Predicted class = index of the largest output score in each row.
_, predicted = torch.max(outputs, 1)

total = eval_labels.size(0)
# .item() converts the 0-dim count tensor to a Python int so the division
# below is ordinary float division rather than a tensor operation.
correct = (predicted == eval_labels).sum().item()
print (correct/total)

0.996


In [None]:
# 99.6% accuracy on hatespeech? Caveat: this was measured on the first 500 training rows rather than a holdout set, so the model may simply be overfit.