## <b>Import libraries<b>

In [1]:
import torch
import pandas as pd
import numpy as np
from collections import Counter
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from sklearn.metrics import f1_score
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable

In [2]:
#drew inspiration from https://github.com/dmesquita/understanding_pytorch_nn and
#and https://github.com/msahamed/yelp_comments_classification_nlp/blob/master/word_embeddings.ipynb
#and https://github.com/nyu-mll/DS-GA-1011-Fall2017/blob/master/week%20eight/Week%20Eight%20Solutions.ipynb
#and https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
#https://github.com/claravania/lstm-pytorch/blob/master/model.py
#https://medium.com/@sonicboom8/sentiment-analysis-with-variable-length-sequences-in-pytorch-6241635ae130
#https://github.com/hpanwar08/sentence-classification-pytorch/blob/master/Sentiment%20analysis%20pytorch.ipynb

## <b> Data Processing<b>

In [3]:
train = pd.read_csv("../train.csv")

In [4]:
#create labels

In [5]:
labels = ['CAPS', 'Obscenity', 'Threat', 'hatespeech', 'namecalling', 'negprejudice', 'noneng', 'porn', 'stereotypes']

for label in labels:
    cols = [label + str(x) for x in range(1,8)]
    train[label + '_num_yes'] = train[cols].sum(axis = 1)
    train[label] = pd.Series(train[label + '_num_yes'] >= 2).astype(int)

In [6]:
train = train.loc[train['clean_tweet'].isnull() == False,]

In [7]:
train.reset_index(inplace = True, drop = True)

In [8]:
train.head()

Unnamed: 0.1,Unnamed: 0,CAPS1,CAPS2,CAPS3,CAPS4,CAPS5,CAPS6,CAPS7,Obscenity1,Obscenity2,...,namecalling_num_yes,namecalling,negprejudice_num_yes,negprejudice,noneng_num_yes,noneng,porn_num_yes,porn,stereotypes_num_yes,stereotypes
0,420,0.0,0.0,0.0,0.0,,,,0.0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
1,1676,0.0,,,,0.0,,0.0,0.0,,...,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0
2,742,,,,,0.0,0.0,0.0,,,...,0.0,0,0.0,0,0.0,0,0.0,0,1.0,0
3,791,,,0.0,0.0,,0.0,,,,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
4,164,,,0.0,0.0,,0.0,,,,...,0.0,0,0.0,0,3.0,1,0.0,0,0.0,0


In [9]:
vocab = Counter()

for text in train.clean_tweet:
    for word in text.split(' '):
        vocab[word.lower()]+=1

for text in train.clean_tweet:
    for word in text.split(' '):
        vocab[word.lower()]+=1

total_words = len(vocab)

def get_word_2_index(vocab):
    word2index = {}
    for i,word in enumerate(vocab):
        word2index[word.lower()] = i+1

    return word2index

word2index = get_word_2_index(vocab)

In [10]:
#need to make indexer start at 1, because 0 is a pad token

In [27]:
def pad_data(s, length):
    padded = np.zeros((length,), dtype = np.int64)
    if len(s) > length: 
        padded = s[:length]
    else:
        padded[:len(s)] = s
    return np.array(padded)

In [28]:
train['seq_len'] = [len(x.split(' ')) for x in train['clean_tweet']]

train['numeric'] = [[word2index[y] for y in x.split(' ')] for x in train['clean_tweet']]

train['padded_tweet'] = [pad_data(x, 20) for x in train.numeric]

In [29]:
# subclass the custom dataset class with torch.utils.data.Dataset
# implement __len__ and __getitem__ function
class VectorizeData(Dataset):
    def __init__(self, df, label, maxlen=20):
        self.df = df
        self.label = label

    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        X = self.df.padded_tweet[idx]
        y = self.df[self.label][idx]
        lens = self.df.seq_len[idx]
        return X,y,lens

In [30]:
data = VectorizeData(train, label = 'hatespeech')

In [31]:
dl = DataLoader(data, batch_size = 32)

125

In [115]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, batch_size):
        super(LSTMClassifier, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1)
        self.hidden2out = nn.Linear(hidden_dim, output_size)
        self.softmax = nn.LogSoftmax()
        self.dropout_layer = nn.Dropout(p=0.2)
        self.batch_size = batch_size
        #self.hidden = self.init_hidden(batch_size)

    def init_hidden(self, batch_size):
        return(autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)), \
               autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)))

    def forward(self, batch): #removed lengths
        #should reinitalize hidden states before each batch?
        self.hidden = self.init_hidden(batch_size)
        embeds = self.embedding(batch)
        #packed_input = pack_padded_sequence(embeds, lengths)
        outputs, (ht, ct) = self.lstm(embeds, self.hidden)
        # ht is the last hidden state of the sequences
        # ht = (1 x batch_size x hidden_dim)
        # ht[-1] = (batch_size x hidden_dim)
        output = self.dropout_layer(ht[-1])
        output = self.hidden2out(output)
        output = self.softmax(output)
        return output

In [116]:
hidden_size = 100 
num_classes = 2
learning_rate = 0.0001
num_epochs = 5
batch_size = 32

In [117]:
net = LSTMClassifier(total_words, hidden_size, hidden_size, num_classes, batch_size)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)  

# Train the Model
for epoch in range(num_epochs):
    it = iter(dl)
    total_batch = int(len(train.clean_tweet)/batch_size)
    # Loop over all batches
    for i in range(total_batch):
        batch_x,batch_y,batch_len = next(it)
        tweets = Variable(batch_x.transpose(0,1))
        labels = Variable(batch_y)
        lengths = Variable(batch_len)
        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = net(tweets)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i+1) % 4 == 0:
            print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                   %(epoch+1, num_epochs, i+1, len(train.clean_tweet)//batch_size, loss.data[0]))

Epoch [1/5], Step [4/124], Loss: 0.7770
Epoch [1/5], Step [8/124], Loss: 0.6642
Epoch [1/5], Step [12/124], Loss: 0.5778
Epoch [1/5], Step [16/124], Loss: 0.4910
Epoch [1/5], Step [20/124], Loss: 0.4693
Epoch [1/5], Step [24/124], Loss: 0.4203
Epoch [1/5], Step [28/124], Loss: 0.3052
Epoch [1/5], Step [32/124], Loss: 0.2595
Epoch [1/5], Step [36/124], Loss: 0.3130
Epoch [1/5], Step [40/124], Loss: 0.2492
Epoch [1/5], Step [44/124], Loss: 0.1811
Epoch [1/5], Step [48/124], Loss: 0.2075
Epoch [1/5], Step [52/124], Loss: 0.1843
Epoch [1/5], Step [56/124], Loss: 0.3317
Epoch [1/5], Step [60/124], Loss: 0.1812
Epoch [1/5], Step [64/124], Loss: 0.1469
Epoch [1/5], Step [68/124], Loss: 0.0927
Epoch [1/5], Step [72/124], Loss: 0.2293
Epoch [1/5], Step [76/124], Loss: 0.1484
Epoch [1/5], Step [80/124], Loss: 0.1551
Epoch [1/5], Step [84/124], Loss: 0.0687
Epoch [1/5], Step [88/124], Loss: 0.0501
Epoch [1/5], Step [92/124], Loss: 0.0557
Epoch [1/5], Step [96/124], Loss: 0.3378
Epoch [1/5], Step 

In [118]:
correct = 0
total = 0
it = iter(dl)
batch_x,batch_y,batch_len = next(it)
tweets = Variable(batch_x.transpose(0,1))
labels = Variable(batch_y)
lengths = Variable(batch_len)
outputs = net(tweets)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct = (predicted == labels.data).sum()
print (correct/total)

0.9375
