# IMDB Sentiment Analysis Challenge

In this notebook, I will create and train two SA models that use the IMDB dataset.

---




First install necessary packages and download and extract The IMBD dataset.

In [0]:
import zipfile
import tensorflow as tf
import os
import math
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from six.moves.urllib.request import urlretrieve
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

try:
  print("GPU device found")
  device = torch.device("cuda")
except:
  print("GPU device not found, I advise changing the runtime type.")
  device = torch.device("cpu")

url = "https://resemble.sfo2.digitaloceanspaces.com/imdb-review-dataset.zip"
path = os.path.join("imdb_dataset")
if not os.path.exists(path):
  os.makedirs(path)
  fpath = os.path.join(path, "imdb-review-dataset.zip")
  urlretrieve(url, fpath)
  review_data = zipfile.ZipFile(fpath, 'r')
  review_data.extractall(path)
  review_data.close()

fpath = os.path.join(path, "imdb_master.csv")
f = open(fpath)
movieReviews = pd.read_csv(f)

Using TensorFlow backend.


GPU device found


Data preprocessing step, here I group data by its respective train/test label and shuffle the training and testing data. Next  I use keras's tokenizer to vectorize each review into a sequence of integers corresponding to the diction from the reviews.

In [0]:
#Take the first half of dataset, because the latter reviews are unlabeled
reviews = movieReviews['review'].tolist()[0:49999]
sentiments = [0 if sentiment == "neg" else 1 for sentiment in movieReviews['label'].tolist()[0:49999]]

#assort training and test data accordingly to the dataset 
types = movieReviews['type'].tolist()[0:49999]
training_idxs = []
testing_idxs = []
for i in range(49999):
  if types[i] == "test":
    testing_idxs.append(i)
  if types[i] == "train":
    training_idxs.append(i)

random.shuffle(training_idxs)
random.shuffle(testing_idxs)

x_train_temp, y_train = [reviews[i] for i in training_idxs], [sentiments[i] for i in training_idxs]
x_test_temp, y_test = [reviews[i] for i in testing_idxs], [sentiments[i] for i in testing_idxs]

#vectorize each review into a sequence of integers corresponding to words from reviews
vocab_size = 15000
tokenizer = Tokenizer(num_words=vocab_size, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(reviews)
x_train_tokenized = tokenizer.texts_to_sequences(x_train_temp)
x_test_tokenized = tokenizer.texts_to_sequences(x_test_temp)

#set a maximum review length as well as pad sequences with zeros under the review length
#make sure to tokenize the train and test set seperately
x_train = pad_sequences(x_train_tokenized, maxlen = 250)
x_test = pad_sequences(x_test_tokenized, maxlen = 250)


Here I use Keras to implement simple LSTM model with dropout. I used a binary cross entropy as my loss function (since it's a binary classification problem), and the classic adam optimizer. 

In [0]:
#Basic RNN using LSTM blocks used to recall information over long sequences, implements 
#dropout to prevent overfitting and a sigmoidal activation func
model = Sequential()
model.add(Embedding(vocab_size, 250))
model.add(LSTM(64, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

batch_size = 1024
epochs = 10

model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test Set score:', score)
print('Test Set Accuracy:', acc)

It turns out that I was able to get a pretty good accuracy score on my test dataset. However in an attempt to score a little bit higher, (and show off my Pytorch skills), I will next implement a bidirectional LSTM model with attention.  I use a self attention network since the sequence is pretty long (set to 250 words per review) as well as gradient clipping to prevent the issues of exploding gradients common with long reccurent networks. I also add a regularization layer and dropout layers to prevent overfitting.

# Take 2

In [0]:
class LSTM_PLUS_ATTN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, attention_model, drop_prob=0.3):
      
        super(LSTM_PLUS_ATTN, self).__init__()
        
        self.hidden_size = hidden_size
        
        self.attention_model = attention_model
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.LSTM = nn.LSTM(embedding_dim, self.hidden_size, 1, dropout=drop_prob, batch_first=True, bidirectional = True)
        
        self.attention_weights1 = nn.Linear(hidden_size,hidden_size,bias=False)
        
        self.dropout = nn.Dropout(drop_prob)
        
        self.lin1 = nn.Linear(self.hidden_size*2, vocab_size/3)
        
        self.lin2 = nn.Linear(vocab_size/3, 1)
        
        self.sig = nn.Sigmoid()
        

    def forward(self, x):
      
        w = next(self.parameters()).data
        
        hidden = (w.new(2, batch_size, self.hidden_size).zero_().cuda(),
                  w.new(2, batch_size, self.hidden_size).zero_().cuda())
        
        embeds = self.embedding(x)
        
        lstm_out, hidden = self.LSTM(embeds, hidden)        
        
        lin_comb, attn = self.attention_model(lstm_out)
        
        l1 = self.lin1(lin_comb)
        
        d1 = self.dropout(l1)
        
        l2 = self.lin2(d1)
        
        avg_sentence_embeddings = torch.sum(l2,1)/30
        
        sig_out = self.sig(avg_sentence_embeddings)
        
        return sig_out 
      
class Attention(nn.Module):
  def __init__(self, query_dim, hidden_size):
    
    super(Attention, self).__init__()
    
    self.scale = 1. / math.sqrt(query_dim)
    
    self.l1 = nn.Linear(hidden_size*2, hidden_size*2)
    
    self.l1.bias.data.fill_(0)
    
    self.dropout = nn.Dropout(0.3)
    
    self.l2 = nn.Linear(hidden_size*2, 30)
    
    self.l2.bias.data.fill_(0)
    
    self.tanh = nn.Tanh()
    
  def forward(self, inputs):
    
    l1 = self.l1(inputs)
    
    tanh = self.tanh(l1)
    
    drop = self.dropout(tanh)
    
    l2 = self.l2(drop)
    
    attn = torch.nn.functional.softmax(l2, dim=2)
        
    linear_combination = torch.bmm(attn.transpose(1,2), inputs).squeeze(2)

    return linear_combination, attn
    

See summary of the model I used below...

In [0]:
vocab_size = 15000
embedding_dim = 300
hidden_size = 256
batch_size = 500
epochs = 15

# Data preprocessing
train = TensorDataset(torch.from_numpy(np.array(x_train)), torch.from_numpy(np.array(y_train)))
test = TensorDataset(torch.from_numpy(np.array(x_test)), torch.from_numpy(np.array(y_test)))

# Data iterators
train_loader = DataLoader(train, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test, shuffle=True, batch_size=batch_size)
dataiter = iter(train_loader)
dataiter = iter(test_loader)

# Instantiate model
attn_mod = Attention(hidden_size, hidden_size)
model = LSTM_PLUS_ATTN(vocab_size, embedding_dim, hidden_size, attn_mod)
print(model)


LSTM_PLUS_ATTN(
  (attention_model): Attention(
    (l1): Linear(in_features=512, out_features=512, bias=True)
    (dropout): Dropout(p=0.3)
    (l2): Linear(in_features=512, out_features=30, bias=True)
    (tanh): Tanh()
  )
  (embedding): Embedding(15000, 300)
  (LSTM): LSTM(300, 256, batch_first=True, dropout=0.3, bidirectional=True)
  (attention_weights1): Linear(in_features=256, out_features=256, bias=False)
  (dropout): Dropout(p=0.3)
  (lin1): Linear(in_features=512, out_features=5000, bias=True)
  (lin2): Linear(in_features=5000, out_features=1, bias=True)
  (sig): Sigmoid()
)


  "num_layers={}".format(dropout, num_layers))


Note: if the gpu is running out of vram you may need to restart the runtime and run cells 1, 2, (skip 3), 4, and 5 before running the 6th code cell.

In [0]:

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
clipping=10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
test_loss = []
num_correct = 0
model.cuda()
model.train()
print("training commences...")
for e in range(epochs):
    counter = 0
    for inputs, labels in train_loader:
        counter += 1
        if (25000 - counter * batch_size < batch_size):
          continue
          
        inputs, labels = inputs.cuda(), labels.cuda()
        model.zero_grad()
        inputs = inputs.type(torch.LongTensor)
        inputs = inputs.to(device)
        output = model(inputs)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clipping)
        optimizer.step()

    print("Epoch #" + str(e+1) + "/" + str(epochs) + ", " + "Loss: " + str(loss.item()))

val_losses = []
model.eval()
num_correct = 0
for inputs, labels in test_loader:
    #Finished the epoch
    if(inputs.shape[0]<batch_size):
      continue

    inputs, labels = inputs.cuda(), labels.cuda()
    inputs = inputs.type(torch.LongTensor)
    inputs = inputs.to(device)
    output = model(inputs)
    val_loss = criterion(output.squeeze(), labels.float())
    val_losses.append(val_loss.item())
    pred = torch.round(output.squeeze())
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)
    test_acc = num_correct/len(test_loader.dataset)
    model.train()

print("Test accuracy: " + str(test_acc))      

training commences...
Epoch #1/15, Loss: 0.555325090885
Epoch #2/15, Loss: 0.372678458691
Epoch #3/15, Loss: 0.269668340683
Epoch #4/15, Loss: 0.222672760487
Epoch #5/15, Loss: 0.204688951373
Epoch #6/15, Loss: 0.143753200769
Epoch #7/15, Loss: 0.107513204217
Epoch #8/15, Loss: 0.0357235558331
Epoch #9/15, Loss: 0.0165668465197
Epoch #10/15, Loss: 0.00814997218549
Epoch #11/15, Loss: 0.0106172319502
Epoch #12/15, Loss: 0.00995420571417
Epoch #13/15, Loss: 0.00415983935818
Epoch #14/15, Loss: 0.097490273416
Epoch #15/15, Loss: 0.0130673609674
Test accuracy: 0.87356


As expected we acheive a better test set accuracy than the prior network. Thank you for taking the time to look through my code! 