In [None]:
# This code is a modified version of the code in the following repo:
#                      https://github.com/prakashpandey9/Text-Classification-Pytorch
## We made several changes to make it compatible with the latest version of torchtext, and to also add some more
## features to it to improve the final performance. 




# Here is the licence of the source code:


# MIT License
# Copyright (c) 2018 Prakash Pandey

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

Defining the dataset loader function, which is written using torchtext. As mentioned in the README file, here we focus on sentiment detection on the IMDB reviews dataset. As we can see in this cell, torchtext has a separate module for this dataset, which makes it even easier for us to work with.

In [38]:
import os
import time
import sys
import torch
from torch.nn import functional as F
import numpy as np
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim



def load_dataset( BS, emb_DIM, test_sen=None, fix_length=200):

    """
    Build the IMDB train / validation / test iterators together with a GloVe-initialised vocabulary.

    Parameters
    ----------
    BS : batch size handed to BucketIterator.splits.
    emb_DIM : dimension of the GloVe '6B' vectors to load.
    test_sen : kept for backward compatibility; it is not used by this function.
    fix_length : fixed sequence length given to the text Field. Defaults to 200, the value
                 that used to be hard-coded here.

    Returns
    -------
    (TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter)

    Notes
    -----
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied
    Field : A class that stores information about the way of preprocessing
    fix_length : Each sequence is brought to this fixed length by the Field.

    build_vocab : It will first make a vocabulary or dictionary mapping all the unique words present in
                  the train_data to an idx and then after it will use GloVe word embedding to map the
                  index to the corresponding word embedding.

    vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim)
                    containing the pre-trained word embeddings.

    BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize
                     the amount of padding needed.

    """

    # Plain whitespace tokenisation; `str.split` replaces the former `lambda x: x.split()`.
    tokenize = str.split
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True,
                      include_lengths=True, batch_first=True, fix_length=fix_length)
    LABEL = data.LabelField( dtype=torch.float )
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL )

    # Loading GloVe pretrained vectors with dimension emb_DIM
    TEXT.build_vocab( train_data, vectors=GloVe(name='6B', dim=emb_DIM ))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    print ("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print ("Label Length: " + str(len(LABEL.vocab)))

    # Further splitting of training_data to create new training_data & validation_data
    train_data, valid_data = train_data.split()
    train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data),
                                       batch_size=BS, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)

    # Alternatively we can also use the default configurations:
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter





Defining the model architecture, which is an LSTM on the input word embeddings; its last hidden state is then passed to a fully connected layer that maps it to the ["pos","neg"] classes.

In [35]:
class LSTMClassifier(nn.Module):
    """Single-layer LSTM over frozen pre-trained word embeddings, followed by a linear classifier."""

    def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
        """
        Arguments
        ---------
        batch_size : Nominal batch size; stored on the instance as `self.batch_size`. `forward`
                     no longer needs it because it sizes the initial LSTM state from its input.
        output_size : 2 = (pos, neg)
        hidden_size : Size of the hidden_state of the LSTM
        vocab_size : Size of the vocabulary containing unique words
        embedding_length : Embedding dimension of GloVe word embeddings
        weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table
        """
        super(LSTMClassifier, self).__init__()

        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length

        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)  # Initializing the look-up table.
        # Replace the look-up table with the pre-trained vectors and freeze it (requires_grad=False).
        self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)
        self.lstm = nn.LSTM(embedding_length, hidden_size)
        self.label = nn.Linear(hidden_size, output_size)

    def forward(self, input_sentence, batch_size=None):
        """
        Parameters
        ----------
        input_sentence: input_sentence of shape = (batch_size, num_sequences)
        batch_size : default = None, in which case the batch size is read from `input_sentence`.
                     An explicit value is still honoured for backward compatibility.

        Returns
        -------
        Output of the linear layer containing logits for positive & negative class which receives its input as the final_hidden_state of the LSTM
        final_output.shape = (batch_size, output_size)

        """
        # Map every index of the input sequence to its word vector.
        embedded = self.word_embeddings(input_sentence)  # (batch_size, num_sequences, embedding_length)
        embedded = embedded.permute(1, 0, 2)  # (num_sequences, batch_size, embedding_length)

        if batch_size is None:
            # Use the real batch dimension so a short final batch or a single sentence just works.
            batch_size = embedded.size(1)

        # new_zeros creates the state on the same device and with the same dtype as the embedded
        # input, instead of guessing the device from torch.cuda.is_available().
        h_0 = embedded.new_zeros((1, batch_size, self.hidden_size))  # Initial hidden state of the LSTM
        c_0 = embedded.new_zeros((1, batch_size, self.hidden_size))  # Initial cell state of the LSTM

        _, (final_hidden_state, _) = self.lstm(embedded, (h_0, c_0))
        # final_hidden_state.size() = (1, batch_size, hidden_size) & final_output.size() = (batch_size, output_size)
        final_output = self.label(final_hidden_state[-1])

        return final_output

In order to prevent the gradients from exploding (which might happen in an RNN architecture), we define a clip_gradient function that clips all gradients to some predefined value.
Moreover, it's good practice to define separate train_model and eval_model functions that are responsible for the batch training steps and for evaluating the model in the middle of training to monitor the overall performance.
Defining these two functions separately will make the code cleaner, and also helps you debug the whole architecture more easily once you encounter an error.

In [36]:
def clip_gradient(model, clip_value):
    """Clamp, in place, every existing parameter gradient of `model` to [-clip_value, clip_value]."""
    lower, upper = -clip_value, clip_value
    for param in model.parameters():
        # Parameters that have not received a gradient are left untouched.
        if param.grad is None:
            continue
        param.grad.data.clamp_(lower, upper)
    
def train_model(model, train_iter, epoch, optimizer, BS):
    """
    Run one training epoch and return the mean loss and mean accuracy (in percent).

    Parameters
    ----------
    model : module called as `model(text)` that returns per-class scores.
    train_iter : iterable of batches exposing `batch.text[0]`, `batch.label` and `len(batch)`.
    epoch : zero-based epoch index, used only in the progress message.
    optimizer : optimizer stepped once per processed batch.
    BS : expected batch size; batches whose first dimension differs are skipped.

    Returns
    -------
    (mean_loss, mean_accuracy) averaged over the batches that were actually processed,
    or (0.0, 0.0) when no batch was processed.

    Notes
    -----
    `loss_fn` and `clip_gradient` are looked up as module-level (notebook) globals.
    """
    total_epoch_loss = 0.0
    total_epoch_acc = 0.0
    steps = 0

    # Send the data to wherever the model lives rather than assuming "CUDA available => model on CUDA".
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for idx, batch in enumerate(train_iter):
        text = batch.text[0]
        # Compare by value: `is not` tests object identity and is unreliable for integers.
        if text.size(0) != BS:
            continue
        text = text.to(device)
        target = batch.label.long().to(device)

        optimizer.zero_grad()
        prediction = model(text)
        loss = loss_fn(prediction, target)
        num_corrects = (torch.max(prediction, 1)[1].view(target.size()) == target).float().sum()
        acc = 100.0 * num_corrects/len(batch)
        loss.backward()
        clip_gradient(model, 1e-1)
        optimizer.step()
        steps += 1

        if steps % 100 == 0:
            print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')

        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()

    if steps == 0:
        return 0.0, 0.0
    # Average over processed batches only; skipped batches contributed nothing to the totals.
    return total_epoch_loss/steps, total_epoch_acc/steps


def eval_model(model, val_iter, BS=None):
    """
    Evaluate `model` on `val_iter` without tracking gradients.

    Parameters
    ----------
    model : module called as `model(text)` that returns per-class scores.
    val_iter : iterable of batches exposing `batch.text[0]`, `batch.label` and `len(batch)`.
    BS : expected batch size; batches whose first dimension differs are skipped. When omitted,
         `model.batch_size` is used if the model has that attribute, otherwise 32 (the value
         that used to be hard-coded here).

    Returns
    -------
    (mean_loss, mean_accuracy_percent) averaged over the batches that were actually evaluated,
    or (0.0, 0.0) when no batch was evaluated.

    Notes
    -----
    `loss_fn` is looked up as a module-level (notebook) global.
    """
    if BS is None:
        BS = getattr(model, "batch_size", 32)

    total_epoch_loss = 0.0
    total_epoch_acc = 0.0
    evaluated = 0

    # Send the data to wherever the model lives rather than assuming "CUDA available => model on CUDA".
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(val_iter):
            text = batch.text[0]
            # Compare by value: `is not` tests object identity and is unreliable for integers.
            if text.size(0) != BS:
                continue
            text = text.to(device)
            target = batch.label.long().to(device)

            prediction = model(text)
            loss = loss_fn(prediction, target)
            num_corrects = (torch.max(prediction, 1)[1].view(target.size()) == target).float().sum()
            acc = 100.0 * num_corrects/len(batch)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()
            evaluated += 1

    if evaluated == 0:
        return 0.0, 0.0
    # Average over evaluated batches only; skipped batches contributed nothing to the totals.
    return total_epoch_loss/evaluated, total_epoch_acc/evaluated


Downloading the IMDB reviews dataset (with the sentiment labels), and also the pretrained GloVe word embeddings. Here we use 300-dimensional word embeddings, but you might also consider other dimensions in your hyper-parameter search phase.

In [39]:
# Hyper-parameters shared by the data pipeline and the model.
batch_size, embedding_length = 32, 300

# Builds the vocabulary (with GloVe vectors) and the train / validation / test iterators.
(TEXT, vocab_size, word_embeddings,
 train_iter, valid_iter, test_iter) = load_dataset(batch_size, embedding_length)


Length of Text Vocabulary: 251639
Vector size of Text Vocabulary:  torch.Size([251639, 300])
Label Length: 2


In [33]:
# Training hyper-parameters.
learning_rate = 2e-5
output_size = 2
hidden_size = 256

EPOCH = 20
ckpt_saving_path = "model_ckpt.pth"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LSTMClassifier( batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
model = model.to(device)

# Named `optimizer` (not `optim`) so it does not shadow the `torch.optim as optim` module import.
# Only parameters with requires_grad=True are optimised; the embedding table is frozen.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters() ) , lr = learning_rate )
# train_model / eval_model read `loss_fn` as a global, so it must be defined before they are called.
loss_fn = F.cross_entropy


# Start below any reachable accuracy so the first epoch always writes a checkpoint.
best_val_acc = float("-inf")
for epoch in range(EPOCH):
    train_loss, train_acc = train_model(model, train_iter, epoch, optimizer, batch_size)
    val_loss, val_acc = eval_model(model, valid_iter)

    # Keep only the weights of the best model seen so far on the validation set.
    if val_acc > best_val_acc:
        torch.save( model.state_dict(), ckpt_saving_path )
        best_val_acc = val_acc

    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%')


# Restore the best checkpoint; map_location keeps this working when the device differs from the one used for saving.
model.load_state_dict( torch.load(ckpt_saving_path, map_location=device) )
# eval_model itself switches to eval mode and disables gradient tracking.
test_loss, test_acc = eval_model(model, test_iter)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')
model.train()


        
# best validation score after 20 epochs : 0.8102
# best performing model through learning (on validation set) accuracy on test set: 0.8067

Epoch: 1, Idx: 100, Training Loss: 0.6736, Training Accuracy:  59.38%
Epoch: 1, Idx: 200, Training Loss: 0.6764, Training Accuracy:  53.12%
Epoch: 1, Idx: 300, Training Loss: 0.7234, Training Accuracy:  56.25%
Epoch: 1, Idx: 400, Training Loss: 0.6768, Training Accuracy:  50.00%
Epoch: 1, Idx: 500, Training Loss: 0.6837, Training Accuracy:  59.38%
Epoch: 01, Train Loss: 0.679, Train Acc: 55.15%, Val. Loss: 0.685150, Val. Acc: 55.77%
Epoch: 2, Idx: 100, Training Loss: 0.7238, Training Accuracy:  53.12%
Epoch: 2, Idx: 200, Training Loss: 0.7021, Training Accuracy:  56.25%
Epoch: 2, Idx: 300, Training Loss: 0.5586, Training Accuracy:  71.88%
Epoch: 2, Idx: 400, Training Loss: 0.6955, Training Accuracy:  59.38%
Epoch: 2, Idx: 500, Training Loss: 0.7518, Training Accuracy:  37.50%
Epoch: 02, Train Loss: 0.657, Train Acc: 61.26%, Val. Loss: 0.691126, Val. Acc: 50.73%
Epoch: 3, Idx: 100, Training Loss: 0.6897, Training Accuracy:  56.25%
Epoch: 3, Idx: 200, Training Loss: 0.6977, Training Accu

KeyboardInterrupt: 

Now you can also try different hyper-parameters on the validation set to find a better model that boosts performance on the held-out test dataset.
Moreover, we could think of adding modules such as dropout to the architecture to evaluate their effect on performance, or even of replacing the RNN with a CNN, as CNNs show promising results in sentiment analysis. However, these methods are out of scope for this introductory session, and we leave them for more advanced ML courses.

You can also use Google Colab service to accelerate your model training by using GPUs. The code is modified to be compatible with GPUs, and once a GPU-enabled environment is used, model is trained on GPU as well. Check their service if you have trouble with training due to training time:
#### https://colab.research.google.com