Get data and preprocess

In [286]:
import torch

from torchtext import data
from torchtext import datasets


# Seed PyTorch's RNG and ask cuDNN for deterministic kernels so reruns are repeatable.
SEED = 1234
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)

# include_lengths=True makes every batch carry the sentence lengths next to the
# token ids; the model uses them to pack the padded sequences before the LSTM.
TEXT = data.Field(include_lengths=True, tokenize='spacy')

# Labels are emitted as float tensors because the loss used later in this
# notebook is applied to float targets.
LABEL = data.LabelField(dtype=torch.float)

print(TEXT)


<torchtext.data.field.Field object at 0x7fd9b47129e8>


In [287]:
import random
# Load the IMDB train/test splits, processed through the TEXT and LABEL fields.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Carve a validation set out of the training data.
# random.seed() returns None, so it must not be passed as `random_state`
# directly; seed the RNG first and hand its state to split() explicitly.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())
print(len(train_data))


17500


An interesting aspect to explore is whether the model still works when it is trained on reversed token sequences, given that we use pretrained GloVe word embeddings.

In [288]:
# Train on reversed input: flip each training example's token list in place.
# Caution: the reversal mutates train_data, so running this cell a second time
# puts the tokens back in their original order.
for example in train_data:
  example.text.reverse()

In [289]:
# Apply the same in-place token reversal to the validation examples.
# NOTE(review): the original remark mentioned validating on both non-reversed
# and reversed text, but only the reversed form is produced here - confirm intent.
for example in valid_data:
  example.text.reverse()



In [290]:
# Show the token lists of the first seven examples of each split to confirm
# that the reversal above took effect.
for example in train_data[:7]:
  print(example.text)

print('-----')

for example in valid_data[:7]:
  print(example.text)

['.', 'eight', 'an', 'maybe', 'as', 'high', 'as', ',', 'rating', 'higher', 'a', 'this', 'given', 'have', 'might', 'I', 'and', 'funnier', 'be', 'might', 'he', 'mental', 'so', 'act', "n't", 'did', 'just', 'Dane', 'If', '.', 'joke', 'Pieces', "'s", 'Reese', 'the', 'liked', 'also', 'I', '.', 'thru', 'drive', 'the', 'at', 'yell', 'people', 'where', 'joke', 'King', 'Burger', 'the', 'like', 'jokes', 'good', 'few', 'a', 'had', 'He', '.', 'alive', 'was', 'he', 'when', 'that', 'into', 'really', 'was', 'Dangerfield', 'Rodney', 'sure', "'m", 'I', '.', 'star', 'rock', 'a', 'be', 'to', 'wants', 'comedian', 'every', 'that', 'said', 'He', '?', 'something', 'or', 'stage', 'on', 'up', 'both', 'there', 'because', 'Just', '.', 'stars', 'rock', 'and', 'comics', 'toward', 'comparison', 'his', 'understand', 'quite', "n't", 'did', 'I', '.', 'laugh', 'us', 'make', 'and', 'together', 'yourself', 'get', ',', 'down', 'Calm', '.', 'much', 'To', '.', 'floor', 'the', 'on', 'rolling', 'and', 'himself', 'on', 'water',

In [291]:
# Upper bound on the number of tokens kept in the vocabulary.
MAX_VOCAB_SIZE = 25000

# Build the text vocabulary from the training split only and attach the
# pretrained "glove.6B.100d" vectors. Tokens that appear in the data but have
# no pretrained vector are initialised with torch.Tensor.normal_.
TEXT.build_vocab(
    train_data,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
    max_size=MAX_VOCAB_SIZE,
)

LABEL.build_vocab(train_data)


In [292]:
BATCH_SIZE = 64

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# BucketIterator batches together examples of similar (not necessarily equal)
# length. sort_within_batch=True additionally orders the examples inside each
# batch by length, which the packed-sequence call in the model relies on.
dataset_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    dataset_splits,
    device=device,
    sort_within_batch=True,
    batch_size=BATCH_SIZE)

In [293]:
# Look at a single training batch: batch.text is a (token ids, lengths) pair
# because TEXT was created with include_lengths=True.
for sample_batch in train_iterator:
  token_ids, token_lengths = sample_batch.text
  labels = sample_batch.label
  print(sample_batch)
  print('text size',token_ids.shape, '\n' 'text length tensor', token_lengths, token_lengths.shape, '\n' 'label', labels, labels.shape)
  break  # one batch is enough for inspection


[torchtext.data.batch.Batch of size 64]
	[.text]:('[torch.cuda.LongTensor of size 132x64 (GPU 0)]', '[torch.cuda.LongTensor of size 64 (GPU 0)]')
	[.label]:[torch.cuda.FloatTensor of size 64 (GPU 0)]
text size torch.Size([132, 64]) 
text length tensor tensor([132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132,
        132, 132, 132, 132, 132, 132, 132, 132, 132, 131, 131, 131, 131, 131,
        131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131,
        131, 131, 131, 131, 131, 131, 130, 130, 130, 130, 130, 130, 130, 130,
        130, 130, 130, 130, 130, 130, 130, 130], device='cuda:0') torch.Size([64]) 
label tensor([1., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1.,
        0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1.,
        1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1.,
        0., 0., 0., 1., 0., 0., 1., 1., 0., 0.], device='cuda:0') torch.Size([64])


In [294]:
import torch.nn as nn

class RNN(nn.Module):
  """LSTM-based classifier over batches of padded token ids.

  Forward pipeline: embedding -> dropout -> packed LSTM (`rnn`) -> two passes
  through a second LSTM (`rnn2`) -> dropout -> linear layer (`fc`).
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
    """Create the layers.

    Args:
      vocab_size: number of rows in the embedding table.
      embedding_dim: size of each embedding vector (input size of `rnn`).
      hidden_dim: hidden size of both LSTMs (also the input size of `rnn2`).
      output_dim: number of values produced by the final linear layer.
      n_layers: `num_layers` passed to both LSTMs.
      bidirectional: whether both LSTMs run in both directions.
      dropout: dropout probability for both LSTMs and the standalone dropout layer.
      pad_idx: vocabulary index of the padding token (`padding_idx` of the embedding).
    """
    super().__init__()
    self.bidirectional = bidirectional

    # Both LSTMs share everything except their input size.
    lstm_options = dict(num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)

    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
    self.rnn = nn.LSTM(embedding_dim, hidden_dim, **lstm_options)
    self.rnn2 = nn.LSTM(hidden_dim, hidden_dim, **lstm_options)

    # A bidirectional LSTM contributes one final state per direction to the classifier input.
    self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, text, text_lengths):
    """Return the output of `fc` for a batch.

    Args:
      text: token ids; the callers in this notebook pass [sent len, batch size].
      text_lengths: sequence lengths handed to pack_padded_sequence.
    """
    # [sent len, batch size] -> [sent len, batch size, embedding dim]
    embedded = self.embedding(text)
    embedded = self.dropout(embedded)

    # Pack so that the LSTM only processes the unpadded part of every sequence.
    packed = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)

    # hidden / cell: [num layers * num directions, batch size, hid dim]
    _packed_out, (hidden, _cell) = self.rnn(packed)

    # NOTE(review): `hidden` is indexed by layer/direction on its first axis, so
    # rnn2 consumes that axis as if it were the time axis. nn.LSTM already stacks
    # layers internally via num_layers - confirm this second LSTM is intended.
    for _pass in range(2):
      _out, (hidden, _cell) = self.rnn2(hidden)

    # The final states are stacked layer by layer; with two directions each
    # layer contributes a (forward, backward) pair, so the last layer occupies
    # the last two rows. Without it, the last row alone is the last layer.
    if not self.bidirectional:
      final_state = hidden[-1]
    else:
      final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)

    # final_state: [batch size, hid dim * num directions]
    return self.fc(self.dropout(final_state))


In [295]:
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab)                  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100                          # matches the 100-d GloVe vectors loaded above
HIDDEN_DIM = 256
OUTPUT_DIM = 1                               # single output value per example
N_LAYERS = 2
BIDIRECTIONAL = False
DROPOUT = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]    # vocabulary index of the padding token

model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)


In [296]:
def count_parameters(model):
  """Return the total number of elements across the model's trainable parameters.

  Parameters whose `requires_grad` flag is False are left out of the count.
  """
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total

# Report the trainable-parameter count of the model, with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,446,057 trainable parameters


In [297]:
# Vectors attached to the vocabulary by build_vocab.
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

# Overwrite the freshly initialised embedding table with the pretrained vectors.
embedding_weights = model.embedding.weight.data
embedding_weights.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
print(UNK_IDX)

# Zero the rows of the unknown and padding tokens so that neither starts out
# with a randomly initialised vector.
for special_idx in (UNK_IDX, PAD_IDX):
  embedding_weights[special_idx] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

torch.Size([25002, 100])
0
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.2146,  0.6712,  0.3821,  ...,  0.4095,  0.7454,  0.0046],
        [-0.2187,  0.2048, -0.7215,  ...,  0.2931,  0.6471,  0.0721],
        [-0.5528, -1.5618,  0.8585,  ..., -0.7364,  0.5901, -1.4599]])


In [298]:
import torch.optim as optim

# Binary cross-entropy computed directly on the model's raw outputs (the
# sigmoid is part of the loss), placed on the same device as the model.
criterion = nn.BCEWithLogitsLoss().to(device)
model = model.to(device)

# Adam with its default hyperparameters over all model parameters.
optimizer = optim.Adam(model.parameters())



In [299]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in a batch that match the labels.

    `preds` are raw model outputs: they are passed through a sigmoid and rounded
    to the nearest integer before being compared with `y`. The result is a
    ratio (8 correct out of 10 gives 0.8, not 8), returned as a tensor.
    """
    predicted_labels = torch.sigmoid(preds).round()
    # Cast the boolean matches to float so they can be summed and divided.
    matches = predicted_labels.eq(y).float()
    return matches.sum() / len(matches)

In [300]:
def train(model, iterator, optimizer, criterion):
  """Run one training pass over `iterator` and update the model.

  Each batch provides `batch.text` as a (token ids, lengths) pair and
  `batch.label` as the targets. Lengths are moved to the CPU before the
  model call.

  Returns:
    (mean loss, mean accuracy), both averaged over the number of batches.
  """
  model.train()
  running_loss, running_acc = 0, 0

  for batch in iterator:
    tokens, lengths = batch.text
    targets = batch.label

    optimizer.zero_grad()
    # The model yields one value per example in a trailing dimension of size 1; drop it.
    logits = model(tokens, lengths.cpu()).squeeze(1)
    loss = criterion(logits, targets)
    acc = binary_accuracy(logits, targets)

    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    running_acc += acc.item()

  n_batches = len(iterator)
  return running_loss / n_batches, running_acc / n_batches



In [301]:
def evaluate(model, iterator, criterion):
  """Measure loss and accuracy over `iterator` without updating the model.

  The model is switched to eval mode and gradients are disabled for the
  whole pass.

  Returns:
    (mean loss, mean accuracy), both averaged over the number of batches.
  """
  model.eval()
  running_loss, running_acc = 0, 0

  with torch.no_grad():
    for batch in iterator:
      tokens, lengths = batch.text
      targets = batch.label

      # Lengths go to the CPU before the model call; drop the trailing size-1 dimension.
      logits = model(tokens, lengths.cpu()).squeeze(1)
      running_loss += criterion(logits, targets).item()
      running_acc += binary_accuracy(logits, targets).item()

  n_batches = len(iterator)
  return running_loss / n_batches, running_acc / n_batches

In [302]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        (minutes, seconds) as integers; fractional seconds are truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [303]:
N_EPOCHS = 10

# Lowest validation loss seen so far; starts at infinity so the first epoch always checkpoints.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
  start_time = time.time()
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
  end_time = time.time()
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # Keep only the weights of the epoch with the best validation loss.
  improved = valid_loss < best_valid_loss
  if improved:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'tut2-model.pt')

  print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


Epoch: 01 | Epoch Time: 0m 18s
	Train Loss: 0.689 | Train Acc: 52.92%
	 Val. Loss: 0.693 |  Val. Acc: 50.85%
Epoch: 02 | Epoch Time: 0m 19s
	Train Loss: 0.693 | Train Acc: 50.81%
	 Val. Loss: 0.693 |  Val. Acc: 49.15%
Epoch: 03 | Epoch Time: 0m 19s
	Train Loss: 0.693 | Train Acc: 50.85%
	 Val. Loss: 0.694 |  Val. Acc: 49.15%
Epoch: 04 | Epoch Time: 0m 19s
	Train Loss: 0.693 | Train Acc: 50.97%
	 Val. Loss: 0.693 |  Val. Acc: 49.15%
Epoch: 05 | Epoch Time: 0m 19s
	Train Loss: 0.693 | Train Acc: 49.68%
	 Val. Loss: 0.694 |  Val. Acc: 49.15%
Epoch: 06 | Epoch Time: 0m 19s
	Train Loss: 0.693 | Train Acc: 50.57%
	 Val. Loss: 0.693 |  Val. Acc: 49.15%
Epoch: 07 | Epoch Time: 0m 19s
	Train Loss: 0.691 | Train Acc: 52.39%
	 Val. Loss: 0.693 |  Val. Acc: 49.15%
Epoch: 08 | Epoch Time: 0m 19s
	Train Loss: 0.686 | Train Acc: 54.49%
	 Val. Loss: 0.671 |  Val. Acc: 58.34%
Epoch: 09 | Epoch Time: 0m 19s
	Train Loss: 0.609 | Train Acc: 68.07%
	 Val. Loss: 0.440 |  Val. Acc: 83.51%
Epoch: 10 | Epoch T

In [304]:
# Restore the checkpoint written by the training loop (the epoch with the
# lowest validation loss). map_location=device remaps the stored tensors onto
# the current device, so a checkpoint saved during a GPU run also loads on a
# CPU-only machine.
model.load_state_dict(torch.load('tut2-model.pt', map_location=device))

# Score the restored model on the held-out test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.349 | Test Acc: 85.62%


In [305]:
import spacy
# Load spaCy's 'en' pipeline; predict_sentiment below uses only its tokenizer.
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    """Score a raw sentence with the trained model.

    The sentence is tokenized with the spaCy tokenizer, mapped to vocabulary
    ids through TEXT.vocab.stoi, shaped into a batch of one and passed to the
    model together with its length. Intermediate values are printed for
    inspection.

    NOTE(review): tokens are fed in their natural order, whereas the training
    cells reverse every example's token list - confirm this is the intended
    comparison.

    Args:
        model: network called as model(token_ids, lengths).
        sentence: raw text to score.

    Returns:
        float: sigmoid of the model output.

    Raises:
        ValueError: if the sentence produces no tokens.
    """
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    print('tokenized;',tokenized)
    if not tokenized:
        # A zero-length sequence cannot be packed; fail early with a clear message.
        raise ValueError('predict_sentiment needs a sentence with at least one token')
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    print('indexed;',indexed)
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    print('tensor;',tensor.shape)
    # [sent len] -> [sent len, 1]: add the batch dimension the model expects.
    tensor = tensor.unsqueeze(1)
    print('tensor unsqueezed;',tensor.shape,length)
    # The lengths tensor stays on the CPU, as in train() and evaluate().
    length_tensor = torch.LongTensor(length)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [306]:
# Example with negative wording; the returned value is sigmoid(model output), a score between 0 and 1.
predict_sentiment(model, "This film is terrible")

tokenized; ['This', 'film', 'is', 'terrible']
indexed; [66, 24, 9, 442]
tensor; torch.Size([4])
tensor unsqueezed; torch.Size([4, 1]) [4]


0.11247031390666962

In [307]:
# Example with positive wording; the returned value is sigmoid(model output), a score between 0 and 1.
predict_sentiment(model, "This film is great")

tokenized; ['This', 'film', 'is', 'great']
indexed; [66, 24, 9, 103]
tensor; torch.Size([4])
tensor unsqueezed; torch.Size([4, 1]) [4]


0.9194293022155762

## Observations
Increasing the number of layers in the RNN adds a further set of gate weight matrices (fully connected layers) for every extra layer, which increases the number of parameters considerably (the growth is per layer rather than exponential), with no improvement in accuracy.

A unidirectional RNN needs higher complexity to achieve good accuracy. Interestingly, training on reversed strings also produces good results.


A bidirectional RNN achieves higher accuracy with lower complexity and in fewer epochs.
