#imports


In [1]:
import torch
from torchtext import data
from torchtext import datasets

#Defining text and label field
We use the spaCy tokenizer. It takes a string and splits it into individual words and punctuation marks. These tokens are then used to build the vocabulary for the model.

In [2]:
# Review text field, tokenized with spaCy. include_lengths = True is what lets the
# training/evaluation loops unpack batch.text as (text, text_lengths).
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
# Label field; labels are stored as torch.float, the dtype handed to the loss function.
LABEL = data.LabelField(dtype = torch.float)

importing imdb movie review dataset from torchtext library.

In [3]:

# Load the IMDB reviews as a (train, test) pair, processed with the TEXT and LABEL fields.
# The archive is downloaded when this cell runs (see the progress output below).
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:08<00:00, 10.4MB/s]


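Splitting the training data into training and validation sets. Since our dataset is large (the IMDB corpus already comes divided 50/50 into training and test data), we can afford to hold out part of the training set for validation. No `split_ratio` is passed, so `split` uses its default ratio of 70% training / 30% validation.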

In [4]:
import random

# `random.seed()` returns None, so passing its return value as `random_state`
# left the split unseeded. Seed the generator first, then hand its state
# (the value `split` expects) to `random_state` so the split is reproducible.
SEED = 123
random.seed(SEED)
torch.manual_seed(SEED)  # also makes the later random weight/embedding initialisation repeatable

train_data, valid_data = train_data.split(random_state=random.getstate())

example of our training data

In [5]:
# Show the attributes of the last training example as a dict (its 'label' and token list 'text').
vars(train_data[-1])

{'label': 'neg',
 'text': ['Man',
  ',',
  'I',
  'really',
  'find',
  'it',
  'hard',
  'to',
  'believe',
  'that',
  'the',
  'wonderful',
  'Alan',
  'Ball',
  'had',
  'anything',
  'to',
  'do',
  'with',
  'this',
  'mess',
  '.',
  'Having',
  'seen',
  'the',
  'first',
  'two',
  'episodes',
  'thus',
  'far',
  ',',
  'I',
  'think',
  'I',
  'can',
  'safely',
  'say',
  'this',
  'show',
  'is',
  "n't",
  'going',
  'to',
  'be',
  'on',
  'my',
  'must',
  'see',
  'list',
  '.',
  'It',
  "'s",
  'just',
  'got',
  'so',
  'many',
  'things',
  'working',
  'against',
  'it.<br',
  '/><br',
  '/>None',
  'of',
  'the',
  'actors',
  'cast',
  'are',
  'particularly',
  'good',
  '.',
  'Anna',
  'Paquin',
  'as',
  'the',
  'lead',
  'character',
  'Sookie',
  ',',
  'is',
  'just',
  'awful',
  '.',
  'I',
  'remember',
  'her',
  'being',
  'better',
  'in',
  'a',
  'lot',
  'of',
  'other',
  'things',
  'I',
  "'ve",
  'seen',
  'her',
  'in',
  'so',
  'maybe',
 

Creating the vocabulary for our model. We set the maximum vocabulary size to 25,000, so the vocabulary keeps at most 25,000 distinct tokens from the training data (the most frequent ones); any other token is mapped to the unknown token `<unk>`. We use the pretrained embedding vectors 'glove.6B.100d' prepared by Stanford.

In [6]:
# Upper bound on the number of distinct tokens kept in the text vocabulary
# (the embedding shape printed later shows two extra rows for the special tokens).
MAX_VOCAB_SIZE = 25000

# Build the text vocabulary from the training split only and attach the
# pretrained GloVe vectors to it (downloaded into .vector_cache, see output below).
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 # NOTE(review): presumably the initialiser for vocabulary tokens that have no GloVe vector — confirm against the torchtext docs
                 unk_init = torch.Tensor.normal_)

# Build the label vocabulary from the training split (string labels such as 'neg' -> indices).
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
100%|█████████▉| 398552/400000 [00:23<00:00, 17992.02it/s]

Creating the iterators for training, validation and testing. We use BucketIterator for the job and assign the GPU as our primary processor.

In [7]:

BATCH_SIZE = 64

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of failing here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# sort_within_batch=True orders each batch by length, which the model relies on
# when it packs the padded sequences (pack_padded_sequence expects sorted lengths).
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=device)

Defining the model. We use a bidirectional LSTM (long short-term memory) model, i.e. the model reads each sentence both from the first token to the last and from the last token back to the first. This is why the fully connected layer has dimensions (hidden_dim * 2, output_dim): hidden_dim * 2 because the final hidden states of the two directions are concatenated. We also pass the padding index to the embedding layer so that the model knows which tokens not to train on.

In [8]:

import torch.nn as nn

class RNN(nn.Module):
    """LSTM sentence classifier: embedding -> (bi)LSTM -> dropout -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state (per direction).
        output_dim: number of output units of the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: if True, run the LSTM in both directions and classify
            from the concatenated final forward/backward hidden states.
        dropout: dropout probability used on the embeddings, between LSTM
            layers, and on the final hidden state.
        pad_idx: vocabulary index of the padding token (its embedding is not trained).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # Inter-layer dropout only exists when there is more than one layer;
        # passing it for a single layer just triggers a PyTorch warning.
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)

        # The classifier input is one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return raw logits of shape [batch size, output_dim].

        Args:
            text: LongTensor of token indices, [sentence length, batch size]
                (the LSTM is used with its default batch_first=False layout).
            text_lengths: unpadded length of every sequence in the batch,
                sorted in decreasing order.
        """
        embedded = self.dropout(self.embedding(text))

        # pack_padded_sequence needs the lengths on the CPU, even when the
        # batch itself lives on the GPU.
        if torch.is_tensor(text_lengths):
            text_lengths = text_lengths.cpu()

        # pack sequence so the LSTM skips the padded positions
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)

        # Only the final hidden state is needed; the per-step outputs and the
        # cell state are discarded.
        _, (hidden, _) = self.rnn(packed_embedded)

        # hidden = [num layers * num directions, batch size, hid dim]
        if self.rnn.bidirectional:
            # last layer: forward state is hidden[-2], backward state is hidden[-1]
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]

        # hidden = [batch size, hid dim * num directions]
        return self.fc(self.dropout(hidden))


Hyperparameters of our model.

In [9]:
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab)                  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100                          # matches the 100-d GloVe vectors
HIDDEN_DIM = 256
OUTPUT_DIM = 1                               # a single logit per review
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]    # index of the padding token

# Build the network; keyword arguments make the mapping to RNN.__init__ explicit.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

Our pretrained embeddings, taken from the 'glove.6B.100d' vectors. Since we set the maximum vocabulary size to 25,000, we have 25,002 tokens: the two extra ones are the unknown and padding tokens.

In [10]:
# Embedding matrix attached to the vocabulary by build_vocab (one row per vocabulary entry).
pretrained_embeddings = TEXT.vocab.vectors

# Sanity check: the row count should equal len(TEXT.vocab) and the width EMBEDDING_DIM.
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


copying our pretrained embedding to our model's embedding

In [11]:
# Overwrite the randomly initialised embedding weights in place with the pretrained vectors.
# copy_ returns the updated tensor, which is why the cell echoes it below.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.2208,  0.5414,  0.6170,  ...,  0.3331,  1.0870, -1.4192],
        [ 1.2872, -0.0612,  0.1770,  ..., -0.8446,  1.5656,  0.1120],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.5454,  0.3363, -1.1108,  ..., -0.4860,  0.9797, -0.3946],
        [-0.2630,  0.1020,  1.2268,  ...,  0.3066, -0.8744,  0.9514],
        [ 1.2182, -1.1912,  0.0040,  ..., -0.0603, -1.9024, -0.9452]])

changing the values of unknown tokens and padding tokens to zeroes.

In [12]:

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Reset the embedding rows of both special tokens (unknown and padding) to zeros.
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.5454,  0.3363, -1.1108,  ..., -0.4860,  0.9797, -0.3946],
        [-0.2630,  0.1020,  1.2268,  ...,  0.3066, -0.8744,  0.9514],
        [ 1.2182, -1.1912,  0.0040,  ..., -0.0603, -1.9024, -0.9452]])


In [13]:
import torch.optim as optim

# Adam optimizer over all model parameters, using the library's default settings.
optimizer = optim.Adam(model.parameters())

defining loss function and loading the model and loss function to cuda

In [14]:

# Binary cross-entropy on raw logits (the model's forward applies no sigmoid),
# created directly on the target device.
criterion = nn.BCEWithLogitsLoss().to(device)

# Move the model's parameters to the same device.
model = model.to(device)

Function to determine accuracy of our model

In [15]:

def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the labels.

    Args:
        preds: raw model outputs (logits); squashed with a sigmoid and
            rounded to the nearest integer to obtain 0/1 predictions.
        y: target labels to compare against.

    Returns:
        A 0-dim tensor: number of matches divided by the length of the batch.
    """
    predicted_labels = torch.sigmoid(preds).round()
    matches = predicted_labels.eq(y).float()  # float so the division below is not integer division
    return matches.sum() / len(matches)


Function for training. Because we iterate over the data in batches, the model's weights are updated after every batch of 64 training samples.

In [16]:
def train(model, iterator, optimizer, criterion):
    """Train `model` for one pass over `iterator`.

    For every batch: reset gradients, run the forward pass, compute loss and
    accuracy, back-propagate, and take one optimizer step.

    Returns:
        (mean loss per batch, mean accuracy per batch) as Python floats.
    """
    model.train()  # enable dropout

    loss_total, acc_total = 0, 0

    for batch in iterator:
        # batch.text is a (tokens, lengths) pair
        tokens, lengths = batch.text
        targets = batch.label

        optimizer.zero_grad()

        # drop the trailing output dimension so logits line up with the labels
        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

evaluation function for our validation dataset

In [17]:
def evaluate(model, iterator, criterion):
    """Score `model` on every batch of `iterator` without updating it.

    Runs in eval mode and with gradient tracking disabled.

    Returns:
        (mean loss per batch, mean accuracy per batch) as Python floats.
    """
    model.eval()  # disable dropout

    loss_total, acc_total = 0, 0

    with torch.no_grad():
        for batch in iterator:
            # batch.text is a (tokens, lengths) pair
            tokens, lengths = batch.text
            targets = batch.label

            # drop the trailing output dimension so logits line up with the labels
            logits = model(tokens, lengths).squeeze(1)

            loss_total += criterion(logits, targets).item()
            acc_total += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

Training for 5 epochs and saving the model each time it reaches a new lowest validation loss.

In [19]:

N_EPOCHS = 5

# Lowest validation loss seen so far; starts at infinity so the first epoch always checkpoints.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Checkpoint the weights whenever the validation loss improves.
    if best_valid_loss > valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01
	Train Loss: 0.530 | Train Acc: 73.75%
	 Val. Loss: 0.419 |  Val. Acc: 81.05%
Epoch: 02
	Train Loss: 0.459 | Train Acc: 78.33%
	 Val. Loss: 0.474 |  Val. Acc: 78.93%
Epoch: 03
	Train Loss: 0.360 | Train Acc: 84.74%
	 Val. Loss: 0.315 |  Val. Acc: 87.40%
Epoch: 04
	Train Loss: 0.340 | Train Acc: 85.87%
	 Val. Loss: 0.309 |  Val. Acc: 87.47%
Epoch: 05
	Train Loss: 0.263 | Train Acc: 89.50%
	 Val. Loss: 0.285 |  Val. Acc: 88.69%


loading the best performing model and using the model on the test dataset.

In [20]:

# Restore the checkpoint written by the training loop. map_location places the
# saved tensors on the current device, so a checkpoint saved on a GPU can still
# be loaded when the notebook is re-run on a different device.
model.load_state_dict(torch.load('tut2-model.pt', map_location=device))

# Final, one-off evaluation on the held-out test set.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.288 | Test Acc: 88.34%


function to predict the sentiment of our custom statement

In [21]:
import spacy
# spaCy pipeline whose tokenizer is used by predict_sentiment below.
# NOTE(review): the 'en' shortcut is only accepted by older spaCy releases; newer ones
# expect a full model name such as 'en_core_web_sm' — confirm against the installed version.
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    """Return the model's sigmoid score (a float between 0 and 1) for one sentence.

    The sentence is tokenized with the spaCy tokenizer, mapped to vocabulary
    indices, and fed to the model as a single-item batch.

    Args:
        model: trained classifier taking (text, text_lengths).
        sentence: raw input string.

    Returns:
        The sigmoid of the model's single output logit, as a Python float.
    """
    model.eval()

    # Run on whatever device the model lives on instead of relying on a global.
    model_device = next(model.parameters()).device

    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]

    # [sentence length] -> [sentence length, 1]: a batch holding one sequence
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(model_device)
    # lengths stay on the CPU, as required for packing padded sequences
    length_tensor = torch.LongTensor([len(indexed)])

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

trying custom prediction

In [28]:
# Clearly positive review; the recorded output below is close to 1.
predict_sentiment(model, "i loved this movie")

0.9853605628013611

In [30]:
# Clearly negative review; the recorded output below is close to 0.
predict_sentiment(model, "I want my money back that i paid for this movie")

0.042114295065402985