In [1]:
import os
import time
import numpy as np
from tqdm import tqdm
from string import punctuation
from collections import Counter
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(123)

import random
from torchtext import (data, datasets)

## Preprocess the data

In [2]:
TEXT_FIELD = data.Field(tokenize = data.get_tokenizer("basic_english"), include_lengths = True)
LABEL_FIELD = data.LabelField(dtype = torch.float)

train_dataset, test_dataset = datasets.IMDB.splits(TEXT_FIELD, LABEL_FIELD)
train_dataset, valid_dataset = train_dataset.split(random_state = random.seed(123))

In [3]:
MAX_VOCABULARY_SIZE = 25000

TEXT_FIELD.build_vocab(train_dataset, 
                 max_size = MAX_VOCABULARY_SIZE)

LABEL_FIELD.build_vocab(train_dataset)

In [4]:
B_SIZE = 64 # batch size

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_data_iterator, valid_data_iterator, test_data_iterator = data.BucketIterator.splits(
    (train_dataset, valid_dataset, test_dataset), 
    batch_size = B_SIZE,
    sort_within_batch = True,
    device = device)

In [5]:
## If you are training using GPUs, we need to use the following function for the pack_padded_sequence method to work 
## (reference : https://discuss.pytorch.org/t/error-with-lengths-in-pack-padded-sequence/35517/3)
if torch.cuda.is_available():
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
from torch.nn.utils.rnn import pack_padded_sequence, PackedSequence

def cuda_pack_padded_sequence(input, lengths, batch_first=False, enforce_sorted=False):
    # length becoms a tensor. ref: https://pytorch.org/docs/stable/generated/torch.as_tensor.html
    lengths = torch.as_tensor(lengths, dtype=torch.int64)
    lengths = lengths.cpu()
    
    if enforce_sorted:
        sorted_indices = None
    else:
        # sorted_indices means original position at after sorted tensor.
        # ref: https://pytorch.org/docs/stable/generated/torch.sort.html
        lengths, sorted_indices = torch.sort(lengths, descending=True)
        sorted_indices = sorted_indices.to(input.device)
        
    batch_dim = 0 if batch_first else 1
    input = input.index_select(batch_dim, sorted_indices)

    data, batch_sizes = \
    torch._C._VariableFunctions._pack_padded_sequence(input, lengths, batch_first)
    return PackedSequence(data, batch_sizes, sorted_indices)

## Define GRU class

In [6]:
class GRU(nn.Module):
    def __init__(self, vocabulary_size, embedding_dimension, hidden_dimension, output_dimension, dropout, pad_index):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocabulary_size, embedding_dimension, padding_idx = pad_index)
        # num_layers should be > 1. otherwise, it cause below warning
        # UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.5 and num_layers=1
        self.gru_layer = nn.GRU(embedding_dimension, hidden_dimension, num_layers=2,
dropout=0.8, bidirectional=True)
        self.fc_layer = nn.Linear(hidden_dimension * 2, output_dimension)
        self.dropout_layer = nn.Dropout(dropout)
        
    def forward(self, sequence, sequence_lengths=None):
        if sequence_lengths is None:
            sequence_lengths = torch.LongTensor([len(sequence)])
        
        # sequence := (sequence_length, batch_size)
        embedded_output = self.dropout_layer(self.embedding_layer(sequence))
        
        
        # embedded_output := (sequence_length, batch_size, embedding_dimension)
        if torch.cuda.is_available():
            packed_embedded_output = cuda_pack_padded_sequence(embedded_output, sequence_lengths)
        else:
            packed_embedded_output = nn.utils.rnn.pack_padded_sequence(embedded_output, sequence_lengths)
        
        packed_output, hidden_state = self.gru_layer(packed_embedded_output)
        # hidden_state := (num_layers * num_directions, batch_size, hidden_dimension)
        # num_directions = 2 if bidirectional.
        # cell_state := (num_layers * num_directions, batch_size, hidden_dimension)
        
        op, op_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        # op := (sequence_length, batch_size, hidden_dimension * num_directions)
        
        hidden_output = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)        
        # hidden_output := (batch_size, hidden_dimension * num_directions)
        
        return self.fc_layer(hidden_output)

In [7]:
INPUT_DIMENSION = len(TEXT_FIELD.vocab)
EMBEDDING_DIMENSION = 100
HIDDEN_DIMENSION = 32
OUTPUT_DIMENSION = 1
DROPOUT = 0.5
PAD_INDEX = TEXT_FIELD.vocab.stoi[TEXT_FIELD.pad_token]

gru_model = GRU(INPUT_DIMENSION, 
            EMBEDDING_DIMENSION, 
            HIDDEN_DIMENSION, 
            OUTPUT_DIMENSION, 
            DROPOUT, 
            PAD_INDEX)

In [8]:
UNK_INDEX = TEXT_FIELD.vocab.stoi[TEXT_FIELD.unk_token] # unk means unknown

gru_model.embedding_layer.weight.data[UNK_INDEX] = torch.zeros(EMBEDDING_DIMENSION)
gru_model.embedding_layer.weight.data[PAD_INDEX] = torch.zeros(EMBEDDING_DIMENSION)

## Define training functions

In [9]:
optim = torch.optim.Adam(gru_model.parameters())
loss_func = nn.BCEWithLogitsLoss() # binary cross entropy.

gru_model = gru_model.to(device)
loss_func = loss_func.to(device)

In [10]:
def accuracy_metric(predictions, ground_truth):
    """
    Returns 0-1 accuracy for the given set of predictions and ground truth
    """
    # round predictions to either 0 or 1
    rounded_predictions = torch.round(torch.sigmoid(predictions))
    success = (rounded_predictions == ground_truth).float() #convert into float for division 
    accuracy = success.sum() / len(success)
    return accuracy

In [11]:
def train(model, data_iterator, optim, loss_func):
    loss = 0
    accuracy = 0
    model.train()
    
    for curr_batch in data_iterator:
        optim.zero_grad()
        sequence, sequence_lengths = curr_batch.text
        preds = model(sequence, sequence_lengths).squeeze(1)
        
        loss_curr = loss_func(preds, curr_batch.label)
        accuracy_curr = accuracy_metric(preds, curr_batch.label)
        
        loss_curr.backward()
        optim.step()
        
        loss += loss_curr.item()
        accuracy += accuracy_curr.item()
        
    return loss/len(data_iterator), accuracy/len(data_iterator)

In [12]:
def validate(model, data_iterator, loss_func):
    loss = 0
    accuracy = 0
    model.eval()
    
    with torch.no_grad():
        for curr_batch in data_iterator:
            sequence, sequence_lengths = curr_batch.text
            preds = model(sequence, sequence_lengths).squeeze(1)
            
            loss_curr = loss_func(preds, curr_batch.label)
            accuracy_curr = accuracy_metric(preds, curr_batch.label)

            loss += loss_curr.item()
            accuracy += accuracy_curr.item()
        
    return loss/len(data_iterator), accuracy/len(data_iterator)

## Start training

In [13]:
num_epochs = 10
best_validation_loss = float('inf')

for ep in range(num_epochs):

    time_start = time.time()
    training_loss, train_accuracy = train(gru_model, train_data_iterator, optim, loss_func)
    time_train = time.time()
    validation_loss, validation_accuracy = validate(gru_model, valid_data_iterator, loss_func)
    time_val = time.time()
    
    time_end = time.time()
    time_delta = time_end - time_start 
    
    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        torch.save(gru_model.state_dict(), 'gru_model.pt')
    
    print(f'epoch number: {ep+1} | time elapsed: {time_delta}s')
    print(f"train time = {time_train - time_start}, validation time = {time_val - time_train}")    
    print(f'training loss: {training_loss:.3f} | training accuracy: {train_accuracy*100:.2f}%')
    print(f'validation loss: {validation_loss:.3f} |  validation accuracy: {validation_accuracy*100:.2f}%')
    print()

epoch number: 1 | time elapsed: 10.142573118209839s
train time = 9.305720090866089, validation time = 0.8368189334869385
training loss: 0.693 | training accuracy: 52.23%
validation loss: 0.677 |  validation accuracy: 57.58%

epoch number: 2 | time elapsed: 19.76072382926941s
train time = 17.982912063598633, validation time = 1.7778115272521973
training loss: 0.681 | training accuracy: 55.93%
validation loss: 0.641 |  validation accuracy: 63.58%

epoch number: 3 | time elapsed: 20.421406507492065s
train time = 17.444645166397095, validation time = 2.9767611026763916
training loss: 0.631 | training accuracy: 64.18%
validation loss: 0.644 |  validation accuracy: 69.56%

epoch number: 4 | time elapsed: 23.67748498916626s
train time = 20.439718008041382, validation time = 3.237766742706299
training loss: 0.588 | training accuracy: 69.00%
validation loss: 0.521 |  validation accuracy: 76.07%

epoch number: 5 | time elapsed: 20.615106344223022s
train time = 18.653815031051636, validation time

## Load training result & Run inference

In [14]:
gru_model.load_state_dict(torch.load('../../Mastering-PyTorch/Chapter04/gru_model.pt'))

test_loss, test_accuracy = validate(gru_model, test_data_iterator, loss_func)

print(f'test loss: {test_loss:.3f} | test accuracy: {test_accuracy*100:.2f}%')

test loss: 0.397 | test accuracy: 86.21%


In [15]:
def sentiment_inference(model, sentence):
    model.eval()
    
    # text transformations
    tokenized = data.get_tokenizer("basic_english")(sentence)
    tokenized = [TEXT_FIELD.vocab.stoi[t] for t in tokenized]
    
    # model inference
    model_input = torch.LongTensor(tokenized).to(device)
    model_input = model_input.unsqueeze(1)
    
    pred = torch.sigmoid(model(model_input))
    
    return pred.item()

In [16]:
print(sentiment_inference(gru_model, "This film is horrible"))
print(sentiment_inference(gru_model, "Director tried too hard but this film is bad"))
print(sentiment_inference(gru_model, "Decent movie, although could be shorter"))
print(sentiment_inference(gru_model, "This film will be houseful for weeks"))
print(sentiment_inference(gru_model, "I loved the movie, every part of it"))

0.05916042998433113
0.015965210273861885
0.12806203961372375
0.4671665132045746
0.9657514691352844


In [17]:
print(sentiment_inference(gru_model, "hogehoge"))
print(sentiment_inference(gru_model, "fugafuga"))
print(sentiment_inference(gru_model, "I like it, but my friend hate it."))
print(sentiment_inference(gru_model, "I hate it, but my friend likes it."))
print(sentiment_inference(gru_model, "I love you."))
print(sentiment_inference(gru_model, "I love this movie, by the way I hate the book."))
print(sentiment_inference(gru_model, "I hate this movie, by the way I love the book."))
print(sentiment_inference(gru_model, "With best New Year’s wishes!"))

0.4714809060096741
0.4714809060096741
0.880179226398468
0.8819989562034607
0.8881919384002686
0.7195391058921814
0.606378972530365
0.9625920057296753
