# LSTM 
### good explanation:  http://colah.github.io/posts/2015-08-Understanding-LSTMs/

In [1]:
import os
import time
import numpy as np
from tqdm import tqdm
from string import punctuation
from collections import Counter
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(123)

import random
from torchtext import (data, datasets)

## Preprocess the data

In [2]:
TEXT_FIELD = data.Field(tokenize = data.get_tokenizer("basic_english"), include_lengths = True)
# get_tokenizer(): split sentence by spaces. [ref](https://pytorch.org/text/stable/data_utils.html)
LABEL_FIELD = data.LabelField(dtype = torch.float)

train_dataset, test_dataset = datasets.IMDB.splits(TEXT_FIELD, LABEL_FIELD)
# IMDB: [Stanford's large movie review dataset](https://ai.stanford.edu/~amaas/data/sentiment/)
train_dataset, valid_dataset = train_dataset.split(random_state = random.seed(123))

In [3]:
MAX_VOCABULARY_SIZE = 25000

TEXT_FIELD.build_vocab(train_dataset, 
                 max_size = MAX_VOCABULARY_SIZE)

LABEL_FIELD.build_vocab(train_dataset)

In [4]:
B_SIZE = 64 # batch size

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_data_iterator, valid_data_iterator, test_data_iterator = data.BucketIterator.splits(
    (train_dataset, valid_dataset, test_dataset), 
    batch_size = B_SIZE,
    sort_within_batch = True,
    device = device)

In [5]:
## If you are training using GPUs, we need to use the following function for the pack_padded_sequence method to work 
## (reference : https://discuss.pytorch.org/t/error-with-lengths-in-pack-padded-sequence/35517/3)
if torch.cuda.is_available():
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
from torch.nn.utils.rnn import pack_padded_sequence, PackedSequence

def cuda_pack_padded_sequence(input, lengths, batch_first=False, enforce_sorted=False):
    # length becoms a tensor. ref: https://pytorch.org/docs/stable/generated/torch.as_tensor.html
    lengths = torch.as_tensor(lengths, dtype=torch.int64)
    lengths = lengths.cpu()
    
    if enforce_sorted:
        sorted_indices = None
    else:
        # sorted_indices means original position at after sorted tensor.
        # ref: https://pytorch.org/docs/stable/generated/torch.sort.html
        lengths, sorted_indices = torch.sort(lengths, descending=True)
        sorted_indices = sorted_indices.to(input.device)
        
    batch_dim = 0 if batch_first else 1
    input = input.index_select(batch_dim, sorted_indices)

    data, batch_sizes = \
    torch._C._VariableFunctions._pack_padded_sequence(input, lengths, batch_first)
    return PackedSequence(data, batch_sizes, sorted_indices)

## Define LSTM class

In [6]:
class LSTM(nn.Module):
    def __init__(self, vocabulary_size, embedding_dimension, hidden_dimension, output_dimension, dropout, pad_index):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocabulary_size, embedding_dimension, padding_idx = pad_index)
        # num_layers should be > 1. otherwise, it cause below warning
        # UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.5 and num_layers=1
        self.lstm_layer = nn.LSTM(embedding_dimension, 
                           hidden_dimension, 
                           num_layers=2, 
                           bidirectional=True, 
                           dropout=dropout)
        self.fc_layer = nn.Linear(hidden_dimension * 2, output_dimension)
        self.dropout_layer = nn.Dropout(dropout)
        
    def forward(self, sequence, sequence_lengths=None):
        if sequence_lengths is None:
            sequence_lengths = torch.LongTensor([len(sequence)])
        
        # sequence := (sequence_length, batch_size)
        embedded_output = self.dropout_layer(self.embedding_layer(sequence))
        
        
        # embedded_output := (sequence_length, batch_size, embedding_dimension)
        if torch.cuda.is_available():
            packed_embedded_output = cuda_pack_padded_sequence(embedded_output, sequence_lengths)
        else:
            packed_embedded_output = nn.utils.rnn.pack_padded_sequence(embedded_output, sequence_lengths)
        
        packed_output, (hidden_state, cell_state) = self.lstm_layer(packed_embedded_output)
        # hidden_state := (num_layers * num_directions, batch_size, hidden_dimension)
        # num_directions = 2 if bidirectional LSTM.
        # cell_state := (num_layers * num_directions, batch_size, hidden_dimension)
        
        op, op_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        # op := (sequence_length, batch_size, hidden_dimension * num_directions)
        
        hidden_output = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)        
        # hidden_output := (batch_size, hidden_dimension * num_directions)
        
        return self.fc_layer(hidden_output)


In [7]:
INPUT_DIMENSION = len(TEXT_FIELD.vocab)
EMBEDDING_DIMENSION = 100
HIDDEN_DIMENSION = 32
OUTPUT_DIMENSION = 1
DROPOUT = 0.5
PAD_INDEX = TEXT_FIELD.vocab.stoi[TEXT_FIELD.pad_token]

lstm_model = LSTM(INPUT_DIMENSION, 
            EMBEDDING_DIMENSION, 
            HIDDEN_DIMENSION, 
            OUTPUT_DIMENSION, 
            DROPOUT, 
            PAD_INDEX)

In [8]:
UNK_INDEX = TEXT_FIELD.vocab.stoi[TEXT_FIELD.unk_token] # unk means unknown

lstm_model.embedding_layer.weight.data[UNK_INDEX] = torch.zeros(EMBEDDING_DIMENSION)
lstm_model.embedding_layer.weight.data[PAD_INDEX] = torch.zeros(EMBEDDING_DIMENSION)

## Define training functions

In [9]:
optim = torch.optim.Adam(lstm_model.parameters())
loss_func = nn.BCEWithLogitsLoss() # binary cross entropy.

lstm_model = lstm_model.to(device)
loss_func = loss_func.to(device)

In [10]:
def accuracy_metric(predictions, ground_truth):
    """
    Returns 0-1 accuracy for the given set of predictions and ground truth
    """
    # round predictions to either 0 or 1
    rounded_predictions = torch.round(torch.sigmoid(predictions))
    success = (rounded_predictions == ground_truth).float() #convert into float for division 
    accuracy = success.sum() / len(success)
    return accuracy

In [11]:
def train(model, data_iterator, optim, loss_func):
    loss = 0
    accuracy = 0
    model.train()
    
    for curr_batch in data_iterator:
        optim.zero_grad()
        sequence, sequence_lengths = curr_batch.text
        preds = lstm_model(sequence, sequence_lengths).squeeze(1)
        
        loss_curr = loss_func(preds, curr_batch.label)
        accuracy_curr = accuracy_metric(preds, curr_batch.label)
        
        loss_curr.backward()
        optim.step()
        
        loss += loss_curr.item()
        accuracy += accuracy_curr.item()
        
    return loss/len(data_iterator), accuracy/len(data_iterator)

In [12]:
def validate(model, data_iterator, loss_func):
    loss = 0
    accuracy = 0
    model.eval()
    
    with torch.no_grad():
        for curr_batch in data_iterator:
            sequence, sequence_lengths = curr_batch.text
            preds = model(sequence, sequence_lengths).squeeze(1)
            
            loss_curr = loss_func(preds, curr_batch.label)
            accuracy_curr = accuracy_metric(preds, curr_batch.label)

            loss += loss_curr.item()
            accuracy += accuracy_curr.item()
        
    return loss/len(data_iterator), accuracy/len(data_iterator)

## Start training

In [13]:
num_epochs = 10
best_validation_loss = float('inf')

for ep in range(num_epochs):

    time_start = time.time()
    
    training_loss, train_accuracy = train(lstm_model, train_data_iterator, optim, loss_func)
    validation_loss, validation_accuracy = validate(lstm_model, valid_data_iterator, loss_func)
    
    time_end = time.time()
    time_delta = time_end - time_start 
    
    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        torch.save(lstm_model.state_dict(), 'lstm_model.pt')
    
    print(f'epoch number: {ep+1} | time elapsed: {time_delta}s')
    print(f'training loss: {training_loss:.3f} | training accuracy: {train_accuracy*100:.2f}%')
    print(f'validation loss: {validation_loss:.3f} |  validation accuracy: {validation_accuracy*100:.2f}%')
    print()

epoch number: 1 | time elapsed: 14.727524042129517s
training loss: 0.683 | training accuracy: 55.39%
validation loss: 0.629 |  validation accuracy: 64.82%

epoch number: 2 | time elapsed: 18.555153608322144s
training loss: 0.637 | training accuracy: 63.47%
validation loss: 0.618 |  validation accuracy: 64.90%

epoch number: 3 | time elapsed: 18.414896249771118s
training loss: 0.597 | training accuracy: 68.32%
validation loss: 0.543 |  validation accuracy: 72.93%

epoch number: 4 | time elapsed: 19.25935125350952s
training loss: 0.570 | training accuracy: 70.77%
validation loss: 0.503 |  validation accuracy: 76.57%

epoch number: 5 | time elapsed: 19.893606185913086s
training loss: 0.513 | training accuracy: 74.96%
validation loss: 0.500 |  validation accuracy: 77.48%

epoch number: 6 | time elapsed: 18.90823459625244s
training loss: 0.472 | training accuracy: 78.17%
validation loss: 0.427 |  validation accuracy: 80.81%

epoch number: 7 | time elapsed: 19.202368021011353s
training loss:

## Load training result & Run inference

In [14]:
lstm_model.load_state_dict(torch.load('../../Mastering-PyTorch/Chapter04/lstm_model.pt'))

test_loss, test_accuracy = validate(lstm_model, test_data_iterator, loss_func)

print(f'test loss: {test_loss:.3f} | test accuracy: {test_accuracy*100:.2f}%')

test loss: 0.345 | test accuracy: 85.73%


In [15]:
def sentiment_inference(model, sentence):
    model.eval()
    
    # text transformations
    tokenized = data.get_tokenizer("basic_english")(sentence)
    tokenized = [TEXT_FIELD.vocab.stoi[t] for t in tokenized]
    
    # model inference
    model_input = torch.LongTensor(tokenized).to(device)
    model_input = model_input.unsqueeze(1)
    
    pred = torch.sigmoid(model(model_input))
    
    return pred.item()

In [16]:
print(sentiment_inference(lstm_model, "This film is horrible"))
print(sentiment_inference(lstm_model, "Director tried too hard but this film is bad"))
print(sentiment_inference(lstm_model, "Decent movie, although could be shorter"))
print(sentiment_inference(lstm_model, "This film will be houseful for weeks"))
print(sentiment_inference(lstm_model, "I loved the movie, every part of it"))

0.1848733276128769
0.06664347648620605
0.48671120405197144
0.7506839632987976
0.909300684928894


In [17]:
print(sentiment_inference(lstm_model, "hogehoge"))
print(sentiment_inference(lstm_model, "fugafuga"))
print(sentiment_inference(lstm_model, "I like it, but my friend hate it."))
print(sentiment_inference(lstm_model, "I hate it, but my friend likes it."))
print(sentiment_inference(lstm_model, "I love you."))
print(sentiment_inference(lstm_model, "I love this movie, by the way I hate the book."))
print(sentiment_inference(lstm_model, "I hate this movie, by the way I love the book."))
print(sentiment_inference(lstm_model, "With best New Year’s wishes!"))

0.45216652750968933
0.45216652750968933
0.7792609930038452
0.7865818738937378
0.9087597131729126
0.7414801120758057
0.7132746577262878
0.8986361026763916


# Try to visualize using captum
### but it doen't work yet by the original implementation.

In [18]:
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization
# [captum](https://captum.ai/): library for model interpretability
# [Layer Integrated Gradients](https://github.com/pytorch/captum/blob/master/captum/attr/_core/layer/layer_integrated_gradients.py)

In [19]:
# [spacy](https://spacy.io/): Library for Natural Language Processing
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 636 kB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [20]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [21]:
lig = LayerIntegratedGradients(lstm_model, lstm_model.embedding_layer)

In [22]:
def forward_with_sigmoid(input, l):
    return torch.sigmoid(lstm_model(input, l))

In [23]:
token_reference = TokenReferenceBase(reference_token_idx=PAD_INDEX)

In [24]:
# accumalate couple samples in this array for visualization purposes
vis_data_records_ig = []

def interpret_sentence(model, sentence, min_len = 7, label = 0):
    # text transformations
    tokenized = data.get_tokenizer("basic_english")(sentence)
    tokenized = [TEXT_FIELD.vocab.stoi[t] for t in tokenized]
    
    # model inference
    model_input = torch.LongTensor(tokenized).to(device)
    model_input = model_input.unsqueeze(1)
    length_input = torch.LongTensor([len(tokenized)])
    pred = torch.sigmoid(model(model_input, length_input))

    # generate reference indices for each sample
    reference_indices = token_reference.generate_reference(len(tokenized), device=device).unsqueeze(0)
    
    
    print(f"model_input.shape={model_input.shape}")
    print(f"reference_indices.shape={reference_indices.shape}")
    print(f"reference_indices={reference_indices}")
    # compute attributions and approximation delta using layer integrated gradients
    # below original line is buggy.
    attributions_ig, delta = lig.attribute(model_input,
                                           reference_indices.reshape(model_input.shape), 
                                           n_steps=500, return_convergence_delta=True)

    print('pred: ', Label.vocab.itos[pred_ind], '(', '%.2f'%pred, ')', ', delta: ', abs(delta))

    add_attributions_to_visualizer(attributions_ig, text, pred, pred_ind, label, delta, vis_data_records_ig)
    
def add_attributions_to_visualizer(attributions, text, pred, pred_ind, label, delta, vis_data_records):
    attributions = attributions.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()

    # storing couple samples in an array for visualization purposes
    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions,
                            pred,
                            Label.vocab.itos[pred_ind],
                            Label.vocab.itos[label],
                            Label.vocab.itos[1],
                            attributions.sum(),       
                            text,
                            delta))

In [25]:
interpret_sentence(lstm_model, 'It was a fantastic performance !', label=1)
interpret_sentence(lstm_model, 'Best film ever', label=1)
interpret_sentence(lstm_model, 'Such a great show!', label=1)
interpret_sentence(lstm_model, 'It was a horrible movie', label=0)
interpret_sentence(lstm_model, 'I\'ve never watched something as bad', label=0)
interpret_sentence(lstm_model, 'It is a disgusting movie!', label=0)

model_input.shape=torch.Size([6, 1])
reference_indices.shape=torch.Size([1, 6])
reference_indices=tensor([[1, 1, 1, 1, 1, 1]], device='cuda:0')


RuntimeError: cudnn RNN backward can only be called in training mode