# RNN LSTM VS GRU
#### Chen Zhong (John)
cz379@cornell.edu

### 1. Data Download and Set up

In [None]:
import torch
from torchtext import data
from torchtext import datasets
import random

# Seed value reused below for the torch generators and for the train/validation split,
# so that repeated runs of the notebook are reproducible.
SEED = 1234

# Seed PyTorch's CPU generator and its CUDA generator with the same value.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

* the `TEXT` field handles the review text and the `LABEL` field handles the sentiment
* a field determines how its data is processed
* review text is tokenized with 'spacy'
* labels are stored as a float tensor

In [None]:
# Field describing how the review text is processed: tokenized with spaCy.
TEXT = data.Field(tokenize='spacy')
# Field for the sentiment label, stored as a float tensor.
# NOTE(review): `tensor_type` is the keyword used by older torchtext releases;
# newer releases appear to use `dtype` instead — confirm against the installed version.
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

* download the IMDB data from the datasets module in torchtext
* the data are returned as torchtext dataset objects, already split into a train set and a test set
* the train set is then further split into a training set and a validation set

In [2]:
# Load IMDB as the official train and test splits, processed by the TEXT and LABEL fields.
train, test = datasets.IMDB.splits(TEXT, LABEL)
# Carve a validation set out of the training set (split()'s default ratio is used).
# random.seed() returns None, so it must not be passed as random_state directly:
# seed Python's RNG first, then hand its state to split() explicitly.
random.seed(SEED)
train, valid = train.split(random_state=random.getstate())

In [3]:
# spare code to subset input

# train_t, train_other = train.split(split_ratio=0.5, random_state=random.seed(SEED))
# test_t, test_other = test.split(split_ratio=0.5, random_state=random.seed(SEED))
# valid_t, valid_other = valid.split(split_ratio=0.5, random_state=random.seed(SEED))

* build vocabulary using the top 25000 most common words 
* apply one hot encoding to each word
* initialize vector with pretrained embeddings where words in similar context appear in proximity in the vector

In [3]:
# Build the text vocabulary from the training split only, capped at 25000 entries
# (max_size), and attach the pretrained "glove.6B.100d" vectors to it.
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
# Build the label vocabulary from the training split.
LABEL.build_vocab(train)

In [4]:
# spare code for smaller dataset

# TEXT.build_vocab(train_t, max_size=25000, vectors="glove.6B.100d")
# LABEL.build_vocab(train_t)

* create iterators
* batch size determine how much data passes through the network each iteration
* this step sorts the data into buckets of similar length and when iterator is called it returns a batch of examples from each bucket

In [4]:
# Number of examples per batch, shared by all three iterators.
BATCH_SIZE = 16

# One BucketIterator per split. sort_key gives the iterator the review length,
# which it uses to group examples of similar length into the same batch.
# NOTE(review): no `device` argument is passed here, while the models are later
# moved to `device` — confirm that batches end up on the same device as the models.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

In [6]:
# spare code for smaller dataset

# BATCH_SIZE = 16

# train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
#     (train_t, valid_t, test_t), 
#     batch_size=BATCH_SIZE, 
#     sort_key=lambda x: len(x.text), 
#     repeat=False)

### Building the model
* General process:
    * takes one-hot vector as input
    * embedding layer transforms input vector to a dense vector which will then enter the RNN algorithm
* LSTM and GRU models, defines all specifications of model
* First define 2 different model classes; for a detailed explanation see the in-line comments
* LSTM package computes the following process:
$$
\begin{aligned}
i_t &= \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
f_t &= \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
g_t &= \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
o_t &= \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
c_t &= f_t c_{(t-1)} + i_t g_t \\
h_t &= o_t \tanh(c_t)
\end{aligned}
$$
    * where h is the hidden state, c is the cell state, which incorporates long and short term information, x is the input. i, f, g, o are the input, forget, cell, and output gates, respectively. σ is the sigmoid function. (from official document of torch.nn package)
* GRU package computes the following process:
$$
\begin{aligned}
r_t &= \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
z_t &= \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
n_t &= \tanh(W_{in} x_t + b_{in} + r_t (W_{hn} h_{(t-1)}+ b_{hn})) \\
h_t &= (1 - z_t) n_t + z_t h_{(t-1)} \\
\end{aligned}
$$
    * where h is the hidden state, x is the input. r, z, n are the reset, update, and new gates, respectively. σ is the sigmoid function. (from official document of torch.nn package)


In [5]:
import torch.nn as nn

class RNN_LSTM(nn.Module):
    """
    Sentiment classifier: embedding -> stacked (optionally bidirectional) LSTM -> linear layer.

    vocab_size: number of rows of the embedding table (length of TEXT.vocab in this notebook)
    embedding_dim: dimension of the dense word vector produced by the embedding layer
    hidden_dim: size of the LSTM hidden state, per direction
    output_dim: number of values produced per example (1 logit for binary sentiment)
    n_layers: number of stacked LSTM layers; the output of one layer is the input of the next
    bidirectional: if True, the sequence is also processed from last token to first and the
        final hidden states of both directions are concatenated
    dropout: dropout probability used on the embeddings, between LSTM layers and on the
        final hidden state, as regularization against overfitting
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM package that takes in specifications
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        # a bidirectional RNN yields one final hidden state per direction, so the
        # classifier input is double (not the square of) the hidden dimension
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Map a batch of token indices to one output row per example.

        x: LongTensor of token indices, [sent len, batch size]
        returns: tensor of raw (pre-sigmoid) outputs, [batch size, output dim]
        """
        #x = [sent len, batch size]

        # regularization in the embedding process
        embedded = self.dropout(self.embedding(x))
        #embedded = [sent len, batch size, emb dim]

        # the LSTM returns the per-step output plus the final hidden state and cell state
        output, (hidden, cell) = self.rnn(embedded)
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]
        #cell = [num layers * num directions, batch size, hid. dim]

        if self.rnn.bidirectional:
            # last layer: forward direction is hidden[-2], backward direction is hidden[-1]
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # last layer only; there is no backward direction to concatenate
            hidden = hidden[-1, :, :]
        #hidden = [batch size, hid. dim * num directions]

        # regularize the hidden state to avoid overfitting
        hidden = self.dropout(hidden)

        # no squeeze(0) here: it would drop the batch dimension when batch size is 1
        return self.fc(hidden)


class RNN_GRU(nn.Module):
    """
    Sentiment classifier: embedding -> stacked (optionally bidirectional) GRU -> linear layer.

    vocab_size: number of rows of the embedding table (length of TEXT.vocab in this notebook)
    embedding_dim: dimension of the dense word vector produced by the embedding layer
    hidden_dim: size of the GRU hidden state, per direction
    output_dim: number of values produced per example (1 logit for binary sentiment)
    n_layers: number of stacked GRU layers; the output of one layer is the input of the next
    bidirectional: if True, the sequence is also processed from last token to first and the
        final hidden states of both directions are concatenated
    dropout: dropout probability used on the embeddings, between GRU layers and on the
        final hidden state, as regularization against overfitting
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        # a bidirectional RNN yields one final hidden state per direction, so the
        # classifier input is double the hidden dimension in that case
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Map a batch of token indices to one output row per example.

        Unlike the LSTM, the GRU has no cell state: it returns only the
        per-step output and the final hidden state.

        x: LongTensor of token indices, [sent len, batch size]
        returns: tensor of raw (pre-sigmoid) outputs, [batch size, output dim]
        """
        #x = [sent len, batch size]

        # regularization in the embedding process
        embedded = self.dropout(self.embedding(x))
        #embedded = [sent len, batch size, emb dim]

        output, hidden = self.rnn(embedded)
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]

        if self.rnn.bidirectional:
            # last layer: forward direction is hidden[-2], backward direction is hidden[-1]
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # last layer only; there is no backward direction to concatenate
            hidden = hidden[-1, :, :]
        #hidden = [batch size, hid. dim * num directions]

        # regularize the hidden state to avoid overfitting
        hidden = self.dropout(hidden)

        # no squeeze(0) here: it would drop the batch dimension when batch size is 1
        return self.fc(hidden)

### Model Implementation
1. setup parameters
2. pass parameters to class and build model

In [6]:
# Hyperparameters shared by both models so that the comparison is like-for-like.
INPUT_DIM = len(TEXT.vocab)  # vocabulary size = number of embedding rows
EMBEDDING_DIM = 100  # matches the 100-d GloVe vectors requested in build_vocab
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # a single output value per review
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

# Build one model of each kind from identical settings.
model_lstm = RNN_LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
model_gru = RNN_GRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [7]:
"""
Check size of pretrained embeddings
"""

# Vectors attached to the vocabulary by TEXT.build_vocab(..., vectors=...).
pretrained_embeddings = TEXT.vocab.vectors

# Shape is [vocabulary size, embedding dim]; it must match the models' embedding layers.
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [None]:
"""
Assign pretrained embeddings to embedding layer for 2 separate models
"""

In [8]:
# Overwrite the LSTM model's embedding weights in place with the pretrained vectors.
model_lstm.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1123,  0.3113,  0.3317,  ..., -0.4576,  0.6191,  0.5304],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [9]:
# Overwrite the GRU model's embedding weights in place with the pretrained vectors.
model_gru.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1123,  0.3113,  0.3317,  ..., -0.4576,  0.6191,  0.5304],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [10]:
import torch.optim as optim
"""
use Adam optimization algorithm
a stochastic optimization algorithm
"""
# One Adam optimizer per model, each with Adam's default settings,
# so the two models are trained independently of each other.
optimizer_lstm = optim.Adam(model_lstm.parameters())
optimizer_gru = optim.Adam(model_gru.parameters())

In [11]:
"""
specify loss function: BCE with logits loss
"""
# BCEWithLogitsLoss applies the sigmoid itself, so the models output raw logits.
criterion = nn.BCEWithLogitsLoss()

"""
use GPU if availbale, otherwise use CPU
"""
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move both models and the loss module to the selected device.
model_lstm = model_lstm.to(device)
model_gru = model_gru.to(device)
criterion = criterion.to(device)

In [12]:
import torch as F

def binary_accuracy(preds, y):
    """
    Fraction of predictions in a batch that match the labels,
    e.g. 0.8 (not 8) when 8 out of 10 are right.

    preds: raw model outputs (logits)
    y: labels, compared element-wise with the rounded sigmoid of preds
    """
    # squash the logits into probabilities, then snap each one to 0 or 1
    hard_labels = torch.sigmoid(preds).round()
    # boolean hits -> floats so they can be summed and divided
    hits = (hard_labels == y).float()
    return hits.sum() / len(hits)

In [13]:
def train_model(model, iterator, optimizer, criterion):
    """
    Run one training epoch: one optimizer step for every batch in iterator.

    Returns (mean loss, mean accuracy), each averaged over the number of batches.
    """
    # switch to training mode (e.g. enables dropout layers)
    model.train()

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        # clear the gradients left over from the previous step
        optimizer.zero_grad()

        # forward pass; squeeze(1) removes dimension 1 when it has size 1
        logits = model(batch.text).squeeze(1)
        targets = batch.label

        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        # backward pass, then parameter update
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches


In [14]:
def evaluate(model, iterator, criterion):
    """
    Score the model on every batch of iterator without training it.

    Same per-batch computation as train_model, but gradients are neither
    zeroed nor computed and no parameters are updated.

    Returns (mean loss, mean accuracy), each averaged over the number of batches.
    """
    # switch to evaluation mode (e.g. disables dropout layers)
    model.eval()

    total_loss = 0
    total_acc = 0

    # gradients are not needed for scoring
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            targets = batch.label

            total_loss += criterion(logits, targets).item()
            total_acc += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

# RNN-LSTM

In [16]:
# Number of full passes over the training iterator.
N_EPOCHS = 5
"""
train model for 5 epochs and output training statistics and validation statstics
"""
for epoch in range(N_EPOCHS):

    # one epoch of training, followed by a pass over the validation set
    train_loss_lstm, train_acc_lstm = train_model(model_lstm, train_iterator, optimizer_lstm, criterion)
    valid_loss_lstm, valid_acc_lstm = evaluate(model_lstm, valid_iterator, criterion)
    # release cached GPU memory between epochs
    torch.cuda.empty_cache()
    print("RNN-LSTM training data")
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss_lstm:.3f}, Train Acc: {train_acc_lstm*100:.2f}%, Val. Loss: {valid_loss_lstm:.3f}, Val. Acc: {valid_acc_lstm*100:.2f}%')

  return Variable(arr, volatile=not train)


RNN-LSTM training data
Epoch: 01, Train Loss: 0.432, Train Acc: 80.26%, Val. Loss: 0.304, Val. Acc: 88.12%
RNN-LSTM training data
Epoch: 02, Train Loss: 0.246, Train Acc: 90.56%, Val. Loss: 0.273, Val. Acc: 89.18%
RNN-LSTM training data
Epoch: 03, Train Loss: 0.167, Train Acc: 93.89%, Val. Loss: 0.279, Val. Acc: 88.32%
RNN-LSTM training data
Epoch: 04, Train Loss: 0.117, Train Acc: 95.86%, Val. Loss: 0.312, Val. Acc: 89.17%
RNN-LSTM training data
Epoch: 05, Train Loss: 0.085, Train Acc: 97.20%, Val. Loss: 0.369, Val. Acc: 89.13%


In [17]:
"""
test final model
"""
# Score the LSTM model, as left after the last training epoch, on the held-out test set.
test_loss_lstm, test_acc_lstm = evaluate(model_lstm, test_iterator, criterion)
# release cached GPU memory
torch.cuda.empty_cache()
print("RNN-LSTM test result")
print(f'Test Loss: {test_loss_lstm:.3f}, Test Acc: {test_acc_lstm*100:.2f}%')

  return Variable(arr, volatile=not train)


RNN-LSTM test result
Test Loss: 0.478, Test Acc: 86.23%


# RNN-GRU

In [18]:
# Number of full passes over the training iterator (same as for the LSTM run).
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    # one epoch of training, followed by a pass over the validation set
    train_loss_gru, train_acc_gru = train_model(model_gru, train_iterator, optimizer_gru, criterion)
    valid_loss_gru, valid_acc_gru = evaluate(model_gru, valid_iterator, criterion)
    # release cached GPU memory between epochs
    torch.cuda.empty_cache()
    print("RNN-GRU training data")
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss_gru:.3f}, Train Acc: {train_acc_gru*100:.2f}%, Val. Loss: {valid_loss_gru:.3f}, Val. Acc: {valid_acc_gru*100:.2f}%')

  return Variable(arr, volatile=not train)


RNN-GRU training data
Epoch: 01, Train Loss: 0.601, Train Acc: 64.28%, Val. Loss: 0.301, Val. Acc: 87.69%
RNN-GRU training data
Epoch: 02, Train Loss: 0.268, Train Acc: 88.99%, Val. Loss: 0.232, Val. Acc: 90.45%
RNN-GRU training data
Epoch: 03, Train Loss: 0.176, Train Acc: 93.35%, Val. Loss: 0.246, Val. Acc: 90.06%
RNN-GRU training data
Epoch: 04, Train Loss: 0.125, Train Acc: 95.60%, Val. Loss: 0.273, Val. Acc: 90.03%
RNN-GRU training data
Epoch: 05, Train Loss: 0.090, Train Acc: 96.79%, Val. Loss: 0.290, Val. Acc: 89.90%


In [19]:
# Score the GRU model, as left after the last training epoch, on the held-out test set.
test_loss_gru, test_acc_gru = evaluate(model_gru, test_iterator, criterion)
# release cached GPU memory
torch.cuda.empty_cache()
print("RNN-GRU test result")
print(f'Test Loss: {test_loss_gru:.3f}, Test Acc: {test_acc_gru*100:.2f}%')

  return Variable(arr, volatile=not train)


RNN-GRU test result
Test Loss: 0.375, Test Acc: 87.33%


# Conclusion
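Using the IMDB dataset, GRU yields a slightly more accurate result on the test set than LSTM (87.33% vs. 86.23%, about 1 percentage point higher), and the training, validation and test accuracies of the two models are all within roughly 1 percentage point of each other.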

# Implementing models

In [20]:
import spacy
# Load the spaCy English pipeline; only its tokenizer is used by the functions below.
# NOTE(review): the 'en' shortcut name may not be accepted by newer spaCy releases —
# confirm against the installed version.
nlp = spacy.load('en')

def predict_sentiment_lstm(sentence):
    """
    Score a raw sentence with the trained LSTM model.

    The sentence is tokenized with the spaCy tokenizer, mapped to vocabulary
    indices through TEXT.vocab.stoi and fed to model_lstm as a batch of one.

    sentence: the review text as a string
    returns: the sigmoid of the model output, as a Python float between 0 and 1
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # [sent len] -> [sent len, 1]: add the batch dimension the model expects
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    # Inference must not depend on the mode the model was last left in:
    # eval() disables dropout, no_grad() avoids building the autograd graph.
    model_lstm.eval()
    with torch.no_grad():
        prediction_lstm = torch.sigmoid(model_lstm(tensor))
    return prediction_lstm.item()

def predict_sentiment_gru(sentence):
    """
    Score a raw sentence with the trained GRU model.

    The sentence is tokenized with the spaCy tokenizer, mapped to vocabulary
    indices through TEXT.vocab.stoi and fed to model_gru as a batch of one.

    sentence: the review text as a string
    returns: the sigmoid of the model output, as a Python float between 0 and 1
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # [sent len] -> [sent len, 1]: add the batch dimension the model expects
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    # Inference must not depend on the mode the model was last left in:
    # eval() disables dropout, no_grad() avoids building the autograd graph.
    model_gru.eval()
    with torch.no_grad():
        prediction_gru = torch.sigmoid(model_gru(tensor))
    return prediction_gru.item()

In [21]:
predict_sentiment_lstm("The result is hugely enjoyable, and hooray for Hollywood for making it happen.")

0.9209697842597961

In [22]:
predict_sentiment_lstm("A disordered and unfocused ghost story that bears all the very worst habits of the genre.")

0.00314916018396616

In [23]:
predict_sentiment_gru("The result is hugely enjoyable, and hooray for Hollywood for making it happen.")

0.6537795662879944

In [24]:
predict_sentiment_gru("A disordered and unfocused ghost story that bears all the very worst habits of the genre.")

0.01172313466668129