In [2]:
from __future__ import print_function 
import numpy as np
import tensorflow as tf 

import os
import sys

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Lambda
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import SimpleRNN, GRU, LSTM, Bidirectional

import keras.backend as K
from keras import optimizers

Using TensorFlow backend.


Recurrent Neural Networks (RNNs) are well suited to processing sequential data, i.e. data with an intrinsic order such as a time series. Commonly, a data instance is decomposed into smaller units, each of which is processed by the network cell at one time step, in a predetermined order. In this way, information from earlier states accumulates and influences the computation of later states, similar to how human memory works.

<img src="../images/rnn.png">

Mathematically, an RNN can be represented as a cyclic graph, or as a recursive formula in which each new state is computed from the new input and the previous state. The unrolled version of the same network is shown below.

<img src="../images/lstm.png">

Each box in the network is a network cell. Typically it is a complex function that takes the previous hidden state and the input at the current time step, and produces an output and a new hidden state. The two major designs for an RNN cell are LSTM and GRU, but there are other variants depending on the task at hand.

<img src="../images/lstmcell.png">

An LSTM cell consists of four gates (some may argue for three); the essential ones are the input gate, the forget gate and the output gate. An LSTM preserves long-term dependencies well: it mitigates the vanishing gradient problem by updating its cell state with simple gated additions instead of repeated matrix multiplications. (Exploding gradients are usually handled separately, e.g. with gradient clipping.)

<img src="../images/gru.png"> 

A GRU cell has only two gates: a reset gate and an update gate. It likewise mitigates vanishing gradients through its gating mechanism, but it has fewer parameters and is more efficient to compute. It is reported to outperform an LSTM on various tasks such as text classification.

<img src="../images/bidirrnn.png">

A bidirectional RNN goes over the sequence in both the forward and the backward direction; essentially it combines two RNNs to generate states that capture information both before and after the current time step. A bidirectional RNN usually outperforms a unidirectional RNN, but is at least twice as expensive to compute.

<img src="../images/deeplstm.png">

It is also possible to stack more RNN layers to make the model deeper, but the computation needed scales up quickly, and in practice models seldom go beyond two layers.

A great article explaining RNNs and their variants is available here: http://colah.github.io/posts/2015-08-Understanding-LSTMs/

In [1]:
class RNNClassifier():
    """Sequence classifier: embedding layer -> one recurrent layer -> softmax head.

    The recurrent layer is selected by ``rnn_type``:

    * ``'simple'``        -- ``SimpleRNN``
    * ``'lstm'``          -- ``LSTM``
    * ``'gru'``           -- ``GRU``
    * ``'bidirectional'`` -- ``Bidirectional`` wrapper around an ``LSTM``

    NOTE(review): the class statement lists no base class, yet ``__init__`` and
    ``train`` delegate to ``super()`` with arguments and the builders read
    ``self.embedding_layer``. A parent class providing these is presumably
    missing from the class statement -- confirm and add it.
    """

    # Architecture constants shared by every variant.
    _RNN_UNITS = 50        # width of the recurrent layer
    _DROPOUT_RATE = 0.2    # used for both input dropout and recurrent dropout
    _NUM_CLASSES = 6       # size of the softmax output layer

    def __init__(self, batch_size, epochs, raw_data_path=None, embedded_data_path='data/data_and_embedding100.npz',
                 embedding_dim=100, rnn_type='lstm'):
        """Initialise the parent classifier and remember which RNN variant to build.

        Args:
            batch_size, epochs, raw_data_path, embedded_data_path, embedding_dim:
                forwarded unchanged to the parent initialiser.
            rnn_type: one of ``'simple'``, ``'lstm'``, ``'gru'``, ``'bidirectional'``;
                validated when ``build()`` is called.
        """
        # Forward the caller's raw_data_path rather than a hard-coded None.
        super(RNNClassifier, self).__init__(batch_size, epochs, raw_data_path=raw_data_path,
                embedded_data_path=embedded_data_path, embedding_dim=embedding_dim)

        # get the RNN model type
        self.type = rnn_type

    def build(self):
        """Build the network selected by ``self.type`` and store it in ``self.model``.

        Raises:
            ValueError: if ``self.type`` is not a supported variant name.
        """
        builders = {
            'simple': self.build_simple_rnn,
            'lstm': self.build_lstm,
            'gru': self.build_gru,
            'bidirectional': self.build_bidirectional_lstm,
        }
        if self.type not in builders:
            # Fail loudly instead of leaving self.model unset.
            raise ValueError("unknown rnn_type %r; expected one of %s"
                             % (self.type, sorted(builders)))
        self.model = builders[self.type]()

    def _build_final_state_model(self, recurrent_layer):
        """Wire input -> embedding -> ``recurrent_layer`` -> softmax and return the Model."""
        # NOTE(review): max_sequence_length is read as a free (module-level) name that
        # this class does not define; it must exist in the notebook namespace -- confirm.
        sequence_input = Input(shape=(max_sequence_length, ), dtype='int32')
        embedded_sequences = self.embedding_layer(sequence_input)
        encoded = recurrent_layer(embedded_sequences)
        preds = Dense(self._NUM_CLASSES, activation='softmax')(encoded)
        return Model(sequence_input, preds)

    def build_simple_rnn(self):
        """Return a classifier model whose recurrent layer is a ``SimpleRNN``."""
        return self._build_final_state_model(
            SimpleRNN(self._RNN_UNITS, dropout=self._DROPOUT_RATE, recurrent_dropout=self._DROPOUT_RATE))

    def build_lstm(self):
        """Return a classifier model whose recurrent layer is an ``LSTM``."""
        return self._build_final_state_model(
            LSTM(self._RNN_UNITS, dropout=self._DROPOUT_RATE, recurrent_dropout=self._DROPOUT_RATE))

    def build_gru(self):
        """Return a classifier model whose recurrent layer is a ``GRU``."""
        return self._build_final_state_model(
            GRU(self._RNN_UNITS, dropout=self._DROPOUT_RATE, recurrent_dropout=self._DROPOUT_RATE))

    def build_bidirectional_lstm(self):
        """Return a classifier model whose recurrent layer is a ``Bidirectional`` ``LSTM``."""
        return self._build_final_state_model(
            Bidirectional(LSTM(self._RNN_UNITS, dropout=self._DROPOUT_RATE,
                               recurrent_dropout=self._DROPOUT_RATE)))

    def train(self, loss='categorical_crossentropy', optimizer='adam', model_base_path="models/"):
        """Train through the parent class, forcing gradient clipping for the simple RNN.

        For ``rnn_type == 'simple'`` the ``optimizer`` argument is replaced by
        ``Adam(clipnorm=1.)`` so that gradients are clipped to prevent explosion.

        ``loss`` is accepted but is not forwarded to the parent call.
        NOTE(review): confirm whether the parent ``train`` takes ``loss`` and
        whether it should be passed through.
        """
        if self.type == 'simple':
            optimizer = optimizers.Adam(clipnorm=1.)
        super(RNNClassifier, self).train(optimizer=optimizer, model_base_path=model_base_path)

In [None]:
def run_rnn_experiment(rnn_type, label):
    """Build, train and evaluate one ``RNNClassifier`` variant and return it.

    Args:
        rnn_type: value passed as ``rnn_type`` to ``RNNClassifier``.
        label: human-readable name used in the progress messages.

    Returns:
        The classifier instance after ``build()``, ``train()`` and ``evaluate()``
        have been called on it.
    """
    classifier = RNNClassifier(batch_size=128, epochs=10, raw_data_path=None,
                        embedded_data_path='data/data_and_embedding100.npz', embedding_dim=100, rnn_type=rnn_type)
    classifier.build()
    classifier.train()
    print("constructed " + label + " classifier")
    classifier.evaluate()
    print(label + " classifier evaluated")
    return classifier


if __name__ == "__main__":
    # test different RNN models

    # Simple RNN with gradient clipping
    simple_RNN = run_rnn_experiment('simple', "simple RNN")

    # LSTM
    lstm = run_rnn_experiment('lstm', "LSTM")

    # GRU
    gru = run_rnn_experiment('gru', "GRU")

    # Bidirectional LSTM
    bidirectional_lstm = run_rnn_experiment('bidirectional', "bidirectional LSTM")