# LSTM for sequence classification in the IMDB dataset

In [4]:
# Credits: https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence

#### Dataset
Say V be the whole corpus of words in the dataset. There is a table where all the words are given a rank. 
Rank is assigned based on the frequency of that word in the corpus.
Eg: 'The' : 1 ; 'a':2 ..... 'phone':378...etc

In [5]:
#Refer: https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification

# load the dataset but only keep the top n words, zero the rest
top_words = 5000  # limiting the words upto rank 5000

(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words)



In [6]:
print(X_train[1])
print(type(X_train[1]))
print(len(X_train[1]))
print(len(X_train[0]))

[1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 4369, 2, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 2, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 2, 5, 163, 11, 3215, 2, 4, 1153, 9, 194, 775, 7, 2, 2, 349, 2637, 148, 605, 2, 2, 15, 123, 125, 68, 2, 2, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 2, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 2, 5, 2, 656, 245, 2350, 5, 4, 2, 131, 152, 491, 18, 2, 32, 2, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95]
<class 'list'>
189
218


In [10]:
print(X_train.shape)
print(X_test.shape)

(25000,)
(25000,)


In [10]:
print(max(numpy.max(X_test)))
print(max(numpy.max(X_train)))

4998
4987


### Q: Input of LSTM is supposed to be variable length time series data. Then why to apply padding on the input vector to make all data points same length?
A: Assume there are 3 input vectors. X1 contains 189 words, X2- 310, X3 -150. We want to back propagate each of the input, back propagate over time. What we are doing is an SGD operation with batch size =1, ie at any point of time we are processing one sequence, not combining sequences for processing. And this approach is too slow and takes lot of time. So the remedy is perform SGD with batch size = k. ie LSTM has several inputs. One input receives X11,X21,X31… (first element of every data points. So instead of processing SGD for whole inputs, the method is collecting all the first words from input vectors of batch size k as a set and sent. Similarly X12,X22,X32.. will sent simultaneously. So in order to perform based on batch we need input vector size to be same. This will speed up the training in LSTM.


In [12]:
# truncate and/or pad input sequences
max_review_length = 600  # Several reviews has varying length. Inorder to make a fixed sized matrix we do the padding
# Eg: X1 has a length of 189 words. Make it 600 by adding zeroes in the beginning

X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

print(X_train.shape)
print(X_train[1])

(25000, 600)
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    

In [13]:
# create the model
embedding_vecor_length = 32
model = Sequential()
# The first layer of model is embedding layer. What is the need of this ? Refer que word doc
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100)) # we are defining 100 LSTMs parallely ie one behind the other geometrically. 
#32 dim input from embedding feeding to all LSTMs.and gives 100 D output
# no. of parameters in a LSTM having 32 inputs (m) and 100 outputs (n) is 4(nm+ n square+n).n term is the bias (b). LSTM keras has bias as default
# here no of params is 4(32 square + 32*100 + 100)= 53200
model.add(Dense(1, activation='sigmoid')) # outputs of all LSTMs connected to a single sigmooid unit and gives y_hat (binary)
# sigma ( WTX+b) . we have 100 weights associated with output plus a bias term. So 101 params.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
#Refer: https://datascience.stackexchange.com/questions/10615/number-of-parameters-in-an-lstm-model

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 600, 32)           160000    
                                                                 
 lstm (LSTM)                 (None, 100)               53200     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
model.fit(X_train, y_train, epochs=10, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 86.05%
