#### HARI PRASATH S

# Lab16: Design of LSTM and GRU RNN for classification of IMDB reviews

##### Step 1

In [1]:
import pandas as pd    
import numpy as np     
from nltk.corpus import stopwords   
from sklearn.model_selection import train_test_split       
from tensorflow.keras.preprocessing.text import Tokenizer  
from tensorflow.keras.preprocessing.sequence import pad_sequences   
from tensorflow.keras.models import Sequential     
from tensorflow.keras.layers import Embedding, LSTM, Dense 
from tensorflow.keras.callbacks import ModelCheckpoint   
from tensorflow.keras.models import load_model  
import re
from tensorflow.keras.layers import Bidirectional

In [3]:
data = pd.read_csv('IMDB Dataset.csv')

print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


##### Step 2

In [7]:
import nltk
nltk.download('stopwords')
english_stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:
def load_dataset():
    df = pd.read_csv('IMDB Dataset.csv')
    x_data = df['review']       
    y_data = df['sentiment']    

    
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()



##### Step 3

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
29955    [it, may, although, still, two, three, i, miss...
39093    [i, years, old, i, saw, musical, movie, vacati...
48965    [recovery, well, judged, balanced, drama, sens...
3170     [anyone, watched, alien, vs, predator, must, k...
49564    [they, say, david, duchovny, took, six, days, ...
                               ...                        
33504    [forget, recent, dire, american, remake, sadly...
19565    [oh, wow, i, saw, film, irish, international, ...
9631     [what, i, say, this, one, perfect, films, ever...
40735    [i, hate, programme, concept, ludicrous, tries...
20107    [come, get, pakistan, bashing, guys, bollywood...
Name: review, Length: 40000, dtype: object 

19850    [movies, like, need, sequels, part, advantage,...
33276    [too, much, added, much, taken, away, great, w...
27090    [friend, made, watch, awful, film, ugh, stupid...
37230    [ed, wood, perhaps, worst, film, maker, time, ...
27416    [antwone, fisher, tells, young, black, u, s, n...
 

In [10]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [11]:
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[    7   107   167 ...     0     0     0]
 [    1    71    72 ...  4990 15844    71]
 [14061    16  6465 ...     0     0     0]
 ...
 [  105     1    59 ...     0     0     0]
 [    1   635  5110 ...     0     0     0]
 [  122    19  9955 ...     0     0     0]] 

Encoded X Test
 [[   28     6   274 ...     0     0     0]
 [ 1410    17  1148 ...     0     0     0]
 [  332    24    35 ...     0     0     0]
 ...
 [   38    37   845 ...     0     0     0]
 [   40   154    80 ... 26696  9356   721]
 [   40    14   300 ...  1650  6766     2]] 

Maximum review length:  131


##### Step 4

In [12]:
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 131, 32)           2963392   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2,992,449
Trainable params: 2,992,449
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
model.fit(x_train, y_train, batch_size = 128, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2070253eb90>

In [14]:
model.evaluate(x_test,y_test)



[0.500201940536499, 0.8607000112533569]

##### Step 5

In [15]:
# ARCHITECTURE
EMBED_DIM = 32

model1 = Sequential()
model1.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model1.add(LSTM(32))
model1.add(Dense(64, activation='relu'))
model1.add(Dense(64, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model1.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 131, 32)           2963392   
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense_2 (Dense)             (None, 64)                2112      
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2,978,049
Trainable params: 2,978,049
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
model1.fit(x_train, y_train, batch_size = 128, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x20701b889a0>

##### Step 6

In [17]:
EMBED_DIM = 32
LSTM_OUT = 64

model2 = Sequential()
model2.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model2.add(Bidirectional(LSTM(LSTM_OUT)))
model2.add(Dense(64, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model2.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 131, 32)           2963392   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              49664     
 l)                                                              
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dense_6 (Dense)             (None, 1)                 65        
                                                                 
Total params: 3,021,377
Trainable params: 3,021,377
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
model2.fit(x_train, y_train, batch_size = 128)



<keras.callbacks.History at 0x20701b2f010>