**LSTM is implemented in Recurring Neural Network (RNN) architecture with IMDb movie reviews dataset in TensorFlow.**

In [1]:
import pandas as pd    
import numpy as np    
import nltk
from nltk.corpus import stopwords   
import re
from sklearn.model_selection import train_test_split 
from tensorflow.keras.models import Sequential     
from tensorflow.keras.layers import Embedding, Dense,  LSTM 
from tensorflow.keras.callbacks import ModelCheckpoint      
from tensorflow.keras.preprocessing.text import Tokenizer  
from tensorflow.keras.preprocessing.sequence import pad_sequences      
from tensorflow.keras.models import load_model   

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df = pd.read_csv('/content/drive/MyDrive/Datasets/IMDB.csv')  # dataset is originally downloaded from Kaggle and mounted in my gdrive later
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
X = df['review']       
y = df['sentiment']    

stops = set(stopwords.words('english'))

 # removing html tags & non alphabets
X = X.replace({'<.*?>[^A-Za-z]': ''}, regex = True)         
# removing stop words
X = X.apply(lambda review: [w for w in review.split() if w not in stops])  
X.apply(lambda review: [w.lower() for w in review]) 
    
y = y.replace('positive', 1)  #binary form
y = y.replace('negative', 0)

In [5]:
print('Reviews')
print(X, '\n')
print('Sentiment')
print(y)

Reviews
0        [One, reviewers, mentioned, watching, 1, Oz, e...
1        [A, wonderful, little, production., br, />The,...
2        [I, thought, wonderful, way, spend, time, hot,...
3        [Basically, there's, family, little, boy, (Jak...
4        [Petter, Mattei's, "Love, Time, Money", visual...
                               ...                        
49995    [I, thought, movie, right, good, job., It, cre...
49996    [Bad, plot,, bad, dialogue,, bad, acting,, idi...
49997    [I, Catholic, taught, parochial, elementary, s...
49998    [I'm, going, disagree, previous, comment, side...
49999    [No, one, expects, Star, Trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8)

print('Train Set')
print(X_train, '\n')
print(X_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
41640    [This, film, really, makes, cringe., In, 1951,...
43613    [Rather, long, dance, sequences, close, ups, c...
8184     [I'm, words, describe, beauty, "The, Cranes, F...
13381    [I, expecting, lot, film, directed, Sidney, J....
39170    [THHE2, entertaining, laugh, lot, cringe, prob...
                               ...                        
9302     [The, film, collection, cliche's, anything, th...
41053    [You, Belong, To, Me, final, teaming, Henry, F...
45408    [Opening, credits:, great., Music:, right, fil...
10265    [This, movie, stinks!, You, want, back, two-pl...
23340    [I, kid, .., crazy, Michael, Jackson., His, mu...
Name: review, Length: 40000, dtype: object 

30547    [OK..., so..., I, really, like, Kris, Kristoff...
22874    [Picture, classic, noir, story, lines, infused...
4845     [By, time, Hellraiser, franchise, reaching, fo...
20068    [This, favorite, game, Nintendo, 64, platform....
11593    [here,, let, wave, hands, keyboard,, i'll, tel...
 

In [7]:
def maximum_length():
    review_length = []
    for review in X_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [8]:
def maximum_length():
    review_length = []
    for review in X_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

token = Tokenizer(lower=False)   
token.fit_on_texts(X_train)
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

max_length = maximum_length()

X_train = pad_sequences(X_train, maxlen=max_length, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1 

print('Encoded X Train\n', X_train, '\n')
print('Encoded X Test\n', X_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[     7      4     11 ...      0      0      0]
 [  5174    134    967 ...      0      0      0]
 [    70    832   1502 ...    180    223   4159]
 ...
 [ 19016 100026   1140 ...      0      0      0]
 [     7      3  43596 ...      0      0      0]
 [     1    578   4791 ...      0      0      0]] 

Encoded X Test
 [[ 40729  18047      1 ...      0      0      0]
 [  4479    327   2239 ...    409    390     21]
 [   874     18  24759 ...      0      0      0]
 ...
 [     1  23802    100 ...     16    772  49133]
 [138609    973    122 ...      0      0      0]
 [     1     11    354 ...      0      0      0]] 

Maximum review length:  135


In [9]:
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 135, 32)           12181312  
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 12,206,209
Trainable params: 12,206,209
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
model.fit(X_train, y_train, batch_size = 64, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f53d2a42250>

In [15]:
y_pred = (model.predict(X_test, batch_size = 128) > 0.5).astype("int32")

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 8207
Wrong Prediction: 1793
Accuracy: 82.07
