# Movie Review Prediction using Bidirectional LSTM Technique

In [69]:
# import necessary libraries

import os
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional,Flatten, Dropout
from sklearn.model_selection import train_test_split
import re
import pickle

In [70]:
df=pd.read_csv(os.path.join(os.path.abspath(''),r"IMDB Dataset.csv"))

In [71]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [72]:
check_null = df.isnull().sum()

In [73]:
check_null

review       0
sentiment    0
dtype: int64

In [74]:
df.shape

(50000, 2)

In [75]:
df["review"].describe()

count                                                 50000
unique                                                49582
top       Loved today's show!!! It was a variety and not...
freq                                                      5
Name: review, dtype: object

In [76]:
df["sentiment"].describe()

count        50000
unique           2
top       negative
freq         25000
Name: sentiment, dtype: object

In [77]:
len(df["review"])


50000

# Data Preprocessing

In [78]:
for review in range(len(df["review"])):
    df["review"][review]=re.sub(r'<[^<>]+>', repl=" ",string=df["review"][review]) #remove html tags
    df["review"][review]=re.sub(r'[^a-zA-Z0-9\s]', repl=" ",string=df["review"][review]) #remove special characters/whitespaces

In [79]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming t...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there s a family where a little boy ...,negative
4,Petter Mattei s Love in the Time of Money is...,positive


In [80]:

df["review"][1]

'A wonderful little production    The filming technique is very unassuming  very old time BBC fashion and gives a comforting  and sometimes discomforting  sense of realism to the entire piece    The actors are extremely well chosen  Michael Sheen not only  has got all the polari  but he has all the voices down pat too  You can truly see the seamless editing guided by the references to Williams  diary entries  not only is it well worth the watching but it is a terrificly written and performed piece  A masterful production about one of the great master s of comedy and his life    The realism really comes home with the little things  the fantasy of the guard which  rather than use the traditional  dream  techniques remains solid then disappears  It plays on our knowledge and our senses  particularly with the scenes concerning Orton and Halliwell and the sets  particularly of their flat with Halliwell s murals decorating every surface  are terribly well done '

# Word Embedding

In [81]:
tokenizer = Tokenizer(num_words=5000)  # unique words limit set to 5000

In [82]:
tokenizer

<keras_preprocessing.text.Tokenizer at 0x1fe3579ed08>

In [83]:
tokenizer.fit_on_texts(df['review'])

In [84]:
X = tokenizer.texts_to_sequences(df['review'])

In [85]:
X[0]

[28,
 4,
 1,
 77,
 2061,
 46,
 1063,
 11,
 100,
 149,
 41,
 303,
 3090,
 398,
 20,
 231,
 29,
 3212,
 32,
 25,
 204,
 14,
 10,
 6,
 619,
 47,
 598,
 17,
 68,
 1,
 87,
 148,
 11,
 3258,
 68,
 44,
 3090,
 13,
 91,
 2,
 135,
 4,
 565,
 61,
 267,
 8,
 204,
 37,
 1,
 653,
 141,
 1743,
 68,
 10,
 6,
 23,
 3,
 116,
 16,
 1,
 2336,
 40,
 10,
 116,
 2597,
 56,
 17,
 5,
 1471,
 375,
 40,
 565,
 91,
 6,
 3831,
 8,
 1,
 358,
 360,
 4,
 1,
 653,
 7,
 6,
 436,
 3090,
 14,
 11,
 6,
 1,
 361,
 5,
 1,
 2542,
 1044,
 7,
 2712,
 1411,
 22,
 524,
 34,
 4681,
 2462,
 4,
 1,
 1196,
 115,
 30,
 1,
 27,
 2912,
 2,
 389,
 36,
 6,
 23,
 299,
 22,
 1,
 4902,
 2923,
 524,
 6,
 345,
 5,
 107,
 2450,
 2,
 52,
 36,
 327,
 2,
 25,
 112,
 224,
 242,
 9,
 60,
 132,
 1,
 282,
 1333,
 4,
 1,
 116,
 6,
 685,
 5,
 1,
 193,
 11,
 7,
 268,
 115,
 77,
 276,
 575,
 21,
 3016,
 827,
 183,
 1305,
 4177,
 16,
 2496,
 1230,
 827,
 1436,
 827,
 874,
 3090,
 152,
 21,
 949,
 185,
 1,
 87,
 398,
 9,
 123,
 211,
 3258,
 68,
 14,
 36,


In [86]:
len(X[0])

278

In [87]:
# padding so that all reviews will be of length 500
X = pad_sequences(X,maxlen=500)




In [88]:
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [89]:
len(X[0])

500

In [91]:
Y = df['sentiment']

# Create holdout Environment

In [93]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [102]:
X_train.shape

(40000, 500)

# Save tokenizer in a file to use it for prediction later

In [95]:
pickle_out=open('tokenizer.pickle',"wb")
pickle.dump(tokenizer,pickle_out)
pickle_out.close()

# One hot encoding

In [97]:
y_train=pd.get_dummies(Y_train)
y_test=pd.get_dummies(Y_test)

In [132]:
y_train

Unnamed: 0,negative,positive
7063,0,1
12869,1,0
24021,0,1
13722,0,1
43117,0,1
...,...,...
16481,0,1
12763,0,1
46274,1,0
45733,0,1


# Model Creation using LSTM Technique

In [98]:
vocab_size = len(tokenizer.word_index) + 1 # +1 is necessary for embedding method

In [99]:
vocab_size

102211

In [101]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=500))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))
model.add(Dense(2,activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])
model.summary() 

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 50)           5110550   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               183296    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 514       
Total params: 5,294,360
Trainable params: 5,294,360
Non-trainable params: 0
_________________________________________________________________


In [107]:
from tensorflow.keras.callbacks import EarlyStopping
earlyStopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

modelTraining=model.fit(X_train, y_train,
                        batch_size=128,
                        epochs=20,
                        validation_data=[X_test, y_test],
                        callbacks=[earlyStopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 00006: early stopping


# Model Performance

In [111]:
score=model.evaluate(X_test, y_test, verbose=0)

print("Test_score = ",score[0])
print("Test_accuracy = ",score[1])

Test_score =  0.32752764225006104
Test_accuracy =  0.8824999928474426


# Save Model

In [108]:

model.save('SentimentalAnalysis_LSTM.h5')

# Evaluate Model 

In [112]:
testModel=load_model('SentimentalAnalysis_LSTM.h5')

In [190]:
myInput="""I watched this movie on a flight and absolutely loved it, I ended up watching on Starz about 5 times... it's funny, cute, silly, emotional and I want What a Hottie Hottie as a ring tone or better yet a full song! I love the animation part of the movie as well, the whole story reads like a fairytale."""

In [191]:
myInput=re.sub(r'<[^<>]+>', repl=" ",string=myInput) #Excluding html tags
myInput=re.sub(r'[^a-zA-Z0-9\s]', repl=" ",string=myInput)

In [192]:
myInput

'I watched this movie on a flight and absolutely loved it  I ended up watching on Starz about 5 times    it s funny  cute  silly  emotional and I want What a Hottie Hottie as a ring tone or better yet a full song  I love the animation part of the movie as well  the whole story reads like a fairytale '

In [193]:
prediction=tokenizer.texts_to_sequences([myInput])
prediction=pad_sequences(prediction,maxlen=500)
model.predict(prediction)

array([[0.05850308, 0.9405468 ]], dtype=float32)

In [194]:
model.predict(prediction)[0][1]

0.9405468

In [None]:
# model.predict output is closer to 1 it is postive review, else a negative review
# I have set 0.75 as a threshold, if the output is greater than 0.75 it is a positive review else a negative review 