<a href="https://colab.research.google.com/github/Hemanthhh/IMDB-reviews-sentiment-analysis/blob/main/IMDB_Sentiment_Analysis_Using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the raw IMDB reviews CSV so it can be inspected before any cleaning.
csv_path = '/content/IMDB Dataset.csv'
dataframe = pd.read_csv(csv_path, encoding='utf-8')

In [None]:
# Preview the first rows to confirm the `review` and `sentiment` columns loaded as expected.
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Build the English stop-word lookup once; the cleaning steps test membership
# against it for every token, so a set is used rather than the raw list.
stop_word_list = stopwords.words('english')
english_stop_words = set(stop_word_list)

In [None]:
def load_dataset(path='IMDB Dataset.csv', stop_words=None):
    """Load the IMDB reviews CSV and return cleaned token lists with 0/1 labels.

    Args:
        path: CSV file with a `review` text column and a `sentiment` column
            holding the strings 'positive' / 'negative'.
        stop_words: Collection of lower-case words to drop from every review.
            Defaults to the notebook-level `english_stop_words` set.

    Returns:
        A tuple `(x_data, y_data)`: a Series of lower-cased token lists and a
        Series of integer labels (1 = positive, 0 = negative).

    Raises:
        ValueError: if `sentiment` contains a value other than
            'positive' / 'negative'.
    """
    if stop_words is None:
        stop_words = english_stop_words

    df = pd.read_csv(path)

    # PRE-PROCESS REVIEW
    # Tags become a space (not '') so "good<br />movie" does not fuse into one token.
    reviews = df['review'].str.replace(r'<.*?>', ' ', regex=True)     # remove html tag
    reviews = reviews.str.replace(r'[^A-Za-z]', ' ', regex=True)      # remove non alphabet
    # Lower-case BEFORE filtering: the stop-word list is lower-case, so filtering
    # first would let capitalised stop words ("The", "This", "I") slip through.
    reviews = reviews.str.lower()                                     # lower case
    x_data = reviews.apply(
        lambda review: [w for w in review.split() if w not in stop_words]
    )                                                                 # remove stop words

    # ENCODE SENTIMENT -> 0 & 1
    # An explicit mapping yields integers directly instead of relying on
    # Series.replace downcasting an object column.
    y_data = df['sentiment'].map({'positive': 1, 'negative': 0})
    if y_data.isna().any():
        raise ValueError("sentiment column must contain only 'positive' or 'negative'")
    y_data = y_data.astype(int)

    return x_data, y_data

# Run the load-and-clean step once; later cells split and encode x_data / y_data.
x_data, y_data = load_dataset()

In [None]:
# Hold out 20% of the reviews for evaluation. A fixed random_state makes the
# split (and everything trained on it) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, random_state=42
)

# Print features and labels under the heading of the split they belong to.
print('Train Set')
print(x_train, '\n')
print(y_train, '\n')
print('Test Set')
print(x_test, '\n')
print(y_test)

Train Set
43828    [before, sunrise, romance, slacker, generation...
37635    [popular, radio, storyteller, gabriel, no, one...
5685     [this, crock, doodoo, award, they, must, despe...
48231    [just, watched, film, rd, time, enjoyed, linds...
539      [ok, i, italian, many, italian, film, like, i,...
                               ...                        
18888    [spoiler, plot, explanation, poor, family, thr...
5188     [the, comeback, starts, looking, promising, br...
28268    [spoilers, in, first, paragraph, this, movie, ...
34156    [i, sure, review, contains, spoilers, i, playi...
4471     [cobb, it, sucked, i, learned, nothing, man, i...
Name: review, Length: 40000, dtype: object 

12501    [ugh, i, say, ugh, i, rented, film, labeled, s...
40437    [this, is, not, a, love, song, brilliant, exam...
29462    [let, start, saying, i, fan, horror, movies, i...
15043    [deep, blood, its, one, movies, say, another, ...
46029    [i, read, comments, film, judging, average, ra...
 

In [None]:
def get_max_length(reviews=None):
    """Return the mean review length, rounded up, to use as the padded sequence length.

    Despite the name, this is the *mean* number of tokens per review, not the
    maximum: longer reviews get truncated and shorter ones padded to this length.

    Args:
        reviews: Iterable of sized items (token lists or id sequences).
            Defaults to the notebook-level `x_train`.

    Returns:
        The ceiling of the mean item length, as a Python int.

    Raises:
        ValueError: if `reviews` is empty, since no length can be derived.
    """
    if reviews is None:
        reviews = x_train

    review_lengths = [len(review) for review in reviews]
    if not review_lengths:
        raise ValueError('cannot derive a sequence length from an empty collection of reviews')

    return int(np.ceil(np.mean(review_lengths)))

In [None]:
# Map every token to an integer id. lower=False because load_dataset() already
# lower-cased the tokens.
token = Tokenizer(lower=False)
token.fit_on_texts(x_train)

# Derive the padded length from the training reviews before x_train is
# rebound to id sequences below, so the value does not depend on that overwrite.
max_length = get_max_length()

# NOTE: x_train / x_test are rebound from token lists to padded id arrays here,
# so this cell must not be re-run without first re-running the split cell.
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

# Pad / truncate at the end of each review so every row has max_length ids.
x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[ 1632  8883   776 ...   671   132     8]
 [  987  1435 11686 ...   975   347 30149]
 [    8 23754 57103 ...     0     0     0]
 ...
 [  943    49    23 ...  6155 45284    55]
 [    1   158   612 ...   547   433     4]
 [ 6523     7  1922 ...   493    55  1066]] 

Encoded X Test
 [[ 5597     1    58 ...     0     0     0]
 [    8   511   153 ...     2     4 10423]
 [  181   280   528 ...     0     0     0]
 ...
 [    8     3  1156 ...     0     0     0]
 [    5   160   723 ...     0     0     0]
 [  166     1 16317 ...   736   354    47]] 

Maximum review length:  130


In [None]:
EMBED_DIM = 32   # size of each learned word vector
LSTM_OUT = 64    # number of units in the LSTM layer

# Embedding -> LSTM -> single sigmoid unit giving a score in (0, 1).
model = Sequential([
    Embedding(total_words, EMBED_DIM, input_length = max_length),
    LSTM(LSTM_OUT),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           2952160   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2,977,057
Trainable params: 2,977,057
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Write the model to disk each time the monitored metric reaches a new best.
# NOTE(review): 'accuracy' is the training accuracy; the fit call passes no
# validation data, so the "best" checkpoint is the one that fits the training
# set best — confirm this is intended.
checkpoint_options = dict(
    monitor='accuracy',
    save_best_only=True,
    verbose=1,
)
checkpoint = ModelCheckpoint('models/LSTM.h5', **checkpoint_options)

In [None]:
# Train for 5 epochs in mini-batches of 128; the checkpoint callback saves the
# model whenever its monitored metric improves. No validation data is passed,
# so only training metrics are reported per epoch.
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5
Epoch 00001: accuracy improved from -inf to 0.72927, saving model to models/LSTM.h5
Epoch 2/5
Epoch 00002: accuracy improved from 0.72927 to 0.92112, saving model to models/LSTM.h5
Epoch 3/5
Epoch 00003: accuracy improved from 0.92112 to 0.96127, saving model to models/LSTM.h5
Epoch 4/5
Epoch 00004: accuracy improved from 0.96127 to 0.97705, saving model to models/LSTM.h5
Epoch 5/5
Epoch 00005: accuracy improved from 0.97705 to 0.98652, saving model to models/LSTM.h5


<keras.callbacks.History at 0x7fac51e91710>

In [None]:
# Score the held-out reviews and turn the sigmoid outputs into 0/1 classes.
# NOTE(review): the cut-off is 0.7 rather than the conventional 0.5, matching
# the threshold used for the single-review prediction later — confirm intended.
y_prediction = model.predict(x_test, batch_size = 128)
y_prediction_classes = np.where(y_prediction > 0.7, 1, 0)

# predict() returns shape (n, 1); flatten it so it lines up with the 1-D labels.
y_true = np.asarray(y_test)
true = int(np.sum(y_prediction_classes.ravel() == y_true))
total = len(y_true)   # previously referenced `y_pred`, which is never defined

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(total - true))
print('Accuracy: {}'.format(true/total*100))

Correct Prediction: 8687
Wrong Prediction: 1313
Accuracy: 86.87


In [None]:
# Reload the model from the path the ModelCheckpoint callback wrote to during training.
loaded_model = load_model('models/LSTM.h5')

In [None]:
# Prompt for a free-text review to classify (input() already returns a str).
review = input('Movie Review: ')

Movie Review: Movie felt awful and story was not interesting


In [None]:
# Clean the typed review the same way the training reviews are cleaned in
# load_dataset(): strip HTML tags, turn non-letters into spaces (deleting them
# would fuse tokens, e.g. "it's" -> "its"), then drop stop words.
review = re.sub(r'<.*?>', ' ', review)
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub(' ', review)
print(f'Cleaned String: {review}')

# Lower-case before filtering so capitalised stop words ("The", "This") are
# removed too; split() without an argument discards empty tokens produced by
# repeated spaces.
words = review.lower().split()
filtered = [w for w in words if w not in english_stop_words]
# The tokenizer expects a list of texts, so wrap the single cleaned review.
filtered = [' '.join(filtered)]

print(f'Filtered String: {filtered}')

Cleaned String: Movie felt awful and story was not interesting
Filtered String: ['movie felt awful story interesting']


In [None]:
# Encode the cleaned review with the tokenizer fitted on the training set, then
# pad / truncate it at the end to the same length the model was trained on.
encoded_review = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(
    encoded_review,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
print(tokenize_words)

[[  3 344 279  14 128   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]]


In [None]:
# Score the single encoded review with the reloaded model; the final layer is a
# one-unit sigmoid, so the printed array holds one value between 0 and 1.
result = loaded_model.predict(tokenize_words)
print(result)

[[0.02301535]]


In [None]:
# Apply the same 0.7 cut-off used when evaluating the test set.
print('positive' if result > 0.7 else 'negative')

negative
