In [1]:
import pandas as pd
import sklearn
import nltk
import re

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, accuracy_score

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [2]:
df = pd.read_csv(r"C:\Users\hp\Downloads\archive (1)\IMDB Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.shape

(50000, 2)

In [5]:
#Check for duplicates and remove them
df.drop_duplicates (inplace = True)

In [6]:
print(f"Shape of the dataframe after duplicates removal: {df.shape}.")

Shape of the dataframe after duplicates removal: (49582, 2).


In [7]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [8]:
df['sentiment'].value_counts()

positive    24884
negative    24698
Name: sentiment, dtype: int64

In [9]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
# stopwords_list

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
stopwords_list.remove('no')
stopwords_list.remove('not')

In [11]:
df['sentiment'][6]

'positive'

In [13]:
x_data = df['review']       # Reviews/Input
y_data = df['sentiment']    # Sentiment/Output
english_stops = set(stopwords.words('english'))

# PRE-PROCESS REVIEW
x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
cleaned_reviews = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

In [14]:
cleaned_reviews

0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 49582, dtype: object

In [16]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [17]:
X=cleaned_reviews
y=df['sentiment']

In [18]:
X

0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 49582, dtype: object

In [19]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [20]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [22]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [23]:
# ENCODE REVIEW
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[  34  118  323 ...    0    0    0]
 [   4 1012  558 ... 6720  345   17]
 [ 275  394 4605 ...    0    0    0]
 ...
 [  49 1088  449 ... 2803   72  254]
 [  55 1347   92 ...  963 1941  716]
 [   1  101   47 ...    1  105  126]] 

Encoded X Test
 [[ 5002  6845   934 ... 17301   836   194]
 [    2  2240    38 ...  1177  1984  1930]
 [  684   396  1539 ...   582 19684    75]
 ...
 [    1    34   350 ...    30     3   562]
 [  123    18  9202 ...     0     0     0]
 [  461  1998    56 ...     0     0     0]] 

Maximum review length:  130


In [24]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           2960928   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2,985,825
Trainable params: 2,985,825
Non-trainable params: 0
_________________________________________________________________
None


In [25]:
from tensorflow.keras.callbacks import ModelCheckpoint 
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [26]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.73871, saving model to models\LSTM.h5
Epoch 2/5
Epoch 2: accuracy improved from 0.73871 to 0.92139, saving model to models\LSTM.h5
Epoch 3/5
Epoch 3: accuracy improved from 0.92139 to 0.96087, saving model to models\LSTM.h5
Epoch 4/5
Epoch 4: accuracy improved from 0.96087 to 0.97766, saving model to models\LSTM.h5
Epoch 5/5
Epoch 5: accuracy improved from 0.97766 to 0.98608, saving model to models\LSTM.h5


<keras.callbacks.History at 0x26640e61970>

In [34]:
predict=model.predict(x_test) 




In [35]:
predict

array([[0.26423803],
       [0.0081638 ],
       [0.00476506],
       ...,
       [0.00393365],
       [0.997298  ],
       [0.9975193 ]], dtype=float32)

In [36]:
predict.shape

(9917, 1)

In [38]:
predict[0]

array([0.26423803], dtype=float32)

In [42]:
y_predict = np.argmax(model.predict(x_test), axis=-1)



In [43]:
y_predict[0]

0

In [44]:
y_test[0]

1

In [None]:
print("Accuracy score of {} is:{}".format(classifier, accuracy_score(y_test, y_pred)))