In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('punkt')

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,Dropout
from sklearn.preprocessing import LabelEncoder

import random
import warnings

# Silence library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')
# Apply seaborn's default plot theme.
sns.set()

# Seed every RNG the notebook touches so a Restart & Run All starts from the
# same state each time. GPU kernels may still introduce small run-to-run
# differences.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Read the IMDB reviews CSV into a DataFrame.
DATA_PATH = '/content/IMDB Dataset.csv'
imdb = pd.read_csv(DATA_PATH)

In [None]:
# Show the first five rows to check the columns that were loaded.
imdb.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Count how many reviews carry each sentiment label.
imdb['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [None]:
# Look at the review stored under index label 0: raw text first, then the
# list of tokens NLTK's word_tokenize produces for it.
text = imdb.loc[0, 'review']
print(text, word_tokenize(text), sep='\n')

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

In [None]:
# One list of lower-cased NLTK tokens per review, in dataset order.
corpus = [
    [token.lower() for token in word_tokenize(review)]
    for review in imdb['review']
]

In [None]:
# Vocabulary cap shared by the Keras Tokenizer (`num_words`) and the Embedding
# layer (`input_dim`). This is a number of distinct words, not a number of
# reviews: tying it to the row count of the dataset would silently resize the
# embedding table whenever the dataset size changes.
MAX_VOCAB_SIZE = 50_000
num_words = MAX_VOCAB_SIZE
print(num_words)

50000


In [None]:
# (rows, columns) of the loaded DataFrame.
imdb.shape

(50000, 2)

In [None]:
# Sequential 80/20 split: the first 80% of the rows become the training set,
# the remainder the test set. Rows are taken in file order, with no shuffling.
train_size = int(len(imdb) * 0.8)
X_train, X_test = imdb['review'].iloc[:train_size], imdb['review'].iloc[train_size:]
y_train, y_test = imdb['sentiment'].iloc[:train_size], imdb['sentiment'].iloc[train_size:]

In [None]:
# Learn the word index from the training reviews only, then turn every
# training review into a fixed-length row of 128 word ids. Longer reviews are
# cut at the end and shorter ones are padded at the end.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)
X_train = pad_sequences(
    tokenizer.texts_to_sequences(X_train),
    maxlen=128,
    padding='post',
    truncating='post',
)

In [None]:
# Encoded training matrix together with the length of one encoded review.
X_train, X_train.shape[1]

(array([[   27,     4,     1, ..., 12191,     2,   392],
        [    3,   394,   120, ...,  1262,  1203,    91],
        [   10,   190,    11, ...,  1446,     2,  5092],
        ...,
        [   11,   118,     6, ...,    79,    10,   111],
        [    1,   192,    12, ...,   124,    21,   159],
        [   10,    25,     5, ...,     3,   824,     2]], dtype=int32),
 128)

In [None]:
# Encode the test reviews with the tokenizer fitted on the training text and
# pad/truncate them exactly like the training rows.
X_test = pad_sequences(
    tokenizer.texts_to_sequences(X_test),
    maxlen=128,
    padding='post',
    truncating='post',
)

In [None]:
# First encoded test review and its length.
X_test[0], X_test.shape[1]

(array([   87,   122,    10,   180,     5,   132,    12,    10,  7131,
         3717,    20,     1,  1001,  2285,     2,    10,   255,     1,
           17,  2431,    10,  1311,     5,   103,     1,   222,  6349,
            4,     3,    19,    11,    17,   974,     3,   351,     5,
          215,  1011,   415,     9,    13,   215,  1380,    56,   235,
          402,   300,     4,   316,    23,   257,    19,   961,    12,
        22250,    12,    33,    66,    61,   212,    53,    16,    11,
          113,    13,   497,     2,     1,   102,    70,  5358,    15,
            1,    88,   172,     1,   473,   824,     8,     1,    64,
           67,    54,    49,  2406,    30,    29,    33,    90,    40,
        35787,    83,    46,   438,     4,     3,    74,   220,     2,
           10,   115,    21,    63,    12,    30,    29,   268,    10,
         1059,   137,    10,    78,    21,   119,    28,    13,     1,
           88,   175,     5,   728,  3423,   108,     8,     1,    17,
      

In [None]:
# Confirm that features and labels line up for both splits.
print(f"{X_train.shape} {y_train.shape}")
print(f"{X_test.shape} {y_test.shape}")

(40000, 128) (40000,)
(10000, 128) (10000,)


In [None]:
# Fit the label encoder on the training labels and apply that same mapping to
# both splits. LabelEncoder orders classes lexicographically, so
# 'negative' -> 0 and 'positive' -> 1.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [None]:
# Binary sentiment classifier: word embeddings -> two stacked LSTMs -> sigmoid.
#
# The input rows are post-padded with 0 up to 128 ids, and the Keras Tokenizer
# never assigns id 0 to a real word. `mask_zero=True` makes the embedding emit
# a mask for those positions, so the LSTMs skip the trailing padding and the
# final state summarises the last real token rather than a run of pad steps.
# The mask flows through the first LSTM because it returns full sequences.
model = Sequential([
    Embedding(
        input_dim=num_words,
        output_dim=100,
        input_length=128,
        mask_zero=True,
        trainable=True,
    ),
    LSTM(100, dropout=0.1, return_sequences=True),
    LSTM(100, dropout=0.1),
    # Single sigmoid unit: the output is the probability of class 1.
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 128, 100)          5000000   
                                                                 
 lstm (LSTM)                 (None, 128, 100)          80400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 5160901 (19.69 MB)
Trainable params: 5160901 (19.69 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train for 5 epochs in mini-batches of 64. The held-out test split is also
# passed as the validation set, so the validation metrics reported per epoch
# are computed on the test data.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5

In [None]:
# Score a hand-written, mixed-sentiment review with the trained model.
# NOTE(review): this tokenise -> pad -> predict sequence is repeated verbatim
# for several sentences in this notebook; a small helper taking the sentence
# as a parameter would remove the duplication.
validation_sentence = ['It had some bad parts like the storyline although the actors performed really well and that is why overall I enjoyed it']
validation_sentence_tokened = tokenizer.texts_to_sequences(validation_sentence)
validation_sentence_padded = pad_sequences(
    validation_sentence_tokened,
    maxlen=128,
    padding='post',
    truncating='post',
)
print(validation_sentence[0])
positive_probability = model.predict(validation_sentence_padded)[0]
print("Probability of Positive {}".format(positive_probability))

It had some bad parts like the storyline although the actors performed really well and that is why overall I enjoyed it
Probability of Positive [0.96568465]


In [None]:
# Score a short, clearly negative review with the trained model.
validation_sentence = ['Movie is Bad']
validation_sentence_tokened = tokenizer.texts_to_sequences(validation_sentence)
validation_sentence_padded = pad_sequences(
    validation_sentence_tokened,
    maxlen=128,
    padding='post',
    truncating='post',
)
print(validation_sentence[0])
positive_probability = model.predict(validation_sentence_padded)[0]
print("Probability of Positive {}".format(positive_probability))

Movie is Bad
Probability of Positive [0.24512449]


In [None]:
# Score a short, clearly positive review with the trained model.
validation_sentence = ['Movie is Very Good, the best']
validation_sentence_tokened = tokenizer.texts_to_sequences(validation_sentence)
validation_sentence_padded = pad_sequences(
    validation_sentence_tokened,
    maxlen=128,
    padding='post',
    truncating='post',
)
print(validation_sentence[0])
positive_probability = model.predict(validation_sentence_padded)[0]
print("Probability of Positive {}".format(positive_probability))

Movie is Very Good, the best
Probability of Positive [0.98227644]


In [None]:
# Score an ambivalent review that mixes positive and negative wording.
validation_sentence = ['The movie is average, kind of good,little bit bad']
validation_sentence_tokened = tokenizer.texts_to_sequences(validation_sentence)
validation_sentence_padded = pad_sequences(
    validation_sentence_tokened,
    maxlen=128,
    padding='post',
    truncating='post',
)
print(validation_sentence[0])
positive_probability = model.predict(validation_sentence_padded)[0]
print("Probability of Positive {}".format(positive_probability))

The movie is average, kind of good,little bit bad
Probability of Positive [0.6668498]
