In [1]:
# sklearn is the machine learning model library, keras(tensorflow) is deep learning model library
# pandas(numpy)
import numpy as np
from keras.datasets import imdb # Balanced dataset, 1 and 0 classes are balanced
from keras.models import Sequential
from keras.layers import Dense # Simple neural network layer
from keras.layers import LSTM # sequence processing
from keras.layers.embeddings import Embedding # to convert words into vectors for processing
from keras.preprocessing import sequence # Zero padding of small sentences to be able to create batches of sentence data of different lengths.

In [2]:
# load the dataset but only keep the top n words, zero the rest
# tf-idf .. unique word size -- 1600
top_words = 5000 # vocabulary size, number of unique words will be 5000 >> 1600
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words) # select those sentences such that number of unique words <= 5000.
print(X_train.shape)
print(X_train[0])
print(len(X_train[0]))
print(X_test.shape)

# All that we need is the serial number of a word in the unique word list and the actual word is not required for saving data space and also processing

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])


(25000,)
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]
218
(25000,)


  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [3]:
unique_word_dictionary = imdb.get_word_index()
# unique_word_dictionary

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [4]:
reverse_index_word_dictionary = {}
for i in unique_word_dictionary:
  if not unique_word_dictionary[i] in reverse_index_word_dictionary:
    reverse_index_word_dictionary[unique_word_dictionary[i]] = i

In [5]:
# reverse_index_word_dictionary

In [6]:
for i in X_train[1]:
  print(reverse_index_word_dictionary[i])

the
thought
solid
thought
and
do
making
to
is
spot
nomination
and
while
he
of
jack
in
where
picked
as
getting
on
was
did
hands
fact
characters
to
always
life
thrillers
not
as
me
can't
in
at
are
br
of
sure
your
way
of
little
it
strongly
random
to
view
of
love
it
so
and
of
guy
it
used
producer
of
where
it
of
here
icon
film
of
outside
to
don't
all
unique
some
like
of
direction
it
if
out
her
imagination
below
keep
of
queen
he
and
to
makes
this
stretch
and
of
solid
it
thought
begins
br
and
and
budget
worthwhile
though
ok
and
and
for
ever
better
were
and
and
for
budget
look
kicked
any
to
of
making
it
out
and
follows
for
effects
show
to
show
cast
this
family
us
scenes
more
it
severe
making
and
to
and
finds
tv
tend
to
of
and
these
thing
wants
but
and
an
and
cult
as
it
is
video
do
you
david
see
scenery
it
in
few
those
are
of
ship
for
with
of
wild
to
one
is
very
work
dark
they
don't
do
dvd
with
those
them


In [7]:
len(X_train[0])

218

In [8]:
X_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 2,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 2,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 2,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 2,
 18,
 51,
 36,
 

In [9]:
np.sum(y_test)

12500

In [10]:
# truncate and pad input sequences
max_review_length = 500 # maximum 500 words in a review/ per example (very big reviews will take lot of computation time)
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) # do sequence zero padding on train data so that all sentences are of 500 words
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length) # do sequence zero padding on test data so that all sentences are of 500 words
# Reviews of length > 500, will get truncated while
# Reviews of length < 500, will get zero padded
# so the end result is that all sentences will of lenghth 500 words.
print(X_train[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [11]:
type(y_train)

numpy.ndarray

In [12]:
# create the model
embedding_vector_length = 32 # Embedding vector is used by the embedding layer internally
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
#top_words = 5000, 5000 unique words
# input length = 500
# input to LSTM should be of size 32, embedding_vector_length .. we can also make it 5000, or we can also keep at 100
model.add(LSTM(100)) # 100 hidden neurons
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# loss='binary_crossentropy'--> logistic loss
# optimizer='adam' for adaptive learning rate
# metrics=['accuracy'], because data is balanced, for imbalanced data use Auc-ROC score
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 32)           160000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)
# every 64 reviews a backpropagation will happen because batch_size = 64

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f5a0ea5f110>

In [14]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 87.23%


In [15]:
from sklearn.metrics import roc_auc_score

In [17]:
print('AUC on train =', roc_auc_score(y_true=y_train, y_score=model.predict_proba(X_train)[:, 0]))



AUC on train = 0.9818249024000001


In [18]:
print('AUC on test =', roc_auc_score(y_true=y_test, y_score=model.predict_proba(X_test)[:, 0]))



AUC on test = 0.9426695871999999
