In [9]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout, SimpleRNN

In [2]:
# load data
vocabulary_size = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)
print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Loaded dataset with 25000 training samples, 25000 test samples


In [7]:
# Get word index
word_index = imdb.get_word_index()
index_to_word = {i: word for word, i in word_index.items()}

# Check a sample of the data
print('Review:', ' '.join([index_to_word.get(i) for i in X_train[0]]))
print('\nLabel:', y_train[0])  # 1 for positive, 0 for negative

# Check the maximum and minimum length of the reviews
max_review_length = max([len(x) for x in X_train + X_test])
min_review_length = min([len(x) for x in X_train + X_test])
print('\nMaximum review length:', max_review_length)
print('Minimum review length:', min_review_length)

Review: the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room and it so heart shows to years of every never going and help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but and to story wonderful that in seeing in character to of 70s and with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other and in of seen over and for anyone of and br show's to whether from than out themselves history he name half some br of and odd was two most of mean for 1 any an boat she he should is thought and but of script you not while history he heart to real at and but when from one bit then have two of 

In [11]:
# Pad the reviews to the same length
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen = max_words)
X_test = sequence.pad_sequences(X_test, maxlen = max_words)

In [12]:
# Create the model
embedding_size = 32  # Dimension of the embedding vector
model = Sequential()
# The input to the embedding layer is a matrix of integers (the indices of the words from word_index dict).
# So, for each review, we have a matrix of shape (max_words, 1). As of now, each word is just a single integer.
# The embedding layer will learn a vector of shape (max_words, embedding_size)
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(SimpleRNN(100))
model.add(Dense(1, activation = 'sigmoid'))

print(model.summary())

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 32)           160000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               13300     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 173,401
Trainable params: 173,401
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
# Train the model
batch_size = 64
epochs = 3
model.fit(X_train, y_train, batch_size=batch_size, epochs = epochs, validation_split = 0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2a454a835b0>

In [14]:
# Evaluate the model
score, acc = model.evaluate(X_test, y_test)
print('Test accuracy:', acc)

Test accuracy: 0.6984000205993652


In [31]:
# Should print a very high score like 0.98.
test_1 = np.array([word_index[j] for j in "i loved it highly recommend it to anyone and everyone looking for a great movie to watch".split()])
test_1 = sequence.pad_sequences([test_1], maxlen = max_words)
print(model.predict(test_1))

# Should print a very low score like 0.01.
test_2 = np.array([ word_index[j] for j in "this was awful i hated it so much nobody should watch this the acting was terrible the music was terrible overall it was just bad".split()])
test_2 = sequence.pad_sequences([test_2], maxlen = max_words)
print(model.predict(test_2))  # I guess, it misclassified the review as positive.

[[0.7816201]]
[[0.84944516]]


In [None]:
# Further improvements:
# - Try to use a bidirectional RNN.
# - Try Dropout.
# - Try LSTM.
# - Maybe include more stuff in the vocabulary, like punctuation, etc.