In [3]:
# Cache the dataset
from os import listdir, makedirs
from os.path import join, exists, expanduser

# Make sure ~/.keras/datasets exists; it is the target directory of the copy
# commands suggested below.
cache_dir = expanduser(join('~', '.keras'))
datasets_dir = join(cache_dir, 'datasets')
# exist_ok=True creates the missing parent (cache_dir) as well and avoids the
# check-then-create race of a separate exists() test.
makedirs(datasets_dir, exist_ok=True)

# If you have multiple input files, change the below cp commands accordingly, typically:
# !cp ../input/keras-imdb-reviews/imdb* ~/.keras/datasets/
# !cp ../input/imdb* ~/.keras/datasets/

In [4]:
from tensorflow import keras

# Step by step

In [24]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, GRU, Flatten, LSTM
# from keras.layers.embeddings import Embedding
from keras.layers import Embedding
# from keras.preprocessing import sequence
from keras.utils import pad_sequences

# Vocabulary cap: only the `top_words` most frequent words keep their own index;
# rarer words are mapped to the reserved "unknown" index rather than dropped.
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Bring every review to exactly `max_words` tokens. Shorter reviews are
# padded with 0 at the front (see the printed samples further down).
max_words = 500
X_train, X_test = (
    pad_sequences(reviews, maxlen=max_words) for reviews in (X_train, X_test)
)


In [19]:
# Sanity check: the padded reviews are held in a single NumPy array.
type(X_train)

numpy.ndarray

In [22]:
# Sanity check: expect (number of reviews, max_words).
X_train.shape

(25000, 500)

## Get a feel for the data

In [6]:
# imdb.get_word_index() gives a {word: integer index} dictionary.
word_index = imdb.get_word_index()
# Invert it to {integer index: word} so encoded reviews can be turned back into text.
reverse_word_index = {index: word for word, index in word_index.items()}
# We decode the review; note that our indices were offset by 3
# because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".

def print_review_text(review_one_hot):
    """Print an encoded review, then its decoded text.

    Args:
        review_one_hot: Iterable of integer word indices as stored in the
            dataset (despite the name, these are indices, not one-hot vectors).

    The raw sequence is printed first, followed by the decoded words joined
    by single spaces. Dataset indices are offset by 3 relative to
    ``reverse_word_index`` because 0, 1 and 2 are reserved for padding,
    start-of-sequence and unknown. Start-of-sequence and unknown are shown
    as '?'. Padding (0) is skipped so that a padded review is not preceded
    by a long run of '?' placeholders.
    """
    print(review_one_hot)
    # Index 0 is only ever padding, so it carries no text.
    words = [reverse_word_index.get(i - 3, '?') for i in review_one_hot if i != 0]
    print(' '.join(words))

def print_label_text(label_integer):
    """Print "Positive" for a truthy label and "Negative" for a falsy one."""
    if label_integer:
        print("Positive")
    else:
        print("Negative")


In [7]:
# Show the first few training reviews, each followed by its sentiment label.
sample_count = 10
for review_one_hot, label_integer in zip(X_train[:sample_count], y_train[:sample_count]):
    print_review_text(review_one_hot)
    print("\n")
    print_label_text(label_integer)
    print("\n")

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

**Option 1: Define a simple model (multilayer neural network)**

In [11]:
# Option 1: multilayer perceptron (MLP)
#
# Layers:
# 1. Embedding: one 32-dimensional vector per word index (vocabulary size
#    top_words), for input sequences of length max_words
# 2. Flatten: concatenates the per-word vectors into one feature vector
# 3. Dense: 250 units, relu activation
# 4. Dense: 1 unit, sigmoid activation
# Compiled with binary_crossentropy loss, the adam optimizer and an accuracy metric.

model = Sequential()
# Use max_words instead of a literal 500 so the model always matches the
# length the reviews were padded to.
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None; wrapping it in print()
# appended a stray "None" line to the cell output.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 32)           160000    
                                                                 
 flatten (Flatten)           (None, 16000)             0         
                                                                 
 dense (Dense)               (None, 250)               4000250   
                                                                 
 dense_1 (Dense)             (None, 1)                 251       
                                                                 
Total params: 4,160,501
Trainable params: 4,160,501
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
# Option 2: recurrent model (LSTM)
#
# Layers:
# 1. Embedding: input dim top_words, output dim 32 (the embedding vector
#    length), input length max_words
# 2. LSTM: 100 units
# 3. Dense: 1 unit, sigmoid activation
# Compiled with binary_crossentropy loss, the adam optimizer and an accuracy metric.
#
# Note: this rebinds `model`, so any model previously held in that name is replaced.
model = Sequential([
    Embedding(top_words, 32, input_length=max_words),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [15]:
# Quick check that the training loop runs: three epochs in batches of 64.
model.fit(X_train, y_train, batch_size=64, epochs=3)
# Score on the test split; index 1 of the result is the accuracy metric.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (100 * scores[1],))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 74.61%


In [25]:
# Re-print the accuracy held in `scores`. Both the quick training-loop test and
# the full training cell assign `scores`, so the value shown here depends on
# which of those cells ran most recently.
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 86.02%


**Train the model**

In [16]:
# Full training run: ten epochs, batches of 128, one log line per epoch.
# NOTE(review): the test split is also passed as validation data, so the
# accuracy reported below is not an independent hold-out estimate.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=128,
    validation_data=(X_test, y_test),
    verbose=2,
)
# Score the trained model on the test split and report accuracy as a percentage.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (100 * scores[1],))

Epoch 1/10
196/196 - 138s - loss: 0.2360 - acc: 0.9075 - val_loss: 0.3189 - val_acc: 0.8663 - 138s/epoch - 704ms/step
Epoch 2/10
196/196 - 140s - loss: 0.1831 - acc: 0.9330 - val_loss: 0.3075 - val_acc: 0.8752 - 140s/epoch - 717ms/step
Epoch 3/10
196/196 - 145s - loss: 0.1559 - acc: 0.9439 - val_loss: 0.3463 - val_acc: 0.8766 - 145s/epoch - 742ms/step
Epoch 4/10
196/196 - 148s - loss: 0.1633 - acc: 0.9401 - val_loss: 0.3570 - val_acc: 0.8727 - 148s/epoch - 757ms/step
Epoch 5/10
196/196 - 146s - loss: 0.1308 - acc: 0.9535 - val_loss: 0.4034 - val_acc: 0.8676 - 146s/epoch - 745ms/step
Epoch 6/10
196/196 - 146s - loss: 0.1188 - acc: 0.9582 - val_loss: 0.3815 - val_acc: 0.8662 - 146s/epoch - 745ms/step
Epoch 7/10
196/196 - 146s - loss: 0.1526 - acc: 0.9437 - val_loss: 0.4158 - val_acc: 0.8644 - 146s/epoch - 746ms/step
Epoch 8/10
196/196 - 151s - loss: 0.1295 - acc: 0.9511 - val_loss: 0.3987 - val_acc: 0.8614 - 151s/epoch - 771ms/step
Epoch 9/10
196/196 - 151s - loss: 0.0942 - acc: 0.9674 -

**Make predictions with new reviews**

In [17]:
# Run the trained model on the test reviews (standing in for new, unseen reviews).
# The model's final layer is a single sigmoid unit, so each prediction is a score
# between 0 and 1 rather than a hard 0/1 label.
y_predict = model.predict(X_test)



In [18]:
# Compare the true sentiment with the model's prediction for the first ten
# test reviews, printing both as Positive/Negative so they are directly comparable.
for review_one_hot, real_label_integer, predicted_score in zip(X_test[:10], y_test[:10], y_predict[:10]):
    print_review_text(review_one_hot)
    print("\n")
    print("Actual:")
    print_label_text(real_label_integer)
    print("\n")
    print("Predicted:")
    # Each prediction row holds the single sigmoid output of the model;
    # 0.5 is the usual decision threshold for turning it into a label.
    positive_probability = float(predicted_score[0])
    print_label_text(positive_probability >= 0.5)
    print("(probability of positive: %.3f)" % positive_probability)
    print("\n")

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 