In [None]:
#Step 1: Import our dataset
from keras.datasets import imdb #IMBD built-in dataset in Keras

#Set vocaab size
vocabulary_size = 5000

#Load in training and test sets
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)

print('We have imported {} training samples and {} test samples.'.format(len(X_train), len(X_test)))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])


We have imported 25000 training samples and 25000 test samples.


  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [None]:
#Inspect a sample review & it's label
print(X_train[7])
print('----Label----')
print(y_train[7])

[1, 4, 2, 716, 4, 65, 7, 4, 689, 4367, 2, 2343, 4804, 2, 2, 2, 2, 2315, 2, 2, 2, 2, 4, 2, 628, 2, 37, 9, 150, 4, 2, 4069, 11, 2909, 4, 2, 847, 313, 6, 176, 2, 9, 2, 138, 9, 4434, 19, 4, 96, 183, 26, 4, 192, 15, 27, 2, 799, 2, 2, 588, 84, 11, 4, 3231, 152, 339, 2, 42, 4869, 2, 2, 345, 4804, 2, 142, 43, 218, 208, 54, 29, 853, 659, 46, 4, 882, 183, 80, 115, 30, 4, 172, 174, 10, 10, 1001, 398, 1001, 1055, 526, 34, 3717, 2, 2, 2, 17, 4, 2, 1094, 871, 64, 85, 22, 2030, 1109, 38, 230, 9, 4, 4324, 2, 251, 2, 1034, 195, 301, 14, 16, 31, 7, 4, 2, 8, 783, 2, 33, 4, 2945, 103, 465, 2, 42, 845, 45, 446, 11, 1895, 19, 184, 76, 32, 4, 2, 207, 110, 13, 197, 4, 2, 16, 601, 964, 2152, 595, 13, 258, 4, 1730, 66, 338, 55, 2, 4, 550, 728, 65, 1196, 8, 1839, 61, 1546, 42, 2, 61, 602, 120, 45, 2, 6, 320, 786, 99, 196, 2, 786, 2, 4, 225, 4, 373, 1009, 33, 4, 130, 63, 69, 72, 1104, 46, 1292, 225, 14, 66, 194, 2, 1703, 56, 8, 803, 1004, 6, 2, 155, 11, 4, 2, 3231, 45, 853, 2029, 8, 30, 6, 117, 430, 19, 6, 2, 9, 

In [None]:
#At the moment all words are represented by integers. We can undo this. 

word2id = imdb.get_word_index()
id2word = {i : word for word, i in word2id.items()}
print('----Review with words ----')
print([id2word.get(i, ' ') for i in X_train[7]])
print('----Label----')
print(y_train[7])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
----Review with words ----
['the', 'of', 'and', 'local', 'of', 'their', 'br', 'of', 'attention', 'widow', 'and', 'captures', 'parties', 'and', 'and', 'and', 'and', 'excitement', 'and', 'and', 'and', 'and', 'of', 'and', 'english', 'and', 'like', 'it', 'years', 'of', 'and', 'unintentional', 'this', 'hitchcock', 'of', 'and', 'learn', 'everyone', 'is', 'quite', 'and', 'it', 'and', 'such', 'it', 'bonus', 'film', 'of', 'too', 'seems', 'he', 'of', 'enough', 'for', 'be', 'and', 'editing', 'and', 'and', 'please', 'great', 'this', 'of', 'shoots', 'thing', '3', 'and', "it's", 'mentioning', 'and', 'and', 'given', 'parties', 'and', 'back', 'out', 'interesting', 'times', 'no', 'all', 'average', 'talking', 'some', 'of', 'nor', 'seems', 'into', 'best', 'at', 'of', 'every', 'cast', 'i', 'i', 'inside', 'keep', 'inside', 'large', 'viewer', 'who', 'obscure', 'and', 'and', 'and', 'movie', 'of', 'and', 'en

In [None]:
#What's the max and min length of any of these reviews?

print('The max length is {}'.format(len(max((X_train + X_test), key=len))))
print('The min length is {}'.format(len(min((X_train + X_test), key=len))))

The max length is 2697
The min length is 70


In [None]:
#We cannot feed these data into a RNN without making all input documents of equal length. 
#We will limit the length to max_words by truncating longer reviews and padding shorter reviews. 

from keras.preprocessing import sequence

max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen = max_words)

In [None]:
#Design our RNN model! 

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential() #Building a sequential neural network
embedding_size = 32 #Not sure what this does 
model.add(Embedding(vocabulary_size, embedding_size, input_length = max_words))
model.add(LSTM(100))
model.add(Dense(1, activation = 'sigmoid'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 32)           160000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


Our model has 213,301 parameters to train. It includes the following:


*   Takes in the review and embeds it into a dense vector space rather than a sparse bag-of-words model, creating a dense array;
*   The dense array is taken by the Long Short-Term Memory layer and turned into a single vector of size 100;
*   The sigmoid activation function outputs a value between 0 and 1. 



In [None]:
#Compile:
model.compile(optimizer='Adam', loss = 'binary_crossentropy', metrics= ['accuracy'])

Now we have finished compilation, we can begin *training* the model. 
We will specify the **batch size** ```batch_size``` and the **number of training epochs** ```num_epochs```. 



In [None]:
batch_size = 64
num_epochs = 3

#Reserve validation data - take off the last batch for use as a validation set
X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]

#Train model!!!
model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size = batch_size, epochs = num_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fb091e48f90>

In [9]:
scores = model.evaluate(X_test, y_test, verbose = 0)
print('Model accuracy: {}'.format(scores[1]))

Model accuracy: 0.8615599870681763
