In [219]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

In [2]:
from keras.models import Sequential
from keras.layers import Dense,GRU,Embedding
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
import imdb

## Load data

In [4]:
imdb.download_and_extract()

Data has apparently already been downloaded and unpacked.


In [5]:
# Load the review texts and their sentiment labels for the training and test
# splits through the `imdb` module imported above.
x_train_text, y_train = imdb.load_data(train=True)
x_test_text, y_test = imdb.load_data(train=False)


In [6]:
print("Train-set size: ", len(x_train_text))
print("Test-set size:  ", len(x_test_text))

Train-set size:  25000
Test-set size:   25000


In [7]:
# Concatenate the train and test reviews into one corpus; the tokenizer
# vocabulary is fitted on this combined list further down.
# NOTE(review): this lets test-set text influence the word index — consider
# fitting on x_train_text only if a strict train/test separation is required.
data_text = x_train_text + x_test_text

In [8]:
x_train_text[12500]

'Where to begin, there\'s so much wrong and horrible about this movie I am not sure where to start. Okay, the two stooges who wrote this crapper. Joseph Green and Rex Carlton, first they couldn\'t make up their so-called minds for a name. My guess they split the difference, that\'s why the main title is BRAIN THAT WOULDN\'T DIE, but the end screen says HEAD THAT WOULDN\'T DIE. Neither one knows anything about the Medical profession. After all Doctors take oaths to "do no harm". Killing a woman for a head transplant would be considered "harm". Plus, a little thing called blood and tissue matching. Rejection would spell death for Jan in the pan. Plus who keeps a patch work monster. What medical school did Bill graduate from, FRANKENSTIEN UNIVERSITY? Old FU, or MAD SCIENTIST TECH? The monster had no name, that bugs the hell out of me. Plus, the brilliant surgeon Doctor Bill Cortner doesn\'t know how to keep a patient sedated? All and all a disaster of a movie, it\'s incredibly stupid and 

In [9]:
y_train[12500]  #negative sentiment

0.0

## Tokenizer

In [10]:
# Vocabulary size limit handed to the Tokenizer; the same value is reused
# as the Embedding layer's input_dim when the model is built.
num_words = 10000

In [11]:
# Tokenizer converts words to integers 
tokenizer = Tokenizer(num_words=num_words)

In [12]:
%%time
tokenizer.fit_on_texts(data_text) #building a vocabulary

CPU times: user 8.64 s, sys: 0 ns, total: 8.64 s
Wall time: 8.64 s


In [14]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [15]:
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)

In [16]:
x_train_text[1]

'This movie was awesome...it made me laugh, it make a bawl, and most of all it has talking animals in it!! this movie should be seen by all kinds of people! it is one of my favorite movies, and i just love it so much that i just had to comment on it!!!it rox! it is so heart felt and a wonderful storyline that makes up a great and heartfelt movie!my favorite character is shadow. this is because i think that he is the most interesting and charming. i used to have a golden retriever just like shadow, i miss him so much!!! he was my best friend and i knew that when he died, he would be in a happier place, but i miss him with all of my heart!! this movie is the best i love it and everyone should! Love your pets no matter what they do, cherish them forever!!!'

In [17]:
np.array(x_train_tokens[1])

array([  11,   17,   13, 1169,    9,   90,   68,  423,    9,   94,    3,
          2,   88,    4,   29,    9,   45,  681, 1598,    8,    9,   11,
         17,  142,   26,  107,   31,   29, 2784,    4,   83,    9,    6,
         27,    4,   56,  521,   97,    2,   10,   39,  112,    9,   34,
         72,   12,   10,   39,   66,    5,  929,   20,    9,    9,    9,
          6,   34,  488,  436,    2,    3,  393,  773,   12,  162,   53,
          3,   78,    2, 5686,   17,   56,  521,  108,    6, 2744,   11,
          6,   84,   10,  101,   12,   28,    6,    1,   88,  218,    2,
       1313,   10,  328,    5,   25,    3, 1877,   39,   37, 2744,   10,
        698,   87,   34,   72,   28,   13,   56,  116,  444,    2,   10,
        677,   12,   50,   28, 1083,   28,   58,   26,    8,    3, 9719,
        274,   18,   10,  698,   87,   16,   29,    4,   56,  488,   11,
         17,    6,    1,  116,   10,  112,    9,    2,  304,  142,  112,
        125,   54,  504,   48,   33,   77,   93, 14

In [18]:
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

## Padding and truncating data

In [19]:
# Number of tokens in each review, over the training and test sets combined,
# collected straight into a NumPy array.
num_tokens = np.array([len(seq) for seq in x_train_tokens + x_test_tokens])

In [20]:
# Mean number of tokens per review (training and test sets combined).
np.mean(num_tokens)

221.27716

In [21]:
# Length of the longest review in tokens (training and test sets combined),
# measured before any padding or truncation.
np.max(num_tokens)

2209

In [22]:
# Cap the sequence length at the mean plus two standard deviations of the
# per-review token counts, truncated to a whole number.
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
max_tokens  # sequences are padded / truncated to this length below

544

In [23]:
# Fraction of reviews that fit within max_tokens without being truncated.
# A review of exactly max_tokens tokens is kept whole by pad_sequences, so the
# comparison is inclusive (<=), and the mean of the boolean mask gives the
# fraction directly. With the two-standard-deviation cap this is about 95%.
np.mean(num_tokens <= max_tokens)

0.94528

In [24]:
# Used for both the `padding` and `truncating` arguments of pad_sequences:
# short reviews get zeros prepended, long reviews lose tokens from the start.
pad = 'pre'

In [25]:
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,
                            padding=pad, truncating=pad)

In [26]:
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [27]:
x_train_pad.shape

(25000, 544)

In [28]:
x_test_pad.shape

(25000, 544)

In [29]:
np.array(x_train_tokens[1])

array([  11,   17,   13, 1169,    9,   90,   68,  423,    9,   94,    3,
          2,   88,    4,   29,    9,   45,  681, 1598,    8,    9,   11,
         17,  142,   26,  107,   31,   29, 2784,    4,   83,    9,    6,
         27,    4,   56,  521,   97,    2,   10,   39,  112,    9,   34,
         72,   12,   10,   39,   66,    5,  929,   20,    9,    9,    9,
          6,   34,  488,  436,    2,    3,  393,  773,   12,  162,   53,
          3,   78,    2, 5686,   17,   56,  521,  108,    6, 2744,   11,
          6,   84,   10,  101,   12,   28,    6,    1,   88,  218,    2,
       1313,   10,  328,    5,   25,    3, 1877,   39,   37, 2744,   10,
        698,   87,   34,   72,   28,   13,   56,  116,  444,    2,   10,
        677,   12,   50,   28, 1083,   28,   58,   26,    8,    3, 9719,
        274,   18,   10,  698,   87,   16,   29,    4,   56,  488,   11,
         17,    6,    1,  116,   10,  112,    9,    2,  304,  142,  112,
        125,   54,  504,   48,   33,   77,   93, 14

If we had padded with 'post', the integer tokens would be fed in first, followed by a long run of zeros. This may confuse the Recurrent Neural Network, which is why 'pre' padding is used here.

In [30]:
x_train_pad[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

## Token to Text (Inverse Mapping)

In [31]:
# Invert the tokenizer's word -> index vocabulary so that integer tokens can
# be looked up and turned back into words.
idx = tokenizer.word_index
inverse_map = {index: word for word, index in idx.items()}

In [32]:
def tokens_to_string(tokens, mapping=None):
    """Convert a sequence of integer tokens back into a space-separated string.

    Args:
        tokens: Iterable of integer tokens. Zeros are treated as padding
            and skipped.
        mapping: Optional dict from integer token to word. When omitted, the
            notebook-level ``inverse_map`` is used, so existing one-argument
            calls behave as before.

    Returns:
        The words for the non-zero tokens, joined by single spaces.

    Raises:
        KeyError: If a non-zero token has no entry in the mapping.
    """
    if mapping is None:
        # Fall back to the notebook-level inverse vocabulary.
        mapping = inverse_map

    # Look up every real token; 0 is the padding value and has no word.
    return " ".join(mapping[token] for token in tokens if token != 0)

In [33]:
x_train_text[2]

'This is a very good, under-rated action/drama/and slightly historical movie.<br /><br />The basic story concerns Rob Roy\'s borrowing of 1000 pounds, its theft, and the problems it causes for his family and indirectly his clansmen.<br /><br />Cunningham( Tim Roth) is an amazing villain and character in this story. Brutally cold and if you watch his face he seems to be able to turn his eyes off and look completely evil.<br /><br />Rob Roy (Liam Neeson) is excellent too, but i think the writers used the word "honour" 1 too many times.<br /><br />The rest of the cast is strong, and the whole movie is very well acted and filmed.<br /><br />The Action is exciting and the sword play very realistic, but not too gory. The story is good and you really want Rob to win.<br /><br />All in all just shy of a classic.'

In [34]:
tokens_to_string(x_train_pad[2])

'this is a very good under rated action drama and slightly historical movie br br the basic story concerns rob of 1000 pounds its theft and the problems it causes for his family and his br br cunningham tim roth is an amazing villain and character in this story brutally cold and if you watch his face he seems to be able to turn his eyes off and look completely evil br br rob roy liam is excellent too but i think the writers used the word honour 1 too many times br br the rest of the cast is strong and the whole movie is very well acted and filmed br br the action is exciting and the sword play very realistic but not too gory the story is good and you really want rob to win br br all in all just shy of a classic'

In [134]:
model = Sequential()

In [135]:
embedding_size = 8

In [136]:
model.add(Embedding(input_dim = num_words,
                    output_dim = embedding_size,
                    input_length = max_tokens,
                    name = 'Embedding_layer'))

In [137]:
model.add(GRU(units=16,return_sequences=True)) #Also return sequences to next layer

In [138]:
model.add(GRU(units=8,return_sequences=True))

In [139]:
model.add(GRU(units=4))

In [140]:
model.add(Dense(1, activation='sigmoid'))

In [141]:
# Adam optimizer with a step size of 1e-3.
# NOTE(review): newer Keras releases spell this argument `learning_rate`
# instead of `lr` — confirm against the installed version before re-running.
optimizer = Adam(lr=1e-3)

In [142]:
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [143]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding_layer (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
gru_28 (GRU)                 (None, 544, 16)           1200      
_________________________________________________________________
gru_29 (GRU)                 (None, 544, 8)            600       
_________________________________________________________________
gru_30 (GRU)                 (None, 4)                 156       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 5         
Total params: 81,961
Trainable params: 81,961
Non-trainable params: 0
_________________________________________________________________


In [144]:
%%time
model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=3, batch_size=64)

Train on 23750 samples, validate on 1250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 25min 4s, sys: 4min 33s, total: 29min 38s
Wall time: 9min 34s


<keras.callbacks.History at 0x7f35ad67ac50>

In [145]:
# Evaluate on the padded test set. Despite its name, `accuracy` holds the whole
# result of evaluate(); the print cell below reads the accuracy from index 1.
# Presumably index 0 is the loss, given the compile() call — verify.
accuracy = model.evaluate(x_test_pad,y_test)



In [146]:
print("Accuracy =", accuracy[1]*100,"%")

Accuracy = 86.664 %


In [147]:
model.save("model.h5")

## Check Mis-Classified Text

In [158]:
%%time
# Predict classes for the first 1000 test reviews. The model ends in a single
# sigmoid unit, so the class is obtained by thresholding the predicted
# probability at 0.5. This replaces Sequential.predict_classes, which is not
# available in newer Keras / TensorFlow releases.
probabilities = model.predict(x=x_test_pad[0:1000])
cls_pred = (probabilities > 0.5).astype('int32')
# predict() returns one column per output unit; take that single column as a
# flat vector so it lines up with the label vector.
cls_pred = cls_pred.T[0]
print(cls_pred)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1
 0 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1
 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1
 1 1 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1
 0 1 1 1 1 1 1 1 1 1 1 1 

In [163]:
cls_true = np.array(y_test[:1000])
cls_true

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [174]:
# Positions where the predicted class differs from the true label.
# np.where is called with only a condition, so the result is indexed with [0]
# to get the array of positions.
incorrect = np.where(cls_pred != cls_true)
incorrect[0]

array([ 17,  18,  20,  23,  25,  36,  43,  44,  68,  71,  74,  76,  79,
        82,  96, 114, 126, 138, 143, 150, 155, 162, 167, 191, 198, 209,
       221, 226, 231, 234, 258, 274, 286, 304, 318, 323, 346, 348, 368,
       373, 385, 394, 395, 397, 403, 410, 411, 417, 440, 467, 478, 481,
       516, 536, 537, 552, 559, 569, 574, 580, 584, 595, 607, 609, 610,
       686, 689, 691, 692, 711, 721, 726, 746, 750, 758, 759, 768, 776,
       783, 784, 788, 794, 795, 799, 804, 806, 809, 810, 813, 814, 815,
       817, 830, 858, 870, 891, 892, 899, 906, 926, 932, 937, 938, 948,
       956, 957, 973, 975, 978, 985, 986])

In [175]:
len(incorrect[0])

111

### Mis-classified text

In [180]:
x_test_text[incorrect[0][0]]

'Seriously, I don´t really get why people here are bashing it. I mean,<br /><br />the idea of a killer snowman wreaking havoc on a tropical island paradise is pretty absurd. The good news is, the producers realized it and made it a comedy in the vein of Army of Darkness. <br /><br />Especially in the second half of the film, when the little killer snowballs attack, I laughed my ass off. For example, the put one of the little creeps into a blender (a la Gremlins 1) and mix it. After that, it morphs back into a snowball and squeals with a high pitched voice "That was fun!".<br /><br />Bottom line - incredible movie, rent it.'

### New Data

In [198]:
text1 = "This movie is fantastic! I really like it because it is so good!"
text2 = "Good movie!"
text3 = "Maybe I like this movie."
text4 = "Meh ..."
text5 = "If I were a drunk teenager then this movie might be good."
text6 = "Bad movie!"
text7 = "Not a good movie!"
text8 = "This movie really sucks! Can I get my money back please?"
texts = [text1, text2, text3, text4, text5, text6, text7, text8]

In [199]:
tokens = tokenizer.texts_to_sequences(texts)

In [200]:
token_pad = pad_sequences(tokens, maxlen=max_tokens, padding=pad,truncating=pad)
token_pad

array([[  0,   0,   0, ...,   6,  34,  49],
       [  0,   0,   0, ...,   0,  49,  17],
       [  0,   0,   0, ...,  37,  11,  17],
       ...,
       [  0,   0,   0, ...,   0,  74,  17],
       [  0,   0,   0, ...,   3,  49,  17],
       [  0,   0,   0, ..., 290, 141, 592]], dtype=int32)

In [202]:
model.predict(token_pad)

array([[0.8967977 ],
       [0.71277875],
       [0.49117064],
       [0.6211485 ],
       [0.40174288],
       [0.25788698],
       [0.631384  ],
       [0.24203168]], dtype=float32)

### Embeddings

In [204]:
layer_embedding = model.get_layer('Embedding_layer')

In [208]:
embedding_weights = layer_embedding.get_weights()[0]

In [209]:
embedding_weights.shape

(10000, 8)

In [210]:
token_good = tokenizer.word_index['good']
token_good

49

In [211]:
token_great = tokenizer.word_index['great']
token_great


78

In [212]:
embedding_weights[token_good]

array([ 0.07034972, -0.00783297,  0.09449942,  0.05234849, -0.04407967,
        0.02403229,  0.00120365, -0.0273092 ], dtype=float32)

In [213]:
embedding_weights[token_great]

array([ 0.10237348,  0.12813893,  0.07358269,  0.08561803, -0.08031715,
        0.08874631,  0.10133855, -0.1064107 ], dtype=float32)

In [218]:
token_bad = tokenizer.word_index['bad']
token_horrible = tokenizer.word_index['horrible']
token_bad,token_horrible

(74, 489)

In [216]:
embedding_weights[token_bad]

array([-0.10842818, -0.12535363, -0.09989487, -0.13759847,  0.07413789,
       -0.08203904, -0.09880546,  0.14099665], dtype=float32)

In [217]:
embedding_weights[token_horrible]

array([-0.18104154, -0.18058367, -0.14930211, -0.2077289 ,  0.10702441,
       -0.12166744, -0.16186088,  0.18688914], dtype=float32)