# Sentiment analysis CNN in Keras - example of suing IMDb dataset with Glove pretrained word embeddings

## Modules import

In [1]:
%matplotlib inline
from __future__ import division, print_function 
from utils import *

In [2]:
import numpy as np

In [3]:
from keras.utils import *
from keras.models import Sequential
from keras.layers import Embedding, Dense, Activation, Dropout, Flatten, Convolution1D, SpatialDropout1D, MaxPooling1D
from keras import metrics #Only one metric is supported at the moment and that is accuracy
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [4]:
import zipfile
import pickle
import re

#### Model path

In [5]:
model_path = 'C:/Users/Gavrilov/My Projects/Sentiment Analysis CNN/'

## IMDb Dataset setup

In [6]:
from keras.datasets import imdb 
    #dataset of 25,000 movies reviews from IMDB #https://keras.io/datasets/
    #each review labeled by sentiment (positive/negative)
    #each review is encoded as a sequence of word indexes (integers)
    #words are indexed by overall frequency in the dataset (integer "3" encodes the 3rd most frequent word)
    #"0" does not stand for a specific word, but instead is used to encode any unknown word

### IMDB Keras words frequency - dictionary manipulation

##### dictionary of word (key) and index (value) - which is necessary when passing indices to the model

In [7]:
dataset_words_idx = imdb.get_word_index() #IMDb indices assignment to idx variable

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [8]:
print("Index of word 'the' is", dataset_words_idx['the'])
print({i:dataset_words_idx[i] for i in list(dataset_words_idx)[:20]}) #print out some elements

Index of word 'the' is 1
{'fawn': 34701, 'tsukino': 52006, 'nunnery': 52007, 'sonja': 16816, 'vani': 63951, 'woods': 1408, 'spiders': 16115, 'hanging': 2345, 'woody': 2289, 'trawling': 52008, "hold's": 52009, 'comically': 11307, 'localized': 40830, 'disobeying': 30568, "'royale": 52010, "harpo's": 40831, 'canet': 52011, 'aileen': 19313, 'acurately': 52012, "diplomat's": 52013}


##### dictionary of index (value) and word (key) - which is necessary when decoding models predictions

In [9]:
dataset_idx_words = {value: key for key, value in iter(dataset_words_idx.items())} 
    #mapping of words to indices will give dictionary, where key and value positions switched
#dataset_idx_words = {}
#for key, value in iter(dataset_words_idx.items()): #iter(ind_words.items()) -iterates over the dictionary returning key, value 
#    dataset_idx_words[value]=key

In [10]:
print(dataset_idx_words[34701], dataset_idx_words[52006], dataset_idx_words[52007])
print({i:dataset_idx_words[i] for i in list(dataset_idx_words)[:20]}) #printing out of mapped elements of new dictionary

fawn tsukino nunnery
{34701: 'fawn', 52006: 'tsukino', 52007: 'nunnery', 16816: 'sonja', 63951: 'vani', 1408: 'woods', 16115: 'spiders', 2345: 'hanging', 2289: 'woody', 52008: 'trawling', 52009: "hold's", 11307: 'comically', 40830: 'localized', 30568: 'disobeying', 52010: "'royale", 40831: "harpo's", 52011: 'canet', 19313: 'aileen', 52012: 'acurately', 52013: "diplomat's"}


### Dataset frequent words in sorted array 

In [11]:
dataset_words_arr_sorted = sorted(dataset_words_idx, key=dataset_words_idx.get)
    #sorted() returns sorted list but not mutates the original list
    #idx is iterable (words in our example), key function will sort the given iterable
    #idx.get returns value None because value in idx.get() not specified
print(dataset_words_arr_sorted[34700], dataset_words_arr_sorted[52005], dataset_words_arr_sorted[52006])
    #words from dataset dictionary by specific index
print('Most frequent words are: ', dataset_words_arr_sorted[:20])

fawn tsukino nunnery
Most frequent words are:  ['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i', 'this', 'that', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'on']


### Using IMDB Keras dataset directly

In [12]:
vocab_size=5000 #truncate vocabulary down to 5000
review_len=500 #truncate every review to 500 words

In [13]:
(reviews_train, labels_train), (reviews_test, labels_test) = imdb.load_data(path='imdb.npz',
                                                      num_words=vocab_size, #top most frequent words to consider 
                                                      #skip_top=0, #top most frequent words to ignore
                                                      #maxlen=review_len, #any longer sequence will be truncated
                                                                          #will not use it here because it will cause disproportion in shapes of reviews_train and reviews_test
                                                                          #will truncate manually instead using Keras sequence padding
                                                      #seed=500, 
                                                      start_char=1, #the start of a sequence will be marked with this character
                                                                     #set to 1 because 0 is usually the padding character        
                                                      #ovv_char=0, #words that were cut out because of the num_words or skip_top limit will be replaced with this character  
                                                      index_from=0) #index actual words with this index and higher
    #reviews_train, reviews_test -list of sequences, which are lists of indexes (integers) (list or reviews which captures list of indices of words)
    #labels_train, labels_test -list of integer labels (1 positive or 0 negative sentiment) given to reviews in dataset
    #labeled reviews it's pretty much same as labeled pictures in Dogs vs Cats task
    #what the convolutional neural net does is learning to predict where is negarive or positive review on unlabeled data (Dog vs Cat in the example with pictures) 

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [14]:
len(reviews_train), len(reviews_test)

(25000, 25000)

In [15]:
reviews_train[0]

[1,
 11,
 19,
 13,
 40,
 527,
 970,
 1619,
 1382,
 62,
 455,
 4465,
 63,
 3938,
 1,
 170,
 33,
 253,
 2,
 22,
 97,
 40,
 835,
 109,
 47,
 667,
 2,
 6,
 32,
 477,
 281,
 2,
 147,
 1,
 169,
 109,
 164,
 2,
 333,
 382,
 36,
 1,
 169,
 4533,
 1108,
 14,
 543,
 35,
 10,
 444,
 1,
 189,
 47,
 13,
 3,
 144,
 2022,
 16,
 11,
 19,
 1,
 1917,
 4610,
 466,
 1,
 19,
 68,
 84,
 9,
 13,
 40,
 527,
 35,
 73,
 12,
 10,
 1244,
 1,
 19,
 14,
 512,
 14,
 9,
 13,
 623,
 15,
 2,
 2,
 59,
 383,
 9,
 5,
 313,
 5,
 103,
 2,
 1,
 2220,
 2,
 13,
 477,
 63,
 3782,
 30,
 1,
 127,
 9,
 13,
 35,
 616,
 2,
 22,
 121,
 48,
 33,
 132,
 45,
 22,
 1412,
 30,
 3,
 19,
 9,
 212,
 25,
 74,
 49,
 2,
 11,
 404,
 13,
 79,
 2,
 5,
 1,
 104,
 114,
 2,
 12,
 253,
 1,
 2,
 4,
 3763,
 2,
 720,
 33,
 68,
 40,
 527,
 473,
 23,
 397,
 314,
 43,
 4,
 1,
 2,
 1026,
 10,
 101,
 85,
 1,
 378,
 12,
 294,
 95,
 29,
 2068,
 53,
 23,
 138,
 3,
 191,
 2,
 15,
 1,
 223,
 19,
 18,
 131,
 473,
 23,
 477,
 2,
 141,
 27,
 2,
 15,
 48,
 33,
 25,
 2

In [16]:
' '.join([dataset_idx_words[i] for i in reviews_train[0]]) 
#so our goal is to take 25,000 reviews, and predict whether it will be positive or negative in sentiment

"the this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert and is an amazing actor and now the same being director and father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for and and would recommend it to everyone to watch and the fly and was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also and to the two little and that played the and of norman and paul they were just brilliant children are often left out of the and list i think because the stars that play them all grown up are such a big and for the whole film but these children are amazing and should be and for what they have done don't you th

In [17]:
labels_train[:10], labels_test[:10]   

(array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0], dtype=int64),
 array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1], dtype=int64))

#### Rectangular matrix - Zero padding of each sequence for making consistent length

In [18]:
labels_train.shape, labels_test.shape #given shape

((25000,), (25000,))

In [19]:
len(reviews_train[2]), len(reviews_test[2]) #we need to make all of reviews the same lenght

(141, 603)

In [20]:
reviews_train_padded = pad_sequences(reviews_train, maxlen=500, value=0) 
reviews_test_padded = pad_sequences(reviews_test, maxlen=500, value=0)
#truncates everything greater than maxlen, but padds everything with given value

In [21]:
reviews_train_padded.shape, reviews_test_padded.shape #at the end of this we have numpy array with identical shapes

((25000, 500), (25000, 500))

In [22]:
reviews_train_padded[2], reviews_test_padded[2]

(array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [23]:
reviews_train_padded.shape==reviews_test_padded.shape

True

In [24]:
len(reviews_train_padded[2])==len(reviews_test_padded[2]) 

True

## Glove Pretrained Word Embeddings setup

#### Downloading Glove word embeddings

In [25]:
glove_6B = get_file('glove.6B.zip', origin='http://nlp.stanford.edu/data/glove.6B.zip')

Downloading data from http://nlp.stanford.edu/data/glove.6B.zip


#### Unzipping files

In [26]:
with zipfile.ZipFile(glove_6B, 'r') as zip_ref:
    zip_ref.extractall('C:/Users/Gavrilov/.keras/datasets/glove_6B')

#### Preprocessing

##### setting up a variable of 6B version tokens Glove with 50 dimensions words embeddings

In [27]:
name = "glove.6B.50d.txt"

##### reading file 

In [28]:
with open('C:/Users/Gavrilov/.keras/datasets/glove_6B/'+name, 'r', encoding="utf8") as f: 
    lines = [line.split() for line in f] #lines.split() -without value '\n' because each iteration gives line and that is fine

In [29]:
lines[0]

['the',
 '0.418',
 '0.24968',
 '-0.41242',
 '0.1217',
 '0.34527',
 '-0.044457',
 '-0.49688',
 '-0.17862',
 '-0.00066023',
 '-0.6566',
 '0.27843',
 '-0.14767',
 '-0.55677',
 '0.14658',
 '-0.0095095',
 '0.011658',
 '0.10204',
 '-0.12792',
 '-0.8443',
 '-0.12181',
 '-0.016801',
 '-0.33279',
 '-0.1552',
 '-0.23131',
 '-0.19181',
 '-1.8823',
 '-0.76746',
 '0.099051',
 '-0.42125',
 '-0.19526',
 '4.0071',
 '-0.18594',
 '-0.52287',
 '-0.31681',
 '0.00059213',
 '0.0074449',
 '0.17778',
 '-0.15897',
 '0.012041',
 '-0.054223',
 '-0.29871',
 '-0.15749',
 '-0.34758',
 '-0.045637',
 '-0.44251',
 '0.18785',
 '0.0027849',
 '-0.18411',
 '-0.11514',
 '-0.78581']

##### setting up main arrays

In [30]:
glove_words = [elem[0] for elem in lines]
glove_words_idx = {elem:idx for idx,elem in enumerate(glove_words)} #is elem:idx equal to glove_words_idx[elem]=idx?
glove_vecs = np.stack(np.array(elem[1:], dtype=np.float32) for elem in lines) #np.float32 -standard double-precision floating point

In [31]:
' '.join(glove_words[:20]), glove_words_idx['the'], glove_vecs.shape, glove_vecs[glove_words_idx['the']] 

('the , . of to and in a " \'s for - that on is was said with he as',
 0,
 (400000, 50),
 array([  4.18000013e-01,   2.49679998e-01,  -4.12420005e-01,
          1.21699996e-01,   3.45270008e-01,  -4.44569997e-02,
         -4.96879995e-01,  -1.78619996e-01,  -6.60229998e-04,
         -6.56599998e-01,   2.78430015e-01,  -1.47670001e-01,
         -5.56770027e-01,   1.46579996e-01,  -9.50950012e-03,
          1.16579998e-02,   1.02040000e-01,  -1.27920002e-01,
         -8.44299972e-01,  -1.21809997e-01,  -1.68009996e-02,
         -3.32789987e-01,  -1.55200005e-01,  -2.31309995e-01,
         -1.91809997e-01,  -1.88230002e+00,  -7.67459989e-01,
          9.90509987e-02,  -4.21249986e-01,  -1.95260003e-01,
          4.00710011e+00,  -1.85939997e-01,  -5.22870004e-01,
         -3.16810012e-01,   5.92130003e-04,   7.44489999e-03,
          1.77780002e-01,  -1.58969998e-01,   1.20409997e-02,
         -5.42230010e-02,  -2.98709989e-01,  -1.57490000e-01,
         -3.47579986e-01,  -4.56370004e-02,

##### saving results

In [32]:
pickle.dump(glove_words, open('C:/Users/Gavrilov/.keras/datasets/glove_6B/'+name+'_glove_words.pkl', 'wb'))
pickle.dump(glove_words_idx, open('C:/Users/Gavrilov/.keras/datasets/glove_6B/'+name+'_glove_words_idx.pkl', 'wb'))
np.save('C:/Users/Gavrilov/.keras/datasets/glove_6B/'+name+'_glove_vecs', glove_vecs, allow_pickle=True) #file would have a name glove.6B.50d.txt.dat which is a little buggy

##### loading results (if required)

In [53]:
pickle.load(open('C:/Users/Gavrilov/.keras/datasets/glove_6B/'+name+'_glove_words.pkl', 'rb'))

['the',
 ',',
 '.',
 'of',
 'to',
 'and',
 'in',
 'a',
 '"',
 "'s",
 'for',
 '-',
 'that',
 'on',
 'is',
 'was',
 'said',
 'with',
 'he',
 'as',
 'it',
 'by',
 'at',
 '(',
 ')',
 'from',
 'his',
 "''",
 '``',
 'an',
 'be',
 'has',
 'are',
 'have',
 'but',
 'were',
 'not',
 'this',
 'who',
 'they',
 'had',
 'i',
 'which',
 'will',
 'their',
 ':',
 'or',
 'its',
 'one',
 'after',
 'new',
 'been',
 'also',
 'we',
 'would',
 'two',
 'more',
 "'",
 'first',
 'about',
 'up',
 'when',
 'year',
 'there',
 'all',
 '--',
 'out',
 'she',
 'other',
 'people',
 "n't",
 'her',
 'percent',
 'than',
 'over',
 'into',
 'last',
 'some',
 'government',
 'time',
 '$',
 'you',
 'years',
 'if',
 'no',
 'world',
 'can',
 'three',
 'do',
 ';',
 'president',
 'only',
 'state',
 'million',
 'could',
 'us',
 'most',
 '_',
 'against',
 'u.s.',
 'so',
 'them',
 'what',
 'him',
 'united',
 'during',
 'before',
 'may',
 'since',
 'many',
 'while',
 'where',
 'states',
 'because',
 'now',
 'city',
 'made',
 'like',
 

In [55]:
pickle.load(open('C:/Users/Gavrilov/.keras/datasets/glove_6B/'+name+'_glove_words_idx.pkl', 'rb'))

{'the': 0,
 ',': 1,
 '.': 2,
 'of': 3,
 'to': 4,
 'and': 5,
 'in': 6,
 'a': 7,
 '"': 8,
 "'s": 9,
 'for': 10,
 '-': 11,
 'that': 12,
 'on': 13,
 'is': 14,
 'was': 15,
 'said': 16,
 'with': 17,
 'he': 18,
 'as': 19,
 'it': 20,
 'by': 21,
 'at': 22,
 '(': 23,
 ')': 24,
 'from': 25,
 'his': 26,
 "''": 27,
 '``': 28,
 'an': 29,
 'be': 30,
 'has': 31,
 'are': 32,
 'have': 33,
 'but': 34,
 'were': 35,
 'not': 36,
 'this': 37,
 'who': 38,
 'they': 39,
 'had': 40,
 'i': 41,
 'which': 42,
 'will': 43,
 'their': 44,
 ':': 45,
 'or': 46,
 'its': 47,
 'one': 48,
 'after': 49,
 'new': 50,
 'been': 51,
 'also': 52,
 'we': 53,
 'would': 54,
 'two': 55,
 'more': 56,
 "'": 57,
 'first': 58,
 'about': 59,
 'up': 60,
 'when': 61,
 'year': 62,
 'there': 63,
 'all': 64,
 '--': 65,
 'out': 66,
 'she': 67,
 'other': 68,
 'people': 69,
 "n't": 70,
 'her': 71,
 'percent': 72,
 'than': 73,
 'over': 74,
 'into': 75,
 'last': 76,
 'some': 77,
 'government': 78,
 'time': 79,
 '$': 80,
 'you': 81,
 'years': 82,
 'i

In [57]:
np.load('C:/Users/Gavrilov/.keras/datasets/glove_6B/'+name+'_glove_vecs'+'.npy')

array([[ 0.41800001,  0.24968   , -0.41242   , ..., -0.18411   ,
        -0.11514   , -0.78580999],
       [ 0.013441  ,  0.23682   , -0.16899   , ..., -0.56656998,
         0.044691  ,  0.30392   ],
       [ 0.15164   ,  0.30177   , -0.16763   , ..., -0.35652   ,
         0.016413  ,  0.10216   ],
       ..., 
       [-0.51181   ,  0.058706  ,  1.09130001, ..., -0.25003001,
        -1.125     ,  1.58630002],
       [-0.75897998, -0.47426   ,  0.47369999, ...,  0.78953999,
        -0.014116  ,  0.64480001],
       [ 0.072617  , -0.51393002,  0.47279999, ..., -0.18907   ,
        -0.59021002,  0.55558997]], dtype=float32)

#### Embedding matrix - extracting indices and vectors from Glove regarding words in IMDb 

In [33]:
n_fact = glove_vecs.shape[1] #getting value, which is same as dimensions of Glove vectors (amount of floats in words array)

In [34]:
emb = np.zeros((vocab_size, n_fact)) #create an array of zeros 
#creating a numpy array 2D tempalate with number of axes values same as our vocab_size () and dimentions 

In [35]:
emb.shape, emb[1] #is shape of our future array shape

((5000, 50),
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]))

In [36]:
glove_vecs.shape, glove_vecs[1] #is shape of exist Glove numpy array shape, from wich we will chose our vectors for words in our vocab

((400000, 50),
 array([ 0.013441  ,  0.23682   , -0.16899   ,  0.40950999,  0.63812   ,
         0.47709   , -0.42851999, -0.55641001, -0.36399999, -0.23938   ,
         0.13000999, -0.063734  , -0.39574999, -0.48162001,  0.23291001,
         0.090201  , -0.13324   ,  0.078639  , -0.41633999, -0.15428001,
         0.10068   ,  0.48890999,  0.31226   , -0.1252    , -0.037512  ,
        -1.51789999,  0.12612   , -0.02442   , -0.042961  , -0.28351   ,
         3.54159999, -0.11956   , -0.014533  , -0.1499    ,  0.21864   ,
        -0.33412001, -0.13872001,  0.31806001,  0.70358002,  0.44858   ,
        -0.080262  ,  0.63002998,  0.32111001, -0.46765   ,  0.22786   ,
         0.36034   , -0.37818   , -0.56656998,  0.044691  ,  0.30392   ], dtype=float32))

In [37]:
for num in range(1, len(emb)): #mapping words vectors from Glove to IMDb words
    word = dataset_idx_words[num]
    if word and re.match(r"^[a-zA-Z0-9\-]*$", word): #bool to find if word match with all variances it could appear (which can happen in Glove) using regular expression (see http://www.regular-expressions.info/python.html)
        temp_idx = glove_words_idx[word] #getting index of word in Glove dictionary
        emb[num] = glove_vecs[temp_idx] #assigning of created emb matrix of given temp_idx dimensions with relevant values from Glove word embeddings 
    else: #if no match identified (if we can't find a match in Glove, will just assign to random value)
        emb[num] = np.random.normal(scale=0.6, size=(n_fact,)) #have had to create random embedding, because sometimes the word we've lokked in IMDb didn't exist in Glove (for example, words with apostrophe 's')
        # basically whole preprocess with Glove is only needed for extracting vectors of words, wich then becomes part of matrix embedding

In [38]:
emb[1]

array([  4.18000013e-01,   2.49679998e-01,  -4.12420005e-01,
         1.21699996e-01,   3.45270008e-01,  -4.44569997e-02,
        -4.96879995e-01,  -1.78619996e-01,  -6.60229998e-04,
        -6.56599998e-01,   2.78430015e-01,  -1.47670001e-01,
        -5.56770027e-01,   1.46579996e-01,  -9.50950012e-03,
         1.16579998e-02,   1.02040000e-01,  -1.27920002e-01,
        -8.44299972e-01,  -1.21809997e-01,  -1.68009996e-02,
        -3.32789987e-01,  -1.55200005e-01,  -2.31309995e-01,
        -1.91809997e-01,  -1.88230002e+00,  -7.67459989e-01,
         9.90509987e-02,  -4.21249986e-01,  -1.95260003e-01,
         4.00710011e+00,  -1.85939997e-01,  -5.22870004e-01,
        -3.16810012e-01,   5.92130003e-04,   7.44489999e-03,
         1.77780002e-01,  -1.58969998e-01,   1.20409997e-02,
        -5.42230010e-02,  -2.98709989e-01,  -1.57490000e-01,
        -3.47579986e-01,  -4.56370004e-02,  -4.42510009e-01,
         1.87849998e-01,   2.78489990e-03,  -1.84110001e-01,
        -1.15139998e-01,

In [39]:
emb[-1] = np.random.normal(scale=0.6, size=(n_fact,)) #this is our "rare word" id - we want to randomly initialize

In [40]:
emb/=3 

In [41]:
emb[1]

array([  1.39333338e-01,   8.32266659e-02,  -1.37473335e-01,
         4.05666654e-02,   1.15090003e-01,  -1.48189999e-02,
        -1.65626665e-01,  -5.95399986e-02,  -2.20076666e-04,
        -2.18866666e-01,   9.28100049e-02,  -4.92233336e-02,
        -1.85590009e-01,   4.88599986e-02,  -3.16983337e-03,
         3.88599994e-03,   3.40133334e-02,  -4.26400006e-02,
        -2.81433324e-01,  -4.06033322e-02,  -5.60033321e-03,
        -1.10929996e-01,  -5.17333349e-02,  -7.71033317e-02,
        -6.39366657e-02,  -6.27433340e-01,  -2.55819996e-01,
         3.30169996e-02,  -1.40416662e-01,  -6.50866677e-02,
         1.33570004e+00,  -6.19799991e-02,  -1.74290001e-01,
        -1.05603337e-01,   1.97376668e-04,   2.48163333e-03,
         5.92600008e-02,  -5.29899995e-02,   4.01366657e-03,
        -1.80743337e-02,  -9.95699962e-02,  -5.24966667e-02,
        -1.15859995e-01,  -1.52123335e-02,  -1.47503336e-01,
         6.26166662e-02,   9.28299967e-04,  -6.13700002e-02,
        -3.83799995e-02,

## Convolutional Neural Network model

In [42]:
model = Sequential([
    Embedding(vocab_size, n_fact, input_length=review_len, weights=[emb], trainable=False), 
        #vocab_size -word IDs are not used mathematically, they used an index to look up into integer
        #n_fact (50) -each word in our vocabulary of 5000 is being converted into a vector of 50 elements
        #weights=[emb] -our pretrained embeddings
        #training=False -since we think that our pretrained embeddings are pretty good. We start with False, but we can't leave it because of words differencies in words of IMDb and Glove 
    SpatialDropout1D(0.2),
        #dropout applied to the embedding layer zeroes out at random 20% of each of 32 embeddings (i.e. 20% of each word). Its avoiding overfitting the specifics of each word's embeddding    
        #a `keras.layers.SpatialDropout1D` layer right after the `Embedding` layer to get the same behavior
    Dropout(0.25), 
        #dropout after Dense layer is removing at random some of the words effectively (some of the whole vectors)
    Convolution1D(64, 5, activation='relu', padding='same'),
        #sentences are in 1D convolution, so model would be 1D convolution
        #64 - how many filters do you want to create
        #5 - size of model convolution
        #padding="same" instead of border_mode='same' 
    Dropout(0.25),
        #put same amout of Dropout seems to work well
    MaxPooling1D(),
        #Conv, Drop, Maxp -simplest CNN 
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [43]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [44]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 50)           250000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 500, 50)           0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 50)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 64)           16064     
_________________________________________________________________
dropout_2 (Dropout)          (None, 500, 64)           0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 250, 64)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 16000)             0         
__________

#### Model fit

In [45]:
model.fit(reviews_train_padded, labels_train, validation_data=(reviews_test_padded, labels_test), epochs=1, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0xecfdb7630>

### Printing out predictions

In [46]:
review_sample = "the movie was terrible you could not find any worse actors that were in the film that's an objective perspective on that kind of garbage product you can have nowadays in our hollywood i was terribly surprised how the plot of this picture is linear actions developing slowly that you want to fall asleep all the time there were no kissing scenes in the whole movie i definitely wouldn't recommend that movie to watch you can watch it if only you are a boring person who has a bunch of a time i guess i will not watch any movie with those actors i saw ever "

In [47]:
review_sample_2 = "the movie was awesome you could not find any better actors that were in the film that's an objective perspective on that kind of glorious product you can have nowadays in our hollywood i was kindly surprised how the plot of this picture is linear actions developing fast that you want to awake all the time there were kissing scenes in the whole movie i definitely would recommend that movie to watch you can watch it if only you are a bright person who has a valuable time i guess i will watch any movie with those actors i saw again "

#### Creating a specific function

In [55]:
def review_prediction(x):
    assert len(x)>500, 'review length should be less than 500 words'
    review_sample_words = x.split( )
    review_sample_idx = [dataset_words_idx[word] for word in review_sample_words]
    review_sample_idx_trn = np.array([num if num<5000-1 else 5000-1 for num in review_sample_idx])
    review_sample_idx_trn_2 = np.array([review_sample_idx_trn])
    review_sample_idx_trn_2_padded = pad_sequences(review_sample_idx_trn_2, maxlen=500, value=0)
    y = model.predict(review_sample_idx_trn_2_padded, verbose=0)
    if y[0] > 0.5:
        print("Prediction is: ", y, '\nReview has positive sentiment')
    else:
        print("Prediction is: ", y, '\nReview has negative sentiment')

In [56]:
review_prediction(review_sample)

Prediction is:  [[ 0.43317056]] 
Review has negative sentiment


In [57]:
review_prediction(review_sample_2)

Prediction is:  [[ 0.56625235]] 
Review has positive sentiment


#### Saving weights

In [58]:
model.save_weights(model_path+"CNN_weights_1.h5")

#### Training a neural network to reach out state of the art result 

In [59]:
model.layers[0].Training=True #we going to fine-tune embeddings

In [61]:
model.optimizer.lr=1e-4

In [62]:
model.fit(reviews_train_padded, labels_train, validation_data=(reviews_test_padded, labels_test), epochs=12, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0xf0fe4dcc0>

In [63]:
model.fit(reviews_train_padded, labels_train, validation_data=(reviews_test_padded, labels_test), epochs=12, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0xf0fe4d160>

In [64]:
model.fit(reviews_train_padded, labels_train, validation_data=(reviews_test_padded, labels_test), epochs=12, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0xf0fe4d390>

In [65]:
model.fit(reviews_train_padded, labels_train, validation_data=(reviews_test_padded, labels_test), epochs=12, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0xf0fe4d240>

#### Saving weights

In [66]:
model.save_weights(model_path+"CNN_weights_1.h5")

### Printing out predictions

In [67]:
review_prediction(review_sample)

Prediction is:  [[ 0.01205496]] 
Review has negative sentiment


In [68]:
review_prediction(review_sample_2)

Prediction is:  [[ 0.92599201]] 
Review has positive sentiment
