# Sentiment analysis of IMDB data

In [1]:
from theano.sandbox import cuda

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [2]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, Convolution1D, MaxPooling1D, Dropout, Flatten, BatchNormalization

Using Theano backend.


In [3]:
import cPickle as pickle
import itertools
import os

import bcolz
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file

In [4]:
from keras.datasets import imdb

In [5]:
# IPython magic: select matplotlib's inline backend so figures render in the notebook output.
%matplotlib inline

In [6]:
# Directory for saved model weights. Later cells build file names by plain
# string concatenation (model_path + 'cnn1.h5'), so the trailing slash is
# required; without it the weights end up in 'data/imdb/modelscnn1.h5'.
model_path = 'data/imdb/models/'
# Create the directory up front so saving weights does not fail when it is missing.
if not os.path.isdir(model_path):
    os.makedirs(model_path)

In [7]:
# IMDB vocabulary as a dict; the cells below treat its keys as words and its values as integer ids.
idx = imdb.get_word_index()

In [8]:
# Vocabulary words ordered by ascending integer id.
idx_arr = sorted(idx, key=lambda word: idx[word])

In [9]:
def get_word(id):
    """Look up the word stored at position ``id`` of ``idx_arr``.

    ``idx_arr`` lists the vocabulary in ascending id order, so this is a plain
    positional lookup (position 0 holds the word with the smallest id).

    NOTE(review): if the ids in ``idx`` start at 1 rather than 0, this returns
    the word whose id is ``id + 1``; ``idx2word`` gives the exact id -> word
    mapping. TODO confirm which offset is intended.

    Raises IndexError when ``id`` is out of range for ``idx_arr``.
    """
    word = idx_arr[id]
    return word

# Reverse lookup table: integer id -> word.
idx2word = dict((word_id, word) for word, word_id in idx.iteritems())

In [10]:
# Spot-check the reverse mapping for a single id.
idx2word[101]

'think'

In [11]:
# Fetch the pickled IMDB dataset; the expected md5 hash is passed along with
# the download URL.
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
# NOTE: unpickling can execute arbitrary code, so this relies on the download
# source being trusted.
# The context manager closes the file handle once the data is loaded.
with open(path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [12]:
# Number of training reviews.
len(x_train)

25000

In [13]:
# Decode the first training review from word ids back into text.
review1 = list(map(idx2word.__getitem__, x_train[0]))
' '.join(review1)

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

### Simplify the data by truncating the vocabulary to 5000 word ids

In [14]:
# Cap the vocabulary: every id at or above vocab_size collapses onto the
# last kept id, vocab_size - 1.
vocab_size = 5000
train = [np.array([min(i, vocab_size - 1) for i in s]) for s in x_train]
test = [np.array([min(i, vocab_size - 1) for i in s]) for s in x_test]

In [15]:
# Length of each training review in ids: longest, shortest and mean.
lens = np.array([len(s) for s in train])
print(lens.max(), lens.min(), lens.mean())

(2493, 10, 237.71364)


In [16]:
# Bring every review to exactly seq_len ids: shorter ones are filled with 0,
# longer ones are cut down (the longest training review has 2493 ids).
seq_len = 500
train_padded, test_padded = (
    pad_sequences(seqs, maxlen=seq_len, value=0) for seqs in (train, test)
)

In [17]:
# Sanity check: both padded sets should have seq_len columns.
print(train_padded.shape)
print(test_padded.shape)


(25000, 500)
(25000, 500)


## Build basic dense model

In [18]:
# Baseline network: embed each of the seq_len ids into 32 dimensions, flatten,
# then one hidden dense layer with heavy dropout and a single sigmoid output.
model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.8),
    Dense(1, activation='sigmoid'),
])

In [19]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 500, 32)       160000      embedding_input_1[0][0]          
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 16000)         0           embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 100)           1600100     flatten_1[0][0]                  
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 100)           0           dense_1[0][0]                    
___________________________________________________________________________________________

In [20]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss, accuracy reported as a metric.
model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

In [21]:
# Train for 2 epochs with batches of 64. The test split is passed as
# validation_data, so the per-epoch validation metrics are computed on the test set.
model.fit(train_padded, labels_train, validation_data=(test_padded,labels_test), nb_epoch=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f89979e1cd0>

## CNN with a single convolutional layer

In [29]:
# Convolutional variant: 64-dim embeddings (with embedding dropout), one
# width-5 'same'-padded 1D convolution, max pooling, then the same
# dense-plus-sigmoid head as the baseline, with dropout between stages.
cnn = Sequential([
    Embedding(vocab_size, 64, input_length=seq_len, dropout=0.2),
    Dropout(0.2),
    Convolution1D(64, 5, border_mode='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid'),
])

In [30]:
# Print the layer-by-layer output shapes and parameter counts of the CNN.
cnn.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_4 (Embedding)          (None, 500, 64)       320000      embedding_input_4[0][0]          
____________________________________________________________________________________________________
dropout_8 (Dropout)              (None, 500, 64)       0           embedding_4[0][0]                
____________________________________________________________________________________________________
convolution1d_3 (Convolution1D)  (None, 500, 64)       20544       dropout_8[0][0]                  
____________________________________________________________________________________________________
dropout_9 (Dropout)              (None, 500, 64)       0           convolution1d_3[0][0]            
___________________________________________________________________________________________

In [31]:
# Same training setup as the dense model: Adam, binary cross-entropy, accuracy metric.
cnn.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

In [32]:
# Train for 3 epochs with batches of 64. The test split is passed as
# validation_data, so the per-epoch validation metrics are computed on the test set.
cnn.fit(train_padded, labels_train, validation_data=(test_padded,labels_test), nb_epoch=3, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f897097e110>

In [33]:
# Persist the trained CNN weights. The file name is built by plain string
# concatenation, so model_path must end with a path separator for the file
# to land inside that directory.
cnn.save_weights(model_path + 'cnn1.h5')

In [34]:
# Restore the CNN weights from the same concatenated path used when saving.
cnn.load_weights(model_path + 'cnn1.h5')

## Using pre-trained embeddings

In [None]:
def load_glove(loc):
    """Load the three files stored under the path prefix ``loc``.

    Reads ``loc + '.dat'`` through ``load_array`` and unpickles
    ``loc + '_words.pkl'`` and ``loc + '_idx.pkl'``.

    Returns a 3-tuple ``(array, words, idx)`` in that order. Presumably these
    are the embedding vectors, the word list and a word -> index mapping;
    TODO confirm against the files' producer.

    NOTE: unpickling can execute arbitrary code, so only point this at
    trusted files.
    """
    vectors = load_array(loc + '.dat')
    # Context managers make sure both pickle files are closed after reading.
    with open(loc + '_words.pkl', 'rb') as words_file:
        words = pickle.load(words_file)
    with open(loc + '_idx.pkl', 'rb') as idx_file:
        word_idx = pickle.load(idx_file)
    return (vectors, words, word_idx)