# Sentiment analysis of imdb data

In [1]:
from theano.sandbox import cuda

In [2]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, Convolution1D, MaxPooling1D, Dropout, Flatten, BatchNormalization

Using Theano backend.


In [3]:
import numpy as np
import cPickle as pickle
import re
import pandas as pd
import itertools
import bcolz
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences

In [4]:
from keras.datasets import imdb

In [5]:
%matplotlib inline

In [6]:
# Directory prefix for saved weights. The trailing slash is required because
# file names are appended by plain string concatenation
# (model_path + 'cnn1.h5'); without it the weights would be written to
# 'data/imdb/modelscnn1.h5'.
model_path = 'data/imdb/models/'

In [7]:
idx = imdb.get_word_index()

In [8]:
idx_arr = sorted(idx, key=idx.get)

In [47]:
idx_arr[1]

'and'

In [9]:
def get_word(id):
    """Return the word whose IMDB word-index id is ``id``.

    ``idx_arr`` holds the words sorted by ascending id. The ids are 1-based
    (``idx_arr[1]`` is 'and', the second entry), so the word for ``id`` is
    stored at position ``id - 1``.

    Raises:
        IndexError: if ``id`` is smaller than 1 or larger than the vocabulary.
    """
    if id < 1:
        # Without this guard id=0 would wrap around to the last word.
        raise IndexError('word id out of range: %r' % (id,))
    return idx_arr[id - 1]

# Invert the word -> id mapping so that ids can be decoded back into words.
idx2word = {v: k for k, v in idx.iteritems()}

In [10]:
idx2word[101]

'think'

In [11]:
# Fetch the pickled IMDB dataset via get_file, passing the expected md5 hash
# of the archive.
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load archives from a trusted origin.
# The context manager closes the file handle as soon as the data is loaded.
with open(path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [12]:
len(x_train)

25000

In [13]:
len(x_train[0])

138

In [14]:
review1 = [idx2word[x] for x in x_train[0]]
' '.join(review1)

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

### Simplify data by capping the vocabulary at 5000 word ids (all rarer words share the last id)

In [15]:
vocab_size = 5000
# Every id above vocab_size-1 is collapsed onto the single id vocab_size-1;
# smaller ids are kept unchanged.
train = [np.minimum(np.asarray(review), vocab_size - 1) for review in x_train]
test = [np.minimum(np.asarray(review), vocab_size - 1) for review in x_test]

In [16]:
# Number of word ids in each training review, then longest / shortest / mean.
lens = np.array([len(review) for review in train])
print(lens.max(), lens.min(), lens.mean())

(2493, 10, 237.71364)


In [17]:
# Bring every review to exactly seq_len ids so each split becomes one 2-D
# array; reviews shorter than seq_len are filled with the id 0.
# NOTE(review): the padding/truncating side is left at the pad_sequences
# default (presumably 'pre' for both) - confirm against the installed Keras.
seq_len = 500
train_padded = pad_sequences(train, maxlen=seq_len, value=0)
test_padded = pad_sequences(test, maxlen=seq_len, value=0)

In [18]:
print(train_padded.shape)
print(test_padded.shape)


(25000, 500)
(25000, 500)


## Build basic dense model

In [18]:
# Baseline classifier: embed each word id into 32 dimensions, flatten the
# whole sequence, and feed it through one hidden layer to a sigmoid output.
model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.8),
    Dense(1, activation='sigmoid'),
])

In [19]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 500, 32)       160000      embedding_input_1[0][0]          
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 16000)         0           embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 100)           1600100     flatten_1[0][0]                  
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 100)           0           dense_1[0][0]                    
___________________________________________________________________________________________

In [20]:
model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

In [21]:
model.fit(train_padded, labels_train, validation_data=(test_padded,labels_test), nb_epoch=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc45e6b0c50>

## CNN with single convolutional layer

In [22]:
# Single-convolution classifier: 64-d embeddings, one width-5 'same'
# convolution with 64 filters, max pooling, then a dense head with a
# sigmoid output. Dropout is applied after every stage.
cnn = Sequential([
    Embedding(vocab_size, 64, input_length=seq_len, dropout=0.2),
    Dropout(0.2),
    Convolution1D(64, 5, border_mode='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid'),
])

In [23]:
cnn.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 500, 64)       320000      embedding_input_2[0][0]          
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 500, 64)       0           embedding_2[0][0]                
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D)  (None, 500, 64)       20544       dropout_2[0][0]                  
____________________________________________________________________________________________________
dropout_3 (Dropout)              (None, 500, 64)       0           convolution1d_1[0][0]            
___________________________________________________________________________________________

In [24]:
cnn.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
cnn.fit(train_padded, labels_train, validation_data=(test_padded,labels_test), nb_epoch=3, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3

In [None]:
cnn.save_weights(model_path + 'cnn1.h5')

In [34]:
cnn.load_weights(model_path + 'cnn1.h5')

## Using pre-trained embeddings

In [19]:
def load_array(arr):
    """Open the bcolz array stored at ``arr`` and return its full ``[:]`` slice."""
    stored = bcolz.open(arr)
    return stored[:]

def load_vectors(loc):
    """Load a set of pretrained word vectors stored under the prefix ``loc``.

    Three files are read: ``loc + '.dat'`` (via ``load_array``),
    ``loc + '_words.pkl'`` and ``loc + '_idx.pkl'`` (both via pickle).

    Args:
        loc: path prefix shared by the three files.

    Returns:
        A 3-tuple ``(array, words, idx)`` with the contents of the ``.dat``,
        ``_words.pkl`` and ``_idx.pkl`` files, in that order.
    """
    vectors = load_array(loc + '.dat')
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only point this at trusted files.
    # Context managers make sure both file handles are closed after loading.
    with open(loc + '_words.pkl', 'rb') as words_file:
        words = pickle.load(words_file)
    with open(loc + '_idx.pkl', 'rb') as idx_file:
        word_index = pickle.load(idx_file)
    return (vectors, words, word_index)

In [20]:
vecs, words, wordidx = load_vectors('data/glove/results/6B.50d')

In [21]:
wordidx['health']

360

In [22]:
def create_embedding():
    """Build the initial embedding matrix for the truncated IMDB vocabulary.

    Reads the module-level ``vocab_size``, ``idx2word``, ``vecs`` and
    ``wordidx``.

    * Row 0 is left as zeros (0 is the value used to pad the sequences).
    * For ids 1..vocab_size-1, a word made only of letters, digits and '-'
      that has a pretrained vector gets that vector; every other word is
      initialised from a normal distribution with scale 0.6.
    * The last row, which all out-of-range ids were collapsed onto, is always
      re-initialised randomly.
    * The whole matrix is finally divided by 3.

    NOTE(review): np.random is not seeded here, so the random rows differ
    between runs unless the caller seeds it.

    Returns:
        numpy array of shape ``(vocab_size, vecs.shape[1])``.
    """
    num_factors = vecs.shape[1]
    emb = np.zeros((vocab_size, num_factors))

    for i in range(1, len(emb)):
        word = idx2word.get(i)
        # Passing the regex does not guarantee the word exists in the
        # pretrained vocabulary, so check membership explicitly instead of
        # letting wordidx[word] raise KeyError.
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word) and word in wordidx:
            emb[i] = vecs[wordidx[word]]
        else:
            emb[i] = np.random.normal(scale=0.6, size=(num_factors,))

    # The last id stands for every word outside the kept vocabulary, so it
    # gets a random vector rather than the vector of one particular word.
    emb[-1] = np.random.normal(scale=0.6, size=(num_factors,))
    emb /= 3
    return emb

emb = create_embedding()        
        

In [23]:
emb.shape

(5000, 50)

In [27]:
# Same architecture as the single-convolution CNN, but the embedding layer
# starts from the pretrained matrix `emb` and stays trainable.
glove = Sequential()
# The embedding width is taken from the matrix itself so that it cannot drift
# out of sync with the pretrained vectors that were loaded.
glove.add(Embedding(vocab_size, emb.shape[1], input_length=seq_len, dropout=0.2,
                    weights=[emb], trainable=True))
glove.add(Dropout(0.25))
glove.add(Convolution1D(64, 5, border_mode='same', activation='relu'))
glove.add(Dropout(0.25))
glove.add(MaxPooling1D())
glove.add(Flatten())
glove.add(Dense(100, activation='relu'))
glove.add(Dropout(0.7))
glove.add(Dense(1, activation='sigmoid'))

In [28]:
glove.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

In [29]:
glove.fit(train_padded, labels_train, validation_data=(test_padded,labels_test), nb_epoch=3, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f6a36c5cfd0>