# Sentiment Classification


## Loading the dataset

In [2]:
from keras.datasets import imdb
import numpy as np
# save np.load
np_load_old = np.load

# modify the default parameters of np.load
np.load = lambda *a,**k: np_load_old(*a,allow_pickle=True)

vocab_size = 10000 #vocab size

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size) # vocab_size is no.of words to consider from the dataset, ordering based on frequency.

# restore np.load for future normal usage
np.load = np_load_old

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [2]:

x_train.shape

(25000,)

In [3]:
x_test.shape

(25000,)

In [0]:
from keras.preprocessing.sequence import pad_sequences
vocab_size = 10000 #vocab size
maxlen = 300  #number of word used from each review

## Train test split

In [0]:
#load dataset as a list of ints
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
#make all sequences of the same length
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test =  pad_sequences(x_test, maxlen=maxlen)

In [6]:

print(x_train)

[[   0    0    0 ...   19  178   32]
 [   0    0    0 ...   16  145   95]
 [   0    0    0 ...    7  129  113]
 ...
 [   0    0    0 ...    4 3586    2]
 [   0    0    0 ...   12    9   23]
 [   0    0    0 ...  204  131    9]]


In [7]:
print(x_test)

[[   0    0    0 ...   14    6  717]
 [   0    0    0 ...  125    4 3077]
 [1239 5189  137 ...    9   57  975]
 ...
 [   0    0    0 ...   21  846 5518]
 [   0    0    0 ... 2302    7  470]
 [   0    0    0 ...   34 2005 2643]]


## Build Keras Embedding Layer Model
We can think of the Embedding layer as a dicionary that maps a index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* Also we could load pre-train word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The keras embedding layer doesn't require us to onehot encode our words, instead we have to give each word a unqiue intger number as an id. For the imdb dataset we've loaded this has already been done, but if this wasn't the case we could use sklearn [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [0]:

from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional,SimpleRNN
from keras.optimizers import Adam
from keras import backend

In [9]:
x_train.shape

(25000, 300)

In [10]:
x_test.shape

(25000, 300)

In [11]:
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=maxlen))
model.add(
    SimpleRNN(
        1, return_sequences=False, dropout=0.1))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 8)            80000     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 1)                 10        
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 2         
Total params: 80,012
Trainable params: 80,012
Non-trainable params: 0
_________________________________________________________________
None


## Retrive the output of each layer in keras for a given single test sample from the trained model you built

In [12]:

# fit the model
model.fit(x_train, y_train, epochs=50)




Epoch 1/50





Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f2cf25f82e8>

In [0]:
model.save("./SeqNLP_Project1.h5")

In [14]:

loss, accuracy = model.evaluate(x_test, y_test)
print('Accuracy: %f' % (accuracy*100))
print('Loss: %f' % (loss))

Accuracy: 76.700000
Loss: 0.674408


In [15]:

model.predict(x_test[[0]])

array([[0.9511795]], dtype=float32)

In [16]:
from keras import backend as back

inpt = model.input
output = [layer.output for layer in model.layers]
evalFunction = [back.function([inpt, back.learning_phase()], [out]) for out in output]

layerOpt = [func([x_test[[0]], 1.]) for func in evalFunction]
print(layerOpt)

[[array([[[-0.00515717,  0.06356188,  0.01696332, ...,  0.06018887,
          0.05213069,  0.02424783],
        [-0.00515717,  0.06356188,  0.01696332, ...,  0.06018887,
          0.05213069,  0.02424783],
        [-0.00515717,  0.06356188,  0.01696332, ...,  0.06018887,
          0.05213069,  0.02424783],
        ...,
        [ 0.04749067, -0.09079694, -0.07947935, ..., -0.00694379,
         -0.03578651, -0.1598345 ],
        [ 0.01074871,  0.02934771,  0.03399529, ..., -0.07170125,
          0.05007496,  0.02882561],
        [ 0.12924863,  0.0158901 , -0.00834902, ...,  0.22296906,
          0.02161513,  0.1067538 ]]], dtype=float32)], [array([[0.5340278]], dtype=float32)], [array([[0.96764797]], dtype=float32)]]


In [17]:
print('Embedding layer Output')
layerOpt[0][0]

Embedding layer Output


array([[[-0.00515717,  0.06356188,  0.01696332, ...,  0.06018887,
          0.05213069,  0.02424783],
        [-0.00515717,  0.06356188,  0.01696332, ...,  0.06018887,
          0.05213069,  0.02424783],
        [-0.00515717,  0.06356188,  0.01696332, ...,  0.06018887,
          0.05213069,  0.02424783],
        ...,
        [ 0.04749067, -0.09079694, -0.07947935, ..., -0.00694379,
         -0.03578651, -0.1598345 ],
        [ 0.01074871,  0.02934771,  0.03399529, ..., -0.07170125,
          0.05007496,  0.02882561],
        [ 0.12924863,  0.0158901 , -0.00834902, ...,  0.22296906,
          0.02161513,  0.1067538 ]]], dtype=float32)

In [19]:
print('RNN Output')
layerOpt[1]

RNN Output


[array([[0.5340278]], dtype=float32)]

In [20]:
print('Dense Layer output')
layerOpt[2]

Dense Layer output


[array([[0.96764797]], dtype=float32)]