In [1]:
# Convolutional neural network (Conv1D) combined with an LSTM for IMDB sentiment classification
# built with Keras

In [3]:
# import necessary library
from keras.preprocessing import sequence
from keras.models import Sequential  # this is to build a neural network model using keras
from keras.layers import Dense, Dropout, Activation # Dense: fully connected layer; Dropout: dropout regularization;
# Activation: applies an activation function to the previous layer's output
from keras.layers import Embedding  # for performing word embeddings in keras
from keras.layers import LSTM # long short term memory(LSTM) is a specific type of RNN model
from keras.layers import Conv1D, MaxPool1D # convolution 1D and maxpooling 1D 

In [4]:
# import the dataset
from keras.datasets import imdb   # this is sample dataset given in keras library

In [5]:
# The data consists of movie reviews (already encoded as sequences of word indices) and a binary sentiment label (0 = negative, 1 = positive)

In [6]:
# Embedding / vocabulary settings
max_features = 20000   # vocabulary size: used as num_words when loading IMDB and as the Embedding input dimension
maxlen = 100           # every review is padded or truncated to exactly this many tokens
                       # (reviews can be much longer; capping the length keeps processing fast)
embedding_size = 128   # length of the vector the Embedding layer produces for each token

In [7]:
# Convolution settings
kernel_size = 5   # width of the 1D convolution window, in tokens
filters = 64      # number of convolution filters, i.e. output channels (not an 8x8 kernel)
pool_size = 4     # window size of the max-pooling layer applied after the convolution

In [8]:
# LSTM settings
lstm_output_size = 70  # number of LSTM units, i.e. the size of the vector the LSTM layer outputs

In [9]:
# Training settings
batch_size = 30
epochs = 2
# Note:
# - batch_size is highly sensitive.
# - Only 2 epochs are run here (the author's note: that is enough for this dataset);
#   more epochs can be tried to obtain higher accuracy.
# These notes are written as comments rather than a bare triple-quoted string:
# a string literal that ends a cell is an expression, so Jupyter would echo it
# as the cell's output.

In [10]:
print("Loading Data .......")
# Keep only the `max_features` most frequent words; rarer words are not dropped
# or zeroed but replaced by the dataset's out-of-vocabulary index.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), "train sequences")
print(len(x_test), "test sequences")

Loading Data .......
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
25000 train sequences
25000 test sequences


In [11]:
# Reviews differ in length, so every sequence is brought to exactly `maxlen`
# tokens: shorter ones are padded, longer ones are truncated.
# By default pad_sequences pads and truncates at the start of the sequence.
print("Pad sequences (samples x times)")
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print("x_train shape: ", x_train.shape)
print("x_test Shape : ", x_test.shape)

Pad sequences (samples x times)
x_train shape:  (25000, 100)
x_test Shape :  (25000, 100)


In [14]:
print("Build model .....")
# Layers are applied in the order listed.
model = Sequential([
    # Map each word index to a dense vector of length `embedding_size`.
    Embedding(max_features, embedding_size, input_length=maxlen),
    # Randomly zero 25% of the embedding outputs during training to limit overfitting.
    Dropout(0.25),
    # 1D convolution along the token axis; "valid" means no padding is added.
    Conv1D(
        filters,
        kernel_size,
        padding="valid",
        activation="relu",
        strides=1,
    ),
    # Downsample the convolution output by taking the max over windows of `pool_size`.
    MaxPool1D(pool_size=pool_size),
    # The LSTM reads the pooled feature sequence and emits one vector of size `lstm_output_size`.
    LSTM(lstm_output_size),
    # A single output unit; the sigmoid squashes it into (0, 1) for the binary label.
    Dense(1),
    Activation("sigmoid"),
])
# The last two layers could also be written as one:
# Dense(1, activation='sigmoid')

Build model .....


In [21]:
## compile the model
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# reported alongside the loss during training and evaluation.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table.
model.summary()

In [22]:
print("Train...")
# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation metrics and the later test evaluation are computed on
# the same samples.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)

Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x160165e9438>

In [23]:
# Final evaluation of the trained model on the test split.
# With metrics=['accuracy'] compiled in, evaluate() returns the loss followed
# by the accuracy, so `score` is the loss value.
score, acc = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
print("Test score :  ", score)
print("Test accurary : ", acc)

Test score :   0.351885968233645
Test accurary :  0.8511999934911728


In [24]:
# Raw sigmoid outputs for every test review: one value in (0, 1) per sample.
model.predict(x=x_test)

array([[0.7038166 ],
       [0.97548705],
       [0.9513949 ],
       ...,
       [0.08909649],
       [0.09484072],
       [0.4224007 ]], dtype=float32)

In [25]:
# Ground-truth labels of the test split (0/1 integers), shown for comparison with the predictions.
y_test

array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [26]:
# Turn the predicted probabilities into hard class labels with a 0.5 cut-off:
# values above 0.5 become 1, everything else becomes 0.
import numpy as np
(model.predict(x_test) > 0.5).astype(np.int_)

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]])