# IMDB Sentiment Classification

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from keras.callbacks import LambdaCallback, ModelCheckpoint
from keras.datasets import imdb
from keras.layers import Dense, AlphaDropout, BatchNormalization, GRU, Embedding
from keras.models import Sequential, load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import classification_report

Using TensorFlow backend.


In [3]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=None,
                                                      skip_top=0,
                                                      maxlen=None,
                                                      seed=113,
                                                      start_char=1,
                                                      oov_char=2,
                                                      index_from=3)

In [4]:
print("Train-set size: ", len(x_train))
print("Test-set size:  ", len(x_test))

Train-set size:  25000
Test-set size:   25000


In [5]:
num_tokens = [len(tokens) for tokens in x_train + x_test]
num_tokens = np.array(num_tokens)

In [6]:
np.mean(num_tokens)

469.51784

In [7]:
np.max(num_tokens)

2697

In [8]:
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
max_tokens

960

In [9]:
np.sum(num_tokens < max_tokens) / len(num_tokens)

0.9474

In [10]:
pad = 'pre'

In [11]:
x_train_pad = pad_sequences(x_train, maxlen=max_tokens,
                            padding=pad, truncating=pad)
x_test_pad = pad_sequences(x_test, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [12]:
embedding_size = 128
max_features = 10000
epochs = 1
modelh5 = 'IMDBGRUClassifier'
loadmodelh5 = 'IMDBGRUClassifier-best'

In [13]:
try:
    model = load_model(loadmodelh5 + '.h5')
    print('Model loaded successfully')
except IOError:
    print('Building the model for the first time')
    model = Sequential()
    model.add(Embedding(input_dim=max_features,
                        output_dim=embedding_size,
                        input_length=max_tokens,
                        name='layer_embedding'))
    model.add(GRU(units=128))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
savebestmodel = ModelCheckpoint(modelh5 + '-best.h5', monitor='loss', verbose=0, save_best_only=True)

Model loaded successfully


In [14]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 960, 128)          1280000   
_________________________________________________________________
gru_1 (GRU)                  (None, 128)               98688     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 1,378,817
Trainable params: 1,378,817
Non-trainable params: 0
_________________________________________________________________


In [15]:
%%time
model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=epochs, batch_size=256, callbacks=[savebestmodel])

Train on 23750 samples, validate on 1250 samples
Epoch 1/1
Wall time: 8min 12s


<keras.callbacks.History at 0x694ff92f98>

In [16]:
%%time
result = model.evaluate(x_test_pad, y_test, batch_size=512)

Wall time: 57.7 s


In [17]:
y_true, y_prob = y_train, model.predict(x_train_pad, batch_size=512)
y_pred = y_prob >= 0.5
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

          0       0.96      0.98      0.97     12500
          1       0.98      0.96      0.97     12500

avg / total       0.97      0.97      0.97     25000



In [18]:
y_true, y_prob = y_test, model.predict(x_test_pad, batch_size=512)
y_pred = y_prob >= 0.5
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

          0       0.82      0.87      0.85     12500
          1       0.86      0.81      0.84     12500

avg / total       0.84      0.84      0.84     25000

