In [1]:
from __future__ import absolute_import, division, print_function
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
(x_train, y_train), (x_test, y_test)=  tf.keras.datasets.imdb.load_data(
    path = 'imdb.npz',
    num_words = None,
    skip_top = 0,
    maxlen = None, 
    seed = 113,
    start_char = 1,
    oov_char = 2,
    index_from = 3
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [3]:
unique, counts=  np.unique(y_train, return_counts=  True)

In [4]:
print(np.asarray((unique, counts)))

[[    0     1]
 [12500 12500]]


In [5]:
idx = np.argwhere(y_train > 0) # select positive comments index in training data
np.random.seed(seed = 100) # yse seed to enure selected records are always same
np.random.shuffle(idx) # shuffle at random

In [6]:
idx

array([[  581],
       [  433],
       [ 8960],
       ...,
       [16030],
       [13730],
       [11208]])

In [7]:
FRAC = 0.25
idxs = idx[:round(FRAC*len(idx))]

# fractioned positive cases
y_trains = y_train[idxs]
x_trains = x_train[idxs]

print(len(y_trains), len(x_trains))

# preserve negative cases
idxn = np.argwhere(y_train==0)
x_train0 = x_train[idxn]
y_train0 = y_train[idxn]

print(len(x_train0), len(y_train0))

3125 3125
12500 12500


In [8]:
over_idxs = np.random.choice(idxs.squeeze(), size = 12500, replace=True)

# ovesampled positive reviews
y_train1 = y_train[over_idxs]
x_train1 = x_train[over_idxs] 

print(len(y_train1), len(x_train1))
print(len(x_train0), len(y_train0))

12500 12500
12500 12500


In [9]:
# combining Xs and ys
x_train = np.concatenate((x_train0, x_train1), axis = None)
y_train = np.concatenate((y_train0, y_train1), axis = None)

print(x_train.shape, y_train.shape)
print('\n')
print(x_train)
print('\n')
print(len(x_train[1]))

(25000,) (25000,)


[list([1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463, 4369, 5012, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 8163, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 6853, 5, 163, 11, 3215, 10156, 4, 1153, 9, 194, 775, 7, 8255, 11596, 349, 2637, 148, 605, 15358, 8003, 15, 123, 125, 68, 23141, 6853, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 36893, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 8255, 5, 25249, 656, 245, 2350, 5, 4, 9837, 131, 152, 491, 18, 46151, 32, 7464, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95])
 list([1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5974, 54, 61, 369, 13, 71, 149, 14, 22, 112, 4, 2401, 311, 12,

In [10]:
shuffled_idx = np.arange(0, len(y_train), 1)
np.random.seed(seed=300)
np.random.shuffle(shuffled_idx)
print(shuffled_idx.shape)

(25000,)


In [11]:
shuffled_idx

array([14841, 22400, 10026, ...,  5834,  6625, 17617])

In [12]:
x_train_shuffled = x_train[shuffled_idx]
y_train_shuffled = y_train[shuffled_idx]

print(x_train_shuffled.shape, y_train_shuffled.shape)
print('\n')
print(x_train_shuffled)
print('\n')
print(len(x_train_shuffled[1]))

(25000,) (25000,)


[list([1, 6, 212, 15, 952, 1238, 73, 16, 4, 117, 698, 781, 4, 4313, 7, 761, 1745, 9075, 12, 3734, 4, 86, 8173, 7, 5428, 4745, 5, 15918, 13384, 159, 15, 117, 22, 44, 14362, 1797, 23, 4, 5269, 1117, 631, 39513, 5, 137, 4745, 5, 13384, 26, 24, 572, 73, 3941, 8, 7004, 212, 12, 32, 7900, 367, 1780, 13384, 9, 2814, 6, 185, 255, 37, 630, 56, 11, 37145, 4313, 2408, 12629, 4745, 1455, 10873, 34, 96, 7, 6, 1606, 631, 5382, 11356, 2106, 59, 127, 285, 614, 8, 16038, 90, 21, 11, 4, 932, 96, 108, 140, 29, 734, 18, 41, 5, 59, 734, 18, 90, 21, 6, 622, 420, 7, 4074, 2043, 3453, 54, 51152, 761, 1745, 198, 32, 59, 889, 2402, 11, 6, 854, 9, 197, 34, 4745, 8, 30, 6, 1021, 255, 8, 97, 2294, 433, 5, 53, 1139, 1635, 2691, 3164, 4798, 271, 8, 4745, 1786, 6, 4313, 39, 27, 322, 937, 3214, 2796, 1291, 215, 30, 13384, 50, 9, 49, 327, 985, 7, 4, 10548, 414, 5, 253, 4665, 7, 861, 2420, 25, 70, 67, 15, 4745, 9, 24, 99, 5669, 19, 4, 212, 21, 11, 283, 4745, 240, 6, 11152, 1624, 5, 1545, 5, 29, 5931,

In [13]:
word_index = tf.keras.datasets.imdb.get_word_index()

word_index = {k:(v+3) for k,v in word_index.items()} 
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

index_word = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(encoded_array):
    return ' '.join([index_word.get(i, '?') for i in encoded_array])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [14]:
train_data = tf.keras.preprocessing.sequence.pad_sequences(x_train_shuffled,
                                                        value=word_index["<PAD>"],
                                                        padding='pre',
                                                        maxlen=256)

test_data = tf.keras.preprocessing.sequence.pad_sequences(x_test,
                                                       value=word_index["<PAD>"],
                                                       padding='pre',
                                                       maxlen=256)

print(train_data.shape, test_data.shape)

(25000, 256) (25000, 256)


In [15]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train_data, y_train, test_size=0.2, random_state=42)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(20000, 256) (20000,) (5000, 256) (5000,)


In [15]:
# Shuffle training data for cross validation during training cycle
FRAC = 0.8 # fraction of training data used for training. Remaining is for cross validation.
idx = np.arange(len(train_data))
np.random.seed(seed=400)
np.random.shuffle(idx)

idxs = idx[:round(len(idx)*FRAC)] # Select random 80% for training data
partial_x_train = train_data[idxs]
partial_y_train = y_train[idxs]

x_val = np.delete(train_data, idxs.tolist(), axis=0) # select remaining as cross validation data
y_val = np.delete(y_train, idxs.tolist(), axis=0)

print(partial_x_train.shape, partial_y_train.shape)
print(x_val.shape, y_val.shape)

(20000, 256) (20000,)
(5000, 256) (5000,)


In [16]:
vocab_size = len(word_index)

MAX_SENTENCE_LENGTH=256
EMBEDDING_SIZE=16
HIDDEN_LAYER_SIZE=64
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [17]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [21]:
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=40,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    verbose=1)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [22]:
predicted = model.predict(test_data)
predicted

array([[0.6254888 ],
       [0.19100961],
       [0.5520443 ],
       ...,
       [0.703851  ],
       [0.30264166],
       [0.50384945]], dtype=float32)

In [23]:
predicted[predicted > 0.5] = 1
predicted[predicted <= 0.5] = 0
predictedf = predicted.flatten().astype(int)

import pandas as pd
df3 = pd.DataFrame(data=predictedf, columns=['predicted'])
refdf = pd.DataFrame(data=y_test, columns=['actual'])

y_actu = pd.Series(refdf['actual'], name='ACTUAL')
y_pred = pd.Series(df3['predicted'], name='PREDICTED')
predicted_results = y_pred.tolist()
truth = y_actu.tolist()

dl_confusion = pd.crosstab(y_actu, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
dl_confusion

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,6055,6445,12500
1,6520,5980,12500
All,12575,12425,25000


In [24]:
from sklearn.metrics import classification_report
report = classification_report(truth, predicted_results)
print(report)

              precision    recall  f1-score   support

           0       0.48      0.48      0.48     12500
           1       0.48      0.48      0.48     12500

    accuracy                           0.48     25000
   macro avg       0.48      0.48      0.48     25000
weighted avg       0.48      0.48      0.48     25000

