In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [2]:
# Short alias for the Keras IMDB dataset module used by the loading cells below.
data = keras.datasets.imdb

In [3]:
# Load the dataset as two (sequences, labels) pairs: one for training, one for testing.
# Called with default arguments, so no vocabulary cap is applied here.
(train_data,train_label),(test_data,test_label) =  data.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 0us/step


In [4]:
# To make the reviews easy to feed into a neural network, every word is already encoded as an integer.
# The dataset also provides a word -> index dictionary, which is loaded in a later cell.
print(train_data[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [6]:
# Fetch the word -> integer dictionary that accompanies the dataset.
word_index = data.get_word_index()

In [10]:
# Shift every dataset index up by 3 so that slots 0-3 are free for the special tokens below.
word_dict = dict((token, rank + 3) for token, rank in word_index.items())

# Reserve the first four indices for special-purpose tokens.
word_dict.update({
    '<PAD>': 0,     # filler used to pad a review at its start or end
    '<START>': 1,   # marks where a review begins
    '<UNK>': 2,     # stands in for an unknown word
    '<UNUSED>': 3,  # reserved, not used
})


In [13]:
# Invert word_dict so integer indices can be looked up back to their words.
reverse_word_dict = {index: token for token, index in word_dict.items()}

In [18]:
def decode_text(text, mapping=None):
    """Turn a sequence of integer word indices back into a readable string.

    Args:
        text: Iterable of integer word indices (e.g. one encoded review).
        mapping: Optional index -> word dictionary. When omitted, the
            notebook-level ``reverse_word_dict`` is used.

    Returns:
        The decoded words joined by single spaces. An index that is missing
        from the mapping is rendered as ``<UNK>`` instead of raising
        ``KeyError``; an empty input yields an empty string.
    """
    if mapping is None:
        # Resolved at call time so the function picks up the current lookup table.
        mapping = reverse_word_dict
    return " ".join(mapping.get(word, '<UNK>') for word in text)

In [19]:
# Sanity check: decode the first training review back into words.
decode_text(train_data[0])

"<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and sh

In [53]:
# Reviews differ in length, but the network needs inputs of one fixed size.
# Every review is brought to exactly 250 tokens: shorter ones are filled with the <PAD> index,
# longer ones are trimmed. Keras' pad_sequences utility does both in one call.
# NOTE(review): the padding/truncating side is left at the pad_sequences default
# (documented by Keras as 'pre', i.e. the start of the sequence) - confirm this is intended.

train_data, test_data = (
    keras.preprocessing.sequence.pad_sequences(
        split,
        value=word_dict['<PAD>'],
        maxlen=250,
    )
    for split in (train_data, test_data)
)

In [60]:
# Seed the random number generators before any weights are created so that
# initialisation (and the shuffling done later by fit) is repeatable on a fresh
# "Restart & Run All". Keras documents set_random_seed as seeding Python,
# NumPy and the backend framework in one call.
keras.utils.set_random_seed(42)

model = keras.Sequential()

# Embedding table with 100000 rows and 16-dimensional vectors.
# Assumes every word index fed to the model is < 100000 - TODO confirm against len(word_dict).
model.add(keras.layers.Embedding(100000,16))
# Average the word vectors over the sequence axis to get one vector per review.
model.add(keras.layers.GlobalAveragePooling1D())
# Small hidden layer on top of the averaged embedding.
model.add(keras.layers.Dense(16,activation='relu'))
# Single sigmoid unit: output is a value in (0, 1) for the binary label.
model.add(keras.layers.Dense(1,activation='sigmoid'))

model.summary()


In [61]:
# Binary classification setup: Adam optimiser, binary cross-entropy loss, accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [62]:
# Hold out the first 10,000 training reviews (and their labels) for validation ...
x_val, y_val = train_data[:10000], train_label[:10000]

# ... and train on everything after them.
x_train, y_train = train_data[10000:], train_label[10000:]

In [63]:
# Train for 40 epochs in batches of 512, evaluating on the held-out validation split after each epoch.
# fit_model keeps whatever model.fit returns (the training record) for later inspection.
fit_model = model.fit(
    x=x_train,
    y=y_train,
    batch_size=512,
    epochs=40,
    verbose=1,
    validation_data=(x_val, y_val),
)

Epoch 1/40
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.5682 - loss: 0.6922 - val_accuracy: 0.6877 - val_loss: 0.6852
Epoch 2/40
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.6854 - loss: 0.6806 - val_accuracy: 0.7123 - val_loss: 0.6658
Epoch 3/40
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.7415 - loss: 0.6558 - val_accuracy: 0.7613 - val_loss: 0.6326
Epoch 4/40
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.7790 - loss: 0.6156 - val_accuracy: 0.7810 - val_loss: 0.5876
Epoch 5/40
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7918 - loss: 0.5656 - val_accuracy: 0.7981 - val_loss: 0.5383
Epoch 6/40
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8218 - loss: 0.5092 - val_accuracy: 0.8148 - val_loss: 0.4895
Epoch 7/40
[1m30/30[0m [32m━━━━

In [64]:
# Persist the trained model to disk so it can be reloaded without retraining.
# NOTE(review): the '.h5' extension selects the legacy HDF5 format; recent Keras versions
# recommend the native '.keras' format. If this path changes, the load_model cell must change with it.
model.save('Text-Classification.h5')



In [66]:
# Reload the saved model from disk into a separate variable to check that the file round-trips.
model_2 = keras.models.load_model('Text-Classification.h5')



In [68]:
# Run the reloaded model on the padded test reviews; the result has one row per review.
# The final layer of the saved network is a single sigmoid unit, so each value lies between 0 and 1.
model_2.predict(test_data)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 688us/step


array([[0.03596258],
       [0.99962866],
       [0.75130796],
       ...,
       [0.0140977 ],
       [0.10333832],
       [0.8388589 ]], dtype=float32)