In [3]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras import layers

In [4]:
# Load the IMDB movie-review dataset bundled with Keras.
data = keras.datasets.imdb
# load_data returns a (train, test) pair, each being (reviews, labels).
# num_words=10000 limits the vocabulary to word indices below 10000
# (per the Keras docs, rarer words are replaced by the out-of-vocabulary index).
train_split, test_split = data.load_data(num_words=10000)
train_data, train_labels = train_split
test_data, test_labels = test_split

In [5]:
# Keep the first test review (still integer-encoded) so it can be decoded later.
data2 = test_data[0]
# Show the first training review as the raw list of word indices.
print("The encoded data is: ", train_data[0])
print("----------------------------")

The encoded data is:  [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
----------------------------


In [6]:
# Mapping
# get_word_index() gives a dict of word -> integer index. Every index is
# shifted up by 3 so that indices 0..3 are free for the special tokens below.
special_tokens = {
    '<PAD>': 0,
    '<START>': 1,
    '<UNK>': 2,
    '<UNUSED>': 3,
}
word_index = {word: index + 3 for word, index in data.get_word_index().items()}
# Register the special tokens after the shift so they own the reserved slots.
word_index.update(special_tokens)

In [7]:
# Invert word_index: keys are integer indices, values are the corresponding words.
reverse_word_index = {index: word for word, index in word_index.items()}
def decode(text):
    """Convert a sequence of word indices into a space-separated string.

    Each index is looked up in ``reverse_word_index``; an index with no
    entry there is rendered as ``'?'``.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return " ".join(words)
# Decode the review stored in data2 and show it as readable text.
decoded_review = decode(data2)
print(f"The decoded data is: {decoded_review}")

The decoded data is: <START> please give this one a miss br br <UNK> <UNK> and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite <UNK> so all you madison fans give this a miss


In [8]:
# Reviews have different lengths before padding; show the first two training samples.
first_len, second_len = len(train_data[0]), len(train_data[1])
print(f"The length of training data at index 0 is: {first_len} and at index 1 is: {second_len}")
# A neural network needs inputs of one fixed shape, so every review is brought
# to exactly 250 tokens: shorter reviews get <PAD> appended at the end
# (padding="post"). Per the Keras docs, longer reviews are truncated, by
# default from the front (truncating="pre").
pad_options = dict(value=word_index['<PAD>'], maxlen=250, padding="post")
train_data = pad_sequences(train_data, **pad_options)
test_data = pad_sequences(test_data, **pad_options)
print("The length of training data and testing data after padding",len(train_data[0]),len(test_data[1]))

The length of training data at index 0 is: 218 and at index 1 is: 189
The length of training data and testing data after padding 250 250


In [9]:
# Building the model (training happens later, in model.fit).
# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created, so weight initialisation and batch shuffling are repeatable on a
# fresh "Restart & Run All".
SEED = 42
keras.utils.set_random_seed(SEED)

VOCAB_SIZE = 10000    # must match num_words passed to data.load_data(...)
EMBEDDING_DIM = 16    # size of each learned word vector
HIDDEN_UNITS = 16     # width of the hidden dense layer

model = keras.Sequential()
# Learn one EMBEDDING_DIM-sized vector per vocabulary index.
model.add(layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM))
# Average the word vectors over the sequence axis -> one vector per review.
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(HIDDEN_UNITS, activation='relu'))
# Single sigmoid unit: output in (0, 1), compared against the 0/1 review labels.
model.add(layers.Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          160000    
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160289 (626.13 KB)
Trainable params: 160289 (626.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
# Compiling the model: Adam optimizer, binary cross-entropy loss (the model has
# a single sigmoid output), and accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [11]:
# Hold out the first 10000 training reviews as a validation set so overfitting
# can be spotted while training; the remaining reviews are used for fitting.
x_val, x_train = train_data[:10000], train_data[10000:]
y_val, y_train = train_labels[:10000], train_labels[10000:]
# Fitting the model; the value returned by fit() is kept in modelf.
modelf = model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=550,
    validation_data=(x_val, y_val),
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [13]:
# Evaluation on the test set. With the compiled 'accuracy' metric,
# evaluate() yields the loss followed by the accuracy.
evaluation = model.evaluate(test_data, test_labels)
test_loss, test_accuracy = evaluation
print(f"The loss is: {test_loss},and the accuracy is: {test_accuracy}")

The loss is: 0.31449630856513977,and the accuracy is: 0.8730000257492065


In [14]:
# Making a prediction for one test review.
test_review = test_data[1]
# predict() works on batches, so the single review is wrapped in an outer array.
single_batch = np.array([test_review])
predict = model.predict(single_batch)
print("Review:")
print(decode(test_review))
# print() converts each argument with str() and joins them with one space,
# which gives the same text as concatenating "Prediction: " with str(...).
print("Prediction:", predict[0])
print("Actual:", test_labels[1])

Review:
focuses on mood and character development the plot is very simple and many of the scenes take place on the same set in frances <UNK> the sandy dennis character apartment but the film builds to a disturbing climax br br the characters create an atmosphere <UNK> with sexual tension and psychological <UNK> it's very interesting that robert altman directed this considering the style and structure of his other films still the trademark altman audio style is evident here and there i think what really makes this film work is the brilliant performance by sandy dennis it's definitely one of her darker characters but she plays it so perfectly and convincingly that it's scary michael burns does a good job as the mute young man regular altman player michael murphy has a small part the <UNK> moody set fits the content of the story very well in short this movie is a powerful study of loneliness sexual <UNK> and desperation be patient <UNK> up the atmosphere and pay attention to the wonderful