In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

## Loading Data and Preprocessing

In [23]:
# Load data
data = keras.datasets.imdb

In [24]:
# Split train and test dataset
# Consider only the 10000 most frequent words in the dataset
(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=88000)

In [25]:
# Data example
train_data[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [26]:
# Word dictionary 
# transform numbers that represent words in the dataset to readable words
# Retrieves a dictionary mapping words to their integer indices in the IMDb dataset.
word_index = data.get_word_index()
# Adds an offset of 3 to all the values in the dictionary
# Make room for defined special token like <START>
word_index  = {k:(v+3) for k, v in word_index.items()}
word_index['<PAD>'] = 0
word_index['<START>'] = 1
word_index['<UNK>'] = 2
word_index['<UNUSED>'] = 3
# Creates a reverse dictionary mapping integer indices to words.
# Instead of the mapping words to integer indices.
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# Iterates through each integer i in the text list and looks up its corresponding word in the reverse_word_index dictionary.
# If not found default to "?"
def decode_review(text):
    return " ".join([reverse_word_index.get(i, "?") for i in text])

In [27]:
# Decode Example
print(train_data[0])
print(decode_review(train_data[0]))

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
<START> this film was just brilliant

In [28]:
# Each movie review has a different length
print(len(train_data[0]), len(train_data[1]))

218 189


In [29]:
# Different movie review do not work for the model
# Needs to have same input shape
# So limit all movie reviews to 250 words max.
train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index['<PAD>'], padding="post", maxlen=250)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index['<PAD>'], padding="post", maxlen=250)

In [30]:
# Movie reviews have the same length
print(len(train_data[0]), len(train_data[1]))

250 250


In [31]:
# Example after limiting movie review length
print(decode_review(train_data[0]))

<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and sho

## Defining the Model

In [32]:
model = keras.Sequential()
# embedding layer learns to represent words in a continuous vector space where similar words have similar representations.  This allows the model to capture semantic relationships between words.
model.add(keras.layers.Embedding(88000, 16))
# Global average pooling reduces the dimensionality of the representation while retaining important information.
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))

In [33]:
model.summary()

In [34]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [35]:
# Define validation set and labels
x_val = train_data[:10000]
x_train = train_data[10000:]

y_val = train_labels[:10000]
y_train = train_labels[10000:]

## Training the Model

In [36]:
# verbose = 1 means that during training, progress bars will be displayed for each epoch, showing the progress of training and validation.
# batch_size is the number of examples used for each iteration
# an epoch is composed of many iterations
# after each iteration the model parameters are updated (mini-batch gradient descent)
fitModel = model.fit(x_train, y_train, epochs=50, batch_size=512, validation_data=(x_val, y_val), verbose=1)

Epoch 1/50


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - accuracy: 0.5052 - loss: 0.6928 - val_accuracy: 0.6108 - val_loss: 0.6902
Epoch 2/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.6472 - loss: 0.6879 - val_accuracy: 0.6474 - val_loss: 0.6775
Epoch 3/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.7168 - loss: 0.6709 - val_accuracy: 0.7036 - val_loss: 0.6515
Epoch 4/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.7476 - loss: 0.6399 - val_accuracy: 0.7237 - val_loss: 0.6144
Epoch 5/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - accuracy: 0.7668 - loss: 0.5964 - val_accuracy: 0.7949 - val_loss: 0.5706
Epoch 6/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - accuracy: 0.7994 - loss: 0.5505 - val_accuracy: 0.7623 - val_loss: 0.5298
Epoch 7/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━

## Testing the Model

In [37]:
results = model.evaluate(test_data, test_labels)
results

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8782 - loss: 0.3209


[0.32778599858283997, 0.8762400150299072]

## Saving the Model

In [39]:
model.save('model.keras')

## Loading the Model

In [42]:
loaded_model = keras.models.load_model('model.keras')

  trackable.load_own_variables(weights_store.get(inner_path))


## One Prediction Example

In [43]:
review = test_data[10]
test_review = np.expand_dims(review, axis=0)
test_label = test_labels[10]
prediction = loaded_model.predict([test_review])
print(f'Review: {decode_review(review)}')
print(f'Prediction: {str(prediction[0])}')
print(f'Actual: {str(test_label)}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
Review: <START> inspired by hitchcock's strangers on a train concept of two men swapping murders in exchange for getting rid of the two people messing up their lives throw momma from the train is an original and very inventive comedy take on the idea it's a credit to danny devito that he both wrote and starred in this minor comedy gem br br anne ramsey is the mother who inspires the film's title and it's understandable why she gets under the skin of danny devito with her sharp tongue and relentlessly putting him down for any minor infraction billy crystal is the writer who's wife has stolen his book idea and is now being lionized as a great new author even appearing on the oprah show to in adulation he should be enjoying thus devito gets the idea of swapping murders to rid themselves of these nuisance factors br br of course everything and anything can happen when writer carl reiner lets his imagination roam with u

## True Inference (using a real example)

In [44]:
def review_encode(text):
    encoded = [1]
    for word in text:
        if word.lower() in word_index:
            encoded.append(word_index[word.lower()])
        else:
            encoded.append(2)
    return encoded

In [51]:
with open("datasets/movie_review.txt", encoding="utf-8") as f:
    for line in f.readlines():
        words = line.replace(",", "").replace(".", "").replace("(", "").replace(")", "").replace(":", "").replace("\"", "").strip().split(" ")
        encode = review_encode(words)
        encode = keras.preprocessing.sequence.pad_sequences([encode], value=word_index['<PAD>'], padding="post", maxlen=255)
        prediction = loaded_model.predict(encode)
        print(line)
        print(encode)
        print(prediction[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
I absolutely loved this movie! From the captivating storyline to the stellar performances, every aspect of it was a masterpiece. The plot was engaging and kept me on the edge of my seat throughout. The characters were well-developed and relatable, each with their own depth and complexity. The acting was phenomenal, with the cast delivering powerful performances that truly brought the story to life. The cinematography was stunning, with breathtaking visuals that added to the overall experience. The direction was top-notch, seamlessly weaving together humor, drama, and suspense. In the end, this movie left me feeling uplifted and inspired. It's a must-see for anyone looking for an unforgettable cinematic experience.
[[    1    13   427   447    14     2    39     4  3726   769     8     4
   4244   354   175  1251     7    12    16     6   991     4   114    16
   1728     5   828    72    23     4  1289     7    61