In [8]:
# Import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow import keras
import plotly.express as px
import plotly.graph_objects as go
from rich import print

# Fix NumPy's global seed so NumPy-driven randomness is repeatable
np.random.seed(0)

# Handle to the IMDB dataset module bundled with Keras
imdb = keras.datasets.imdb

# Vocabulary size: only the 10,000 most frequently occurring words are kept
vocab_size = 10000
# Load the train/test splits restricted to that vocabulary
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=vocab_size
)

# Word -> integer id mapping from the dataset, with every id shifted up by 3
# so that ids 0-3 are free for the special tokens registered right after
word_index = {word: rank + 3 for word, rank in imdb.get_word_index().items()}
word_index.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2, "<UNUSED>": 3})

# Inverse mapping (integer id -> word), used to decode reviews
reverse_word_index = {idx: word for word, idx in word_index.items()}

# Turn an integer-encoded review back into readable text
def decode_review(text):
    """Return the words for the ids in *text*, joined by single spaces.

    Each id is looked up in the module-level ``reverse_word_index``; an id
    with no entry there is rendered as "?".
    """
    words = (reverse_word_index.get(token_id, "?") for token_id in text)
    return " ".join(words)

# Show the first two training reviews as text, each preceded by its label
for position, (encoded_review, label) in enumerate(zip(train_data[:2], train_labels[:2])):
    print("example {}, label {}".format(position, label))
    print(decode_review(encoded_review))

# Pad both splits to a fixed length of 256, appending the <PAD> id at the end
# (padding="post"); the same settings are applied to train and test data
train_data, test_data = (
    keras.preprocessing.sequence.pad_sequences(
        sequences, value=word_index["<PAD>"], padding="post", maxlen=256
    )
    for sequences in (train_data, test_data)
)

# Size of each word embedding vector
embedding_size = 16

# Build and compile the classifier used by every training run below
def create_model(embedding_size):
    """Return a freshly built, compiled Keras ``Sequential`` model.

    The stack is Embedding(vocab_size, embedding_size) ->
    GlobalAveragePooling1D -> Dense(16, relu) -> Dense(1, sigmoid), compiled
    with the "adam" optimizer, "binary_crossentropy" loss and the "acc"
    metric. ``vocab_size`` is read from module scope.

    Args:
        embedding_size: output dimension of the embedding layer.
    """
    # Reseed NumPy and TensorFlow on every call so that each model is
    # created from the same seed state
    np.random.seed(42)
    tf.random.set_seed(42)

    layers = keras.layers
    model = keras.Sequential(
        [
            layers.Embedding(vocab_size, embedding_size),
            layers.GlobalAveragePooling1D(),
            layers.Dense(16, activation=tf.nn.relu),
            layers.Dense(1, activation=tf.nn.sigmoid),
        ]
    )
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])
    return model

# Create the model and print its layer/parameter summary
model = create_model(embedding_size)
model.summary()

# Hold out the first 10,000 training examples for validation; the rest is
# used for fitting
x_val, x_train = train_data[:10000], train_data[10000:]
y_val, y_train = train_labels[:10000], train_labels[10000:]

# Train for a fixed 50 epochs, reporting validation metrics after each one
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=512,
    validation_data=(x_val, y_val),
    verbose=1,
)

# Keep the per-epoch metrics, then evaluate on the test set and print the
# outcome (previously `results` was computed but never reported)
history_dict = history.history
results = model.evaluate(test_data, test_labels)
print("test results:", results)

# Per-epoch training/validation accuracy and loss; the keys follow the "acc"
# metric name passed to compile() in create_model
acc = history_dict["acc"]
val_acc = history_dict["val_acc"]
loss = history_dict["loss"]
val_loss = history_dict["val_loss"]
# 1-based epoch numbers for the x-axis of the plots
epochs = range(1, len(acc) + 1)

# Loss curves: training vs. validation, one marker per epoch
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=list(epochs),
        y=loss,
        mode='markers+lines',
        marker_symbol='square',
        name="Training loss",
    )
)
fig.add_trace(
    go.Scatter(
        x=list(epochs),
        y=val_loss,
        mode='markers+lines',
        marker_symbol='star',
        name="Validation loss",
    )
)
# Figure title and axis captions
fig.update_layout(title="Training and validation loss")
fig.update_xaxes(title='Epochs')
fig.update_yaxes(title='Loss')
fig.show()

# Plot the training and validation accuracy
fig = go.Figure()
# Legend entries and title name accuracy, the quantity actually plotted here
# (they previously said "loss", copied from the loss figure)
fig.add_scatter(x=list(epochs), y=acc, mode='markers+lines', marker_symbol='square', name="Training accuracy")
fig.add_scatter(x=list(epochs), y=val_acc, mode='markers+lines', marker_symbol='star', name="Validation accuracy")
fig.update_layout(title="Training and validation accuracy")
fig.update_xaxes(title='Epochs')
fig.update_yaxes(title='Accuracy')
fig.show()


# Raise the TensorFlow logger threshold to WARNING
tf.get_logger().setLevel("WARNING")

# Callbacks for early stopping and for saving the best model. Both monitor
# "val_acc"; early stopping uses a patience of 2 and the checkpoint is
# written to `checkpoint_filepath` with save_best_only=True.
checkpoint_filepath = '05model_checkpoint'
callbacks = [
    keras.callbacks.EarlyStopping(
        monitor="val_acc",
        patience=2,
    ),
    keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        monitor="val_acc",
        save_best_only=True,
        verbose=0,
    ),
]

# Start from a fresh model and train silently (verbose=0) with the callbacks
# attached; 50 is now only the upper bound on the number of epochs
model = create_model(embedding_size)
history = model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=50,
    validation_data=(x_val, y_val),
    callbacks=callbacks,
    verbose=0,
)

# Pull the per-epoch accuracy and loss series (training and validation) out
# of the history of the early-stopped run
history_dict = history.history
acc, val_acc, loss, val_loss = (
    history_dict[key] for key in ("acc", "val_acc", "loss", "val_loss")
)
# 1-based epoch numbers covering however many epochs were recorded
epochs = range(1, len(acc) + 1)

# Loss curves for the early-stopped run: training vs. validation
fig = go.Figure(
    data=[
        go.Scatter(
            x=list(epochs),
            y=loss,
            mode='markers+lines',
            marker_symbol='square',
            name="Training loss",
        ),
        go.Scatter(
            x=list(epochs),
            y=val_loss,
            mode='markers+lines',
            marker_symbol='star',
            name="Validation loss",
        ),
    ]
)
# Figure title and axis captions
fig.update_layout(title="Training and validation loss with early stopping")
fig.update_xaxes(title='Epochs')
fig.update_yaxes(title='Loss')
fig.show()

# Accuracy curves for the early-stopped run: training vs. validation
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=list(epochs),
        y=acc,
        mode='markers+lines',
        marker_symbol='square',
        name="Training accuracy",
    )
)
fig.add_trace(
    go.Scatter(
        x=list(epochs),
        y=val_acc,
        mode='markers+lines',
        marker_symbol='star',
        name="Validation accuracy",
    )
)
# Figure title and axis captions
fig.update_layout(title="Training and validation accuracy with early stopping")
fig.update_xaxes(title='Epochs')
fig.update_yaxes(title='Accuracy')
fig.show()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, None, 16)          160000    
                                                                 
 global_average_pooling1d_14  (None, 16)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_28 (Dense)            (None, 16)                272       
                                                                 
 dense_29 (Dense)            (None, 1)                 17        
                                                                 
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Ep