# Experimenting with Deep Learning and the IMDB dataset

In [14]:
# imports
import os

import pandas as pd
import tensorflow as tf
import numpy as np
import mlflow

from mlflow.tracking import MlflowClient
from tensorflow import keras
from keras import layers

The following cells are taken unchanged from the worked example in Chapter 4 of Chollet's book.

In [3]:
# Load the IMDB reviews, restricted to a vocabulary of 10,000 words

imdb_train, imdb_test = tf.keras.datasets.imdb.load_data(num_words=10000)
train_data, train_labels = imdb_train
test_data, test_labels = imdb_test

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [4]:
# Encoding the integer sequences via multi-hot encoding

def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode a collection of integer sequences.

    Args:
        sequences: Sized iterable of integer sequences. Each integer is used
            as a column index into a row of width ``dimension``.
        dimension: Number of columns in every encoded row.

    Returns:
        A float64 array of shape ``(len(sequences), dimension)`` where
        ``results[i, j]`` is 1.0 if ``j`` occurs in ``sequences[i]`` and 0.0
        otherwise. Repeated indices within a sequence still yield 1.0.

    Raises:
        IndexError: If a sequence contains an index that is out of bounds
            for ``dimension``.
    """
    results = np.zeros((len(sequences), dimension))
    for row, sequence in enumerate(sequences):
        # Set all of this row's columns with a single fancy-indexed
        # assignment instead of one Python-level assignment per index.
        results[row, list(sequence)] = 1.
    return results

# Multi-hot encode the train and test reviews
x_train, x_test = (
    vectorize_sequences(split) for split in (train_data, test_data)
)

# Convert the labels to float32 arrays
y_train, y_test = (
    np.asarray(labels).astype("float32")
    for labels in (train_labels, test_labels)
)

In [5]:
# Hold out the first 10,000 training samples as a validation set

n_val = 10000

x_val, partial_x_train = x_train[:n_val], x_train[n_val:]
y_val, partial_y_train = y_train[:n_val], y_train[n_val:]

Here we add basic `MLflow` functionality to track the experiments.

In [6]:
# Set up the MLflow client and select the experiment to log to

EXPERIMENT_NAME = "deepLearning_IMDB_dataset"
client = MlflowClient()
# set_experiment() activates the experiment (creating it when it does not
# exist yet) and returns it, so no separate lookup by name is needed.
exp = mlflow.set_experiment(EXPERIMENT_NAME)

2024/04/29 11:46:46 INFO mlflow.tracking.fluent: Experiment with name 'deepLearning_IMDB_dataset' does not exist. Creating a new experiment.


We create a list of different hidden-layer sizes to see how performance varies with them.

In [7]:
# (first layer, second layer) unit counts to try for the two hidden layers
units_2_hidden = [(n_units, n_units) for n_units in (16, 32, 64)]

In [8]:
# Training settings shared by the model definition, model.fit() and the
# MLflow params, so the logged values cannot drift from the ones actually used.
EPOCHS = 4
BATCH_SIZE = 512
HIDDEN_ACTIVATION = "relu"

for count, units in enumerate(units_2_hidden):
    units_1st_layer, units_2nd_layer = units

    # Two hidden layers followed by a single sigmoid output unit.
    model = keras.Sequential([
        layers.Dense(units_1st_layer, activation=HIDDEN_ACTIVATION),
        layers.Dense(units_2nd_layer, activation=HIDDEN_ACTIVATION),
        layers.Dense(1, activation="sigmoid"),
    ])

    model.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )

    model.fit(
        partial_x_train,
        partial_y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(x_val, y_val),
    )

    # The "loss" / "accuracy" metrics logged below are measured on the test set.
    test_loss, test_accuracy = model.evaluate(x_test, y_test)

    # One MLflow run per layer configuration.
    with mlflow.start_run():
        mlflow.set_tag("model", f"Base_relu_{count}")
        mlflow.log_params({
            "units_1st_layer": units_1st_layer,
            "units_2nd_layer": units_2nd_layer,
            "hidden_activation": HIDDEN_ACTIVATION,
            "epochs": EPOCHS,
            "batch_size": BATCH_SIZE,
        })
        mlflow.log_metrics({
            "accuracy": test_accuracy,
            "loss": test_loss,
        })

    # Drop the trained model before building the next one.
    del model

# Collect the logged runs of the active experiment as a DataFrame.
runs = mlflow.search_runs()

Epoch 1/4


2024-04-29 11:46:47.869770: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 600000000 exceeds 10% of free system memory.


[1m11/30[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m0s[0m 10ms/step - accuracy: 0.5859 - loss: 0.6691

2024-04-29 11:46:49.299893: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 20480000 exceeds 10% of free system memory.
2024-04-29 11:46:49.299965: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 20480000 exceeds 10% of free system memory.
2024-04-29 11:46:49.300000: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 20480000 exceeds 10% of free system memory.
2024-04-29 11:46:49.300033: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 20480000 exceeds 10% of free system memory.


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - accuracy: 0.6721 - loss: 0.6239 - val_accuracy: 0.8414 - val_loss: 0.4519
Epoch 2/4
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8813 - loss: 0.3906 - val_accuracy: 0.8712 - val_loss: 0.3454
Epoch 3/4
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9125 - loss: 0.2799 - val_accuracy: 0.8811 - val_loss: 0.3045
Epoch 4/4
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9245 - loss: 0.2264 - val_accuracy: 0.8880 - val_loss: 0.2808
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8799 - loss: 0.2936
Epoch 1/4
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.6975 - loss: 0.5827 - val_accuracy: 0.8403 - val_loss: 0.3908
Epoch 2/4
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.893

The code above can be simplified by moving the model set-up and training into a reusable function.

In [15]:
# NOTE(review): machine-specific absolute path — assumes this directory
# exists on the machine running the notebook.
PATH_TO_RUNS = "/mnt/0A2AAC152AABFBB7/sideProjects/deepLearning/mlflow_runs"

# Write the collected run table to a CSV file inside that directory.
runs_csv_path = os.path.join(PATH_TO_RUNS, 'imdb_runs_1.csv')
runs.to_csv(runs_csv_path)

In [16]:
# Show only the columns needed to compare the runs
summary_columns = [
    'experiment_id',
    'status',
    'metrics.loss',
    'metrics.accuracy',
    'params.units_1st_layer',
    'params.units_2nd_layer',
]
runs[summary_columns]

Unnamed: 0,experiment_id,status,metrics.loss,metrics.accuracy,params.units_1st_layer,params.units_2nd_layer
0,926124755463801135,FINISHED,0.385586,0.84884,64,64
1,926124755463801135,FINISHED,0.313741,0.87392,32,32
2,926124755463801135,FINISHED,0.294502,0.8806,16,16


In [None]:
# import local modules
import sys
# Make the project's own code importable from this notebook.
# NOTE(review): machine-specific absolute path. The appended directory itself
# ends in `deeplearning`; for `from deeplearning.deep_utils import ...` to
# resolve, the directory that *contains* the `deeplearning` package has to be
# on sys.path — confirm against the repository layout.
sys.path.append('/mnt/0A2AAC152AABFBB7/sideProjects/deepLearning/deeplearning')
from deeplearning.deep_utils import imdb_model_setup

In [None]:
for count, units in enumerate(units_2_hidden):
    units_1st_layer, units_2nd_layer = units

    # Build and train one model per layer configuration via the project helper.
    model, history = imdb_model_setup(
        units_layer1=units_1st_layer,
        units_layer2=units_2nd_layer,
        activ_func="relu",
        X_train=partial_x_train,
        y_train=partial_y_train,
        X_val=x_val,
        y_val=y_val
    )

    # The "loss" / "accuracy" metrics logged below are measured on the test set.
    test_loss, test_accuracy = model.evaluate(x_test, y_test)

    # One MLflow run per layer configuration.
    with mlflow.start_run():
        mlflow.set_tag("model", f"Base_relu_{count}")
        # NOTE(review): epochs and batch_size are not passed to
        # imdb_model_setup; the literals logged here presumably mirror the
        # values used inside the helper — confirm against deep_utils.
        mlflow.log_params({
            "units_1st_layer": units_1st_layer,
            "units_2nd_layer": units_2nd_layer,
            "hidden_activation": 'relu',
            "epochs": 4,
            "batch_size": 512,
        })
        mlflow.log_metrics({
            "accuracy": test_accuracy,
            "loss": test_loss,
        })

    # Drop the per-iteration objects before building the next model.
    del history
    del model

# Collect the logged runs of the active experiment as a DataFrame.
runs = mlflow.search_runs()