# Hyperparameter tuning with Optuna for the Temporal Difference Variational Autoencoder (TD-VAE) model
This notebook performs a hyperparameter optimization for the Temporal Difference Variational Autoencoder (TD-VAE) model using the Optuna library. 
The TD-VAE model is a variant of the VAE model that incorporates temporal difference learning to improve the quality of the learned representations. 
The hyperparameters that will be optimized include the learning rate, batch size, and the number of training epochs. 
The optimization will be performed using the Fashion MNIST dataset.

## Imports

In [1]:
import json
import logging
import pathlib
import pickle
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import torch
import tqdm
import umap
from matplotlib import gridspec
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader

utils_path = pathlib.Path("../../utils/").resolve(strict=True)
sys.path.append(str(utils_path))

from model import TD_VAE, DBlock, Decoder, PreProcess
from prep_data import MNIST_Dataset
from rollout import rollout_func

  from .autonotebook import tqdm as notebook_tqdm


## Set Constants

In [None]:
# set constants
X_DIM = 784
PROCESSED_X_DIM = 784
OPTIM_EPOCHS = 5
NUM_TRIALS = 250
SAMPLES_PER_SEQ = 20
TIME_CONSTANTS_MAX = 16  # There are 20 frames total
TIME_JUMP_OPTIONS = [1, 2, 3, 4]  # Jump up to 4 frames away

## Set paths

In [2]:
# set path to the MNIST images
mnist_pickle_path = pathlib.Path("../../data/mnist/MNIST.pkl").resolve(strict=True)
# create the log directory if it does not exist
log_path = pathlib.Path("../log/").resolve()
log_path.mkdir(exist_ok=True)
log_file_path = pathlib.Path("../log/loginfo.txt").resolve()
# set and make the models directory
models_path = pathlib.Path("../models/").resolve()
models_path.mkdir(exist_ok=True, parents=True)
# set the path to the results directory
results_path = pathlib.Path("../results/").resolve()
results_path.mkdir(exist_ok=True, parents=True)

In [3]:
# set up logging
logger = logging.getLogger(__name__)
# make the log directory
pathlib.Path("../log").mkdir(exist_ok=True)
logging.basicConfig(
    filename=str(pathlib.Path(log_path / "training_log.log")), level=logging.INFO
)

In [4]:
with open(mnist_pickle_path, "rb") as file_handle:
    MNIST = pickle.load(file_handle)

# get the MNIST data keys
print(MNIST.keys())
MNIST["train_image"].shape

dict_keys(['train_image', 'train_label', 'test_image', 'test_label'])


(60000, 28, 28)

## Hyperparameter tuning with Optuna

### Set up the hyperparameter search space

In [5]:
# set hyperparameters search space
BATCH_SIZE_MIN = 8
BATCH_SIZE_MAX = 256
BATCH_SIZE_STEP = 8

LEARNING_RATE_MIN = 0.000001
LEARNING_RATE_MAX = 0.001
LEARNING_RATE_STEP = 0.0005

BELIEF_STATE_SIZE_MIN = 5
BELIEF_STATE_SIZE_MAX = 500

STATE_SIZE_MIN = 2
STATE_SIZE_MAX = 50

D_BLOCK_HIDDEN_SIZE_MIN = 5
D_BLOCK_HIDDEN_SIZE_MAX = 500

DECODER_HIDDEN_SIZE_MIN = 5
DECODER_HIDDEN_SIZE_MAX = 500

OPTIMIZING_EPOCHS = 25
OPTIMIZER_OPTIONS = ["Adam", "SGD", "RMSprop"]

SGD_MOMENTUM_MIN = 0.01
SGD_MOMENTUM_MAX = 0.99

RMSprop_ALPHA_MIN = 0.01
RMSprop_ALPHA_MAX = 0.99

RMSprop_MOMENTUM_MIN = 0.01
RMSprop_MOMENTUM_MAX = 0.99

RMSprop_EPSILON_MIN = 0.0001
RMSprop_EPSILON_MAX = 0.1

N_LAYERS_MIN = 1
N_LAYERS_MAX = 15

### Define the objective function for optimization

In [6]:
# optuna objective


def objective(trial: optuna.Trial) -> float:
    """
    Optuna objective function for hyperparameter search

    Parameters
    ----------
    trial: optuna.Trial
        The optuna trial object

    Returns
    -------
    float
        The loss value for the model or the metric to be optimized
    """
    # sample hyperparameters
    batch_size = trial.suggest_int(
        "batch_size", BATCH_SIZE_MIN, BATCH_SIZE_MAX, step=BATCH_SIZE_STEP
    )
    learning_rate = trial.suggest_float(
        "learning_rate", LEARNING_RATE_MIN, LEARNING_RATE_MAX
    )
    belief_state_size = trial.suggest_int(
        "belief_state_size", BELIEF_STATE_SIZE_MIN, BELIEF_STATE_SIZE_MAX
    )
    state_size = trial.suggest_int("state_size", STATE_SIZE_MIN, STATE_SIZE_MAX)
    d_block_hidden_size = trial.suggest_int(
        "d_block_hidden_size", D_BLOCK_HIDDEN_SIZE_MIN, D_BLOCK_HIDDEN_SIZE_MAX
    )
    decoder_hidden_size = trial.suggest_int(
        "decoder_hidden_size", DECODER_HIDDEN_SIZE_MIN, DECODER_HIDDEN_SIZE_MAX
    )
    optimizer = trial.suggest_categorical("optimizer", OPTIMIZER_OPTIONS)
    n_layers = trial.suggest_int("n_layers", N_LAYERS_MIN, N_LAYERS_MAX)

    # set up the model
    model = TD_VAE(
        x_size=X_DIM,
        processed_x_size=PROCESSED_X_DIM,
        b_size=belief_state_size,
        z_size=state_size,
        d_block_hidden_size=d_block_hidden_size,
        decoder_hidden_size=decoder_hidden_size,
        layers=n_layers,
        samples_per_seq=SAMPLES_PER_SEQ,
        t_diff_min=1,
        t_diff_max=TIME_CONSTANTS_MAX,
    )
    # model.to_device()
    model.cuda()

    # set up the optimizer
    if optimizer == "SGD":
        momentum = trial.suggest_float("momentum", SGD_MOMENTUM_MIN, SGD_MOMENTUM_MAX)
        optimizer_kwargs = {"momentum": momentum}
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, **optimizer_kwargs)
    elif optimizer == "RMSprop":
        alpha = trial.suggest_float("alpha", RMSprop_ALPHA_MIN, RMSprop_ALPHA_MAX)
        momentum = trial.suggest_float(
            "momentum", RMSprop_MOMENTUM_MIN, RMSprop_MOMENTUM_MAX
        )
        eps = trial.suggest_float("epsilon", RMSprop_EPSILON_MIN, RMSprop_EPSILON_MAX)
        optimizer_kwargs = {"alpha": alpha, "momentum": momentum, "eps": eps}
        optimizer = optim.RMSprop(
            model.parameters(), lr=learning_rate, **optimizer_kwargs
        )
    elif optimizer == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    else:
        raise ValueError("Invalid optimizer")

    # set up the data loader
    train_data = MNIST_Dataset(MNIST["train_image"], MNIST["train_label"])
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    # train the model

    for epoch in range(OPTIM_EPOCHS):
        epoch_loss = 0

        for batch, (idx, images) in enumerate(train_loader):
            batch_counter = 0
            batch_loss = 0
            images = images["image"].cuda()
            # Make a forward step of preprocessing and LSTM
            forward_return_tuple = model.forward(images)

            # Randomly sample a time step and jumpy step
            t_1 = np.random.choice(TIME_CONSTANTS_MAX)
            t_2 = t_1 + np.random.choice(TIME_JUMP_OPTIONS)

            # Calculate loss function based on two time points
            loss, bce_diff, kl_div_qs_pb, kl_shift_qb_pt, bce_optimal = (
                model.calculate_loss(forward_return_tuple)
            )
            if loss.isnan():
                print("loss is nan")
                pass
            elif loss.isinf():
                print("loss is inf")
                pass
            elif loss.item() == 0:
                print("loss is zero")
                pass
            elif loss.item() < 0:
                print("loss is negative")
                pass
            elif loss.item() > 0:
                batch_counter += 1
                batch_loss += loss.item()
                # must clear out stored gradient
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        epoch_loss += batch_loss / batch_counter
    return epoch_loss

### Run the hyperparameter optimization

In [7]:
hyperparameter_search_path = pathlib.Path("../hyperparameter_search").resolve()
hyperparameter_search_path.mkdir(exist_ok=True, parents=True)
# set up the optuna study
study = optuna.create_study(
    direction="minimize",
    study_name="TD_VAE_hyperparameter_search",
    storage="sqlite:///../hyperparameter_search/td_vae_hyperparameter_search.db",
    load_if_exists=True,
    sampler=optuna.samplers.TPESampler(seed=0),
)

study.optimize(objective, n_trials=NUM_TRIALS)

# get best hyperparameters
best_params = study.best_params
best_params["best_loss"] = study.best_value
print(best_params)

# save the best hyperparameters and the best model
best_params_path = pathlib.Path(models_path / "best_params.json")
with open(best_params_path, "w") as file_handle:
    json.dump(best_params, file_handle)

[I 2024-08-16 08:43:48,389] Using an existing study with name 'TD_VAE_hyperparameter_search' instead of creating a new one.
[I 2024-08-16 08:46:27,363] Trial 4 finished with value: 199.5407257080078 and parameters: {'batch_size': 144, 'learning_rate': 0.0007154741770060472, 'belief_state_size': 303, 'state_size': 28, 'd_block_hidden_size': 215, 'decoder_hidden_size': 325, 'optimizer': 'RMSprop', 'n_layers': 6, 'alpha': 0.7858905373210113, 'momentum': 0.5283170213578464, 'epsilon': 0.05684765165328384}. Best is trial 3 with value: 139.24745178222656.


{'batch_size': 144, 'learning_rate': 0.0007154741770060472, 'belief_state_size': 303, 'state_size': 28, 'd_block_hidden_size': 215, 'decoder_hidden_size': 325, 'optimizer': 'RMSprop', 'n_layers': 6, 'alpha': 0.7858905373210113, 'momentum': 0.5283170213578464, 'epsilon': 0.05684765165328384, 'best_loss': 139.24745178222656}
