# **Introduction**

This notebook performs hyperparameter sweeps over the agents implemented so far (currently the DQN agent) on Gymnasium's `CartPole-v1` environment.

# **Import Packages**

This section imports the necessary packages.

In [27]:
# import these packages:
import gymnasium as gym 
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm
import json
import csv
import os
import itertools
from cartpole_classes import DQN_Agent

# **Environment Definition**

This section defines the environment, the hyperparameter grid, and the settings that control the hyperparameter sweep.

##### Sweep parameters:

In [28]:
# which agent architecture this sweep evaluates (also names the output folder):
model_name = "DQN"

# the Gymnasium environment that every run trains on:
env = gym.make("CartPole-v1")

# search space; insertion order matters, because it fixes both the order of the
# combinations and the order of the fields in each run's directory name:
grid = dict(
    lr = [1e-5, 5e-5, 1e-4],                # learning rate α
    buffer_size = [2000, 5000, 10000],      # size of the replay buffer
    batch_size = [32, 64],                  # amount sampled from buffer
    target_update_freq = [1000, 2500],      # number of steps to update target network
    neurons = [64, 128],                    # how many neurons to have in each layer of the network
    layers = [2, 3],                        # how many layers to have in each network
)

# cartesian product of all combinations (3 * 3 * 2 * 2 * 2 * 2 = 144 runs),
# each stored as a {hyperparameter name: value} dict:
keys = tuple(grid.keys())
values = tuple(grid.values())
combos = []
for choice in itertools.product(*values):
    combos.append(dict(zip(keys, choice)))

# training budget and early-stopping rule:
training_length = 2000                          # maximum number of training episodes per run
threshold = 450                                 # desired reward threshold
consecutive_episodes = 5                        # desired consecutive number of episodes above threshold

# reporting settings:
last_n = 50                                     # how many of the final episodes to summarise
window_size = 20                                # window size for moving average of reward

# where results are written; the directory is created if it does not exist yet:
out_dir = f"sweep_results/{model_name}"
os.makedirs(out_dir, exist_ok = True)

# **Hyperparameter Sweeping**

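This section runs the sweep. For every hyperparameter combination, a fresh agent is trained one episode at a time until it scores at least `threshold` on `consecutive_episodes` consecutive episodes, or until `training_length` episodes have elapsed. Each run writes its reward history (`reward_history.csv`), summary metrics (`metrics.json`) and, if it stopped early, the Q-network weights to its own sub-directory of `out_dir`: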

In [29]:
# seed shared by every run, so that each hyperparameter combination starts from the
# same random state and differences between runs come from the hyperparameters alone:
seed = 42

# when True, a combination whose metrics.json already exists is not re-trained, which
# lets an interrupted sweep pick up where it left off; set to False to force a full re-run:
skip_completed = True

# for every combination of params:
for params in tqdm(combos, colour = "#33FF00", ncols = 100):
    # 1) create a filename and path:
    base = "_".join(f"{k}{v}" for k, v in params.items())
    run_dir = os.path.join(out_dir, base)
    os.makedirs(run_dir, exist_ok = True)

    weights_path    = os.path.join(run_dir, "model_weights.weights.h5")
    metrics_path    = os.path.join(run_dir, "metrics.json")
    rewards_path    = os.path.join(run_dir, "reward_history.csv")

    # metrics.json is the last file written for a run, so its presence marks a finished run:
    if skip_completed and os.path.exists(metrics_path):
        continue

    # 2) reset TF, re-seed the sources of randomness, and build a fresh agent.
    # The environment object is shared by all runs, so without re-seeding its RNG state
    # would carry over from one run to the next and results would depend on sweep order.
    # NOTE(review): assumes DQN_Agent draws its randomness from Python/NumPy/TensorFlow
    # and from env / env.action_space — confirm against cartpole_classes.
    tf.keras.backend.clear_session()
    tf.keras.utils.set_random_seed(seed)    # seeds Python's random, NumPy and TensorFlow
    env.reset(seed = seed)                  # re-seeds the environment's own RNG
    env.action_space.seed(seed)             # re-seeds random action sampling
    dqn_agent = DQN_Agent(env, gamma = 0.99, epsilon = 1.0, epsilon_min = 0.1, epsilon_decay = 0.999, **params)

    # 3) run episodes, record the rewards received:
    reward_history = []     # reward of every episode trained so far
    consec_counter = 0      # current streak of episodes with reward >= threshold
    stopped_ep = None       # episode at which early stopping triggered (None if it never did)
    ep_to_thresh = None     # first episode at which the moving average reached threshold

    for ep in range(1, training_length + 1):
        # train the agent for a single episode:
        reward = dqn_agent.training(1)[0]
        reward_history.append(reward)

        # moving average over the last `window_size` episodes, checked only once a
        # full window is available and only until the threshold is first reached:
        if ep_to_thresh is None and ep >= window_size:
            if np.mean(reward_history[-window_size:]) >= threshold:
                ep_to_thresh = ep

        # early stopping: count consecutive episodes at or above the threshold,
        # restarting the count whenever an episode falls short:
        if reward >= threshold:
            consec_counter += 1
            if consec_counter >= consecutive_episodes:
                # save the model, and break training loop:
                stopped_ep = ep
                dqn_agent.q_network.save_weights(weights_path)
                break
        else:
            consec_counter = 0

    # dump reward history as .csv (one header row, then one reward per row):
    with open(rewards_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["reward"])
        writer.writerows([r] for r in reward_history)

    # compute metrics:
    max_reward      = max(reward_history)           # the maximum reward received
    last_n_rewards  = reward_history[-last_n:]      # the last n rewards (fewer if the run was shorter)
    last_mean       = float(np.mean(last_n_rewards))    # mean of the last n rewards
    last_std        = float(np.std(last_n_rewards))     # std of the last n rewards

    # 4) dump run record:
    run_record = {
        "params"            : params,
        "stopped_episode"   : stopped_ep,
        "episode_to_thresh" : ep_to_thresh,
        "max_reward"        : max_reward,
        "last_mean"         : last_mean,
        "last_std"          : last_std,
        # weights are only saved when early stopping triggered:
        "weights_path"      : weights_path if stopped_ep is not None else None,
    }

    # write to file (kept as the final step so it doubles as the run's completion marker):
    with open(metrics_path, "w") as f:
        json.dump(run_record, f, indent = 2)


100%|██████████████████████████████████████████████████████████| 144/144 [7:12:24<00:00, 180.17s/it]
