# Hyperparameter Tuning PPO Notebook 

This notebook contains the hyperparameter tuning for the PPO model.

## Imports

In [1]:
!pip install vector-velocity-gym

from gym_vectorvelocity import VectorVelocityEnv
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
import numpy as np

from gymnasium import make
from stable_baselines3 import PPO
import optuna
from optuna.trial import Trial

Collecting vector-velocity-gym
  Downloading vector_velocity_gym-0.1.0-py3-none-any.whl.metadata (4.7 kB)
Collecting pygame (from vector-velocity-gym)
  Downloading pygame-2.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading vector_velocity_gym-0.1.0-py3-none-any.whl (25.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.6/25.6 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pygame-2.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pygame, vector-velocity-gym
Successfully installed pygame-2.6.0 vector-velocity-gym-0.1.0
pygame 2.6.0 (SDL 2.28.4, Python 3.10.13)
Hello from the pygame community. https://www.pygame.org/contribute.html


2024-07-25 15:02:42.316754: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-25 15:02:42.316863: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-25 15:02:42.455366: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Configurations

In [2]:
############### Environment ###############

MODEL_NAME = "tuned_ppo"  # name of the log folder and the model zip file
USE_MULTIPROCESSING = True  # train on a vec env (several env copies) or on a single gym environment
USE_VERBOSE = True

NUMBER_OF_ENVS = 10  # only used if USE_MULTIPROCESSING is True
# Per-environment step budget. Kept as an int because it is a step count
# that is later handed to `model.learn(total_timesteps=...)`.
TOTAL_TIMESTEPS = 100_000


LOG_DIR = "logs"  # name of the folder where the logs will be stored

# env modifications if needed
GAMEOVER_PENALTY = 75
MISSED_COIN_PENALTY = 10

DODGED_OBSTACLE_REWARD = 1
COLLECTED_COIN_REWARD = 15

# With a vec env the total number of time steps becomes
# NUMBER_OF_ENVS * TOTAL_TIMESTEPS. Comment this block out if you do not
# want that scaling; the value set above is then used unchanged.
if USE_MULTIPROCESSING:
    TOTAL_TIMESTEPS = NUMBER_OF_ENVS * TOTAL_TIMESTEPS


############### HYPERPARAMETERS  ###############
# Each MIN_*/MAX_* pair is the inclusive search range that the Optuna
# objective samples from.

# epochs
MIN_EPOCHS = 5
MAX_EPOCHS = 30

# learning rate
MIN_LR = 1e-5
MAX_LR = 0.09801

# number of steps to take
MIN_N_STEPS = 1024
MAX_N_STEPS = 3072

# GAMMA
MIN_GAMMA = 0.8
MAX_GAMMA = 0.99

# entropy coefficient
MIN_ENT_COEFF = 0.0001
MAX_ENT_COEFF = 0.0991

# clip range
MIN_CLIP_RANGE = 0.1
MAX_CLIP_RANGE = 0.3

# gae lambda
MIN_GAE_LAMBDA = 0.8
MAX_GAE_LAMBDA = 0.99



## Environment Setup

### Environment creation and checking

In [3]:
def create_env():
    """Return a new VectorVelocityEnv configured with the notebook's reward settings.

    The penalty and reward attributes of the freshly constructed environment
    are overwritten with the constants from the configuration cell.
    """
    game_env = VectorVelocityEnv()
    reward_settings = {
        "coin_missed_penalty": MISSED_COIN_PENALTY,
        "game_over_penalty": GAMEOVER_PENALTY,
        "dodged_obstacle_reward": DODGED_OBSTACLE_REWARD,
        "coin_reward": COLLECTED_COIN_REWARD,
    }
    for attribute_name, attribute_value in reward_settings.items():
        setattr(game_env, attribute_name, attribute_value)
    return game_env


# Build one instance and run stable-baselines3's check_env on it.
env = create_env()
check_env(env)

In [4]:
# Build the environment that is handed to PPO.
# gymnasium.make() resolves a registered environment id (or EnvSpec); it
# cannot take a factory function, so the single-environment branch calls
# create_env() directly instead.
if USE_MULTIPROCESSING:
    env = make_vec_env(create_env, n_envs=NUMBER_OF_ENVS)
else:
    env = create_env()

### Hyperparameter Objective

In [5]:

def objective(trial: Trial):
    """Train PPO with hyperparameters sampled from `trial` and return its score.

    Args:
        trial: Optuna trial used to sample the hyperparameters and to name
            the training run.

    Returns:
        The mean episode reward reported by `evaluate_policy` over 20
        deterministic evaluation episodes (higher is better).
    """
    # Sample this trial's hyperparameters from the ranges in the config cell.
    learning_rate = trial.suggest_float("learning_rate", MIN_LR, MAX_LR, step=0.001)
    ent_coef = trial.suggest_float("ent_coef", MIN_ENT_COEFF, MAX_ENT_COEFF, step=0.001)
    gamma = trial.suggest_float("gamma", MIN_GAMMA, MAX_GAMMA, step=0.01)
    n_steps = trial.suggest_int("n_steps", MIN_N_STEPS, MAX_N_STEPS, step=128)
    clip_range = trial.suggest_float("clip_range", MIN_CLIP_RANGE, MAX_CLIP_RANGE, step=0.1)
    gae_lambda = trial.suggest_float("gae_lambda", MIN_GAE_LAMBDA, MAX_GAE_LAMBDA, step=0.01)
    epochs = trial.suggest_int("epochs", MIN_EPOCHS, MAX_EPOCHS, step=1)

    # Every trial gets its own environment. The study runs several trials
    # concurrently (n_jobs > 1), and a single shared environment would be
    # stepped and reset by all of them at once.
    if USE_MULTIPROCESSING:
        trial_env = make_vec_env(create_env, n_envs=NUMBER_OF_ENVS)
    else:
        trial_env = create_env()

    try:
        model = PPO("MultiInputPolicy",
                    trial_env, verbose=USE_VERBOSE,
                    learning_rate=learning_rate,
                    ent_coef=ent_coef,
                    gamma=gamma,
                    n_steps=n_steps,
                    clip_range=clip_range,
                    gae_lambda=gae_lambda,
                    batch_size=64,
                    n_epochs=epochs,
                    # tb_log_name below only takes effect when a TensorBoard
                    # directory is configured on the model.
                    tensorboard_log=LOG_DIR,
                    )

        # Training the model (the step budget is a count, so pass an int)
        model.learn(total_timesteps=int(TOTAL_TIMESTEPS),
                    tb_log_name=f"{MODEL_NAME}_{trial.number}")

        # Evaluate on the environment as wrapped by the model, so the
        # evaluation sees the same wrappers that were used for training.
        mean_reward, _ = evaluate_policy(model, model.get_env(),
                                         n_eval_episodes=20,
                                         deterministic=True)
    finally:
        # NOTE(review): assumes VectorVelocityEnv.close() only releases this
        # instance's resources and does not tear down state shared with
        # environments of trials that are still running - confirm.
        trial_env.close()

    # No trial.report()/should_prune() here: the only value available is the
    # final score after training has already finished, so pruning could not
    # save any compute and would only discard a completed result.
    return mean_reward

In [6]:
# Trials are persisted in a local SQLite file so results survive the kernel.
storage_url = "sqlite:///hyperparameter_tuning_data.db"

# load_if_exists=True makes this cell re-runnable: because the study name is
# fixed and the storage is persistent, creating it a second time would
# otherwise raise a duplicated-study error. A re-run resumes the stored study.
study = optuna.create_study(direction="maximize",
                            storage=storage_url,
                            study_name="kaggle_tune_2",
                            load_if_exists=True)

# NOTE(review): timeout is in seconds. In the recorded run below only trials
# 0-3 finished, each after several hours, which suggests the 600 s timeout
# expires before any trial beyond the first n_jobs batch can start. Raise the
# timeout (or lower TOTAL_TIMESTEPS) if all n_trials are meant to run.
study.optimize(objective,
               n_trials=10,
               timeout=600,
               n_jobs=4,
               )


best_params = study.best_params

print("Best HP Parameters: ", best_params)

[I 2024-07-25 15:02:54,940] A new study created in RDB with name: kaggle_tune_2


Using cuda device
Using cuda device
Using cuda device
Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 431      |
|    ep_rew_mean     | -62.6    |
| time/              |          |
|    fps             | 160      |
|    iterations      | 1        |
|    time_elapsed    | 71       |
|    total_timesteps | 11520    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 472      |
|    ep_rew_mean     | -56.4    |
| time/              |          |
|    fps             | 163      |
|    iterations      | 1        |
|    time_elapsed    | 78       |
|    total_timesteps | 12800    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 408      |
|    ep_rew_mean     | -61.6    |
| time/              |          |
|    fps             | 196      |
|    iterations      | 1        |
|    time_

[I 2024-07-25 18:38:28,967] Trial 3 finished with value: -67.8381423 and parameters: {'learning_rate': 0.05901000000000001, 'ent_coef': 0.0671, 'gamma': 0.8500000000000001, 'n_steps': 2048, 'clip_range': 0.2, 'gae_lambda': 0.9400000000000001, 'epochs': 14}. Best is trial 3 with value: -67.8381423.


--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 367      |
|    ep_rew_mean          | -56.7    |
| time/                   |          |
|    fps                  | 63       |
|    iterations           | 64       |
|    time_elapsed         | 12957    |
|    total_timesteps      | 819200   |
| train/                  |          |
|    approx_kl            | 0.0      |
|    clip_fraction        | 0        |
|    clip_range           | 0.1      |
|    entropy_loss         | -6.6e-23 |
|    explained_variance   | 0        |
|    learning_rate        | 0.026    |
|    loss                 | 0.579    |
|    n_updates            | 1197     |
|    policy_gradient_loss | 3.11e-11 |
|    value_loss           | 56.7     |
--------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 488       |
|    ep_rew_mean          | 102       |
| time/              

[I 2024-07-25 18:45:12,408] Trial 2 finished with value: -63.91821565 and parameters: {'learning_rate': 0.011009999999999999, 'ent_coef': 0.040100000000000004, 'gamma': 0.9400000000000001, 'n_steps': 1152, 'clip_range': 0.1, 'gae_lambda': 0.92, 'epochs': 15}. Best is trial 2 with value: -63.91821565.


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 414       |
|    ep_rew_mean          | -41.3     |
| time/                   |           |
|    fps                  | 64        |
|    iterations           | 67        |
|    time_elapsed         | 13365     |
|    total_timesteps      | 857600    |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.1       |
|    entropy_loss         | -6.55e-23 |
|    explained_variance   | 0         |
|    learning_rate        | 0.026     |
|    loss                 | 7.46      |
|    n_updates            | 1254      |
|    policy_gradient_loss | -3.84e-11 |
|    value_loss           | 55.6      |
---------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 463       |
|    ep_rew_mean          | -1.97     |


[I 2024-07-25 19:07:08,333] Trial 0 finished with value: -59.069939850000004 and parameters: {'learning_rate': 0.026010000000000002, 'ent_coef': 0.0551, 'gamma': 0.98, 'n_steps': 1280, 'clip_range': 0.1, 'gae_lambda': 0.8200000000000001, 'epochs': 19}. Best is trial 0 with value: -59.069939850000004.


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 547       |
|    ep_rew_mean          | -0.159    |
| time/                   |           |
|    fps                  | 63        |
|    iterations           | 43        |
|    time_elapsed         | 14725     |
|    total_timesteps      | 935680    |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -1.08e-10 |
|    explained_variance   | 0         |
|    learning_rate        | 0.00701   |
|    loss                 | 7.82      |
|    n_updates            | 924       |
|    policy_gradient_loss | 1.21e-10  |
|    value_loss           | 29        |
---------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 553       |
|    ep_rew_mean          | -1.56     |


[I 2024-07-25 19:16:12,097] Trial 1 finished with value: -17.410992549999996 and parameters: {'learning_rate': 0.00701, 'ent_coef': 0.0011, 'gamma': 0.8, 'n_steps': 2176, 'clip_range': 0.2, 'gae_lambda': 0.99, 'epochs': 22}. Best is trial 1 with value: -17.410992549999996.


Best HP Parameters:  {'learning_rate': 0.00701, 'ent_coef': 0.0011, 'gamma': 0.8, 'n_steps': 2176, 'clip_range': 0.2, 'gae_lambda': 0.99, 'epochs': 22}
