In [12]:
import gym
import numpy as np
from typing import Any, Dict
import logging
import sys


from stable_baselines3 import PPO, A2C, SAC, TD3, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise


#from Custom_functions import CustomEnv2

from sb3_contrib import QRDQN, TQC

import torch
import torch.nn as nn


import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances



In [13]:
N_TRIALS = 30  # Maximum number of trials  was originally 100
N_JOBS = 1 # Number of jobs to run in parallel
N_STARTUP_TRIALS = 5  # Stop random sampling after N_STARTUP_TRIALS
N_EVALUATIONS = 2  # Number of evaluations during the training
N_TIMESTEPS = 50_000  # Training budget
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_ENVS = 5
N_EVAL_EPISODES = 10
TIMEOUT = int(60*60*24*4)          # 4 days         # int(60 * 15)  # 15 minutes

ENV_ID = "MyEnv-v0"

DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "env": ENV_ID,
}


Search space:

In [26]:
def sample_td3_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for TD3 hyperparams.
    :param trial:
    :return:
    """
    print("sample td3 is run now")
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048])
    buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
    # Polyak coeff
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])

    train_freq = trial.suggest_categorical("train_freq", [1, 4, 8, 16, 32, 64, 128, 256, 512])
    gradient_steps = train_freq

    noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)

    # NOTE: Add "verybig" to net_arch when tuning HER
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])

    net_arch = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
        # Uncomment for tuning HER
        # "verybig": [256, 256, 256],
    }[net_arch]

    hyperparams = {
        "gamma": gamma,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "buffer_size": buffer_size,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "policy_kwargs": dict(net_arch=net_arch),
        "tau": tau,
    }

    if noise_type == "normal":
        hyperparams["action_noise"] = NormalActionNoise(
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
        )
    elif noise_type == "ornstein-uhlenbeck":
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
        )

    # if trial.using_her_replay_buffer:
    #     hyperparams = sample_her_params(trial, hyperparams)

    return hyperparams

In [27]:
def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for A2C hyperparameters.

    :param trial: Optuna trial object
    :return: The sampled hyperparameters for the given trial.
    """
    # Discount factor between 0.9 and 0.9999
    gamma = 1.0 - trial.suggest_float("gamma", 0.000001, 0.1, log=True)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 5.0, log=True)
    # 8, 16, 32, ... 1024
    n_steps = 2 ** trial.suggest_int("exponent_n_steps", 3, 10)

    ### YOUR CODE HERE
    # TODO:
    # - define the learning rate search space [1e-5, 1] (log) -> `suggest_float`
    # - define the network architecture search space ["tiny", "small"] -> `suggest_categorical`
    # - define the activation function search space ["tanh", "relu"]
    learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)
    net_arch = trial.suggest_categorical("net_arch", ["tiny", "small"])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    ### END OF YOUR CODE

    # Display true values
    trial.set_user_attr("gamma_", gamma)
    trial.set_user_attr("n_steps", n_steps)

    net_arch = [
        {"pi": [64], "vf": [64]}
        if net_arch == "tiny"
        else {"pi": [64, 64], "vf": [64, 64]}
    ]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "gamma": gamma,
        "learning_rate": learning_rate,
        "max_grad_norm": max_grad_norm,
        "policy_kwargs": {
            "net_arch": net_arch,
            "activation_fn": activation_fn,
        },
    }

Define objective function

First we define a custom callback to report the results of periodic evaluations to Optuna:

In [28]:
class TrialEvalCallback(EvalCallback):
    """
    Callback used for evaluating and reporting a trial.
    
    :param eval_env: Evaluation environement
    :param trial: Optuna trial object
    :param n_eval_episodes: Number of evaluation episodes
    :param eval_freq:   Evaluate the agent every ``eval_freq`` call of the callback.
    :param deterministic: Whether the evaluation should
        use a stochastic or deterministic policy.
    :param verbose:
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):

        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Evaluate policy (done in the parent class)
            super()._on_step()
            self.eval_idx += 1
            # Send report to Optuna
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True

Then we define the objective function that is in charge of sampling hyperparameters, creating the model and then returning the result to Optuna

In [29]:
def objective(trial: optuna.Trial) -> float:
    """
    Objective function using by Optuna to evaluate
    one configuration (i.e., one set of hyperparameters).

    Given a trial object, it will sample hyperparameters,
    evaluate it and report the result (mean episodic reward after training)

    :param trial: Optuna trial object
    :return: Mean episodic reward after training
    """

    kwargs = DEFAULT_HYPERPARAMS.copy()
    ### YOUR CODE HERE
    # TODO: 
    # 1. Sample hyperparameters and update the default keyword arguments: `kwargs.update(other_params)`
    # 2. Create the evaluation envs
    # 3. Create the `TrialEvalCallback`

    # 1. Sample hyperparameters and update the keyword arguments
    #kwargs.update(sample_a2c_params(trial))
    kwargs.update(sample_td3_params(trial))

    # Create the RL model
    #model = A2C(**kwargs)
    model = TD3(**kwargs)

    # 2. Create envs used for evaluation using `make_vec_env`, `ENV_ID` and `N_EVAL_ENVS`
    eval_envs = make_vec_env(ENV_ID, N_EVAL_ENVS)
    #eval_envs = make_vec_env(lambda: env, N_EVAL_ENVS)
          #env = make_vec_env(lambda: env, n_envs=num_cpu, seed=SEED, vec_env_cls=DummyVecEnv)

    # 3. Create the `TrialEvalCallback` callback defined above that will periodically evaluate
    # and report the performance using `N_EVAL_EPISODES` every `EVAL_FREQ`
    # TrialEvalCallback signature:
    # TrialEvalCallback(eval_env, trial, n_eval_episodes, eval_freq, deterministic, verbose)
    eval_callback = TrialEvalCallback(eval_envs, 
                                      trial, 
                                      N_EVAL_EPISODES, 
                                      EVAL_FREQ, 
                                      deterministic = True)

    ### END OF YOUR CODE

    nan_encountered = False
    try:
        # Train the model
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
    finally:
        # Free memory
        model.env.close()
        eval_envs.close()

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

The optimization loop

In [6]:
# Set pytorch num threads to 1 for faster training

#torch.set_num_threads(1)
# Select the sampler, can be random, TPESampler, CMAES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3
)



# Add stream handler of stdout to show the messages
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))

study_name = "example-study"  # Unique identifier of the study.
storage_name = "sqlite:///{}.db".format(study_name)

# Create the study and start the hyperparameter optimization
study = optuna.create_study(study_name=study_name, storage=storage_name, load_if_exists=True,
                            sampler=sampler, pruner=pruner, direction="maximize")

try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    pass

print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

print("  User attrs:")
for key, value in trial.user_attrs.items():
    print(f"    {key}: {value}")

# Write report
study.trials_dataframe().to_csv("study_results_td3_floris_v0_1.csv")

# fig1 = plot_optimization_history(study)
# fig2 = plot_param_importances(study)

# fig1.show()
# fig2.show()

[32m[I 2022-09-15 16:21:40,085][0m Using an existing study with name 'example-study' instead of creating a new one.[0m


Using an existing study with name 'example-study' instead of creating a new one.


[33m[W 2022-09-15 16:41:54,753][0m Trial 4 failed because of the following error: AttributeError('rotor_area')[0m
Traceback (most recent call last):
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Marcus\AppData\Local\Temp\ipykernel_9704\1639221093.py", line 46, in objective
    model.learn(N_TIMESTEPS, callback=eval_callback)
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\stable_baselines3\a2c\a2c.py", line 197, in learn
    return super().learn(
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\stable_baselines3\common\on_policy_algorithm.py", line 248, in learn
    continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\stable_baselines3\common\on_policy_algorithm.py", line 176, in collect_rollouts
    new_obs, re

Trial 4 failed because of the following error: AttributeError('rotor_area')
Traceback (most recent call last):
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Marcus\AppData\Local\Temp\ipykernel_9704\1639221093.py", line 46, in objective
    model.learn(N_TIMESTEPS, callback=eval_callback)
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\stable_baselines3\a2c\a2c.py", line 197, in learn
    return super().learn(
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\stable_baselines3\common\on_policy_algorithm.py", line 248, in learn
    continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\stable_baselines3\common\on_policy_algorithm.py", line 176, in collect_rollouts
    new_obs, rewards, dones, infos = env.step(clipped_ac

[32m[I 2022-09-15 17:12:05,346][0m Trial 5 finished with value: -79.155482 and parameters: {'gamma': 3.404650086248227e-05, 'max_grad_norm': 0.6887783003669585, 'exponent_n_steps': 6, 'lr': 0.47732311907924363, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 5 with value: -79.155482.[0m


Trial 5 finished with value: -79.155482 and parameters: {'gamma': 3.404650086248227e-05, 'max_grad_norm': 0.6887783003669585, 'exponent_n_steps': 6, 'lr': 0.47732311907924363, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 5 with value: -79.155482.


[32m[I 2022-09-15 17:13:02,003][0m Trial 6 finished with value: -77.2378358 and parameters: {'gamma': 0.00922492394512576, 'max_grad_norm': 0.5237848569712041, 'exponent_n_steps': 6, 'lr': 0.03373916703153714, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 6 with value: -77.2378358.[0m


Trial 6 finished with value: -77.2378358 and parameters: {'gamma': 0.00922492394512576, 'max_grad_norm': 0.5237848569712041, 'exponent_n_steps': 6, 'lr': 0.03373916703153714, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 6 with value: -77.2378358.


[32m[I 2022-09-15 17:14:50,793][0m Trial 7 finished with value: -0.2511216 and parameters: {'gamma': 0.0005509560504807344, 'max_grad_norm': 3.0644518622973624, 'exponent_n_steps': 3, 'lr': 0.0001937577640685693, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 7 with value: -0.2511216.[0m


Trial 7 finished with value: -0.2511216 and parameters: {'gamma': 0.0005509560504807344, 'max_grad_norm': 3.0644518622973624, 'exponent_n_steps': 3, 'lr': 0.0001937577640685693, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 7 with value: -0.2511216.


AttributeError: rotor_area

In [30]:
# Set pytorch num threads to 1 for faster training

#torch.set_num_threads(1)
# Select the sampler, can be random, TPESampler, CMAES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3
)




In [31]:
# Add stream handler of stdout to show the messages
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))

study_name = "example-study_test"  # Unique identifier of the study.
storage_name = "sqlite:///{}.db".format(study_name)

In [32]:
# Create the study and start the hyperparameter optimization
study = optuna.create_study(study_name=study_name, storage=storage_name, load_if_exists=True,
                            sampler=sampler, pruner=pruner, direction="maximize")


[32m[I 2022-09-16 09:56:52,364][0m Using an existing study with name 'example-study_test' instead of creating a new one.[0m


Using an existing study with name 'example-study_test' instead of creating a new one.
Using an existing study with name 'example-study_test' instead of creating a new one.
Using an existing study with name 'example-study_test' instead of creating a new one.


In [33]:
study

<optuna.study.study.Study at 0x1c6c3dc1e50>

In [34]:
try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    pass

sample td3 is run now


  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
  noise_std = trial.suggest_uniform("noise_std", 0, 1)
[33m[W 2022-09-16 09:57:05,487][0m Trial 2 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Marcus\AppData\Local\Temp\ipykernel_9704\2579575912.py", line 48, in objective
    model.learn(N_TIMESTEPS, callback=eval_callback)
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\stable_baselines3\td3\td3.py", line 219, in learn
    return super().learn(
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\stable_baselines3\common\off_policy_algorithm.py", line 347, in learn
    rollout = self.collect_rollouts(
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\stable_baselines3\common\off_policy_algorithm.py", line 580

Trial 2 failed because of the following error: KeyboardInterrupt()
Traceback (most recent call last):
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Marcus\AppData\Local\Temp\ipykernel_9704\2579575912.py", line 48, in objective
    model.learn(N_TIMESTEPS, callback=eval_callback)
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\stable_baselines3\td3\td3.py", line 219, in learn
    return super().learn(
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\stable_baselines3\common\off_policy_algorithm.py", line 347, in learn
    rollout = self.collect_rollouts(
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\stable_baselines3\common\off_policy_algorithm.py", line 580, in collect_rollouts
    new_obs, rewards, dones, infos = env.step(actions)
  File "C:\Users\Marcus\anaconda3\envs\thesis\lib\site-packages\stable_baselines3\common

In [None]:






print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

print("  User attrs:")
for key, value in trial.user_attrs.items():
    print(f"    {key}: {value}")

# Write report
study.trials_dataframe().to_csv("study_results_td3_floris_v0_1.csv")



In [10]:
study2 = optuna.create_study(study_name=study_name, storage=storage_name, load_if_exists=True, direction="maximize")
df = study2.trials_dataframe(attrs=("number", "value", "params", "state"))

[32m[I 2022-09-16 08:40:42,350][0m Using an existing study with name 'example-study' instead of creating a new one.[0m


Using an existing study with name 'example-study' instead of creating a new one.


In [11]:
df

Unnamed: 0,number,value,params_activation_fn,params_exponent_n_steps,params_gamma,params_lr,params_max_grad_norm,params_net_arch,state
0,0,,tanh,10,0.000647,0.024629,0.331793,tiny,RUNNING
1,1,,tanh,7,0.003299,0.023104,0.373381,tiny,RUNNING
2,2,,tanh,5,2e-06,0.464115,0.683503,small,RUNNING
3,3,,relu,9,2e-06,0.022439,2.862986,tiny,RUNNING
4,4,,tanh,3,2.3e-05,0.003144,0.671054,tiny,FAIL
5,5,-79.155482,relu,6,3.4e-05,0.477323,0.688778,small,COMPLETE
6,6,-77.237836,relu,6,0.009225,0.033739,0.523785,small,COMPLETE
7,7,-0.251122,relu,3,0.000551,0.000194,3.064452,small,COMPLETE
