In [3]:
import os

import torch as th
from torch import nn
import gymnasium as gym
import panda_gym
from sb3_contrib import TQC
from huggingface_sb3 import load_from_hub, package_to_hub
import optuna
from stable_baselines3 import A2C, PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor, FlattenExtractor
from huggingface_hub import notebook_login

In [4]:
# Gymnasium environment id passed to `make_vec_env` inside `objective`.
# NOTE(review): presumably registered as a side effect of `import panda_gym` — confirm.
env_id = "PandaPickAndPlace-v3"

In [24]:
def objective(trial: optuna.Trial):
    """Optuna objective: train one PPO agent and return its mean evaluation reward.

    Hyperparameters for the policy network (activation function, depth, width)
    and for PPO itself (learning rate, gamma, GAE lambda, entropy/value
    coefficients, gradient clipping, clip range) are sampled from ``trial``.
    The agent is trained for 1,000,000 timesteps on 24 copies of ``env_id``
    wrapped in ``VecNormalize`` and then evaluated for 10 episodes.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean episode reward reported by ``evaluate_policy`` as a plain
        ``float`` (un-normalized, because ``norm_reward`` is switched off
        before evaluation).

    Raises:
        Any exception raised by environment construction, training or
        evaluation is propagated; the environment is closed first.
    """
    # Create environment
    env = make_vec_env(env_id, n_envs=24)
    try:
        # Rebinding `env` to the wrapper means the `finally` below closes the
        # wrapper when wrapping succeeded, or the bare vec env when it did not.
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=20.)

        # Activation classes are selected by index so the recorded trial
        # parameter stays a plain integer.
        activ_functions = (th.nn.ReLU, th.nn.Tanh, th.nn.LeakyReLU)

        ### Policy
        activation_fn = trial.suggest_categorical("activation_fn", (0, 1, 2))
        policy_layers = trial.suggest_int("policy_layers", 1, 3)
        policy_neurons = trial.suggest_categorical("policy_neurons", (512, 1024, 2048))
        policy_kwargs = {
            # `policy_layers` hidden layers, each `policy_neurons` wide.
            "net_arch": [policy_neurons] * policy_layers,
            "activation_fn": activ_functions[activation_fn],
        }

        ### Model
        # NOTE(review): "lr" is sampled uniformly over two orders of magnitude;
        # consider `log=True` in a future study — left unchanged here so the
        # search space matches the trials already recorded.
        learning_rate = trial.suggest_float("lr", 1e-5, 1e-3)
        gamma = trial.suggest_float("gamma", 0.9, 0.999)
        gae_lambda = trial.suggest_float("gae_lambda", 0.8, 0.99)
        ent_coef = trial.suggest_float("ent_coef", 0.0, 0.1)
        vf_coef = trial.suggest_float("vf_coef", 0.2, 0.8)
        max_grad_norm = trial.suggest_float("max_grad_norm", 0.1, 10.0)
        clip_range = trial.suggest_float("clip_range", 0.1, 0.4)

        model = PPO(
            "MultiInputPolicy",
            env,
            policy_kwargs=policy_kwargs,
            learning_rate=learning_rate,
            gamma=gamma,
            gae_lambda=gae_lambda,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            clip_range=clip_range,
            # Rollout/optimisation sizes are fixed, not tuned.
            n_steps=2048,
            batch_size=1024,
            n_epochs=20,
            tensorboard_log="./ppo_optuna/",
        )

        ### Training
        model.learn(total_timesteps=1_000_000)

        # Evaluate model: switch the wrapper out of training mode and turn
        # reward normalization off so the score is in raw environment reward.
        env.training = False
        env.norm_reward = False
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)

        return float(mean_reward)
    finally:
        # Always release the sub-environments, even when training or
        # evaluation fails; otherwise each failed trial leaks 24 environments
        # into the running kernel.
        env.close()

In [25]:
# New Optuna study named "ppo"; larger objective values (mean reward) are better.
study = optuna.create_study(
    study_name="ppo",
    direction="maximize",
)

[I 2024-06-09 21:00:06,685] A new study created in memory with name: ppo


In [26]:
# Run six trials of `objective`; in the recorded log below each trial takes
# roughly 30-40 minutes.
study.optimize(func=objective, n_trials=6)

argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color

[I 2024-06-09 21:33:09,535] Trial 0 finished with value: -50.0 and parameters: {'activation_fn': 0, 'policy_layers': 2, 'policy_neurons': 2048, 'lr': 0.000627868301851335, 'gamma': 0.9037172705386535, 'gae_lambda': 0.8622160774525537, 'ent_coef': 0.07756672565633314, 'vf_coef': 0.5833083309123268, 'max_grad_norm': 8.855848287594695, 'clip_range': 0.15377439640007448}. Best is trial 0 with value: -50.0.


argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color

[I 2024-06-09 22:03:13,230] Trial 1 finished with value: -50.0 and parameters: {'activation_fn': 1, 'policy_layers': 2, 'policy_neurons': 1024, 'lr': 0.0009183356977373085, 'gamma': 0.9942452057702492, 'gae_lambda': 0.9344641200024421, 'ent_coef': 0.09044761785868519, 'vf_coef': 0.5672319567083997, 'max_grad_norm': 7.241472030353654, 'clip_range': 0.1297896142516951}. Best is trial 0 with value: -50.0.


argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color

[I 2024-06-09 22:35:15,808] Trial 2 finished with value: -50.0 and parameters: {'activation_fn': 1, 'policy_layers': 3, 'policy_neurons': 512, 'lr': 0.0007668786701443685, 'gamma': 0.9032209630320456, 'gae_lambda': 0.9844835984387246, 'ent_coef': 0.008989849246099723, 'vf_coef': 0.6939609703269833, 'max_grad_norm': 5.63166699398222, 'clip_range': 0.2675242926723941}. Best is trial 0 with value: -50.0.


argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color

[I 2024-06-09 23:15:57,420] Trial 3 finished with value: -50.0 and parameters: {'activation_fn': 1, 'policy_layers': 3, 'policy_neurons': 2048, 'lr': 0.0007995435731287492, 'gamma': 0.9330756167484276, 'gae_lambda': 0.9213137440425374, 'ent_coef': 0.006327951992660752, 'vf_coef': 0.23217032756000858, 'max_grad_norm': 3.507234444010423, 'clip_range': 0.18277245588111043}. Best is trial 0 with value: -50.0.


argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color

[I 2024-06-09 23:47:27,140] Trial 4 finished with value: -45.0 and parameters: {'activation_fn': 0, 'policy_layers': 3, 'policy_neurons': 1024, 'lr': 0.0008879846800549682, 'gamma': 0.9141137345124133, 'gae_lambda': 0.8074702429862561, 'ent_coef': 0.09041919981896826, 'vf_coef': 0.5869081852762454, 'max_grad_norm': 3.024514619819479, 'clip_range': 0.1270730123818508}. Best is trial 4 with value: -45.0.


argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color_green=0.21176470816135406
argv[2]=--background_color_blue=0.1764705926179886
argv[0]=--background_color_red=0.8745098114013672
argv[1]=--background_color

[I 2024-06-10 00:20:53,080] Trial 5 finished with value: -50.0 and parameters: {'activation_fn': 2, 'policy_layers': 2, 'policy_neurons': 2048, 'lr': 0.00036350568636517007, 'gamma': 0.9747128284421233, 'gae_lambda': 0.84950075638525, 'ent_coef': 0.00437396465441694, 'vf_coef': 0.2073469835068466, 'max_grad_norm': 1.9677990713072635, 'clip_range': 0.3457445455326133}. Best is trial 4 with value: -45.0.


In [27]:
# Show every trial's number, sampled parameters and objective value,
# with a blank line after each trial.
trials = study.trials
for trial in trials:
    print(
        f"Trial number: {trial.number}",
        f"Parameters: {trial.params}",
        f"Value: {trial.value}\n",
        sep="\n",
    )

Trial number: 0
Parameters: {'activation_fn': 0, 'policy_layers': 2, 'policy_neurons': 2048, 'lr': 0.000627868301851335, 'gamma': 0.9037172705386535, 'gae_lambda': 0.8622160774525537, 'ent_coef': 0.07756672565633314, 'vf_coef': 0.5833083309123268, 'max_grad_norm': 8.855848287594695, 'clip_range': 0.15377439640007448}
Value: -50.0

Trial number: 1
Parameters: {'activation_fn': 1, 'policy_layers': 2, 'policy_neurons': 1024, 'lr': 0.0009183356977373085, 'gamma': 0.9942452057702492, 'gae_lambda': 0.9344641200024421, 'ent_coef': 0.09044761785868519, 'vf_coef': 0.5672319567083997, 'max_grad_norm': 7.241472030353654, 'clip_range': 0.1297896142516951}
Value: -50.0

Trial number: 2
Parameters: {'activation_fn': 1, 'policy_layers': 3, 'policy_neurons': 512, 'lr': 0.0007668786701443685, 'gamma': 0.9032209630320456, 'gae_lambda': 0.9844835984387246, 'ent_coef': 0.008989849246099723, 'vf_coef': 0.6939609703269833, 'max_grad_norm': 5.63166699398222, 'clip_range': 0.2675242926723941}
Value: -50.0

Tr

In [28]:
# Persist the same per-trial summary (number, parameters, value) to a text
# file, one blank line between trials.
with open('optuna_trials.txt', 'w') as f:
    trials = study.trials
    for trial in trials:
        print(
            f"Trial number: {trial.number}",
            f"Parameters: {trial.params}",
            f"Value: {trial.value}\n",
            sep="\n",
            file=f,
        )
