In [3]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.evaluation import evaluate_policy
import optuna

In [4]:
def make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight):
    """
    Build a Gymnasium "HalfCheetah-v5" environment with custom settings.

    Args:
        reset_noise_scale (float): Magnitude of the noise applied at the start of each episode.
        forward_reward_weight (float): Weight of the reward for moving forward.
        ctrl_cost_weight (float): Weight of the control cost (energy spent to move).

    Returns:
        gym.Env: The environment created with the given parameters.
    """
    # Collect the overrides in one mapping and forward them unchanged to gym.make.
    env_kwargs = {
        "reset_noise_scale": reset_noise_scale,
        "forward_reward_weight": forward_reward_weight,
        "ctrl_cost_weight": ctrl_cost_weight,
    }
    return gym.make("HalfCheetah-v5", **env_kwargs)


def objective(trial):
    """
    Optuna objective: sample a configuration, train PPO with it, and return the evaluation score.

    Each call samples environment parameters and PPO hyperparameters from `trial`,
    trains a PPO agent for 150000 timesteps on the resulting environment, and then
    evaluates the deterministic policy over 100 episodes.

    Args:
        trial (optuna.Trial): Object used to sample the parameters for this trial.

    Returns:
        float: Mean episode reward reported by `evaluate_policy` after training.
    """
    # Environment parameters explored by Optuna.
    # NOTE(review): forward_reward_weight and ctrl_cost_weight are passed to the same
    # environment that is later used for evaluation, so they presumably change the reward
    # being maximized; scores of different trials may not be directly comparable — confirm
    # this is intended.
    reset_noise_scale = trial.suggest_float('reset_noise_scale', 0.005, 0.3)  # Noise at episode start
    forward_reward_weight = trial.suggest_float('forward_reward_weight', 0.7, 1.5)  # Forward reward weight
    ctrl_cost_weight = trial.suggest_float('ctrl_cost_weight', 0.02, 0.3)  # Control cost weight

    # Build the environment with the sampled parameters. A dedicated name is used so the
    # factory lambda below does not close over a variable that gets rebound afterwards.
    base_env = make_env(reset_noise_scale, forward_reward_weight, ctrl_cost_weight)
    env = base_env  # whatever `env` points to is closed in `finally`, even if wrapping fails
    try:
        env = DummyVecEnv([lambda: base_env])  # Stable-Baselines3 expects a vectorized env
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)  # Normalize obs and rewards

        # PPO hyperparameters explored by Optuna.
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # (same log-uniform range, no deprecation warning).
        learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-3, log=True)
        n_steps = trial.suggest_int('n_steps', 1024, 16384, step=1024)  # Rollout length per update
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256, 512, 1024])  # Minibatch size
        gamma = trial.suggest_float('gamma', 0.97, 0.9999)  # Discount factor
        gae_lambda = trial.suggest_float('gae_lambda', 0.85, 1.0)  # GAE bias/variance trade-off
        clip_range = trial.suggest_float('clip_range', 0.1, 0.35)  # PPO clipping range
        ent_coef = trial.suggest_float('ent_coef', 1e-5, 0.05)  # Entropy coefficient

        # Create and train the PPO model with the sampled hyperparameters.
        model = PPO("MlpPolicy", env,
                    learning_rate=learning_rate,
                    n_steps=n_steps,
                    batch_size=batch_size,
                    gamma=gamma,
                    gae_lambda=gae_lambda,
                    clip_range=clip_range,
                    ent_coef=ent_coef,
                    verbose=0)  # Keep training silent while Optuna runs
        model.learn(total_timesteps=150000)

        # Freeze the normalization statistics and report raw (un-normalized) rewards
        # during the final evaluation.
        env.training = False
        env.norm_reward = False

        # Evaluate the trained policy over 100 episodes.
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=100, deterministic=True)
    finally:
        # Release the environment so repeated trials do not accumulate open environments.
        env.close()

    print(f'Mean reward: {mean_reward}')
    return mean_reward  # The study maximizes this value


# Create the Optuna study (the objective's mean reward is maximized) and run the search.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)  # 100 trials in total

# Report the best parameter set found by the study.
print("Best hyperparameters: ", study.best_params)



[I 2025-02-20 19:45:15,239] A new study created in memory with name: no-name-1892ba40-77bc-4286-95f5-30555f36ed5a
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-3)  # Range più ampio
[I 2025-02-20 19:46:02,765] Trial 0 finished with value: -169.96075044236377 and parameters: {'reset_noise_scale': 0.005407723526896945, 'forward_reward_weight': 1.265691657276868, 'ctrl_cost_weight': 0.10597432710663796, 'learning_rate': 3.2392866668478713e-05, 'n_steps': 14336, 'batch_size': 512, 'gamma': 0.9967491942949904, 'gae_lambda': 0.8770345379385267, 'clip_range': 0.16007754591201798, 'ent_coef': 0.009780012810592625}. Best is trial 0 with value: -169.96075044236377.


Mean reward: -169.96075044236377


[I 2025-02-20 19:46:48,810] Trial 1 finished with value: 728.330663225758 and parameters: {'reset_noise_scale': 0.0666162714817425, 'forward_reward_weight': 1.3745629927321112, 'ctrl_cost_weight': 0.1134428355179037, 'learning_rate': 0.0003689745029262415, 'n_steps': 5120, 'batch_size': 512, 'gamma': 0.9969248931970972, 'gae_lambda': 0.8713390232527614, 'clip_range': 0.2519626812357027, 'ent_coef': 0.012319628051307801}. Best is trial 1 with value: 728.330663225758.


Mean reward: 728.330663225758


[I 2025-02-20 19:47:42,259] Trial 2 finished with value: 1055.501501187583 and parameters: {'reset_noise_scale': 0.14229291483494422, 'forward_reward_weight': 0.8155771709086337, 'ctrl_cost_weight': 0.033211708592419266, 'learning_rate': 0.0001147387479895581, 'n_steps': 1024, 'batch_size': 128, 'gamma': 0.9849348980072848, 'gae_lambda': 0.8932729333364544, 'clip_range': 0.3016255849216237, 'ent_coef': 0.011696879105870743}. Best is trial 2 with value: 1055.501501187583.


Mean reward: 1055.501501187583


[I 2025-02-20 19:48:46,438] Trial 3 finished with value: 673.93177391838 and parameters: {'reset_noise_scale': 0.05571373698600266, 'forward_reward_weight': 1.4305890073283538, 'ctrl_cost_weight': 0.24869133279296327, 'learning_rate': 0.0007324754742474734, 'n_steps': 1024, 'batch_size': 64, 'gamma': 0.9744498635658839, 'gae_lambda': 0.9580923384202017, 'clip_range': 0.3335086409569408, 'ent_coef': 0.040536006898663754}. Best is trial 2 with value: 1055.501501187583.


Mean reward: 673.93177391838


[I 2025-02-20 19:49:32,379] Trial 4 finished with value: -0.4784573489227744 and parameters: {'reset_noise_scale': 0.03787936306465215, 'forward_reward_weight': 1.350124192549429, 'ctrl_cost_weight': 0.056255261133448954, 'learning_rate': 7.307538215498785e-06, 'n_steps': 5120, 'batch_size': 512, 'gamma': 0.9917954505696055, 'gae_lambda': 0.9545545096143995, 'clip_range': 0.2869466887142775, 'ent_coef': 0.02327567237924816}. Best is trial 2 with value: 1055.501501187583.


Mean reward: -0.4784573489227744


[I 2025-02-20 19:50:18,430] Trial 5 finished with value: -0.5007458535382061 and parameters: {'reset_noise_scale': 0.1214279617830776, 'forward_reward_weight': 1.466064190987511, 'ctrl_cost_weight': 0.23664558306089467, 'learning_rate': 1.0779657367492772e-06, 'n_steps': 4096, 'batch_size': 512, 'gamma': 0.9945909417188032, 'gae_lambda': 0.9376903350698377, 'clip_range': 0.13596824014264247, 'ent_coef': 0.04475513633694537}. Best is trial 2 with value: 1055.501501187583.


Mean reward: -0.5007458535382061


[I 2025-02-20 19:51:12,409] Trial 6 finished with value: -248.49286366550638 and parameters: {'reset_noise_scale': 0.19644718167395117, 'forward_reward_weight': 1.317257862108006, 'ctrl_cost_weight': 0.23366530268345534, 'learning_rate': 5.131757114916681e-05, 'n_steps': 3072, 'batch_size': 128, 'gamma': 0.9774612332039696, 'gae_lambda': 0.992251751743408, 'clip_range': 0.1435515923349044, 'ent_coef': 0.010483555303897305}. Best is trial 2 with value: 1055.501501187583.


Mean reward: -248.49286366550638


[I 2025-02-20 19:52:06,836] Trial 7 finished with value: 116.8750243410172 and parameters: {'reset_noise_scale': 0.2827714169226681, 'forward_reward_weight': 0.8602713072718766, 'ctrl_cost_weight': 0.1961692098387186, 'learning_rate': 0.0009360911005751743, 'n_steps': 1024, 'batch_size': 128, 'gamma': 0.9994867134951445, 'gae_lambda': 0.8963675778787956, 'clip_range': 0.1556056711551906, 'ent_coef': 0.025172318599561545}. Best is trial 2 with value: 1055.501501187583.


Mean reward: 116.8750243410172


[I 2025-02-20 19:52:52,113] Trial 8 finished with value: -60.54416204578847 and parameters: {'reset_noise_scale': 0.1717501540921622, 'forward_reward_weight': 1.3675407375031667, 'ctrl_cost_weight': 0.24005681167294313, 'learning_rate': 0.00012230685446803545, 'n_steps': 8192, 'batch_size': 1024, 'gamma': 0.9974066982788812, 'gae_lambda': 0.8910414941582505, 'clip_range': 0.29911259842517196, 'ent_coef': 0.003227654596315968}. Best is trial 2 with value: 1055.501501187583.


Mean reward: -60.54416204578847


[I 2025-02-20 19:53:40,638] Trial 9 finished with value: -1.3120269275636278 and parameters: {'reset_noise_scale': 0.28935082336786233, 'forward_reward_weight': 1.0898472583964336, 'ctrl_cost_weight': 0.18763529656716357, 'learning_rate': 1.280013061732392e-06, 'n_steps': 15360, 'batch_size': 256, 'gamma': 0.9849384495318054, 'gae_lambda': 0.8584897795201876, 'clip_range': 0.25716623756119383, 'ent_coef': 0.035930774812185706}. Best is trial 2 with value: 1055.501501187583.


Mean reward: -1.3120269275636278


[I 2025-02-20 19:55:07,481] Trial 10 finished with value: -119.21095312065496 and parameters: {'reset_noise_scale': 0.21242931066560605, 'forward_reward_weight': 0.7150132255271713, 'ctrl_cost_weight': 0.038026116011714295, 'learning_rate': 7.606568964436744e-06, 'n_steps': 11264, 'batch_size': 32, 'gamma': 0.9839618002619109, 'gae_lambda': 0.9045287666133063, 'clip_range': 0.20452314098141847, 'ent_coef': 0.022839928134801086}. Best is trial 2 with value: 1055.501501187583.


Mean reward: -119.21095312065496


[I 2025-02-20 19:56:00,386] Trial 11 finished with value: -91.16861562468591 and parameters: {'reset_noise_scale': 0.11103696585754197, 'forward_reward_weight': 1.0996980296586303, 'ctrl_cost_weight': 0.10804996114166993, 'learning_rate': 0.00019483838080736442, 'n_steps': 7168, 'batch_size': 128, 'gamma': 0.990697433425248, 'gae_lambda': 0.8512724859760127, 'clip_range': 0.22486444097539898, 'ent_coef': 0.014246817063991194}. Best is trial 2 with value: 1055.501501187583.


Mean reward: -91.16861562468591


[I 2025-02-20 19:56:45,103] Trial 12 finished with value: 561.2049360770155 and parameters: {'reset_noise_scale': 0.09553445824972584, 'forward_reward_weight': 0.9045839463124052, 'ctrl_cost_weight': 0.10290542101220547, 'learning_rate': 0.0002477267580871093, 'n_steps': 6144, 'batch_size': 1024, 'gamma': 0.9806496456981049, 'gae_lambda': 0.9149591718152772, 'clip_range': 0.34980194029756373, 'ent_coef': 0.0029014322953506907}. Best is trial 2 with value: 1055.501501187583.


Mean reward: 561.2049360770155


[I 2025-02-20 19:57:34,721] Trial 13 finished with value: -250.08432331063665 and parameters: {'reset_noise_scale': 0.07029743248262223, 'forward_reward_weight': 1.2044874674077426, 'ctrl_cost_weight': 0.2988541576608375, 'learning_rate': 0.0003370601844638699, 'n_steps': 11264, 'batch_size': 256, 'gamma': 0.9894656395084986, 'gae_lambda': 0.8750887909053316, 'clip_range': 0.27098088744122056, 'ent_coef': 0.01679091704470081}. Best is trial 2 with value: 1055.501501187583.


Mean reward: -250.08432331063665


[I 2025-02-20 19:58:37,937] Trial 14 finished with value: 1362.1291426940213 and parameters: {'reset_noise_scale': 0.1460054641656465, 'forward_reward_weight': 0.9402924843470438, 'ctrl_cost_weight': 0.02369658752973898, 'learning_rate': 8.6142752755535e-05, 'n_steps': 3072, 'batch_size': 64, 'gamma': 0.9704347214403404, 'gae_lambda': 0.8767325175374124, 'clip_range': 0.31699343883910364, 'ent_coef': 0.03171122881468322}. Best is trial 14 with value: 1362.1291426940213.


Mean reward: 1362.1291426940213


[I 2025-02-20 19:59:41,606] Trial 15 finished with value: 1065.7129110775415 and parameters: {'reset_noise_scale': 0.22805024932769752, 'forward_reward_weight': 0.9543432395910676, 'ctrl_cost_weight': 0.02848759844392174, 'learning_rate': 6.8666488819895e-05, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.9706453980296426, 'gae_lambda': 0.9216287128514012, 'clip_range': 0.3151657206495686, 'ent_coef': 0.03361680311219174}. Best is trial 14 with value: 1362.1291426940213.


Mean reward: 1065.7129110775415


[I 2025-02-20 20:00:45,545] Trial 16 finished with value: -15.87690437414793 and parameters: {'reset_noise_scale': 0.23316211536001646, 'forward_reward_weight': 0.9843904696851754, 'ctrl_cost_weight': 0.07419952162776107, 'learning_rate': 1.4064262193891325e-05, 'n_steps': 3072, 'batch_size': 64, 'gamma': 0.9716564343318274, 'gae_lambda': 0.929843163067512, 'clip_range': 0.325221973801583, 'ent_coef': 0.032506966381385685}. Best is trial 14 with value: 1362.1291426940213.


Mean reward: -15.87690437414793


[I 2025-02-20 20:01:49,408] Trial 17 finished with value: -1.8299670375022332 and parameters: {'reset_noise_scale': 0.24896189752035375, 'forward_reward_weight': 0.9776958845611926, 'ctrl_cost_weight': 0.144769089276267, 'learning_rate': 5.4608229907210894e-05, 'n_steps': 10240, 'batch_size': 64, 'gamma': 0.9701102508974855, 'gae_lambda': 0.9931321027378445, 'clip_range': 0.1926754625321839, 'ent_coef': 0.0319003087277262}. Best is trial 14 with value: 1362.1291426940213.


Mean reward: -1.8299670375022332


[I 2025-02-20 20:02:53,377] Trial 18 finished with value: -20.026647242409073 and parameters: {'reset_noise_scale': 0.14726898663830307, 'forward_reward_weight': 0.9853391759428498, 'ctrl_cost_weight': 0.024667550834997755, 'learning_rate': 1.899920837798115e-05, 'n_steps': 3072, 'batch_size': 64, 'gamma': 0.9755454550482568, 'gae_lambda': 0.9471249511636605, 'clip_range': 0.31437402211920434, 'ent_coef': 0.048392244873507746}. Best is trial 14 with value: 1362.1291426940213.


Mean reward: -20.026647242409073


[I 2025-02-20 20:03:58,092] Trial 19 finished with value: -136.56465880335278 and parameters: {'reset_noise_scale': 0.18187231908070728, 'forward_reward_weight': 0.7536971614295767, 'ctrl_cost_weight': 0.07206625012699314, 'learning_rate': 7.198082134112533e-05, 'n_steps': 8192, 'batch_size': 64, 'gamma': 0.9795964647659697, 'gae_lambda': 0.9711535449393667, 'clip_range': 0.34746085394196197, 'ent_coef': 0.02908915903935411}. Best is trial 14 with value: 1362.1291426940213.


Mean reward: -136.56465880335278


[I 2025-02-20 20:05:22,834] Trial 20 finished with value: -218.50592255895245 and parameters: {'reset_noise_scale': 0.2516906846108743, 'forward_reward_weight': 1.1779057601004876, 'ctrl_cost_weight': 0.13439432108819582, 'learning_rate': 2.003634828127847e-05, 'n_steps': 2048, 'batch_size': 32, 'gamma': 0.9730341887852347, 'gae_lambda': 0.9144908423051548, 'clip_range': 0.10041237710354552, 'ent_coef': 0.0350561084975817}. Best is trial 14 with value: 1362.1291426940213.


Mean reward: -218.50592255895245


[I 2025-02-20 20:06:16,344] Trial 21 finished with value: 987.1785992602481 and parameters: {'reset_noise_scale': 0.14729637513976976, 'forward_reward_weight': 0.8184226357483072, 'ctrl_cost_weight': 0.020283120278827463, 'learning_rate': 0.00010428850809082748, 'n_steps': 1024, 'batch_size': 128, 'gamma': 0.9700310059222983, 'gae_lambda': 0.8886441031719093, 'clip_range': 0.3006309626671575, 'ent_coef': 0.039403728001892374}. Best is trial 14 with value: 1362.1291426940213.


Mean reward: 987.1785992602481


[I 2025-02-20 20:07:19,298] Trial 22 finished with value: 996.9923754578791 and parameters: {'reset_noise_scale': 0.13819851149200701, 'forward_reward_weight': 0.908686975696088, 'ctrl_cost_weight': 0.05110267539007464, 'learning_rate': 0.00013868554801315823, 'n_steps': 4096, 'batch_size': 64, 'gamma': 0.9873247946691145, 'gae_lambda': 0.9133815976067825, 'clip_range': 0.27925565358878485, 'ent_coef': 0.019226461810948742}. Best is trial 14 with value: 1362.1291426940213.


Mean reward: 996.9923754578791


[I 2025-02-20 20:08:23,362] Trial 23 finished with value: -166.9306294962809 and parameters: {'reset_noise_scale': 0.16932034339383586, 'forward_reward_weight': 0.7917847717751724, 'ctrl_cost_weight': 0.08365352469436813, 'learning_rate': 2.9924864576821147e-05, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.9816033993323834, 'gae_lambda': 0.8836918244633065, 'clip_range': 0.30774585082978484, 'ent_coef': 0.028001872049307007}. Best is trial 14 with value: 1362.1291426940213.


Mean reward: -166.9306294962809


[I 2025-02-20 20:09:16,676] Trial 24 finished with value: 571.6732688349299 and parameters: {'reset_noise_scale': 0.22268340671712492, 'forward_reward_weight': 1.0261980059345148, 'ctrl_cost_weight': 0.05026425860298106, 'learning_rate': 0.00010063419129691544, 'n_steps': 6144, 'batch_size': 128, 'gamma': 0.9770389208807391, 'gae_lambda': 0.9023886263687297, 'clip_range': 0.24437036656134745, 'ent_coef': 0.03946891331529377}. Best is trial 14 with value: 1362.1291426940213.


Mean reward: 571.6732688349299


[I 2025-02-20 20:10:20,191] Trial 25 finished with value: 1436.8397330747864 and parameters: {'reset_noise_scale': 0.09587136875729971, 'forward_reward_weight': 0.893454861016723, 'ctrl_cost_weight': 0.021173527377576468, 'learning_rate': 0.00045968854671729387, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.9738558762431299, 'gae_lambda': 0.8666118025658481, 'clip_range': 0.326231237155658, 'ent_coef': 0.0272680045836958}. Best is trial 25 with value: 1436.8397330747864.


Mean reward: 1436.8397330747864


[I 2025-02-20 20:11:23,598] Trial 26 finished with value: 1082.0951914304728 and parameters: {'reset_noise_scale': 0.082787124457097, 'forward_reward_weight': 0.9166081640160147, 'ctrl_cost_weight': 0.061478471822835855, 'learning_rate': 0.0005536807033884938, 'n_steps': 4096, 'batch_size': 64, 'gamma': 0.9730911840758466, 'gae_lambda': 0.8622080375079209, 'clip_range': 0.3294981754724912, 'ent_coef': 0.028001262526652793}. Best is trial 25 with value: 1436.8397330747864.


Mean reward: 1082.0951914304728


[I 2025-02-20 20:12:26,895] Trial 27 finished with value: 1285.4633245831847 and parameters: {'reset_noise_scale': 0.10054595531728189, 'forward_reward_weight': 1.0592264563232021, 'ctrl_cost_weight': 0.08382038741630485, 'learning_rate': 0.000560477959791407, 'n_steps': 5120, 'batch_size': 64, 'gamma': 0.9735768174982472, 'gae_lambda': 0.8623728822123078, 'clip_range': 0.3317554820322231, 'ent_coef': 0.027567035475825882}. Best is trial 25 with value: 1436.8397330747864.


Mean reward: 1285.4633245831847


[I 2025-02-20 20:13:29,692] Trial 28 finished with value: 1407.0844886684154 and parameters: {'reset_noise_scale': 0.1090065713447085, 'forward_reward_weight': 1.0606398548699292, 'ctrl_cost_weight': 0.0872634323400774, 'learning_rate': 0.000496990051762455, 'n_steps': 6144, 'batch_size': 64, 'gamma': 0.9779846305748077, 'gae_lambda': 0.8503440232225996, 'clip_range': 0.34012427606090295, 'ent_coef': 0.01973257995677454}. Best is trial 25 with value: 1436.8397330747864.


Mean reward: 1407.0844886684154


[I 2025-02-20 20:14:34,628] Trial 29 finished with value: 118.65306883682089 and parameters: {'reset_noise_scale': 0.036700634260444134, 'forward_reward_weight': 1.1806878017315139, 'ctrl_cost_weight': 0.13231530238022393, 'learning_rate': 0.00037927205673542956, 'n_steps': 9216, 'batch_size': 64, 'gamma': 0.9783356973268461, 'gae_lambda': 0.8519783304935199, 'clip_range': 0.2696074450759247, 'ent_coef': 0.019860599155732534}. Best is trial 25 with value: 1436.8397330747864.


Mean reward: 118.65306883682089


[I 2025-02-20 20:15:17,906] Trial 30 finished with value: 503.1038501457904 and parameters: {'reset_noise_scale': 0.00517494098649432, 'forward_reward_weight': 1.1345807675644253, 'ctrl_cost_weight': 0.1635101167360087, 'learning_rate': 0.00020651379361183214, 'n_steps': 7168, 'batch_size': 1024, 'gamma': 0.9759047589727545, 'gae_lambda': 0.8706650679368741, 'clip_range': 0.34724399985172366, 'ent_coef': 0.019791845303236612}. Best is trial 25 with value: 1436.8397330747864.


Mean reward: 503.1038501457904


[I 2025-02-20 20:16:21,100] Trial 31 finished with value: 1564.304624117277 and parameters: {'reset_noise_scale': 0.10772514918456715, 'forward_reward_weight': 1.0433192188000509, 'ctrl_cost_weight': 0.09019222815409828, 'learning_rate': 0.0005755356309954765, 'n_steps': 5120, 'batch_size': 64, 'gamma': 0.9734089678146142, 'gae_lambda': 0.8646156419588821, 'clip_range': 0.3304040477107332, 'ent_coef': 0.030672533038846744}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 1564.304624117277


[I 2025-02-20 20:17:24,376] Trial 32 finished with value: 1190.9021593853113 and parameters: {'reset_noise_scale': 0.11600866971998351, 'forward_reward_weight': 1.0411693039301662, 'ctrl_cost_weight': 0.08992323908341325, 'learning_rate': 0.0005270870253744592, 'n_steps': 6144, 'batch_size': 64, 'gamma': 0.9720138022384628, 'gae_lambda': 0.8778866392068254, 'clip_range': 0.2899782553140083, 'ent_coef': 0.024187675761644396}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 1190.9021593853113


[I 2025-02-20 20:18:27,141] Trial 33 finished with value: 1243.6336803219092 and parameters: {'reset_noise_scale': 0.12909396638585885, 'forward_reward_weight': 0.8707101336163356, 'ctrl_cost_weight': 0.04361865991920517, 'learning_rate': 0.0003254800273486124, 'n_steps': 4096, 'batch_size': 64, 'gamma': 0.9752375135955966, 'gae_lambda': 0.8668203464588056, 'clip_range': 0.32079885642226497, 'ent_coef': 0.007188681571534506}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 1243.6336803219092


[I 2025-02-20 20:19:52,424] Trial 34 finished with value: -120.30505847519544 and parameters: {'reset_noise_scale': 0.0911085818484144, 'forward_reward_weight': 1.0190481271023897, 'ctrl_cost_weight': 0.09649014299174365, 'learning_rate': 0.0009750201297855644, 'n_steps': 5120, 'batch_size': 32, 'gamma': 0.9821657334187437, 'gae_lambda': 0.8768259943191611, 'clip_range': 0.33549769224059994, 'ent_coef': 0.030026333069427338}. Best is trial 31 with value: 1564.304624117277.


Mean reward: -120.30505847519544


[I 2025-02-20 20:20:40,222] Trial 35 finished with value: 1395.7350875962677 and parameters: {'reset_noise_scale': 0.041972246477526697, 'forward_reward_weight': 1.2399618594465838, 'ctrl_cost_weight': 0.11971948371633077, 'learning_rate': 0.00046710285295959087, 'n_steps': 7168, 'batch_size': 256, 'gamma': 0.9789841161574554, 'gae_lambda': 0.8548382893474203, 'clip_range': 0.29592303342411164, 'ent_coef': 0.03650931133807246}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 1395.7350875962677


[I 2025-02-20 20:21:29,263] Trial 36 finished with value: 984.4629191591937 and parameters: {'reset_noise_scale': 0.028299583209234487, 'forward_reward_weight': 1.2644132739511593, 'ctrl_cost_weight': 0.12113812707451925, 'learning_rate': 0.0006158373011138258, 'n_steps': 13312, 'batch_size': 256, 'gamma': 0.9790218055593706, 'gae_lambda': 0.8560266870496077, 'clip_range': 0.2921496565860643, 'ent_coef': 0.041983421913434266}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 984.4629191591937


[I 2025-02-20 20:22:17,695] Trial 37 finished with value: 1195.0686017803566 and parameters: {'reset_noise_scale': 0.05188908134858501, 'forward_reward_weight': 1.2301964622489885, 'ctrl_cost_weight': 0.1533414527261126, 'learning_rate': 0.0003924284372774255, 'n_steps': 9216, 'batch_size': 256, 'gamma': 0.9771461544247322, 'gae_lambda': 0.8660874474097415, 'clip_range': 0.33754557841714855, 'ent_coef': 0.036754720032931225}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 1195.0686017803566


[I 2025-02-20 20:23:03,021] Trial 38 finished with value: -290.7302879523557 and parameters: {'reset_noise_scale': 0.07572146217818412, 'forward_reward_weight': 1.3090567071999506, 'ctrl_cost_weight': 0.1767383173538842, 'learning_rate': 0.00018583465099255236, 'n_steps': 7168, 'batch_size': 512, 'gamma': 0.9835233805901002, 'gae_lambda': 0.8600533890618733, 'clip_range': 0.24143570110292562, 'ent_coef': 0.015827142064958444}. Best is trial 31 with value: 1564.304624117277.


Mean reward: -290.7302879523557


[I 2025-02-20 20:23:51,088] Trial 39 finished with value: 1014.4532180888657 and parameters: {'reset_noise_scale': 0.056521779157712074, 'forward_reward_weight': 1.4025073450240158, 'ctrl_cost_weight': 0.12540043713194352, 'learning_rate': 0.0008123176417926512, 'n_steps': 5120, 'batch_size': 256, 'gamma': 0.9803667055924509, 'gae_lambda': 0.8848685231211609, 'clip_range': 0.30238198941442956, 'ent_coef': 0.04375705825805659}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 1014.4532180888657


[I 2025-02-20 20:24:37,612] Trial 40 finished with value: -93.94392242427315 and parameters: {'reset_noise_scale': 0.02621668791376125, 'forward_reward_weight': 1.127041306494236, 'ctrl_cost_weight': 0.06488047265381972, 'learning_rate': 0.0002982002577820682, 'n_steps': 8192, 'batch_size': 512, 'gamma': 0.9741699558383722, 'gae_lambda': 0.8507499650315611, 'clip_range': 0.2736737876464154, 'ent_coef': 0.026224285796356682}. Best is trial 31 with value: 1564.304624117277.


Mean reward: -93.94392242427315


[I 2025-02-20 20:25:40,565] Trial 41 finished with value: 1130.9613174437118 and parameters: {'reset_noise_scale': 0.15953195481150043, 'forward_reward_weight': 0.9494320924579387, 'ctrl_cost_weight': 0.10914588399097311, 'learning_rate': 0.0004132648391854671, 'n_steps': 3072, 'batch_size': 64, 'gamma': 0.97248750903221, 'gae_lambda': 0.8710395085571693, 'clip_range': 0.31660805831002287, 'ent_coef': 0.03119200095014898}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 1130.9613174437118


[I 2025-02-20 20:26:28,669] Trial 42 finished with value: 448.24582608909566 and parameters: {'reset_noise_scale': 0.12991220047727303, 'forward_reward_weight': 0.8486344068919408, 'ctrl_cost_weight': 0.03855610612276811, 'learning_rate': 0.00015434189092633063, 'n_steps': 6144, 'batch_size': 256, 'gamma': 0.9747804877395994, 'gae_lambda': 0.8806894902680575, 'clip_range': 0.32429452830633543, 'ent_coef': 0.022760281492096968}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 448.24582608909566


[I 2025-02-20 20:27:32,299] Trial 43 finished with value: -1.1288041088139311 and parameters: {'reset_noise_scale': 0.10906420687969076, 'forward_reward_weight': 1.0636824884949947, 'ctrl_cost_weight': 0.06877479588153596, 'learning_rate': 2.392958989635281e-06, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.9761571996311132, 'gae_lambda': 0.8575543912001092, 'clip_range': 0.33673644527442403, 'ent_coef': 0.035781894511082095}. Best is trial 31 with value: 1564.304624117277.


Mean reward: -1.1288041088139311


[I 2025-02-20 20:28:35,128] Trial 44 finished with value: 1266.4991064357444 and parameters: {'reset_noise_scale': 0.05969204503713129, 'forward_reward_weight': 1.1331361583669937, 'ctrl_cost_weight': 0.09565332543668453, 'learning_rate': 0.0007707780688751296, 'n_steps': 4096, 'batch_size': 64, 'gamma': 0.971819340101264, 'gae_lambda': 0.8980420911104566, 'clip_range': 0.28292984297691615, 'ent_coef': 0.03781387971033749}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 1266.4991064357444


[I 2025-02-20 20:29:20,013] Trial 45 finished with value: -301.11374786773956 and parameters: {'reset_noise_scale': 0.08839695858966629, 'forward_reward_weight': 1.4986515713957778, 'ctrl_cost_weight': 0.22104172768877445, 'learning_rate': 0.00023516287915304623, 'n_steps': 5120, 'batch_size': 1024, 'gamma': 0.9781754287694048, 'gae_lambda': 0.8672058891925751, 'clip_range': 0.25901984219025653, 'ent_coef': 0.02143281438899298}. Best is trial 31 with value: 1564.304624117277.


Mean reward: -301.11374786773956


[I 2025-02-20 20:30:07,028] Trial 46 finished with value: 764.8308123166416 and parameters: {'reset_noise_scale': 0.104469240213629, 'forward_reward_weight': 0.8836455439173594, 'ctrl_cost_weight': 0.11233196530492826, 'learning_rate': 0.0004777963053788174, 'n_steps': 7168, 'batch_size': 256, 'gamma': 0.9741419857402989, 'gae_lambda': 0.8727105159085873, 'clip_range': 0.30822799341568985, 'ent_coef': 0.03367030042855722}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 764.8308123166416


[I 2025-02-20 20:31:30,462] Trial 47 finished with value: 818.0736980522281 and parameters: {'reset_noise_scale': 0.12535471841833248, 'forward_reward_weight': 1.0142026581385197, 'ctrl_cost_weight': 0.03902721236935791, 'learning_rate': 0.0002624046833017821, 'n_steps': 3072, 'batch_size': 32, 'gamma': 0.9711968969895218, 'gae_lambda': 0.8557765354768818, 'clip_range': 0.29286023021098034, 'ent_coef': 0.02544358480580036}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 818.0736980522281


[I 2025-02-20 20:32:34,456] Trial 48 finished with value: 305.982624282111 and parameters: {'reset_noise_scale': 0.1573845182692727, 'forward_reward_weight': 0.9367278528466699, 'ctrl_cost_weight': 0.08175544390277023, 'learning_rate': 3.428140341261392e-05, 'n_steps': 1024, 'batch_size': 64, 'gamma': 0.9868332884911306, 'gae_lambda': 0.8922615799911637, 'clip_range': 0.3399233197067534, 'ent_coef': 0.03082737382471786}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 305.982624282111


[I 2025-02-20 20:33:20,651] Trial 49 finished with value: -162.5541834751662 and parameters: {'reset_noise_scale': 0.0737232182267404, 'forward_reward_weight': 1.0838451009873182, 'ctrl_cost_weight': 0.055561234524725896, 'learning_rate': 0.0007007545858603956, 'n_steps': 6144, 'batch_size': 512, 'gamma': 0.9768150372080211, 'gae_lambda': 0.9720427801224701, 'clip_range': 0.32185222820014, 'ent_coef': 0.017596148210906794}. Best is trial 31 with value: 1564.304624117277.


Mean reward: -162.5541834751662


[I 2025-02-20 20:34:23,401] Trial 50 finished with value: 1455.408611568639 and parameters: {'reset_noise_scale': 0.04368276017901248, 'forward_reward_weight': 1.3236808072513802, 'ctrl_cost_weight': 0.028978459361616894, 'learning_rate': 0.0001625504506257751, 'n_steps': 4096, 'batch_size': 64, 'gamma': 0.9820622551525833, 'gae_lambda': 0.8632960734165328, 'clip_range': 0.22366944474777056, 'ent_coef': 0.00922764762992235}. Best is trial 31 with value: 1564.304624117277.


Mean reward: 1455.408611568639


[I 2025-02-20 20:35:25,893] Trial 51 finished with value: 1573.801011614274 and parameters: {'reset_noise_scale': 0.019646156360152628, 'forward_reward_weight': 1.3233580549495705, 'ctrl_cost_weight': 0.030321632915604628, 'learning_rate': 8.19546201257429e-05, 'n_steps': 4096, 'batch_size': 64, 'gamma': 0.9831309036224238, 'gae_lambda': 0.8638547627333991, 'clip_range': 0.1988560910337444, 'ent_coef': 0.0004371916742215761}. Best is trial 51 with value: 1573.801011614274.


Mean reward: 1573.801011614274


[I 2025-02-20 20:36:29,438] Trial 52 finished with value: -63.88125165720034 and parameters: {'reset_noise_scale': 0.022370748151542566, 'forward_reward_weight': 1.3221276154751256, 'ctrl_cost_weight': 0.03259901845275205, 'learning_rate': 0.00013729399233940702, 'n_steps': 5120, 'batch_size': 64, 'gamma': 0.9825570166177456, 'gae_lambda': 0.8642387289811776, 'clip_range': 0.1975121488044288, 'ent_coef': 0.006143274534001927}. Best is trial 51 with value: 1573.801011614274.


Mean reward: -63.88125165720034


[I 2025-02-20 20:37:32,567] Trial 53 finished with value: 668.7974012671865 and parameters: {'reset_noise_scale': 0.04654724170893071, 'forward_reward_weight': 1.421441956675186, 'ctrl_cost_weight': 0.04740842225860613, 'learning_rate': 0.0002872367771799036, 'n_steps': 4096, 'batch_size': 64, 'gamma': 0.9811077236341573, 'gae_lambda': 0.851605351731433, 'clip_range': 0.21623905496000723, 'ent_coef': 0.0011867645816340104}. Best is trial 51 with value: 1573.801011614274.


Mean reward: 668.7974012671865


[I 2025-02-20 20:38:35,393] Trial 54 finished with value: 891.1815182394434 and parameters: {'reset_noise_scale': 0.014982400050212792, 'forward_reward_weight': 1.2745487630462098, 'ctrl_cost_weight': 0.289876914938514, 'learning_rate': 0.0001730374049328682, 'n_steps': 4096, 'batch_size': 64, 'gamma': 0.9836946380050785, 'gae_lambda': 0.8579551961592519, 'clip_range': 0.1813650385953968, 'ent_coef': 0.012201198482363706}. Best is trial 51 with value: 1573.801011614274.


Mean reward: 891.1815182394434


[I 2025-02-20 20:39:39,676] Trial 55 finished with value: -29.9161644495921 and parameters: {'reset_noise_scale': 0.03990320867213854, 'forward_reward_weight': 1.3452351981021426, 'ctrl_cost_weight': 0.021767967778896617, 'learning_rate': 5.0002962322792464e-05, 'n_steps': 6144, 'batch_size': 64, 'gamma': 0.9862662040499911, 'gae_lambda': 0.8694746434713537, 'clip_range': 0.18036577971698695, 'ent_coef': 0.00722719896299488}. Best is trial 51 with value: 1573.801011614274.


Mean reward: -29.9161644495921


[I 2025-02-20 20:40:27,327] Trial 56 finished with value: 2129.905469860487 and parameters: {'reset_noise_scale': 0.06753114374868827, 'forward_reward_weight': 1.388703904589273, 'ctrl_cost_weight': 0.058760739255121694, 'learning_rate': 0.00046554317976780704, 'n_steps': 2048, 'batch_size': 256, 'gamma': 0.97985576941778, 'gae_lambda': 0.886925035817648, 'clip_range': 0.21653700187441735, 'ent_coef': 0.004431624049753458}. Best is trial 56 with value: 2129.905469860487.


Mean reward: 2129.905469860487


[I 2025-02-20 20:41:11,348] Trial 57 finished with value: 2063.056392113853 and parameters: {'reset_noise_scale': 0.06636050785295972, 'forward_reward_weight': 1.3903609482879289, 'ctrl_cost_weight': 0.032632029291211284, 'learning_rate': 0.0007203541006690706, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9802411679496303, 'gae_lambda': 0.8867558151493844, 'clip_range': 0.2267172332469943, 'ent_coef': 0.005133259393394398}. Best is trial 56 with value: 2129.905469860487.


Mean reward: 2063.056392113853


[I 2025-02-20 20:41:55,387] Trial 58 finished with value: 2081.5779589767017 and parameters: {'reset_noise_scale': 0.06781412054148939, 'forward_reward_weight': 1.3876568315559727, 'ctrl_cost_weight': 0.031875953914183436, 'learning_rate': 0.0009488122301877996, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9853237187401938, 'gae_lambda': 0.9066594542736889, 'clip_range': 0.22638662415591423, 'ent_coef': 0.0001486798949073883}. Best is trial 56 with value: 2129.905469860487.


Mean reward: 2081.5779589767017


[I 2025-02-20 20:42:40,897] Trial 59 finished with value: 1875.5672415508493 and parameters: {'reset_noise_scale': 0.0642913847477026, 'forward_reward_weight': 1.4589178302345152, 'ctrl_cost_weight': 0.032889825678397946, 'learning_rate': 0.0009179950917274472, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9853353212388789, 'gae_lambda': 0.9098739088382447, 'clip_range': 0.23018403338054172, 'ent_coef': 0.004701209679469834}. Best is trial 56 with value: 2129.905469860487.


Mean reward: 1875.5672415508493


[I 2025-02-20 20:43:26,039] Trial 60 finished with value: 4698.895084980981 and parameters: {'reset_noise_scale': 0.06369684242560415, 'forward_reward_weight': 1.44617835103446, 'ctrl_cost_weight': 0.0544226645937837, 'learning_rate': 0.0009699365825096747, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9857859792010212, 'gae_lambda': 0.9107811154406141, 'clip_range': 0.2091038335716418, 'ent_coef': 0.0004353350261388343}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 4698.895084980981


[I 2025-02-20 20:44:10,879] Trial 61 finished with value: 2147.8059832703793 and parameters: {'reset_noise_scale': 0.06435773388786759, 'forward_reward_weight': 1.4569356497207444, 'ctrl_cost_weight': 0.05690514462558757, 'learning_rate': 0.0009841044571654318, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9886722688813842, 'gae_lambda': 0.9089088770602772, 'clip_range': 0.21286732781539058, 'ent_coef': 0.003922933969938333}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 2147.8059832703793


[I 2025-02-20 20:44:56,110] Trial 62 finished with value: 3025.7148932491154 and parameters: {'reset_noise_scale': 0.05963044089005639, 'forward_reward_weight': 1.4649978848760425, 'ctrl_cost_weight': 0.05439382028989462, 'learning_rate': 0.000851505220313217, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9886188947758634, 'gae_lambda': 0.9073988419197015, 'clip_range': 0.2153226751175662, 'ent_coef': 0.0004764318548964267}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 3025.7148932491154


[I 2025-02-20 20:45:41,372] Trial 63 finished with value: 3028.502842808446 and parameters: {'reset_noise_scale': 0.06583920522049258, 'forward_reward_weight': 1.4598749474731745, 'ctrl_cost_weight': 0.056890758176870636, 'learning_rate': 0.0008361157206924195, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9922172276017714, 'gae_lambda': 0.9063568521159283, 'clip_range': 0.21298952316471678, 'ent_coef': 0.0038308150213302635}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 3028.502842808446


[I 2025-02-20 20:46:26,338] Trial 64 finished with value: 2025.5868157095997 and parameters: {'reset_noise_scale': 0.08046118334462277, 'forward_reward_weight': 1.3936999026276904, 'ctrl_cost_weight': 0.057203669132660194, 'learning_rate': 0.0009707265920224485, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.989739248878473, 'gae_lambda': 0.9224988211247689, 'clip_range': 0.2095218751898864, 'ent_coef': 0.0026003790414306054}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 2025.5868157095997


[I 2025-02-20 20:47:10,619] Trial 65 finished with value: 3912.7364855735404 and parameters: {'reset_noise_scale': 0.06566545259255407, 'forward_reward_weight': 1.457920791058716, 'ctrl_cost_weight': 0.07148328632931297, 'learning_rate': 0.0006972072908904952, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9925147344833138, 'gae_lambda': 0.9297027516538564, 'clip_range': 0.23226659382365317, 'ent_coef': 0.003995307275090006}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 3912.7364855735404


[I 2025-02-20 20:47:55,570] Trial 66 finished with value: 2019.4401309457078 and parameters: {'reset_noise_scale': 0.05187283966359925, 'forward_reward_weight': 1.455023052930855, 'ctrl_cost_weight': 0.0759586888332783, 'learning_rate': 0.000688287021818127, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9930673030792219, 'gae_lambda': 0.9305957900317681, 'clip_range': 0.18870624941825767, 'ent_coef': 0.0017032576244325934}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 2019.4401309457078


[I 2025-02-20 20:48:39,988] Trial 67 finished with value: 2299.3387675323775 and parameters: {'reset_noise_scale': 0.06790552067436076, 'forward_reward_weight': 1.4905318924931552, 'ctrl_cost_weight': 0.0614117071872053, 'learning_rate': 0.0008194402765650098, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9881504600530759, 'gae_lambda': 0.9070550835227954, 'clip_range': 0.23610760247874454, 'ent_coef': 0.009578747230828761}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 2299.3387675323775


[I 2025-02-20 20:49:25,350] Trial 68 finished with value: 3447.8681813438516 and parameters: {'reset_noise_scale': 0.08821731621481067, 'forward_reward_weight': 1.475262417048826, 'ctrl_cost_weight': 0.0644067101635838, 'learning_rate': 0.00077444455238894, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9957985574490197, 'gae_lambda': 0.9186471058147057, 'clip_range': 0.24171809572349656, 'ent_coef': 0.009244930852001206}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 3447.8681813438516


[I 2025-02-20 20:50:10,429] Trial 69 finished with value: 1577.8618648056372 and parameters: {'reset_noise_scale': 0.08044426334867578, 'forward_reward_weight': 1.499531821938327, 'ctrl_cost_weight': 0.07405171523855798, 'learning_rate': 0.000661765635548451, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9957729638767009, 'gae_lambda': 0.9188258466319864, 'clip_range': 0.23545747931029054, 'ent_coef': 0.00946982293036588}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 1577.8618648056372


[I 2025-02-20 20:50:55,428] Trial 70 finished with value: 1571.1635037809551 and parameters: {'reset_noise_scale': 0.034620077955095155, 'forward_reward_weight': 1.4266722343444833, 'ctrl_cost_weight': 0.045142317748826016, 'learning_rate': 0.0007954980158667891, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9884030099098237, 'gae_lambda': 0.9398579503881859, 'clip_range': 0.2509905750141534, 'ent_coef': 0.013968692246804854}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 1571.1635037809551


[I 2025-02-20 20:51:39,813] Trial 71 finished with value: 2343.685674058789 and parameters: {'reset_noise_scale': 0.06037394138999969, 'forward_reward_weight': 1.4756232921499768, 'ctrl_cost_weight': 0.062468283464451387, 'learning_rate': 0.000807141022809287, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9923313661418953, 'gae_lambda': 0.8993892044080143, 'clip_range': 0.21185247853771197, 'ent_coef': 0.00413912004965787}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 2343.685674058789


[I 2025-02-20 20:52:23,762] Trial 72 finished with value: 1711.7074796645268 and parameters: {'reset_noise_scale': 0.08693495230674371, 'forward_reward_weight': 1.4815109944530636, 'ctrl_cost_weight': 0.06597105601920192, 'learning_rate': 0.0006131580514042664, 'n_steps': 3072, 'batch_size': 1024, 'gamma': 0.9923376819175111, 'gae_lambda': 0.9010424948597611, 'clip_range': 0.2142228994568995, 'ent_coef': 0.0032534222238491148}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 1711.7074796645268


[I 2025-02-20 20:53:08,749] Trial 73 finished with value: 836.2900303233619 and parameters: {'reset_noise_scale': 0.056103970458779934, 'forward_reward_weight': 1.4574670261033524, 'ctrl_cost_weight': 0.05464422443581953, 'learning_rate': 0.0008402229133101798, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9989635007042997, 'gae_lambda': 0.927879370505821, 'clip_range': 0.20539165744979665, 'ent_coef': 0.00793035391596107}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 836.2900303233619


[I 2025-02-20 20:53:53,715] Trial 74 finished with value: 1901.1344010449864 and parameters: {'reset_noise_scale': 0.07519022407271994, 'forward_reward_weight': 1.4337905416066339, 'ctrl_cost_weight': 0.07726586169296477, 'learning_rate': 0.0009792604664906489, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9938106090980641, 'gae_lambda': 0.9074297537693362, 'clip_range': 0.16278396395272285, 'ent_coef': 0.01060048208294738}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 1901.1344010449864


[I 2025-02-20 20:54:40,777] Trial 75 finished with value: -63.579606486915836 and parameters: {'reset_noise_scale': 0.09855700099832859, 'forward_reward_weight': 1.4769248217791415, 'ctrl_cost_weight': 0.06605698239526447, 'learning_rate': 0.0005716751560291485, 'n_steps': 16384, 'batch_size': 1024, 'gamma': 0.9910874527212494, 'gae_lambda': 0.9177277418743788, 'clip_range': 0.2361868875189178, 'ent_coef': 0.0036707426626086627}. Best is trial 60 with value: 4698.895084980981.


Mean reward: -63.579606486915836


[I 2025-02-20 20:55:25,221] Trial 76 finished with value: 2254.5003930600706 and parameters: {'reset_noise_scale': 0.05978383122992442, 'forward_reward_weight': 1.4425366836874336, 'ctrl_cost_weight': 0.04902676950660939, 'learning_rate': 0.0007704637466240495, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.988192709448547, 'gae_lambda': 0.9111093976718151, 'clip_range': 0.24882246008705028, 'ent_coef': 0.0022388743010609558}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 2254.5003930600706


[I 2025-02-20 20:56:09,317] Trial 77 finished with value: 1748.646858619325 and parameters: {'reset_noise_scale': 0.051066584150087205, 'forward_reward_weight': 1.4353771795289367, 'ctrl_cost_weight': 0.09965460323415892, 'learning_rate': 0.0003689077182486042, 'n_steps': 3072, 'batch_size': 1024, 'gamma': 0.9953749228892889, 'gae_lambda': 0.9123950964355206, 'clip_range': 0.2557238390849301, 'ent_coef': 0.0020159861875084424}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 1748.646858619325


[I 2025-02-20 20:56:53,661] Trial 78 finished with value: 2050.424601466923 and parameters: {'reset_noise_scale': 0.03206847894769806, 'forward_reward_weight': 1.3547691705034945, 'ctrl_cost_weight': 0.049385504122816976, 'learning_rate': 0.0007320699815241774, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9902820523066435, 'gae_lambda': 0.8958164279513477, 'clip_range': 0.24497844419595205, 'ent_coef': 0.005669339454626894}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 2050.424601466923


[I 2025-02-20 20:57:38,257] Trial 79 finished with value: 1769.7748762592787 and parameters: {'reset_noise_scale': 0.09144323207788921, 'forward_reward_weight': 1.4826370998664455, 'ctrl_cost_weight': 0.04188421551883281, 'learning_rate': 0.000427622725442753, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9912535207338709, 'gae_lambda': 0.9334037697012508, 'clip_range': 0.26523586006191735, 'ent_coef': 0.006190240072779124}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 1769.7748762592787


[I 2025-02-20 20:58:22,647] Trial 80 finished with value: -0.5175239886173277 and parameters: {'reset_noise_scale': 0.06028693059428049, 'forward_reward_weight': 1.4124531344402544, 'ctrl_cost_weight': 0.07193783891264609, 'learning_rate': 7.017355404112543e-06, 'n_steps': 3072, 'batch_size': 1024, 'gamma': 0.9882122004133347, 'gae_lambda': 0.9021998544997135, 'clip_range': 0.2491985660195898, 'ent_coef': 3.714975193496096e-05}. Best is trial 60 with value: 4698.895084980981.


Mean reward: -0.5175239886173277


[I 2025-02-20 20:59:07,997] Trial 81 finished with value: 1131.1055665926467 and parameters: {'reset_noise_scale': 0.07476206338290214, 'forward_reward_weight': 1.4611666005220951, 'ctrl_cost_weight': 0.060440182688612606, 'learning_rate': 0.0008326582715579842, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9887067577158689, 'gae_lambda': 0.9250904491058903, 'clip_range': 0.23551943051205285, 'ent_coef': 0.00857778954285868}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 1131.1055665926467


[I 2025-02-20 20:59:52,959] Trial 82 finished with value: 2135.205090388677 and parameters: {'reset_noise_scale': 0.04731369527923411, 'forward_reward_weight': 1.446837274820651, 'ctrl_cost_weight': 0.050800915893308395, 'learning_rate': 0.0005996220783707598, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9921234301418913, 'gae_lambda': 0.9102902203552494, 'clip_range': 0.22033481048817558, 'ent_coef': 0.004093702199462579}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 2135.205090388677


[I 2025-02-20 21:00:38,162] Trial 83 finished with value: 419.0046606092326 and parameters: {'reset_noise_scale': 0.05981091446399217, 'forward_reward_weight': 1.372742524212453, 'ctrl_cost_weight': 0.08209284353278057, 'learning_rate': 0.0005212614065651051, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9894179540969895, 'gae_lambda': 0.9161618714476996, 'clip_range': 0.20916636588723858, 'ent_coef': 0.010735636741022125}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 419.0046606092326


[I 2025-02-20 21:01:31,118] Trial 84 finished with value: 2076.2160937294234 and parameters: {'reset_noise_scale': 0.08464040395183305, 'forward_reward_weight': 1.4738018741835022, 'ctrl_cost_weight': 0.06523562023138418, 'learning_rate': 0.0007819845630729654, 'n_steps': 3072, 'batch_size': 128, 'gamma': 0.9970903103972965, 'gae_lambda': 0.8986703148643629, 'clip_range': 0.2306622293336142, 'ent_coef': 0.002867992731056048}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 2076.2160937294234


[I 2025-02-20 21:02:16,468] Trial 85 finished with value: 1421.8502379732058 and parameters: {'reset_noise_scale': 0.07101577277660581, 'forward_reward_weight': 1.414145550760931, 'ctrl_cost_weight': 0.04337186085316456, 'learning_rate': 0.00034196354241328164, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.987518577366089, 'gae_lambda': 0.9042207079138972, 'clip_range': 0.19982230302360646, 'ent_coef': 0.0014585319136043765}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 1421.8502379732058


[I 2025-02-20 21:03:01,063] Trial 86 finished with value: 3238.3108893758663 and parameters: {'reset_noise_scale': 0.013459312664159742, 'forward_reward_weight': 1.4435374113892951, 'ctrl_cost_weight': 0.09129087622076545, 'learning_rate': 0.0006365820963392328, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9932509667338772, 'gae_lambda': 0.9196254842611007, 'clip_range': 0.19119739932498195, 'ent_coef': 0.007152371678457134}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 3238.3108893758663


[I 2025-02-20 21:03:46,184] Trial 87 finished with value: -54.04210857782771 and parameters: {'reset_noise_scale': 0.010178580858408753, 'forward_reward_weight': 1.4945487881961494, 'ctrl_cost_weight': 0.07935722702525015, 'learning_rate': 0.0006560042361865211, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9932132396828863, 'gae_lambda': 0.9398274025009891, 'clip_range': 0.18383127910759064, 'ent_coef': 0.013408180222219204}. Best is trial 60 with value: 4698.895084980981.


Mean reward: -54.04210857782771


[I 2025-02-20 21:04:31,907] Trial 88 finished with value: -4.476393685578223 and parameters: {'reset_noise_scale': 0.03176225158532031, 'forward_reward_weight': 1.4394732737442553, 'ctrl_cost_weight': 0.08940127182542727, 'learning_rate': 0.0005176371119065751, 'n_steps': 13312, 'batch_size': 1024, 'gamma': 0.994405306932008, 'gae_lambda': 0.9213661337800335, 'clip_range': 0.165962295496787, 'ent_coef': 0.00672234500842182}. Best is trial 60 with value: 4698.895084980981.


Mean reward: -4.476393685578223


[I 2025-02-20 21:05:16,479] Trial 89 finished with value: 2378.011432091436 and parameters: {'reset_noise_scale': 0.19519743351397098, 'forward_reward_weight': 1.413475778741262, 'ctrl_cost_weight': 0.07223269983628883, 'learning_rate': 0.0004080290964742112, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9956791402055636, 'gae_lambda': 0.9342147268117025, 'clip_range': 0.1748498941402424, 'ent_coef': 0.008250678796201361}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 2378.011432091436


[I 2025-02-20 21:06:39,685] Trial 90 finished with value: 965.0497451313516 and parameters: {'reset_noise_scale': 0.11847078345715836, 'forward_reward_weight': 1.3562297792647524, 'ctrl_cost_weight': 0.09376349830202653, 'learning_rate': 0.0003921485002209587, 'n_steps': 3072, 'batch_size': 32, 'gamma': 0.9979360165810195, 'gae_lambda': 0.9345135962733566, 'clip_range': 0.17302572755632314, 'ent_coef': 0.008186380677532985}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 965.0497451313516


[I 2025-02-20 21:07:24,234] Trial 91 finished with value: 1188.8376282586582 and parameters: {'reset_noise_scale': 0.2608023757942522, 'forward_reward_weight': 1.4072533391771456, 'ctrl_cost_weight': 0.07178467961151815, 'learning_rate': 0.0006858130794943026, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9960837239376026, 'gae_lambda': 0.9143813798381621, 'clip_range': 0.19357549981348576, 'ent_coef': 0.011126135089023931}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 1188.8376282586582


[I 2025-02-20 21:08:09,458] Trial 92 finished with value: 156.76961211518778 and parameters: {'reset_noise_scale': 0.17178471236117182, 'forward_reward_weight': 1.475899360923305, 'ctrl_cost_weight': 0.10410310827802394, 'learning_rate': 0.0008215810176345179, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9943574485529495, 'gae_lambda': 0.9442420363628499, 'clip_range': 0.1357941061964495, 'ent_coef': 0.00527498517018102}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 156.76961211518778


[I 2025-02-20 21:08:54,358] Trial 93 finished with value: -80.44612721368141 and parameters: {'reset_noise_scale': 0.197564293384659, 'forward_reward_weight': 1.4322434249306997, 'ctrl_cost_weight': 0.051199418351817225, 'learning_rate': 0.0005735532224381335, 'n_steps': 3072, 'batch_size': 1024, 'gamma': 0.9916654853358225, 'gae_lambda': 0.8948456584053667, 'clip_range': 0.15453954898826627, 'ent_coef': 0.0015201115612448377}. Best is trial 60 with value: 4698.895084980981.


Mean reward: -80.44612721368141


[I 2025-02-20 21:09:39,946] Trial 94 finished with value: 1097.7285326898302 and parameters: {'reset_noise_scale': 0.18908983242269284, 'forward_reward_weight': 1.4866215716092315, 'ctrl_cost_weight': 0.06295302829241503, 'learning_rate': 0.0004471234299165594, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9950670541976754, 'gae_lambda': 0.9206665702887112, 'clip_range': 0.20371322739868006, 'ent_coef': 0.0071807207878107696}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 1097.7285326898302


[I 2025-02-20 21:10:24,624] Trial 95 finished with value: 2033.177439252304 and parameters: {'reset_noise_scale': 0.21026511287751884, 'forward_reward_weight': 1.4192854276392348, 'ctrl_cost_weight': 0.08599253683556224, 'learning_rate': 0.0008499854731554041, 'n_steps': 2048, 'batch_size': 1024, 'gamma': 0.9928019335823813, 'gae_lambda': 0.9252425957913694, 'clip_range': 0.23905563609633362, 'ent_coef': 0.009533263077923787}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 2033.177439252304


[I 2025-02-20 21:11:18,181] Trial 96 finished with value: 2606.7381096226045 and parameters: {'reset_noise_scale': 0.03810472496469499, 'forward_reward_weight': 1.446963100172822, 'ctrl_cost_weight': 0.04122462769056366, 'learning_rate': 0.0006568508203868824, 'n_steps': 1024, 'batch_size': 128, 'gamma': 0.9965246063757676, 'gae_lambda': 0.950374640391001, 'clip_range': 0.1737277364387635, 'ent_coef': 0.003311023960020385}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 2606.7381096226045


[I 2025-02-20 21:12:11,911] Trial 97 finished with value: 2120.0764839226413 and parameters: {'reset_noise_scale': 0.01588313482443447, 'forward_reward_weight': 1.4686483923314504, 'ctrl_cost_weight': 0.039684130826961615, 'learning_rate': 0.00021800079943907282, 'n_steps': 1024, 'batch_size': 128, 'gamma': 0.9983782808685563, 'gae_lambda': 0.9529553629417565, 'clip_range': 0.18811719258169554, 'ent_coef': 0.01543391801125838}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 2120.0764839226413


[I 2025-02-20 21:13:05,461] Trial 98 finished with value: 3553.0941224342237 and parameters: {'reset_noise_scale': 0.04143153804195014, 'forward_reward_weight': 1.3741765081158461, 'ctrl_cost_weight': 0.06965838903486682, 'learning_rate': 0.0002869089598817114, 'n_steps': 1024, 'batch_size': 128, 'gamma': 0.9964383155408642, 'gae_lambda': 0.9443600164220389, 'clip_range': 0.17616056689611564, 'ent_coef': 0.005113659605647694}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 3553.0941224342237


[I 2025-02-20 21:13:59,065] Trial 99 finished with value: 1864.2691478518284 and parameters: {'reset_noise_scale': 0.023340401689453526, 'forward_reward_weight': 1.3703010923961716, 'ctrl_cost_weight': 0.06955308847223407, 'learning_rate': 0.00026606278983739364, 'n_steps': 1024, 'batch_size': 128, 'gamma': 0.996102404541061, 'gae_lambda': 0.9671458744694584, 'clip_range': 0.16987597676513275, 'ent_coef': 0.004711116174813325}. Best is trial 60 with value: 4698.895084980981.


Mean reward: 1864.2691478518284
Best hyperparameters:  {'reset_noise_scale': 0.06369684242560415, 'forward_reward_weight': 1.44617835103446, 'ctrl_cost_weight': 0.0544226645937837, 'learning_rate': 0.0009699365825096747, 'n_steps': 1024, 'batch_size': 1024, 'gamma': 0.9857859792010212, 'gae_lambda': 0.9107811154406141, 'clip_range': 0.2091038335716418, 'ent_coef': 0.0004353350261388343}
