In [1]:
import gymnasium as gym


#定义环境
class MyWrapper(gym.Wrapper):

    def __init__(self):
        env = gym.make('CartPole-v1')
        super().__init__(env)
        self.env = env

    def reset(self, seed=None, options=None):
        state, info = self.env.reset()
        return state, info

    def step(self, action):
        state, reward, done, truncated, info = self.env.step(action)
        return state, reward, done, truncated, info


MyWrapper().reset()

(array([-4.5846730e-02, -3.9549603e-05,  4.1472476e-02, -3.7735011e-02],
       dtype=float32),
 {})

In [2]:
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor

#创建训练环境和测试环境
env_train = make_vec_env(MyWrapper, n_envs=4)
env_test = Monitor(MyWrapper())

env_train, env_test

(<stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv at 0x1f0c0657c20>,
 <Monitor<MyWrapper<TimeLimit<OrderEnforcing<PassiveEnvChecker<CartPoleEnv<CartPole-v1>>>>>>>)

In [3]:
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy


#测试超参数
def test_params(params):
    #定义一个模型
    model = PPO(
        policy='MlpPolicy',
        env=env_train,
        n_steps=1024,
        batch_size=64,
        #取超参数
        n_epochs=params['n_epochs'],
        #取超参数
        gamma=params['gamma'],
        gae_lambda=0.98,
        ent_coef=0.01,
        verbose=0,
        device='cpu'
    )

    #训练
    #取超参数
    model.learn(total_timesteps=params['total_timesteps'], progress_bar=True)

    #测试
    mean_reward, std_reward = evaluate_policy(model,
                                              env_test,
                                              n_eval_episodes=50,
                                              deterministic=True)

    #最终的分数就是简单的求差,这也是study要优化的数
    score = mean_reward - std_reward

    return score


test_params({'n_epochs': 2, 'gamma': 0.99, 'total_timesteps': 500})

np.float64(81.75799024501725)

In [4]:
import optuna
from optuna.samplers import TPESampler

#定义一个超参数学习器
study = optuna.create_study(sampler=TPESampler(),
                            study_name='PPO-LunarLander-v2',
                            direction='maximize')


#求最优超参数
def f(trial):
    #定义要找的超参数,并设置上下限
    params = {
        'n_epochs': trial.suggest_int('n_epochs', 3, 5),
        'gamma': trial.suggest_uniform('gamma', 0.99, 0.9999),
        'total_timesteps': trial.suggest_int('total_timesteps', 500, 2000),
    }

    #测试超参数
    return test_params(params)


study.optimize(f, n_trials=5)

#输出最佳分数和超参数
study.best_trial.values, study.best_trial.params

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-06-30 19:26:21,136] A new study created in memory with name: PPO-LunarLander-v2


  'gamma': trial.suggest_uniform('gamma', 0.99, 0.9999),


[I 2025-06-30 19:26:24,994] Trial 0 finished with value: 61.13555052642612 and parameters: {'n_epochs': 5, 'gamma': 0.9905258768003454, 'total_timesteps': 1928}. Best is trial 0 with value: 61.13555052642612.


[I 2025-06-30 19:26:27,927] Trial 1 finished with value: 73.53361545386778 and parameters: {'n_epochs': 3, 'gamma': 0.9950450746348383, 'total_timesteps': 1416}. Best is trial 1 with value: 73.53361545386778.


[I 2025-06-30 19:26:31,682] Trial 2 finished with value: 71.00435662356603 and parameters: {'n_epochs': 5, 'gamma': 0.9915865224900453, 'total_timesteps': 1933}. Best is trial 1 with value: 73.53361545386778.


[I 2025-06-30 19:26:36,450] Trial 3 finished with value: 84.40964150235163 and parameters: {'n_epochs': 5, 'gamma': 0.9980272079640348, 'total_timesteps': 1608}. Best is trial 3 with value: 84.40964150235163.


[I 2025-06-30 19:26:42,393] Trial 4 finished with value: 150.14159175610666 and parameters: {'n_epochs': 5, 'gamma': 0.9982933699484305, 'total_timesteps': 1822}. Best is trial 4 with value: 150.14159175610666.


([150.14159175610666],
 {'n_epochs': 5, 'gamma': 0.9982933699484305, 'total_timesteps': 1822})

In [5]:
#用最优超参数训练一个模型
test_params(study.best_trial.params)

np.float64(134.0411876134619)