In [5]:
import gymnasium as gym


#定义环境
class MyWrapper(gym.Wrapper):

    def __init__(self):
        env = gym.make('LunarLander-v3')
        super().__init__(env)
        self.env = env

    def reset(self, seed=None, options=None):
        state, info = self.env.reset()
        return state, info

    def step(self, action):
        state, reward, done, truncated, info = self.env.step(action)
        return state, reward, done, truncated, info


env = MyWrapper()

env.reset()

(array([ 0.00502644,  1.4166392 ,  0.50910664,  0.25417387, -0.00581757,
        -0.11532021,  0.        ,  0.        ], dtype=float32),
 {})

In [7]:
#认识游戏环境
def test_env():
    print('env.observation_space=', env.observation_space)
    print('env.action_space=', env.action_space)

    state = env.reset()
    action = env.action_space.sample()
    next_state, reward, done, _, _ = env.step(action)

    print('state=', state)
    print('action=', action)
    print('next_state=', next_state)
    print('reward=', reward)
    print('done=', done)


test_env()

env.observation_space= Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)
env.action_space= Discrete(4)
state= (array([-8.9550020e-05,  1.4155642e+00, -9.0858061e-03,  2.0640540e-01,
        1.1055406e-04,  2.0580797e-03,  0.0000000e+00,  0.0000000e+00],
      dtype=float32), {})
action= 0
next_state= [-1.7919540e-04  1.4196303e+00 -9.0732994e-03  1.8071297e-01
  2.1229628e-04  2.0357957e-03  0.0000000e+00  0.0000000e+00]
reward= 2.149683863760572
done= False


In [8]:
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3 import PPO

#初始化模型
model = PPO(
    policy='MlpPolicy',
    env=make_vec_env(MyWrapper, n_envs=4),  #创建N个环境用于训练
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=0,
    device='cpu')

model

<stable_baselines3.ppo.ppo.PPO at 0x2075b88ade0>

In [9]:
from stable_baselines3.common.evaluation import evaluate_policy

#测试
evaluate_policy(model, env, n_eval_episodes=10, deterministic=False)



(np.float64(-135.15116650802082), np.float64(139.95567875679728))

In [None]:
#训练
model.learn(total_timesteps=20_0000, progress_bar=True)
model.save('models/ppo-LunarLander-v3')

In [6]:
model = PPO.load('models/ppo-LunarLander-v3')

evaluate_policy(model, env, n_eval_episodes=10, deterministic=False)

(45.54798891161171, 139.1048836822021)

In [7]:
from huggingface_sb3 import load_from_hub

#!pip install huggingface-sb3

#加载其他训练好的模型
#https://huggingface.co/models?library=stable-baselines3
model = PPO.load(
    load_from_hub('araffin/ppo-LunarLander-v2', 'ppo-LunarLander-v2.zip'),
    custom_objects={
        'learning_rate': 0.0,
        'lr_schedule': lambda _: 0.0,
        'clip_range': lambda _: 0.0,
    },
    print_system_info=True,
)

evaluate_policy(model, env, n_eval_episodes=10, deterministic=False)

Downloading:   0%|          | 0.00/144k [00:00<?, ?B/s]

== CURRENT SYSTEM INFO ==
- OS: Linux-5.15.0-3.60.5.1.el9uek.x86_64-x86_64-with-glibc2.34 # 2 SMP Wed Oct 19 20:27:31 PDT 2022
- Python: 3.9.15
- Stable-Baselines3: 1.8.0a1
- PyTorch: 1.13.0+cpu
- GPU Enabled: False
- Numpy: 1.23.5
- Gym: 0.26.2

== SAVED MODEL SYSTEM INFO ==
OS: Linux-5.13.0-40-generic-x86_64-with-debian-bullseye-sid #45~20.04.1-Ubuntu SMP Mon Apr 4 09:38:31 UTC 2022
Python: 3.7.10
Stable-Baselines3: 1.5.1a5
PyTorch: 1.11.0
GPU Enabled: False
Numpy: 1.21.2
Gym: 0.21.0





(250.9974542026721, 86.61020518339575)