In [1]:
#!pip3 install swig
#!pip3 install gymnasium[box2d]
#!pip install huggingface-sb3

In [2]:
import gymnasium as gym

#Environment definition
class MyWrapper(gym.Wrapper):
    """Gymnasium wrapper that builds and wraps a ``LunarLander-v2`` environment.

    ``reset`` and ``step`` delegate to the wrapped environment and hand its
    results back unchanged.
    """

    def __init__(self, render_mode="human"):
        """Create the LunarLander environment and wrap it.

        Args:
            render_mode: Render mode handed to ``gym.make``. The default,
                ``"human"``, is the value that used to be hard-coded, so a
                bare ``MyWrapper()`` behaves exactly as before.
        """
        env = gym.make('LunarLander-v2', render_mode=render_mode)
        super().__init__(env)
        self.env = env

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment.

        Args:
            seed: Optional seed, forwarded to the wrapped environment's reset.
            options: Optional reset options, forwarded the same way.

        Returns:
            The ``(state, info)`` pair produced by the wrapped environment.
        """
        # Forward both arguments: previously ``seed`` was accepted but dropped,
        # so a seeded reset silently behaved like an unseeded one.
        state, info = self.env.reset(seed=seed, options=options)
        return state, info

    def step(self, action):
        """Advance the wrapped environment by one step.

        Args:
            action: Action passed straight through to the wrapped environment.

        Returns:
            The 5-tuple ``(state, reward, done, truncated, info)`` exactly as
            returned by the wrapped environment.
        """
        state, reward, done, truncated, info = self.env.step(action)
        return state, reward, done, truncated, info

# Shared environment instance; later cells use it for the smoke test and for evaluation.
env = MyWrapper()

# Keep this call as the cell's last expression so its return value is displayed.
env.reset()

(array([-0.00568075,  1.4071829 , -0.5754184 , -0.16610847,  0.00658939,
         0.13034084,  0.        ,  0.        ], dtype=float32),
 {})

In [3]:
#Get to know the game environment
def test_env():
    """Print the environment's spaces and the outcome of one random step.

    Works on the module-level ``env``: resets it, samples one action from its
    action space, applies that action, and prints what was observed.
    """
    print('env.observation_space=', env.observation_space)
    print('env.action_space=', env.action_space)

    first_obs, _reset_info = env.reset()
    chosen_action = env.action_space.sample()
    next_obs, step_reward, terminated, _truncated, _step_info = env.step(chosen_action)

    # Labels are printed verbatim, in insertion order.
    report = {
        'state=': first_obs,
        'action=': chosen_action,
        'next_state=': next_obs,
        'reward=': step_reward,
        'done=': terminated,
    }
    for label, value in report.items():
        print(label, value)


test_env()

env.observation_space= Box([-90.        -90.         -5.         -5.         -3.1415927  -5.
  -0.         -0.       ], [90.        90.         5.         5.         3.1415927  5.
  1.         1.       ], (8,), float32)
env.action_space= Discrete(4)
state= [ 0.00238934  1.4079379   0.2419908  -0.13254927 -0.00276176 -0.0548145
  0.          0.        ]
action= 3
next_state= [ 0.00484896  1.4043831   0.25049347 -0.15800944 -0.00724102 -0.08959296
  0.          0.        ]
reward= -2.1481609531482477
done= False


In [4]:
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3 import PPO

#Initialise the model
# Builds a PPO agent with the 'MlpPolicy' policy on a vectorised set of MyWrapper
# environments; the keyword values below are the hyperparameters chosen for this notebook.
# NOTE(review): every training env is created by calling MyWrapper(), which asks for
# render_mode="human" — this presumably renders during training and slows it down; confirm.
model = PPO(
    policy='MlpPolicy',
    env=make_vec_env(MyWrapper, n_envs=4),  #Create N environments for training
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=0)

# Last expression of the cell: displays the model object's repr.
model

<stable_baselines3.ppo.ppo.PPO at 0x75612867f850>

In [5]:
from stable_baselines3.common.evaluation import evaluate_policy

#Test
# Baseline: this runs before model.learn(), so it scores the still-untrained policy
# over 10 episodes on the shared `env`.
# NOTE(review): the displayed pair is presumably (mean episode reward, std) — see the
# SB3 evaluate_policy documentation to confirm.
evaluate_policy(model, env, n_eval_episodes=10, deterministic=False)



(-188.5438869135105, 103.98350718401628)

In [6]:
#Train
# 200_000 is the same integer the notebook previously spelled as 20_0000.
model.learn(progress_bar=True, total_timesteps=200_000)
# Persist the trained agent; the next cell loads it back from this path.
model.save('models/ppo-LunarLander-v2')

Output()

In [7]:
# Reload the checkpoint that the training cell saved under this path.
model = PPO.load('models/ppo-LunarLander-v2')

# Same evaluation call as the pre-training baseline, so the two results are comparable.
evaluate_policy(model, env, n_eval_episodes=10, deterministic=False)

(61.79367736785989, 64.77595907446783)

In [8]:
from huggingface_sb3 import load_from_hub

#Load a model that was trained by someone else
#https://huggingface.co/models?library=stable-baselines3
model = PPO.load(
    # Fetch the named checkpoint file from the given Hugging Face repository.
    load_from_hub('araffin/ppo-LunarLander-v2', 'ppo-LunarLander-v2.zip'),
    # Objects supplied here are used in place of the ones stored in the checkpoint.
    # NOTE(review): presumably required because the checkpoint was saved with a much
    # older Python / Stable-Baselines3 stack than this kernel, so the stored schedule
    # objects may not deserialize — confirm against the SB3 load() documentation.
    custom_objects={
        'learning_rate': 0.0,
        'lr_schedule': lambda _: 0.0,
        'clip_range': lambda _: 0.0,
    },
    # Prints the current and the saved-model system info when loading.
    print_system_info=True,
)

# Score the downloaded agent with the same settings used for the locally trained one.
evaluate_policy(model, env, n_eval_episodes=10, deterministic=False)

== CURRENT SYSTEM INFO ==
- OS: Linux-6.5.0-27-generic-x86_64-with-glibc2.35 # 28~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Fri Mar 15 10:51:06 UTC 2
- Python: 3.10.14
- Stable-Baselines3: 2.3.0
- PyTorch: 2.2.2+cu121
- GPU Enabled: True
- Numpy: 1.26.4
- Cloudpickle: 3.0.0
- Gymnasium: 0.28.1
- OpenAI Gym: 0.26.2

== SAVED MODEL SYSTEM INFO ==
OS: Linux-5.13.0-40-generic-x86_64-with-debian-bullseye-sid #45~20.04.1-Ubuntu SMP Mon Apr 4 09:38:31 UTC 2022
Python: 3.7.10
Stable-Baselines3: 1.5.1a5
PyTorch: 1.11.0
GPU Enabled: False
Numpy: 1.21.2
Gym: 0.21.0





(249.5697353206968, 47.365977760362)