```bash
# Install dependencies (fills in anything that is missing)
# stable_baselines3 is the RL library; gymnasium is the environment package this notebook imports;
# pygame is needed for the "human" render mode; tensorboard is needed because the agents set tensorboard_log.
pip install --upgrade stable_baselines3
pip install --upgrade gymnasium pygame tensorboard setuptools importlib-metadata
```

**Stable Baselines3 Documentation**  
Official Website: https://stable-baselines3.readthedocs.io/en/master/  
Supported Algorithms: A2C, PPO, DDPG, DQN, SAC, TD3   

### **Introduction | 简单介绍**

In [None]:
# Import Dependencies
import os
import gymnasium as gym
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
# Run a simple CartPole environment with visualization.
# Each episode is played with randomly sampled actions until the environment
# reports that it has terminated or been truncated.
env = gym.make("CartPole-v1",
                render_mode="human")
n_episodes = 50

try:
    for episode in range(n_episodes):
        env.reset()
        episode_over = False
        while not episode_over:
            # No trained agent exists yet, so sample an action from the action space
            random_action = env.action_space.sample()
            _, _, terminated, truncated, _ = env.step(random_action)
            episode_over = terminated or truncated
finally:
    # Always release the render window, even if the loop is interrupted
    env.close()

In [None]:
# Inspect the environment: print what reset() returns (initial observation and
# info dict), then the action space, one per line.
env = gym.make("CartPole-v1")
print(env.reset(), env.action_space, sep="\n")
env.close()

(array([-0.02554033,  0.02242392, -0.00096789,  0.00861304], dtype=float32), {})
Discrete(2)


In [36]:
# IPython introspection (development aid): `??` displays the signature and source of PPO
PPO??

Init signature:
PPO(
    policy: Union[str, type[stable_baselines3.common.policies.ActorCriticPolicy]],
    env: Union[gymnasium.core.Env, ForwardRef('VecEnv'), str],
    learning_rate: Union[float, Callable[[float], float]] = 0.0003,
    n_steps: int = 2048,
    batch_size: int = [output truncated]

### **Officially Start | 正式开始**

In [None]:
# Import Dependencies
import os
import gymnasium as gym
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

#### **1.Create Env and Agent | 创建环境+智能体**

In [None]:
# Set the log folder.
# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, so no separate existence check is needed.
log_folder = os.path.join("Training", "Logs")
os.makedirs(log_folder, exist_ok=True)

# Create the environment, wrapped in a DummyVecEnv.
# The factory builds the environment itself instead of closing over the name
# `env`, which is rebound to the wrapper on this very line.
env = DummyVecEnv([lambda: gym.make("CartPole-v1")])

# Create the PPO agent
model = PPO(policy = "MlpPolicy",
            env = env,
            device = "cpu",                  # run on CPU; the original note says CPU is the officially recommended device for this case rather than "cuda"
            verbose = 1,                     # logging verbosity
            tensorboard_log = log_folder)    # Tensorboard log folder location

Using cpu device


#### **2.Train Model | 训练模型**

In [None]:
# Train the agent for 20,000 timesteps
model.learn(total_timesteps=20_000)

# Save the trained model under Training/Saved Models/PPO_CartPole
PPO_Path = os.path.join(*["Training", "Saved Models", "PPO_CartPole"])
model.save(PPO_Path)

#### **3.Load Model | 加载模型**

In [None]:
# Delete the in-memory model
del model

# Load the saved model for testing.
# Once loaded, training can be continued with model.learn().
model = PPO.load(PPO_Path, env=env, device="cpu")

#### **3.Evaluate Model | 评估模型**

In [None]:
# Evaluate the model

# Create an environment that renders to a window ("human" render mode)
env = gym.make("CartPole-v1", render_mode="human")

# Run 2 evaluation episodes with rendering; the value returned by
# evaluate_policy is shown as the cell output.
evaluate_policy(model, env, n_eval_episodes=2, render=True)




(np.float64(207.5), np.float64(0.5))

In [None]:
# Close the environment created for the rendered evaluation
env.close()

#### **4.Test Model | 测试模型**

In [None]:
def test_model(model_type, model_path, n_episodes=5):

    print(model_path)

    # 初始化环境
    # Initialize the environment
    env = gym.make("CartPole-v1", render_mode="human") 
    env = DummyVecEnv([lambda: env])  
    obs = env.reset()

    # 加载模型
    # Load the model
    if model_type == "PPO":
        model = PPO.load(model_path,
                         device = "cpu",
                         env = env)
    else:
        model = DQN.load(model_path,
                        device = "cpu",
                        env = env)


    # 开始测试
    # Start testing
    for episode in range(n_episodes):
        score = 0
        done = False
        while done == False:
            action, _ = model.predict(obs)            # 获取行为  
            obs, reward, done, _ = env.step(action)   # 环境交互
            score += reward                           # 回报计算
        print(f"Episode: {episode + 1} Score: {score}")
    env.close()

# Run the PPO model saved at PPO_Path for 5 rendered test episodes
test_model("PPO", PPO_Path, n_episodes=5)

Training\Saved Models\PPO_CartPole_Mod
Episode: 1 Score: [187.]
Episode: 2 Score: [247.]
Episode: 3 Score: [201.]
Episode: 4 Score: [315.]
Episode: 5 Score: [171.]


#### **5.Check Log | 查看日志 TensorBoard**

In [None]:
# Point TensorBoard at the "PPO_2" run directory inside the log folder
training_log_path = os.path.join(log_folder,"PPO_2")
# Launch TensorBoard through the shell; IPython interpolates {training_log_path}
!tensorboard --logdir={training_log_path}
# Then open localhost:6006 to inspect the data recorded during training

^C


#### **6.Add Callback | 添加 Callback 回调函数，到达设定奖励就停止训练**

In [None]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3.common.monitor import Monitor

save_path = os.path.join('Training', 'Saved Models')

################################################################################################
# Create the environments BEFORE the callbacks.
# The evaluation callback keeps a reference to the environment it is given, so
# the environment must exist first; otherwise it would bind to whatever `env`
# was left over from an earlier cell.

# Training environment, wrapped in a DummyVecEnv
env = DummyVecEnv([lambda: gym.make("CartPole-v1")])

# Separate evaluation environment, wrapped in Monitor so that episode rewards
# and lengths are recorded for the evaluation
eval_env = DummyVecEnv([lambda: Monitor(gym.make("CartPole-v1"))])

################################################################################################
# Stop training when the reward reaches 200

# Callback that stops training once the reward threshold is reached
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200,
                                              verbose=1)
# Callback that periodically evaluates the agent during training
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,   # callback run when evaluation finds a new best model
                             eval_freq=10000,                      # evaluate every 10000 steps
                             best_model_save_path=save_path,       # best model save path
                             verbose=1)

################################################################################################
# Create the PPO agent
model = PPO(policy = "MlpPolicy",
            env = env,
            device = "cpu",
            verbose = 1,
            tensorboard_log = log_folder)

Using cpu device


In [None]:
# Train the model for 20,000 timesteps, with the evaluation callback attached
model.learn(total_timesteps=20_000, callback=eval_callback)

Logging to Training\Logs\PPO_5
-----------------------------
| time/              |      |
|    fps             | 1954 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1339        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007790219 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.000961   |
|    learning_rate        | 0.0003      |
|    loss                 | 6.85        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.016      |
|    value_loss           | 58          |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x2725e19d310>

#### **7.Change Policy | 修改策略**

In [None]:
# Custom policy network structure:
#   "pi" - policy (Actor) network:         4 hidden layers of 128 neurons each
#   "vf" - value function (Critic) network: 4 hidden layers of 128 neurons each
net_arch = {
    "pi": [128] * 4,
    "vf": [128] * 4,
}

In [None]:
# Create the environment, wrapped in a DummyVecEnv
env = DummyVecEnv([lambda: gym.make("CartPole-v1")])

# Create the PPO agent; policy_kwargs carries the custom network structure
model = PPO(
    "MlpPolicy",
    env,
    policy_kwargs=dict(net_arch=net_arch),
    tensorboard_log=log_folder,
    device="cpu",
    verbose=1,
)

Using cpu device


In [None]:
# Train for 50,000 timesteps with the evaluation callback attached.
# NOTE(review): eval_callback is the object built in an earlier cell, so it
# presumably still evaluates on the environment it was constructed with — confirm.
model.learn(total_timesteps=50_000, callback=eval_callback)

# Save the model under Training/Saved Models/PPO_CartPole_Mod
PPO_Path = os.path.join(*["Training", "Saved Models", "PPO_CartPole_Mod"])
model.save(PPO_Path)

Logging to Training\Logs\PPO_12
-----------------------------
| time/              |      |
|    fps             | 1175 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 744         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014662875 |
|    clip_fraction        | 0.2         |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | 0.00156     |
|    learning_rate        | 0.0003      |
|    loss                 | 3.37        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0227     |
|    value_loss           | 16.4        |
-----------------------------------------
--

In [171]:
# Test the PPO model saved at PPO_Path (default number of episodes)
test_model('PPO',PPO_Path)

Training\Saved Models\PPO_CartPole_Mod
Episode: 1 Score: [234.]
Episode: 2 Score: [45.]
Episode: 3 Score: [120.]
Episode: 4 Score: [370.]
Episode: 5 Score: [391.]


#### **8.Use DQN Algorithm | 使用 DQN 算法**

In [None]:
# Create the environment, wrapped in a DummyVecEnv
env = DummyVecEnv([lambda: gym.make("CartPole-v1")])

# Create the DQN agent
model = DQN(
    "MlpPolicy",
    env,
    tensorboard_log=log_folder,   # Tensorboard log folder location
    verbose=1,                    # logging verbosity
    device="cpu",                 # the original note says CPU is the officially recommended device for this case rather than "cuda"
)

Using cpu device


In [None]:
# Train the model for 100,000 timesteps with the evaluation callback attached
model.learn(callback=eval_callback, total_timesteps=100_000)

# Save the model under Training/Saved Models/DQN_CartPole
DQN_Path = os.path.join(*["Training", "Saved Models", "DQN_CartPole"])
model.save(DQN_Path)

Logging to Training\Logs\DQN_3
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.991    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4699     |
|    time_elapsed     | 0        |
|    total_timesteps  | 97       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.98     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1946     |
|    time_elapsed     | 0        |
|    total_timesteps  | 206      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0808   |
|    n_updates        | 29976    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.973    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 



Eval num_timesteps=10000, episode_reward=283.80 +/- 105.07
Episode length: 283.80 +/- 105.07
----------------------------------
| eval/               |          |
|    mean_ep_length   | 284      |
|    mean_reward      | 284      |
| rollout/            |          |
|    exploration_rate | 0.0501   |
| time/               |          |
|    total_timesteps  | 10000    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00844  |
|    n_updates        | 32424    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 168      |
|    fps              | 987      |
|    time_elapsed     | 10       |
|    total_timesteps  | 10606    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0256   |
|    n_updates        | 32576    |
--------------------------------

In [170]:
# Test the saved DQN model, reusing DQN_Path from the cell that saved it
# instead of re-typing the path (mirrors how the PPO test call uses PPO_Path)
test_model('DQN', DQN_Path)

Training/Saved Models/DQN_CartPole
Episode: 1 Score: [235.]
Episode: 2 Score: [235.]
Episode: 3 Score: [182.]
Episode: 4 Score: [223.]
Episode: 5 Score: [229.]
