# RLlib 的入门介绍
首先，为算法创建一个 config，它定义了 RL 环境以及其他所需的设置和参数。

In [17]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.connectors.env_to_module import FlattenObservations

# Assemble the PPO configuration one setter at a time. Each setter updates
# the config object in place, so the calls do not have to be chained.
config = PPOConfig()
config.environment("Taxi-v3")
config.env_runners(
    num_env_runners=2,
    # Taxi-v3 observations are discrete ints, so they are flattened
    # (one-hot encoded) on their way from the env to the module.
    env_to_module_connector=lambda env: FlattenObservations(),
)
config.evaluation(evaluation_num_env_runners=1)

接下来，build 算法并 train 共五次迭代。一次训练迭代包括由 EnvRunner actor 并行、分布式地收集样本，然后对收集到的数据计算损失，以及模型更新步骤。

In [18]:
from pprint import pprint

# Instantiate the Algorithm described by `config`.
algo = config.build_algo()

# Run five training iterations, pretty-printing the result dict of each one.
num_iterations = 5
for _iteration in range(num_iterations):
    train_result = algo.train()
    pprint(train_result)

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
[2025-08-04 14:07:24,863 E 2483888 2483888] core_worker.cc:2740: Actor with class name: 'SingleAgentEnvRunner' and ID: '2b5c199acc21edf6159dcd7201000000' has constructor arguments in the object store and max_restarts > 0. If the arguments in the object store go out of scope or are lost, the actor resta

{'config': {'_disable_action_flattening': False,
            '_disable_execution_plan_api': -1,
            '_disable_initialize_loss_from_dummy_batch': False,
            '_disable_preprocessor_api': False,
            '_dont_auto_sync_env_runner_states': False,
            '_enable_rl_module_api': -1,
            '_env_to_module_connector': <function <lambda> at 0x7feb32864790>,
            '_fake_gpus': False,
            '_is_atari': None,
            '_is_online': True,
            '_learner_class': None,
            '_learner_connector': None,
            '_model_config': {},
            '_module_to_env_connector': None,
            '_per_module_overrides': {},
            '_prior_exploration_config': {'type': 'StochasticSampling'},
            '_rl_module_spec': None,
            '_tf_policy_handles_more_than_one_loss': False,
            '_torch_grad_scaler_class': None,
            '_torch_lr_scheduler_classes': None,
            '_train_batch_size_per_learner': None,
        

在脚本结束时，评估训练好的 Algorithm 并释放其所有资源。

In [19]:
# Evaluate the trained algorithm and pretty-print the evaluation results.
evaluation_results = algo.evaluate()
pprint(evaluation_results)

# Shut the algorithm down to free its resources
# (remote actors, like EnvRunners and Learners).
algo.stop()

{'env_runners': {'agent_episode_return_mean': {'default_agent': -524.9},
                 'env_reset_timer': 0.0005784661043435335,
                 'env_step_timer': 0.00012521755513990153,
                 'env_to_module_connector': {'connector_pipeline_timer': 0.00031241419131039006,
                                             'timers': {'connectors': {'add_observations_from_episodes_to_batch': 1.2183304170345956e-05,
                                                                       'add_states_from_episodes_to_batch': 7.226803572083017e-06,
                                                                       'add_time_dim_to_batch_and_zero_pad': 1.2250860198382362e-05,
                                                                       'batch_individual_items': 3.216233114627092e-05,
                                                                       'flatten_observations': 7.306785694877981e-05,
                                                                       '

# RLlib 的基本构建流程
可以通过调用 config 的 environment() 方法来设置要使用的强化学习环境；调用 env_runners() 方法来设置希望使用多少个 EnvRunner Actor；调用 training() 方法来设置与训练相关的或算法特定的参数；最后调用 build_algo() 方法来构建实际的 Algorithm 实例。

In [20]:
from ray.rllib.algorithms.ppo import PPOConfig

# Describe the PPO setup for Pendulum-v1 in a single fluent chain:
# environment, number of EnvRunners, then the training hyperparameters.
config = (
    PPOConfig()
    .environment("Pendulum-v1")
    .env_runners(num_env_runners=2)
    .training(
        lr=0.0002,
        train_batch_size_per_learner=2000,
        num_epochs=10,
    )
)

# Turn the finished config into the actual Algorithm instance (PPO).
ppo = config.build_algo()


`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
[2025-08-04 14:42:14,831 E 2483888 2483888] core_worker.cc:2740: Actor with class name: 'SingleAgentEnvRunner' and ID: 'ec530fdf6ae7223a6c0541b901000000' has constructor arguments in the object store and max_restarts > 0. If the arguments in the object store go out of scope or are lost, the actor resta

调用 train() 方法对其进行多次迭代训练，然后创建一个检查点。下面的示例使用 save() 方法将检查点写入指定目录；也可以改用 save_to_path() 方法，该方法返回保存检查点的目录。

In [25]:
from pprint import pprint
import os

# Run four training iterations, pretty-printing the result dict of each one.
for _iteration in range(4):
    train_result = ppo.train()
    pprint(train_result)

# Write a checkpoint below ./checkpoints (passed as a file:// URI built from
# the absolute path) and show what `save()` returned.
abs_path = os.path.abspath("./checkpoints")
checkpoint_path = ppo.save(checkpoint_dir=f"file://{abs_path}")
print("checkpoint path: ", checkpoint_path)
# Alternative: ppo.save_to_path([a checkpoint location of your choice]),
# e.g. ppo.save_to_path(os.getcwd()).

{'config': {'_disable_action_flattening': False,
            '_disable_execution_plan_api': -1,
            '_disable_initialize_loss_from_dummy_batch': False,
            '_disable_preprocessor_api': False,
            '_dont_auto_sync_env_runner_states': False,
            '_enable_rl_module_api': -1,
            '_env_to_module_connector': None,
            '_fake_gpus': False,
            '_is_atari': None,
            '_is_online': True,
            '_learner_class': None,
            '_learner_connector': None,
            '_model_config': {},
            '_module_to_env_connector': None,
            '_per_module_overrides': {},
            '_prior_exploration_config': {'type': 'StochasticSampling'},
            '_rl_module_spec': None,
            '_tf_policy_handles_more_than_one_loss': False,
            '_torch_grad_scaler_class': None,
            '_torch_lr_scheduler_classes': None,
            '_train_batch_size_per_learner': 2000,
            '_use_msgpack_checkpoints': F

In [22]:
# Evaluation settings applied on top of the existing `config`.
evaluation_settings = {
    # Run one evaluation round every iteration.
    "evaluation_interval": 1,
    # Create 2 eval EnvRunners in the extra EnvRunnerGroup.
    "evaluation_num_env_runners": 2,
    # Run evaluation for exactly 10 episodes. Because there are
    # 2 EnvRunners, each one runs through 5 episodes.
    "evaluation_duration_unit": "episodes",
    "evaluation_duration": 10,
}
config.evaluation(**evaluation_settings)

# Build a second PPO instance that also owns the evaluation EnvRunnerGroup.
ppo_with_evaluation = config.build_algo()

# Train for three iterations, pretty-printing the result dict of each one.
for _iteration in range(3):
    iteration_result = ppo_with_evaluation.train()
    pprint(iteration_result)

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
[2025-08-04 14:45:36,718 E 2483888 2483888] core_worker.cc:2740: Actor with class name: 'SingleAgentEnvRunner' and ID: 'ee84b34ab9fe8ad6ba59e43c01000000' has constructor arguments in the object store and max_restarts > 0. If the arguments in the object store go out of scope or are lost, the actor resta

{'config': {'_disable_action_flattening': False,
            '_disable_execution_plan_api': -1,
            '_disable_initialize_loss_from_dummy_batch': False,
            '_disable_preprocessor_api': False,
            '_dont_auto_sync_env_runner_states': False,
            '_enable_rl_module_api': -1,
            '_env_to_module_connector': None,
            '_fake_gpus': False,
            '_is_atari': None,
            '_is_online': True,
            '_learner_class': None,
            '_learner_connector': None,
            '_model_config': {},
            '_module_to_env_connector': None,
            '_per_module_overrides': {},
            '_prior_exploration_config': {'type': 'StochasticSampling'},
            '_rl_module_spec': None,
            '_tf_policy_handles_more_than_one_loss': False,
            '_torch_grad_scaler_class': None,
            '_torch_lr_scheduler_classes': None,
            '_train_batch_size_per_learner': 2000,
            '_use_msgpack_checkpoints': F

# RLlib 与 Ray Tune 结合
使用 Ray Tune 对 PPO 的学习率（lr）进行超参数扫描（grid search），共创建三个 Trial。

In [13]:
from ray import train, tune
from ray.rllib.algorithms.ppo import PPOConfig
import os
abs_path = os.path.abspath("./checkpoints")

# Learning rates to sweep; the grid search yields one trial per value.
learning_rates = [0.001, 0.0005, 0.0001]

config = PPOConfig()
config.environment("Pendulum-v1")
# Specify a simple tune hyperparameter sweep.
config.training(lr=tune.grid_search(learning_rates))
config.framework("torch")

# Stopping criterion and storage settings for the experiment. The stop key
# has to match one of the pretty-printed result metrics returned earlier by
# ``.train()``. -1100 is not a good episode return for Pendulum-v1; it is
# used here only to shorten the experiment time.
run_config = train.RunConfig(
    storage_path=f"file://{abs_path}",  # local saving directory
    name="PPO_Pendulum_Experiment",
    stop={"env_runners/episode_return_mean": -1100.0},
)

# Create a Tuner instance to manage the trials.
tuner = tune.Tuner(
    config.algo_class,
    param_space=config,
    run_config=run_config,
)

# Run the Tuner and capture the results.
results = tuner.fit()


0,1
Current time:,2025-08-04 21:12:33
Running for:,00:04:28.43
Memory:,10.1/15.3 GiB

Trial name,status,loc,lr,iter,total time (s),num_training_step_ca lls_per_iteration,num_env_steps_sample d_lifetime
PPO_Pendulum-v1_0ed13_00000,TERMINATED,10.110.34.88:3183870,0.001,17,256.716,1,68000
PPO_Pendulum-v1_0ed13_00001,TERMINATED,10.110.34.88:3183871,0.0005,3,50.8424,1,12000
PPO_Pendulum-v1_0ed13_00002,TERMINATED,10.110.34.88:3183872,0.0001,4,65.1553,1,16000


2025-08-04 21:12:33,370	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/robotarm/data/Ray_Tutorial/checkpoints/PPO_Pendulum_Experiment' in 0.0275s.
2025-08-04 21:12:33,890	INFO tune.py:1041 -- Total run time: 268.96 seconds (268.40 seconds for the tuning loop).


In [14]:
# Select the trial with the highest mean episode return from the preceding
# experiment. Despite the variable name, this is the trial's `Result`
# object (see the printed output), not a checkpoint.
best_checkpoint = results.get_best_result(metric="env_runners/episode_return_mean", mode="max")

# Show the selected result.
print("best_checkpoints: ", best_checkpoint)

best_checkpoints:  Result(
  metrics={'timers': {'training_iteration': 16.730737225361874, 'restore_env_runners': 4.772419527650345e-05, 'training_step': 16.73012513332691, 'env_runner_sampling_timer': 4.186174309930475, 'learner_update_timer': 12.539987718354029, 'synch_weights': 0.003198508155695046, 'synch_env_connectors': 0.00342173548793653}, 'env_runners': {'env_to_module_sum_episodes_length_in': np.float64(131.73765420742725), 'env_to_module_sum_episodes_length_out': np.float64(131.73765420742725), 'module_to_env_connector': {'timers': {'connectors': {'tensor_to_numpy': np.float64(9.014953726898591e-05), 'remove_single_ts_time_rank_from_batch': np.float64(3.406815096758622e-06), 'get_actions': np.float64(0.0003605052017329991), 'normalize_and_clip_actions': np.float64(9.72058707174931e-05), 'un_batch_to_individual_items': np.float64(3.085118661835977e-05), 'listify_data_for_vector_env': np.float64(5.314183380117411e-05)}}, 'connector_pipeline_timer': np.float64(0.000818457916622

部署训练好的模型。从 checkpoint 创建新的模型实例，并在强化学习环境中运行一个 episode 的推理。

In [None]:
from pathlib import Path
import gymnasium as gym
import numpy as np
import torch
from ray.rllib.core.rl_module import RLModule

# Create only the neural network (RLModule) from our algorithm checkpoint.
# See here (https://docs.rayai.org.cn/en/master/rllib/checkpoints.html)
# to learn more about checkpointing and the specific "path" used.
# NOTE(review): this assumes the best trial's directory contains a
# "checkpoint_000000" sub-directory -- confirm against the experiment output.
rl_module = RLModule.from_checkpoint(
    Path(best_checkpoint.path)
    / "checkpoint_000000"
    / "learner_group"
    / "learner"
    / "rl_module"
    / "default_policy"
)
# If the trained Algorithm object is still available in this process, the
# RLModule can be fetched from it directly instead:
# rl_module = ppo.get_module("default_policy")  # Equivalent to `rl_module = ppo.get_module()`

# Create the RL environment to test against (same as was used for training earlier).
# render_mode="human" is requested here, so the loop below does not call
# env.render() explicitly.
env = gym.make("Pendulum-v1", render_mode="human")

episode_return = 0.0
done = False

# Run exactly one episode. The try/finally guarantees that the environment
# (and its render window) is closed even if inference raises midway.
try:
    # Reset the env to get the initial observation.
    obs, info = env.reset()

    while not done:
        # Compute the next action from a batch (B=1) of observations.
        obs_batch = torch.from_numpy(obs).unsqueeze(0)  # add batch B=1 dimension
        model_outputs = rl_module.forward_inference({"obs": obs_batch})

        # Extract the action distribution parameters from the output and dissolve batch dim.
        action_dist_params = model_outputs["action_dist_inputs"][0].numpy()

        # We have continuous actions -> take the mean (max likelihood).
        greedy_action = np.clip(
            action_dist_params[0:1],  # 0=mean, 1=log(stddev), [0:1]=use mean, but keep shape=(1,)
            a_min=env.action_space.low[0],
            a_max=env.action_space.high[0],
        )
        # For discrete actions, you should take the argmax over the logits:
        # greedy_action = np.argmax(action_dist_params)

        # Send the action to the environment for the next step.
        obs, reward, terminated, truncated, info = env.step(greedy_action)

        # Perform env-loop bookkeeping.
        episode_return += reward
        done = terminated or truncated
finally:
    # Release the environment's resources (e.g. the "human" render window).
    env.close()

print(f"Reached episode return of {episode_return}.")


Reached episode return of -987.1930495822006.


: 