## Environment with Simultaneously Stepping Agents

A simple example of a multi-agent environment in which all agents always step simultaneously is the Rock-Paper-Scissors game. Two agents play N moves in total, and on each move every agent chooses one of the actions “Rock”, “Paper”, or “Scissors”. After each move, the two choices are compared: Rock beats Scissors, Paper beats Rock, and Scissors beats Paper. The player who wins the move receives a reward of +1 and the losing player receives -1; if both players choose the same action, both receive 0.

In [2]:
!pip install ray
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import gymnasium as gym


Collecting ray
  Downloading ray-2.53.0-cp312-cp312-manylinux2014_x86_64.whl.metadata (22 kB)
Downloading ray-2.53.0-cp312-cp312-manylinux2014_x86_64.whl (72.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.4/72.4 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: ray
Successfully installed ray-2.53.0




In [3]:
class RockPaperScissors(MultiAgentEnv):
    """Two-player Rock-Paper-Scissors environment with simultaneous moves.

    Both players act at every timestep. An episode lasts ``max_moves``
    timesteps (10 unless overridden through ``config``). The winner of a
    timestep receives a reward of +1 and the loser -1; a draw gives both
    players 0.

    Each player observes the action its opponent played on the previous
    timestep. The first observation after ``reset`` is 0 for both players,
    which has the same value as ``ROCK``.
    """

    ROCK = 0
    PAPER = 1
    SCISSORS = 2
    # NOTE: LIZARD and SPOCK are not part of the game implemented here: they
    # do not appear in WIN_MATRIX and lie outside the Discrete(3) spaces.
    # They are kept only so existing references to these names keep working.
    LIZARD = 3
    SPOCK = 4

    PLAYER1 = "player1"
    PLAYER2 = "player2"

    # Episode length used when the config does not provide "max_moves".
    DEFAULT_MAX_MOVES = 10

    # Maps (player1 move, player2 move) -> (player1 reward, player2 reward).
    WIN_MATRIX = {
        (ROCK, ROCK): (0, 0),
        (ROCK, PAPER): (-1, 1),
        (ROCK, SCISSORS): (1, -1),
        (PAPER, ROCK): (1, -1),
        (PAPER, PAPER): (0, 0),
        (PAPER, SCISSORS): (-1, 1),
        (SCISSORS, ROCK): (-1, 1),
        (SCISSORS, PAPER): (1, -1),
        (SCISSORS, SCISSORS): (0, 0),
    }

    def __init__(self, config=None):
        """Create the environment.

        Args:
            config: Optional mapping of settings. The only key read is
                ``"max_moves"`` (number of moves per episode, default 10).
        """
        super().__init__()

        config = config or {}
        self.max_moves = config.get("max_moves", self.DEFAULT_MAX_MOVES)

        # The set of agents does not change during an episode. Use two
        # separate lists so mutating one cannot silently alter the other.
        self.possible_agents = [self.PLAYER1, self.PLAYER2]
        self.agents = list(self.possible_agents)

        # Observations are the opponent's last action, so the observation and
        # action spaces have the same shape. They are still built as
        # independent dicts rather than two names for one object.
        self.observation_spaces = {
            agent_id: gym.spaces.Discrete(3) for agent_id in self.possible_agents
        }
        self.action_spaces = {
            agent_id: gym.spaces.Discrete(3) for agent_id in self.possible_agents
        }

        self.last_move = None
        self.num_moves = 0

    def reset(self, *, seed=None, options=None):
        """Start a new episode.

        ``seed`` and ``options`` are accepted but not used by this
        environment.

        Returns:
            A tuple ``(observations, infos)``. Both players start with
            observation 0; ``infos`` is empty.
        """
        self.num_moves = 0
        self.last_move = None

        observations = {
            self.PLAYER1: 0,
            self.PLAYER2: 0,
        }
        infos = {}

        return observations, infos

    def step(self, action_dict):
        """Play one simultaneous move for both players.

        Args:
            action_dict: Mapping from agent ID to the chosen action
                (``ROCK``, ``PAPER`` or ``SCISSORS``) for both players.

        Returns:
            A tuple ``(observations, rewards, terminateds, truncateds,
            infos)``, each a dict.

        Raises:
            KeyError: If a player's action is missing from ``action_dict``
                or the pair of moves is not in ``WIN_MATRIX``.
        """
        self.num_moves += 1

        move1 = action_dict[self.PLAYER1]
        move2 = action_dict[self.PLAYER2]
        self.last_move = (move1, move2)

        # Each player observes what the opponent just played.
        observations = {self.PLAYER1: move2, self.PLAYER2: move1}

        # Compute rewards for each player based on the win-matrix.
        r1, r2 = self.WIN_MATRIX[move1, move2]
        rewards = {self.PLAYER1: r1, self.PLAYER2: r2}

        # The episode terminates for all agents once `max_moves` moves have
        # been made. It is never truncated; the two flags are separate dicts
        # so the episode is not reported as both terminated and truncated.
        terminateds = {"__all__": self.num_moves >= self.max_moves}
        truncateds = {"__all__": False}

        infos = {}
        return observations, rewards, terminateds, truncateds, infos

In [4]:
# Register the environment with RLlib
from ray.tune.registry import register_env

def env_creator(config=None):
    """Return a new ``RockPaperScissors`` instance built from ``config``."""
    env = RockPaperScissors(config)
    return env

register_env('rps_multiagent', env_creator)


In [5]:
from ray.rllib.algorithms.ppo import PPOConfig

# Assemble the PPO configuration for RLlib in one chained expression.
config = (
    PPOConfig()
    .environment(env="rps_multiagent", env_config={})
    .multi_agent(
        # One policy per player; each policy is named after its agent ID.
        policies={
            policy_id: (None, gym.spaces.Discrete(3), gym.spaces.Discrete(3), {})
            for policy_id in ("player1", "player2")
        },
        # Every agent is served by the policy that carries its own ID.
        policy_mapping_fn=lambda agent_id, *args, **kwargs: agent_id,
    )
)
config.num_env_runners = 1

# Turn off both the RLModule/Learner and the EnvRunner/ConnectorV2 stacks.
config.api_stack(enable_rl_module_and_learner=False, enable_env_runner_and_connector_v2=False)

<ray.rllib.algorithms.ppo.ppo.PPOConfig at 0x78cfb05e60c0>

In [6]:
# Build the algorithm from the `config` object prepared in the previous cell.
# The resulting `algo` is what later cells call `train()` and `get_policy()` on.

algo = config.build_algo()

2026-01-16 18:05:10,033	INFO tensorboardx.py:45 -- pip install "ray[tune]" to see TensorBoard files.
2026-01-16 18:05:15,072	INFO worker.py:2007 -- Started a local Ray instance.
[36m(pid=1835)[0m E0000 00:00:1768586723.584691    1835 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=1835)[0m E0000 00:00:1768586723.590571    1835 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(pid=1835)[0m W0000 00:00:1768586723.607947    1835 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
[36m(pid=1835)[0m W0000 00:00:1768586723.607978    1835 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
[36m(pid=1835)[0m W0000 00:00:1768586

In [7]:
# Train the agents for 20 iterations and print per-policy learner statistics.
# Every iteration's result dict is stored in `results` for later inspection.
results = []

for i in range(20):
    result = algo.train()
    results.append(result)

    learner_stats = result['info']['learner']
    print(f"Iteration {i+1}:")
    for agent, stats in learner_stats.items():
        ls = stats['learner_stats']
        print(
            f"  {agent}: total_loss={ls['total_loss']:.3f}, "
            f"policy_loss={ls['policy_loss']:.3f}, "
            f"vf_loss={ls['vf_loss']:.3f}, "
            f"entropy={ls['entropy']:.3f}, "
            f"kl={ls['kl']:.3f}"
        )
    print(f"  env_steps_sampled={result['info']['num_env_steps_sampled']}")

[33m(raylet)[0m [2026-01-16 18:05:45,038 E 1796 1796] (raylet) main.cc:1032: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
[36m(RolloutWorker pid=1835)[0m [2026-01-16 18:05:52,616 E 1835 1917] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14


Iteration 1:
  player1: total_loss=2.695, policy_loss=-0.005, vf_loss=2.698, entropy=1.089, kl=0.009
  player2: total_loss=2.696, policy_loss=-0.004, vf_loss=2.698, entropy=1.090, kl=0.009
  env_steps_sampled=4000
Iteration 2:
  player1: total_loss=2.784, policy_loss=-0.007, vf_loss=2.789, entropy=1.079, kl=0.009
  player2: total_loss=2.788, policy_loss=-0.005, vf_loss=2.791, entropy=1.087, kl=0.010
  env_steps_sampled=8000
Iteration 3:
  player1: total_loss=2.740, policy_loss=-0.007, vf_loss=2.744, entropy=1.056, kl=0.013
  player2: total_loss=2.740, policy_loss=-0.007, vf_loss=2.744, entropy=1.083, kl=0.015
  env_steps_sampled=12000
Iteration 4:
  player1: total_loss=2.553, policy_loss=-0.003, vf_loss=2.554, entropy=1.066, kl=0.008
  player2: total_loss=2.544, policy_loss=-0.015, vf_loss=2.555, entropy=1.040, kl=0.022
  env_steps_sampled=16000
Iteration 5:
  player1: total_loss=2.836, policy_loss=-0.021, vf_loss=2.851, entropy=1.085, kl=0.032
  player2: total_loss=2.849, policy_loss=

In [12]:
# Play a game using trained policies
import numpy as np

env = RockPaperScissors()
obs, _ = env.reset()
print("Initial observations: ", obs)

# The policy objects do not change between steps, so fetch them once instead
# of on every move. Policy IDs equal agent IDs in this setup.
policies = {agent_id: algo.get_policy(agent_id) for agent_id in obs}

total_rewards = {env.PLAYER1: 0, env.PLAYER2: 0}
for step in range(10):
    actions = {}
    for agent_id, agent_obs in obs.items():
        # Convert scalar obs to one-hot for the policy
        obs_onehot = np.eye(3)[agent_obs]
        action = policies[agent_id].compute_single_action(obs_onehot)
        # compute_single_action returns a tuple; its first item is the action.
        actions[agent_id] = int(action[0])

    # actions dict contains integer actions, which is correct for env.step
    obs, rewards, terminateds, truncateds, infos = env.step(actions)
    print(f"Step {step+1}: Actions: {actions}, Rewards: {rewards}")

    for agent_id, reward in rewards.items():
        total_rewards[agent_id] += reward

    # Stop as soon as the env reports the episode over, whether it was
    # terminated or truncated.
    if terminateds.get("__all__", False) or truncateds.get("__all__", False):
        break

print(f"Total rewards: {total_rewards}")

Initial observations:  {'player1': 0, 'player2': 0}
Step 1: Actions: {'player1': 0, 'player2': 0}, Rewards: {'player1': 0, 'player2': 0}
Step 2: Actions: {'player1': 2, 'player2': 1}, Rewards: {'player1': 1, 'player2': -1}
Step 3: Actions: {'player1': 1, 'player2': 1}, Rewards: {'player1': 0, 'player2': 0}
Step 4: Actions: {'player1': 1, 'player2': 0}, Rewards: {'player1': 1, 'player2': -1}
Step 5: Actions: {'player1': 1, 'player2': 0}, Rewards: {'player1': 1, 'player2': -1}
Step 6: Actions: {'player1': 2, 'player2': 0}, Rewards: {'player1': -1, 'player2': 1}
Step 7: Actions: {'player1': 1, 'player2': 0}, Rewards: {'player1': 1, 'player2': -1}
Step 8: Actions: {'player1': 1, 'player2': 0}, Rewards: {'player1': 1, 'player2': -1}
Step 9: Actions: {'player1': 2, 'player2': 0}, Rewards: {'player1': -1, 'player2': 1}
Step 10: Actions: {'player1': 0, 'player2': 0}, Rewards: {'player1': 0, 'player2': 0}
Total rewards: {'player1': 3, 'player2': -3}
