In [1]:
!pip install pettingzoo[butterfly]>=1.24.0
!pip install supersuit>=3.9.0
!pip install stable-baselines3>=2.0.0
!pip install imageio



### PettingZoo

Gym is the standard environment interface for single-agent reinforcement learning, and PettingZoo is its equivalent for multi-agent environments.

We will set up a PPO training run for a multi-agent environment, Knights Archers Zombies.

In [2]:
from __future__ import annotations

import glob
import os
import time

import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy, MlpPolicy
from stable_baselines3.common.vec_env import VecVideoRecorder

from pettingzoo.butterfly import knights_archers_zombies_v10

  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


### Env kwargs
These keyword arguments are passed to the PettingZoo environment when it is created and tell it how to initialise itself.

- `max_cycles`: End each episode after at most 100 cycles
- `max_zombies`: Allow at most 4 zombies to exist at any point in time
- `vector_state`: If `True`, represent the state with vectors; if `False`, use images instead.

In [3]:
# Environment settings, unpacked as keyword arguments when the environment is built
env_kwargs = {"max_cycles": 100, "max_zombies": 4, "vector_state": True}
env_fn = knights_archers_zombies_v10

# Training parameters
seed = 0  # seed used when resetting the environment
steps = 81_920  # total timesteps requested from PPO


### Today's environment: Knights Archers Zombies
<img src="https://pettingzoo.farama.org/_images/butterfly_knights_archers_zombies.gif"/>
The environment consists of two types of agents that run around killing zombies:
- Archers: shoot arrows at zombies
- Knights: move in to attack zombies in melee

Actions: Can move in any direction and rotate character
Reward: The more zombies an agent kills, the more rewards it gets.

Setup: Can be AEC or Parallel

#### Types of Environment
AEC: Each agent takes a turn before passing on to the next agent

Parallel: Everyone moves in parallel



In [4]:
# Build the parallel-API version of the environment from the settings chosen above
env = env_fn.parallel_env(**env_kwargs)

### Introducing Supersuit

SuperSuit is a collection of small wrapper functions that preprocess reinforcement learning environments. It supports both Gymnasium and PettingZoo.

In [5]:
# Wrap with black death first so that the number of agents stays constant:
# MarkovVectorEnv does not support environments with varying numbers of active
# agents unless black_death is set to True.
env = ss.black_death_v3(env)

# Image preprocessing is only needed when the environment emits visual
# observations, i.e. when vector_state is False.
visual_observation = not env.unwrapped.vector_state
if visual_observation:
    # Applied innermost-first: reduce the colour channels, resize from 512px
    # to 84px, then stack 3 frames.
    env = ss.frame_stack_v1(
        ss.resize_v1(
            ss.color_reduction_v0(env, mode="B"),
            x_size=84,
            y_size=84,
        ),
        3,
    )


### Setting up the policy and training

#### Housekeeping code

`ss.pettingzoo_env_to_vec_env_v1(env)` vectorizes the PettingZoo environment into a Gym-style VecEnv. It treats each agent in the multi-agent env as if it were in its own sub-environment. The result is a single VecEnv where each index corresponds to one PettingZoo agent. This bridges the gap between what Stable Baselines 3 expects and what PettingZoo provides.

`ss.concat_vec_envs_v1(env, num_vec_envs=8, num_cpus=1, base_class="stable_baselines3")` takes an existing VecEnv and stacks 8 identical copies of it into a bigger VecEnv that runs 8 parallel rollouts.
- `num_cpus = 1` means it runs everything in a single process
- `base_class="stable_baselines3"` forces the VecEnv to use VecEnv's base class from SB3 so that SB3 algorithms can work with it.




In [6]:
env.reset(seed=seed)

# Read the environment name once, before vectorising, and reuse it for the
# log messages and the checkpoint file name. The evaluation code looks for
# checkpoints whose file name starts with this name.
env_name = str(env.metadata["name"])
print(f"Starting training on {env_name}.")

# Convert the PettingZoo env to a VecEnv and stack 8 copies in a single process
env = ss.pettingzoo_env_to_vec_env_v1(env)
env = ss.concat_vec_envs_v1(env, 8, num_cpus=1, base_class="stable_baselines3")

try:
    # Use a CNN policy if the observation space is visual
    model = PPO(
        CnnPolicy if visual_observation else MlpPolicy,
        env,
        verbose=3,
        batch_size=256,
    )

    model.learn(total_timesteps=steps, progress_bar=True)

    # Timestamped file name so repeated runs do not overwrite each other
    model.save(f"{env_name}_{time.strftime('%Y%m%d-%H%M%S')}")

    print("Model has been saved.")

    print(f"Finished training on {env_name}.")
finally:
    # Release the environment even if training fails or is interrupted
    env.close()

Starting training on knights_archers_zombies_v10.
Using cpu device
------------------------------
| time/              |       |
|    fps             | 815   |
|    iterations      | 1     |
|    time_elapsed    | 80    |
|    total_timesteps | 65536 |
------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 735          |
|    iterations           | 2            |
|    time_elapsed         | 178          |
|    total_timesteps      | 131072       |
| train/                  |              |
|    approx_kl            | 0.0068222075 |
|    clip_fraction        | 0.0504       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.79        |
|    explained_variance   | -0.306       |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00554     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00334     |
|    value_loss           | 

### Evaluation of Model

We load the most recently saved model and evaluate it here, using imageio to save a video of each rendered game. Notice that we apply the same visual-observation preprocessing as in training but leave out the Black Death wrapper. That is because we are no longer running the environment through SB3, and are instead stepping the PettingZoo environment directly.

In [7]:
import imageio

def eval(env_fn, num_games: int = 100, render_mode: str | None = None, **env_kwargs):
    # Evaluate a trained agent vs a random agent
    video_folder ="./logs"
    os.makedirs("logs",exist_ok=True)
    video_length = 200
    env = env_fn.env(render_mode=render_mode, **env_kwargs)

    # Pre-process using SuperSuit
    visual_observation = not env.unwrapped.vector_state
    if visual_observation:
        # If the observation space is visual, reduce the color channels, resize from 512px to 84px, and apply frame stacking
        env = ss.color_reduction_v0(env, mode="B")
        env = ss.resize_v1(env, x_size=84, y_size=84)
        env = ss.frame_stack_v1(env, 3)
    print(
        f"\nStarting evaluation on {str(env.metadata['name'])} (num_games={num_games}, render_mode={render_mode})"
    )

    try:
        latest_policy = max(
            glob.glob(f"{env.metadata['name']}*.zip"), key=os.path.getctime
        )
    except ValueError:
        print("Policy not found.")
        exit(0)



    model = PPO.load(latest_policy)

    rewards = {agent: 0 for agent in env.possible_agents}

    # Note: we evaluate here using an AEC environments, to allow for easy A/B testing against random policies
    # For example, we can see here that using a random agent for archer_0 results in less points than the trained agent
    for i in range(num_games):
        env.reset(seed=i)
        env.action_space(env.possible_agents[0]).seed(i)
        frames = []
        for agent in env.agent_iter():
            obs, reward, termination, truncation, info = env.last()
            if render_mode == "rgb_array":
              frames.append(env.render())

            for a in env.agents:
                rewards[a] += env.rewards[a]

            if termination or truncation:
                break
            else:
                if agent == env.possible_agents[0]:
                    act = env.action_space(agent).sample()
                else:
                    act = model.predict(obs, deterministic=True)[0]
            env.step(act)
        if render_mode == "rgb_array":
            path = f"{video_folder}/PPO_game_{i}.mp4"
            imageio.mimsave(path, frames, fps = 30)
            print(f"Saved to {path}")
    env.close()

    avg_reward = sum(rewards.values()) / len(rewards.values())
    avg_reward_per_agent = {
        agent: rewards[agent] / num_games for agent in env.possible_agents
    }
    print(f"Avg reward: {avg_reward}")
    print("Avg reward per agent, per game: ", avg_reward_per_agent)
    print("Full rewards: ", rewards)
    return avg_reward


In [8]:
env_fn = knights_archers_zombies_v10

# Switch vector_state to False to use visual observations instead
# (significantly longer training time)
env_kwargs = {"max_cycles": 100, "max_zombies": 4, "vector_state": True}

# Score the policy over 10 games without rendering (takes ~10 seconds on a laptop CPU)
eval(env_fn, num_games=10, render_mode=None, **env_kwargs)

# Record 2 games as rgb_array frames so they can be watched (takes ~10 seconds on a laptop CPU)
eval(env_fn, num_games=2, render_mode="rgb_array", **env_kwargs)


Starting evaluation on knights_archers_zombies_v10 (num_games=10, render_mode=None)
Avg reward: 0.75
Avg reward per agent, per game:  {'archer_0': 0.2, 'archer_1': 0.1, 'knight_0': 0.0, 'knight_1': 0.0}
Full rewards:  {'archer_0': 2, 'archer_1': 1, 'knight_0': 0, 'knight_1': 0}

Starting evaluation on knights_archers_zombies_v10 (num_games=2, render_mode=rgb_array)
Saved to ./logs/PPO_game_0.mp4
Saved to ./logs/PPO_game_1.mp4
Avg reward: 0.0
Avg reward per agent, per game:  {'archer_0': 0.0, 'archer_1': 0.0, 'knight_0': 0.0, 'knight_1': 0.0}
Full rewards:  {'archer_0': 0, 'archer_1': 0, 'knight_0': 0, 'knight_1': 0}


0.0

In [9]:
# Play back the first recorded evaluation game inside the notebook
import IPython.display as ipd

ipd.Video("logs/PPO_game_0.mp4", embed=True)