In [None]:
import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy

from pettingzoo.butterfly import pistonball_v6

Initialize the PettingZoo environment (more information can be found at https://www.pettingzoo.ml/butterfly/pistonball ):

In [None]:
# Build the parallel Pistonball environment with every setting spelled out
# so the training configuration is visible in one place.
env = pistonball_v6.parallel_env(
    # episode layout
    n_pistons=20,
    max_cycles=125,
    # actions and reward shaping
    continuous=True,
    time_penalty=-0.1,
    # randomisation of the ball's initial state
    random_drop=True,
    random_rotate=True,
    # ball physics
    ball_mass=0.75,
    ball_friction=0.3,
    ball_elasticity=1.5,
)

Adjust the environment with SuperSuit:

In [None]:
# Keep only the blue channel: the remaining colour information is not useful.
env = ss.color_reduction_v0(env, mode="B")

# Shrink each observation to 84x84.
env = ss.resize_v1(env, x_size=84, y_size=84)

# Stack the past few frames together so acceleration can be computed.
env = ss.frame_stack_v1(env, stack_size=3)

# Expose the multi-agent environment as a vector environment, which gives
# parameter sharing of the policy network across the agents.
env = ss.pettingzoo_env_to_vec_env_v1(env)

# Run multiple copies of the environment in parallel.
env = ss.concat_vec_envs_v1(
    env,
    num_vec_envs=8,
    num_cpus=4,
    base_class="stable_baselines3",
)

Instantiate the PPO learning object that defines the model
(information on the hyperparameters can be found at https://medium.com/aureliantactics/ppo-hyperparameters-and-ranges-6fc2d29bccbe ):

In [None]:
# PPO learner with a CNN policy over the wrapped, vectorised environment.
# Hyperparameters are grouped by role; the values themselves are unchanged.
model = PPO(
    CnnPolicy,
    env,
    # rollout collection
    n_steps=256,
    gamma=0.95,
    gae_lambda=0.99,
    # optimisation
    learning_rate=0.00062211,
    batch_size=256,
    n_epochs=5,
    max_grad_norm=0.9,
    # loss terms
    clip_range=0.3,
    ent_coef=0.0905168,
    vf_coef=0.042202,
    # logging
    verbose=3,
)

Train the model:

In [None]:
# Train for two million timesteps; per the original note, a timestep here
# is an action taken by an individual agent.
model.learn(total_timesteps=2_000_000)

# Save the policy network to disk under the name "policy".
model.save("policy")

Re-instantiate the environment and load the saved policy:

In [None]:
# Fresh (non-parallel) Pistonball environment for evaluation, wrapped with
# the same observation preprocessing that was applied before training.
env = pistonball_v6.env()
env = ss.color_reduction_v0(env, mode="B")      # keep only the blue channel
env = ss.resize_v1(env, x_size=84, y_size=84)   # 84x84 observations
env = ss.frame_stack_v1(env, stack_size=3)      # stack the last 3 frames

# Restore the policy that was written to disk as "policy".
model = PPO.load("policy")

Use the policy to act in the environment and render it on your desktop:

In [None]:
# Roll out the loaded policy, rendering after every agent step.
env.reset()
try:
    for agent in env.agent_iter():
        obs, reward, done, info = env.last()
        # An agent that is already done is stepped with None; otherwise
        # take the policy's deterministic action for this observation.
        act = None if done else model.predict(obs, deterministic=True)[0]
        env.step(act)
        env.render()
finally:
    # Release the environment (and its render window) even if the loop is
    # interrupted or raises, so the display resource is not leaked.
    env.close()