In [15]:
import gymnasium as gym
from pettingzoo.butterfly import pistonball_v6

# env = pistonball_v6.parallel_env(render_mode="human")
# observations, infos = env.reset()

# while env.agents:
#     actions = {agent: env.action_space(agent).sample() for agent in env.agents}
#     observations, rewards, terminations, truncations, infos = env.step(actions)
#     env.render()
# env.close()

In [16]:
import supersuit as ss
    
def make_env(num_envs=8, num_cpus=4):
    """Build the vectorized Pistonball environment used with Stable-Baselines3.

    The raw parallel environment is passed through the SuperSuit
    preprocessing stack (colour reduction with mode 'B', resize to
    x_size=240 / y_size=60, stack of 4 frames), converted to a vector
    environment, and then concatenated into ``num_envs`` copies.

    Args:
        num_envs: Number of environment copies handed to
            ``ss.concat_vec_envs_v1``. Defaults to 8.
        num_cpus: Value forwarded as ``num_cpus`` to
            ``ss.concat_vec_envs_v1``. Defaults to 4.

    Returns:
        The concatenated vector environment created with
        ``base_class='stable_baselines3'``.
    """
    env = pistonball_v6.parallel_env(
        n_pistons=5,
        time_penalty=-0.1,
        continuous=False,
        random_drop=True,
        random_rotate=False,
        ball_mass=0.75,
        ball_friction=0.3,
        ball_elasticity=1.5,
        max_cycles=100,
    )

    # Observation preprocessing; the rollout cell that records the GIF applies
    # the same three wrappers so the saved policy sees matching inputs.
    env = ss.color_reduction_v0(env, mode='B')
    env = ss.resize_v1(env, x_size=240, y_size=60)
    env = ss.frame_stack_v1(env, 4)

    # Convert the multi-agent environment into a vector environment and
    # replicate it for Stable-Baselines3.
    env = ss.pettingzoo_env_to_vec_env_v1(env)
    env = ss.concat_vec_envs_v1(
        env, num_envs, num_cpus=num_cpus, base_class='stable_baselines3'
    )
    return env

In [17]:
# Build the vectorized training environment; the PPO cell below trains on it.
env = make_env()

In [18]:
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy
from stable_baselines3.common.callbacks import EvalCallback

# Evaluate on a dedicated environment. EvalCallback resets and steps the
# environment it is given, so passing the training `env` would interleave
# evaluation episodes with PPO's rollout collection on the same environment.
eval_env = make_env()

# Create the evaluation callback. `eval_freq` is counted in callback calls;
# the captured log shows an evaluation every 10240 total timesteps.
eval_callback = EvalCallback(eval_env, best_model_save_path='./best_model/',
                             log_path='./logs/', eval_freq=256,
                             deterministic=True, render=False)

model = PPO(CnnPolicy, env, verbose=3,
            gamma=0.95,
            n_steps=256,
            ent_coef=0.09,
            learning_rate=0.0006,
            vf_coef=0.04,
            max_grad_norm=0.9,
            gae_lambda=0.99,
            n_epochs=5,
            clip_range=0.2,
            batch_size=256)

try:
    # Pass the evaluation callback to the learn method; the best checkpoint
    # (by evaluation reward) is written to ./best_model/ by the callback.
    model.learn(total_timesteps=500_000, callback=eval_callback)
    # Save the final policy; the rollout cell reloads it from "policy".
    model.save("policy")
finally:
    # The evaluation environment is owned by this cell, so release it here
    # even if training is interrupted.
    eval_env.close()

Using cuda device
Wrapping the env in a VecTransposeImage.




Eval num_timesteps=10240, episode_reward=-10.00 +/- 0.00
Episode length: 100.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 100      |
|    mean_reward     | -10      |
| time/              |          |
|    total_timesteps | 10240    |
---------------------------------
New best mean reward!
------------------------------
| time/              |       |
|    fps             | 577   |
|    iterations      | 1     |
|    time_elapsed    | 17    |
|    total_timesteps | 10240 |
------------------------------
Eval num_timesteps=20480, episode_reward=-21.24 +/- 0.00
Episode length: 100.00 +/- 0.00
-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 100           |
|    mean_reward          | -21.2         |
| time/                   |               |
|    total_timesteps      | 20480         |
| train/                  |               |
|    approx_kl            | 0.0007475

In [None]:
import imageio

# Reload the policy written by the training cell.
model = PPO.load("policy")

# Create the environment with the same Pistonball settings as make_env(),
# but through the agent-iteration API and with RGB-array rendering so that
# frames can be captured.
env = pistonball_v6.env(render_mode='rgb_array', n_pistons=5,
                        time_penalty=-0.1,
                        continuous=False,
                        random_drop=True,
                        random_rotate=False,
                        ball_mass=0.75,
                        ball_friction=0.3,
                        ball_elasticity=1.5,
                        max_cycles=100)

# Same observation preprocessing as in make_env(), so the observations fed to
# the policy match what it was trained on.
env = ss.color_reduction_v0(env, mode='B')
env = ss.resize_v1(env, x_size=240, y_size=60)
env = ss.frame_stack_v1(env, 4)

# Create a list to store the frames of the GIF
frames = []

try:
    # Reset the environment
    env.reset()

    # Run the environment for one episode
    for agent in env.agent_iter():
        observation, reward, termination, truncation, info = env.last()
        if termination or truncation:
            # Finished agents are stepped with None.
            action = None
        else:
            # predict() returns a tuple; the first element is the action.
            action = model.predict(observation, deterministic=True)[0]
        env.step(action)
        frames.append(env.render())
finally:
    # Always release the environment, even if prediction or rendering fails.
    env.close()

# Save the frames as a GIF
# NOTE(review): the unit of `duration` (seconds vs. milliseconds per frame)
# depends on the installed imageio version — confirm the playback speed.
imageio.mimsave('sample_episode.gif', frames, duration=30)