In [1]:
import numpy as np
import gymnasium as gym
from coverage_env import CoverageEnv

In [2]:
# Register the custom environment with Gymnasium under the id "Coverage-v0".
# The entry point is given as "module:ClassName"; max_episode_steps=200 sets
# the step limit for environments created through this registration.
gym.register("Coverage-v0", entry_point="coverage_env:CoverageEnv", max_episode_steps=200)


In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# Single source of truth for this run's randomness and training budget.
SEED = 42
TOTAL_TIMESTEPS = 500_000

# Instantiate a single env. SB3 wraps it in Monitor + DummyVecEnv on its own
# (see the training log); a VecEnv can be swapped in later for parallelism.
env = CoverageEnv(seed=SEED)

# Create the PPO model.
model = PPO(
    policy="MlpPolicy",      # flatten your 5×8×8 obs into a 320-dim vector
    env=env,                 # the single, unwrapped env created above
    learning_rate=1e-4,      # slightly lower than default 3e-4 for stability
    n_steps=2048,            # rollout length per update (≈10 episodes worth)
    batch_size=64,           # minibatch size for each epoch
    n_epochs=10,             # number of passes over the rollout buffer
    gamma=0.99,              # reward discount
    gae_lambda=0.95,         # GAE smoothing
    clip_range=0.2,          # PPO clipping parameter
    ent_coef=0.01,           # small entropy bonus to encourage exploration
    vf_coef=0.5,             # value function loss coefficient
    max_grad_norm=0.5,       # clip gradients
    seed=SEED,               # seed the algorithm's RNGs too, not only the env
    verbose=1,
)

# Train for 500k timesteps (TOTAL_TIMESTEPS).
model.learn(total_timesteps=TOTAL_TIMESTEPS)

# Save the trained policy.
model.save("models/ppo_mlp_coverage")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 174      |
|    ep_rew_mean     | -142     |
| time/              |          |
|    fps             | 6428     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 173          |
|    ep_rew_mean          | -139         |
| time/                   |              |
|    fps                  | 3736         |
|    iterations           | 2            |
|    time_elapsed         | 1            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0067777247 |
|    clip_fraction        | 0.00469      |
|    clip_range           | 0.2          |
|    en

In [5]:
# Score the trained policy over 20 episodes on `env`, picking actions
# deterministically, then report the mean and spread of the episode rewards.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20, deterministic=True)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")

Mean reward: 43.35 ± 1.71




In [6]:
# Replay a single episode from a fixed seed with deterministic actions,
# printing the chosen action and rendering the env after every step.
obs, _ = env.reset(seed=42)
for step_idx in range(env.max_steps):
    # predict() returns a pair; only the action part is used here.
    predicted, _ = model.predict(obs, deterministic=True)
    action = int(predicted)  # numpy value -> plain Python int
    print("step:", step_idx, "action:", action)

    obs, reward, terminated, truncated, info = env.step(action)
    env.render()
    print()  # blank separator after each frame

    # Keep stepping until the env reports the episode is over.
    if not (terminated or truncated):
        continue
    print("Done!", "Terminated" if terminated else "Truncated")
    break

step: 0 action: 2
........
.A......
....TTT.
....#TT.
....TTT.
........
........
........

step: 1 action: 2
........
..A.....
....TTT.
....#TT.
....TTT.
........
........
........

step: 2 action: 0
........
........
..A.TTT.
....#TT.
....TTT.
........
........
........

step: 3 action: 2
........
........
...ATTT.
....#TT.
....TTT.
........
........
........

step: 4 action: 2
........
........
....ATT.
....#TT.
....TTT.
........
........
........

step: 5 action: 2
........
........
....TAT.
....#TT.
....TTT.
........
........
........

step: 6 action: 2
........
........
....TTA.
....#TT.
....TTT.
........
........
........

step: 7 action: 0
........
........
....TTT.
....#TA.
....TTT.
........
........
........

step: 8 action: 0
........
........
....TTT.
....#TT.
....TTA.
........
........
........

step: 9 action: 3
........
........
....TTT.
....#TT.
....TAT.
........
........
........

step: 10 action: 3
........
........
....TTT.
....#TT.
....ATT.
........
........
........