In [3]:
import numpy as np
import gymnasium as gym
from coverage_env import CoverageEnv
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from torch.optim import AdamW


In [4]:
# Instantiate a single level-0 environment. PPO wraps it in Monitor and
# DummyVecEnv on its own (see the log output); a VecEnv can be swapped in
# later for parallel rollouts.
env = CoverageEnv(curriculum=0)

# Optimizer override passed through to the policy: AdamW with a small
# decoupled weight decay.
policy_kwargs = dict(
    optimizer_class  = AdamW,
    optimizer_kwargs = dict(
        weight_decay = 1e-4
    )
)

# PPO with an MLP actor-critic; hyperparameters are spelled out explicitly so
# the run configuration is visible in the notebook.
model = PPO(
    "MlpPolicy",
    env,
    learning_rate = 1e-4,
    n_steps       = 2048,
    batch_size    = 64,
    n_epochs      = 10,
    gamma         = 0.99,
    gae_lambda    = 0.95,
    clip_range    = 0.2,
    ent_coef      = 0.01,
    vf_coef       = 0.5,
    max_grad_norm = 0.5,
    verbose       = 1,
    policy_kwargs = policy_kwargs,
)

# Train for 500k timesteps (the previous comment said 50k, which did not
# match the value actually passed to learn()).
model.learn(total_timesteps=500_000)

# Save the level-0 checkpoint; later cells that load the curriculum-0 policy
# must use this same path.
model.save("models/ppo_mlp_coverage_lvl0")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 193      |
|    ep_rew_mean     | -168     |
| time/              |          |
|    fps             | 6516     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 181         |
|    ep_rew_mean          | -150        |
| time/                   |             |
|    fps                  | 3622        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.018075809 |
|    clip_fraction        | 0.0911      |
|    clip_range           | 0.2         |
|    entropy_loss   

In [9]:
# Score the current policy on `env`: 20 episodes, deterministic actions.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20, deterministic=True)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")

Mean reward: 43.15 ± 1.31


In [10]:
# Replay one seeded episode with the deterministic policy, rendering the grid
# after every step, for at most env.max_steps steps.
obs, _ = env.reset(seed=42)
for i in range(env.max_steps):
    # predict() returns the action as a numpy value; env.step() gets a plain int.
    action_arr, _ = model.predict(obs, deterministic=True)
    action = int(action_arr)
    print("step:", i, "action:", action)

    obs, reward, terminated, truncated, info = env.step(action)
    env.render()
    print()  # blank line between frames

    # Keep stepping until the environment reports the episode is over.
    if not (terminated or truncated):
        continue
    print("Done!", "Terminated" if terminated else "Truncated")
    break

step: 0 action: 2
........
.A......
....TTT.
....#TT.
....TTT.
........
........
........

step: 1 action: 0
........
........
.A..TTT.
....#TT.
....TTT.
........
........
........

step: 2 action: 2
........
........
..A.TTT.
....#TT.
....TTT.
........
........
........

step: 3 action: 2
........
........
...ATTT.
....#TT.
....TTT.
........
........
........

step: 4 action: 2
........
........
....ATT.
....#TT.
....TTT.
........
........
........

step: 5 action: 2
........
........
....TAT.
....#TT.
....TTT.
........
........
........

step: 6 action: 0
........
........
....TTT.
....#AT.
....TTT.
........
........
........

step: 7 action: 0
........
........
....TTT.
....#TT.
....TAT.
........
........
........

step: 8 action: 3
........
........
....TTT.
....#TT.
....ATT.
........
........
........

step: 9 action: 2
........
........
....TTT.
....#TT.
....TAT.
........
........
........

step: 10 action: 2
........
........
....TTT.
....#TT.
....TTA.
........
........
........

In [12]:
# Instantiate the environment at curriculum level 1
env = CoverageEnv(curriculum=1)

# Load the pretrained model from curriculum 0. The path must match the
# checkpoint written by the level-0 training cell
# ("models/ppo_mlp_coverage_lvl0", stored on disk with a ".zip" suffix);
# the previous "models/ppo_mlp_coverage.zip" is never saved by this notebook.
model = PPO.load("models/ppo_mlp_coverage_lvl0.zip", env=env)

# Evaluate the level-0 policy on level 1 before any fine-tuning
# (20 episodes, deterministic actions).
mean_reward, std_reward = evaluate_policy(
    model, 
    env, 
    n_eval_episodes=20, 
    deterministic=True,
)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")

### Can add the pygame visualization here as well

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Mean reward: -199.85 ± 0.65


In [13]:
# Create a new environment for curriculum level 1
env_1 = CoverageEnv(curriculum=1)

# Load the level-0 checkpoint and bind it to the new environment. The path
# must match what the level-0 training cell saved
# ("models/ppo_mlp_coverage_lvl0", stored with a ".zip" suffix); the previous
# "models/ppo_mlp_coverage.zip" is never written by this notebook.
model = PPO.load("models/ppo_mlp_coverage_lvl0.zip", env=env_1)

# Continue training on level 1 for 300k timesteps
model.learn(total_timesteps=300_000)

# Save the fine-tuned model
model.save("models/ppo_mlp_coverage_lvl1")

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -194     |
| time/              |          |
|    fps             | 6475     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -195         |
| time/                   |              |
|    fps                  | 3854         |
|    iterations           | 2            |
|    time_elapsed         | 1            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0044827475 |
|    clip_fraction        | 0.0546       |
|    clip_range           | 0.2          |
|    entropy_loss       

In [14]:
# Fresh environments, one per curriculum level, used only for evaluation.
env_0 = CoverageEnv(curriculum=0)
env_1 = CoverageEnv(curriculum=1)

# Evaluate the current `model` on each level with identical settings
# (20 episodes, deterministic actions). Level 1 is scored first, then level 0,
# so `mean_reward` / `std_reward` end up holding the level-0 statistics.
for level, eval_env in ((1, env_1), (0, env_0)):
    mean_reward, std_reward = evaluate_policy(
        model,
        eval_env,
        n_eval_episodes=20,
        deterministic=True,
    )
    print(f"Fine-tuned performance on level {level}: {mean_reward:.2f} ± {std_reward:.2f}")

# Can add visualization here as well



Fine-tuned performance on level 1: 44.35 ± 2.54
Fine-tuned performance on level 0: -196.85 ± 5.65
