In [1]:
import numpy as np
import gymnasium as gym
from coverage_env import CoverageEnv

In [2]:
# Make the coverage environment available under a gymnasium id so that it can
# be constructed with gym.make("Coverage-v0"). The entry point string names
# the module and class to load; max_episode_steps is the step cap recorded in
# the registered spec.
gym.register(
    "Coverage-v0",
    entry_point="coverage_env:CoverageEnv",
    max_episode_steps=200,
)


In [None]:
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

# Run-level knobs, named once so comments and code cannot drift apart.
SEED = 42
TOTAL_TIMESTEPS = 500_000

# instantiate a single env (you can wrap VecEnv for parallelism later)
env = CoverageEnv(seed=SEED)

# create the DQN model
model = DQN(
    policy="MlpPolicy",   # a simple MLP
    env=env,
    buffer_size=100000,
    learning_rate=1e-4,
    batch_size=32,
    gamma=0.99,
    # Seed SB3's pseudo-random generators as well; seeding only the env
    # leaves the agent's own randomness different on every run.
    seed=SEED,
    verbose=1,
)

# train for TOTAL_TIMESTEPS (500k) environment steps
model.learn(total_timesteps=TOTAL_TIMESTEPS)

# save it (SB3 writes a zip archive at this path)
model.save("models/dqn_mlp_coverage")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | -185     |
|    exploration_rate | 0.985    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3924     |
|    time_elapsed     | 0        |
|    total_timesteps  | 800      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.161    |
|    n_updates        | 174      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 197      |
|    ep_rew_mean      | -174     |
|    exploration_rate | 0.97     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4296     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1575     |
| train/              |        

In [4]:
from stable_baselines3.common.monitor import Monitor

# load (if needed) -- the path must match the one given to model.save(...)
# model = DQN.load("models/dqn_mlp_coverage", env=env)

# evaluate_policy warns when the evaluation env is not wrapped in a Monitor,
# because it then cannot rely on Monitor's episode statistics. Wrap the env
# here; `env` itself is left untouched for the cells that follow.
eval_env = Monitor(env)

mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=20,
    deterministic=True,
)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")


Mean reward: 42.75 ± 1.61




In [17]:
# Roll out one episode with the greedy (deterministic) policy, rendering the
# grid after every step so the agent's path can be followed by eye.
obs, _ = env.reset()

print("starting state:")
env.render()
print("\n")

for i in range(env.max_steps):
    # model.predict returns (action, state); the action arrives as a NumPy
    # array holding a single integer.
    action_arr, _ = model.predict(obs, deterministic=True)
    # .item() unwraps a one-element array of any rank to a Python scalar and
    # raises if there is more than one element; int() on an array with
    # ndim > 0 is deprecated in recent NumPy releases.
    action = int(action_arr.item())
    print("step:", i, "action:", action)

    obs, reward, terminated, truncated, info = env.step(action)
    env.render()
    print("\n")

    # Stop as soon as the env reports the episode is over, either way.
    if terminated or truncated:
        break


starting state:
........
....A...
....TTT.
....#TT.
....TTT.
........
........
........


step: 0 action: 0
........
........
....ATT.
....#TT.
....TTT.
........
........
........


step: 1 action: 2
........
........
....TAT.
....#TT.
....TTT.
........
........
........


step: 2 action: 2
........
........
....TTA.
....#TT.
....TTT.
........
........
........


step: 3 action: 0
........
........
....TTT.
....#TA.
....TTT.
........
........
........


step: 4 action: 0
........
........
....TTT.
....#TT.
....TTA.
........
........
........


step: 5 action: 3
........
........
....TTT.
....#TT.
....TAT.
........
........
........


step: 6 action: 1
........
........
....TTT.
....#AT.
....TTT.
........
........
........


step: 7 action: 0
........
........
....TTT.
....#TT.
....TAT.
........
........
........


step: 8 action: 3
........
........
....TTT.
....#TT.
....ATT.
........
........
........


