# **Model per Level**

This notebook trains a separate instance of our best model (PPO with L2 regularization) on each environment level, without curriculum learning, to check whether the agent really learns faster with exploration.

In [3]:
import numpy as np
import gymnasium as gym
from coverage_env import CoverageEnv
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from torch.optim import AdamW

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import os
import matplotlib.pyplot as plt

In [2]:
# One CoverageEnv per level (curriculum=0..4), built in ascending order.
# Each model below is trained and evaluated on exactly one of these.
env_0, env_1, env_2, env_3, env_4 = (
    CoverageEnv(curriculum=level) for level in range(5)
)

In [None]:
# Optimizer settings shared by every PPO model in this notebook:
# AdamW with a small weight decay instead of the policy's default optimizer.
policy_kwargs = {
    "optimizer_class": AdamW,
    "optimizer_kwargs": {"weight_decay": 1e-4},
}

# **Environment 0**

In [None]:
# Train a fresh PPO agent directly on level 0 (no curriculum).
# The hyperparameters are identical across all levels so the runs are comparable.
model = PPO(
    "MlpPolicy",
    env_0,
    learning_rate = 1e-4,
    n_steps       = 2048,
    batch_size    = 64,
    n_epochs      = 10,
    gamma         = 0.99,
    gae_lambda    = 0.95,
    clip_range    = 0.2,
    ent_coef      = 0.01,
    vf_coef       = 0.5,
    max_grad_norm = 0.5,
    verbose       = 1,
    policy_kwargs = policy_kwargs,
    tensorboard_log="logs/general/coverage_lvl0",
    # Fixed seed so re-running this cell gives a comparable training run
    # (up to hardware-level nondeterminism).
    seed          = 0,
)

# train for 500k timesteps
model.learn(total_timesteps=500_000)

# save it
model.save("models/general/coverage_lvl0")

In [None]:
# Plot the mean-episode-reward curve logged while training on level 0.
#
# Each `learn()` call logs to a new numbered sub-folder (PPO_1, PPO_2, ...),
# so pick the highest-numbered run rather than assuming PPO_1: otherwise
# re-running the training cell leaves this plot showing a stale run.
log_root = "logs/general/coverage_lvl0"
run_dirs = [
    name for name in os.listdir(log_root)
    if name.startswith("PPO_")
    and name[len("PPO_"):].isdigit()
    and os.path.isdir(os.path.join(log_root, name))
]
if not run_dirs:
    raise FileNotFoundError(
        f"No PPO_* run folder found in {log_root!r}; run the training cell first."
    )
log_dir = os.path.join(log_root, max(run_dirs, key=lambda name: int(name[len("PPO_"):])))

# os.listdir order is arbitrary and the folder may hold other files, so keep
# only TensorBoard event files and sort them to make the choice deterministic.
event_files = sorted(f for f in os.listdir(log_dir) if f.startswith("events.out.tfevents"))
if not event_files:
    raise FileNotFoundError(f"No TensorBoard event file found in {log_dir!r}.")
run_id = event_files[0]
event_path = os.path.join(log_dir, run_id)

# Load the TensorBoard logs
event_acc = EventAccumulator(event_path)
event_acc.Reload()

# Get 'rollout/ep_rew_mean' scalar events
rewards = event_acc.Scalars("rollout/ep_rew_mean")

# Extract steps and reward values
steps = [event.step for event in rewards]
reward_values = [event.value for event in rewards]

# Plotting
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(steps, reward_values, label="Mean Episode Reward")
ax.set_xlabel("Timesteps")
ax.set_ylabel("Reward")
ax.set_title("Learning Curve without Curriculum (Level 0)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Score the level-0 model on env_0: 20 episodes with deterministic actions.
eval_settings = dict(n_eval_episodes=20, deterministic=True)
mean_reward, std_reward = evaluate_policy(model, env_0, **eval_settings)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")

# **Environment 1**

In [None]:
# Train a fresh PPO agent directly on level 1 (no curriculum).
# The hyperparameters are identical across all levels so the runs are comparable.
model = PPO(
    "MlpPolicy",
    env_1,
    learning_rate = 1e-4,
    n_steps       = 2048,
    batch_size    = 64,
    n_epochs      = 10,
    gamma         = 0.99,
    gae_lambda    = 0.95,
    clip_range    = 0.2,
    ent_coef      = 0.01,
    vf_coef       = 0.5,
    max_grad_norm = 0.5,
    verbose       = 1,
    policy_kwargs = policy_kwargs,
    tensorboard_log="logs/general/coverage_lvl1",
    # Fixed seed so re-running this cell gives a comparable training run
    # (up to hardware-level nondeterminism).
    seed          = 0,
)

# train for 500k timesteps
model.learn(total_timesteps=500_000)

# save it
model.save("models/general/coverage_lvl1")

In [None]:
# Plot the mean-episode-reward curve logged while training on level 1.
#
# Each `learn()` call logs to a new numbered sub-folder (PPO_1, PPO_2, ...),
# so pick the highest-numbered run rather than assuming PPO_1: otherwise
# re-running the training cell leaves this plot showing a stale run.
log_root = "logs/general/coverage_lvl1"
run_dirs = [
    name for name in os.listdir(log_root)
    if name.startswith("PPO_")
    and name[len("PPO_"):].isdigit()
    and os.path.isdir(os.path.join(log_root, name))
]
if not run_dirs:
    raise FileNotFoundError(
        f"No PPO_* run folder found in {log_root!r}; run the training cell first."
    )
log_dir = os.path.join(log_root, max(run_dirs, key=lambda name: int(name[len("PPO_"):])))

# os.listdir order is arbitrary and the folder may hold other files, so keep
# only TensorBoard event files and sort them to make the choice deterministic.
event_files = sorted(f for f in os.listdir(log_dir) if f.startswith("events.out.tfevents"))
if not event_files:
    raise FileNotFoundError(f"No TensorBoard event file found in {log_dir!r}.")
run_id = event_files[0]
event_path = os.path.join(log_dir, run_id)

# Load the TensorBoard logs
event_acc = EventAccumulator(event_path)
event_acc.Reload()

# Get 'rollout/ep_rew_mean' scalar events
rewards = event_acc.Scalars("rollout/ep_rew_mean")

# Extract steps and reward values
steps = [event.step for event in rewards]
reward_values = [event.value for event in rewards]

# Plotting
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(steps, reward_values, label="Mean Episode Reward")
ax.set_xlabel("Timesteps")
ax.set_ylabel("Reward")
ax.set_title("Learning Curve without Curriculum (Level 1)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Score the level-1 model on env_1: 20 episodes with deterministic actions.
eval_settings = dict(n_eval_episodes=20, deterministic=True)
mean_reward, std_reward = evaluate_policy(model, env_1, **eval_settings)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")

# **Environment 2**

In [None]:
# Train a fresh PPO agent directly on level 2 (no curriculum).
# The hyperparameters are identical across all levels so the runs are comparable.
model = PPO(
    "MlpPolicy",
    env_2,
    learning_rate = 1e-4,
    n_steps       = 2048,
    batch_size    = 64,
    n_epochs      = 10,
    gamma         = 0.99,
    gae_lambda    = 0.95,
    clip_range    = 0.2,
    ent_coef      = 0.01,
    vf_coef       = 0.5,
    max_grad_norm = 0.5,
    verbose       = 1,
    policy_kwargs = policy_kwargs,
    tensorboard_log="logs/general/coverage_lvl2",
    # Fixed seed so re-running this cell gives a comparable training run
    # (up to hardware-level nondeterminism).
    seed          = 0,
)

# train for 500k timesteps
model.learn(total_timesteps=500_000)

# save it
model.save("models/general/coverage_lvl2")

In [None]:
# Plot the mean-episode-reward curve logged while training on level 2.
#
# Each `learn()` call logs to a new numbered sub-folder (PPO_1, PPO_2, ...),
# so pick the highest-numbered run rather than assuming PPO_1: otherwise
# re-running the training cell leaves this plot showing a stale run.
log_root = "logs/general/coverage_lvl2"
run_dirs = [
    name for name in os.listdir(log_root)
    if name.startswith("PPO_")
    and name[len("PPO_"):].isdigit()
    and os.path.isdir(os.path.join(log_root, name))
]
if not run_dirs:
    raise FileNotFoundError(
        f"No PPO_* run folder found in {log_root!r}; run the training cell first."
    )
log_dir = os.path.join(log_root, max(run_dirs, key=lambda name: int(name[len("PPO_"):])))

# os.listdir order is arbitrary and the folder may hold other files, so keep
# only TensorBoard event files and sort them to make the choice deterministic.
event_files = sorted(f for f in os.listdir(log_dir) if f.startswith("events.out.tfevents"))
if not event_files:
    raise FileNotFoundError(f"No TensorBoard event file found in {log_dir!r}.")
run_id = event_files[0]
event_path = os.path.join(log_dir, run_id)

# Load the TensorBoard logs
event_acc = EventAccumulator(event_path)
event_acc.Reload()

# Get 'rollout/ep_rew_mean' scalar events
rewards = event_acc.Scalars("rollout/ep_rew_mean")

# Extract steps and reward values
steps = [event.step for event in rewards]
reward_values = [event.value for event in rewards]

# Plotting
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(steps, reward_values, label="Mean Episode Reward")
ax.set_xlabel("Timesteps")
ax.set_ylabel("Reward")
ax.set_title("Learning Curve without Curriculum (Level 2)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Score the level-2 model on env_2: 20 episodes with deterministic actions.
eval_settings = dict(n_eval_episodes=20, deterministic=True)
mean_reward, std_reward = evaluate_policy(model, env_2, **eval_settings)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")

# **Environment 3**

In [None]:
# Train a fresh PPO agent directly on level 3 (no curriculum).
# The hyperparameters are identical across all levels so the runs are comparable.
model = PPO(
    "MlpPolicy",
    env_3,
    learning_rate = 1e-4,
    n_steps       = 2048,
    batch_size    = 64,
    n_epochs      = 10,
    gamma         = 0.99,
    gae_lambda    = 0.95,
    clip_range    = 0.2,
    ent_coef      = 0.01,
    vf_coef       = 0.5,
    max_grad_norm = 0.5,
    verbose       = 1,
    policy_kwargs = policy_kwargs,
    tensorboard_log="logs/general/coverage_lvl3",
    # Fixed seed so re-running this cell gives a comparable training run
    # (up to hardware-level nondeterminism).
    seed          = 0,
)

# train for 500k timesteps
model.learn(total_timesteps=500_000)

# save it
model.save("models/general/coverage_lvl3")

In [None]:
# Plot the mean-episode-reward curve logged while training on level 3.
#
# Each `learn()` call logs to a new numbered sub-folder (PPO_1, PPO_2, ...),
# so pick the highest-numbered run rather than assuming PPO_1: otherwise
# re-running the training cell leaves this plot showing a stale run.
log_root = "logs/general/coverage_lvl3"
run_dirs = [
    name for name in os.listdir(log_root)
    if name.startswith("PPO_")
    and name[len("PPO_"):].isdigit()
    and os.path.isdir(os.path.join(log_root, name))
]
if not run_dirs:
    raise FileNotFoundError(
        f"No PPO_* run folder found in {log_root!r}; run the training cell first."
    )
log_dir = os.path.join(log_root, max(run_dirs, key=lambda name: int(name[len("PPO_"):])))

# os.listdir order is arbitrary and the folder may hold other files, so keep
# only TensorBoard event files and sort them to make the choice deterministic.
event_files = sorted(f for f in os.listdir(log_dir) if f.startswith("events.out.tfevents"))
if not event_files:
    raise FileNotFoundError(f"No TensorBoard event file found in {log_dir!r}.")
run_id = event_files[0]
event_path = os.path.join(log_dir, run_id)

# Load the TensorBoard logs
event_acc = EventAccumulator(event_path)
event_acc.Reload()

# Get 'rollout/ep_rew_mean' scalar events
rewards = event_acc.Scalars("rollout/ep_rew_mean")

# Extract steps and reward values
steps = [event.step for event in rewards]
reward_values = [event.value for event in rewards]

# Plotting
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(steps, reward_values, label="Mean Episode Reward")
ax.set_xlabel("Timesteps")
ax.set_ylabel("Reward")
ax.set_title("Learning Curve without Curriculum (Level 3)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Score the level-3 model on env_3: 20 episodes with deterministic actions.
eval_settings = dict(n_eval_episodes=20, deterministic=True)
mean_reward, std_reward = evaluate_policy(model, env_3, **eval_settings)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")

# **Environment 4**

In [None]:
# Train a fresh PPO agent directly on level 4 (no curriculum).
# The hyperparameters are identical across all levels so the runs are comparable.
model = PPO(
    "MlpPolicy",
    env_4,
    learning_rate = 1e-4,
    n_steps       = 2048,
    batch_size    = 64,
    n_epochs      = 10,
    gamma         = 0.99,
    gae_lambda    = 0.95,
    clip_range    = 0.2,
    ent_coef      = 0.01,
    vf_coef       = 0.5,
    max_grad_norm = 0.5,
    verbose       = 1,
    policy_kwargs = policy_kwargs,
    tensorboard_log="logs/general/coverage_lvl4",
    # Fixed seed so re-running this cell gives a comparable training run
    # (up to hardware-level nondeterminism).
    seed          = 0,
)

# train for 500k timesteps
model.learn(total_timesteps=500_000)

# save it
model.save("models/general/coverage_lvl4")

In [None]:
# Plot the mean-episode-reward curve logged while training on level 4.
#
# Each `learn()` call logs to a new numbered sub-folder (PPO_1, PPO_2, ...),
# so pick the highest-numbered run rather than assuming PPO_1: otherwise
# re-running the training cell leaves this plot showing a stale run.
log_root = "logs/general/coverage_lvl4"
run_dirs = [
    name for name in os.listdir(log_root)
    if name.startswith("PPO_")
    and name[len("PPO_"):].isdigit()
    and os.path.isdir(os.path.join(log_root, name))
]
if not run_dirs:
    raise FileNotFoundError(
        f"No PPO_* run folder found in {log_root!r}; run the training cell first."
    )
log_dir = os.path.join(log_root, max(run_dirs, key=lambda name: int(name[len("PPO_"):])))

# os.listdir order is arbitrary and the folder may hold other files, so keep
# only TensorBoard event files and sort them to make the choice deterministic.
event_files = sorted(f for f in os.listdir(log_dir) if f.startswith("events.out.tfevents"))
if not event_files:
    raise FileNotFoundError(f"No TensorBoard event file found in {log_dir!r}.")
run_id = event_files[0]
event_path = os.path.join(log_dir, run_id)

# Load the TensorBoard logs
event_acc = EventAccumulator(event_path)
event_acc.Reload()

# Get 'rollout/ep_rew_mean' scalar events
rewards = event_acc.Scalars("rollout/ep_rew_mean")

# Extract steps and reward values
steps = [event.step for event in rewards]
reward_values = [event.value for event in rewards]

# Plotting
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(steps, reward_values, label="Mean Episode Reward")
ax.set_xlabel("Timesteps")
ax.set_ylabel("Reward")
ax.set_title("Learning Curve without Curriculum (Level 4)")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Score the level-4 model on env_4: 20 episodes with deterministic actions.
eval_settings = dict(n_eval_episodes=20, deterministic=True)
mean_reward, std_reward = evaluate_policy(model, env_4, **eval_settings)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")