## Imports

In [None]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback, CallbackList
import warnings
import os

warnings.simplefilter("ignore", category=UserWarning)

## Environment

In [118]:
def _grid(*rows):
    """Turn strings of '0'/'1' characters into a nested list of ints (one list per row)."""
    return [[int(ch) for ch in row] for row in rows]


# Maze layouts: 1 is a wall, 0 is a free cell. Every map is fully enclosed
# by a border of walls.
easy = _grid(
    "11111",
    "10001",
    "11111",
)

medium = _grid(
    "11111111",
    "10011001",
    "10010001",
    "11000111",
    "10010001",
    "10100101",
    "10001001",
    "11111111",
)

hard = _grid(
    "111111111111",
    "100001000001",
    "101101010101",
    "100000010001",
    "101111011101",
    "100101000001",
    "110101010111",
    "100100010001",
    "111111111111",
)

# Lookup table from difficulty name to layout.
maps = dict(easy=easy, medium=medium, hard=hard)

In [119]:
class GridMazeEnv(gym.Env):
    """Maze with a continuous agent position on top of a discrete wall grid.

    Positions are float ``(x, y)`` pairs: ``x`` runs along grid columns and
    ``y`` along grid rows, so the cell ``grid[row, col]`` covers
    ``col <= x < col + 1`` and ``row <= y < row + 1``. Grid value 1 is a
    wall, 0 is free. The agent starts at the centre of the first free cell
    (row-major order) and the goal is the centre of the last free cell.

    Each step moves the agent 0.3 cells in one of four directions unless the
    move would leave the grid or enter a wall. The reward is the negative
    Euclidean distance to the goal, and the episode terminates once that
    distance drops below 0.5.

    Args:
        maps: Mapping from map name to a 2-D nested list of 0/1 values.
            It must contain the key ``"hard"``, which is loaded on
            construction and determines the observation-space bounds.
        render_mode: ``"human"`` enables the text output of ``render()``;
            any other value makes ``render()`` a no-op.
    """

    metadata = {"render_modes": ["human"]}

    # Distance covered by a single action, in cell units.
    _STEP_SIZE = 0.3

    # Unit (dx, dy) displacement per discrete action. y grows downwards,
    # matching the row order printed by render().
    _MOVES = {
        0: np.array([0, -1]),  # Up
        1: np.array([1, 0]),   # Right
        2: np.array([0, 1]),   # Down
        3: np.array([-1, 0]),  # Left
    }

    def __init__(self, maps, render_mode=None):
        super().__init__()
        self.maps = maps
        self.render_mode = render_mode
        # load_map also places the agent on the start cell, so the
        # environment is in a consistent state even before reset().
        self.load_map("hard")

        self.observation_space = gym.spaces.Box(
            low=0, high=max(self.width, self.height),
            shape=(2,), dtype=np.float32
        )
        self.action_space = gym.spaces.Discrete(4)

    @staticmethod
    def _cell_center(cell):
        """Convert a ``(row, col)`` grid index into the ``(x, y)`` centre of that cell."""
        row, col = cell
        return np.array([col + 0.5, row + 0.5])

    def _is_free(self, pos):
        """Return True when ``pos`` (x, y) lies inside the grid and not in a wall cell."""
        x, y = pos
        if not (0 <= x < self.width and 0 <= y < self.height):
            return False
        return self.grid[int(y), int(x)] != 1

    def load_map(self, map_name):
        """Switch to the map called ``map_name`` and move the agent to its start cell.

        Raises:
            KeyError: if ``map_name`` is not a key of ``self.maps``.
        """
        self.map_name = map_name
        self.grid = np.array(self.maps[map_name])
        self.height, self.width = self.grid.shape

        # np.argwhere yields (row, col) indices; the rest of the class works
        # in (x, y) = (col, row), so the indices must be converted. Using
        # them unconverted puts the goal outside every non-square maze.
        free_cells = np.argwhere(self.grid == 0)
        self.agent_pos = self._cell_center(free_cells[0])
        self.goal_pos = self._cell_center(free_cells[-1])

    def reset(self, seed=None, options=None):
        """Put the agent back on the start cell of the current map.

        Returns:
            ``(observation, info)`` where the observation is the agent's
            ``(x, y)`` position as float32 and ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        free_cells = np.argwhere(self.grid == 0)
        self.agent_pos = self._cell_center(free_cells[0])
        return self.agent_pos.copy().astype(np.float32), {}

    def step(self, action):
        """Apply one of the four discrete moves.

        Args:
            action: 0 = up, 1 = right, 2 = down, 3 = left.

        Returns:
            ``(observation, reward, terminated, truncated, info)``;
            ``truncated`` is always False and ``info`` is always empty.
        """
        # int() lets NumPy integer scalars / 0-d arrays be used as actions.
        delta = self._MOVES[int(action)] * self._STEP_SIZE
        candidate = self.agent_pos + delta

        # Moves that leave the grid or enter a wall are rejected and the
        # agent stays where it was.
        if self._is_free(candidate):
            self.agent_pos = candidate

        dist_to_goal = float(np.linalg.norm(self.agent_pos - self.goal_pos))
        terminated = dist_to_goal < 0.5
        truncated = False
        reward = -dist_to_goal
        info = {}

        return self.agent_pos.copy().astype(np.float32), reward, terminated, truncated, info

    def render(self):
        """Print the current map as text when ``render_mode`` is ``"human"``.

        ``A`` marks the agent's cell, ``G`` the goal cell, ``#`` a wall and
        ``.`` a free cell. The agent takes precedence over the goal when
        both occupy the same cell.
        """
        if self.render_mode != "human":
            return

        agent_cell = (int(self.agent_pos[0]), int(self.agent_pos[1]))
        goal_cell = (int(self.goal_pos[0]), int(self.goal_pos[1]))

        print(f"\nMap: {self.map_name}")
        for y in range(self.height):
            symbols = []
            for x in range(self.width):
                if (x, y) == agent_cell:
                    symbols.append("A")
                elif (x, y) == goal_cell:
                    symbols.append("G")
                elif self.grid[y, x] == 1:
                    symbols.append("#")
                else:
                    symbols.append(".")
            print("".join(symbols))

In [120]:
class StructuredMapSwitchCallback(BaseCallback):
    """Curriculum callback that moves training through easy, medium and hard maps.

    The training budget is cut at one third and two thirds of
    ``total_timesteps``; the map in use depends on which segment the current
    timestep count falls into.

    Args:
        env: Object exposing ``load_map(name)``; it is called whenever the
            desired map changes.
        total_timesteps: Training budget used to derive the two milestones.
        verbose: When greater than 0, a message is printed on every switch.
    """

    def __init__(self, env, total_timesteps, verbose=0):
        super().__init__(verbose)
        self.env = env
        self.total_timesteps = total_timesteps
        one_third = total_timesteps // 3
        two_thirds = 2 * total_timesteps // 3
        self.milestones = [one_third, two_thirds]
        # No map has been selected by this callback yet.
        self.current_map = None

    def _on_step(self) -> bool:
        """Select the map for the current timestep and log its difficulty index."""
        elapsed = self.num_timesteps

        # Start from the final stage and fall back to the first earlier
        # stage whose milestone has not been reached yet.
        desired_map, difficulty = "hard", 2
        earlier_stages = (("easy", 0), ("medium", 1))
        for cutoff, (stage_name, stage_level) in zip(self.milestones, earlier_stages):
            if elapsed < cutoff:
                desired_map, difficulty = stage_name, stage_level
                break

        needs_switch = desired_map != self.current_map
        if needs_switch:
            self.env.load_map(desired_map)
            self.current_map = desired_map
            if self.verbose > 0:
                print(f"Step {elapsed}: Switched to map '{desired_map}'")

        self.logger.record("map/difficulty", difficulty)

        # Returning True keeps training running.
        return True

In [121]:
class ConsoleRenderCallback(BaseCallback):
    """Callback that calls ``env.render()`` once every ``render_freq`` callback invocations.

    Args:
        env: Object exposing a ``render()`` method.
        render_freq: Number of ``_on_step`` calls between two renders.
        verbose: Forwarded to ``BaseCallback``.
    """

    def __init__(self, env, render_freq=1000, verbose=0):
        super().__init__(verbose)
        self.render_freq = render_freq
        self.env = env

    def _on_step(self) -> bool:
        """Render when the call counter is a multiple of ``render_freq``."""
        render_due = (self.n_calls % self.render_freq) == 0
        if render_due:
            self.env.render()
        # Returning True keeps training running.
        return True

## Train Model

In [None]:
# Total number of environment steps for the training run.
total_timesteps = 10_240_000

# NOTE(review): the environment is created without render_mode, so
# GridMazeEnv.render() returns immediately and the render callback below
# prints nothing — confirm whether render_mode="human" was intended.
env = GridMazeEnv(maps)
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    tensorboard_log="Training/logs/",
)

# Curriculum switching plus periodic console rendering, run together.
map_switch_callback = StructuredMapSwitchCallback(
    env,
    total_timesteps=total_timesteps,
    verbose=1,
)
render_callback = ConsoleRenderCallback(
    env,
    render_freq=1000,
    verbose=0,
)
callbacks = CallbackList([map_switch_callback, render_callback])

model.learn(total_timesteps=total_timesteps, callback=callbacks)

To open TensorBoard, run `tensorboard --logdir logs/` in a terminal from inside the `Training` directory.

## Test Model

In [None]:
# Build a fresh environment and two PPO models on top of it; each model
# logs to its own TensorBoard directory.
# NOTE(review): both models are newly constructed here and no learn() call
# follows in this section — confirm that comparing untrained models is intended.
env = GridMazeEnv(maps)
model1 = PPO("MlpPolicy", env, verbose=1, tensorboard_log="Training/logs/PPO_1")
model2 = PPO("MlpPolicy", env, verbose=1, tensorboard_log="Training/logs/PPO_2")

In [125]:
# Save both models under "Training/Saved Models/", one path per model.
ppo_path1 = os.path.join("Training", "Saved Models", "PPO_1")
model1.save(ppo_path1)
ppo_path2 = os.path.join("Training", "Saved Models", "PPO_2")
model2.save(ppo_path2)

In [126]:
# Reload both models from their saved paths, replacing the in-memory objects.
# No environment is passed to load(), so evaluation must supply its own.
model1 = PPO.load(ppo_path1)
model2 = PPO.load(ppo_path2)

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
import matplotlib.pyplot as plt
import numpy as np

# Number of evaluation episodes per model and the step cap per episode.
episodes = 10000
max_steps = 10000

# The loop below uses the vectorised-environment API (reset() returns only
# the observation; step() returns obs, rewards, dones, infos as batches),
# so each model gets its own single-environment DummyVecEnv. These were
# previously referenced without ever being defined.
env1 = DummyVecEnv([lambda: Monitor(GridMazeEnv(maps))])
env2 = DummyVecEnv([lambda: Monitor(GridMazeEnv(maps))])

episode_rewards_model1 = []
episode_rewards_model2 = []

for ep in range(1, episodes + 1):
    # Loop variables are named eval_* so that the notebook-level `model`
    # and `env` objects are not overwritten by the evaluation.
    for eval_model, eval_env, model_name, ep_reward_store in [
        (model1, env1, "Model1", episode_rewards_model1),
        (model2, env2, "Model2", episode_rewards_model2)
    ]:
        obs = eval_env.reset()
        done = False
        total_reward = 0
        step = 0

        while not done and step < max_steps:
            action, _ = eval_model.predict(obs)
            obs, reward, dones, _ = eval_env.step(action)

            # Batch of size one: index 0 is the only environment.
            done = bool(dones[0])
            total_reward += reward[0]
            step += 1

        print(f"[{model_name}] Episode {ep}, Total Reward: {total_reward}, Steps: {step}")
        ep_reward_store.append(total_reward)

# Arrays consumed by the plotting and statistics cells.
model1_rewards = np.array(episode_rewards_model1)
model2_rewards = np.array(episode_rewards_model2)
episodes_range = np.arange(1, episodes + 1)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Figure for the per-episode reward curves of both models.
plt.figure(figsize=(10, 6))

def smooth(x, window=10):
    """Return the moving average of ``x`` over ``window`` consecutive samples.

    Only fully overlapping windows are kept (``mode='valid'``), so for
    ``len(x) >= window`` the result has ``len(x) - window + 1`` entries.
    """
    kernel = np.full(window, 1.0 / window)
    return np.convolve(x, kernel, mode='valid')

# Rolling averages (default window of 10 episodes) for both models.
smoothed_model1 = smooth(model1_rewards)
smoothed_model2 = smooth(model2_rewards)

# The first rolling value corresponds to the 10th episode, so the x-axis
# for the smoothed curves skips the first 9 entries.
smoothed_episodes = episodes_range[9:]

# Raw per-episode rewards, drawn semi-transparent.
plt.plot(episodes_range, model1_rewards, label="Model1", alpha=0.3, color='blue')
plt.plot(episodes_range, model2_rewards, label="Model2", alpha=0.3, color='red')

# Rolling averages drawn on top in the matching colours.
plt.plot(smoothed_episodes, smoothed_model1, label="Model1 Rolling Avg", color='blue')
plt.plot(smoothed_episodes, smoothed_model2, label="Model2 Rolling Avg", color='red')

# Rolling standard deviation
def rolling_std(x, window=10):
    """Return the trailing standard deviation of ``x`` at every index.

    Entry ``i`` is ``np.std`` of the last ``window`` samples ending at
    ``i`` (fewer near the start, where less history exists), so the output
    has the same length as ``x``.
    """
    spreads = []
    for end in range(1, len(x) + 1):
        start = max(0, end - window)
        spreads.append(np.std(x[start:end]))
    return np.array(spreads)

std1 = rolling_std(model1_rewards)
std2 = rolling_std(model2_rewards)

# Shade a band of ± one rolling std around each rolling average. The first
# 9 std values are dropped so the arrays line up with the smoothed curves.
for centre, spread, band_colour in (
    (smoothed_model1, std1[9:], 'blue'),
    (smoothed_model2, std2[9:], 'red'),
):
    plt.fill_between(
        episodes_range[9:],
        centre - spread,
        centre + spread,
        alpha=0.2,
        color=band_colour,
    )

# Finish and display the reward-curve figure.
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("Total Reward per Episode with Rolling Average and Std")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

# Side-by-side boxplot of the two reward distributions.
plt.figure(figsize=(6, 5))
plt.boxplot([model1_rewards, model2_rewards], labels=["Model1", "Model2"])
plt.ylabel("Total Reward")
plt.title("Reward Distribution per Model")
plt.grid(True)
plt.tight_layout()
plt.show()

# Overlaid, density-normalised histograms of the two reward distributions.
plt.figure(figsize=(8, 5))
for sample, sample_label, sample_colour in (
    (model1_rewards, "Model1", 'blue'),
    (model2_rewards, "Model2", 'red'),
):
    plt.hist(sample, bins=30, alpha=0.5, label=sample_label, color=sample_colour, density=True)
plt.xlabel("Total Reward")
plt.ylabel("Density")
plt.title("Reward Distribution Histogram")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
from scipy.stats import ttest_ind

# Two-sample t-test on the per-episode rewards of the two models, using
# ttest_ind with its default settings.
ttest_result = ttest_ind(model1_rewards, model2_rewards)
t_stat, p_val = ttest_result
print(f"T-test: t={t_stat:.3f}, p={p_val:.3f}")

In [None]:
import seaborn as sns
import pandas as pd

# Long-form table with one row per episode: the reward and the model it came from.
data = {
    "Reward": np.concatenate([model1_rewards, model2_rewards]),
    "Model": ["Model1"] * len(model1_rewards) + ["Model2"] * len(model2_rewards)
}
df = pd.DataFrame(data)

plt.figure(figsize=(6, 5))
# The palette is mapped through `hue`; passing `palette` without `hue` is
# deprecated in seaborn 0.13. Using the x variable as hue keeps one colour
# per model, and legend=False suppresses the redundant legend.
sns.violinplot(
    x="Model",
    y="Reward",
    hue="Model",
    data=df,
    palette=["blue", "red"],
    inner="quartile",
    legend=False,
)

plt.title("Reward Distribution per Model (Violin Plot)")
plt.ylabel("Total Reward")
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()


In [None]:
# Violin plot with the individual episode rewards overlaid as jittered points.
# `hue` is set to the x variable because passing `palette` without `hue` is
# deprecated in seaborn 0.13; legend=False suppresses the redundant legend.
sns.violinplot(
    x="Model",
    y="Reward",
    hue="Model",
    data=df,
    palette=["blue", "red"],
    inner="quartile",
    legend=False,
)
sns.stripplot(x="Model", y="Reward", data=df, color='black', alpha=0.3, jitter=True)