In [3]:
# Imports

import minigrid
import gymnasium as gym
from gym.utils import play
import numpy as np
import torch
from torch import nn
from minigrid.wrappers import ImgObsWrapper
from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.callbacks import CheckpointCallback

In [4]:
class MinigridFeaturesExtractor(BaseFeaturesExtractor):
    """Convolutional features extractor so Minigrid can be used with Stable-Baselines3.

    Adapted from the official documentation. The network is three 2x2
    convolutions (16, 32 and 64 output channels), each followed by a ReLU,
    then a flatten and a final linear layer + ReLU producing ``features_dim``
    values.

    Args:
        observation_space: Observation space; ``shape[0]`` is read as the
            number of input channels of the first convolution.
        features_dim: Length of the feature vector returned by ``forward``.
        normalized_image: Accepted for signature compatibility; this
            extractor does not read it.
    """

    def __init__(
        self,
        observation_space: gym.Space,
        features_dim: int = 512,
        normalized_image: bool = False,
    ) -> None:
        super().__init__(observation_space, features_dim)

        # Build the conv stack from a channel plan: in -> 16 -> 32 -> 64.
        in_channels = observation_space.shape[0]
        channel_plan = (in_channels, 16, 32, 64)
        conv_stack = []
        for c_in, c_out in zip(channel_plan[:-1], channel_plan[1:]):
            conv_stack.append(nn.Conv2d(c_in, c_out, (2, 2)))
            conv_stack.append(nn.ReLU())
        conv_stack.append(nn.Flatten())
        self.cnn = nn.Sequential(*conv_stack)

        # Push one sampled observation (with a leading batch axis) through the
        # conv stack, gradient-free, to discover the flattened width.
        probe = torch.as_tensor(observation_space.sample()[None]).float()
        with torch.no_grad():
            flat_dim = self.cnn(probe).shape[1]

        projection = (nn.Linear(flat_dim, features_dim), nn.ReLU())
        self.linear = nn.Sequential(*projection)

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """Run the conv stack, then the linear projection, on ``observations``."""
        conv_features = self.cnn(observations)
        return self.linear(conv_features)

In [2]:
# --- Training configuration ---------------------------------------------------
seed = 42  # shared by NumPy's global RNG and the PPO model below
max_steps = 1000  # per-episode step limit handed to the environment
device = "auto"
log_folder = "./log/"  # TensorBoard log directory
learning_timesteps = 10_000_000
policy_kwargs = dict(
    features_extractor_class=MinigridFeaturesExtractor,
    features_extractor_kwargs=dict(features_dim=128),
)

np.random.seed(seed)

# --- Environment --------------------------------------------------------------
env = gym.make('MiniGrid-FourRooms-v0', render_mode='rgb_array', max_steps=max_steps)
env = ImgObsWrapper(env)

# --- Periodic checkpoints -----------------------------------------------------
# Checkpoints are what survive an interrupted run: the final model.save() at the
# bottom of this cell only executes if learn() returns normally.
checkpoint_callback = CheckpointCallback(
    save_freq=50_000,
    save_path="./models/",
    name_prefix="ppo_minigrid_model_random_map",
    save_replay_buffer=False,
    save_vecnormalize=True,
)

# --- Model --------------------------------------------------------------------
# Create PPO Model with parameters.
# `seed` is forwarded to PPO so the model is seeded too; previously only NumPy's
# global RNG received it, so `seed` had no effect on the model itself.
model = PPO(
    "CnnPolicy",
    env,
    tensorboard_log=log_folder,
    device=device,
    policy_kwargs=policy_kwargs,
    seed=seed,
    verbose=1,
)

# Reload previous model as starting point
#model.set_parameters("models/ppo_minigrid_model_determinist_map")

# --- Training -----------------------------------------------------------------
model.learn(
    total_timesteps=learning_timesteps,
    callback=checkpoint_callback,
    progress_bar=True,
)
model.save("models/ppo_minigrid_model_random_map")

pygame 2.5.2 (SDL 2.28.3, Python 3.10.13)
Hello from the pygame community. https://www.pygame.org/contribute.html
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./log/PPO_3


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 0        |
| time/              |          |
|    fps             | 2034     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 0           |
| time/                   |             |
|    fps                  | 872         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011610186 |
|    clip_fraction        | 0.0527      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.94       |
|    explained_variance   | -5.6        |
|    learning_rate        | 0.

KeyboardInterrupt: 

In [21]:
# Evaluate a saved checkpoint over a fixed number of episodes and report the
# average episode reward and the number of successful episodes.
eval_episodes = 100
# Use the same per-episode step limit as the training environment. Without an
# explicit max_steps the environment falls back to its own default limit, which
# need not match the limit the policy was trained under.
eval_max_steps = 1000

model = PPO.load('models/ppo_minigrid_model_random_map_2100000_steps.zip')

env = gym.make('MiniGrid-FourRooms-v0', render_mode='rgb_array', max_steps=eval_max_steps)
env = ImgObsWrapper(env)

mean_reward = []  # one total reward per episode (name kept for later cells)
num_successes = 0
try:
    for episode in range(1, eval_episodes + 1):
        episode_reward = 0
        obs, info = env.reset()
        # Bounded by the same limit given to the environment, so the loop
        # cannot end before the environment reports truncation.
        for timestep in range(eval_max_steps):
            action, _states = model.predict(obs)
            obs, reward, terminated, truncated, info = env.step(action)
            episode_reward += reward
            if terminated or truncated:
                break
        # Count at most one success per episode (the original counted every
        # step with a non-zero reward, which is a per-step tally).
        if episode_reward > 0:
            num_successes += 1
        mean_reward.append(episode_reward)
finally:
    # Release the environment even if prediction or stepping raised.
    env.close()

# Calculate the average reward across all episodes
average_reward = sum(mean_reward) / len(mean_reward)
print(f"Average Reward over {len(mean_reward)} episodes: {average_reward}")
print(f'Number of Sucessful Episodes {num_successes}')

: 