# Vanilla DQN - with ActionWrapper

Run on 3 environments

Empty Room

Empty Room Random

Four Rooms

In [1]:
import gymnasium as gym
import minigrid
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from minigrid.wrappers import FlatObsWrapper
import time
import numpy as np
from minigrid.core.world_object import Goal
import random
from stable_baselines3.common.callbacks import BaseCallback
from gymnasium.envs.registration import register

pygame 2.1.0 (SDL 2.0.16, Python 3.10.9)
Hello from the pygame community. https://www.pygame.org/contribute.html


  from pandas.core import (


# First Empty

In [2]:
# Create and wrap the environment
env_id = "MiniGrid-Empty-16x16-v0"

In [3]:
class ActionWrapper(gym.ActionWrapper):
    def __init__(self, env):
        super(ActionWrapper, self).__init__(env)
        # Define a new action space with only the relevant actions
        self.action_space = gym.spaces.Discrete(3)  # Only three actions: left, right, forward

    def action(self, action):
        # Map the new actions to the original actions
        action_mapping = {
            0: 0,  # left
            1: 1,  # right
            2: 2   # forward
        }
        return action_mapping[action]

In [4]:
def create_env():
    env = gym.make(env_id, render_mode="rgb_array",max_episode_steps=200)
    env = ActionWrapper(env)
    env = FlatObsWrapper(env)  
    return env

# Use the function in make_vec_env
env_vec = make_vec_env(create_env, n_envs=1)
env_vec2 = make_vec_env(create_env, n_envs=4)

In [5]:
model = DQN("MlpPolicy", env_vec2, verbose=1,
            learning_rate = 0.0001,
            buffer_size = 1000000,
            learning_starts = 100,
            batch_size = 32,
            tau = 1,
            gamma = 0.99,
            exploration_fraction = 0.1,
            exploration_initial_eps = 1,
            exploration_final_eps = 0.05,
            tensorboard_log="./dqnactionrand/"
            )

# Train the model
model.learn(total_timesteps=200000)

Using cpu device
Logging to ./dqnactionrand/DQN_1
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.962    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1418     |
|    time_elapsed     | 0        |
|    total_timesteps  | 800      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000129 |
|    n_updates        | 43       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.924    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1385     |
|    time_elapsed     | 1        |
|    total_timesteps  | 1600     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    

<stable_baselines3.dqn.dqn.DQN at 0x27d592ac190>

In [6]:
model.save("dqn_action_empty")

In [7]:
model = DQN.load("dqn_action_empty",env=env_vec,tensorboard_log="./dqnactionrand/")

mean_reward, std_reward = evaluate_policy(model, env=env_vec, n_eval_episodes=50)
print(f"Mean Reward: {mean_reward} +/- {std_reward}")



Mean Reward: 0.0 +/- 0.0


In [8]:
num_episodes = 50
total_rewards = []  # List to store total rewards for each episode

for episode in range(num_episodes):
    obs = env_vec.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, info = env_vec.step(action)
        #print(done)
        total_reward += reward
        env_vec.render('human')  # Render the environment at each step
        #time.sleep(0.05)  # Adjust this to control the speed of the rendering
    total_rewards.append(total_reward)  # Store the total reward for this episode
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Calculate the average reward across all episodes
average_reward = sum(total_rewards) / num_episodes
print(f"Average Reward over {num_episodes} episodes: {average_reward}")

env_vec.close()  # Close the environment when done

Episode 1: Total Reward = [0.]
Episode 2: Total Reward = [0.]
Episode 3: Total Reward = [0.]
Episode 4: Total Reward = [0.]
Episode 5: Total Reward = [0.]
Episode 6: Total Reward = [0.]
Episode 7: Total Reward = [0.]
Episode 8: Total Reward = [0.]
Episode 9: Total Reward = [0.]
Episode 10: Total Reward = [0.]
Episode 11: Total Reward = [0.]
Episode 12: Total Reward = [0.]
Episode 13: Total Reward = [0.]
Episode 14: Total Reward = [0.]
Episode 15: Total Reward = [0.]
Episode 16: Total Reward = [0.]
Episode 17: Total Reward = [0.]
Episode 18: Total Reward = [0.]
Episode 19: Total Reward = [0.]
Episode 20: Total Reward = [0.]
Episode 21: Total Reward = [0.]
Episode 22: Total Reward = [0.]
Episode 23: Total Reward = [0.]
Episode 24: Total Reward = [0.]
Episode 25: Total Reward = [0.]
Episode 26: Total Reward = [0.]
Episode 27: Total Reward = [0.]
Episode 28: Total Reward = [0.]
Episode 29: Total Reward = [0.]
Episode 30: Total Reward = [0.]
Episode 31: Total Reward = [0.]
Episode 32: Total

# Empty random room

In [9]:
register(
    id='custom_empty-v0',
    entry_point='emptyrandom:RandomGoalEmptyEnv',
    kwargs={}
)

In [10]:
#gym.pprint_registry()  # to see all registered environments

if 'custom_empty-v0' in gym.envs.registry:
    print("Environment 'custom_empty-v0' is registered.")
else:
    print("Environment 'custom_empty-v0' is NOT registered.")

Environment 'custom_empty-v0' is registered.


In [11]:
env_id2 = 'custom_empty-v0'
def create_env():
    env = gym.make(env_id2, render_mode="rgb_array",size=16)
    env = ActionWrapper(env)
    env = FlatObsWrapper(env)  
    return env

env_vec_ran = make_vec_env(create_env, n_envs=1)
env_vec_ran2 = make_vec_env(create_env, n_envs=4)

In [12]:
# Initialize the DQN model
model2 = DQN("MlpPolicy", env_vec_ran2, verbose=1,
            learning_rate = 0.0001,
            buffer_size = 1000000,
            learning_starts = 100,
            batch_size = 32,
            tau = 1,
            gamma = 0.99,
            exploration_fraction = 0.1,
            exploration_initial_eps = 1,
            exploration_final_eps = 0.05,
            tensorboard_log="./dqnactionrandempty/"
            )

# Train the model
model2.learn(total_timesteps=200000)

Using cpu device
Logging to ./dqnactionrandempty/DQN_1




----------------------------------
| rollout/            |          |
|    ep_len_mean      | 92       |
|    ep_rew_mean      | 0.097    |
|    exploration_rate | 0.981    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1636     |
|    time_elapsed     | 0        |
|    total_timesteps  | 400      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000101 |
|    n_updates        | 18       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 96       |
|    ep_rew_mean      | 0.0485   |
|    exploration_rate | 0.962    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1517     |
|    time_elapsed     | 0        |
|    total_timesteps  | 800      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000118 |
|    n_updates      

<stable_baselines3.dqn.dqn.DQN at 0x27d0b024b20>

In [13]:
model2.save("dqn_action_empty_ran")

In [14]:
model2 = DQN.load("dqn_action_empty_ran",env=env_vec_ran,tensorboard_log="./dqnactionrandempty/")

mean_reward, std_reward = evaluate_policy(model2, env=env_vec_ran, n_eval_episodes=50)
print(f"Mean Reward: {mean_reward} +/- {std_reward}")



Mean Reward: 0.07676 +/- 0.2603089364582015


In [15]:
num_episodes = 50
total_rewards = []  # List to store total rewards for each episode

for episode in range(num_episodes):
    obs = env_vec_ran.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model2.predict(obs)
        obs, reward, done, info = env_vec_ran.step(action)
        #print(done)
        total_reward += reward
        env_vec_ran.render('human')  # Render the environment at each step
        #time.sleep(0.05)  # Adjust this to control the speed of the rendering
    total_rewards.append(total_reward)  # Store the total reward for this episode
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Calculate the average reward across all episodes
average_reward = sum(total_rewards) / num_episodes
print(f"Average Reward over {num_episodes} episodes: {average_reward}")

env_vec_ran.close()  # Close the environment when done

Episode 1: Total Reward = [0.928]
Episode 2: Total Reward = [0.928]
Episode 3: Total Reward = [0.]
Episode 4: Total Reward = [0.946]
Episode 5: Total Reward = [0.]
Episode 6: Total Reward = [0.]
Episode 7: Total Reward = [0.]
Episode 8: Total Reward = [0.973]
Episode 9: Total Reward = [0.]
Episode 10: Total Reward = [0.964]
Episode 11: Total Reward = [0.]
Episode 12: Total Reward = [0.]
Episode 13: Total Reward = [0.946]
Episode 14: Total Reward = [0.]
Episode 15: Total Reward = [0.]
Episode 16: Total Reward = [0.]
Episode 17: Total Reward = [0.]
Episode 18: Total Reward = [0.]
Episode 19: Total Reward = [0.]
Episode 20: Total Reward = [0.]
Episode 21: Total Reward = [0.937]
Episode 22: Total Reward = [0.]
Episode 23: Total Reward = [0.775]
Episode 24: Total Reward = [0.]
Episode 25: Total Reward = [0.]
Episode 26: Total Reward = [0.946]
Episode 27: Total Reward = [0.208]
Episode 28: Total Reward = [0.]
Episode 29: Total Reward = [0.154]
Episode 30: Total Reward = [0.]
Episode 31: Tota

# Now Four rooms

In [16]:
env_id3 = "MiniGrid-FourRooms-v0"
def create_env():
    env = gym.make(env_id3, render_mode="rgb_array")
    env = ActionWrapper(env)
    env = FlatObsWrapper(env)  
    return env

env_vec_four = make_vec_env(create_env, n_envs=1)
env_vec_four2 = make_vec_env(create_env, n_envs=4)

In [17]:
# Initialize the DQN model
model3 = DQN("MlpPolicy", env_vec_four2, verbose=1,
            learning_rate = 0.0001,
            buffer_size = 1000000,
            learning_starts = 100,
            batch_size = 32,
            tau = 1,
            gamma = 0.99,
            exploration_fraction = 0.1,
            exploration_initial_eps = 1,
            exploration_final_eps = 0.05,
            tensorboard_log="./dqnactionfour/"
            )

# Train the model
model3.learn(total_timesteps=1000000)

Using cpu device
Logging to ./dqnactionfour/DQN_1




----------------------------------
| rollout/            |          |
|    ep_len_mean      | 100      |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.996    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1510     |
|    time_elapsed     | 0        |
|    total_timesteps  | 400      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000446 |
|    n_updates        | 18       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 100      |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.992    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1363     |
|    time_elapsed     | 0        |
|    total_timesteps  | 800      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000185 |
|    n_updates      

<stable_baselines3.dqn.dqn.DQN at 0x27d0b108d60>

In [18]:
model3.save("dqn_action_four")

In [19]:
model3 = DQN.load("dqn_action_four",env=env_vec_four,tensorboard_log="./dqnactionfour/")

mean_reward, std_reward = evaluate_policy(model3, env=env_vec_four, n_eval_episodes=50)
print(f"Mean Reward: {mean_reward} +/- {std_reward}")



Mean Reward: 0.07676 +/- 0.2603711627657717


In [20]:
num_episodes = 50
total_rewards = []  # List to store total rewards for each episode

for episode in range(num_episodes):
    obs = env_vec_four.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model3.predict(obs)
        obs, reward, done, info = env_vec_four.step(action)
        #print(done)
        total_reward += reward
        env_vec_four.render('human')  # Render the environment at each step
        #time.sleep(0.05)  # Adjust this to control the speed of the rendering
    total_rewards.append(total_reward)  # Store the total reward for this episode
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Calculate the average reward across all episodes
average_reward = sum(total_rewards) / num_episodes
print(f"Average Reward over {num_episodes} episodes: {average_reward}")

env_vec_four.close()  # Close the environment when done

Episode 1: Total Reward = [0.]
Episode 2: Total Reward = [0.712]
Episode 3: Total Reward = [0.]
Episode 4: Total Reward = [0.]
Episode 5: Total Reward = [0.]
Episode 6: Total Reward = [0.]
Episode 7: Total Reward = [0.]
Episode 8: Total Reward = [0.]
Episode 9: Total Reward = [0.]
Episode 10: Total Reward = [0.901]
Episode 11: Total Reward = [0.]
Episode 12: Total Reward = [0.]
Episode 13: Total Reward = [0.955]
Episode 14: Total Reward = [0.]
Episode 15: Total Reward = [0.]
Episode 16: Total Reward = [0.]
Episode 17: Total Reward = [0.]
Episode 18: Total Reward = [0.]
Episode 19: Total Reward = [0.955]
Episode 20: Total Reward = [0.]
Episode 21: Total Reward = [0.694]
Episode 22: Total Reward = [0.]
Episode 23: Total Reward = [0.]
Episode 24: Total Reward = [0.]
Episode 25: Total Reward = [0.406]
Episode 26: Total Reward = [0.]
Episode 27: Total Reward = [0.]
Episode 28: Total Reward = [0.]
Episode 29: Total Reward = [0.]
Episode 30: Total Reward = [0.]
Episode 31: Total Reward = [0.]