In [1]:
import gymnasium as gym
import minigrid
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from minigrid.wrappers import FlatObsWrapper
import time
import numpy as np
from minigrid.core.world_object import Goal
import random

pygame 2.1.0 (SDL 2.0.16, Python 3.10.9)
Hello from the pygame community. https://www.pygame.org/contribute.html


  from pandas.core import (


log

100K time steps best so far, default epsilon greedy

200K stuck, 600K stuck


In [2]:
# Create and wrap the environment
env_id = "MiniGrid-FourRooms-v0"

In [3]:
env = gym.make(env_id, render_mode = "rgb_array")
env = FlatObsWrapper(env)  # Wrap the environment to flatten the observations

# Use make_vec_env to handle vectorization
env = make_vec_env(lambda: env, n_envs=1)


In [None]:
# Initialize the DQN model
model = DQN("MlpPolicy", env, verbose=1,
            buffer_size=10000,
            learning_rate=1e-4,
            batch_size=32,
            exploration_initial_eps=1,
            exploration_final_eps= 0.1,
            exploration_fraction=0.8,
            )

# Train the model
model.learn(total_timesteps=100000)
model.save("dqn_minigrid_four_rooms")

In [None]:
# Assuming the model is saved under 'dqn_minigrid_four_rooms'
# env = gym.make("MiniGrid-FourRooms-v0", render_mode="rgb_array")
# env = FlatObsWrapper(env)
# env = make_vec_env(lambda: env, n_envs=1)

model = DQN.load("dqn_minigrid_four_rooms",env=env)

# Evaluate the model
mean_reward, std_reward = evaluate_policy(model, env=env, n_eval_episodes=100)
print(f"Mean Reward: {mean_reward} +/- {std_reward}")

In [None]:
num_episodes = 100
total_rewards = []  # List to store total rewards for each episode

for episode in range(num_episodes):
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        env.render('human')  # Render the environment at each step
        #time.sleep(0.1)  # Adjust this to control the speed of the rendering
    total_rewards.append(total_reward)  # Store the total reward for this episode
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Calculate the average reward across all episodes
average_reward = sum(total_rewards) / num_episodes
print(f"Average Reward over {num_episodes} episodes: {average_reward}")

env.close()  # Close the environment when done


In [9]:
class RewardShapingWrapper(gym.Wrapper):
    def __init__(self, env):
        super(RewardShapingWrapper, self).__init__(env)
        self.last_pos = None
        self.goal_pos = None
        self.stuck_counter = 0

    def step(self, action):
        action = self.noisy_action(action)  # Apply noise to the action for exploration
        obs, reward, done, info, extra = self.env.step(action)
        current_pos = self.env.agent_pos

        if self.goal_pos is None:
            self.goal_pos = self.get_goal_position()

        current_room = self.get_room(current_pos)
        last_room = self.get_room(self.last_pos) if self.last_pos else None

        # Check if transitioned to a new room
        if last_room and current_room and last_room != current_room:
            reward += 5.0  # Reward for moving to a new room

        # Additional vector to the nearest gap or doorway
        nearest_gap_vector = self.get_vector_to_nearest_gap(current_pos)
        obs['nearest_gap_direction'] = nearest_gap_vector  # Ensure obs is a dictionary

        # Assuming obs is a dictionary that includes 'image' and possibly other keys
        goal_direction = self.get_goal_direction(current_pos)
        # If you intend to add goal_direction to obs, you should add it as a new key-value pair
        obs['goal_direction'] = goal_direction  # Add it like this if obs is a dictionary

        # Reward shaping calculations
        if self.last_pos is not None and self.goal_pos is not None:
            distance_reward = self.calculate_reward_shaping(current_pos)
            additional_reward = self.additional_rewards(action, current_pos, self.last_pos)
            reward += distance_reward + additional_reward

        if np.array_equal(current_pos, self.goal_pos):
            reward += 100
            done = True

        if self.last_pos == current_pos:
            self.stuck_counter += 1
        else:
            self.stuck_counter = 0

        if self.stuck_counter > 3:
            obs = self.nudge_agent(current_pos) or obs
            self.stuck_counter = 0

        self.last_pos = current_pos
        return obs, reward, done, info, extra

    def reset(self, **kwargs):
        obs = self.env.reset(**kwargs)
        self.last_pos = self.env.agent_pos
        self.goal_pos = self.get_goal_position()
        self.gaps = self.find_gaps()  # Find gaps upon reset
        return obs
    
    def get_vector_to_nearest_gap(self, current_pos):
        if not self.gaps:
            return np.zeros(2)  # Default to zero vector if no gaps identified

        nearest_gap = min(self.gaps, key=lambda gap: np.linalg.norm(np.array(gap) - np.array(current_pos)))
        direction_vector = np.array(nearest_gap) - np.array(current_pos)
        norm = np.linalg.norm(direction_vector)
        return direction_vector / norm if norm != 0 else direction_vector

    
    def find_gaps(self):
        width, height = self.env.width, self.env.height
        gaps = []
        # Typically, gaps will be in the walls that divide the rooms
        # We scan horizontal and vertical mid-lines for empty spaces
        mid_vertical = width // 2
        mid_horizontal = height // 2

        # Vertical mid-line (check for empty cells)
        for y in range(height):
            if self.env.grid.get(mid_vertical, y) is None:
                gaps.append((mid_vertical, y))

        # Horizontal mid-line (check for empty cells)
        for x in range(width):
            if self.env.grid.get(x, mid_horizontal) is None:
                gaps.append((x, mid_horizontal))

        return gaps


    def get_goal_position(self):
        for x in range(self.env.width):
            for y in range(self.env.height):
                if self.env.grid.get(x, y) is not None and isinstance(self.env.grid.get(x, y), Goal):
                    return (x, y)
        return None
    
    
    def calculate_reward_shaping(self, current_pos):
        if not self.goal_pos:
            return 0
        
        epsilon = 0.01  # Small value to avoid division by zero
        prev_distance = np.linalg.norm(np.array(self.last_pos) - np.array(self.goal_pos))
        current_distance = np.linalg.norm(np.array(current_pos) - np.array(self.goal_pos))
        
        if current_distance < prev_distance:
            return 0.1 * (1 / (current_distance + epsilon))
        elif current_distance > prev_distance:
            return -0.1 * (1 / (current_distance + epsilon))
        return 0

    def additional_rewards(self, action, current_pos, prev_pos):
        prev_distance = np.linalg.norm(np.array(prev_pos) - np.array(self.goal_pos))
        current_distance = np.linalg.norm(np.array(current_pos) - np.array(self.goal_pos))
        distance_change = prev_distance - current_distance
        
        if action in [0, 1] and distance_change <= 0:
            return -0.05  # Unchanged, penalty for non-improvement
        
        reward = 0
        if action == 2 and distance_change > 0:
            reward = min(0.05 * distance_change, 1.0)  # Cap the reward to avoid excessively high values

        return reward
    

    def get_goal_direction(self, current_pos):
        if not self.goal_pos:
            return np.zeros_like(current_pos)  # Return a zero vector if goal position is not set
        # Assuming current_pos and goal_pos are numpy arrays or can be converted to them
        direction_vector = np.array(self.goal_pos) - np.array(current_pos)
        # Normalize the vector
        norm = np.linalg.norm(direction_vector)
        if norm == 0:
            return direction_vector  # Avoid division by zero if already at the goal
        return direction_vector / norm
    
    def noisy_action(self, action):
        # Add random noise to the action occasionally
        if random.random() < 0.1:  # With 10% probability, alter the action randomly
            return random.choice([0, 1, 2])  # Assuming actions are 0, 1, 2
        return action
    

    def get_possible_moves(self, current_pos):
        moves = []
        directions = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # Directions: up, down, left, right

        for dx, dy in directions:
            new_pos = (current_pos[0] + dx, current_pos[1] + dy)
            if self.is_position_valid(new_pos):
                moves.append(new_pos)
        return moves

    def is_position_valid(self, pos):
        x, y = pos
        # Check if within grid bounds
        if 0 <= x < self.env.width and 0 <= y < self.env.height:
            # Check if the cell is empty (None means empty in MiniGrid)
            return self.env.grid.get(x, y) is None
        return False
    
    def nudge_agent(self, current_pos):
        possible_moves = self.get_possible_moves(current_pos)
        if possible_moves:
            new_pos = random.choice(possible_moves)
            self.env.agent_pos = new_pos  # Assuming you can set the position
            # Simulate a no-op or a neutral action to update state and get observation
            obs, _, _, _, _ = self.env.step(0)  # Consider '0' as a no-op action, if applicable
            return obs
        # If no moves are possible or other issues arise, return a default observation
        return np.zeros(self.env.observation_space.shape)  # Adjust based on the specific obs space
    
    def get_room(self, position):
        x, y = position
        if x < 10 and y < 10:
            return 1  # Top-left room
        elif x >= 10 and y < 10:
            return 2  # Top-right room
        elif x < 10 and y >= 10:
            return 3  # Bottom-left room
        elif x >= 10 and y >= 10:
            return 4  # Bottom-right room
        return 0  # Should not happen unless position is out of expected range


In [11]:
class ActionWrapper(gym.ActionWrapper):
    def __init__(self, env):
        super(ActionWrapper, self).__init__(env)
        # Define a new action space with only the relevant actions
        self.action_space = gym.spaces.Discrete(3)  # Only three actions: left, right, forward

    def action(self, action):
        # Map the new actions to the original actions
        action_mapping = {
            0: 0,  # left
            1: 1,  # right
            2: 2   # forward
        }
        return action_mapping[action]

In [12]:
from stable_baselines3.common.callbacks import BaseCallback

# class ExponentialDecayCallback(BaseCallback):
#     def __init__(self, initial_epsilon, min_epsilon, decay_rate, verbose=0):
#         super(ExponentialDecayCallback, self).__init__(verbose)
#         self.initial_epsilon = initial_epsilon
#         self.min_epsilon = min_epsilon
#         self.decay_rate = decay_rate
#         self.current_epsilon = initial_epsilon

#     def _on_step(self) -> bool:
#         # Apply exponential decay
#         self.current_epsilon = max(self.min_epsilon, self.current_epsilon * self.decay_rate)
#         self.model.exploration_rate = self.current_epsilon
#         return True


In [13]:
class DynamicEpsilonCallback(BaseCallback):
    def __init__(self, initial_epsilon, min_epsilon, decay_rate, check_freq, verbose=0):
        super(DynamicEpsilonCallback, self).__init__(verbose)
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.decay_rate = decay_rate
        self.check_freq = check_freq
        self.best_mean_reward = -float('inf')

    def _on_step(self) -> bool:
        # Check if it's time to update
        if self.n_calls % self.check_freq == 0:
            # Retrieve performance (could be mean reward or other metric)
            mean_reward = np.mean([self.locals['rewards']])
            if mean_reward > self.best_mean_reward:
                self.best_mean_reward = mean_reward
                if self.verbose > 0:
                    print(f"New best mean reward: {self.best_mean_reward}, not reducing epsilon.")
            else:
                # Reduce exploration rate
                self.model.exploration_rate = max(self.min_epsilon, self.model.exploration_rate * self.decay_rate)
                if self.verbose > 0:
                    print(f"Reduced exploration rate: {self.model.exploration_rate}")

        return True


In [14]:
class PerformanceBasedEpsilonCallback(BaseCallback):
    def __init__(self, initial_epsilon, min_epsilon, decay_rate, increase_rate, check_freq, performance_threshold, verbose=0):
        super().__init__(verbose)
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.decay_rate = decay_rate
        self.increase_rate = increase_rate
        self.check_freq = check_freq
        self.performance_threshold = performance_threshold
        self.best_mean_reward = -float('inf')
        self.num_episodes = 0 

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            recent_rewards = self.model.ep_info_buffer
            mean_reward = np.mean([ep_info['r'] for ep_info in recent_rewards]) if recent_rewards else 0

            # Check if performance has improved sufficiently
            if mean_reward > self.best_mean_reward * self.performance_threshold:
                self.best_mean_reward = mean_reward
                self.model.exploration_rate *= self.increase_rate
                self.model.exploration_rate = min(1.0, self.model.exploration_rate)  # Ensure it does not exceed 1
                if self.verbose > 0:
                    print(f"Increased exploration rate to: {self.model.exploration_rate}")
            elif mean_reward < self.best_mean_reward:
                # Apply decay
                self.model.exploration_rate *= self.decay_rate
                self.model.exploration_rate = max(self.min_epsilon, self.model.exploration_rate)
                if self.verbose > 0:
                    print(f"Reduced exploration rate to: {self.model.exploration_rate}")

        return True
    
    def _on_rollout_end(self) -> None:
        # Only check to adjust epsilon at the end of each rollout
        if self.num_timesteps % self.check_freq == 0:
            recent_rewards = self.model.ep_info_buffer
            mean_reward = np.mean([ep_info['r'] for ep_info in recent_rewards]) if recent_rewards else 0

            if mean_reward > self.best_mean_reward * self.performance_threshold:
                self.best_mean_reward = mean_reward
                self.model.exploration_rate *= self.increase_rate
                self.model.exploration_rate = min(1.0, self.model.exploration_rate)  # Cap the exploration rate
                if self.verbose:
                    print(f"Increased exploration rate to: {self.model.exploration_rate}")

            elif mean_reward < self.best_mean_reward:
                self.model.exploration_rate *= self.decay_rate
                self.model.exploration_rate = max(self.min_epsilon, self.model.exploration_rate)
                if self.verbose:
                    print(f"Reduced exploration rate to: {self.model.exploration_rate}")

            self.num_episodes += 1
            if self.verbose:
                print(f"Rollout ended. Total episodes: {self.num_episodes}")


In [15]:
# Assuming `RewardShapingWrapper` and `FlatObsWrapper` have been correctly imported and configured.
env_base = gym.make(env_id, render_mode="rgb_array")
env_reward = RewardShapingWrapper(env_base)
env_flat = FlatObsWrapper(env_reward)

obs = env_flat.reset()
action = env_flat.action_space.sample()
try:
    obs, reward, done, info, extra = env_flat.step(action)
    print("Output from FlatObsWrapper:", obs, reward, done, info)
except ValueError as e:
    print("Error during step:", e)
    output = env_flat.step(action)
    print("Actual output from FlatObsWrapper:", output)


Output from FlatObsWrapper: [0. 0. 0. ... 0. 0. 0.] 0 False False


  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


In [16]:
def create_env():
    env = gym.make(env_id, render_mode="rgb_array")
    env = ActionWrapper(env)
    env = RewardShapingWrapper(env)
    env = FlatObsWrapper(env)  
    return env

# Use the function in make_vec_env
env_vec = make_vec_env(create_env, n_envs=1)


In [None]:
# class CustomDQN(DQN):
#     def __init__(self, *args, decay_rate=0.995, **kwargs):
#         # Remove the decay_rate from kwargs before passing it to the superclass constructor
#         self.decay_rate = decay_rate  # Save the decay rate before calling the superclass constructor
#         super().__init__(*args, **kwargs)  # Now call the superclass constructor without the decay_rate argument
        
#         # Initialize the exploration rate with the initial exploration rate from the DQN settings
#         self.exploration_rate = self.exploration_initial_eps

#     def _on_step(self):
#         super()._on_step()  # Call the superclass method
#         # Dynamically update the exploration rate using the decay rate
#         self.exploration_rate = max(self.exploration_final_eps, self.exploration_rate * self.decay_rate)
#         #print(f"Updated exploration rate: {self.exploration_rate}")

#     def predict(self, observation, state=None, episode_start=None, deterministic=False):
#         if not deterministic and np.random.rand() < self.exploration_rate:
#             action = np.array([self.action_space.sample()])
#         else:
#             action, state = super().predict(observation, state, episode_start, deterministic=True)
#         return action, state



In [36]:
# call exponential callback doesn't work
#epsilon_decay_callback = ExponentialDecayCallback(initial_epsilon=1.0, min_epsilon=0.1, decay_rate=0.999)

dynamic_decay = DynamicEpsilonCallback(initial_epsilon=1.0, min_epsilon=0.1, decay_rate=0.999, check_freq=10, verbose=1)

dynamic_decay2 = PerformanceBasedEpsilonCallback(
    initial_epsilon=1.0,
    min_epsilon=0.1,
    decay_rate=0.999,
    increase_rate=1.15,
    check_freq=100,
    performance_threshold=1.1,
    verbose=0
)

model2 = DQN("MlpPolicy", env_vec, verbose=1,
            buffer_size=500000,
            learning_rate=1e-4,
            batch_size=64,
            )

# Train the model
model2.learn(total_timesteps=400000, callback=dynamic_decay2)
model2.save("dqn_minigrid_four_rooms_rs")

Using cpu device
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 83.8     |
|    ep_rew_mean      | -1.47    |
|    exploration_rate | 0.992    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 550      |
|    time_elapsed     | 0        |
|    total_timesteps  | 335      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000234 |
|    n_updates        | 58       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 74.8     |
|    ep_rew_mean      | -1.55    |
|    exploration_rate | 0.986    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 476      |
|    time_elapsed     | 1        |
|    total_timesteps  | 598      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000209 |
|  

In [17]:
model2 = DQN.load("dqn_minigrid_four_rooms_rs",env=env_vec)


# Evaluate the model
mean_reward2, std_reward2 = evaluate_policy(model2, env=env_vec, n_eval_episodes=100)
print(f"Mean Reward with reward shaping: {mean_reward2} +/- {std_reward2}")

  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


Mean Reward with reward shaping: 0.7903041999999999 +/- 2.778390124658389


In [18]:
num_episodes = 200
total_rewards2 = []  # List to store total rewards for each episode

for episode in range(num_episodes):
    obs = env_vec.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model2.predict(obs, deterministic=True)
        print("Action chosen:", action)
        obs, reward, done, info= env_vec.step(action)  # Adjust to capture both terminated and truncated
        #done = truncated  # Consider episode done if either flag is True
        print("Reward received:", reward)
        total_reward += reward
        env_vec.render('human')  # Render the environment at each step
        time.sleep(0.05)  # Adjust this to control the speed of the rendering
    total_rewards2.append(total_reward)  # Store the total reward for this episode
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Calculate the average reward across all episodes
average_reward2 = sum(total_rewards2) / num_episodes
print(f"Average Reward over {num_episodes} episodes: {average_reward2}")

env_vec.close()  # Close the environment when done


Action chosen: [2]
Reward received: [100.]
Episode 1: Total Reward = [100.]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [-0.02231079]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [-0.01996008]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [-0.05]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [0.06138277]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [-0.01996008]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward received: [0.]
Action chosen: [2]
Reward

KeyboardInterrupt: 