# Vanilla DQN + action wrapper + reward shaping

Run on 3 environments

Empty Room

Empty Room Random

Four Rooms

In [1]:
import gymnasium as gym
import minigrid
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from minigrid.wrappers import FlatObsWrapper
import time
import numpy as np
from minigrid.core.world_object import Goal
import random
from stable_baselines3.common.callbacks import BaseCallback
from gymnasium.envs.registration import register
from minigrid.core.constants import OBJECT_TO_IDX, IDX_TO_OBJECT, COLOR_TO_IDX, IDX_TO_COLOR, DIR_TO_VEC

pygame 2.1.0 (SDL 2.0.16, Python 3.10.9)
Hello from the pygame community. https://www.pygame.org/contribute.html


  from pandas.core import (


# First Empty

In [2]:
# Create and wrap the environment
env_id = "MiniGrid-Empty-16x16-v0"

In [2]:
class ActionWrapper(gym.ActionWrapper):
    def __init__(self, env):
        super(ActionWrapper, self).__init__(env)
        # Define a new action space with only the relevant actions
        self.action_space = gym.spaces.Discrete(3)  # Only three actions: left, right, forward

    def action(self, action):
        # Map the new actions to the original actions
        action_mapping = {
            0: 0,  # left
            1: 1,  # right
            2: 2   # forward
        }
        return action_mapping[action]

In [3]:
class RewardShapingWrapper1(gym.Wrapper):
    def __init__(self, env):
        super(RewardShapingWrapper1, self).__init__(env)
        self.last_action = None
        self.spin_counter = 0  # Tracks consecutive left-right turns

    def step(self, action):
        obs, reward, done, info, extra = self.env.step(action)
        current_pos = self.env.agent_pos

        # Check for spinning behavior
        if self.last_action in [0, 1] and action in [0, 1] and action != self.last_action:
            self.spin_counter += 1
        else:
            self.spin_counter = 0

        if self.spin_counter > 2:  # Threshold for considering it spinning
            reward -= 10
            self.spin_counter = 0  # Reset counter after penalty

        if self.last_action == 0 and action == 1 or self.last_action == 1 and action == 0:
            reward -= 10  # Increase penalty for oscillating between left and right

        self.last_action = action
        return obs, reward, done, info, extra

    def reset(self, **kwargs):
        obs = self.env.reset(**kwargs)
        self.last_action = None
        self.spin_counter = 0
        return obs

    def is_facing_wall(self):
        x, y = self.env.agent_pos
        direction_idx = self.env.agent_dir
        delta = self.env.DIR_TO_VEC[direction_idx]
        next_x, next_y = x + delta[0], y + delta[1]
        if 0 <= next_x < self.env.width and 0 <= next_y < self.env.height:
            next_cell = self.env.grid.get(next_x, next_y)
            return next_cell is not None and next_cell.type == 'wall'
        return False


In [4]:
class RewardShapingWrapper2(gym.Wrapper):
    def __init__(self, env):
        super(RewardShapingWrapper2, self).__init__(env)
        self.last_action = None
        self.spin_counter = 0  # Tracks consecutive left-right turns
        self.last_distance = None
        self.goal_pos = self.get_goal_position()  # Method to determine goal position

    def step(self, action):
        obs, reward, done, info,extra = self.env.step(action)
        #print(f"Observation: {obs}")
        current_pos = self.env.agent_pos
        
        # Calculate distance to the goal
        if self.last_distance is None:
            self.last_distance = self.calculate_distance(current_pos, self.goal_pos)
            #print("distance to goal",self.last_distance)

        new_distance = self.calculate_distance(current_pos, self.goal_pos)
        if new_distance < self.last_distance:
            # Reward for moving closer to the goal
            reward += 5 * (self.last_distance - new_distance)
        self.last_distance = new_distance

        if action == 2 and self.is_facing_wall():
            reward -= 10  # Penalty for trying to move into a wall

        # Check for spinning behavior
        if self.last_action in [0, 1] and action in [0, 1] and action != self.last_action:
            self.spin_counter += 1
        else:
            self.spin_counter = 0

        if self.spin_counter > 2:  # Threshold for considering it spinning
            reward -= 10
            self.spin_counter = 0  # Reset counter after penalty

        #dditional logic to discourage spinning or other suboptimal behaviors
        if self.last_action == 0 and action == 1 or self.last_action == 1 and action == 0:
            reward -= 10  # Increase penalty for oscillating between left and right

        # High terminal reward when reaching the goal
        if np.array_equal(current_pos, self.goal_pos):
            reward += 500  # High reward for reaching the goal
            done = True

        self.last_action = action
        return obs, reward, done, info,extra

    
    def reset(self, **kwargs):  # Accept any additional keyword arguments
        obs = self.env.reset(**kwargs)  # Reset the underlying environment
        self.goal_pos = self.env.get_goal_position()  # Retrieve the updated goal position
        self.last_distance = None
        self.last_action = None
        #self.spin_counter = 0
        return obs  # Return the observation

    def calculate_distance(self, current_pos, goal_pos):
        if current_pos is None or goal_pos is None:
            return float('inf')  # Return a large distance if any position is not initialized
        dist_to_goal = np.linalg.norm(np.array(current_pos) - np.array(goal_pos))
        #print('distance to goal:', dist_to_goal)
        return dist_to_goal

    def is_facing_wall(self):
        x, y = self.env.agent_pos
        direction_idx = self.env.agent_dir
        delta = DIR_TO_VEC[direction_idx]
        next_x, next_y = x + delta[0], y + delta[1]
        if 0 <= next_x < self.env.width and 0 <= next_y < self.env.height:
            next_cell = self.env.grid.get(next_x, next_y)
            #print("Object to Index Mapping:", OBJECT_TO_IDX)
            #print("Index for goal:", OBJECT_TO_IDX['goal'])
            #print("Index for wall:", OBJECT_TO_IDX['wall'])
            return next_cell and next_cell.type == OBJECT_TO_IDX['wall']
        return False


In [16]:
class RewardShapingWrapper3(gym.Wrapper):
    def __init__(self, env):
        super(RewardShapingWrapper3, self).__init__(env)
        self.last_pos = None
        self.goal_pos = None
        self.stuck_counter = 0

    def step(self, action):
        action = self.noisy_action(action)  # Apply noise to the action for exploration
        obs, reward, done, info, extra = self.env.step(action)
        current_pos = self.env.agent_pos

        if self.goal_pos is None:
            self.goal_pos = self.get_goal_position()

        current_room = self.get_room(current_pos)
        last_room = self.get_room(self.last_pos) if self.last_pos else None

        # Check if transitioned to a new room
        if last_room and current_room and last_room != current_room:
            reward += 5.0  # Reward for moving to a new room

        # Additional vector to the nearest gap or doorway
        nearest_gap_vector = self.get_vector_to_nearest_gap(current_pos)
        obs['nearest_gap_direction'] = nearest_gap_vector  # Ensure obs is a dictionary

        # Assuming obs is a dictionary that includes 'image' and possibly other keys
        goal_direction = self.get_goal_direction(current_pos)
        # If you intend to add goal_direction to obs, you should add it as a new key-value pair
        obs['goal_direction'] = goal_direction  # Add it like this if obs is a dictionary

        # Reward shaping calculations
        if self.last_pos is not None and self.goal_pos is not None:
            distance_reward = self.calculate_reward_shaping(current_pos)
            additional_reward = self.additional_rewards(action, current_pos, self.last_pos)
            reward += distance_reward + additional_reward

        if np.array_equal(current_pos, self.goal_pos):
            reward += 100
            done = True

        if self.last_pos == current_pos:
            self.stuck_counter += 1
        else:
            self.stuck_counter = 0

        if self.stuck_counter > 3:
            obs = self.nudge_agent(current_pos) or obs
            self.stuck_counter = 0

        self.last_pos = current_pos
        return obs, reward, done, info, extra

    def reset(self, **kwargs):
        obs = self.env.reset(**kwargs)
        self.last_pos = self.env.agent_pos
        self.goal_pos = self.get_goal_position()
        self.gaps = self.find_gaps()  # Find gaps upon reset
        return obs
    
    def get_vector_to_nearest_gap(self, current_pos):
        if not self.gaps:
            return np.zeros(2)  # Default to zero vector if no gaps identified

        nearest_gap = min(self.gaps, key=lambda gap: np.linalg.norm(np.array(gap) - np.array(current_pos)))
        direction_vector = np.array(nearest_gap) - np.array(current_pos)
        norm = np.linalg.norm(direction_vector)
        return direction_vector / norm if norm != 0 else direction_vector

    
    def find_gaps(self):
        width, height = self.env.width, self.env.height
        gaps = []
        # Typically, gaps will be in the walls that divide the rooms
        # We scan horizontal and vertical mid-lines for empty spaces
        mid_vertical = width // 2
        mid_horizontal = height // 2

        # Vertical mid-line (check for empty cells)
        for y in range(height):
            if self.env.grid.get(mid_vertical, y) is None:
                #print('vertical', self.env.grid.get(mid_vertical, y))
                #print('the gap vertical mid-line is',(mid_vertical, y) )
                gaps.append((mid_vertical, y))

        # Horizontal mid-line (check for empty cells)
        for x in range(width):
            if self.env.grid.get(x, mid_horizontal) is None:
                gaps.append((x, mid_horizontal))

        return gaps


    def get_goal_position(self):
        for x in range(self.env.width):
            for y in range(self.env.height):
                if self.env.grid.get(x, y) is not None and isinstance(self.env.grid.get(x, y), Goal):
                    return (x, y)
        return None
    
    
    def calculate_reward_shaping(self, current_pos):
        if not self.goal_pos:
            return 0
        
        epsilon = 0.01  # Small value to avoid division by zero
        prev_distance = np.linalg.norm(np.array(self.last_pos) - np.array(self.goal_pos))
        current_distance = np.linalg.norm(np.array(current_pos) - np.array(self.goal_pos))
        
        if current_distance < prev_distance:
            return 0.1 * (1 / (current_distance + epsilon))
        elif current_distance > prev_distance:
            return -0.1 * (1 / (current_distance + epsilon))
        return 0

    def additional_rewards(self, action, current_pos, prev_pos):
        prev_distance = np.linalg.norm(np.array(prev_pos) - np.array(self.goal_pos))
        current_distance = np.linalg.norm(np.array(current_pos) - np.array(self.goal_pos))
        distance_change = prev_distance - current_distance
        
        if action in [0, 1] and distance_change <= 0:
            return -0.05  # Unchanged, penalty for non-improvement
        
        reward = 0
        if action == 2 and distance_change > 0:
            reward = min(0.05 * distance_change, 1.0)  # Cap the reward to avoid excessively high values

        return reward
    

    def get_goal_direction(self, current_pos):
        if not self.goal_pos:
            return np.zeros_like(current_pos)  # Return a zero vector if goal position is not set
        # Assuming current_pos and goal_pos are numpy arrays or can be converted to them
        direction_vector = np.array(self.goal_pos) - np.array(current_pos)
        # Normalize the vector
        norm = np.linalg.norm(direction_vector)
        if norm == 0:
            return direction_vector  # Avoid division by zero if already at the goal
        return direction_vector / norm
    
    def noisy_action(self, action):
        # Add random noise to the action occasionally
        if random.random() < 0.1:  # With 10% probability, alter the action randomly
            return random.choice([0, 1, 2])  # Assuming actions are 0, 1, 2
        return action
    

    def get_possible_moves(self, current_pos):
        moves = []
        directions = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # Directions: up, down, left, right

        for dx, dy in directions:
            new_pos = (current_pos[0] + dx, current_pos[1] + dy)
            if self.is_position_valid(new_pos):
                moves.append(new_pos)
        return moves

    def is_position_valid(self, pos):
        x, y = pos
        # Check if within grid bounds
        if 0 <= x < self.env.width and 0 <= y < self.env.height:
            # Check if the cell is empty (None means empty in MiniGrid)
            return self.env.grid.get(x, y) is None
        return False
    
    def nudge_agent(self, current_pos):
        possible_moves = self.get_possible_moves(current_pos)
        if possible_moves:
            new_pos = random.choice(possible_moves)
            self.env.agent_pos = new_pos  # Assuming you can set the position
            # Simulate a no-op or a neutral action to update state and get observation
            obs, _, _, _, _ = self.env.step(0)  # Consider '0' as a no-op action, if applicable
            return obs
        # If no moves are possible or other issues arise, return a default observation
        return np.zeros(self.env.observation_space.shape)  # Adjust based on the specific obs space
    
    def get_room(self, position):
        x, y = position
        if x < 10 and y < 10:
            return 1  # Top-left room
        elif x >= 10 and y < 10:
            return 2  # Top-right room
        elif x < 10 and y >= 10:
            return 3  # Bottom-left room
        elif x >= 10 and y >= 10:
            return 4  # Bottom-right room
        return 0  # Should not happen unless position is out of expected range

In [20]:
def create_env():
    env = gym.make(env_id, render_mode="rgb_array",max_episode_steps=200)
    env = ActionWrapper(env)
    env = RewardShapingWrapper1(env)
    env = FlatObsWrapper(env)  
    return env

# Use the function in make_vec_env
env_vec = make_vec_env(create_env, n_envs=1)
env_vec2 = make_vec_env(create_env, n_envs=4)

In [21]:
model = DQN("MlpPolicy", env_vec2, verbose=1,
            learning_rate = 0.0001,
            buffer_size = 1000000,
            learning_starts = 100,
            batch_size = 32,
            tau = 1,
            gamma = 0.99,
            exploration_fraction = 0.1,
            exploration_initial_eps = 1,
            exploration_final_eps = 0.05,
            tensorboard_log="./dqnrewardrand/"
            )

# Train the model
model.learn(total_timesteps=200000)

Using cpu device
Logging to ./dqnrewardrand/DQN_2


  logger.warn(


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | -442     |
|    exploration_rate | 0.962    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1455     |
|    time_elapsed     | 0        |
|    total_timesteps  | 800      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.74     |
|    n_updates        | 43       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | -398     |
|    exploration_rate | 0.924    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1349     |
|    time_elapsed     | 1        |
|    total_timesteps  | 1600     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 2.05     |
|    n_updates      

<stable_baselines3.dqn.dqn.DQN at 0x217b821e8f0>

In [22]:
model.save("dqn_reward_empty")

In [23]:
model = DQN.load("dqn_reward_empty",env=env_vec,tensorboard_log="./dqnrewardrand/")

mean_reward, std_reward = evaluate_policy(model, env=env_vec, n_eval_episodes=50)
print(f"Mean Reward: {mean_reward} +/- {std_reward}")

  logger.warn(


Mean Reward: 0.0 +/- 0.0


In [24]:
num_episodes = 50
total_rewards = []  # List to store total rewards for each episode

for episode in range(num_episodes):
    obs = env_vec.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, info = env_vec.step(action)
        #print(done)
        total_reward += reward
        env_vec.render('human')  # Render the environment at each step
        #time.sleep(0.05)  # Adjust this to control the speed of the rendering
    total_rewards.append(total_reward)  # Store the total reward for this episode
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Calculate the average reward across all episodes
average_reward = sum(total_rewards) / num_episodes
print(f"Average Reward over {num_episodes} episodes: {average_reward}")

env_vec.close()  # Close the environment when done

Episode 1: Total Reward = [0.]
Episode 2: Total Reward = [0.]
Episode 3: Total Reward = [0.8901367]
Episode 4: Total Reward = [0.]
Episode 5: Total Reward = [0.]
Episode 6: Total Reward = [0.9727539]
Episode 7: Total Reward = [0.92441404]
Episode 8: Total Reward = [0.92089844]
Episode 9: Total Reward = [0.]
Episode 10: Total Reward = [0.9727539]
Episode 11: Total Reward = [0.9296875]
Episode 12: Total Reward = [0.]
Episode 13: Total Reward = [0.]
Episode 14: Total Reward = [0.8892578]
Episode 15: Total Reward = [0.]
Episode 16: Total Reward = [0.8901367]
Episode 17: Total Reward = [0.]
Episode 18: Total Reward = [-9.080859]
Episode 19: Total Reward = [0.]
Episode 20: Total Reward = [0.9578125]
Episode 21: Total Reward = [0.9015625]
Episode 22: Total Reward = [0.]
Episode 23: Total Reward = [0.96132815]
Episode 24: Total Reward = [0.8989258]
Episode 25: Total Reward = [0.9604492]
Episode 26: Total Reward = [0.]
Episode 27: Total Reward = [0.93056643]
Episode 28: Total Reward = [-9.04746

# Empty random room

In [6]:
register(
    id='custom_empty-v0',
    entry_point='emptyrandom:RandomGoalEmptyEnv',
    kwargs={}
)

In [7]:
#gym.pprint_registry()  # to see all registered environments

if 'custom_empty-v0' in gym.envs.registry:
    print("Environment 'custom_empty-v0' is registered.")
else:
    print("Environment 'custom_empty-v0' is NOT registered.")

Environment 'custom_empty-v0' is registered.


In [8]:
env_id2 = 'custom_empty-v0'
def create_env():
    env = gym.make(env_id2, render_mode="rgb_array",size=16,max_episode_steps=200)
    env = ActionWrapper(env)
    env = RewardShapingWrapper2(env)
    env = FlatObsWrapper(env)  
    return env

env_vec_ran = make_vec_env(create_env, n_envs=1)
env_vec_ran2 = make_vec_env(create_env, n_envs=4)

  logger.warn(


In [9]:
# Initialize the DQN model
model2 = DQN("MlpPolicy", env_vec_ran2, verbose=1,
            learning_rate = 0.0001,
            buffer_size = 1000000,
            learning_starts = 100,
            batch_size = 32,
            tau = 1,
            gamma = 0.99,
            exploration_fraction = 0.1,
            exploration_initial_eps = 1,
            exploration_final_eps = 0.05,
            tensorboard_log="./dqnrewardrandempty/"
            )

# Train the model
model2.learn(total_timesteps=200000)

Using cpu device
Logging to ./dqnrewardrandempty/DQN_1


  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 100      |
|    ep_rew_mean      | -172     |
|    exploration_rate | 0.981    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1066     |
|    time_elapsed     | 0        |
|    total_timesteps  | 400      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 3.1      |
|    n_updates        | 18       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 100      |
|    ep_rew_mean      | -181     |
|    exploration_rate | 0.962    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1173     |
|    time_elapsed     | 0        |
|    total_timesteps  | 800      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 3.19     |
|    n_updates      

<stable_baselines3.dqn.dqn.DQN at 0x26de1e058a0>

In [10]:
model2.save("dqn_reward_empty_ran")

In [11]:
model2 = DQN.load("dqn_reward_empty_ran",env=env_vec_ran,tensorboard_log="./dqnrewardrandempty/")

mean_reward, std_reward = evaluate_policy(model2, env=env_vec_ran, n_eval_episodes=50)
print(f"Mean Reward: {mean_reward} +/- {std_reward}")



Mean Reward: 437.69339864 +/- 198.2545935499547


In [12]:
num_episodes = 50
total_rewards = []  # List to store total rewards for each episode

for episode in range(num_episodes):
    obs = env_vec_ran.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model2.predict(obs)
        obs, reward, done, info = env_vec_ran.step(action)
        #print(done)
        total_reward += reward
        env_vec_ran.render('human')  # Render the environment at each step
        #time.sleep(0.05)  # Adjust this to control the speed of the rendering
    total_rewards.append(total_reward)  # Store the total reward for this episode
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Calculate the average reward across all episodes
average_reward = sum(total_rewards) / num_episodes
print(f"Average Reward over {num_episodes} episodes: {average_reward}")

env_vec_ran.close()  # Close the environment when done

Episode 1: Total Reward = [526.4231]
Episode 2: Total Reward = [557.40027]
Episode 3: Total Reward = [576.0323]
Episode 4: Total Reward = [640.2158]
Episode 5: Total Reward = [580.4591]
Episode 6: Total Reward = [557.3975]
Episode 7: Total Reward = [536.2653]
Episode 8: Total Reward = [61.391777]
Episode 9: Total Reward = [561.83276]
Episode 10: Total Reward = [535.928]
Episode 11: Total Reward = [32.111023]
Episode 12: Total Reward = [572.22614]
Episode 13: Total Reward = [627.6891]
Episode 14: Total Reward = [523.8772]
Episode 15: Total Reward = [159.00934]
Episode 16: Total Reward = [23.033628]
Episode 17: Total Reward = [-83.6429]
Episode 18: Total Reward = [104.32257]
Episode 19: Total Reward = [569.9144]
Episode 20: Total Reward = [109.1703]
Episode 21: Total Reward = [584.1961]
Episode 22: Total Reward = [579.50616]
Episode 23: Total Reward = [540.3355]
Episode 24: Total Reward = [516.7574]
Episode 25: Total Reward = [560.5838]
Episode 26: Total Reward = [565.55304]
Episode 27: 

# Now Four rooms

In [17]:
env_id3 = "MiniGrid-FourRooms-v0"
def create_env():
    env = gym.make(env_id3, render_mode="rgb_array")
    env = ActionWrapper(env)
    env = RewardShapingWrapper3(env)
    env = FlatObsWrapper(env)  
    return env

env_vec_four = make_vec_env(create_env, n_envs=1)
env_vec_four2 = make_vec_env(create_env, n_envs=4)

In [18]:
# Initialize the DQN model
model3 = DQN("MlpPolicy", env_vec_four2, verbose=1,
            learning_rate = 0.0001,
            buffer_size = 1000000,
            learning_starts = 100,
            batch_size = 32,
            tau = 1,
            gamma = 0.99,
            exploration_fraction = 0.1,
            exploration_initial_eps = 1,
            exploration_final_eps = 0.05,
            tensorboard_log="./dqnrewardfour/"
            )

# Train the model
model3.learn(total_timesteps=1000000)

Using cpu device
Logging to ./dqnrewardfour/DQN_2




----------------------------------
| rollout/            |          |
|    ep_len_mean      | 84.8     |
|    ep_rew_mean      | -2.17    |
|    exploration_rate | 0.997    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1028     |
|    time_elapsed     | 0        |
|    total_timesteps  | 344      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00159  |
|    n_updates        | 15       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 84.2     |
|    ep_rew_mean      | -2.3     |
|    exploration_rate | 0.994    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1039     |
|    time_elapsed     | 0        |
|    total_timesteps  | 680      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000397 |
|    n_updates      

<stable_baselines3.dqn.dqn.DQN at 0x26d8d71c250>

In [19]:
model3.save("dqn_reward_four")

In [20]:
model3 = DQN.load("dqn_reward_four",env=env_vec_four,tensorboard_log="./dqnrewardfour/")

mean_reward, std_reward = evaluate_policy(model3, env=env_vec_four, n_eval_episodes=50)
print(f"Mean Reward: {mean_reward} +/- {std_reward}")



Mean Reward: 0.4895082 +/- 1.1690605222019943


In [21]:
num_episodes = 50
total_rewards = []  # List to store total rewards for each episode

for episode in range(num_episodes):
    obs = env_vec_four.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model3.predict(obs)
        obs, reward, done, info = env_vec_four.step(action)
        #print(done)
        total_reward += reward
        env_vec_four.render('human')  # Render the environment at each step
        #time.sleep(0.05)  # Adjust this to control the speed of the rendering
    total_rewards.append(total_reward)  # Store the total reward for this episode
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Calculate the average reward across all episodes
average_reward = sum(total_rewards) / num_episodes
print(f"Average Reward over {num_episodes} episodes: {average_reward}")

env_vec_four.close()  # Close the environment when done

Episode 1: Total Reward = [-0.38729355]
Episode 2: Total Reward = [-0.07023036]
Episode 3: Total Reward = [-0.37541193]
Episode 4: Total Reward = [0.49189624]
Episode 5: Total Reward = [0.03009739]
Episode 6: Total Reward = [0.52255654]
Episode 7: Total Reward = [-0.44386178]
Episode 8: Total Reward = [-0.20010193]
Episode 9: Total Reward = [-0.05612034]
Episode 10: Total Reward = [-0.00029896]
Episode 11: Total Reward = [-0.20032798]
Episode 12: Total Reward = [0.07328919]
Episode 13: Total Reward = [-0.29448465]
Episode 14: Total Reward = [-0.32154104]
Episode 15: Total Reward = [0.03803198]
Episode 16: Total Reward = [-0.30327803]
Episode 17: Total Reward = [-0.21098684]
Episode 18: Total Reward = [0.14747807]
Episode 19: Total Reward = [-0.5880928]
Episode 20: Total Reward = [-0.15970244]
Episode 21: Total Reward = [-0.2416734]
Episode 22: Total Reward = [-0.19814318]
Episode 23: Total Reward = [9.918353]
Episode 24: Total Reward = [5.2451687]
Episode 25: Total Reward = [-0.3802792