# Vanilla PPO

PPO is trained and evaluated on three MiniGrid environments:

Empty Room

Empty Room Random

Four Rooms

In [1]:
import gymnasium as gym
import minigrid
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from minigrid.wrappers import FlatObsWrapper
import time
import numpy as np
from minigrid.core.world_object import Goal
import random
from stable_baselines3.common.callbacks import BaseCallback
from gymnasium.envs.registration import register

pygame 2.5.2 (SDL 2.28.3, Python 3.10.13)
Hello from the pygame community. https://www.pygame.org/contribute.html


# First: Empty Room

In [2]:
# Gymnasium ID of the first environment (16x16 MiniGrid empty room).
# The environment itself is built and wrapped in the next cell.
env_id = "MiniGrid-Empty-16x16-v0"

In [3]:
def create_env():
    """Factory for one `env_id` environment wrapped in `FlatObsWrapper`.

    Takes no arguments so it can be passed to `make_vec_env` as a callable.
    The environment ID is read from the module-level `env_id` when called.

    Returns:
        The wrapped environment, created with render_mode="rgb_array".
    """
    # Optional step cap, left disabled: max_episode_steps=200
    base_env = gym.make(env_id, render_mode="rgb_array")
    return FlatObsWrapper(base_env)


# Vectorised environments built from the factory:
#   env_vec  - one sub-env, used for evaluation and rendering
#   env_vec2 - four sub-envs, used for training
env_vec = make_vec_env(create_env, n_envs=1)
env_vec2 = make_vec_env(create_env, n_envs=4)

In [4]:
# PPO with an MLP policy on the 4-env vectorised empty room.
# verbose=1 prints the rollout/train tables; the same stats go to TensorBoard.
model = PPO(
    "MlpPolicy",
    env_vec2,
    verbose=1,
    tensorboard_log="log/ppo_vanilla_empty_no_step_cap/",
)

# Train for 200k environment steps in total.
model.learn(total_timesteps=200_000)

Using cpu device
Logging to log/ppo_vanilla_empty_no_step_cap/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 0.0334   |
| time/              |          |
|    fps             | 5271     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.01e+03    |
|    ep_rew_mean          | 0.0167      |
| time/                   |             |
|    fps                  | 2836        |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.019228809 |
|    clip_fraction        | 0.164       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.93       |
|   

<stable_baselines3.ppo.ppo.PPO at 0x175b51240>

In [5]:
# Persist the trained empty-room policy; the next cell reloads it from this path.
model.save("models/ppo_vanilla_empty_no_step_cap")

In [6]:
# Reload the saved policy and attach it to the single-env vectorised environment.
model = PPO.load(
    "models/ppo_vanilla_empty_no_step_cap",
    env=env_vec,
)

# Mean and standard deviation of the episode return over 50 evaluation episodes.
mean_reward, std_reward = evaluate_policy(
    model,
    env=env_vec,
    n_eval_episodes=50,
)
print(f"Mean Reward: {mean_reward} +/- {std_reward}")

Mean Reward: 0.9762699999999999 +/- 1.1102230246251565e-16


In [7]:
# Roll out the trained policy for a fixed number of episodes, rendering every
# step, then report each episode's return and the average.
num_episodes = 50
total_rewards = []  # One scalar return per episode

for episode in range(num_episodes):
    obs = env_vec.reset()
    done = False
    total_reward = 0.0
    while not done:
        # NOTE: predict() samples actions by default (deterministic=False),
        # whereas evaluate_policy() defaults to deterministic=True, so the
        # returns printed here can differ from the evaluate_policy() numbers.
        action, _ = model.predict(obs)
        obs, rewards, dones, info = env_vec.step(action)
        # VecEnv.step() returns batched arrays with one entry per sub-env.
        # env_vec holds a single sub-env, so take index 0 and keep scalars;
        # otherwise the totals silently become shape-(1,) arrays.
        total_reward += float(rewards[0])
        done = bool(dones[0])
        env_vec.render('human')  # Render the environment at each step
        #time.sleep(0.05)  # Adjust this to control the speed of the rendering
    total_rewards.append(total_reward)  # Store the total reward for this episode
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Calculate the average reward across all episodes
average_reward = sum(total_rewards) / num_episodes
print(f"Average Reward over {num_episodes} episodes: {average_reward}")

env_vec.close()  # Close the environment when done

Episode 1: Total Reward = [0.97626954]
Episode 2: Total Reward = [0.97626954]
Episode 3: Total Reward = [0.97626954]
Episode 4: Total Reward = [0.9753906]
Episode 5: Total Reward = [0.9753906]
Episode 6: Total Reward = [0.97626954]
Episode 7: Total Reward = [0.97626954]
Episode 8: Total Reward = [0.97626954]
Episode 9: Total Reward = [0.97451174]
Episode 10: Total Reward = [0.97626954]
Episode 11: Total Reward = [0.97626954]
Episode 12: Total Reward = [0.97626954]
Episode 13: Total Reward = [0.97626954]
Episode 14: Total Reward = [0.97626954]
Episode 15: Total Reward = [0.9753906]
Episode 16: Total Reward = [0.97451174]
Episode 17: Total Reward = [0.97626954]
Episode 18: Total Reward = [0.97626954]
Episode 19: Total Reward = [0.97626954]
Episode 20: Total Reward = [0.97626954]
Episode 21: Total Reward = [0.97626954]
Episode 22: Total Reward = [0.9753906]
Episode 23: Total Reward = [0.97626954]
Episode 24: Total Reward = [0.9727539]
Episode 25: Total Reward = [0.97626954]
Episode 26: To

# Empty random room

In [8]:
# Register the custom environment so gym.make('custom_empty-v0') can build it.
# The entry point names class RandomGoalEmptyEnv in the local module emptyrandom.
register(id='custom_empty-v0', entry_point='emptyrandom:RandomGoalEmptyEnv', kwargs={})

In [9]:
#gym.pprint_registry()  # to see all registered environments

# Sanity check: confirm the registration in the previous cell took effect.
if 'custom_empty-v0' not in gym.envs.registry:
    print("Environment 'custom_empty-v0' is NOT registered.")
else:
    print("Environment 'custom_empty-v0' is registered.")

Environment 'custom_empty-v0' is registered.


In [10]:
# ID of the custom environment registered above.
env_id2 = 'custom_empty-v0'


def create_env():
    """Factory for one `env_id2` environment (size=16) wrapped in `FlatObsWrapper`.

    Rebinds the name `create_env`, replacing the empty-room factory defined
    earlier in the notebook. Takes no arguments so it can be passed to
    `make_vec_env`.

    Returns:
        The wrapped environment, created with render_mode="rgb_array".
    """
    # presumably an empty room with a randomly placed goal, judging by the
    # entry-point class name RandomGoalEmptyEnv - verify in emptyrandom.py
    base_env = gym.make(env_id2, render_mode="rgb_array", size=16)
    return FlatObsWrapper(base_env)


# One sub-env for evaluation/rendering, four sub-envs for training.
env_vec_ran = make_vec_env(create_env, n_envs=1)
env_vec_ran2 = make_vec_env(create_env, n_envs=4)

In [11]:
# PPO with an MLP policy on the 4-env vectorised random empty room.
# verbose=1 prints the rollout/train tables; the same stats go to TensorBoard.
model2 = PPO(
    "MlpPolicy",
    env_vec_ran2,
    verbose=1,
    tensorboard_log="log/ppo_vanilla_empty_random_no_step_cap",
)

# Train for 200k environment steps in total.
model2.learn(total_timesteps=200_000)

Using cpu device
Logging to log/ppo_vanilla_empty_random_no_step_cap/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 564      |
|    ep_rew_mean     | 0.465    |
| time/              |          |
|    fps             | 5227     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 757         |
|    ep_rew_mean          | 0.272       |
| time/                   |             |
|    fps                  | 2683        |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.010537488 |
|    clip_fraction        | 0.0884      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.94      

<stable_baselines3.ppo.ppo.PPO at 0x32da7f790>

In [12]:
# Persist the trained random-empty-room policy; the next cell reloads it from this path.
model2.save("models/ppo_vanilla_empty_random_no_step_cap")

In [13]:
# Reload the saved policy and attach it to the single-env vectorised environment.
model2 = PPO.load(
    "models/ppo_vanilla_empty_random_no_step_cap",
    env=env_vec_ran,
)

# Mean and standard deviation of the episode return over 50 evaluation episodes.
mean_reward, std_reward = evaluate_policy(
    model2,
    env=env_vec_ran,
    n_eval_episodes=50,
)
print(f"Mean Reward: {mean_reward} +/- {std_reward}")

Mean Reward: 0.39618552 +/- 0.4852366355860711


In [14]:
# Roll out the trained policy for a fixed number of episodes, rendering every
# step, then report each episode's return and the average.
num_episodes = 50
total_rewards = []  # One scalar return per episode

for episode in range(num_episodes):
    obs = env_vec_ran.reset()
    done = False
    total_reward = 0.0
    while not done:
        # NOTE: predict() samples actions by default (deterministic=False),
        # whereas evaluate_policy() defaults to deterministic=True, so the
        # returns printed here can differ from the evaluate_policy() numbers.
        action, _ = model2.predict(obs)
        obs, rewards, dones, info = env_vec_ran.step(action)
        # VecEnv.step() returns batched arrays with one entry per sub-env.
        # env_vec_ran holds a single sub-env, so take index 0 and keep scalars;
        # otherwise the totals silently become shape-(1,) arrays.
        total_reward += float(rewards[0])
        done = bool(dones[0])
        env_vec_ran.render('human')  # Render the environment at each step
        #time.sleep(0.05)  # Adjust this to control the speed of the rendering
    total_rewards.append(total_reward)  # Store the total reward for this episode
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Calculate the average reward across all episodes
average_reward = sum(total_rewards) / num_episodes
print(f"Average Reward over {num_episodes} episodes: {average_reward}")

env_vec_ran.close()  # Close the environment when done

Episode 1: Total Reward = [0.9850586]
Episode 2: Total Reward = [0.9586914]
Episode 3: Total Reward = [0.9639648]
Episode 4: Total Reward = [0.9736328]
Episode 5: Total Reward = [0.9833008]
Episode 6: Total Reward = [0.9542969]
Episode 7: Total Reward = [0.9490234]
Episode 8: Total Reward = [0.99560547]
Episode 9: Total Reward = [0.9824219]
Episode 10: Total Reward = [0.971875]
Episode 11: Total Reward = [0.9595703]
Episode 12: Total Reward = [0.9885742]
Episode 13: Total Reward = [0.9868164]
Episode 14: Total Reward = [0.99472654]
Episode 15: Total Reward = [0.96132815]
Episode 16: Total Reward = [0.9657227]
Episode 17: Total Reward = [0.9446289]
Episode 18: Total Reward = [0.99208987]
Episode 19: Total Reward = [0.9868164]
Episode 20: Total Reward = [0.9727539]
Episode 21: Total Reward = [0.99121094]
Episode 22: Total Reward = [0.96132815]
Episode 23: Total Reward = [0.97451174]
Episode 24: Total Reward = [0.97978514]
Episode 25: Total Reward = [0.86025393]
Episode 26: Total Reward =

# Now Four Rooms

In [15]:
# Gymnasium ID of the MiniGrid four-rooms environment.
env_id3 = "MiniGrid-FourRooms-v0"


def create_env():
    """Factory for one `env_id3` environment wrapped in `FlatObsWrapper`.

    Rebinds the name `create_env`, replacing the factory defined earlier in
    the notebook. Takes no arguments so it can be passed to `make_vec_env`.

    Returns:
        The wrapped environment, created with render_mode="rgb_array".
    """
    base_env = gym.make(env_id3, render_mode="rgb_array")
    return FlatObsWrapper(base_env)


# One sub-env for evaluation/rendering, four sub-envs for training.
env_vec_four = make_vec_env(create_env, n_envs=1)
env_vec_four2 = make_vec_env(create_env, n_envs=4)

In [16]:
# PPO (not DQN) with an MLP policy on the 4-env vectorised four-rooms environment.
# verbose=1 prints the rollout/train tables; the same stats go to TensorBoard.
model3 = PPO(
    "MlpPolicy",
    env_vec_four2,
    verbose=1,
    tensorboard_log="log/ppo_vanilla_four_no_step_cap",
)

# Train for 1M environment steps in total (5x the budget of the empty-room runs).
model3.learn(total_timesteps=1_000_000)

Using cpu device
Logging to log/ppo_vanilla_four_no_step_cap/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 99       |
|    ep_rew_mean     | 0.0099   |
| time/              |          |
|    fps             | 4562     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 99.7        |
|    ep_rew_mean          | 0.00406     |
| time/                   |             |
|    fps                  | 2576        |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.017583378 |
|    clip_fraction        | 0.167       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.93       |
|    

<stable_baselines3.ppo.ppo.PPO at 0x32daf5930>

In [17]:
# Persist the trained four-rooms policy; the next cell reloads it from this path.
model3.save("models/ppo_vanilla_four_no_step_cap")

In [18]:
# Reload the saved policy and attach it to the single-env vectorised environment.
model3 = PPO.load(
    "models/ppo_vanilla_four_no_step_cap",
    env=env_vec_four,
)

# Mean and standard deviation of the episode return over 50 evaluation episodes.
mean_reward, std_reward = evaluate_policy(
    model3,
    env=env_vec_four,
    n_eval_episodes=50,
)
print(f"Mean Reward: {mean_reward} +/- {std_reward}")

Mean Reward: 0.17768 +/- 0.3591202272220266


In [19]:
# Roll out the trained policy for a fixed number of episodes, rendering every
# step, then report each episode's return and the average.
num_episodes = 50
total_rewards = []  # One scalar return per episode

for episode in range(num_episodes):
    obs = env_vec_four.reset()
    done = False
    total_reward = 0.0
    while not done:
        # NOTE: predict() samples actions by default (deterministic=False),
        # whereas evaluate_policy() defaults to deterministic=True, so the
        # returns printed here can differ from the evaluate_policy() numbers.
        action, _ = model3.predict(obs)
        obs, rewards, dones, info = env_vec_four.step(action)
        # VecEnv.step() returns batched arrays with one entry per sub-env.
        # env_vec_four holds a single sub-env, so take index 0 and keep scalars;
        # otherwise the totals silently become shape-(1,) arrays.
        total_reward += float(rewards[0])
        done = bool(dones[0])
        env_vec_four.render('human')  # Render the environment at each step
        #time.sleep(0.05)  # Adjust this to control the speed of the rendering
    total_rewards.append(total_reward)  # Store the total reward for this episode
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Calculate the average reward across all episodes
average_reward = sum(total_rewards) / num_episodes
print(f"Average Reward over {num_episodes} episodes: {average_reward}")

env_vec_four.close()  # Close the environment when done

Episode 1: Total Reward = [0.748]
Episode 2: Total Reward = [0.]
Episode 3: Total Reward = [0.577]
Episode 4: Total Reward = [0.]
Episode 5: Total Reward = [0.604]
Episode 6: Total Reward = [0.865]
Episode 7: Total Reward = [0.955]
Episode 8: Total Reward = [0.307]
Episode 9: Total Reward = [0.928]
Episode 10: Total Reward = [0.]
Episode 11: Total Reward = [0.91]
Episode 12: Total Reward = [0.883]
Episode 13: Total Reward = [0.]
Episode 14: Total Reward = [0.442]
Episode 15: Total Reward = [0.]
Episode 16: Total Reward = [0.775]
Episode 17: Total Reward = [0.]
Episode 18: Total Reward = [0.757]
Episode 19: Total Reward = [0.]
Episode 20: Total Reward = [0.]
Episode 21: Total Reward = [0.766]
Episode 22: Total Reward = [0.703]
Episode 23: Total Reward = [0.127]
Episode 24: Total Reward = [0.919]
Episode 25: Total Reward = [0.595]
Episode 26: Total Reward = [0.838]
Episode 27: Total Reward = [0.307]
Episode 28: Total Reward = [0.]
Episode 29: Total Reward = [0.109]
Episode 30: Total Rewa