# **PPO Hyperparameter Tuning**

# Import Libraries

In [None]:
import gym
import time
import torch
from torch import nn
import numpy as np
import pandas as pd
import cufflinks as cf
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from stable_baselines3.common.logger import configure
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [12]:
# For notebooks: initialise plotly's offline renderer so figures display inline.
# NOTE(review): connected=True is expected to load plotly.js from the CDN rather
# than embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected=True)
# For offline use: switch cufflinks to plotly's offline plotting mode.
cf.go_offline()

In [7]:
# Environment
from treasure_env import *

# PPO Training 1

In [8]:
# Training environment: this single instance is passed to all three PPO runs below.
env = TreasureHuntEnv(size=9, max_steps=500, lives=3)

In [9]:
# Run 1: write this run's metrics to its own folder as CSV and TensorBoard files.
logger = configure(
    "../Tensorboard logs/ppo_logs/PPO Tuning 1",
    ["csv", "tensorboard"],
)

# Hyperparameters under test in this run.
run_1_hparams = dict(
    learning_rate=0.001,
    gamma=0.95,
    n_steps=2048,
    batch_size=16,
    n_epochs=15,
    clip_range=0.5,
    ent_coef=0.03,
    vf_coef=0.5,
    normalize_advantage=True,
    max_grad_norm=0.5,
    seed=420,
)

model = PPO(
    "MultiInputPolicy",
    env,
    verbose=2,
    tensorboard_log='../Tensorboard logs/ppo_logs/',
    **run_1_hparams,
)
# Attach the custom logger before training so progress.csv lands in the run folder.
model.set_logger(logger)
model.learn(total_timesteps=100000)
print("Training is done")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.



You provided an OpenAI Gym environment. We strongly recommend transitioning to Gymnasium environments. Stable-Baselines3 is automatically wrapping your environments in a compatibility layer, which could potentially cause issues.



Training is done


# PPO Training 2

In [10]:
# Run 2: write this run's metrics to its own folder as CSV and TensorBoard files.
logger = configure(
    "../Tensorboard logs/ppo_logs/PPO Tuning 2",
    ["csv", "tensorboard"],
)

# Hyperparameters under test in this run.
run_2_hparams = dict(
    learning_rate=0.0005,
    gamma=0.98,
    n_steps=1024,
    batch_size=32,
    n_epochs=10,
    clip_range=0.3,
    ent_coef=0.02,
    vf_coef=0.8,
    normalize_advantage=True,
    max_grad_norm=0.5,
    seed=420,
)

model_2 = PPO(
    "MultiInputPolicy",
    env,
    verbose=2,
    tensorboard_log='../Tensorboard logs/ppo_logs/',
    **run_2_hparams,
)
# Attach the custom logger before training so progress.csv lands in the run folder.
model_2.set_logger(logger)
model_2.learn(total_timesteps=100000)
print("Model 2 Training is done")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.



You provided an OpenAI Gym environment. We strongly recommend transitioning to Gymnasium environments. Stable-Baselines3 is automatically wrapping your environments in a compatibility layer, which could potentially cause issues.



Model 2 Training is done


# PPO Training 3

In [11]:
# Run 3: write this run's metrics to its own folder as CSV and TensorBoard files.
logger = configure(
    "../Tensorboard logs/ppo_logs/PPO Tuning 3",
    ["csv", "tensorboard"],
)

# Hyperparameters under test in this run.
run_3_hparams = dict(
    learning_rate=0.0001,
    gamma=0.99,
    n_steps=512,
    batch_size=64,
    n_epochs=20,
    clip_range=0.2,
    ent_coef=0.01,
    vf_coef=1.0,
    normalize_advantage=True,
    max_grad_norm=0.5,
    seed=420,
)

model_3 = PPO(
    "MultiInputPolicy",
    env,
    verbose=2,
    tensorboard_log='../Tensorboard logs/ppo_logs/',
    **run_3_hparams,
)
# Attach the custom logger before training so progress.csv lands in the run folder.
model_3.set_logger(logger)
model_3.learn(total_timesteps=100000)
print("Model 3 Training is done")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Model 3 Training is done


# Results

In [13]:
# Load the CSV training logs written by the three tuning runs, in run order.
progress_files = (
    "../Tensorboard logs/ppo_logs/PPO Tuning 1/progress.csv",
    "../Tensorboard logs/ppo_logs/PPO Tuning 2/progress.csv",
    "../Tensorboard logs/ppo_logs/PPO Tuning 3/progress.csv",
)
df_1, df_2, df_3 = map(pd.read_csv, progress_files)

In [15]:
# Compare the mean rollout episode reward of the three tuning runs.
#
# The runs were trained with different n_steps (2048 / 1024 / 512), so one
# "iteration" covers a different number of environment steps in each run.
# Plotting against total timesteps puts the three curves on a common x-axis.
reward_runs = [
    ("Model 1", df_1, "blue"),
    ("Model 2", df_2, "green"),
    ("Model 3", df_3, "red"),
]

fig = go.Figure()
for run_name, run_df, run_color in reward_runs:
    fig.add_trace(go.Scatter(
        x=run_df['time/total_timesteps'],
        y=run_df['rollout/ep_rew_mean'],
        mode='lines',
        name=run_name,
        line=dict(color=run_color),
    ))
fig.update_layout(
    title='Rollout Episode Reward Mean vs Timesteps',
    xaxis_title='Total Timesteps',
    yaxis_title='Episode Reward Mean',
    legend_title='Model',
)
fig.show()

In [16]:
# Compare the value-function loss of the three tuning runs.
#
# The runs were trained with different n_steps (2048 / 1024 / 512), so one
# "iteration" covers a different number of environment steps in each run.
# Plotting against total timesteps puts the three curves on a common x-axis.
value_loss_runs = [
    ("Model 1", df_1),
    ("Model 2", df_2),
    ("Model 3", df_3),
]

fig = go.Figure()
for run_name, run_df in value_loss_runs:
    fig.add_trace(go.Scatter(
        x=run_df['time/total_timesteps'],
        y=run_df['train/value_loss'],
        mode='lines',
        name=run_name,
    ))
fig.update_layout(
    title='Value Loss vs Timesteps',
    xaxis_title='Total Timesteps',
    yaxis_title='Value Loss',
    legend_title='Models',
)
fig.show()

=> Best model: Model 1 (PPO Tuning 1)

In [17]:
# Persist each trained agent under its own checkpoint name.
trained_models = (
    (model, "../Models/PPO Tuning/PPO_Tuning_1"),
    (model_2, "../Models/PPO Tuning/PPO_Tuning_2"),
    (model_3, "../Models/PPO Tuning/PPO_Tuning_3"),
)
for agent, save_path in trained_models:
    agent.save(save_path)
print("Models are saved")

Models are saved


# Testing Best Model

In [18]:
# Reload the run-1 checkpoint and play one rendered episode with it.
model = PPO.load("../Models/PPO Tuning/PPO_Tuning_1")
# Separate environment for evaluation, created with on-screen rendering.
envir = TreasureHuntEnv(size=9, max_steps=200, lives=3, render_mode="human")

total_reward = 0
try:
    obs, _ = envir.reset()
    done = False
    while not done:
        # Deterministic prediction: same observation always yields the same action.
        action, _states = model.predict(obs, deterministic=True)
        action = int(action)  # predict() returns an array; the env is stepped with a plain int
        # NOTE(review): assumes step() returns (obs, reward, terminated, truncated, info)
        # in that order — confirm against TreasureHuntEnv. Stopping on either flag
        # prevents an endless loop when the episode ends by the step limit.
        obs, reward, terminated, truncated, _ = envir.step(action)
        done = terminated or truncated
        total_reward += reward
        envir.render()
        time.sleep(0.3)  # slow playback so each move is visible
finally:
    # Close the evaluation environment (not the training `env`), even if the
    # episode is interrupted or raises.
    envir.close()
print(f"Total reward: {total_reward}")

Total reward: 148
