# **PPO**

# Import Libraries

In [1]:
import gym
import time
import torch
from torch import nn
import numpy as np
import pandas as pd
import cufflinks as cf
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from stable_baselines3.common.logger import configure
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
# Enable inline Plotly rendering in the notebook (connected=True is passed
# to plotly.offline.init_notebook_mode).
init_notebook_mode(connected=True)
# Put cufflinks into offline mode.
cf.go_offline()

In [12]:
# Environment
from treasure_env import *

# PPO Training

In [13]:
# Environment instance shared by the PPO training cell and the
# evaluate_policy cell below.
# NOTE(review): the meaning of size / max_steps / lives is defined in
# treasure_env — confirm there (presumably grid size, step limit per episode
# and number of lives).
env = TreasureHuntEnv(size=9, max_steps=500, lives=3)

In [None]:
# Route training metrics to a CSV file and to TensorBoard event files.
logger = configure(
    "../Tensorboard logs/ppo_logs/PPO",
    ["csv", "tensorboard"],
)

# PPO hyperparameters, grouped so they can be read and tuned in one place.
ppo_hyperparameters = dict(
    learning_rate=0.0003,
    gamma=0.99,
    n_steps=2048,
    batch_size=16,
    n_epochs=15,
    clip_range=0.5,
    ent_coef=0.03,
    vf_coef=0.5,
    normalize_advantage=True,
    max_grad_norm=0.5,
)

# Build the agent on the shared `env`; the seed is fixed for repeatable runs.
model = PPO(
    "MultiInputPolicy",
    env,
    verbose=2,
    tensorboard_log='../Tensorboard logs/ppo_logs/',
    seed=420,
    **ppo_hyperparameters,
)

# Attach the logger configured above, then train.
model.set_logger(logger)
model.learn(total_timesteps=800000)
print("Training is done")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




Training is done


In [6]:
# Persist the trained agent; the same path is read back by PPO.load in the
# "Policy Evaluation" and "Testing" cells.
model.save("../Models/PPO/PPO_model_6")
print("Model is saved")

Model is saved


# Results

In [3]:
# Load the logged training metrics. The file lives in the directory the
# training cell passed to configure() with the "csv" output format.
df = pd.read_csv("../Tensorboard logs/ppo_logs/PPO/progress.csv")

In [4]:
# Mean rollout episode reward per logged training iteration.
reward_axis_labels = {
    'time/iterations': 'Iterations',
    'rollout/ep_rew_mean': 'Episode Reward Mean',
}
fig = px.line(
    df,
    x='time/iterations',
    y='rollout/ep_rew_mean',
    title='Rollout Episode Reward Mean vs Iterations',
    labels=reward_axis_labels,
)
fig.show()

In [5]:
# Overlay the value loss and the total training loss on one set of axes.
loss_series = (
    ('train/value_loss', 'Value Loss'),
    ('train/loss', 'Training Loss'),
)

fig = go.Figure()
for loss_column, trace_name in loss_series:
    # One line trace per logged loss column, added in the order listed above.
    fig.add_trace(
        go.Scatter(
            x=df['time/iterations'],
            y=df[loss_column],
            mode='lines',
            name=trace_name,
        )
    )

fig.update_layout(
    title='Training Loss/Value Loss vs Iterations',
    xaxis_title='Iterations',
    yaxis_title='Loss',
    legend_title='Loss Types',
)

fig.show()

In [6]:
# Entropy loss per logged training iteration.
entropy_axis_labels = {
    'time/iterations': 'Iterations',
    'train/entropy_loss': 'Training Entropy Loss',
}
fig = px.line(
    df,
    x='time/iterations',
    y='train/entropy_loss',
    title='Training Entropy Loss vs Iterations',
    labels=entropy_axis_labels,
)
fig.show()

In [7]:
# Policy-gradient loss per logged training iteration.
pg_loss_axis_labels = {
    'time/iterations': 'Iterations',
    'train/policy_gradient_loss': 'Training Policy Gradient Loss',
}
fig = px.line(
    df,
    x='time/iterations',
    y='train/policy_gradient_loss',
    title='Training Policy Gradient Loss vs Iterations',
    labels=pg_loss_axis_labels,
)
fig.show()

In [8]:
# Explained variance per logged training iteration.
explained_var_axis_labels = {
    'time/iterations': 'Iterations',
    'train/explained_variance': 'Training Explained Variance',
}
fig = px.line(
    df,
    x='time/iterations',
    y='train/explained_variance',
    title='Training Explained Variance vs Iterations',
    labels=explained_var_axis_labels,
)
fig.show()

# Policy Evaluation

In [14]:
# Reload the agent from the path written by the save cell, so evaluation
# uses the saved model rather than whatever `model` is left in the kernel.
model = PPO.load("../Models/PPO/PPO_model_6")

In [16]:
# Roll out the loaded policy on `env` for a fixed number of episodes and
# report the mean and standard deviation of the episode rewards.
num_eval_episodes = 100
evaluation_stats = evaluate_policy(
    model,
    env,
    n_eval_episodes=num_eval_episodes,
)
mean_reward, std_reward = evaluation_stats
print(f"Mean Reward: {mean_reward:.2f} ± {std_reward:.2f}")

Mean Reward: 532.83 ± 266.25


# Testing

In [18]:
# Replay a single episode of the saved policy with on-screen rendering and
# report the accumulated reward.
model = PPO.load("../Models/PPO/PPO_model_6")
# Separate environment for visualisation (render_mode="human"); the shared
# `env` used for training/evaluation is left untouched by this cell.
envir = TreasureHuntEnv(size=9, max_steps=200, lives=3, render_mode="human")

total_reward = 0
try:
    obs, _ = envir.reset()
    done = False

    while not done:
        # deterministic=True is passed so the rollout does not sample actions.
        action, _states = model.predict(obs, deterministic=True)
        action = int(action)  # hand step() a plain Python int
        obs, reward, terminated, truncated, _ = envir.step(action)
        # step() returns two end-of-episode flags; stop on either one.
        # Ignoring `truncated` would keep looping after a truncation.
        done = terminated or truncated
        total_reward += reward
        envir.render()
        time.sleep(0.3)  # pause between steps so the rendering can be followed
finally:
    # Close the rendering environment created in this cell, even if the
    # rollout raises. The original closed the shared `env` instead, leaving
    # `envir` open.
    envir.close()
print(f"Total reward: {total_reward}")

Total reward: 312
