# RL Test
## 1. Import

In [1]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

## 2. Load Env

In [11]:
env_name = 'CartPole-v1'
render_mode = None
env = gym.make(env_name,render_mode=render_mode) # Loads gymnasiums cart pole environment, render mode:("human","rgb_array","asci")

In [3]:
episodes =5
for episode in range(1,episodes+1):
    state = env.reset()
    terminated = False
    score = 0
    
    while not terminated:
        if render_mode:
            env.render()
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action) # 'terminated' and 'truncated' instead of 'done'
        score += reward
    print('Episode:{} Score:{}'.format(episode,score))
env.close()

Episode:1 Score:12.0
Episode:2 Score:17.0
Episode:3 Score:13.0
Episode:4 Score:11.0
Episode:5 Score:47.0


### 2.1 Understand Env

Observation Space:

    | Num | Observation           | Min                 | Max               |
    |-----|-----------------------|---------------------|-------------------|
    | 0   | Cart Position         | -4.8                | 4.8               |
    | 1   | Cart Velocity         | -Inf                | Inf               |
    | 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |
    | 3   | Pole Angular Velocity | -Inf                | Inf               |

Action Space:

    | Num | Action                 |
    |-----|------------------------|
    | 0   | Push cart to the left  |
    | 1   | Push cart to the right |

In [4]:
print(observation)

[ 0.05585453 -0.20431228  0.21620396  1.0914639 ]


## 3. Train Model

In [5]:
log_path = os.path.join('Training','Logs')

In [38]:
env = gym.make(env_name,render_mode=render_mode)
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path) # MlpPolicy: Multi layer perceptron Policy

Using cuda device


In [7]:
model.learn(total_timesteps=20000)

Logging to Training/Logs/PPO_4
-----------------------------
| time/              |      |
|    fps             | 834  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 693         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008619828 |
|    clip_fraction        | 0.0983      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00102     |
|    learning_rate        | 0.0003      |
|    loss                 | 6.86        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.015      |
|    value_loss           | 50.1        |
-----------------------------------------


KeyboardInterrupt: 

## 4. Save and Reload Model

In [8]:
PPO_Path=os.path.join('Training','Saved_Models','PPO_Model_Cartpole')

In [None]:
model.save(PPO_Path)

In [14]:
del model

In [39]:
model = PPO.load(PPO_Path,env=env)

## 5. Evaluation

In [16]:
evaluate_policy(model,env,n_eval_episodes=10,render=True if render_mode else False)

(500.0, 0.0)

In [17]:
env.close()

### 5.1 Test Model

In [37]:
episodes =5
for episode in range(1,episodes+1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        if render_mode:
            env.render()
        action,_ = model.predict(obs) # outputs (action,state)
        obs, reward, done, info = env.step(action) # Because of the wrapper back to 'done' instead of terminated and truncated
        score += reward
    print('Episode:{} Score:{}'.format(episode,score))
env.close()

Episode:1 Score:[422.]
Episode:2 Score:[500.]
Episode:3 Score:[278.]
Episode:4 Score:[444.]
Episode:5 Score:[200.]


### 5.2 View Logs in Tensorboard

In [41]:
training_log_path = os.path.join(log_path,'PPO_3')

In [45]:
#!tensorboard --logdir={training_log_path}

#### Core metrics to look at: 
1. Average Reward
2. Average Episode Length

#### Training strategies:
1. Train for longer
2. Hyperparameter Tuning

## 6. Add Callback to Training

In [46]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [49]:
save_path = os.path.join('Training','Saved_Models')

In [58]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=500,verbose=1)
eval_callback = EvalCallback(env,
                             callback_on_new_best=stop_callback,
                             eval_freq=4000,
                             best_model_save_path=save_path,
                             verbose=1)

In [51]:
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

Using cuda device


In [52]:
model.learn(total_timesteps=60000,callback=eval_callback)

Logging to Training/Logs/PPO_4
-----------------------------
| time/              |      |
|    fps             | 732  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 651         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008851534 |
|    clip_fraction        | 0.115       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.0117     |
|    learning_rate        | 0.0003      |
|    loss                 | 10.1        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0167     |
|    value_loss           | 57          |
-----------------------------------------
---



Eval num_timesteps=10000, episode_reward=256.60 +/- 126.15
Episode length: 256.60 +/- 126.15
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 257          |
|    mean_reward          | 257          |
| time/                   |              |
|    total_timesteps      | 10000        |
| train/                  |              |
|    approx_kl            | 0.0076232445 |
|    clip_fraction        | 0.0519       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.61        |
|    explained_variance   | 0.23         |
|    learning_rate        | 0.0003       |
|    loss                 | 16.4         |
|    n_updates            | 40           |
|    policy_gradient_loss | -0.0128      |
|    value_loss           | 63.3         |
------------------------------------------
New best mean reward!
------------------------------
| time/              |       |
|    fps             | 561   |
|    iterations   

<stable_baselines3.ppo.ppo.PPO at 0x7f2859eab850>

## 7. Change Policy

In [55]:
net_arch = dict(pi=[128,128,128,128],vf=[128,128,128,128])

In [59]:
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path,policy_kwargs={'net_arch':net_arch})

Using cuda device


In [60]:
model.learn(total_timesteps=60000,callback=eval_callback)

Logging to Training/Logs/PPO_6
-----------------------------
| time/              |      |
|    fps             | 779  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------




Eval num_timesteps=4000, episode_reward=307.80 +/- 173.23
Episode length: 307.80 +/- 173.23
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 308          |
|    mean_reward          | 308          |
| time/                   |              |
|    total_timesteps      | 4000         |
| train/                  |              |
|    approx_kl            | 0.0141510675 |
|    clip_fraction        | 0.219        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.681       |
|    explained_variance   | 0.00159      |
|    learning_rate        | 0.0003       |
|    loss                 | 2.34         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0258      |
|    value_loss           | 17.9         |
------------------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 553  |
|    iterations      |

<stable_baselines3.ppo.ppo.PPO at 0x7f285872c310>

## 8. Use an Alternate Algorithm

In [61]:
from stable_baselines3 import DQN

In [62]:
model = DQN('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

Using cuda device


In [63]:
model.learn(total_timesteps=20000)

Logging to Training/Logs/DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.956    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4184     |
|    time_elapsed     | 0        |
|    total_timesteps  | 92       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.917    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 5221     |
|    time_elapsed     | 0        |
|    total_timesteps  | 174      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.882    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 5675     |
|    time_elapsed     | 0        |
|    total_timesteps  | 249      |
----------------------------------
------------------------

<stable_baselines3.dqn.dqn.DQN at 0x7f285b39d610>