# A Gentle Introduction to Reinforcement Learning 

# Reinforcement Learning with Python

## 1. Installing and Importing Dependencies

In [1]:
!pip install 'stable-baselines3[extra]'
!pip install wandb



In [2]:
import gym
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

## 2. Load Environment

In [3]:
# DummyVecEnv receives a list of factory callables; create the CarRacing
# environment inside the factory instead of binding it to `env` first.
env = DummyVecEnv([lambda: gym.make('CarRacing-v0')])

## 3. Train RL Model

In [4]:
# Directory that receives the TensorBoard event files for this training run.
log_path = os.path.join('Training', 'Logs')

# Pass log_path to PPO so the directory defined above is actually used;
# previously it was computed but never handed to the model.
# NOTE(review): SB3 reports wrapping the env in a VecTransposeImage, i.e. the
# observations are images; 'CnnPolicy' may be a better fit than 'MlpPolicy'
# here -- confirm before switching, since it changes the trained model.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

# Train for 20k environment steps; learn() returns the model itself, which is
# what the cell displays.
model.learn(total_timesteps=20000)

Using cpu device
Wrapping the env in a VecTransposeImage.
Track generation: 1108..1396 -> 288-tiles track
Track generation: 1188..1489 -> 301-tiles track
Track generation: 1211..1518 -> 307-tiles track
-----------------------------
| time/              |      |
|    fps             | 177  |
|    iterations      | 1    |
|    time_elapsed    | 11   |
|    total_timesteps | 2048 |
-----------------------------
Track generation: 1136..1424 -> 288-tiles track
Track generation: 1084..1359 -> 275-tiles track
-----------------------------------------
| time/                   |             |
|    fps                  | 133         |
|    iterations           | 2           |
|    time_elapsed         | 30          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008181527 |
|    clip_fraction        | 0.0928      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.23       |
|    explained_variance   | 0.

<stable_baselines3.ppo.ppo.PPO at 0x1449dfd90>

## 4. Evaluate the RL Model

In [5]:
# Run the trained policy for 10 rendered episodes and display the
# (mean reward, reward std) pair returned by evaluate_policy.
baseline_mean_reward, baseline_std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    render=True,
)
(baseline_mean_reward, baseline_std_reward)



Track generation: 1039..1305 -> 266-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1079..1353 -> 274-tiles track
Track generation: 1152..1444 -> 292-tiles track
Track generation: 1011..1268 -> 257-tiles track
Track generation: 1088..1364 -> 276-tiles track
Track generation: 1159..1453 -> 294-tiles track
Track generation: 1084..1368 -> 284-tiles track
Track generation: 1228..1539 -> 311-tiles track
Track generation: 1023..1290 -> 267-tiles track
Track generation: 1131..1418 -> 287-tiles track
Track generation: 1090..1367 -> 277-tiles track
Track generation: 1060..1328 -> 268-tiles track


(-45.58824533745646, 12.061828955619957)

# Tracking your Reinforcement Learning models with Weights and Biases

## 1. Importing Libraries

In [6]:
import gym
import os 
import wandb
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv,VecVideoRecorder
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from wandb.integration.sb3 import WandbCallback 

## 2. Initialising our Run

In [3]:
# Settings for this experiment. The same dict is passed to wandb.init below
# and is read by the later cells that build the env and the model.
config = dict(
    policy_type="MlpPolicy",
    total_timesteps=250000,
    env_name="CarRacing-v0",
)

# Start the W&B run; `run.id` is used later to name the video, log and
# model directories.
run = wandb.init(
    config=config,
    project="intro_to_rl",
    save_code=True,
    monitor_gym=True,  # auto-upload the videos of agents playing the game
    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
)

[34m[1mwandb[0m: Currently logged in as: [33mmukilan[0m. Use [1m`wandb login --relogin`[0m to force relogin


## 3. Load Environment

In [4]:
def make_env():
    """Create the environment named by ``config["env_name"]``, wrapped in ``Monitor``.

    Takes no arguments so it can be passed directly as a factory callable.
    """
    # Monitor records stats such as returns.
    return Monitor(gym.make(config["env_name"]))

In [7]:
# Vectorise the monitored environment and wrap it in a video recorder:
# a 200-step clip is started whenever the step counter is a multiple of 2000,
# and clips are written under videos/<run id>.
env = VecVideoRecorder(
    DummyVecEnv([make_env]),
    f"videos/{run.id}",
    record_video_trigger=lambda step: step % 2000 == 0,
    video_length=200,
)

## 4. Train RL Model

In [8]:
# PPO agent for the tracked run; the policy type comes from `config` and
# TensorBoard logs are written under runs/<run id>.
model = PPO(
    config["policy_type"],
    env,
    verbose=1,
    tensorboard_log=f"runs/{run.id}",
)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [9]:
# W&B callback used during training; model files are saved under
# models/<run id>.
wandb_callback = WandbCallback(
    gradient_save_freq=100,
    model_save_path=f"models/{run.id}",
    verbose=2,
)

# Train for the number of timesteps set in `config`; learn() returns the
# model, which is what the cell displays.
model.learn(total_timesteps=config["total_timesteps"], callback=wandb_callback)

Track generation: 1180..1479 -> 299-tiles track
Logging to runs/3cv0xc49/PPO_1
Saving video to /Users/mukilankrishnakumar/Desktop/Mukilan/Professional/W&B/intro_rl/videos/3cv0xc49/rl-video-step-0-to-step-200.mp4
Track generation: 1005..1267 -> 262-tiles track
Track generation: 1101..1380 -> 279-tiles track
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -53.3    |
| time/              |          |
|    fps             | 88       |
|    iterations      | 1        |
|    time_elapsed    | 23       |
|    total_timesteps | 2048     |
---------------------------------
Saving video to /Users/mukilankrishnakumar/Desktop/Mukilan/Professional/W&B/intro_rl/videos/3cv0xc49/rl-video-step-2000-to-step-2200.mp4
Track generation: 1107..1388 -> 281-tiles track
Track generation: 1135..1423 -> 288-tiles track
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03  

<stable_baselines3.ppo.ppo.PPO at 0x145bf3370>

## 5. Saving RL Model

In [10]:
# Build the save location in two steps: the directory for saved models,
# then the file name for this 250k-step driving model.
saved_models_dir = os.path.join('Training', 'Saved Models')
PPO_path = os.path.join(saved_models_dir, 'PPO_Driving_model_250k')

model.save(PPO_path)

## 6. Evaluate the RL Model

In [11]:
# Evaluate the tracked model over 10 rendered episodes and display the
# (mean reward, reward std) pair returned by evaluate_policy.
tracked_mean_reward, tracked_std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    render=True,
)
(tracked_mean_reward, tracked_std_reward)

Track generation: 1314..1647 -> 333-tiles track
Saving video to /Users/mukilankrishnakumar/Desktop/Mukilan/Professional/W&B/intro_rl/videos/3cv0xc49/rl-video-step-251904-to-step-252104.mp4
Track generation: 1083..1358 -> 275-tiles track
Track generation: 1107..1388 -> 281-tiles track
Saving video to /Users/mukilankrishnakumar/Desktop/Mukilan/Professional/W&B/intro_rl/videos/3cv0xc49/rl-video-step-254000-to-step-254200.mp4
Track generation: 1141..1431 -> 290-tiles track
Track generation: 1203..1517 -> 314-tiles track
Saving video to /Users/mukilankrishnakumar/Desktop/Mukilan/Professional/W&B/intro_rl/videos/3cv0xc49/rl-video-step-256000-to-step-256200.mp4
Track generation: 1056..1324 -> 268-tiles track
Track generation: 1192..1494 -> 302-tiles track
Saving video to /Users/mukilankrishnakumar/Desktop/Mukilan/Professional/W&B/intro_rl/videos/3cv0xc49/rl-video-step-258000-to-step-258200.mp4
Track generation: 971..1218 -> 247-tiles track
Track generation: 978..1233 -> 255-tiles track
Saving

(-82.4730956, 1.6772241870385838)

In [12]:
# End the W&B run started by wandb.init; the run history and summary tables
# are printed as this cell's output.
run.finish()

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,██████████████████████▇▇▆▅▅▄▄▄▄▄▄▂▂▁▁▂▂▂
rollout/ep_rew_mean,▃▆▆▇█▇▇▆▆▇██▇▇▇▇▆▆▅▅▆▆▆▆▅▄▄▅▄▅▅▅▆▄▃▃▂▂▂▁
time/fps,▇▆▇▇▇██████▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅
train/approx_kl,▅▃▃▂▂▄▃▃▂▃▃▃▂▃▂▃▅▂▃▃▂▃▃▂▃▄▄▁▃▁▃▃▃▄▂█▄▄▃▂
train/clip_fraction,▆▆▂▃▃▆▃▃▂▃▄▅▂▂▂▃▇▂▃▃▁▁▃▅▅▅▆▁▇▃▅▃▂▂▂██▆▅▃
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▂▃▃▄▄▅▅▅▆▆▇▇▆▆▆▇▇▇▇▇▇▇▇▇▇████████████▇
train/explained_variance,▁▁▁▁▁▁▁▁▂▁▁▆▇▅▆▆▇▆▆▅▇▂▃▅▆▇▆▅▄▅▃▆▆▇██▇▁▄▇
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,251904.0
rollout/ep_len_mean,842.34998
rollout/ep_rew_mean,-58.19298
time/fps,81.0
train/approx_kl,0.00327
train/clip_fraction,0.04214
train/clip_range,0.2
train/entropy_loss,-2.97585
train/explained_variance,0.7229
train/learning_rate,0.0003
