## Part 3: Stable-Baselines3 on the racetrack environment

#### Importations

In [8]:
# %pip install stable-baselines3

In [6]:
import gymnasium as gym
import numpy as np
import torch
from torch import nn
import random
import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from collections import deque
import torch.optim as optim
import torch.nn.functional as F
from collections import namedtuple
import math

#### Configurations

In [5]:
# Observation: an occupancy grid centred on the vehicle, with one layer per
# feature, spanning [-18, 18] on both axes in steps of 3.
observation_config = {
    "type": "OccupancyGrid",
    "features": ["presence", "on_road"],
    "grid_size": [[-18, 18], [-18, 18]],
    "grid_step": [3, 3],
    "as_image": False,
    "align_to_vehicle_axes": True,
}

# Action: continuous control restricted to the lateral axis
# (the longitudinal axis is disabled).
action_config = {
    "type": "ContinuousAction",
    "longitudinal": False,
    "lateral": True,
}

# Full environment configuration passed to gym.make().
config = {
    "observation": observation_config,
    "action": action_config,
    # Simulation timing
    "simulation_frequency": 15,
    "policy_frequency": 5,
    "duration": 300,
    # Reward shaping
    "collision_reward": -1,
    "lane_centering_cost": 4,
    "action_reward": -0.3,
    # Traffic
    "controlled_vehicles": 1,
    "other_vehicles": 1,
    # Rendering
    "screen_width": 600,
    "screen_height": 600,
    "centering_position": [0.5, 0.5],
    "scaling": 7,
    "show_trajectories": False,
    "render_agent": True,
    "offscreen_rendering": False,
}

#### Usage

In [6]:
# Importing highway_env is what makes the "racetrack-v0" id known to
# gymnasium; without it, gym.make() fails on a fresh kernel because the
# environment has not been registered yet.
import highway_env  # noqa: F401

# Single (non-vectorized) racetrack environment built from the custom config.
env = gym.make("racetrack-v0", config=config)

#### Création et entraînement de l'agent

In [11]:
# PPO agent with the default MLP actor-critic policy, trained on `env`.
model = PPO(policy="MlpPolicy", env=env, verbose=1)
model.learn(total_timesteps=10_000)  # training budget, in environment timesteps

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 792      |
|    ep_rew_mean     | 5.03     |
| time/              |          |
|    fps             | 20       |
|    iterations      | 1        |
|    time_elapsed    | 102      |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.03e+03     |
|    ep_rew_mean          | 8.36         |
| time/                   |              |
|    fps                  | 20           |
|    iterations           | 2            |
|    time_elapsed         | 204          |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0057886387 |
|    clip_fraction        | 0.0234       |
|    clip_range           | 0.2          |
|    en

<stable_baselines3.ppo.ppo.PPO at 0x290694a1310>

#### Évaluation de l'agent

In [14]:
# Roll out the trained policy for 10 episodes and report the mean and the
# standard deviation of the per-episode reward.
evaluation = evaluate_policy(model=model, env=env, n_eval_episodes=10)
mean_reward, std_reward = evaluation
print(f"Mean reward: {mean_reward}, Std reward: {std_reward}")



Mean reward: 210.2277609330311, Std reward: 215.50786414734887


#### Modèle racetrack_ppo de Stable Baselines

In [2]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv

import highway_env  # noqa: F401




In [16]:
# Gate for the training cell: when True, the model is trained and saved to
# "racetrack_ppo/model"; when False, training is skipped and the later cell
# loads whatever model is already saved at that path.
TRAIN = True

In [3]:
# Parallel rollout setup: n_cpu copies of the environment, each running in its
# own subprocess (SubprocVecEnv).
n_cpu = 6
batch_size = 64
env = make_vec_env("racetrack-v0", n_envs=n_cpu, vec_env_cls=SubprocVecEnv)

model = PPO(
    "MlpPolicy",
    env,
    # Separate 256x256 networks for the actor (pi) and the critic (vf).
    # net_arch is passed as a plain dict: the list-wrapped form
    # `[dict(pi=..., vf=...)]` has been deprecated since Stable-Baselines3 1.8.
    policy_kwargs=dict(net_arch=dict(pi=[256, 256], vf=[256, 256])),
    # 128 steps per env -> 6 * 128 = 768 transitions per update,
    # i.e. exactly 12 minibatches of `batch_size`.
    n_steps=batch_size * 12 // n_cpu,
    batch_size=batch_size,
    n_epochs=10,
    learning_rate=5e-4,
    gamma=0.9,
    verbose=2,
    tensorboard_log="racetrack_ppo/",
)

Using cpu device




In [18]:
# Training is optional (see TRAIN): run PPO for 100k timesteps, persist the
# weights, then drop the in-memory model so the next cell has to reload it
# from disk.
if TRAIN:
    model.learn(total_timesteps=100_000)
    model.save("racetrack_ppo/model")
    del model

Logging to racetrack_ppo/PPO_1
----------------------------
| time/              |     |
|    fps             | 56  |
|    iterations      | 1   |
|    time_elapsed    | 13  |
|    total_timesteps | 768 |
----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 2           |
|    time_elapsed         | 29          |
|    total_timesteps      | 1536        |
| train/                  |             |
|    approx_kl            | 0.017705945 |
|    clip_fraction        | 0.0832      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.4        |
|    explained_variance   | -0.258      |
|    learning_rate        | 0.0005      |
|    loss                 | -0.0338     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.029      |
|    std                  | 0.955       |
|    value_loss           | 0.0876      |
----------

In [21]:
%pip install moviepy

Note: you may need to restart the kernel to use updated packages.Collecting moviepy
  Downloading moviepy-1.0.3.tar.gz (388 kB)
     ---------------------------------------- 0.0/388.3 kB ? eta -:--:--
     -- ------------------------------------- 20.5/388.3 kB ? eta -:--:--
     -- ---------------------------------- 30.7/388.3 kB 330.3 kB/s eta 0:00:02
     --- --------------------------------- 41.0/388.3 kB 245.8 kB/s eta 0:00:02
     ----- ------------------------------- 61.4/388.3 kB 328.2 kB/s eta 0:00:01
     ------ ------------------------------ 71.7/388.3 kB 281.8 kB/s eta 0:00:02
     -------- ---------------------------- 92.2/388.3 kB 309.1 kB/s eta 0:00:01
     ---------- ------------------------- 112.6/388.3 kB 312.2 kB/s eta 0:00:01
     ----------- ------------------------ 122.9/388.3 kB 313.8 kB/s eta 0:00:01
     ------------- ---------------------- 143.4/388.3 kB 304.6 kB/s eta 0:00:01
     -------------- --------------------- 153.6/388.3 kB 296.2 kB/s eta 0:00:01
     

In [7]:
# Reload the saved agent and measure its performance over 10 episodes.
model = PPO.load("racetrack_ppo/model", env=env)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward}, Std reward: {std_reward}")

# Release the evaluation environment before `env` is rebound to the recording
# environment below; otherwise its resources (e.g. the worker processes of a
# vectorized env) are left dangling with no remaining handle to close them.
env.close()

# Single environment rendered to RGB frames, wrapped so that every episode is
# recorded to racetrack_ppo/videos.
env = gym.make("racetrack-v0", render_mode="rgb_array")
env = RecordVideo(
    env, video_folder="racetrack_ppo/videos", episode_trigger=lambda e: True
)
env.unwrapped.set_record_video_wrapper(env)

# try/finally guarantees the recorder is closed even if the rollout raises or
# is interrupted from the notebook.
try:
    for video in range(10):
        done = truncated = False
        obs, info = env.reset()
        while not (done or truncated):
            # Greedy (deterministic) action from the trained policy
            action, _states = model.predict(obs, deterministic=True)
            # Advance the simulation by one policy step
            obs, reward, done, truncated, info = env.step(action)
            # Render the current frame
            env.render()
finally:
    env.close()