Dependencies

In [1]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
import os

Look at the environment 

In [2]:
# Environment id shared by every gym.make() call in this notebook.
environment_name = "CarRacing-v2"
# render_mode='human' requests on-screen rendering so the random rollout below can be watched.
env = gym.make(environment_name, render_mode='human')

In [42]:
# Play a few episodes with uniformly random actions to get a feel for the
# environment and the scores an untrained agent achieves.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        # Gymnasium's reset() returns an (observation, info) pair, not a bare observation.
        state, info = env.reset()
        done = False
        truncated = False
        score = 0

        # An episode ends when it is terminated OR truncated. The previous
        # condition `not done or truncated` parsed as `(not done) or truncated`,
        # which keeps looping forever once truncation is signalled.
        while not (done or truncated):
            # No explicit env.render(): with render_mode='human' the frame is
            # drawn automatically as part of step().
            action = env.action_space.sample()
            n_state, reward, done, truncated, info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always release the window/resources, even if the cell is interrupted.
    env.close()

KeyboardInterrupt: 

In [43]:
# Close the environment manually, e.g. after interrupting the rollout cell above.
env.close()

In [12]:
# Draw one random action to see what an action looks like (a 3-element float32 array in the output below).
env.action_space.sample()

array([-0.35008425,  0.9896195 ,  0.03609927], dtype=float32)

In [13]:
# Draw one random observation to see its layout (nested rows of 3-value integer triples in the output below).
env.observation_space.sample()

array([[[252,  59,  80],
        [ 90,  57, 177],
        [170,  20,  19],
        ...,
        [178,   0, 184],
        [102,  49,   3],
        [ 93, 173, 214]],

       [[192,  67, 247],
        [187, 200,  23],
        [ 93, 101, 148],
        ...,
        [ 86,  33, 152],
        [215, 230, 193],
        [136,  95, 113]],

       [[ 67,  93,   6],
        [155, 109, 195],
        [251, 209, 210],
        ...,
        [ 33, 249,  39],
        [ 11, 115, 126],
        [207,   0, 128]],

       ...,

       [[230, 200,  34],
        [239, 153,  31],
        [113, 135, 191],
        ...,
        [185, 215, 123],
        [ 80, 212, 224],
        [ 97, 187,   4]],

       [[174, 226, 165],
        [224,  25, 103],
        [243, 129, 213],
        ...,
        [139,  32, 191],
        [214, 165, 122],
        [250,  95,  16]],

       [[ 69,  40, 147],
        [ 29, 254,   0],
        [ 57, 153, 144],
        ...,
        [103,  57, 255],
        [234,  33,  16],
        [216, 119, 146]]

In [14]:
# Show the action space definition: bounds, shape and dtype of a valid action.
env.action_space

Box([-1.  0.  0.], 1.0, (3,), float32)

In [15]:
# Show the observation space definition: bounds, shape and dtype of an observation.
env.observation_space

Box(0, 255, (96, 96, 3), uint8)

Training

In [3]:
from stable_baselines3.common.vec_env import DummyVecEnv

In [20]:
# Headless (render_mode=None) environment for training, wrapped in a
# single-environment DummyVecEnv. The factory builds the env itself instead of
# closing over a variable that is rebound on the same line.
env = DummyVecEnv([
    lambda: gym.make(environment_name, render_mode=None),
])

In [5]:
# Relative directory handed to PPO as its tensorboard_log location.
log_path = os.path.join('Training', 'Logs')

In [52]:
# Build a PPO agent with the "CnnPolicy" policy on the vectorized env;
# verbose=1 prints progress and tensorboard_log points at log_path.
model = PPO("CnnPolicy", env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [53]:
# Train for 100,000 environment steps. This is slow on CPU: the log below shows roughly 20-30 fps.
model.learn(total_timesteps=100000)

Logging to Training\Logs\PPO_10
-----------------------------
| time/              |      |
|    fps             | 28   |
|    iterations      | 1    |
|    time_elapsed    | 71   |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 21           |
|    iterations           | 2            |
|    time_elapsed         | 188          |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0073313005 |
|    clip_fraction        | 0.066        |
|    clip_range           | 0.2          |
|    entropy_loss         | -4.22        |
|    explained_variance   | 0.0126       |
|    learning_rate        | 0.0003       |
|    loss                 | 0.211        |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00771     |
|    std                  | 0.978        |
|    value_loss           |

<stable_baselines3.ppo.ppo.PPO at 0x29060b02a10>

In [6]:
# Relative path used by the model.save() / PPO.load() cells below.
car_path = os.path.join('Training', 'Saved Models', 'car_model1')

In [9]:
# Another 100,000-step training round on the existing `model`. In the log below
# total_timesteps restarts at 2048 while n_updates is already 500, which
# suggests previously learned weights carry over — TODO confirm.
model.learn(total_timesteps=100000)

Logging to Training\Logs\PPO_12
-----------------------------
| time/              |      |
|    fps             | 28   |
|    iterations      | 1    |
|    time_elapsed    | 72   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 21          |
|    iterations           | 2           |
|    time_elapsed         | 194         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.046060868 |
|    clip_fraction        | 0.326       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.56       |
|    explained_variance   | 0.982       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0831      |
|    n_updates            | 500         |
|    policy_gradient_loss | -0.036      |
|    std                  | 0.578       |
|    value_loss           | 0.361       |
--

<stable_baselines3.ppo.ppo.PPO at 0x203cb6201f0>

In [11]:
# Persist the trained agent to car_path so it can be reloaded without retraining.
model.save(car_path)

In [21]:
# Reload the saved agent from disk and attach the current vectorized env to it.
model = PPO.load(car_path, env=env)

Wrapping the env in a VecTransposeImage.


Validation

In [23]:
# Evaluation environment: rendered on screen (render_mode='human') and wrapped
# in a single-environment DummyVecEnv. The factory builds the env itself
# instead of closing over a variable that is rebound on the same line.
env = DummyVecEnv([
    lambda: gym.make(environment_name, render_mode='human'),
])

In [24]:
# Load the saved agent again, this time bound to the on-screen evaluation env.
model = PPO.load(car_path, env=env)

Wrapping the env in a VecTransposeImage.


In [25]:
# Run the agent for 10 evaluation episodes with rendering enabled. The
# displayed 2-tuple is (mean episode reward, standard deviation of episode
# reward) per the Stable-Baselines3 evaluate_policy documentation.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(183.7277955338359, 122.85435607001057)