### 1. Import Dependencies

In [1]:
import gymnasium as gym 
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv 
import os

### 2. Test Environment

In [2]:
environment_name = "CarRacing-v3"
env = gym.make(environment_name, render_mode="human")

  from pkg_resources import resource_stream, resource_exists


In [3]:
env.reset()

(array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        ...,
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]], shape=(96, 96, 3), dtype=uint8),
 {})

In [4]:
env.close()

In [8]:
env.action_space.sample()

array([-0.65969336,  0.31168652,  0.18348151], dtype=float32)

In [None]:
episodes = 5 
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    terminated = False
    truncated = False

    while not (terminated or truncated):
        env.render()
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        score += reward
    print(f"Episode:{episode} Score:{score}")
env.close()

### 3. Train Model

In [6]:
env = gym.make(environment_name, render_mode="human")
env = DummyVecEnv([lambda: env])

In [21]:
log_path = os.path.join("..","Training", "Logs", environment_name)
model = PPO("CnnPolicy", env, verbose=1, tensorboard_log=log_path) 

Using cuda device
Wrapping the env in a VecTransposeImage.


In [22]:
model.learn(total_timesteps=2500000)

Logging to ../Training/Logs/CarRacing-v3/PPO_1
-----------------------------
| time/              |      |
|    fps             | 161  |
|    iterations      | 1    |
|    time_elapsed    | 12   |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 156          |
|    iterations           | 2            |
|    time_elapsed         | 26           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0066842055 |
|    clip_fraction        | 0.0727       |
|    clip_range           | 0.2          |
|    entropy_loss         | -4.25        |
|    explained_variance   | 0.00562      |
|    learning_rate        | 0.0003       |
|    loss                 | 0.22         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00524     |
|    std                  | 0.998        |
|    value_l

<stable_baselines3.ppo.ppo.PPO at 0x7f639cd57e50>

### 4. Save Model

In [8]:
PPO_path = os.path.join("..","Training", "Saved Models", environment_name,"PPO_CarRacing_Model")
PPO_path

'../Training/Saved Models/CarRacing-v3/PPO_CarRacing_Model'

In [24]:
model.save(PPO_path)



In [15]:
del model

In [None]:
model

AttributeError: 'PPO' object has no attribute 'close'

In [16]:
model = PPO.load(PPO_path)

In [20]:
env.close()

In [24]:
# Recreate the environment in headless mode to avoid "video system not initialized".
# Use rgb_array so rendering works without a display and the env is compatible with SB3 wrappers.
env = gym.make(environment_name, render_mode="human")
env = DummyVecEnv([lambda: env])
env.reset()

array([[[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        ...,

        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]]], shape=(1, 96, 96, 3), dtype=uint8)

### 5. Evaluate and Test

In [26]:
env.close()

In [25]:
evaluate_policy(model, env, n_eval_episodes=5, render=True)

KeyboardInterrupt: 