## Set Up Environment

In [2]:
import os
import gymnasium
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [3]:
# create env
env = gymnasium.make('CarRacing-v2', render_mode = 'human').env
env = DummyVecEnv([lambda: env])
# state and action dimensions
state_dim = env.observation_space.shape
action_dim = env.action_space.shape[0]

In [8]:
env.action_space

Box([-1.  0.  0.], 1.0, (3,), float32)

## Build Model

In [4]:
model = PPO('CnnPolicy', env, verbose=1)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [5]:
model.learn(total_timesteps=5000)

-----------------------------
| time/              |      |
|    fps             | 17   |
|    iterations      | 1    |
|    time_elapsed    | 119  |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 14           |
|    iterations           | 2            |
|    time_elapsed         | 285          |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0071460307 |
|    clip_fraction        | 0.0794       |
|    clip_range           | 0.2          |
|    entropy_loss         | -4.23        |
|    explained_variance   | 0.0253       |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0925       |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00769     |
|    std                  | 0.985        |
|    value_loss           | 0.463        |
----------------

<stable_baselines3.ppo.ppo.PPO at 0x2b3212aa850>

In [37]:
env.close()

## Play Games

In [14]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)


KeyboardInterrupt



In [39]:
state = env.reset()
# 1 game, 100 actions
score = 0
for _ in range(1000):
    env.render()
    # get action to take
    action, _states = model.predict(state)
    # take action
    # get: new state, reward gained after action, if done (died, won, etc), _
    state, reward, done, info, = env.step(action)
    # new score
    score += reward
    
    # died/out of bounds/too much time/won
    if done:
        # end
        exit
else:
    # didn't win/die before 100 loops
    print("Timeout")

#final state and info
print("Total score:", score)

# close when done
env.close()

Timeout
Total score: [-68.453674]
