In [1]:
import gymnasium as gym
import numpy as np

In [2]:
class DiscreteCarRacingWrapper(gym.ActionWrapper):
    """Action wrapper that maps a discrete index to a fixed control vector.

    Every entry of ``self.actions`` is a ``[steer, gas, brake]`` array. The
    wrapper replaces ``action_space`` with a ``Discrete`` space that has one
    index per entry, and ``action`` translates an index back into its array.
    """

    def __init__(self, env):
        """Wrap ``env`` and install the discrete action table."""
        super().__init__(env)
        # One row per discrete action, laid out as (steer, gas, brake).
        control_table = (
            (0.0, 0.0, 0.0),   # 0: do nothing
            (-1.0, 1.0, 0.0),  # 1: turn left + gas
            (1.0, 1.0, 0.0),   # 2: turn right + gas
            (0.0, 1.0, 0.0),   # 3: straight + gas
            (0.0, 0.0, 0.8),   # 4: brake
        )
        self.actions = [np.array(controls) for controls in control_table]
        self.action_space = gym.spaces.Discrete(len(control_table))

    def action(self, action_index):
        """Return the ``[steer, gas, brake]`` array stored at ``action_index``."""
        chosen_controls = self.actions[action_index]
        return chosen_controls


In [3]:
# Discrete-action CarRacing environment with on-screen ("human") rendering.
# Note: the next cell rebinds `env` to a vectorised, frame-stacked environment.
env = DiscreteCarRacingWrapper(gym.make("CarRacing-v3", render_mode="human"))

In [4]:
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.atari_wrappers import WarpFrame

# Build the training environment: a single discrete-action CarRacing instance
# inside a DummyVecEnv, wrapped by VecFrameStack with n_stack=4.
env = VecFrameStack(
    DummyVecEnv([lambda: DiscreteCarRacingWrapper(gym.make("CarRacing-v3"))]),
    n_stack=4,
)

2025-05-21 10:58:19.624902: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
#%pip install "stable-baselines3[extra]"

**DQN**

In [5]:
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env


# check_env validates a single gymnasium.Env. The `env` built above is a VecEnv
# (VecFrameStack over DummyVecEnv), which check_env rejects with an
# AssertionError, so validate a fresh, un-vectorised instance of the same
# wrapped environment instead and release it afterwards.
env_under_check = DiscreteCarRacingWrapper(gym.make("CarRacing-v3"))
try:
    check_env(env_under_check, warn=True)
finally:
    env_under_check.close()

# NOTE(review): a 100_000-transition replay buffer of stacked image
# observations can need a lot of RAM — confirm it fits on the target machine.
model = DQN(
    policy="CnnPolicy",
    env=env,
    learning_rate=1e-4,
    buffer_size=100_000,
    learning_starts=10_000,
    batch_size=64,
    train_freq=1,
    target_update_interval=1000,
    exploration_fraction=0.1,
    exploration_final_eps=0.01,
    tensorboard_log="./dqn_car_racing_tensorboard/",
    verbose=1,
)


# Long-running: trains for one million environment steps.
model.learn(total_timesteps=1_000_000)


2025-05-21 10:48:58.376514: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./dqn_car_racing_tensorboard/DQN_2
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 600      |
|    ep_rew_mean      | -24.8    |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 33       |
|    time_elapsed     | 72       |
|    total_timesteps  | 2401     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0241   |
|    n_updates        | 350      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 800      |
|    ep_rew_mean      | -48.7    |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 29       |
|    time_elap

<stable_baselines3.dqn.dqn.DQN at 0x197db5650>

**PPO**

In [9]:
import gymnasium as gym
from stable_baselines3 import PPO

# PPO is trained on the raw CarRacing environment (no discrete-action wrapper),
# rendered on screen while it trains.
env = gym.make("CarRacing-v3", render_mode="human")

# Optimisation hyperparameters, grouped so they are easy to tune in one place.
ppo_hyperparams = dict(
    learning_rate=2.5e-4,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    clip_range=0.2,
)

model = PPO(
    policy="CnnPolicy",
    env=env,
    tensorboard_log="./ppo_car_racing_tensorboard/",
    verbose=1,
    **ppo_hyperparams,
)

model.learn(total_timesteps=100_000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./ppo_car_racing_tensorboard/PPO_4
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -55.9    |
| time/              |          |
|    fps             | 36       |
|    iterations      | 1        |
|    time_elapsed    | 28       |
|    total_timesteps | 1024     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -58.8       |
| time/                   |             |
|    fps                  | 35          |
|    iterations           | 2           |
|    time_elapsed         | 58          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.006197861 |
|    clip_fractio

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1e+03        |
|    ep_rew_mean          | -49.7        |
| time/                   |              |
|    fps                  | 34           |
|    iterations           | 11           |
|    time_elapsed         | 328          |
|    total_timesteps      | 11264        |
| train/                  |              |
|    approx_kl            | 0.0033331197 |
|    clip_fraction        | 0.0505       |
|    clip_range           | 0.2          |
|    entropy_loss         | -4.22        |
|    explained_variance   | 0.406        |
|    learning_rate        | 0.00025      |
|    loss                 | 0.237        |
|    n_updates            | 40           |
|    policy_gradient_loss | 0.00143      |
|    std                  | 0.986        |
|    value_loss           | 0.736        |
------------------------------------------
-----------------------------------------
| rollout/  

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1e+03        |
|    ep_rew_mean          | -41.4        |
| time/                   |              |
|    fps                  | 34           |
|    iterations           | 20           |
|    time_elapsed         | 594          |
|    total_timesteps      | 20480        |
| train/                  |              |
|    approx_kl            | 0.0053090258 |
|    clip_fraction        | 0.151        |
|    clip_range           | 0.2          |
|    entropy_loss         | -4.17        |
|    explained_variance   | 0.847        |
|    learning_rate        | 0.00025      |
|    loss                 | 0.0479       |
|    n_updates            | 76           |
|    policy_gradient_loss | -0.00549     |
|    std                  | 0.97         |
|    value_loss           | 0.304        |
------------------------------------------
-----------------------------------------
| rollout/  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -43.8       |
| time/                   |             |
|    fps                  | 34          |
|    iterations           | 29          |
|    time_elapsed         | 859         |
|    total_timesteps      | 29696       |
| train/                  |             |
|    approx_kl            | 0.010922686 |
|    clip_fraction        | 0.0845      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.14       |
|    explained_variance   | 0.966       |
|    learning_rate        | 0.00025     |
|    loss                 | 0.159       |
|    n_updates            | 112         |
|    policy_gradient_loss | -0.00645    |
|    std                  | 0.959       |
|    value_loss           | 0.403       |
-----------------------------------------


KeyboardInterrupt: 

In [None]:
# Roll out a single episode with the current `model` on the current `env`.
obs, _ = env.reset()
done = False
while not done:
    action, _ = model.predict(obs)
    obs, reward, terminated, truncated, info = env.step(action)
    # An episode is over when it terminates OR when it is truncated (e.g. by a
    # time limit). Checking only the first flag keeps stepping a finished
    # episode and the loop may never exit.
    done = terminated or truncated
    # No explicit env.render() call: `env` is created with render_mode="human",
    # in which case rendering happens during step().
