# Car Racing Training

### Pre-requisites

In [1]:
# System-level build dependencies, kept commented out; uncomment on a fresh
# apt-based machine before installing the Python requirements.
# NOTE(review): swig/python3-dev are presumably needed to compile the Box2D
# bindings used by CarRacing — confirm against requirements.txt.
#!apt-get update
#!apt-get install -y swig python3-dev

In [2]:
# Python dependencies, kept commented out; uncomment to install them from
# inside the notebook. The PyTorch line pulls wheels from the cu128 index URL,
# so adjust that URL if your machine needs a different build.
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
# !pip install -r requirements.txt
# !pip install ipywidgets

### Imports

In [3]:
import os
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement

import uuid

### Hyperparameters

In [4]:
# Every run gets its own uniquely named output folder under ./checkpoints,
# so artifacts from earlier runs are never overwritten.
run_id = uuid.uuid4()
runDir = f'./checkpoints/{run_id}'
os.makedirs(runDir, exist_ok=True)

# CSV file that the Monitor wrapper fills with per-episode training statistics
train_monitor_file = os.path.join(runDir, "train_monitor.csv")

# Total number of environment steps requested from model.learn()
total_timesteps = 2 ** 20

# Number of episodes played by the manual evaluation loop
n_eval_episodes = 8

### Training

In [None]:
# Training environment. Monitor writes per-episode statistics to
# train_monitor_file, which the plotting cell reads back afterwards.
env_train = gym.make("CarRacing-v3", render_mode="rgb_array")
env_train = Monitor(env_train, filename=train_monitor_file)

# Dedicated evaluation environment for EvalCallback.
# It must NOT be the same instance as env_train: the callback resets and steps
# its environment every eval_freq steps, which would interrupt the episode PPO
# is currently collecting and would also write evaluation episodes into
# train_monitor.csv, mixing them into the training-reward plot.
env_eval = gym.make("CarRacing-v3", render_mode="rgb_array")
env_eval = Monitor(env_eval)

# Stop early once the evaluation reward has not improved for 32 consecutive
# evaluations, but never before 128 evaluations have been run.
stop_train_callback = StopTrainingOnNoModelImprovement(
    max_no_improvement_evals=32,
    min_evals=128,
    verbose=0,
)

# Evaluate every 1024 training steps; the best model so far and the
# evaluation logs are stored in runDir.
eval_callback = EvalCallback(
    env_eval,
    eval_freq=1024,
    best_model_save_path=runDir,
    log_path=runDir,
    deterministic=True,
    render=False,
    callback_after_eval=stop_train_callback,
    verbose=0,
)

# PPO Model
model = PPO(
    policy="CnnPolicy",
    env=env_train,
    verbose=0,
    tensorboard_log=runDir,
    n_steps=1024,
    learning_rate=1e-4,
    batch_size=128,
    n_epochs=8,
    gamma=0.99,
    ent_coef=0.01,
    clip_range=0.2,
    gae_lambda=0.95,
)

# To follow training live, run in a shell (adjust the path to wherever the
# checkpoints folder sits relative to your working directory):
# tensorboard --logdir AI/checkpoints/

# Train the model
model.learn(total_timesteps=total_timesteps, progress_bar=True, callback=eval_callback)

# Save the final model (the best-evaluated one is saved separately by EvalCallback)
model_path = f"{runDir}/final_ppo_carracing"
model.save(model_path)

# Release simulator resources and the monitor file handle; the evaluation cell
# reloads the model from model_path, so nothing below needs these objects.
env_train.close()
env_eval.close()
del model

### Training reward plot

In [None]:
# Load the per-episode statistics written during training. The first line of
# the monitor file is skipped so that the following line becomes the header.
df_train = pd.read_csv(train_monitor_file, skiprows=1)
rewards_train = df_train["r"].to_numpy()
episodes_train = np.arange(1, len(rewards_train) + 1)

# Smooth the curve with a moving average; skipped when there are fewer
# episodes than the window length.
window = 50
mov_avg = (
    np.convolve(rewards_train, np.ones(window) / window, mode="valid")
    if len(rewards_train) >= window
    else None
)

# Raw episode rewards plus, when available, the smoothed curve
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(episodes_train, rewards_train, label="Episode Reward", color="tab:blue")
if mov_avg is not None:
    # The first averaged value corresponds to episode number `window`
    ax.plot(
        episodes_train[window-1:],
        mov_avg,
        label=f"Moving average ({window} ep)",
        color="tab:orange",
    )
ax.set_xlabel("Episode")
ax.set_ylabel("Total Reward")
ax.set_title("CarRacing: Training Reward")
ax.legend()
ax.grid(alpha=0.3)

# Save before show(): the figure may be released once it has been displayed
fig.savefig(f"{runDir}/plot.png")

plt.show()

### Evaluation

In [None]:
# Evaluation environment; render_mode="human" opens a window so the episodes
# can be watched while they are played.
eval_env = gym.make("CarRacing-v3", render_mode="human")

# Load the final model saved at the end of the training cell
model = PPO.load(model_path, env=eval_env)

# Manual evaluation loop: play n_eval_episodes full episodes with the
# deterministic policy and record the undiscounted return of each one.
rewards_eval = []
try:
    for epi in range(n_eval_episodes):
        obs, _ = eval_env.reset()
        done = False
        total_r = 0.0

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, r, terminated, truncated, info = eval_env.step(action)
            # An episode ends either naturally (terminated) or by a limit (truncated)
            done = terminated or truncated
            total_r += r

        rewards_eval.append(total_r)
finally:
    # Always close the render window, even if evaluation is interrupted
    eval_env.close()

rewards_eval = np.array(rewards_eval)
# Use the number of episodes actually completed for the x axis
episodes_eval = np.arange(1, len(rewards_eval) + 1)
mean_eval = rewards_eval.mean()
std_eval = rewards_eval.std()

# Plot per-episode returns with the mean as a horizontal reference line
plt.figure(figsize=(8,4))
plt.plot(episodes_eval, rewards_eval, '-o', color="tab:green",
         label="Episode rewards")
plt.axhline(mean_eval, color="tab:red", linestyle="--",
            label=f"Average: {mean_eval:.2f} ± {std_eval:.2f}")
plt.xlabel("Episode")
plt.ylabel("Total Rewards")
plt.title(f"CarRacing: Evaluation in {n_eval_episodes} episodes")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

print(f"Eval → Average Reward: {mean_eval:.2f}; Standard Deviation: {std_eval:.2f}")
