# Car Racing Training

### Pre-requisites

In [1]:
#!apt-get update
#!apt-get install -y swig python3-dev

In [2]:
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
# !pip install -r requirements.txt
# !pip install ipywidgets

### Imports

In [None]:
import os
import uuid

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# NOTE(review): this pulls a wrapper from the legacy `gym` package, while the
# environment itself is created through Gymnasium (`gymnasium as gym`).
from gym.wrappers import GrayScaleObservation
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.preprocessing import is_image_space, is_image_space_channels_first
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor, VecNormalize, VecFrameStack
from stable_baselines3.common.vec_env import VecTransposeImage

### Hyperparameters

In [None]:
# Training budget and evaluation size
total_timesteps = 5_000_000  # environment steps requested from model.learn()
n_eval_episodes = 8          # episodes played in the manual evaluation loop

# Every run stores its outputs in a fresh, uniquely named directory
runDir = "./checkpoints/" + str(uuid.uuid4())
os.makedirs(runDir, exist_ok=True)

# CSV file that receives the training monitor logs
train_monitor_file = os.path.join(runDir, "train_monitor.csv")

## Training
### Environment

In [None]:
class ShapedCarRacing(gym.Env):
    """CarRacing-v3 with extra reward shaping on top of the native reward.

    The underlying environment is created internally and its action and
    observation spaces are re-exported unchanged. On every step the native
    reward is modified as follows:

    * ``SPEED_BONUS * speed`` is added, where ``speed`` is read from the
      ``"speed"`` entry of the step ``info`` dict (0.0 when the key is absent);
    * ``STUCK_PENALTY`` is subtracted when ``speed < STUCK_SPEED`` and the
      native reward is below ``STUCK_REWARD``.

    NOTE(review): the stock CarRacing-v3 ``info`` dict does not appear to
    carry a ``"speed"`` entry. If it is indeed missing, ``speed`` is always
    0.0, so the bonus never applies and every step whose native reward is
    below ``STUCK_REWARD`` is penalised. Confirm against the installed
    Gymnasium version and, if needed, derive the speed from the simulator.

    Args:
        render_mode: Render mode forwarded to ``gym.make``. Defaults to
            ``"rgb_array"``, the value that was previously hard-coded.
    """

    SPEED_BONUS = 0.1    # reward added per unit of speed
    STUCK_SPEED = 0.1    # below this speed the car counts as not moving
    STUCK_REWARD = 0.1   # native reward below this counts as "no progress"
    STUCK_PENALTY = 0.2  # amount subtracted when stuck without progress

    def __init__(self, render_mode="rgb_array"):
        super().__init__()
        self.env = gym.make("CarRacing-v3", render_mode=render_mode)
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        # Mirror the attributes that wrappers and vectorised envs look up on
        # the outermost environment.
        self.render_mode = render_mode
        self.metadata = self.env.metadata

    def reset(self, *, seed=None, options=None):
        """Reset the wrapped environment.

        ``seed`` and ``options`` are accepted because the Gymnasium reset API
        passes them as keywords; the previous zero-argument signature raised
        ``TypeError`` as soon as a wrapper or vectorised env forwarded them.

        Returns:
            The ``(observation, info)`` pair of the wrapped environment.
        """
        return self.env.reset(seed=seed, options=options)

    def step(self, action):
        """Step the wrapped environment and apply the reward shaping.

        Returns:
            ``(obs, shaped_reward, terminated, truncated, info)`` where only
            the reward differs from what the wrapped environment returned.
        """
        obs, reward, terminated, truncated, info = self.env.step(action)
        # Falls back to 0.0 when the environment does not report a speed
        speed = info.get("speed", 0.0)
        shaped_reward = reward + self.SPEED_BONUS * speed
        # Discourage standing still while collecting no reward
        if speed < self.STUCK_SPEED and reward < self.STUCK_REWARD:
            shaped_reward -= self.STUCK_PENALTY
        return obs, shaped_reward, terminated, truncated, info

    def render(self, mode="human"):
        """Render using the mode chosen at construction time.

        ``mode`` is kept only for backward compatibility and is ignored: the
        Gymnasium ``render()`` API takes no arguments, so forwarding it (as the
        previous implementation did) raised ``TypeError``.
        """
        return self.env.render()

    def close(self):
        """Release the resources of the wrapped environment."""
        self.env.close()

In [None]:
# Create the training environment
def make_env(env=None):
    """Build one reward-shaped, grayscale CarRacing environment.

    Args:
        env: Ignored; the environment is always created here. The parameter
            is now optional because ``DummyVecEnv`` calls its factories with
            no arguments, which raised ``TypeError`` while it was required.

    Returns:
        A ``ShapedCarRacing`` instance wrapped so that observations are
        grayscale with the channel axis kept (96×96×1).
    """
    env = ShapedCarRacing()
    # Use Gymnasium's own wrapper: CarRacing-v3 is a Gymnasium environment and
    # should not be wrapped by the legacy `gym` package.
    env = gym.wrappers.GrayscaleObservation(env, keep_dim=True)
    return env

# Create 8 environments (DummyVecEnv steps them one after another in this process)
vec_env = DummyVecEnv([make_env for _ in range(8)])
# Keep monitor logs for each sub-env
vec_env = VecMonitor(vec_env, filename=train_monitor_file)
# Normalize rewards only. Observations are images: CnnPolicy already rescales
# them itself, and running mean/std normalisation turns the image space into a
# float Box that the default CNN feature extractor does not accept.
vec_env = VecNormalize(vec_env, norm_obs=False, norm_reward=True, clip_obs=10.0)
# Optionally stack the last 4 grayscale frames along the channel axis
#vec_env = VecFrameStack(vec_env, n_stack=4)

### Eval Environment

In [None]:
# Training VecEnv
train_env = vec_env

# Create a separate eval env
def make_eval_env():
    """Build one evaluation environment, preprocessed like the training ones.

    Returns:
        A ``ShapedCarRacing`` instance with grayscale observations that keep
        the channel axis.
    """
    env = ShapedCarRacing()
    # Gymnasium's wrapper, since CarRacing-v3 is a Gymnasium environment
    env = gym.wrappers.GrayscaleObservation(env, keep_dim=True)
    return env

# The evaluation stack has to mirror the training stack wrapper by wrapper,
# otherwise EvalCallback cannot copy the normalisation statistics across.
eval_vec_env = DummyVecEnv([make_eval_env for _ in range(4)])
# Same position as in the training stack; also lets the evaluation report
# un-normalised episode returns.
eval_vec_env = VecMonitor(eval_vec_env)
eval_vec_env = VecNormalize(
    eval_vec_env,
    norm_obs=train_env.norm_obs,   # follow whatever the training env does
    norm_reward=False,             # report real rewards during evaluation
    clip_obs=train_env.clip_obs,
    training=False,                # never update statistics from eval episodes
)
# PPO wraps channel-last image environments in VecTransposeImage on its own;
# apply the same rule here so both stacks stay aligned.
if is_image_space(eval_vec_env.observation_space) and not is_image_space_channels_first(
    eval_vec_env.observation_space
):
    eval_vec_env = VecTransposeImage(eval_vec_env)
#eval_vec_env = VecFrameStack(eval_vec_env, n_stack=4)

# Stop once 32 consecutive evaluations bring no new best mean reward,
# but not before 128 evaluations have been run.
stop_train_callback = StopTrainingOnNoModelImprovement(max_no_improvement_evals=32, min_evals=128, verbose=0)

# Now pass eval_vec_env to EvalCallback:
eval_callback = EvalCallback(
    eval_vec_env,
    best_model_save_path=runDir,
    log_path=runDir,
    n_eval_episodes=n_eval_episodes,  # shared setting from the hyperparameters cell
    eval_freq=4096,          # counted in vectorised steps: every 4096 × 8 envs = 32768 timesteps
    deterministic=True,
    render=False,
    callback_after_eval=stop_train_callback,
    verbose=1
)

In [None]:
# PPO Model
model = PPO(
    policy="CnnPolicy",
    env=vec_env,
    tensorboard_log=runDir,
    n_steps=256,
    learning_rate=2.5e-4,
    batch_size=64,
    n_epochs=8,
    gamma=0.99,
    ent_coef=0.005,
    clip_range=0.1,
    gae_lambda=0.95,
    verbose=1,
)

# tensorboard --logdir AI/checkpoints/

# Train the model; eval_callback evaluates periodically and may stop training early
model.learn(total_timesteps=total_timesteps, progress_bar=True, callback=eval_callback)

# Save the model
model_path = f"{runDir}/final_ppo_carracing"
model.save(model_path)

# The running statistics of VecNormalize are not part of the model file, so
# store them next to it; without them training cannot be resumed with the
# same normalisation.
vec_normalize = model.get_vec_normalize_env()
if vec_normalize is not None:
    vec_normalize.save(os.path.join(runDir, "vecnormalize.pkl"))

# Drop the in-memory model; the evaluation section reloads it from model_path
del model

### Training Reward Curve

In [None]:
# Per-episode training rewards written by the monitor. The first line of the
# file is a metadata comment, so it is skipped to reach the CSV header.
df = pd.read_csv(train_monitor_file, skiprows=1)
rewards = df["r"].values
episodes = np.arange(1, len(rewards) + 1)

# 50-episode moving average
window = 50

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(episodes, rewards, label="Per‐Episode Reward")
# Only draw the moving average when at least one full window exists. It used
# to be plotted unconditionally, which raised NameError on short runs because
# `mov_avg` was never assigned.
if len(rewards) >= window:
    mov_avg = np.convolve(rewards, np.ones(window) / window, mode="valid")
    ax.plot(episodes[window - 1:], mov_avg, label=f"MA ({window})", linewidth=2)
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.set_title("CarRacing Training Rewards After Modifications")
ax.legend()
ax.grid(alpha=0.3)
plt.show()

### Evaluation

In [None]:
# Environment factory for the on-screen evaluation.
# NOTE: this rebinds `make_eval_env`, replacing the factory of the same name
# defined in the "Eval Environment" section.
def make_eval_env():
    """Build a single CarRacing-v3 environment that renders to a window.

    The plain environment is used here (no ``ShapedCarRacing``), so the
    rewards collected from it are the native CarRacing rewards.

    Returns:
        The environment with grayscale observations that keep the channel axis.
    """
    env = gym.make("CarRacing-v3", render_mode="human")
    # Gymnasium's wrapper, since CarRacing-v3 is a Gymnasium environment
    env = gym.wrappers.GrayscaleObservation(env, keep_dim=True)
    return env

eval_env = make_eval_env()

# Manual evaluation loop: play n_eval_episodes full episodes with the final
# model and record the total reward of each one.
rewards_eval = []
try:
    # Load the model saved at the end of training
    model = PPO.load(model_path, env=eval_env)

    for epi in range(n_eval_episodes):
        obs, _ = eval_env.reset()
        done = False
        total_r = 0.0

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, r, terminated, truncated, info = eval_env.step(action)
            # An episode ends on either a terminal state or a time-limit cut
            done = terminated or truncated
            total_r += r

        rewards_eval.append(total_r)
finally:
    # Always release the render window, even if loading or an episode fails
    eval_env.close()

# Summary statistics over the evaluation episodes
rewards_eval = np.asarray(rewards_eval)
episodes_eval = np.arange(n_eval_episodes) + 1
mean_eval = np.mean(rewards_eval)
std_eval = np.std(rewards_eval)

# Plot: one marker per episode plus a dashed line at the mean
average_label = f"Average: {mean_eval:.2f} ± {std_eval:.2f}"
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(episodes_eval, rewards_eval, '-o', color="tab:green", label="Episode rewards")
ax.axhline(mean_eval, color="tab:red", linestyle="--", label=average_label)
ax.set_xlabel("Episode")
ax.set_ylabel("Total Rewards")
ax.set_title(f"CarRacing: Evaluation in {n_eval_episodes} episodes")
ax.legend()
ax.grid(alpha=0.3)
plt.show()

print(f"Eval → Average Reward: {mean_eval:.2f}; Standard Deviation: {std_eval:.2f}")
