In [None]:
# Import necessary libraries
import torch
import numpy as np
import cv2
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.vec_env import VecVideoRecorder


In [None]:
# Set configurations
CONFIG = dict(
    # Environment id that main() hands to create_env()
    env_name="CarRacing-v3",
    # total_timesteps given to model.learn() in train_model()
    timesteps=500_000,
    # Created by setup_directories(); CheckpointCallback saves into it
    checkpoint_dir="./checkpoints/",
    # Created by setup_directories()
    video_dir="./videos/",
    # Destination of model.save() once training has finished
    save_model_path="./ppo_carracing_model.zip",
    # Episode count main() passes to evaluate_model()
    evaluate_episodes=5,
    # NOTE(review): no visible cell reads this key; presumably meant for
    # record_video() - confirm
    video_length=1000,
    # Seed for the initial env.reset() in create_env() and for PPO in create_model()
    seed=42,
)

# Use the GPU whenever torch reports CUDA as available, otherwise the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:
# Create the seeded environment (optionally monitored) and wrap it in a DummyVecEnv
def create_env(env_name, monitor_path=None):
    """Build one seeded environment and wrap it in a ``DummyVecEnv``.

    The environment is created with ``render_mode="rgb_array"`` and reset
    once with ``CONFIG["seed"]`` before any wrapping takes place.

    Args:
        env_name: Environment id forwarded to ``gym.make``.
        monitor_path: When truthy, the environment is wrapped in ``Monitor``
            with this value as its second argument; otherwise no ``Monitor``
            is added.

    Returns:
        A ``DummyVecEnv`` whose single factory returns the environment
        instance built here.
    """
    base_env = gym.make(env_name, render_mode="rgb_array")
    base_env.reset(seed=CONFIG["seed"])
    wrapped_env = Monitor(base_env, monitor_path) if monitor_path else base_env

    def _env_factory():
        # Hands back the already-constructed instance instead of making a new one
        return wrapped_env

    return DummyVecEnv([_env_factory])


In [None]:
# Training
def setup_directories():
    """Create the checkpoint and video directories named in ``CONFIG``.

    Directories that already exist are left alone (``exist_ok=True``).
    """
    for dir_key in ("checkpoint_dir", "video_dir"):
        os.makedirs(CONFIG[dir_key], exist_ok=True)

def create_model(env):
    """Instantiate a PPO agent with the ``"CnnPolicy"`` policy for ``env``.

    Args:
        env: Environment handed to the ``PPO`` constructor.

    Returns:
        The newly constructed ``PPO`` model, seeded with ``CONFIG["seed"]``
        and placed on ``DEVICE``.
    """
    ppo_options = {
        "verbose": 1,
        "tensorboard_log": "./tensorboard_logs/",
        "seed": CONFIG["seed"],
        "device": DEVICE,
    }
    return PPO("CnnPolicy", env, **ppo_options)

def train_model(model, env):
    """Train ``model`` for ``CONFIG["timesteps"]`` steps, then save it.

    A ``CheckpointCallback`` is attached to the run and the trained model is
    written to ``CONFIG["save_model_path"]`` once learning has finished.

    Args:
        model: Object exposing ``learn`` and ``save``.
        env: Not used by this function; kept as part of the signature.
    """
    # save_freq=10_000 is forwarded as-is to CheckpointCallback
    # NOTE(review): presumably counted in callback steps - confirm against SB3 docs
    checkpoint_options = dict(
        save_freq=10_000,
        save_path=CONFIG["checkpoint_dir"],
        name_prefix="ppo_carracing_checkpoint",
    )
    periodic_saver = CheckpointCallback(**checkpoint_options)

    print("Starting training...")
    model.learn(total_timesteps=CONFIG["timesteps"], callback=periodic_saver)
    print("Training complete.")

    model.save(CONFIG["save_model_path"])


In [None]:
# Evaluation
def evaluate_model(model, env, episodes):
    """Run deterministic evaluation episodes and report the mean return.

    ``env`` is driven through the 4-tuple API
    (``obs, reward, done, info = env.step(action)``). With a vectorized
    environment ``reward`` and ``done`` come back as per-environment arrays,
    so the first entry of each is extracted. Episode returns are therefore
    plain floats (printed without array brackets), and the loop condition
    never has to evaluate the truth value of an array.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that
            returns an ``(action, state)`` pair.
        env: Environment exposing ``reset()`` and a 4-tuple ``step(action)``.
            Only the first sub-environment is tracked.
        episodes: Number of episodes to run.

    Returns:
        float: Mean episode return, or ``nan`` when no episode was run.
    """
    total_rewards = []
    for episode in range(episodes):
        obs = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, dones, _ = env.step(action)
            # reshape(-1)[0] accepts both scalars and per-env arrays
            total_reward += float(np.asarray(reward).reshape(-1)[0])
            done = bool(np.asarray(dones).reshape(-1)[0])
        total_rewards.append(total_reward)
        print(f"Episode {episode + 1}: Total Reward = {total_reward}")

    # np.mean([]) would emit a RuntimeWarning; return nan explicitly instead
    avg_reward = float(np.mean(total_rewards)) if total_rewards else float("nan")
    print(f"Average Reward over {episodes} episodes: {avg_reward}")
    return avg_reward

def record_video(env, model, video_length, video_path):
    """Record ``video_length`` steps of ``model`` acting deterministically.

    ``env`` is wrapped in a ``VecVideoRecorder`` whose trigger fires at step 0
    and whose files use the ``"ppo_carracing"`` name prefix. The recorder is
    closed in a ``finally`` block so it is released even when prediction or
    stepping raises.

    Args:
        env: Vectorized environment to wrap.
        model: Object exposing ``predict(obs, deterministic=True)``.
        video_length: Number of steps to run; also passed to the recorder as
            its ``video_length``.
        video_path: Passed to ``VecVideoRecorder`` as its second argument.

    Note:
        NOTE(review): closing the recorder is expected to close the wrapped
        ``env`` too - confirm before reusing ``env`` after this call.
    """
    recorder = VecVideoRecorder(
        env,
        video_path,
        record_video_trigger=lambda step: step == 0,
        video_length=video_length,
        name_prefix="ppo_carracing",
    )
    try:
        obs = recorder.reset()
        for _ in range(video_length):
            action, _ = model.predict(obs, deterministic=True)
            # No manual reset on episode end: SB3 vectorized environments reset
            # themselves, and the observation returned by step() is already the
            # first one of the next episode. Resetting again would discard it.
            obs, _, _, _ = recorder.step(action)
    finally:
        recorder.close()


In [None]:
def main():
    """Run the full pipeline: set up directories, train PPO, then evaluate.

    The environment is closed in a ``finally`` block so that it is released
    whether training and evaluation succeed or raise.
    """
    # Setup
    setup_directories()
    env = create_env(CONFIG["env_name"])

    try:
        # Training
        model = create_model(env)
        train_model(model, env)

        # Evaluation
        print("Final evaluation...")
        final_reward = evaluate_model(model, env, CONFIG["evaluate_episodes"])
        print(f"Final evaluation completed. Average reward: {final_reward}")
    finally:
        env.close()

main()


Final evaluation...
Episode 1: Total Reward = [69.61107]
Episode 2: Total Reward = [425.24808]
Episode 3: Total Reward = [313.83887]
Episode 4: Total Reward = [289.23624]
Episode 5: Total Reward = [167.26285]
Average Reward over 5 episodes: 253.03939819335938
Final evaluation completed. Average reward: 253.03939819335938


In [None]:
# Visualization
def visualize_agent(model_path, video_path="agent_gameplay.mp4", num_episodes=3, fps=60):
    """Roll out a saved PPO model and write annotated frames to a video file.

    For every step the observation is converted from RGB to BGR, resized to
    800x600, overlaid with the episode number and running reward, and appended
    to the video. The writer and the environment are released in a ``finally``
    block, so neither leaks when loading, prediction or stepping fails.

    Args:
        model_path: Path handed to ``PPO.load``.
        video_path: Output file given to ``cv2.VideoWriter``.
        num_episodes: Number of episodes to roll out.
        fps: Frame rate given to ``cv2.VideoWriter``.
            NOTE(review): confirm this matches the environment's render rate,
            otherwise playback speed will differ from the simulation.

    Raises:
        OSError: If the video writer cannot be opened for ``video_path``.
    """
    desired_width = 800
    desired_height = 600
    frame_size = (desired_width, desired_height)

    env = gym.make("CarRacing-v3", render_mode="rgb_array")
    out = None
    try:
        model = PPO.load(model_path)

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(video_path, fourcc, fps, frame_size)
        # A writer that failed to open drops every frame silently, so fail early
        if not out.isOpened():
            raise OSError(f"Could not open video writer for {video_path!r}")

        for episode in range(num_episodes):
            obs, _ = env.reset()
            done = truncated = False
            total_reward = 0

            while not (done or truncated):
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, truncated, _ = env.step(action)
                total_reward += reward

                # The observation itself is used as the frame
                # (assumes obs is an RGB image array - TODO confirm)
                frame = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
                frame = cv2.resize(frame, frame_size,
                                   interpolation=cv2.INTER_LANCZOS4)

                cv2.putText(frame, f"Episode: {episode+1}", (20, 40),
                            cv2.FONT_HERSHEY_DUPLEX, 1, (255, 255, 255), 2)
                cv2.putText(frame, f"Reward: {total_reward:.1f}", (20, 80),
                            cv2.FONT_HERSHEY_DUPLEX, 1, (255, 255, 255), 2)

                out.write(frame)

            print(f"Episode {episode+1} completed with reward: {total_reward:.2f}")
    finally:
        if out is not None:
            out.release()
        env.close()


Episode 1 completed with reward: 924.40
Episode 2 completed with reward: 463.38
Episode 3 completed with reward: 252.38
