In [4]:
# Authenticate this notebook session with the Hugging Face Hub.
from huggingface_hub import notebook_login

# Renders an interactive login widget in the cell output.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Make sure the target model repository exists before training uploads to it.
from huggingface_hub import HfApi, create_repo

# NOTE(review): `api` is not used anywhere in this cell.
api = HfApi()
# exist_ok=True lets this cell be re-run when the repository already exists.
create_repo(repo_id="YoussefTolba/td3-lunarlander-v3", exist_ok=True)


RepoUrl('https://huggingface.co/YoussefTolba/td3-lunarlander-v3', endpoint='https://huggingface.co', repo_type='model', repo_id='YoussefTolba/td3-lunarlander-v3')

# TD3 model

In [6]:
# 1. Install system-level dependencies (Fixes the box2d build error)
!sudo apt-get update
!sudo apt-get install -y swig

# 2. Install Python libraries
# We force a reinstall of gymnasium to ensure box2d links correctly
!pip install gymnasium[box2d] moviepy imageio wandb huggingface_hub

# 3. (Optional) Fix specific Colab rendering issues
!pip install imageio-ffmpeg

0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.81)] [Connecting to security.ub                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
                                                                               Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/u

##  LunarLander-v3

In [7]:
import os
import random
import time
import uuid
from dataclasses import dataclass, asdict
from typing import Optional

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb
from huggingface_hub import HfApi, create_repo, upload_folder

# ==========================================
# 1. Configuration & Hyperparameters
# ==========================================
@dataclass
class TD3Config:
    """Hyperparameters and run settings for TD3 on LunarLander.

    Every field has a default, so ``TD3Config()`` is a complete configuration.
    """

    # ---- Experiment ----
    env_id: str = "LunarLander-v3"
    project_name: str = "td3-lunarlander-continuous"
    # Built once, when the class body executes: every instance created from
    # this class definition shares the same default run name.
    run_name: str = "td3_run_" + str(uuid.uuid4())[:8]
    seed: int = 42

    # ---- Schedule (counted in environment steps) ----
    total_timesteps: int = 100000
    learning_starts: int = 10000

    # ---- Networks / optimisation ----
    hidden_dim: int = 256
    actor_lr: float = 0.0003
    critic_lr: float = 0.0003
    batch_size: int = 256
    buffer_size: int = 1000000
    gamma: float = 0.99
    tau: float = 0.005

    # ---- TD3-specific ----
    policy_noise: float = 0.2
    noise_clip: float = 0.5
    policy_delay: int = 2
    exploration_noise: float = 0.1

    # ---- Logging / saving ----
    eval_freq: int = 5000
    save_model: bool = True
    hf_repo_id: str = "YoussefTolba/td3-lunarlander-v3"

# ==========================================
# 2. Replay Buffer
# ==========================================
class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions with uniform random sampling.

    Transitions are stored in float32 NumPy arrays; ``sample`` returns float32
    tensors placed on ``self.device``.
    """

    def __init__(self, state_dim, action_dim, max_size=1e6):
        """Allocate storage for ``max_size`` transitions.

        Args:
            state_dim: Length of one (flat) state vector.
            action_dim: Length of one action vector.
            max_size: Capacity in transitions; once full, the oldest slots
                are overwritten.
        """
        self.max_size = int(max_size)
        self.ptr = 0   # index of the next slot to write
        self.size = 0  # number of valid transitions currently stored
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # float32 is the dtype sample() hands out anyway; NumPy's default
        # float64 would double the memory use and be down-cast on every sample.
        self.state = np.zeros((self.max_size, state_dim), dtype=np.float32)
        self.action = np.zeros((self.max_size, action_dim), dtype=np.float32)
        self.next_state = np.zeros((self.max_size, state_dim), dtype=np.float32)
        self.reward = np.zeros((self.max_size, 1), dtype=np.float32)
        self.not_done = np.zeros((self.max_size, 1), dtype=np.float32)

    def add(self, state, action, next_state, reward, done):
        """Store one transition at the write pointer and advance it (wrapping)."""
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        # Stored as a mask so the TD target can multiply by it directly.
        self.not_done[self.ptr] = 1. - done

        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random (with replacement).

        Returns:
            Tuple ``(state, action, next_state, reward, not_done)`` of float32
            tensors on ``self.device``.

        Raises:
            ValueError: If the buffer holds no transitions yet.
        """
        if self.size == 0:
            raise ValueError("Cannot sample from an empty replay buffer")

        ind = np.random.randint(0, self.size, size=batch_size)
        fields = (self.state, self.action, self.next_state, self.reward, self.not_done)
        return tuple(torch.FloatTensor(arr[ind]).to(self.device) for arr in fields)

# ==========================================
# 3. Neural Networks
# ==========================================
class Actor(nn.Module):
    """Deterministic policy network mapping a state batch to bounded actions.

    The output is ``max_action * tanh(.)``, so every component lies in
    ``[-max_action, max_action]``.
    """

    def __init__(self, state_dim, action_dim, max_action, hidden_dim=256):
        super().__init__()
        self.max_action = max_action
        # The attribute names l1..l3 are the keys of the saved state_dict.
        self.l1 = nn.Linear(state_dim, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, hidden_dim)
        self.l3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, state):
        """Return the action batch for ``state``."""
        hidden = F.relu(self.l2(F.relu(self.l1(state))))
        squashed = torch.tanh(self.l3(hidden))
        return self.max_action * squashed

class Critic(nn.Module):
    """Twin Q-networks that share an input but have independent weights.

    Layers l1-l3 form the first Q head, l4-l6 the second; these attribute
    names are the keys of the saved state_dict.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        in_features = state_dim + action_dim

        # Q1 head
        self.l1 = nn.Linear(in_features, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, hidden_dim)
        self.l3 = nn.Linear(hidden_dim, 1)

        # Q2 head
        self.l4 = nn.Linear(in_features, hidden_dim)
        self.l5 = nn.Linear(hidden_dim, hidden_dim)
        self.l6 = nn.Linear(hidden_dim, 1)

    def forward(self, state, action):
        """Return ``(q1, q2)``, the two heads' estimates for each (state, action) row."""
        joint = torch.cat((state, action), dim=1)
        first = self.l3(F.relu(self.l2(F.relu(self.l1(joint)))))
        second = self.l6(F.relu(self.l5(F.relu(self.l4(joint)))))
        return first, second

    def Q1(self, state, action):
        """Return only the first head's estimate (used for the actor loss)."""
        joint = torch.cat((state, action), dim=1)
        return self.l3(F.relu(self.l2(F.relu(self.l1(joint)))))

# ==========================================
# 4. TD3 Algorithm
# ==========================================
class TD3:
    """Twin Delayed DDPG agent: actor, twin critic, their target copies and optimizers."""

    def __init__(self, state_dim, action_dim, max_action, config: TD3Config):
        """Build the online/target networks and optimizers from ``config``.

        Args:
            state_dim: Length of a flat state vector.
            action_dim: Length of an action vector.
            max_action: Bound applied to every action component.
            config: Supplies hidden_dim, learning rates, batch_size, gamma, tau,
                policy_noise, noise_clip and policy_delay.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.conf = config
        self.max_action = max_action
        self.total_it = 0  # number of train() calls so far

        self.actor = Actor(state_dim, action_dim, max_action, config.hidden_dim).to(self.device)
        self.actor_target = Actor(state_dim, action_dim, max_action, config.hidden_dim).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=config.actor_lr)

        self.critic = Critic(state_dim, action_dim, config.hidden_dim).to(self.device)
        self.critic_target = Critic(state_dim, action_dim, config.hidden_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=config.critic_lr)

    def select_action(self, state):
        """Return the actor's noise-free action for one state as a flat NumPy array."""
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    def _soft_update(self, net, target_net):
        """Polyak-average ``net``'s parameters into ``target_net`` with rate ``conf.tau``."""
        tau = self.conf.tau
        with torch.no_grad():
            for param, target_param in zip(net.parameters(), target_net.parameters()):
                target_param.copy_(tau * param + (1 - tau) * target_param)

    def train(self, replay_buffer):
        """Run one TD3 update from a sampled mini-batch.

        The critic is updated on every call. The actor and both target networks
        are updated only every ``conf.policy_delay`` calls.

        Returns:
            ``(critic_loss, actor_loss)`` as Python floats; ``actor_loss`` is
            ``None`` on calls where the actor update was skipped.
        """
        self.total_it += 1

        # Sample replay buffer
        state, action, next_state, reward, not_done = replay_buffer.sample(self.conf.batch_size)

        with torch.no_grad():
            # Target policy smoothing: clipped Gaussian noise on the target action.
            noise = (torch.randn_like(action) * self.conf.policy_noise).clamp(-self.conf.noise_clip, self.conf.noise_clip)
            next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

            # Clipped double-Q: bootstrap from the smaller of the two target estimates.
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + not_done * self.conf.gamma * target_Q

        current_Q1, current_Q2 = self.critic(state, action)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = None
        if self.total_it % self.conf.policy_delay == 0:
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            self._soft_update(self.critic, self.critic_target)
            self._soft_update(self.actor, self.actor_target)

        # Compare against None explicitly: truth-testing the tensor would report
        # a loss of exactly 0.0 as "no actor update".
        return critic_loss.item(), (actor_loss.item() if actor_loss is not None else None)

    def save(self, filename):
        """Write critic and actor weights to ``<filename>_critic.pth`` / ``<filename>_actor.pth``."""
        torch.save(self.critic.state_dict(), filename + "_critic.pth")
        torch.save(self.actor.state_dict(), filename + "_actor.pth")

# ==========================================
# 5. Helpers: Evaluation & Recording
# ==========================================
def evaluate_and_record(policy, env_id, seed, step, video_folder="videos", n_episodes=5):
    """Evaluate ``policy`` without exploration noise and record the first episode.

    Args:
        policy: Object exposing ``select_action(np.ndarray) -> np.ndarray``.
        env_id: Gymnasium environment id; created with ``continuous=True``.
        seed: Base seed; episode ``i`` is reset with ``seed + 100 + i``.
        step: Training step, embedded in the video file name.
        video_folder: Directory the video is written to.
        n_episodes: Number of evaluation episodes (must be >= 1).

    Returns:
        ``(mean_reward, video_path)``; ``video_path`` is ``None`` when the
        expected video file was not found.
    """
    # A custom prefix to identify the step
    video_prefix = f"td3-step-{step}"

    eval_env = gym.wrappers.RecordVideo(
        gym.make(env_id, continuous=True, render_mode="rgb_array"),
        video_folder=video_folder,
        name_prefix=video_prefix,
        episode_trigger=lambda episode_idx: episode_idx == 0,  # record only the first episode of this call
        disable_logger=True,
    )

    rewards = []
    try:
        for i in range(n_episodes):
            state, _ = eval_env.reset(seed=seed + 100 + i)
            terminated, truncated = False, False
            episode_reward = 0

            while not (terminated or truncated):
                action = policy.select_action(np.array(state))
                state, reward, terminated, truncated, _ = eval_env.step(action)
                episode_reward += reward

            rewards.append(episode_reward)
    finally:
        # Close before looking for the video file, and also when an episode
        # raised, so the environment and recorder are always released.
        eval_env.close()

    # Format: {prefix}-episode-0.mp4
    expected_path = os.path.join(video_folder, f"{video_prefix}-episode-0.mp4")
    video_path = expected_path if os.path.exists(expected_path) else None

    return np.mean(rewards), video_path

# ==========================================
# 6. Main Training Loop
# ==========================================
def _build_model_card(conf, mean_eval_score):
    """Return the README.md text (YAML front matter + Markdown) for the Hub repo."""
    # library_name is `pytorch`: the uploaded files are raw torch.save state
    # dicts, not Stable-Baselines3 archives.
    return f"""---
tags:
- deep-reinforcement-learning
- reinforcement-learning
- gymnasium
- td3
library_name: pytorch
env_id: {conf.env_id}
model-index:
- name: TD3
  results:
  - task:
      type: reinforcement-learning
      name: reinforcement-learning
    dataset:
      name: {conf.env_id}
      type: {conf.env_id}
    metrics:
    - type: mean_reward
      value: {mean_eval_score:.2f}
      name: mean_reward
---

# TD3 Agent for {conf.env_id}

This is a **Twin Delayed DDPG (TD3)** agent trained on `{conf.env_id}`.

## Experiment Results
- **Final Mean Reward:** {mean_eval_score:.2f}
- **Total Timesteps:** {conf.total_timesteps}

## Hyperparameters
```python
{asdict(conf)}
```
"""


def _save_and_upload(policy, conf, mean_eval_score):
    """Write checkpoints and README locally, then push the folder to the Hub (best effort)."""
    save_path = "td3_model_artifacts"
    os.makedirs(save_path, exist_ok=True)

    policy.save(os.path.join(save_path, "td3_lunarlander"))

    with open(os.path.join(save_path, "README.md"), "w", encoding="utf-8") as f:
        f.write(_build_model_card(conf, mean_eval_score))

    # The upload is deliberately best effort: a Hub failure must not discard
    # the artifacts that were just written to disk.
    try:
        print(f"Uploading model to Hugging Face Hub: {conf.hf_repo_id}")
        create_repo(repo_id=conf.hf_repo_id, exist_ok=True, repo_type="model")
        upload_folder(
            folder_path=save_path,
            repo_id=conf.hf_repo_id,
            repo_type="model",
            commit_message="Upload TD3 agent with video and charts logs"
        )
        print("Upload success!")
    except Exception as e:
        print(f"HF Upload failed (check your token): {e}")


def run_training():
    """Train TD3 with the default ``TD3Config``, logging to W&B.

    The first ``learning_starts`` steps use uniformly random actions. After
    that the policy acts with Gaussian exploration noise and is updated once
    per environment step. Every ``eval_freq`` steps the policy is evaluated
    and a video is logged. When ``save_model`` is set, the weights and a model
    card are saved and uploaded to the Hugging Face Hub.

    The W&B run is always finished and the environment always closed, even
    when training raises.
    """
    conf = TD3Config()

    # === FIXED WANDB INIT ===
    # monitor_gym=False prevents the AttributeError with RecordVideo
    wandb.init(
        project=conf.project_name,
        name=conf.run_name,
        config=asdict(conf),
        monitor_gym=False,
        save_code=True
    )

    env = None
    exit_code = 1  # reported to W&B unless the run reaches the end
    try:
        env = gym.make(conf.env_id, continuous=True)
        env.action_space.seed(conf.seed)
        torch.manual_seed(conf.seed)
        np.random.seed(conf.seed)

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        replay_buffer = ReplayBuffer(state_dim, action_dim, conf.buffer_size)
        policy = TD3(state_dim, action_dim, max_action, conf)

        state, _ = env.reset(seed=conf.seed)
        episode_reward = 0
        episode_timesteps = 0
        episode_num = 0
        mean_eval_score = None  # stays None if no evaluation runs in the loop

        print(f"---------------------------------------")
        print(f"Starting Training: {conf.env_id} | TD3 | Seed: {conf.seed}")
        print(f"---------------------------------------")

        for t in range(int(conf.total_timesteps)):
            episode_timesteps += 1

            if t < conf.learning_starts:
                action = env.action_space.sample()
            else:
                action = (
                    policy.select_action(np.array(state))
                    + np.random.normal(0, max_action * conf.exploration_noise, size=action_dim)
                ).clip(-max_action, max_action)

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Only true termination is stored as "done": a time-limit
            # truncation must still bootstrap from next_state.
            replay_buffer.add(state, action, next_state, reward, float(terminated))
            state = next_state
            episode_reward += reward

            if t >= conf.learning_starts:
                critic_loss, actor_loss = policy.train(replay_buffer)

                if t % 100 == 0:
                    logs = {"train/critic_loss": critic_loss}
                    if actor_loss is not None:
                        logs["train/actor_loss"] = actor_loss
                    wandb.log(logs, step=t)

            if done:
                wandb.log({
                    "train/episode_reward": episode_reward,
                    "train/episode_length": episode_timesteps
                }, step=t)

                print(f"Step {t} | Episode {episode_num} | Reward: {episode_reward:.2f}")

                state, _ = env.reset()
                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1

            if (t + 1) % conf.eval_freq == 0:
                print(f"Evaluating at step {t+1}...")
                mean_eval_score, video_path = evaluate_and_record(policy, conf.env_id, conf.seed, step=t+1)

                # Logged at step=t so it shares the step index of this
                # iteration's training logs.
                wandb.log({"eval/mean_reward": mean_eval_score}, step=t)

                if video_path:
                    print(f"Uploading video: {video_path}")
                    wandb.log({"eval/video": wandb.Video(video_path, fps=30, format="mp4")}, step=t)

        print("Training Complete.")

        if conf.save_model:
            if mean_eval_score is None:
                # No evaluation ran (total_timesteps < eval_freq), so evaluate
                # once here: the model card needs a score to report.
                mean_eval_score, _ = evaluate_and_record(
                    policy, conf.env_id, conf.seed, step=conf.total_timesteps
                )
            _save_and_upload(policy, conf, mean_eval_score)

        exit_code = 0
    finally:
        if env is not None:
            env.close()
        wandb.finish(exit_code=exit_code)

# Start a training run using the default TD3Config.
run_training()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 [REDACTED: W&B API key pasted at the prompt; revoke and rotate this key]


[34m[1mwandb[0m: Enter your choice:

 [REDACTED: W&B API key pasted at the prompt; revoke and rotate this key]


[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myousefyousefyousef335[0m ([33myousefyousefyousef335-cairo-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


---------------------------------------
Starting Training: LunarLander-v3 | TD3 | Seed: 42
---------------------------------------
Step 134 | Episode 0 | Reward: -267.89
Step 261 | Episode 1 | Reward: -244.19
Step 405 | Episode 2 | Reward: -140.08
Step 492 | Episode 3 | Reward: -385.30
Step 623 | Episode 4 | Reward: -252.29
Step 777 | Episode 5 | Reward: -106.46
Step 869 | Episode 6 | Reward: -337.21
Step 1019 | Episode 7 | Reward: -199.72
Step 1180 | Episode 8 | Reward: -165.51
Step 1265 | Episode 9 | Reward: -42.14
Step 1372 | Episode 10 | Reward: -350.00
Step 1478 | Episode 11 | Reward: -133.85
Step 1602 | Episode 12 | Reward: -217.96
Step 1690 | Episode 13 | Reward: -439.36
Step 1883 | Episode 14 | Reward: -343.79
Step 1981 | Episode 15 | Reward: -241.82
Step 2109 | Episode 16 | Reward: -59.82
Step 2202 | Episode 17 | Reward: -332.47
Step 2289 | Episode 18 | Reward: -415.87
Step 2404 | Episode 19 | Reward: -118.47
Step 2554 | Episode 20 | Reward: -166.47
Step 2619 | Episode 21 | Re

  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"


Uploading video: videos/td3-step-5000-episode-0.mp4
Step 5083 | Episode 44 | Reward: -149.25
Step 5181 | Episode 45 | Reward: -340.65
Step 5267 | Episode 46 | Reward: -252.40
Step 5410 | Episode 47 | Reward: -558.30
Step 5540 | Episode 48 | Reward: -229.50
Step 5638 | Episode 49 | Reward: -313.85
Step 5725 | Episode 50 | Reward: -28.70
Step 5808 | Episode 51 | Reward: -411.59
Step 5876 | Episode 52 | Reward: -54.42
Step 5974 | Episode 53 | Reward: -381.23
Step 6064 | Episode 54 | Reward: -82.81
Step 6135 | Episode 55 | Reward: -77.13
Step 6325 | Episode 56 | Reward: -420.70
Step 6472 | Episode 57 | Reward: -408.22
Step 6567 | Episode 58 | Reward: -251.98
Step 6709 | Episode 59 | Reward: -157.51
Step 6854 | Episode 60 | Reward: -69.47
Step 6981 | Episode 61 | Reward: -280.15
Step 7072 | Episode 62 | Reward: -442.85
Step 7233 | Episode 63 | Reward: -68.36
Step 7403 | Episode 64 | Reward: -123.19
Step 7530 | Episode 65 | Reward: -375.03
Step 7647 | Episode 66 | Reward: -143.69
Step 7746 |

  logger.warn(


Uploading video: videos/td3-step-10000-episode-0.mp4
Step 10065 | Episode 88 | Reward: -466.58
Step 10182 | Episode 89 | Reward: -122.87
Step 10282 | Episode 90 | Reward: -142.48
Step 10402 | Episode 91 | Reward: -55.95
Step 10526 | Episode 92 | Reward: -282.45
Step 10644 | Episode 93 | Reward: -294.01
Step 10759 | Episode 94 | Reward: -293.58
Step 10862 | Episode 95 | Reward: -327.39
Step 11097 | Episode 96 | Reward: -363.35
Step 11264 | Episode 97 | Reward: -387.48
Step 11409 | Episode 98 | Reward: -319.44
Step 11485 | Episode 99 | Reward: -551.32
Step 11783 | Episode 100 | Reward: -480.05
Step 11911 | Episode 101 | Reward: -451.33
Step 12080 | Episode 102 | Reward: -38.04
Step 12660 | Episode 103 | Reward: -214.37
Step 12750 | Episode 104 | Reward: -358.49
Step 13750 | Episode 105 | Reward: -39.47
Step 14330 | Episode 106 | Reward: -66.60
Step 14635 | Episode 107 | Reward: 9.61
Evaluating at step 15000...




Uploading video: videos/td3-step-15000-episode-0.mp4
Step 15114 | Episode 108 | Reward: 224.63
Step 15776 | Episode 109 | Reward: 199.98
Step 16316 | Episode 110 | Reward: -88.86
Step 16858 | Episode 111 | Reward: -94.74
Step 17432 | Episode 112 | Reward: 258.53
Step 17883 | Episode 113 | Reward: -69.25
Step 18427 | Episode 114 | Reward: -267.88
Step 18936 | Episode 115 | Reward: -128.28
Step 19672 | Episode 116 | Reward: 140.35
Evaluating at step 20000...




Uploading video: videos/td3-step-20000-episode-0.mp4
Step 20074 | Episode 117 | Reward: 156.23
Step 20292 | Episode 118 | Reward: 34.54
Step 20553 | Episode 119 | Reward: -1.44
Step 20807 | Episode 120 | Reward: -15.68
Step 21266 | Episode 121 | Reward: -31.37
Step 21982 | Episode 122 | Reward: 222.68
Step 22647 | Episode 123 | Reward: 243.90
Step 23113 | Episode 124 | Reward: 270.79
Step 23441 | Episode 125 | Reward: 230.12
Step 24049 | Episode 126 | Reward: 170.00
Step 24245 | Episode 127 | Reward: 12.09
Step 24694 | Episode 128 | Reward: -25.40
Step 24938 | Episode 129 | Reward: -7.80
Evaluating at step 25000...




Uploading video: videos/td3-step-25000-episode-0.mp4
Step 25286 | Episode 130 | Reward: -83.67
Step 25672 | Episode 131 | Reward: 155.69
Step 26192 | Episode 132 | Reward: 247.02
Step 26614 | Episode 133 | Reward: -40.03
Step 26873 | Episode 134 | Reward: 15.86
Step 27191 | Episode 135 | Reward: 240.30
Step 27665 | Episode 136 | Reward: 271.04
Step 27970 | Episode 137 | Reward: 255.21
Step 28365 | Episode 138 | Reward: 183.88
Step 28655 | Episode 139 | Reward: -79.67
Step 28918 | Episode 140 | Reward: -65.25
Step 29337 | Episode 141 | Reward: -24.60
Step 29691 | Episode 142 | Reward: -173.08
Evaluating at step 30000...




Uploading video: videos/td3-step-30000-episode-0.mp4
Step 30254 | Episode 143 | Reward: 195.44
Step 30537 | Episode 144 | Reward: -51.30
Step 30949 | Episode 145 | Reward: 238.50
Step 31375 | Episode 146 | Reward: 208.71
Step 31658 | Episode 147 | Reward: -202.46
Step 31954 | Episode 148 | Reward: -17.79
Step 32208 | Episode 149 | Reward: 3.13
Step 32428 | Episode 150 | Reward: -82.28
Step 33340 | Episode 151 | Reward: 115.11
Step 33769 | Episode 152 | Reward: 240.28
Step 34099 | Episode 153 | Reward: 266.88
Step 34544 | Episode 154 | Reward: 215.11
Evaluating at step 35000...




Uploading video: videos/td3-step-35000-episode-0.mp4
Step 35155 | Episode 155 | Reward: 232.69
Step 35597 | Episode 156 | Reward: 234.48
Step 35956 | Episode 157 | Reward: -34.85
Step 36192 | Episode 158 | Reward: -23.08
Step 36486 | Episode 159 | Reward: -58.20
Step 36665 | Episode 160 | Reward: -97.19
Step 37206 | Episode 161 | Reward: -125.79
Step 37443 | Episode 162 | Reward: -60.86
Step 37608 | Episode 163 | Reward: -110.72
Step 38576 | Episode 164 | Reward: 134.51
Step 39072 | Episode 165 | Reward: -99.91
Step 39806 | Episode 166 | Reward: -62.68
Evaluating at step 40000...




Uploading video: videos/td3-step-40000-episode-0.mp4
Step 40093 | Episode 167 | Reward: -59.09
Step 40617 | Episode 168 | Reward: -54.19
Step 41246 | Episode 169 | Reward: -169.82
Step 41882 | Episode 170 | Reward: -135.53
Step 42377 | Episode 171 | Reward: -172.67
Step 42650 | Episode 172 | Reward: -73.93
Step 42805 | Episode 173 | Reward: -91.40
Step 43144 | Episode 174 | Reward: -152.28
Step 43278 | Episode 175 | Reward: -109.01
Step 43538 | Episode 176 | Reward: -164.13
Step 43948 | Episode 177 | Reward: -105.48
Step 44436 | Episode 178 | Reward: -175.98
Step 44580 | Episode 179 | Reward: -82.51
Step 44849 | Episode 180 | Reward: -93.25
Step 44999 | Episode 181 | Reward: -88.04
Evaluating at step 45000...




Uploading video: videos/td3-step-45000-episode-0.mp4
Step 45164 | Episode 182 | Reward: -69.06
Step 45698 | Episode 183 | Reward: -88.79
Step 46150 | Episode 184 | Reward: -178.56
Step 46304 | Episode 185 | Reward: -85.20
Step 46521 | Episode 186 | Reward: -114.37
Step 46783 | Episode 187 | Reward: -118.13
Step 47210 | Episode 188 | Reward: 158.61
Step 47853 | Episode 189 | Reward: 168.57
Step 48659 | Episode 190 | Reward: 117.72
Step 49110 | Episode 191 | Reward: 253.45
Step 49446 | Episode 192 | Reward: 191.90
Step 49629 | Episode 193 | Reward: -61.34
Step 49932 | Episode 194 | Reward: 215.01
Evaluating at step 50000...




Uploading video: videos/td3-step-50000-episode-0.mp4
Step 50253 | Episode 195 | Reward: -121.08
Step 50397 | Episode 196 | Reward: -60.30
Step 50941 | Episode 197 | Reward: 177.51
Step 51326 | Episode 198 | Reward: 161.60
Step 51504 | Episode 199 | Reward: -72.36
Step 51676 | Episode 200 | Reward: -85.53
Step 51848 | Episode 201 | Reward: -43.96
Step 52230 | Episode 202 | Reward: 202.15
Step 52486 | Episode 203 | Reward: -82.38
Step 52696 | Episode 204 | Reward: -57.20
Step 53359 | Episode 205 | Reward: 190.71
Step 53885 | Episode 206 | Reward: 115.75
Step 54061 | Episode 207 | Reward: -43.59
Step 54262 | Episode 208 | Reward: -80.67
Step 54621 | Episode 209 | Reward: 202.18
Step 54857 | Episode 210 | Reward: -48.03
Evaluating at step 55000...




Uploading video: videos/td3-step-55000-episode-0.mp4
Step 55738 | Episode 211 | Reward: 75.09
Step 56327 | Episode 212 | Reward: 238.23
Step 56794 | Episode 213 | Reward: 238.26
Step 57063 | Episode 214 | Reward: 217.57
Step 57571 | Episode 215 | Reward: -102.04
Step 58039 | Episode 216 | Reward: 243.87
Step 58504 | Episode 217 | Reward: 204.22
Step 59114 | Episode 218 | Reward: 204.70
Step 59612 | Episode 219 | Reward: 146.24
Evaluating at step 60000...




Uploading video: videos/td3-step-60000-episode-0.mp4
Step 60107 | Episode 220 | Reward: -151.06
Step 60520 | Episode 221 | Reward: 253.83
Step 60911 | Episode 222 | Reward: 261.31
Step 61245 | Episode 223 | Reward: 228.80
Step 61453 | Episode 224 | Reward: -12.02
Step 61905 | Episode 225 | Reward: 224.50
Step 62201 | Episode 226 | Reward: 225.72
Step 62544 | Episode 227 | Reward: 284.95
Step 62838 | Episode 228 | Reward: 215.79
Step 63402 | Episode 229 | Reward: 75.15
Step 63975 | Episode 230 | Reward: 186.32
Step 64267 | Episode 231 | Reward: 230.47
Step 64601 | Episode 232 | Reward: 234.40
Step 64959 | Episode 233 | Reward: 232.20
Evaluating at step 65000...




Uploading video: videos/td3-step-65000-episode-0.mp4
Step 65271 | Episode 234 | Reward: 209.61
Step 65730 | Episode 235 | Reward: 169.39
Step 66598 | Episode 236 | Reward: 157.39
Step 67170 | Episode 237 | Reward: -167.06
Step 67479 | Episode 238 | Reward: 236.41
Step 67983 | Episode 239 | Reward: 152.82
Step 68279 | Episode 240 | Reward: 247.36
Step 68562 | Episode 241 | Reward: 252.54
Step 68918 | Episode 242 | Reward: 245.17
Step 69236 | Episode 243 | Reward: 200.75
Step 69525 | Episode 244 | Reward: 264.09
Step 69850 | Episode 245 | Reward: 180.27
Evaluating at step 70000...




Uploading video: videos/td3-step-70000-episode-0.mp4
Step 70187 | Episode 246 | Reward: 243.11
Step 70564 | Episode 247 | Reward: 238.85
Step 70879 | Episode 248 | Reward: 238.46
Step 71169 | Episode 249 | Reward: 208.75
Step 71571 | Episode 250 | Reward: 186.88
Step 71897 | Episode 251 | Reward: 236.72
Step 72250 | Episode 252 | Reward: 226.79
Step 72756 | Episode 253 | Reward: 207.79
Step 73042 | Episode 254 | Reward: 245.43
Step 73265 | Episode 255 | Reward: -18.25
Step 73647 | Episode 256 | Reward: 190.51
Step 74114 | Episode 257 | Reward: 189.92
Step 74287 | Episode 258 | Reward: 4.11
Step 74511 | Episode 259 | Reward: 6.02
Step 74789 | Episode 260 | Reward: 258.17
Evaluating at step 75000...




Uploading video: videos/td3-step-75000-episode-0.mp4
Step 75266 | Episode 261 | Reward: 213.36
Step 75603 | Episode 262 | Reward: 245.33
Step 76423 | Episode 263 | Reward: 143.38
Step 76901 | Episode 264 | Reward: 209.68
Step 77114 | Episode 265 | Reward: -53.14
Step 77507 | Episode 266 | Reward: 196.24
Step 78161 | Episode 267 | Reward: 157.55
Step 78568 | Episode 268 | Reward: 244.38
Step 78896 | Episode 269 | Reward: 204.10
Step 79298 | Episode 270 | Reward: -109.09
Step 79634 | Episode 271 | Reward: 231.14
Step 79992 | Episode 272 | Reward: 207.44
Evaluating at step 80000...




Uploading video: videos/td3-step-80000-episode-0.mp4
Step 80436 | Episode 273 | Reward: 196.13
Step 80821 | Episode 274 | Reward: 243.23
Step 81069 | Episode 275 | Reward: -57.81
Step 81319 | Episode 276 | Reward: -20.73
Step 81640 | Episode 277 | Reward: -35.44
Step 81901 | Episode 278 | Reward: -63.43
Step 82354 | Episode 279 | Reward: 175.41
Step 82806 | Episode 280 | Reward: 164.20
Step 83322 | Episode 281 | Reward: 124.38
Step 83750 | Episode 282 | Reward: 239.28
Step 84089 | Episode 283 | Reward: 227.79
Step 84410 | Episode 284 | Reward: -76.07
Step 84770 | Episode 285 | Reward: 218.14
Evaluating at step 85000...




Uploading video: videos/td3-step-85000-episode-0.mp4
Step 85332 | Episode 286 | Reward: 183.46
Step 85667 | Episode 287 | Reward: 196.79
Step 86272 | Episode 288 | Reward: 212.10
Step 86921 | Episode 289 | Reward: 231.86
Step 87312 | Episode 290 | Reward: 281.89
Step 87529 | Episode 291 | Reward: -2.80
Step 87910 | Episode 292 | Reward: 210.00
Step 88464 | Episode 293 | Reward: -116.58
Step 88864 | Episode 294 | Reward: 233.98
Step 89208 | Episode 295 | Reward: -6.95
Step 89492 | Episode 296 | Reward: 46.38
Step 89890 | Episode 297 | Reward: 243.99
Evaluating at step 90000...




Uploading video: videos/td3-step-90000-episode-0.mp4
Step 90369 | Episode 298 | Reward: 244.55
Step 90884 | Episode 299 | Reward: 221.44
Step 91381 | Episode 300 | Reward: 227.49
Step 92060 | Episode 301 | Reward: 196.72
Step 92595 | Episode 302 | Reward: 191.68
Step 93001 | Episode 303 | Reward: 193.98
Step 93976 | Episode 304 | Reward: 168.10
Step 94341 | Episode 305 | Reward: -43.16
Step 94720 | Episode 306 | Reward: 215.33
Evaluating at step 95000...




Uploading video: videos/td3-step-95000-episode-0.mp4
Step 95411 | Episode 307 | Reward: 194.96
Step 96019 | Episode 308 | Reward: 227.12
Step 96678 | Episode 309 | Reward: -187.17
Step 97083 | Episode 310 | Reward: 222.64
Step 97462 | Episode 311 | Reward: 250.36
Step 98168 | Episode 312 | Reward: 147.79
Step 98503 | Episode 313 | Reward: 220.61
Step 98873 | Episode 314 | Reward: 240.02
Step 99246 | Episode 315 | Reward: 238.02
Step 99775 | Episode 316 | Reward: 198.46
Evaluating at step 100000...




Uploading video: videos/td3-step-100000-episode-0.mp4
Training Complete.
Uploading model to Hugging Face Hub: yousefyousefyousef335/td3-lunarlander-v3
HF Upload failed (check your token): (Request ID: Root=1-6939c45f-5862bcb4438d0b110f9c7ad0;e6650954-d85f-4e6a-925c-f409673039e8)

403 Forbidden: You don't have the rights to create a model under the namespace "yousefyousefyousef335".
Cannot access content at: https://huggingface.co/api/repos/create.
Make sure your token has the correct permissions.


In [8]:

# Manual re-upload of the saved artifacts folder to the YoussefTolba repo.
# NOTE(review): `api` is not used anywhere in this cell.
api = HfApi()
# exist_ok=True lets this cell be re-run when the repository already exists.
create_repo(repo_id="YoussefTolba/td3-lunarlander-v3", exist_ok=True, repo_type="model")
# Pushes every file in the local folder as a single commit.
upload_folder(
    folder_path="td3_model_artifacts",
    repo_id="YoussefTolba/td3-lunarlander-v3",
    repo_type="model",
    commit_message="Upload TD3 agent with video and charts logs"
)
print("Upload success!")


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...td3_lunarlander_actor.pth:  66%|######5   |  183kB /  278kB            

  ...d3_lunarlander_critic.pth:  66%|######5   |  367kB /  556kB            

Upload success!


##  CarRacing-v3

In [1]:
import os
import random
import time
import uuid
from dataclasses import dataclass, asdict
from typing import Optional, Tuple

# --- KAGGLE SPECIFIC: START VIRTUAL DISPLAY ---
from pyvirtualdisplay import Display
try:
    # Create a virtual screen to trick OpenGL
    # `display` stays at module scope so the handle remains available later.
    display = Display(visible=0, size=(1400, 900))
    display.start()
    print("Virtual display started successfully.")
except Exception as e:
    # Best effort: report the failure and carry on.
    # NOTE(review): presumably rendering fails later if no display is available; confirm.
    print(f"Failed to start virtual display: {e}")
# ---------------------------------------------

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb
from huggingface_hub import HfApi, create_repo, upload_folder
from kaggle_secrets import UserSecretsClient # For secure Auth

# ==========================================
# 0. Setup WandB Login for Kaggle
# ==========================================
try:
    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("wandb_api_key")
    wandb.login(key=wandb_api_key)
    print("Logged into WandB via Kaggle Secrets.")
except Exception as e:
    # `except Exception` rather than a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate. The underlying error is printed because
    # this branch is also reached when the secret exists but the login fails.
    print(
        "Could not find 'wandb_api_key' in Kaggle Secrets "
        f"({type(e).__name__}: {e}). Falling back to interactive login."
    )
    wandb.login()

# ==========================================
# 1. Configuration & Hyperparameters
# ==========================================
@dataclass
class TD3Config:
    """Experiment settings and hyperparameters for the TD3 CarRacing run."""

    # Experiment Settings
    env_id: str = "CarRacing-v3"
    project_name: str = "td3-carracing6"
    # NOTE: this f-string is evaluated once, when the class body runs, so every
    # TD3Config() built from this definition shares the same random suffix.
    run_name: str = f"td3_car_{str(uuid.uuid4())[:8]}"
    seed: int = 42

    # Training Duration
    total_timesteps: int = 500_000
    # NOTE(review): not read by the training code in this cell (the custom
    # clipping window there is hard-coded) — confirm whether it is still needed.
    moving_forwards_timsteps: int = 2000
    # Steps of random actions collected before gradient updates begin.
    learning_starts: int = 10_000

    # Hyperparameters
    hidden_dim: int = 256
    actor_lr: float = 1e-4
    critic_lr: float = 1e-4
    batch_size: int = 64
    buffer_size: int = 100_000
    gamma: float = 0.99  # discount factor used in the TD target
    tau: float = 0.005   # interpolation weight for the soft target-network update

    # TD3 Specifics
    policy_noise: float = 0.2       # std of the noise added to the target action
    noise_clip: float = 0.5         # clamp bound for that target noise
    policy_delay: int = 2           # actor/targets are updated every `policy_delay` critic updates
    exploration_noise: float = 0.1  # std (scaled by max_action) of the noise added while acting

    # Logging & Saving
    eval_freq: int = 10_000
    save_model: bool = True
    # NOTE(review): not referenced by the training code in this cell.
    hf_repo_id: str = "yousefyousefyousef335/td3-carracing-v3"

# ==========================================
# 2. Preprocessing & Wrappers
# ==========================================
class ImageTransposeWrapper(gym.ObservationWrapper):
    """Observation wrapper that moves the last image axis to the front.

    The declared observation space is rewritten accordingly, i.e. an
    (H, W, C) uint8 space becomes a (C, H, W) uint8 space.
    """

    def __init__(self, env):
        super().__init__(env)
        src_shape = self.observation_space.shape
        channel_first_shape = (src_shape[2], src_shape[0], src_shape[1])
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=channel_first_shape,
            dtype=np.uint8,
        )

    def observation(self, observation):
        """Return ``observation`` with axes reordered from (0, 1, 2) to (2, 0, 1)."""
        channel_first = np.transpose(observation, axes=(2, 0, 1))
        return channel_first

# ==========================================
# 3. Replay Buffer (Optimized for Images)
# ==========================================
class ReplayBuffer:
    """Fixed-capacity circular store of transitions.

    Image observations are kept as uint8 to save memory and are only turned
    into floats (scaled by 1/255) when a batch is sampled.
    """

    def __init__(self, state_shape, action_dim, max_size=1e5):
        capacity = int(max_size)
        self.max_size = capacity
        self.ptr = 0    # index of the slot the next transition is written to
        self.size = 0   # number of valid transitions currently stored
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        frames_shape = (capacity, *state_shape)
        self.state = np.zeros(frames_shape, dtype=np.uint8)
        self.action = np.zeros((capacity, action_dim), dtype=np.float32)
        self.next_state = np.zeros(frames_shape, dtype=np.uint8)
        self.reward = np.zeros((capacity, 1), dtype=np.float32)
        self.not_done = np.zeros((capacity, 1), dtype=np.float32)

    def add(self, state, action, next_state, reward, done):
        """Write one transition at the current slot, overwriting the oldest entry when full."""
        slot = self.ptr
        self.state[slot] = state
        self.action[slot] = action
        self.next_state[slot] = next_state
        self.reward[slot] = reward
        self.not_done[slot] = 1. - done

        # Advance the write position with wrap-around and grow the fill count up to capacity.
        self.ptr = (slot + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly (with replacement).

        Returns a tuple ``(state, action, next_state, reward, not_done)`` of float
        tensors on ``self.device``; the two image tensors are divided by 255.
        """
        ind = np.random.randint(0, self.size, size=batch_size)

        def as_float_tensor(array):
            return torch.FloatTensor(array).to(self.device)

        states = as_float_tensor(self.state[ind]) / 255.0
        actions = as_float_tensor(self.action[ind])
        next_states = as_float_tensor(self.next_state[ind]) / 255.0
        rewards = as_float_tensor(self.reward[ind])
        not_dones = as_float_tensor(self.not_done[ind])
        return (states, actions, next_states, rewards, not_dones)

# ==========================================
# 4. Neural Networks (CNN + MLP)
# ==========================================
class CNNEncoder(nn.Module):
    """Convolutional feature extractor: three Conv2d+ReLU stages, then a flatten.

    ``out_dim`` is fixed at 64 * 8 * 8, which is the flattened size produced
    for a 96x96 input image.
    """

    def __init__(self, input_channels=3):
        super().__init__()
        # (out_channels, kernel_size, stride) for each stage.
        # For a (3, 96, 96) input the maps are (32, 23, 23) -> (64, 10, 10) -> (64, 8, 8).
        stage_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

        modules = []
        in_channels = input_channels
        for out_channels, kernel, stride in stage_specs:
            modules.append(nn.Conv2d(in_channels, out_channels, kernel_size=kernel, stride=stride))
            modules.append(nn.ReLU())
            in_channels = out_channels
        modules.append(nn.Flatten())

        self.net = nn.Sequential(*modules)
        self.out_dim = 64 * 8 * 8

    def forward(self, x):
        """Return the flattened convolutional features of ``x``."""
        features = self.net(x)
        return features

class Actor(nn.Module):
    """Deterministic policy network: CNN encoder followed by a three-layer MLP.

    The output is ``max_action * tanh(...)``, so every action component lies in
    ``[-max_action, max_action]``.
    """

    def __init__(self, action_dim, max_action, hidden_dim=256):
        super().__init__()
        self.encoder = CNNEncoder()

        feature_dim = self.encoder.out_dim
        self.l1 = nn.Linear(feature_dim, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, hidden_dim)
        self.l3 = nn.Linear(hidden_dim, action_dim)
        self.max_action = max_action

    def forward(self, state):
        """Map a batch of image states to bounded actions."""
        hidden = self.encoder(state)
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        squashed = torch.tanh(self.l3(hidden))
        return self.max_action * squashed

class Critic(nn.Module):
    """Twin Q-networks, each with its own CNN encoder and MLP head.

    ``forward`` returns both Q estimates; ``Q1`` returns only the first one.
    """

    def __init__(self, action_dim, hidden_dim=256):
        super().__init__()
        self.encoder1 = CNNEncoder()
        self.encoder2 = CNNEncoder()

        # Head of the first Q-function
        q1_in = self.encoder1.out_dim + action_dim
        self.l1 = nn.Linear(q1_in, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, hidden_dim)
        self.l3 = nn.Linear(hidden_dim, 1)

        # Head of the second Q-function
        q2_in = self.encoder2.out_dim + action_dim
        self.l4 = nn.Linear(q2_in, hidden_dim)
        self.l5 = nn.Linear(hidden_dim, hidden_dim)
        self.l6 = nn.Linear(hidden_dim, 1)

    def _q_value(self, encoder, hidden_layers, output_layer, state, action):
        """Encode ``state``, append ``action`` and run the result through one MLP head."""
        x = torch.cat([encoder(state), action], 1)
        for layer in hidden_layers:
            x = F.relu(layer(x))
        return output_layer(x)

    def forward(self, state, action):
        """Return the pair ``(q1, q2)`` for the given state/action batch."""
        q1 = self.Q1(state, action)
        q2 = self._q_value(self.encoder2, (self.l4, self.l5), self.l6, state, action)
        return q1, q2

    def Q1(self, state, action):
        """Return only the first Q estimate."""
        return self._q_value(self.encoder1, (self.l1, self.l2), self.l3, state, action)

# ==========================================
# 5. TD3 Algorithm
# ==========================================
class TD3:
    """Twin Delayed DDPG (TD3) agent with image-based actor and twin critics.

    Owns the online and target networks plus their Adam optimizers, and
    implements action selection, a single training update and checkpointing.
    """

    def __init__(self, action_dim, max_action, config: TD3Config):
        """Build the networks and optimizers.

        Args:
            action_dim: Number of action components produced by the actor.
            max_action: Scale of the actor output; also the bound used when
                clamping the smoothed target action.
            config: Hyperparameters (learning rates, gamma, tau, noise settings, ...).
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.conf = config
        self.max_action = max_action
        self.total_it = 0  # number of train() calls so far; drives the delayed actor update

        self.actor = Actor(action_dim, max_action, config.hidden_dim).to(self.device)
        self.actor_target = Actor(action_dim, max_action, config.hidden_dim).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=config.actor_lr)

        self.critic = Critic(action_dim, config.hidden_dim).to(self.device)
        self.critic_target = Critic(action_dim, config.hidden_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=config.critic_lr)

    def select_action(self, state):
        """Return the actor's action for a single observation.

        Args:
            state: One observation; it is converted to a float tensor, given a
                batch dimension and divided by 255 before being fed to the actor.

        Returns:
            1-D NumPy array with the action.
        """
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device) / 255.0
        # Pure inference: no autograd graph is needed for acting.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    def train(self, replay_buffer):
        """Run one TD3 update from a sampled batch.

        The critics are updated on every call; the actor and both target
        networks are updated only every ``conf.policy_delay`` calls.

        Args:
            replay_buffer: Object whose ``sample(batch_size)`` returns
                ``(state, action, next_state, reward, not_done)`` tensors.

        Returns:
            ``(critic_loss, actor_loss)`` as Python floats; ``actor_loss`` is
            ``None`` on calls where the actor was not updated.
        """
        self.total_it += 1

        state, action, next_state, reward, not_done = replay_buffer.sample(self.conf.batch_size)

        with torch.no_grad():
            # Target policy smoothing: clipped Gaussian noise on the target action.
            noise = (torch.randn_like(action) * self.conf.policy_noise).clamp(-self.conf.noise_clip, self.conf.noise_clip)
            next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

            # Clipped double-Q: bootstrap from the smaller of the two target estimates.
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + not_done * self.conf.gamma * target_Q

        current_Q1, current_Q2 = self.critic(state, action)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = None
        if self.total_it % self.conf.policy_delay == 0:
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            self._soft_update(self.critic, self.critic_target)
            self._soft_update(self.actor, self.actor_target)

        # Compare against None explicitly: tensor truthiness would turn a loss of
        # exactly 0.0 into "no actor update happened".
        return critic_loss.item(), (actor_loss.item() if actor_loss is not None else None)

    def _soft_update(self, online, target):
        """Move ``target``'s parameters towards ``online``'s by a factor of ``conf.tau``."""
        tau = self.conf.tau
        for param, target_param in zip(online.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def save(self, filename):
        """Write the critic and actor weights to ``<filename>_critic.pth`` / ``<filename>_actor.pth``."""
        torch.save(self.critic.state_dict(), filename + "_critic.pth")
        torch.save(self.actor.state_dict(), filename + "_actor.pth")

# ==========================================
# 6. Helpers: Evaluation & Recording (UPDATED)
# ==========================================
def evaluate_and_record(policy, env_id, seed, step, run_name, video_folder="videos"):
    """Run one recorded evaluation episode and return its reward and video path.

    Videos are written to ``<video_folder>/<run_name>/`` with the prefix
    ``step-<step>`` so recordings from different runs/steps do not overwrite
    each other.

    Args:
        policy: Object exposing ``select_action(state)``.
        env_id: Gymnasium environment id to evaluate on.
        seed: Base seed; the episode is reset with ``seed + 100 + i``.
        step: Training step used to name the video file.
        run_name: Name of the run; used as the video sub-folder.
        video_folder: Root folder for all recordings.

    Returns:
        ``(rewards, video_path)`` where ``rewards`` is a list with one total
        reward per evaluation episode and ``video_path`` is the expected mp4
        path, or ``None`` if that file does not exist.
    """
    # 1. Create a unique folder for THIS specific run
    run_video_folder = os.path.join(video_folder, run_name)
    os.makedirs(run_video_folder, exist_ok=True)

    # 2. Define a unique prefix using the step count
    video_prefix = f"step-{step}"

    rewards = []
    video_path = None

    eval_env = gym.make(env_id, continuous=True, render_mode="rgb_array")
    try:
        eval_env = ImageTransposeWrapper(eval_env)
        eval_env = gym.wrappers.RecordVideo(
            eval_env,
            video_folder=run_video_folder,
            name_prefix=video_prefix,
            episode_trigger=lambda x: True,
            disable_logger=True
        )

        # 1 Episode Eval
        for i in range(1):
            state, _ = eval_env.reset(seed=seed + 100 + i)
            terminated, truncated = False, False
            episode_reward = 0

            while not (terminated or truncated):
                action = policy.select_action(np.array(state))
                state, reward, terminated, truncated, _ = eval_env.step(action)
                episode_reward += reward

            rewards.append(episode_reward)
    finally:
        # Always close the (possibly wrapped) env so it is released even when
        # the rollout raises or is interrupted.
        eval_env.close()

    # Construct the expected path to verify it exists
    expected_filename = f"{video_prefix}-episode-0.mp4"
    expected_path = os.path.join(run_video_folder, expected_filename)

    if os.path.exists(expected_path):
        video_path = expected_path

    return rewards, video_path

# ==========================================
# 7. Main Training Loop
# ==========================================
def run_training():
    """Train a TD3 agent on ``conf.env_id`` and log progress to Weights & Biases.

    Before ``conf.learning_starts`` steps, actions are sampled uniformly from
    the action space and no updates are made. Afterwards the actor's action
    plus Gaussian exploration noise is used and one TD3 update runs per
    environment step. Every ``conf.eval_freq`` steps a recorded evaluation
    episode is run and its reward and video are logged. When
    ``conf.save_model`` is set, the weights are saved under
    ``td3_model_artifacts/`` once the loop finishes.
    """
    conf = TD3Config()

    wandb.init(
        project=conf.project_name,
        name=conf.run_name,
        config=asdict(conf),
        monitor_gym=False,
        save_code=True
    )

    env = gym.make(conf.env_id, continuous=True)
    env = ImageTransposeWrapper(env)

    env.action_space.seed(conf.seed)
    torch.manual_seed(conf.seed)
    np.random.seed(conf.seed)

    state_shape = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    replay_buffer = ReplayBuffer(state_shape, action_dim, conf.buffer_size)
    policy = TD3(action_dim, max_action, conf)

    state, _ = env.reset(seed=conf.seed)
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    print("---------------------------------------")
    print(f"Starting Training: {conf.env_id} | TD3 | Seed: {conf.seed}")
    print(f"Observation Shape: {state_shape} | Action Dim: {action_dim}")
    print("---------------------------------------")

    try:
        for t in range(int(conf.total_timesteps)):
            episode_timesteps += 1

            if t < conf.learning_starts:
                # Warm-up: fill the buffer with uniformly random actions.
                action = env.action_space.sample()

            else:
                raw_action = policy.select_action(np.array(state))
                noise = np.random.normal(0, max_action * conf.exploration_noise, size=action_dim)

                # Standard clipping to [-max_action, max_action]
                action = (raw_action + noise).clip(-max_action, max_action)

                # >>> CUSTOM CLIPPING: First 100k Steps <<<
                if t < 100_000:
                    # Steering: [-0.01, 0.01] (keeps the car driving almost straight)
                    action[0] = np.clip(action[0], -0.01, 0.01)
                    # Gas: [0.1, 1.0] (Forces agent to keep moving)
                    action[1] = np.clip(action[1], 0.1, 1.0)
                    # Brake: [0.0, 0.8] (Prevents full hard braking)
                    action[2] = np.clip(action[2], 0.0, 0.8)
                # >>> END CUSTOM CLIPPING <<<

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Store data in replay buffer. Only `terminated` is stored as the done
            # flag, so truncated episodes still bootstrap from the next state.
            replay_buffer.add(state, action, next_state, reward, float(terminated))
            state = next_state
            episode_reward += reward

            if t >= conf.learning_starts:
                critic_loss, actor_loss = policy.train(replay_buffer)

                if t % 100 == 0:
                    logs = {"train/critic_loss": critic_loss}
                    if actor_loss is not None:
                        logs["train/actor_loss"] = actor_loss
                    wandb.log(logs, step=t)

            if done:
                wandb.log({
                    "train/episode_reward": episode_reward,
                    "train/episode_length": episode_timesteps
                }, step=t)

                print(f"Step {t} | Episode {episode_num} | Reward: {episode_reward:.2f}")

                state, _ = env.reset()
                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1

            if (t + 1) % conf.eval_freq == 0:
                print(f"Evaluating at step {t+1}...")

                eval_rewards, video_path = evaluate_and_record(
                    policy,
                    conf.env_id,
                    conf.seed,
                    step=t+1,
                    run_name=conf.run_name
                )

                mean_score = np.mean(eval_rewards)
                wandb.log({"eval/mean_reward": mean_score}, step=t)
                wandb.log({"eval/episode_reward": eval_rewards[0]}, step=t)

                if video_path:
                    print(f"Uploading video: {video_path}")
                    wandb.log({
                        "eval/video": wandb.Video(
                            video_path,
                            fps=30,
                            format="mp4",
                            caption=f"Eval Step {t+1} | Score: {mean_score:.2f}"
                        )
                    }, step=t)
    finally:
        # Release the training environment even if the loop is interrupted.
        env.close()

    print("Training Complete.")

    if conf.save_model:
        save_path = "td3_model_artifacts"
        os.makedirs(save_path, exist_ok=True)
        policy.save(os.path.join(save_path, "td3_carracing"))

    wandb.finish()

if __name__ == "__main__":
    run_training()

KeyboardInterrupt: 

## Enforcing the car to learn the true path

In [None]:
import os
import random
import time
import uuid
from dataclasses import dataclass, asdict
from typing import Optional, Tuple

# --- KAGGLE SPECIFIC: START VIRTUAL DISPLAY ---
from pyvirtualdisplay import Display
try:
    # Headless environments have no screen; start a virtual one for OpenGL rendering.
    display = Display(visible=0, size=(1400, 900))
    display.start()
except Exception as err:
    # Best effort only: report the problem and let the notebook continue.
    print(f"Failed to start virtual display: {err}")
else:
    print("Virtual display started successfully.")
# ---------------------------------------------

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb
from huggingface_hub import HfApi, create_repo, upload_folder
from kaggle_secrets import UserSecretsClient # For secure Auth

# ==========================================
# 0. Setup WandB Login for Kaggle
# ==========================================
# Log into Weights & Biases with the key stored in Kaggle Secrets; if that
# fails for any ordinary reason, fall back to the interactive login prompt.
try:
    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("wandb_api_key")
    wandb.login(key=wandb_api_key)
    print("Logged into WandB via Kaggle Secrets.")
except Exception as exc:
    # `Exception` (not a bare except) so KeyboardInterrupt / SystemExit still
    # stop the cell instead of silently triggering the interactive fallback.
    print("Could not find 'wandb_api_key' in Kaggle Secrets. Falling back to interactive login.")
    # Only the exception type is shown so no secret material can end up in cell output.
    print(f"Reason: {type(exc).__name__}")
    wandb.login()

# ==========================================
# 1. Configuration & Hyperparameters
# ==========================================
@dataclass
class TD3Config:
    """Experiment settings and hyperparameters for the TD3 CarRacing run with heuristic warm-up."""

    # Experiment Settings
    env_id: str = "CarRacing-v3"
    project_name: str = "td3-carracing8"
    # NOTE: this f-string is evaluated once, when the class body runs, so every
    # TD3Config() built from this definition shares the same random suffix.
    run_name: str = f"td3_car_{str(uuid.uuid4())[:8]}"
    seed: int = 42
    
    # Training Duration 
    total_timesteps: int = 500_000  
    # warm up steps (heuristic agent): while t < learning_starts the training
    # loop takes its actions from the heuristic expert instead of the TD3 actor
    learning_starts: int = 30_000       
    
    # Hyperparameters
    hidden_dim: int = 256
    actor_lr: float = 1e-4  
    critic_lr: float = 1e-4
    batch_size: int = 64
    buffer_size: int = 100_000 
    gamma: float = 0.99               # discount factor used in the TD target
    tau: float = 0.005                # interpolation weight for the soft target-network update
    
    # TD3 Specifics
    policy_noise: float = 0.2         # std of the noise added to the target action
    noise_clip: float = 0.5           # clamp bound for that target noise
    policy_delay: int = 2             # actor/targets are updated every `policy_delay` critic updates
    exploration_noise: float = 0.1    # std (scaled by max_action) of the noise added while acting

    # Logging & Saving
    eval_freq: int = 10_000
    save_model: bool = True
    # NOTE(review): no use of this field is visible in this part of the notebook — confirm.
    hf_repo_id: str = "yousefyousefyousef335/td3-carracing-v3"

# ==========================================
# 2. Preprocessing & Wrappers
# ==========================================
class ImageTransposeWrapper(gym.ObservationWrapper):
    """Observation wrapper that moves the last image axis to the front.

    The declared observation space is rewritten accordingly, i.e. an
    (H, W, C) uint8 space becomes a (C, H, W) uint8 space.
    """

    def __init__(self, env):
        super().__init__(env)
        src_shape = self.observation_space.shape
        channel_first_shape = (src_shape[2], src_shape[0], src_shape[1])
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=channel_first_shape,
            dtype=np.uint8,
        )

    def observation(self, observation):
        """Return ``observation`` with axes reordered from (0, 1, 2) to (2, 0, 1)."""
        channel_first = np.transpose(observation, axes=(2, 0, 1))
        return channel_first

# ==========================================
# 3. Replay Buffer (Optimized for Images)
# ==========================================
class ReplayBuffer:
    """Fixed-capacity circular store of transitions.

    Image observations are kept as uint8 to save memory and are only turned
    into floats (scaled by 1/255) when a batch is sampled.
    """

    def __init__(self, state_shape, action_dim, max_size=1e5):
        capacity = int(max_size)
        self.max_size = capacity
        self.ptr = 0    # index of the slot the next transition is written to
        self.size = 0   # number of valid transitions currently stored
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        frames_shape = (capacity, *state_shape)
        self.state = np.zeros(frames_shape, dtype=np.uint8)
        self.action = np.zeros((capacity, action_dim), dtype=np.float32)
        self.next_state = np.zeros(frames_shape, dtype=np.uint8)
        self.reward = np.zeros((capacity, 1), dtype=np.float32)
        self.not_done = np.zeros((capacity, 1), dtype=np.float32)

    def add(self, state, action, next_state, reward, done):
        """Write one transition at the current slot, overwriting the oldest entry when full."""
        slot = self.ptr
        self.state[slot] = state
        self.action[slot] = action
        self.next_state[slot] = next_state
        self.reward[slot] = reward
        self.not_done[slot] = 1. - done

        # Advance the write position with wrap-around and grow the fill count up to capacity.
        self.ptr = (slot + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly (with replacement).

        Returns a tuple ``(state, action, next_state, reward, not_done)`` of float
        tensors on ``self.device``; the two image tensors are divided by 255.
        """
        ind = np.random.randint(0, self.size, size=batch_size)

        def as_float_tensor(array):
            return torch.FloatTensor(array).to(self.device)

        states = as_float_tensor(self.state[ind]) / 255.0
        actions = as_float_tensor(self.action[ind])
        next_states = as_float_tensor(self.next_state[ind]) / 255.0
        rewards = as_float_tensor(self.reward[ind])
        not_dones = as_float_tensor(self.not_done[ind])
        return (states, actions, next_states, rewards, not_dones)

# ==========================================
# 4. Neural Networks (CNN + MLP)
# ==========================================
class CNNEncoder(nn.Module):
    """Convolutional feature extractor: three Conv2d+ReLU stages, then a flatten.

    ``out_dim`` is fixed at 64 * 8 * 8, which is the flattened size produced
    for a 96x96 input image.
    """

    def __init__(self, input_channels=3):
        super().__init__()
        # (out_channels, kernel_size, stride) for each stage.
        # For a (3, 96, 96) input the maps are (32, 23, 23) -> (64, 10, 10) -> (64, 8, 8).
        stage_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

        modules = []
        in_channels = input_channels
        for out_channels, kernel, stride in stage_specs:
            modules.append(nn.Conv2d(in_channels, out_channels, kernel_size=kernel, stride=stride))
            modules.append(nn.ReLU())
            in_channels = out_channels
        modules.append(nn.Flatten())

        self.net = nn.Sequential(*modules)
        self.out_dim = 64 * 8 * 8

    def forward(self, x):
        """Return the flattened convolutional features of ``x``."""
        features = self.net(x)
        return features

class Actor(nn.Module):
    """Deterministic policy network: CNN encoder followed by a three-layer MLP.

    The output is ``max_action * tanh(...)``, so every action component lies in
    ``[-max_action, max_action]``.
    """

    def __init__(self, action_dim, max_action, hidden_dim=256):
        super().__init__()
        self.encoder = CNNEncoder()

        feature_dim = self.encoder.out_dim
        self.l1 = nn.Linear(feature_dim, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, hidden_dim)
        self.l3 = nn.Linear(hidden_dim, action_dim)
        self.max_action = max_action

    def forward(self, state):
        """Map a batch of image states to bounded actions."""
        hidden = self.encoder(state)
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        squashed = torch.tanh(self.l3(hidden))
        return self.max_action * squashed

class Critic(nn.Module):
    """Twin Q-networks, each with its own CNN encoder and MLP head.

    ``forward`` returns both Q estimates; ``Q1`` returns only the first one.
    """

    def __init__(self, action_dim, hidden_dim=256):
        super().__init__()
        self.encoder1 = CNNEncoder()
        self.encoder2 = CNNEncoder()

        # Head of the first Q-function
        q1_in = self.encoder1.out_dim + action_dim
        self.l1 = nn.Linear(q1_in, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, hidden_dim)
        self.l3 = nn.Linear(hidden_dim, 1)

        # Head of the second Q-function
        q2_in = self.encoder2.out_dim + action_dim
        self.l4 = nn.Linear(q2_in, hidden_dim)
        self.l5 = nn.Linear(hidden_dim, hidden_dim)
        self.l6 = nn.Linear(hidden_dim, 1)

    def _q_value(self, encoder, hidden_layers, output_layer, state, action):
        """Encode ``state``, append ``action`` and run the result through one MLP head."""
        x = torch.cat([encoder(state), action], 1)
        for layer in hidden_layers:
            x = F.relu(layer(x))
        return output_layer(x)

    def forward(self, state, action):
        """Return the pair ``(q1, q2)`` for the given state/action batch."""
        q1 = self.Q1(state, action)
        q2 = self._q_value(self.encoder2, (self.l4, self.l5), self.l6, state, action)
        return q1, q2

    def Q1(self, state, action):
        """Return only the first Q estimate."""
        return self._q_value(self.encoder1, (self.l1, self.l2), self.l3, state, action)

# ==========================================
# 5. TD3 Algorithm
# ==========================================
class TD3:
    """Twin Delayed DDPG (TD3) agent with image-based actor and twin critics.

    Owns the online and target networks plus their Adam optimizers, and
    implements action selection, a single training update and checkpointing.
    """

    def __init__(self, action_dim, max_action, config: TD3Config):
        """Build the networks and optimizers.

        Args:
            action_dim: Number of action components produced by the actor.
            max_action: Scale of the actor output; also the bound used when
                clamping the smoothed target action.
            config: Hyperparameters (learning rates, gamma, tau, noise settings, ...).
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.conf = config
        self.max_action = max_action
        self.total_it = 0  # number of train() calls so far; drives the delayed actor update

        self.actor = Actor(action_dim, max_action, config.hidden_dim).to(self.device)
        self.actor_target = Actor(action_dim, max_action, config.hidden_dim).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=config.actor_lr)

        self.critic = Critic(action_dim, config.hidden_dim).to(self.device)
        self.critic_target = Critic(action_dim, config.hidden_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=config.critic_lr)

    def select_action(self, state):
        """Return the actor's action for a single observation.

        Args:
            state: One observation; it is converted to a float tensor, given a
                batch dimension and divided by 255 before being fed to the actor.

        Returns:
            1-D NumPy array with the action.
        """
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device) / 255.0
        # Pure inference: no autograd graph is needed for acting.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    def train(self, replay_buffer):
        """Run one TD3 update from a sampled batch.

        The critics are updated on every call; the actor and both target
        networks are updated only every ``conf.policy_delay`` calls.

        Args:
            replay_buffer: Object whose ``sample(batch_size)`` returns
                ``(state, action, next_state, reward, not_done)`` tensors.

        Returns:
            ``(critic_loss, actor_loss)`` as Python floats; ``actor_loss`` is
            ``None`` on calls where the actor was not updated.
        """
        self.total_it += 1

        state, action, next_state, reward, not_done = replay_buffer.sample(self.conf.batch_size)

        with torch.no_grad():
            # Target policy smoothing: clipped Gaussian noise on the target action.
            noise = (torch.randn_like(action) * self.conf.policy_noise).clamp(-self.conf.noise_clip, self.conf.noise_clip)
            next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

            # Clipped double-Q: bootstrap from the smaller of the two target estimates.
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + not_done * self.conf.gamma * target_Q

        current_Q1, current_Q2 = self.critic(state, action)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = None
        if self.total_it % self.conf.policy_delay == 0:
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            self._soft_update(self.critic, self.critic_target)
            self._soft_update(self.actor, self.actor_target)

        # Compare against None explicitly: tensor truthiness would turn a loss of
        # exactly 0.0 into "no actor update happened".
        return critic_loss.item(), (actor_loss.item() if actor_loss is not None else None)

    def _soft_update(self, online, target):
        """Move ``target``'s parameters towards ``online``'s by a factor of ``conf.tau``."""
        tau = self.conf.tau
        for param, target_param in zip(online.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def save(self, filename):
        """Write the critic and actor weights to ``<filename>_critic.pth`` / ``<filename>_actor.pth``."""
        torch.save(self.critic.state_dict(), filename + "_critic.pth")
        torch.save(self.actor.state_dict(), filename + "_actor.pth")

# ==========================================
# 6. EXPERT HEURISTIC AGENT
# ==========================================
class CarRacingHeuristic:
    """Rule-based driver that steers towards the centre of the detected road.

    Road pixels are found in a fixed band of image rows by looking for gray
    pixels (similar R, G and B, not too dark); steering is proportional to the
    horizontal offset of their mean column from the image centre.
    """

    def __init__(self):
        # We target a modest speed for safe "expert" data collection
        # NOTE(review): target_speed is stored but not read by act().
        self.target_speed = 0.1
        self.k_p = 1.8  # Proportional gain for steering

    def act(self, state):
        """Compute a heuristic action for one observation.

        Args:
            state: Channel-first image (3, 96, 96) with values in [0..255],
                as a NumPy array or torch tensor.

        Returns:
            float32 NumPy array ``[steering, gas, brake]``.
        """
        # Ensure numpy
        if isinstance(state, torch.Tensor):
            state = state.cpu().numpy()

        # Convert back to HWC for image processing (96, 96, 3)
        img = np.transpose(state, (1, 2, 0))

        # Look ahead crop (approx rows 60-78). Cast to float before comparing
        # channels: subtracting uint8 arrays wraps around (100 - 105 == 251),
        # which would reject gray pixels whenever r < g or g < b.
        crop = img[60:78, :, :].astype(np.float64)

        # Simple road detection: Road is gray (R ~= G ~= B)
        r, g, b = crop[:, :, 0], crop[:, :, 1], crop[:, :, 2]

        # Road is where colors are close to each other, and not too dark
        is_road = (np.abs(r - g) < 15) & (np.abs(g - b) < 15) & (g > 60)

        road_pixels = np.argwhere(is_road)

        if len(road_pixels) > 0:
            # Average column index (axis 1) is the center of the road
            target_x = np.mean(road_pixels[:, 1])

            # Center of the image is 48
            error = (target_x - 48.0) / 48.0

            steering = np.clip(error * self.k_p, -1.0, 1.0)

            # Slow down on sharp turns
            gas = 0.05 if abs(steering) > 0.3 else 0.2
            brake = 0.0
        else:
            # If lost, usually better to do nothing or slight brake than spin
            steering = 0.0
            gas = 0.0
            brake = 0.1

        return np.array([steering, gas, brake], dtype=np.float32)

# ==========================================
# 7. Helpers: Evaluation & Recording
# ==========================================
def evaluate_and_record(policy, env_id, seed, step, run_name, video_folder="videos"):
    """Run one recorded evaluation episode and return its reward and video path.

    Videos are written to ``<video_folder>/<run_name>/`` with the prefix
    ``step-<step>`` so recordings from different runs/steps do not overwrite
    each other.

    Args:
        policy: Object exposing ``select_action(state)``.
        env_id: Gymnasium environment id to evaluate on.
        seed: Base seed; the episode is reset with ``seed + 100 + i``.
        step: Training step used to name the video file.
        run_name: Name of the run; used as the video sub-folder.
        video_folder: Root folder for all recordings.

    Returns:
        ``(rewards, video_path)`` where ``rewards`` is a list with one total
        reward per evaluation episode and ``video_path`` is the expected mp4
        path, or ``None`` if that file does not exist.
    """
    run_video_folder = os.path.join(video_folder, run_name)
    os.makedirs(run_video_folder, exist_ok=True)

    video_prefix = f"step-{step}"

    rewards = []
    video_path = None

    eval_env = gym.make(env_id, continuous=True, render_mode="rgb_array")
    try:
        eval_env = ImageTransposeWrapper(eval_env)
        eval_env = gym.wrappers.RecordVideo(
            eval_env,
            video_folder=run_video_folder,
            name_prefix=video_prefix,
            episode_trigger=lambda x: True,
            disable_logger=True
        )

        for i in range(1):
            state, _ = eval_env.reset(seed=seed + 100 + i)
            terminated, truncated = False, False
            episode_reward = 0

            while not (terminated or truncated):
                action = policy.select_action(np.array(state))
                state, reward, terminated, truncated, _ = eval_env.step(action)
                episode_reward += reward

            rewards.append(episode_reward)
    finally:
        # Always close the (possibly wrapped) env so it is released even when
        # the rollout raises or is interrupted.
        eval_env.close()

    # The recorder names the first episode "<prefix>-episode-0.mp4"; only
    # report the path if that file was actually written.
    expected_filename = f"{video_prefix}-episode-0.mp4"
    expected_path = os.path.join(run_video_folder, expected_filename)

    if os.path.exists(expected_path):
        video_path = expected_path

    return rewards, video_path

# ==========================================
# 8. Main Training Loop
# ==========================================
def run_training():
    """Train a TD3 agent on ``conf.env_id`` with a heuristic-expert warmup.

    Flow:
      1. Build a ``TD3Config``, start a wandb run, create the wrapped env and
         seed the action space, torch and numpy.
      2. For the first ``conf.learning_starts`` steps, act with
         ``CarRacingHeuristic`` (plus steering noise) to fill the replay buffer.
      3. Afterwards act with the TD3 policy plus Gaussian exploration noise and
         call ``policy.train`` once per environment step.
      4. Every ``conf.eval_freq`` steps run ``evaluate_and_record`` and log the
         score (and video, if one was produced) to wandb.
      5. After the loop, optionally save the policy under
         ``td3_model_artifacts/``.

    The environment is closed and the wandb run is finished on every exit
    path; if training did not complete, the run is finished with a non-zero
    exit code.
    """
    conf = TD3Config()

    wandb.init(
        project=conf.project_name,
        name=conf.run_name,
        config=asdict(conf),
        monitor_gym=False,
        save_code=True
    )

    env = None
    completed = False
    try:
        env = gym.make(conf.env_id, continuous=True)
        env = ImageTransposeWrapper(env)

        env.action_space.seed(conf.seed)
        torch.manual_seed(conf.seed)
        np.random.seed(conf.seed)

        state_shape = env.observation_space.shape
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        replay_buffer = ReplayBuffer(state_shape, action_dim, conf.buffer_size)
        policy = TD3(action_dim, max_action, conf)

        # Initialize heuristic expert
        expert = CarRacingHeuristic()

        state, _ = env.reset(seed=conf.seed)
        episode_reward = 0
        episode_timesteps = 0
        episode_num = 0

        print("---------------------------------------")
        print(f"Starting Training: {conf.env_id} | TD3 | Seed: {conf.seed}")
        print(f"Observation Shape: {state_shape} | Action Dim: {action_dim}")
        print(f"Warmup (Expert) Steps: {conf.learning_starts}")
        print("---------------------------------------")

        for t in range(int(conf.total_timesteps)):
            episode_timesteps += 1

            # === 1. Warmup Phase (Expert Data Collection) ===
            if t < conf.learning_starts:
                # Use heuristic expert
                action = expert.act(state)

                # Add small noise so the data isn't identical (Robustness)
                action[0] += np.random.normal(0, 0.1)  # Steering noise
                action[0] = np.clip(action[0], -1.0, 1.0)

                # Ensure Gas/Brake are valid [0,1]
                action[1] = np.clip(action[1], 0.0, 1.0)
                action[2] = np.clip(action[2], 0.0, 1.0)

            # === 2. Training Phase (TD3 Policy) ===
            else:
                raw_action = policy.select_action(np.array(state))
                noise = np.random.normal(0, max_action * conf.exploration_noise, size=action_dim)

                action = (raw_action + noise).clip(-max_action, max_action)

                # CarRacing Env Constraints:
                # Steering: [-1, 1], Gas: [0, 1], Brake: [0, 1]
                action[1] = max(action[1], 0.0)
                action[2] = max(action[2], 0.0)

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Store in buffer. Only `terminated` is stored as the done flag so a
            # time-limit truncation is not recorded as a terminal transition.
            replay_buffer.add(state, action, next_state, reward, float(terminated))
            state = next_state
            episode_reward += reward

            # Train Policy
            if t >= conf.learning_starts:
                critic_loss, actor_loss = policy.train(replay_buffer)

                if t % 100 == 0:
                    logs = {"train/critic_loss": critic_loss}
                    # policy.train may return None for the actor loss; skip it then.
                    if actor_loss is not None:
                        logs["train/actor_loss"] = actor_loss
                    wandb.log(logs, step=t)

            if done:
                wandb.log({
                    "train/episode_reward": episode_reward,
                    "train/episode_length": episode_timesteps
                }, step=t)

                print(f"Step {t} | Episode {episode_num} | Reward: {episode_reward:.2f}")

                state, _ = env.reset()
                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1

            # Evaluation
            if (t + 1) % conf.eval_freq == 0:
                print(f"Evaluating at step {t+1}...")

                eval_rewards, video_path = evaluate_and_record(
                    policy,
                    conf.env_id,
                    conf.seed,
                    step=t+1,
                    run_name=conf.run_name
                )

                mean_score = np.mean(eval_rewards)
                wandb.log({
                    "eval/mean_reward": mean_score,
                    "eval/episode_reward": eval_rewards[0]
                }, step=t)

                if video_path:
                    print(f"Uploading video: {video_path}")
                    wandb.log({
                        "eval/video": wandb.Video(
                            video_path,
                            fps=30,
                            format="mp4",
                            caption=f"Eval Step {t+1} | Score: {mean_score:.2f}"
                        )
                    }, step=t)

        print("Training Complete.")

        if conf.save_model:
            save_path = "td3_model_artifacts"
            os.makedirs(save_path, exist_ok=True)
            policy.save(os.path.join(save_path, "td3_carracing"))

        completed = True
    finally:
        # Release the simulator and close the wandb run even when training
        # raises or the cell is interrupted, so nothing is left dangling.
        if env is not None:
            env.close()
        if completed:
            wandb.finish()
        else:
            wandb.finish(exit_code=1)

# Start training only when this code executes as the main module
# (i.e. not when it is imported from elsewhere).
if __name__ == "__main__":
    run_training()