In [1]:
# 1. Install system dependencies for Box2D and Rendering (Headless)
# Refresh the apt package index first so the install below resolves current packages.
!sudo apt-get update -y
# swig: presumably needed to build the Box2D bindings pulled in by gymnasium[box2d] -- TODO confirm.
# xvfb: virtual X server; a later cell starts a pyvirtualdisplay Display before creating environments.
# python3-opengl / ffmpeg: presumably used for rendering and for encoding the evaluation videos -- TODO confirm.
!sudo apt-get install -y swig xvfb python3-opengl ffmpeg

# 2. Install Python libraries
# pyvirtualdisplay is crucial for rendering on Kaggle
# NOTE(review): no versions are pinned here, so a re-run may resolve different package versions.
!pip install gymnasium[box2d] moviepy imageio wandb huggingface_hub pyvirtualdisplay

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:4 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Hit:5 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:6 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease    
Hit:7 http://archive.ubuntu.com/ubuntu jammy InRelease                         
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]        
Get:9 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [6,087 kB]
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,201 kB]
Get:11 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,286 kB]
Get:12 http://security.ubuntu.com/ubuntu ja

In [2]:
import subprocess
import sys

# Upgrade gymnasium to latest version
# Runs pip through the kernel's own interpreter (sys.executable) so the upgrade
# lands in the environment this notebook imports from; check_call raises if pip fails.
# NOTE(review): the captured output of this cell shows gymnasium 0.29.0 being replaced
# by 1.2.2 and pip reporting conflicts with stable-baselines3 and kaggle-environments.
# Neither package is imported by the training cell below, but the version is unpinned,
# so a later run may pick up a different release.
subprocess.check_call([sys.executable, "-m", "pip", "install", "gymnasium", "--upgrade"])
# swig is installed before the box2d extra; presumably the extra needs it at build time -- TODO confirm.
!pip install swig
!pip install "gymnasium[box2d]"

Collecting gymnasium
  Downloading gymnasium-1.2.2-py3-none-any.whl.metadata (10 kB)
Downloading gymnasium-1.2.2-py3-none-any.whl (952 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 952.1/952.1 kB 26.1 MB/s eta 0:00:00
Installing collected packages: gymnasium
  Attempting uninstall: gymnasium
    Found existing installation: gymnasium 0.29.0
    Uninstalling gymnasium-0.29.0:
      Successfully uninstalled gymnasium-0.29.0
Successfully installed gymnasium-1.2.2


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
stable-baselines3 2.1.0 requires gymnasium<0.30,>=0.28.1, but you have gymnasium 1.2.2 which is incompatible.
kaggle-environments 1.18.0 requires gymnasium==0.29.0, but you have gymnasium 1.2.2 which is incompatible.




In [None]:
import os
import random
import time
import uuid
from dataclasses import asdict, dataclass, field
from typing import Optional, Tuple

# --- KAGGLE SPECIFIC: START VIRTUAL DISPLAY ---
from pyvirtualdisplay import Display
try:
    # Bring up an off-screen display (visible=0) so rendering code has a
    # screen to draw on even though the notebook host has no monitor.
    display = Display(visible=0, size=(1400, 900))
    display.start()
except Exception as e:
    # Best effort: report the problem and let the rest of the cell continue.
    print(f"Failed to start virtual display: {e}")
else:
    print("Virtual display started successfully.")
# ---------------------------------------------

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb
from huggingface_hub import HfApi, create_repo, upload_folder
from kaggle_secrets import UserSecretsClient 

# ==========================================
# 0. Setup WandB Login for Kaggle
# ==========================================
try:
    # Preferred path: read the API key from Kaggle Secrets so the login is
    # non-interactive and the key never appears in the notebook.
    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("wandb_api_key")
    wandb.login(key=wandb_api_key)
    print("Logged into WandB via Kaggle Secrets.")
except Exception as e:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the cell, and surface the
    # failure type: the lookup and the login itself can both fail here.
    print("Could not find 'wandb_api_key' in Kaggle Secrets. Falling back to interactive login.")
    print(f"(Kaggle Secrets login failed with {type(e).__name__}.)")
    wandb.login()

# ==========================================
# 1. Configuration & Hyperparameters (SAC)
# ==========================================
@dataclass
class SACConfig:
    """Experiment settings and hyperparameters for the SAC CarRacing run.

    Every field has a default, so ``SACConfig()`` yields a ready-to-use
    configuration; ``dataclasses.asdict`` is used on it when the run is
    registered with WandB.
    """

    # Experiment Settings
    env_id: str = "CarRacing-v3"
    project_name: str = "sac-carracing16"
    # Built per instance via default_factory. A plain f-string default would be
    # evaluated once when the class is defined, so every SACConfig() created
    # afterwards (e.g. re-running training in the same kernel) would share one
    # supposedly unique run name and write into the same video folder.
    run_name: str = field(default_factory=lambda: f"sac_car_{str(uuid.uuid4())[:8]}")
    seed: int = 42

    # Training Duration
    total_timesteps: int = 500_000
    # Number of initial environment steps driven by the heuristic expert;
    # gradient updates only start once this many steps have been collected.
    learning_starts: int = 20_000

    # Hyperparameters
    hidden_dim: int = 256
    actor_lr: float = 3e-4
    critic_lr: float = 3e-4
    batch_size: int = 256     # SAC typically benefits from larger batches
    buffer_size: int = 100_000
    gamma: float = 0.99       # Discount factor
    tau: float = 0.005        # Soft-update rate for the target critic

    # SAC Specifics
    alpha: float = 0.2        # Entropy regularization coefficient (kept fixed)
    # Only meaningful with automatic entropy tuning. The visible code keeps
    # alpha fixed and never reads this field.
    target_entropy: float = -3.0

    # Logging & Saving
    eval_freq: int = 10_000   # Environment steps between evaluation rollouts
    save_model: bool = True
    # NOTE(review): not read anywhere in the visible code; presumably intended
    # for a Hugging Face upload step -- confirm.
    hf_repo_id: str = "yousefyousefyousef335/sac-carracing-v3"

# ==========================================
# 2. Preprocessing & Wrappers
# ==========================================
class ImageTransposeWrapper(gym.ObservationWrapper):
    """Observation wrapper that moves the last image axis to the front.

    The wrapped environment's observation shape ``(d0, d1, d2)`` is exposed as
    ``(d2, d0, d1)``, i.e. channel-last frames become channel-first frames as
    expected by ``nn.Conv2d``.
    """

    def __init__(self, env):
        super().__init__(env)
        src_shape = self.observation_space.shape
        channels_first_shape = (src_shape[2], src_shape[0], src_shape[1])
        # Re-declare the space so downstream code sees the permuted layout.
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=channels_first_shape,
            dtype=np.uint8,
        )

    def observation(self, observation):
        """Return ``observation`` with axes permuted to ``(2, 0, 1)``."""
        channels_first = np.transpose(observation, (2, 0, 1))
        return channels_first

# ==========================================
# 3. Replay Buffer (Optimized for Images)
# ==========================================
class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions with uint8 image storage.

    States and next states are stored as ``uint8`` arrays and only converted
    to ``float32`` in ``[0, 1]`` when a batch is sampled. Both image arrays
    are allocated up front with ``max_size`` entries each.

    Args:
        state_shape: Shape of a single observation, e.g. ``(3, 96, 96)``.
        action_dim: Number of action components.
        max_size: Capacity in transitions (cast to ``int``); once full, the
            oldest entries are overwritten.
    """

    def __init__(self, state_shape, action_dim, max_size=1e5):
        self.max_size = int(max_size)
        self.ptr = 0    # index of the next slot to write
        self.size = 0   # number of valid transitions currently stored
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.state = np.zeros((self.max_size, *state_shape), dtype=np.uint8)
        self.action = np.zeros((self.max_size, action_dim), dtype=np.float32)
        self.next_state = np.zeros((self.max_size, *state_shape), dtype=np.uint8)
        self.reward = np.zeros((self.max_size, 1), dtype=np.float32)
        self.not_done = np.zeros((self.max_size, 1), dtype=np.float32)

    def add(self, state, action, next_state, reward, done):
        """Store one transition, overwriting the oldest one when full.

        ``done`` is stored inverted (``1 - done``) so it can be used directly
        as a bootstrap mask in the TD target.
        """
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1. - done

        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def _to_device(self, array):
        """Wrap a NumPy batch as a tensor on ``self.device`` (dtype preserved)."""
        return torch.as_tensor(array, device=self.device)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random (with replacement).

        Returns:
            Tuple ``(state, action, next_state, reward, not_done)`` of float32
            tensors on ``self.device``; image tensors are scaled to ``[0, 1]``.

        Raises:
            ValueError: If the buffer is empty.
        """
        if self.size == 0:
            raise ValueError("Cannot sample from an empty replay buffer.")

        ind = np.random.randint(0, self.size, size=batch_size)

        # Images cross the host->device boundary as uint8 and are cast to
        # float on the device; casting first would move 4x as many bytes.
        return (
            self._to_device(self.state[ind]).float() / 255.0,
            self._to_device(self.action[ind]),
            self._to_device(self.next_state[ind]).float() / 255.0,
            self._to_device(self.reward[ind]),
            self._to_device(self.not_done[ind]),
        )

# ==========================================
# 4. Neural Networks (CNN + MLP)
# ==========================================
class CNNEncoder(nn.Module):
    """Three-stage convolutional feature extractor for image observations.

    For a ``(input_channels, 96, 96)`` input the spatial size shrinks
    96 -> 23 -> 10 -> 8, giving a flattened feature vector of
    ``64 * 8 * 8`` values, which is what ``out_dim`` reports.
    """

    def __init__(self, input_channels=3):
        super().__init__()
        # (out_channels, kernel_size, stride) for each convolution, in order.
        conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

        stages = []
        in_channels = input_channels
        for out_channels, kernel, stride in conv_specs:
            stages.append(nn.Conv2d(in_channels, out_channels, kernel_size=kernel, stride=stride))
            stages.append(nn.ReLU())
            in_channels = out_channels
        stages.append(nn.Flatten())

        self.net = nn.Sequential(*stages)
        # Fixed for 96x96 inputs; other resolutions would need a different value.
        self.out_dim = 64 * 8 * 8

    def forward(self, x):
        """Map a batch of images to flattened convolutional features."""
        return self.net(x)

class GaussianActor(nn.Module):
    """Tanh-squashed Gaussian policy on top of a CNN encoder.

    The network predicts a mean and a clamped log standard deviation per
    action dimension; actions are obtained by squashing a Gaussian sample
    with ``tanh`` and scaling by ``max_action``.
    """

    def __init__(self, action_dim, max_action, hidden_dim=256):
        super().__init__()
        self.encoder = CNNEncoder()

        # Shared MLP trunk
        self.l1 = nn.Linear(self.encoder.out_dim, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, hidden_dim)

        # Separate heads for the Gaussian's mean and log-std
        self.mu_layer = nn.Linear(hidden_dim, action_dim)
        self.log_std_layer = nn.Linear(hidden_dim, action_dim)

        self.max_action = max_action
        self.LOG_STD_MAX = 2
        self.LOG_STD_MIN = -20

    def forward(self, state):
        """Return ``(mu, log_std)`` for a batch of (already normalised) images."""
        hidden = F.relu(self.l1(self.encoder(state)))
        hidden = F.relu(self.l2(hidden))

        mu = self.mu_layer(hidden)
        # Keep log-std in a bounded range so exp() stays numerically well-behaved.
        log_std = self.log_std_layer(hidden).clamp(self.LOG_STD_MIN, self.LOG_STD_MAX)
        return mu, log_std

    def sample(self, state):
        """Draw a reparameterised action and its log-probability.

        Returns:
            ``(action, log_prob)`` where ``action`` lies in
            ``[-max_action, max_action]`` and ``log_prob`` has shape
            ``(batch, 1)`` (summed over action dimensions).
        """
        mu, log_std = self.forward(state)
        gaussian = torch.distributions.Normal(mu, log_std.exp())

        pre_tanh = gaussian.rsample()      # differentiable sample: mu + std * eps
        squashed = torch.tanh(pre_tanh)    # in [-1, 1]

        # Change-of-variables correction for the tanh squashing and scaling;
        # the 1e-6 keeps the log finite when tanh saturates.
        correction = torch.log(self.max_action * (1 - squashed.pow(2)) + 1e-6)
        log_prob = (gaussian.log_prob(pre_tanh) - correction).sum(1, keepdim=True)

        return squashed * self.max_action, log_prob

    def select_action(self, state):
        """Deterministic action (squashed mean) as a flat NumPy array."""
        with torch.no_grad():
            mu, _ = self.forward(state)
            greedy = torch.tanh(mu) * self.max_action
        return greedy.cpu().data.numpy().flatten()

class Critic(nn.Module):
    """Twin Q-networks, each with its own CNN encoder and MLP head.

    ``forward`` returns both Q estimates for a batch of ``(state, action)``
    pairs; each has shape ``(batch, 1)``.
    """

    def __init__(self, action_dim, hidden_dim=256):
        super().__init__()
        self.encoder1 = CNNEncoder()
        self.encoder2 = CNNEncoder()

        # Q1 head: image features concatenated with the action
        q1_in = self.encoder1.out_dim + action_dim
        self.l1 = nn.Linear(q1_in, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, hidden_dim)
        self.l3 = nn.Linear(hidden_dim, 1)

        # Q2 head: same layout, independent parameters
        q2_in = self.encoder2.out_dim + action_dim
        self.l4 = nn.Linear(q2_in, hidden_dim)
        self.l5 = nn.Linear(hidden_dim, hidden_dim)
        self.l6 = nn.Linear(hidden_dim, 1)

    @staticmethod
    def _q_value(encoder, layers, state, action):
        """Run one Q head: encode the image, append the action, apply the MLP."""
        first, second, out = layers
        features_and_action = torch.cat([encoder(state), action], 1)
        hidden = F.relu(first(features_and_action))
        hidden = F.relu(second(hidden))
        return out(hidden)

    def forward(self, state, action):
        """Return ``(q1, q2)`` for the given states and actions."""
        q1 = self._q_value(self.encoder1, (self.l1, self.l2, self.l3), state, action)
        q2 = self._q_value(self.encoder2, (self.l4, self.l5, self.l6), state, action)
        return q1, q2

# ==========================================
# 5. Soft Actor-Critic (SAC) Algorithm
# ==========================================
class SAC:
    """Soft Actor-Critic agent with twin critics and a fixed entropy coefficient.

    Args:
        action_dim: Number of action components.
        max_action: Scale applied to the tanh-squashed policy output.
        config: Hyperparameters (learning rates, ``gamma``, ``tau``, ``alpha``,
            ``batch_size``, ``hidden_dim``).
    """

    def __init__(self, action_dim, max_action, config: SACConfig):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.conf = config
        self.max_action = max_action

        # Actor: Outputs (Mean, Log_Std)
        self.actor = GaussianActor(action_dim, max_action, config.hidden_dim).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=config.actor_lr)

        # Critic: Two Q-functions, plus a target copy that starts identical
        self.critic = Critic(action_dim, config.hidden_dim).to(self.device)
        self.critic_target = Critic(action_dim, config.hidden_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=config.critic_lr)

        # Entropy Coefficient (fixed, no automatic tuning)
        self.alpha = config.alpha

    def select_action(self, state, evaluate=False):
        """Choose an action for a single raw (0-255) image observation.

        Args:
            state: One observation without a batch dimension.
            evaluate: If True return the deterministic (mean) action,
                otherwise sample from the policy.

        Returns:
            Flat NumPy array with one entry per action dimension.
        """
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device) / 255.0
        if evaluate:
            return self.actor.select_action(state)

        # Acting never needs gradients; without no_grad every environment step
        # would build (and immediately discard) a full autograd graph.
        with torch.no_grad():
            action, _ = self.actor.sample(state)
        return action.cpu().numpy().flatten()

    def train(self, replay_buffer):
        """Run one critic update, one actor update and one target soft-update.

        Returns:
            ``(critic_loss, actor_loss)`` as Python floats.
        """
        # Sample batch
        state, action, next_state, reward, not_done = replay_buffer.sample(self.conf.batch_size)

        # -------------------------------------
        # 1. Critic Update
        # -------------------------------------
        with torch.no_grad():
            # Sample next action from CURRENT policy (not target policy like TD3)
            # This follows the Algorithm 1 logic: a' ~ pi(.|s')
            next_action, next_log_prob = self.actor.sample(next_state)

            # Compute Target Q
            # y = r + gamma * (min(Q_targ1, Q_targ2) - alpha * log_pi)
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2) - self.alpha * next_log_prob
            # not_done masks out bootstrapping on terminal transitions
            target_Q = reward + not_done * self.conf.gamma * target_Q

        # Current Q
        current_Q1, current_Q2 = self.critic(state, action)

        # Critic Loss (MSE), summed over both Q-functions
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # -------------------------------------
        # 2. Actor Update
        # -------------------------------------
        # Sample action from current policy for current state
        # Algorithm 1: Update phi by one step of gradient ascent using:
        # min(Q(s, a_tilde)) - alpha * log_pi(a_tilde | s)
        new_action, log_prob = self.actor.sample(state)

        q1_new, q2_new = self.critic(state, new_action)
        q_new = torch.min(q1_new, q2_new)

        # We want to maximize (Q - alpha * log_prob), so we minimize -(Q - alpha * log_prob)
        actor_loss = (self.alpha * log_prob - q_new).mean()

        # Only the actor optimizer steps here; gradients that reach the critic
        # are cleared by critic_optimizer.zero_grad() on the next call.
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # -------------------------------------
        # 3. Soft Update Target Networks
        # -------------------------------------
        # target <- tau * online + (1 - tau) * target
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.conf.tau * param.data + (1 - self.conf.tau) * target_param.data)

        return critic_loss.item(), actor_loss.item()

    def save(self, filename):
        """Write critic and actor weights to ``<filename>_critic.pth`` / ``<filename>_actor.pth``."""
        torch.save(self.critic.state_dict(), filename + "_critic.pth")
        torch.save(self.actor.state_dict(), filename + "_actor.pth")

# ==========================================
# 6. EXPERT HEURISTIC AGENT
# ==========================================
class CarRacingHeuristic:
    """Rule-based driver used to collect warm-up data.

    It looks at a horizontal strip of the frame (rows 60-77), marks pixels
    that look gray (all three channels within 15 of each other and green
    above 60) as road, and steers proportionally towards the mean column of
    those pixels.
    """

    def __init__(self):
        self.target_speed = 0.1  # not read by act()
        self.k_p = 1.8  # Proportional gain for steering

    def act(self, state):
        """Compute ``[steering, gas, brake]`` for one channel-first frame.

        Args:
            state: Image of shape ``(3, H, W)`` as a NumPy array or torch
                tensor; the column arithmetic assumes ``W == 96``.

        Returns:
            ``np.float32`` array ``[steering, gas, brake]``. When no road
            pixel is found the car coasts straight with a light brake.
        """
        if isinstance(state, torch.Tensor):
            state = state.cpu().numpy()

        img = np.transpose(state, (1, 2, 0))
        crop = img[60:78, :, :]
        # Channel differences must be signed. On uint8 frames `r - g` wraps
        # around (100 - 105 == 251), so any gray pixel whose green is brighter
        # than red would be rejected as road. Widen unsigned data first.
        if np.issubdtype(crop.dtype, np.unsignedinteger):
            crop = crop.astype(np.int32)
        r, g, b = crop[:, :, 0], crop[:, :, 1], crop[:, :, 2]
        is_road = (np.abs(r - g) < 15) & (np.abs(g - b) < 15) & (g > 60)
        road_pixels = np.argwhere(is_road)

        if len(road_pixels) > 0:
            # Column 48 is the horizontal centre of a 96-pixel-wide frame.
            target_x = np.mean(road_pixels[:, 1])
            error = (target_x - 48.0) / 48.0
            steering = np.clip(error * self.k_p, -1.0, 1.0)
            # Ease off the throttle while turning hard.
            gas = 0.05 if abs(steering) > 0.3 else 0.2
            brake = 0.0
        else:
            steering = 0.0
            gas = 0.0
            brake = 0.1

        return np.array([steering, gas, brake], dtype=np.float32)

# ==========================================
# 7. Helpers: Evaluation & Recording
# ==========================================
def evaluate_and_record(policy, env_id, seed, step, run_name, video_folder="videos"):
    """Run one deterministic evaluation episode and record it to video.

    Args:
        policy: Object exposing ``select_action(state, evaluate=True)``.
        env_id: Gymnasium environment id to evaluate on.
        seed: Base seed; the episode is reset with ``seed + 100``.
        step: Training step, used as the video file prefix (``step-<step>``).
        run_name: Sub-folder of ``video_folder`` the video is written to.
        video_folder: Root directory for recorded videos.

    Returns:
        ``(rewards, video_path)`` where ``rewards`` is a one-element list with
        the episode return and ``video_path`` is the recorded file's path, or
        ``None`` if the expected file was not produced.
    """
    run_video_folder = os.path.join(video_folder, run_name)
    os.makedirs(run_video_folder, exist_ok=True)

    video_prefix = f"step-{step}"
    rewards = []
    video_path = None

    eval_env = gym.make(env_id, continuous=True, render_mode="rgb_array")
    # try/finally so the environment (and the recorder wrapping it) is closed
    # even if wrapping or the rollout raises; eval_env always refers to the
    # outermost wrapper built so far.
    try:
        eval_env = ImageTransposeWrapper(eval_env)
        eval_env = gym.wrappers.RecordVideo(
            eval_env,
            video_folder=run_video_folder,
            name_prefix=video_prefix,
            episode_trigger=lambda x: True,  # record every episode
            disable_logger=True
        )

        for i in range(1):
            state, _ = eval_env.reset(seed=seed + 100 + i)
            terminated, truncated = False, False
            episode_reward = 0

            while not (terminated or truncated):
                # Pass evaluate=True to use deterministic action (mean)
                action = policy.select_action(np.array(state), evaluate=True)
                state, reward, terminated, truncated, _ = eval_env.step(action)
                episode_reward += reward

            rewards.append(episode_reward)
    finally:
        eval_env.close()

    # Look for the file only after close(), matching the original order of operations.
    expected_filename = f"{video_prefix}-episode-0.mp4"
    expected_path = os.path.join(run_video_folder, expected_filename)
    if os.path.exists(expected_path):
        video_path = expected_path

    return rewards, video_path

# ==========================================
# 8. Main Training Loop
# ==========================================
def run_training():
    """Train SAC on the configured environment with an expert warm-up phase.

    The first ``learning_starts`` steps are driven by ``CarRacingHeuristic``
    (with steering noise) purely to fill the replay buffer; afterwards the SAC
    policy acts and is updated once per environment step. Every ``eval_freq``
    steps a deterministic episode is recorded and logged to WandB. On success
    the actor and critic weights are saved if ``save_model`` is set.

    The environment is always closed, and the WandB run is finished with a
    non-zero exit code if training raises (including a keyboard interrupt).
    """
    conf = SACConfig()

    wandb.init(
        project=conf.project_name,
        name=conf.run_name,
        config=asdict(conf),
        monitor_gym=False,
        save_code=True
    )

    env = None
    try:
        env = gym.make(conf.env_id, continuous=True)
        env = ImageTransposeWrapper(env)

        env.action_space.seed(conf.seed)
        torch.manual_seed(conf.seed)
        np.random.seed(conf.seed)

        state_shape = env.observation_space.shape
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        replay_buffer = ReplayBuffer(state_shape, action_dim, conf.buffer_size)

        # Initialize SAC instead of TD3
        policy = SAC(action_dim, max_action, conf)

        # Initialize heuristic expert
        expert = CarRacingHeuristic()

        state, _ = env.reset(seed=conf.seed)
        episode_reward = 0
        episode_timesteps = 0
        episode_num = 0

        print(f"---------------------------------------")
        print(f"Starting Training: {conf.env_id} | SAC | Seed: {conf.seed}")
        print(f"Observation Shape: {state_shape} | Action Dim: {action_dim}")
        print(f"Warmup (Expert) Steps: {conf.learning_starts}")
        print(f"Alpha (Entropy Coeff): {conf.alpha}")
        print(f"---------------------------------------")

        for t in range(int(conf.total_timesteps)):
            episode_timesteps += 1

            # === 1. Warmup Phase (Expert Data Collection) ===
            if t < conf.learning_starts:
                action = expert.act(state)

                # Robustness noise for expert data
                action[0] += np.random.normal(0, 0.1)
                action[0] = np.clip(action[0], -1.0, 1.0)
                action[1] = np.clip(action[1], 0.0, 1.0)
                action[2] = np.clip(action[2], 0.0, 1.0)

            # === 2. Training Phase (SAC Policy) ===
            else:
                # SAC select_action already handles sampling and exploration
                action = policy.select_action(np.array(state), evaluate=False)

                # Enforce CarRacing specific constraints
                # (Note: Actor outputs tanh [-1, 1], but Gas/Brake need [0, 1])
                action[1] = max(action[1], 0.0)
                action[2] = max(action[2], 0.0)

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Only `terminated` is stored as done, so time-limit truncation
            # does not cut off bootstrapping in the TD target.
            replay_buffer.add(state, action, next_state, reward, float(terminated))
            state = next_state
            episode_reward += reward

            # Train Policy
            if t >= conf.learning_starts:
                critic_loss, actor_loss = policy.train(replay_buffer)

                if t % 100 == 0:
                    wandb.log({
                        "train/critic_loss": critic_loss,
                        "train/actor_loss": actor_loss
                    }, step=t)

            if done:
                wandb.log({
                    "train/episode_reward": episode_reward,
                    "train/episode_length": episode_timesteps
                }, step=t)

                print(f"Step {t} | Episode {episode_num} | Reward: {episode_reward:.2f}")

                state, _ = env.reset()
                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1

            # Evaluation
            if (t + 1) % conf.eval_freq == 0:
                print(f"Evaluating at step {t+1}...")

                eval_rewards, video_path = evaluate_and_record(
                    policy,
                    conf.env_id,
                    conf.seed,
                    step=t+1,
                    run_name=conf.run_name
                )

                mean_score = np.mean(eval_rewards)
                wandb.log({"eval/mean_reward": mean_score}, step=t)
                wandb.log({"eval/episode_reward": eval_rewards[0]}, step=t)

                if video_path:
                    print(f"Uploading video: {video_path}")
                    wandb.log({
                        "eval/video": wandb.Video(
                            video_path,
                            fps=30,
                            format="mp4",
                            caption=f"Eval Step {t+1} | Score: {mean_score:.2f}"
                        )
                    }, step=t)

        print("Training Complete.")

        if conf.save_model:
            save_path = "sac_model_artifacts"
            os.makedirs(save_path, exist_ok=True)
            policy.save(os.path.join(save_path, "sac_carracing"))
    except BaseException:
        # Includes KeyboardInterrupt: close the WandB run as failed instead of
        # leaving it open, then let the original error propagate.
        wandb.finish(exit_code=1)
        raise
    else:
        wandb.finish()
    finally:
        # Release the simulator whether training succeeded or not.
        if env is not None:
            env.close()

# Start training when this cell/script is executed directly (the guard is
# true in a notebook kernel and false when the code is imported as a module).
if __name__ == "__main__":
    run_training()



Virtual display started successfully.
Logged into WandB via Kaggle Secrets.


0,1
eval/mean_reward,▁
train/actor_loss,▃▅▂▃▃▃▂▄▁▃▅▃▃▃▃▆▂▃█▅▃▁▂
train/critic_loss,█▅▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/episode_length,▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁█
train/episode_reward,▂▂▂▂▂▂▂▂▂▂▃▂▁▂▂▂▂▂▂▂▁▂▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂█

0,1
eval/mean_reward,-69.75268
train/actor_loss,-0.70371
train/critic_loss,0.43919
train/episode_length,1000.0
train/episode_reward,-43.54839


---------------------------------------
Starting Training: CarRacing-v3 | SAC | Seed: 42
Observation Shape: (3, 96, 96) | Action Dim: 3
Warmup (Expert) Steps: 20000
Alpha (Entropy Coeff): 0.2
---------------------------------------
Step 999 | Episode 0 | Reward: 126.15
Step 1999 | Episode 1 | Reward: 129.51
Step 2999 | Episode 2 | Reward: 182.83
Step 3999 | Episode 3 | Reward: 198.25
Step 4999 | Episode 4 | Reward: 143.67
Step 5999 | Episode 5 | Reward: 137.29
Step 6999 | Episode 6 | Reward: 112.03
Step 7999 | Episode 7 | Reward: 134.97
Step 8999 | Episode 8 | Reward: 71.17
Step 9999 | Episode 9 | Reward: 228.47
Evaluating at step 10000...


  logger.warn(


Uploading video: videos/sac_car_8fe4f077/step-10000-episode-0.mp4
Step 10999 | Episode 10 | Reward: 75.32
Step 11999 | Episode 11 | Reward: 169.78
Step 12999 | Episode 12 | Reward: 98.05
Step 13999 | Episode 13 | Reward: 171.13
Step 14999 | Episode 14 | Reward: 190.32
Step 15999 | Episode 15 | Reward: 64.84
Step 16999 | Episode 16 | Reward: 205.92
Step 17999 | Episode 17 | Reward: 129.17
Step 18999 | Episode 18 | Reward: 206.86
Step 19999 | Episode 19 | Reward: 159.40
Evaluating at step 20000...




Uploading video: videos/sac_car_8fe4f077/step-20000-episode-0.mp4
Step 20999 | Episode 20 | Reward: -49.82
Step 21999 | Episode 21 | Reward: -74.03
Step 22999 | Episode 22 | Reward: -79.24
Step 23999 | Episode 23 | Reward: -78.57
Step 24999 | Episode 24 | Reward: -76.43
Step 25999 | Episode 25 | Reward: -52.38
Step 26999 | Episode 26 | Reward: -29.01
Step 27999 | Episode 27 | Reward: -23.33
Step 28999 | Episode 28 | Reward: -52.22
Step 29999 | Episode 29 | Reward: -22.26
Evaluating at step 30000...




Uploading video: videos/sac_car_8fe4f077/step-30000-episode-0.mp4
Step 30999 | Episode 30 | Reward: 48.41
Step 31999 | Episode 31 | Reward: 9.63
Step 32999 | Episode 32 | Reward: 28.30
Step 33999 | Episode 33 | Reward: 112.31
Step 34999 | Episode 34 | Reward: 166.44
Step 35999 | Episode 35 | Reward: 175.00
Step 36999 | Episode 36 | Reward: 155.89
Step 37999 | Episode 37 | Reward: 13.14
Step 38999 | Episode 38 | Reward: 157.73
Step 39999 | Episode 39 | Reward: 56.36
Evaluating at step 40000...




Uploading video: videos/sac_car_8fe4f077/step-40000-episode-0.mp4
Step 40999 | Episode 40 | Reward: 257.86
Step 41999 | Episode 41 | Reward: 224.23
Step 42999 | Episode 42 | Reward: -17.76
Step 43999 | Episode 43 | Reward: 239.22
Step 44999 | Episode 44 | Reward: 177.58
Step 45999 | Episode 45 | Reward: 314.01
Step 46999 | Episode 46 | Reward: 176.22
Step 47999 | Episode 47 | Reward: 59.85
Step 48999 | Episode 48 | Reward: 86.05
Step 49999 | Episode 49 | Reward: 226.80
Evaluating at step 50000...




Uploading video: videos/sac_car_8fe4f077/step-50000-episode-0.mp4
Step 50999 | Episode 50 | Reward: 246.67
Step 51999 | Episode 51 | Reward: 273.29
Step 52999 | Episode 52 | Reward: 318.18
Step 53999 | Episode 53 | Reward: 56.55
Step 54999 | Episode 54 | Reward: 208.51
Step 55999 | Episode 55 | Reward: 49.84
Step 56999 | Episode 56 | Reward: 65.32
Step 57999 | Episode 57 | Reward: 349.12
Step 58999 | Episode 58 | Reward: 347.37
Step 59999 | Episode 59 | Reward: 456.86
Evaluating at step 60000...




Uploading video: videos/sac_car_8fe4f077/step-60000-episode-0.mp4
Step 60999 | Episode 60 | Reward: 110.00
Step 61999 | Episode 61 | Reward: 391.35
Step 62715 | Episode 62 | Reward: 410.88
Step 63715 | Episode 63 | Reward: 186.18
Step 64715 | Episode 64 | Reward: 102.80
Step 65715 | Episode 65 | Reward: 223.62
Step 66715 | Episode 66 | Reward: 227.40
Step 67715 | Episode 67 | Reward: 121.88
Step 68715 | Episode 68 | Reward: 403.40
Step 69715 | Episode 69 | Reward: 455.89
Evaluating at step 70000...




Uploading video: videos/sac_car_8fe4f077/step-70000-episode-0.mp4
Step 70715 | Episode 70 | Reward: 334.78
Step 71715 | Episode 71 | Reward: 297.16
Step 72715 | Episode 72 | Reward: 151.68
Step 73715 | Episode 73 | Reward: 56.55
Step 74715 | Episode 74 | Reward: 46.76
Step 75715 | Episode 75 | Reward: 308.09
Step 76715 | Episode 76 | Reward: 113.74
Step 77715 | Episode 77 | Reward: 389.96
Step 78715 | Episode 78 | Reward: 254.72
Step 79715 | Episode 79 | Reward: 237.54
Evaluating at step 80000...




Uploading video: videos/sac_car_8fe4f077/step-80000-episode-0.mp4
Step 80715 | Episode 80 | Reward: 440.37
Step 81715 | Episode 81 | Reward: 312.64
Step 82715 | Episode 82 | Reward: 127.85
Step 83715 | Episode 83 | Reward: 440.00
Step 84715 | Episode 84 | Reward: 197.87
Step 85715 | Episode 85 | Reward: -39.94
Step 86715 | Episode 86 | Reward: 193.55
Step 87715 | Episode 87 | Reward: 238.52
Step 88715 | Episode 88 | Reward: -30.31
Step 89715 | Episode 89 | Reward: 177.78
Evaluating at step 90000...




Uploading video: videos/sac_car_8fe4f077/step-90000-episode-0.mp4
Step 90715 | Episode 90 | Reward: 497.60
Step 91715 | Episode 91 | Reward: 324.75
Step 92715 | Episode 92 | Reward: 283.05
Step 93715 | Episode 93 | Reward: 258.26
Step 94715 | Episode 94 | Reward: 200.00
Step 95715 | Episode 95 | Reward: 341.38
Step 96715 | Episode 96 | Reward: 401.72
Step 97715 | Episode 97 | Reward: 312.59
Step 98715 | Episode 98 | Reward: 163.89
Step 99715 | Episode 99 | Reward: 511.30
Evaluating at step 100000...




Uploading video: videos/sac_car_8fe4f077/step-100000-episode-0.mp4
Step 100715 | Episode 100 | Reward: 177.19
Step 101715 | Episode 101 | Reward: 240.07
Step 102715 | Episode 102 | Reward: 138.81
Step 103715 | Episode 103 | Reward: 118.75
Step 104715 | Episode 104 | Reward: 242.11
Step 105715 | Episode 105 | Reward: 197.03
Step 106715 | Episode 106 | Reward: 57.89
Step 107715 | Episode 107 | Reward: 219.39
Step 108715 | Episode 108 | Reward: 143.67
Step 109715 | Episode 109 | Reward: 132.48
Evaluating at step 110000...




Uploading video: videos/sac_car_8fe4f077/step-110000-episode-0.mp4
Step 110715 | Episode 110 | Reward: 182.76
Step 111715 | Episode 111 | Reward: 354.20
Step 112715 | Episode 112 | Reward: 116.03
Step 113715 | Episode 113 | Reward: 206.67
Step 114715 | Episode 114 | Reward: 177.03
Step 115715 | Episode 115 | Reward: 300.68
Step 116715 | Episode 116 | Reward: 422.44
Step 117715 | Episode 117 | Reward: 360.38
Step 118715 | Episode 118 | Reward: 175.64
Step 119715 | Episode 119 | Reward: 31.27
Evaluating at step 120000...




Uploading video: videos/sac_car_8fe4f077/step-120000-episode-0.mp4
Step 120715 | Episode 120 | Reward: -21.71
Step 121715 | Episode 121 | Reward: 225.84
Step 122715 | Episode 122 | Reward: 239.68
Step 123715 | Episode 123 | Reward: 860.29
Step 124715 | Episode 124 | Reward: 131.83
Step 125715 | Episode 125 | Reward: 881.95
Step 126715 | Episode 126 | Reward: 222.46
Step 127715 | Episode 127 | Reward: 247.13
Step 128715 | Episode 128 | Reward: 212.29
Step 129715 | Episode 129 | Reward: 460.13
Evaluating at step 130000...




Uploading video: videos/sac_car_8fe4f077/step-130000-episode-0.mp4
Step 130715 | Episode 130 | Reward: 183.28
Step 131715 | Episode 131 | Reward: 851.39
Step 132715 | Episode 132 | Reward: 893.03
Step 133715 | Episode 133 | Reward: 631.71
Step 134715 | Episode 134 | Reward: 235.88
Step 135715 | Episode 135 | Reward: 210.34
