In [1]:
!pip install gymnasium[classic-control] torch wandb
!pip install wandb -qU

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.2/20.2 MB[0m [31m98.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import wandb
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myousefyousefyousef335[0m ([33myousefyousefyousef335-cairo-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Carpole and Acrobot

In [5]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
from collections import deque, namedtuple
import math
import time
import wandb
import os

# Use CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- 1. Replay Memory ---
# A named tuple to store individual transitions
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

class ReplayMemory(object):
    """A simple replay buffer."""
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        """Sample a batch of transitions."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

# --- 2. Q-Network Model ---
class QNetwork(nn.Module):
    """Simple MLP network for Q-value approximation."""
    def __init__(self, n_observations, n_actions):
        super(QNetwork, self).__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)

# --- 3. DQN Agent ---
class DQNAgent:
    """
    Main agent class to handle training, testing, and logging.
    """
    def __init__(self, config):
        self.config = config
        self.env_name = config["env_name"]

        # Create environment
        self.env = gym.make(self.env_name)
        n_observations = self.env.observation_space.shape[0]
        n_actions = self.env.action_space.n

        # Get max steps from environment spec
        self.max_episode_steps = self.env.spec.max_episode_steps

        # Initialize W&B
        self.run = wandb.init(
            project=f"Assignment1_v1",
            config=config,
            group=config["run_group"],
            name=config["run_name"],
            reinit=True
        )

        # Initialize networks
        self.policy_net = QNetwork(n_observations, n_actions).to(device)
        self.target_net = QNetwork(n_observations, n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # Target network is only for evaluation

        self.optimizer = optim.Adam(
            self.policy_net.parameters(),
            lr=self.config["learning_rate"]
        )
        self.memory = ReplayMemory(self.config["memory_size"])
        self.criterion = nn.SmoothL1Loss() # Huber Loss

        self.steps_done = 0

    def select_action(self, state, epsilon):
        """Epsilon-greedy action selection."""
        if random.random() > epsilon:
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            return torch.tensor(
                [[self.env.action_space.sample()]],
                device=device, dtype=torch.long
            )

    def optimize_model(self):
        """Performs one optimization step on the policy network."""
        if len(self.memory) < self.config["batch_size"]:
            return None  # Not enough memory to sample

        transitions = self.memory.sample(self.config["batch_size"])
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        non_final_mask = torch.tensor(
            tuple(s is not None for s in batch.next_state),
            device=device, dtype=torch.bool
        )
        next_state_batch = torch.cat(
            [s for s in batch.next_state if s is not None]
        )

        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        next_state_values = torch.zeros(self.config["batch_size"], device=device)

        if self.config["use_ddqn"]:
            # --- DDQN ---
            best_next_actions = self.policy_net(next_state_batch).max(1)[1].unsqueeze(1)
            next_state_values[non_final_mask] = self.target_net(
                next_state_batch
            ).gather(1, best_next_actions).squeeze(1).detach()
        else:
            # --- Standard DQN ---
            next_state_values[non_final_mask] = self.target_net(
                next_state_batch
            ).max(1)[0].detach()

        expected_state_action_values = (
            next_state_values * self.config["gamma"]
        ) + reward_batch

        loss = self.criterion(
            state_action_values,
            expected_state_action_values.unsqueeze(1)
        )

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def train_agent(self):
        """Main training loop."""
        print(f"--- Starting Training for: {self.config['run_name']} ---")
        num_episodes = self.config["num_episodes"]

        for i_episode in range(num_episodes):
            state, _ = self.env.reset()
            state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

            episode_loss = 0
            episode_duration = 0

            for t in range(self.max_episode_steps):
                epsilon = self.config["eps_end"] + (
                    self.config["eps_start"] - self.config["eps_end"]
                ) * math.exp(-1. * self.steps_done / self.config["eps_decay"])

                self.steps_done += 1

                action = self.select_action(state, epsilon)
                observation, reward, terminated, truncated, _ = self.env.step(action.item())
                done = terminated or truncated

                reward = torch.tensor([reward], device=device)

                if terminated:
                    next_state = None
                else:
                    next_state = torch.tensor(
                        observation, dtype=torch.float32, device=device
                    ).unsqueeze(0)

                self.memory.push(state, action, next_state, reward, done)
                state = next_state
                loss = self.optimize_model()

                if loss:
                    episode_loss += loss

                # Soft update
                target_net_state_dict = self.target_net.state_dict()
                policy_net_state_dict = self.policy_net.state_dict()
                tau = self.config["tau"]
                for key in policy_net_state_dict:
                    target_net_state_dict[key] = policy_net_state_dict[key]*tau + target_net_state_dict[key]*(1-tau)
                self.target_net.load_state_dict(target_net_state_dict)

                episode_duration += 1

                if done:
                    break

            # Log to W&B
            self.run.log({
                "episode": i_episode,
                "duration": episode_duration,
                "epsilon": epsilon,
                "avg_loss": episode_loss / episode_duration if episode_duration > 0 else 0
            })

            if i_episode % 100 == 0:
                print(f"Episode {i_episode}/{num_episodes} | Duration: {episode_duration} | Epsilon: {epsilon:.4f}")

        print("--- Training Complete ---")

    def test_agent(self, num_tests=100):
        """Test the trained agent for 100 episodes."""
        print(f"--- Running {num_tests} Test Episodes ---")
        test_env = gym.make(self.env_name)
        test_durations = []

        for _ in range(num_tests):
            state, _ = test_env.reset()
            state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
            terminated = False
            truncated = False
            duration = 0

            while not (terminated or truncated):
                with torch.no_grad():
                    action = self.policy_net(state).max(1)[1].view(1, 1)

                observation, _, terminated, truncated, _ = test_env.step(action.item())
                state = torch.tensor(
                    observation, dtype=torch.float32, device=device
                ).unsqueeze(0)
                duration += 1
            test_durations.append(duration)

        test_env.close()

        avg_duration = np.mean(test_durations)
        std_duration = np.std(test_durations)

        print(f"Test Results: Avg Duration = {avg_duration:.2f} +/- {std_duration:.2f}")
        self.run.log({
            "test_avg_duration": avg_duration,
            "test_std_duration": std_duration,
            "test_durations": wandb.Histogram(test_durations)
        })

    def record_video(self):
        """Record a video of the agent acting in the environment."""
        print("--- Recording Video ---")
        video_dir = f"./videos/{self.config['run_name']}"

        if not os.path.exists(video_dir):
            os.makedirs(video_dir)

        video_env = gym.make(self.env_name, render_mode="rgb_array")
        video_env = gym.wrappers.RecordVideo(
            video_env,
            video_folder=video_dir,
            name_prefix=f"{self.config['run_name']}-agent"
        )

        state, _ = video_env.reset()
        state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
        terminated = False
        truncated = False

        while not (terminated or truncated):
            with torch.no_grad():
                action = self.policy_net(state).max(1)[1].view(1, 1)

            observation, _, terminated, truncated, _ = video_env.step(action.item())
            state = torch.tensor(
                observation, dtype=torch.float32, device=device
            ).unsqueeze(0)

        video_env.close()
        print(f"Video saved to {video_dir}")

    def close(self):
        """Close the environment and W&B run."""
        self.env.close()
        self.run.finish()

# --- 4. Main Execution and Hyperparameter Search ---
if __name__ == "__main__":

    # Environments to run
    # Pendulum-v1 is EXCLUDED as it requires a different algorithm
    compatible_environments = ["CartPole-v1", "Acrobot-v1"]

    # Define the hyperparameter search space
    search_space = [
        {
            "config_name": "DQN_High_LR_High_Mem",
            "use_ddqn": False,
            "learning_rate": 0.001,
            "memory_size": 10000,
            "batch_size": 128,
            "gamma": 0.99,
            "eps_decay": 1000,
        },
        {
            "config_name": "DQN_Low_LR_Low_Mem",
            "use_ddqn": False,
            "learning_rate": 0.0001,
            "memory_size": 2000,
            "batch_size": 32,
            "gamma": 0.99,
            "eps_decay": 1000,
        },
        {
            "config_name": "DDQN_High_LR_High_Mem",
            "use_ddqn": True,
            "learning_rate": 0.001,
            "memory_size": 10000,
            "batch_size": 128,
            "gamma": 0.99,
            "eps_decay": 1000,
        },
        {
            "config_name": "DDQN_Low_LR_Low_Mem",
            "use_ddqn": True,
            "learning_rate": 0.0001,
            "memory_size": 2000,
            "batch_size": 32,
            "gamma": 0.95,
            "eps_decay": 2000,
        },
    ]

    # Common parameters for all runs
    common_params = {
        "eps_start": 0.9,
        "eps_end": 0.05,
        "target_update_mode": "soft",
        "tau": 0.005,
    }

    # --- Run the Search for each Environment ---
    for env_name in compatible_environments:
        print(f"\n=========================================")
        print(f"    STARTING EXPERIMENTS FOR: {env_name} ")
        print(f"=========================================\n")

        # Adjust training length based on environment difficulty
        if "Acrobot" in env_name:
            num_episodes = 2000
        else:
            num_episodes = 500

        for i, specific_config in enumerate(search_space):
            print(f"\n--- Starting Run {i+1}/{len(search_space)}: {specific_config['config_name']} ---")

            config = {**common_params, **specific_config}
            config["env_name"] = env_name
            config["num_episodes"] = num_episodes

            # Create unique W&B names
            model_type = "DDQN" if config["use_ddqn"] else "DQN"
            config["run_name"] = f"{env_name}_{model_type}_{specific_config['config_name']}"
            config["run_group"] = f"Group_{env_name}"

            try:
                agent = DQNAgent(config)
                agent.train_agent()
                agent.test_agent(num_tests=100)
                agent.record_video()
            except Exception as e:
                print(f"Run {config['run_name']} failed: {e}")
            finally:
                if 'agent' in locals():
                    agent.close()
                time.sleep(5)

    print("--- All experiments complete. Check Weights & Biases dashboard! ---")


    STARTING EXPERIMENTS FOR: CartPole-v1 


--- Starting Run 1/4: DQN_High_LR_High_Mem ---


--- Starting Training for: CartPole-v1_DQN_DQN_High_LR_High_Mem ---
Episode 0/500 | Duration: 16 | Epsilon: 0.8873
Episode 100/500 | Duration: 51 | Epsilon: 0.0501
Episode 200/500 | Duration: 500 | Epsilon: 0.0500
Episode 300/500 | Duration: 102 | Epsilon: 0.0500
Episode 400/500 | Duration: 500 | Epsilon: 0.0500
--- Training Complete ---
--- Running 100 Test Episodes ---
Test Results: Avg Duration = 47.48 +/- 36.94
--- Recording Video ---


  logger.warn(


Video saved to ./videos/CartPole-v1_DQN_DQN_High_LR_High_Mem


0,1
avg_loss,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▁▁▂▁▂▂▃▄▅▇▆▆▆▆▆▇▇▇█▅▅▄▂▂▂
duration,▁▁▁▁▁▇▂██▃▄▅▂▁▃████▁████▁▂▂▁▂▁▂▂▅█▅██▁▄█
episode,▁▁▁▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████
epsilon,███▆▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_avg_duration,▁
test_std_duration,▁

0,1
avg_loss,0.12538
duration,117.0
episode,499.0
epsilon,0.05
test_avg_duration,47.48
test_std_duration,36.93521



--- Starting Run 2/4: DQN_Low_LR_Low_Mem ---


--- Starting Training for: CartPole-v1_DQN_DQN_Low_LR_Low_Mem ---
Episode 0/500 | Duration: 14 | Epsilon: 0.8890
Episode 100/500 | Duration: 14 | Epsilon: 0.2579
Episode 200/500 | Duration: 10 | Epsilon: 0.1090
Episode 300/500 | Duration: 171 | Epsilon: 0.0502
Episode 400/500 | Duration: 97 | Epsilon: 0.0500
--- Training Complete ---
--- Running 100 Test Episodes ---
Test Results: Avg Duration = 95.10 +/- 5.12
--- Recording Video ---


  logger.warn(


Video saved to ./videos/CartPole-v1_DQN_DQN_Low_LR_Low_Mem


0,1
avg_loss,▃▂▁▁▂▂▂▂▂▂▃▄▄▄▃▄▄▆▇▅▇▇▇▇█▃▄▄▃▄▄▄▄▄▄▄▄▄▄▃
duration,▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▇▇█▆▇▆▅▅▅▅▅▅▅▅▅▅
episode,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
epsilon,█▇▆▅▅▄▄▄▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_avg_duration,▁
test_std_duration,▁

0,1
avg_loss,0.54274
duration,92.0
episode,499.0
epsilon,0.05
test_avg_duration,95.1
test_std_duration,5.11957



--- Starting Run 3/4: DDQN_High_LR_High_Mem ---


--- Starting Training for: CartPole-v1_DDQN_DDQN_High_LR_High_Mem ---
Episode 0/500 | Duration: 10 | Epsilon: 0.8924
Episode 100/500 | Duration: 137 | Epsilon: 0.0500
Episode 200/500 | Duration: 44 | Epsilon: 0.0500
Episode 300/500 | Duration: 26 | Epsilon: 0.0500
Episode 400/500 | Duration: 500 | Epsilon: 0.0500
--- Training Complete ---
--- Running 100 Test Episodes ---
Test Results: Avg Duration = 199.74 +/- 4.92
--- Recording Video ---


  logger.warn(


Video saved to ./videos/CartPole-v1_DDQN_DDQN_High_LR_High_Mem


0,1
avg_loss,▁▁▁▂▂▃▃▃▂▂▂▃▅▆▄▃▄▆▅▆▆▇▆▇█▇█▆▇▄▄▅▅▃▁▂▃▂▃▂
duration,▁▁▁▅▃▂▂▃▃▃▃▂▂▂▃▁▃▃▃▁▂▂▂██▁▁▃▃▃█▂▂▅▁█▅▅█▅
episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇████
epsilon,██▅▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_avg_duration,▁
test_std_duration,▁

0,1
avg_loss,0.16193
duration,202.0
episode,499.0
epsilon,0.05
test_avg_duration,199.74
test_std_duration,4.92467



--- Starting Run 4/4: DDQN_Low_LR_Low_Mem ---


--- Starting Training for: CartPole-v1_DDQN_DDQN_Low_LR_Low_Mem ---
Episode 0/500 | Duration: 26 | Epsilon: 0.8894
Episode 100/500 | Duration: 28 | Epsilon: 0.3845
Episode 200/500 | Duration: 163 | Epsilon: 0.1440
Episode 300/500 | Duration: 202 | Epsilon: 0.0500
Episode 400/500 | Duration: 500 | Epsilon: 0.0500
--- Training Complete ---
--- Running 100 Test Episodes ---
Test Results: Avg Duration = 243.53 +/- 30.54
--- Recording Video ---


  logger.warn(


Video saved to ./videos/CartPole-v1_DDQN_DDQN_Low_LR_Low_Mem


0,1
avg_loss,▁▂▂▃▃▄▄▄▅▅▆▅▆▇▇▆█▂▁▂▂▂▂▂▂▂▁▁▁▁▂▁▁▁▁▃▂▁▁▁
duration,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▄▄▃▄▄▃▄▄▄▄▅█▄██████▂█
episode,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇███
epsilon,█▇▇▆▆▅▅▄▄▄▃▃▃▃▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_avg_duration,▁
test_std_duration,▁

0,1
avg_loss,0.03775
duration,208.0
episode,499.0
epsilon,0.05
test_avg_duration,243.53
test_std_duration,30.53963



    STARTING EXPERIMENTS FOR: Acrobot-v1 


--- Starting Run 1/4: DQN_High_LR_High_Mem ---


--- Starting Training for: Acrobot-v1_DQN_DQN_High_LR_High_Mem ---
Episode 0/2000 | Duration: 500 | Epsilon: 0.5661
Episode 100/2000 | Duration: 71 | Epsilon: 0.0500
Episode 200/2000 | Duration: 97 | Epsilon: 0.0500
Episode 300/2000 | Duration: 109 | Epsilon: 0.0500
Episode 400/2000 | Duration: 84 | Epsilon: 0.0500
Episode 500/2000 | Duration: 84 | Epsilon: 0.0500
Episode 600/2000 | Duration: 88 | Epsilon: 0.0500
Episode 700/2000 | Duration: 89 | Epsilon: 0.0500
Episode 800/2000 | Duration: 94 | Epsilon: 0.0500
Episode 900/2000 | Duration: 100 | Epsilon: 0.0500
Episode 1000/2000 | Duration: 87 | Epsilon: 0.0500
Episode 1100/2000 | Duration: 125 | Epsilon: 0.0500
Episode 1200/2000 | Duration: 86 | Epsilon: 0.0500
Episode 1300/2000 | Duration: 89 | Epsilon: 0.0500
Episode 1400/2000 | Duration: 69 | Epsilon: 0.0500
Episode 1500/2000 | Duration: 66 | Epsilon: 0.0500
Episode 1600/2000 | Duration: 76 | Epsilon: 0.0500
Episode 1700/2000 | Duration: 107 | Epsilon: 0.0500
Episode 1800/2000 | Du

  logger.warn(


Video saved to ./videos/Acrobot-v1_DQN_DQN_High_LR_High_Mem


0,1
avg_loss,▁▁▆▆██▆▆▅▅▅▅▅▆▄▄▄▄▄▃▄▄▄▄▄▅▅▅▅▅▄▄▃▃▄▄▄▄▄▃
duration,▄▄▄▂▂▄▄▄▃▃▁▅▅▃▃▃▄▃▆▄▄▅█▂▁▃▂▄▄▅▂▂▆▃▄▇▂▂▇▃
episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▇▇▇▇▇▇▇██
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_avg_duration,▁
test_std_duration,▁

0,1
avg_loss,0.22421
duration,72.0
episode,1999.0
epsilon,0.05
test_avg_duration,69.09
test_std_duration,10.86747



--- Starting Run 2/4: DQN_Low_LR_Low_Mem ---


--- Starting Training for: Acrobot-v1_DQN_DQN_Low_LR_Low_Mem ---
Episode 0/2000 | Duration: 500 | Epsilon: 0.5661
Episode 100/2000 | Duration: 500 | Epsilon: 0.0500
Episode 200/2000 | Duration: 118 | Epsilon: 0.0500
Episode 300/2000 | Duration: 121 | Epsilon: 0.0500
Episode 400/2000 | Duration: 96 | Epsilon: 0.0500
Episode 500/2000 | Duration: 105 | Epsilon: 0.0500
Episode 600/2000 | Duration: 114 | Epsilon: 0.0500
Episode 700/2000 | Duration: 86 | Epsilon: 0.0500
Episode 800/2000 | Duration: 84 | Epsilon: 0.0500
Episode 900/2000 | Duration: 99 | Epsilon: 0.0500
Episode 1000/2000 | Duration: 81 | Epsilon: 0.0500
Episode 1100/2000 | Duration: 64 | Epsilon: 0.0500
Episode 1200/2000 | Duration: 89 | Epsilon: 0.0500
Episode 1300/2000 | Duration: 133 | Epsilon: 0.0500
Episode 1400/2000 | Duration: 91 | Epsilon: 0.0500
Episode 1500/2000 | Duration: 127 | Epsilon: 0.0500
Episode 1600/2000 | Duration: 75 | Epsilon: 0.0500
Episode 1700/2000 | Duration: 78 | Epsilon: 0.0500
Episode 1800/2000 | D

  logger.warn(


Video saved to ./videos/Acrobot-v1_DQN_DQN_Low_LR_Low_Mem


0,1
avg_loss,▁▁▁▁▁▅▇▆▆▇▆▇█▆▆▆▇▅▅▆▆▆▅▅▄▄▄▄▄▄▄▄▄▄▄▄▃▃▃▃
duration,███▂▂▂▂▂▁▂▁▁▁▁▁▂▁▂▁▁▂▂▁▂▄▁▂▁▁▁▁▂▂▂▁▁▁▁▁▁
episode,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇█████
epsilon,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_avg_duration,▁
test_std_duration,▁

0,1
avg_loss,0.27119
duration,88.0
episode,1999.0
epsilon,0.05
test_avg_duration,99.04
test_std_duration,90.46512



--- Starting Run 3/4: DDQN_High_LR_High_Mem ---


--- Starting Training for: Acrobot-v1_DDQN_DDQN_High_LR_High_Mem ---
Episode 0/2000 | Duration: 500 | Epsilon: 0.5661
Episode 100/2000 | Duration: 121 | Epsilon: 0.0500
Episode 200/2000 | Duration: 79 | Epsilon: 0.0500
Episode 300/2000 | Duration: 83 | Epsilon: 0.0500
Episode 400/2000 | Duration: 113 | Epsilon: 0.0500
Episode 500/2000 | Duration: 116 | Epsilon: 0.0500
Episode 600/2000 | Duration: 71 | Epsilon: 0.0500
Episode 700/2000 | Duration: 75 | Epsilon: 0.0500
Episode 800/2000 | Duration: 99 | Epsilon: 0.0500
Episode 900/2000 | Duration: 88 | Epsilon: 0.0500
Episode 1000/2000 | Duration: 63 | Epsilon: 0.0500
Episode 1100/2000 | Duration: 105 | Epsilon: 0.0500
Episode 1200/2000 | Duration: 105 | Epsilon: 0.0500
Episode 1300/2000 | Duration: 97 | Epsilon: 0.0500
Episode 1400/2000 | Duration: 103 | Epsilon: 0.0500
Episode 1500/2000 | Duration: 63 | Epsilon: 0.0500
Episode 1600/2000 | Duration: 78 | Epsilon: 0.0500
Episode 1700/2000 | Duration: 98 | Epsilon: 0.0500
Episode 1800/2000 

  logger.warn(


Video saved to ./videos/Acrobot-v1_DDQN_DDQN_High_LR_High_Mem


0,1
avg_loss,▁▇█▇▅▅▅▅▅▅▅▅▆▅▆▅▆▆▆▅▆▆▅▆▆▆▆▆▇▆▆▆▆▆▆▆▆▅▆▅
duration,█▁▁▁▁▁▁▂▂▂▁▁▃█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▂▁▁▁▁▁
episode,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇█
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_avg_duration,▁
test_std_duration,▁

0,1
avg_loss,0.26219
duration,63.0
episode,1999.0
epsilon,0.05
test_avg_duration,73.39
test_std_duration,11.89193



--- Starting Run 4/4: DDQN_Low_LR_Low_Mem ---


--- Starting Training for: Acrobot-v1_DDQN_DDQN_Low_LR_Low_Mem ---
Episode 0/2000 | Duration: 500 | Epsilon: 0.7123
Episode 100/2000 | Duration: 414 | Epsilon: 0.0500
Episode 200/2000 | Duration: 178 | Epsilon: 0.0500
Episode 300/2000 | Duration: 244 | Epsilon: 0.0500
Episode 400/2000 | Duration: 235 | Epsilon: 0.0500
Episode 500/2000 | Duration: 164 | Epsilon: 0.0500
Episode 600/2000 | Duration: 180 | Epsilon: 0.0500
Episode 700/2000 | Duration: 136 | Epsilon: 0.0500
Episode 800/2000 | Duration: 96 | Epsilon: 0.0500
Episode 900/2000 | Duration: 107 | Epsilon: 0.0500
Episode 1000/2000 | Duration: 109 | Epsilon: 0.0500
Episode 1100/2000 | Duration: 94 | Epsilon: 0.0500
Episode 1200/2000 | Duration: 109 | Epsilon: 0.0500
Episode 1300/2000 | Duration: 160 | Epsilon: 0.0500
Episode 1400/2000 | Duration: 121 | Epsilon: 0.0500
Episode 1500/2000 | Duration: 102 | Epsilon: 0.0500
Episode 1600/2000 | Duration: 103 | Epsilon: 0.0500
Episode 1700/2000 | Duration: 114 | Epsilon: 0.0500
Episode 180

  logger.warn(


Video saved to ./videos/Acrobot-v1_DDQN_DDQN_Low_LR_Low_Mem


0,1
avg_loss,▁▃▃▁▅▄▆▄▄▄▆█▆▅▆▅▅▅▆▅▆▆▄▆▅▇▆▆▇▆▄▇▆▅▇▆▆▇▇▆
duration,█▄█▆▄▂▂▁▂▂▁▂▂▁▃▂▄▂▃▂▁▁▂▂▁▁▂▁▂▁▁▂▂▂▁▁▃▂▁▂
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇█████
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_avg_duration,▁
test_std_duration,▁

0,1
avg_loss,0.08371
duration,86.0
episode,1999.0
epsilon,0.05
test_avg_duration,115.43
test_std_duration,82.00027



    STARTING EXPERIMENTS FOR: MountainCar-v0 


--- Starting Run 1/4: DQN_High_LR_High_Mem ---


--- Starting Training for: MountainCar-v0_DQN_DQN_High_LR_High_Mem ---
Episode 0/2000 | Duration: 200 | Epsilon: 0.7466
Episode 100/2000 | Duration: 200 | Epsilon: 0.0500
Episode 200/2000 | Duration: 200 | Epsilon: 0.0500
Episode 300/2000 | Duration: 200 | Epsilon: 0.0500
Episode 400/2000 | Duration: 200 | Epsilon: 0.0500
Episode 500/2000 | Duration: 200 | Epsilon: 0.0500
Episode 600/2000 | Duration: 200 | Epsilon: 0.0500
Episode 700/2000 | Duration: 200 | Epsilon: 0.0500
Episode 800/2000 | Duration: 200 | Epsilon: 0.0500
Episode 900/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1000/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1100/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1200/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1300/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1400/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1500/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1600/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1700/2000 | Duration: 200 | Epsilon: 0.0500
Episo

  logger.warn(


Video saved to ./videos/MountainCar-v0_DQN_DQN_High_LR_High_Mem


0,1
avg_loss,▂▃▃▁▂▄▃▃▅▁▅▁▄▅▃▄▃▄▄▄▃▅▃▃▄▁▄▁█▄▃▄▄▄▂▅▆▄▄▃
duration,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇██
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_avg_duration,▁
test_std_duration,▁

0,1
avg_loss,0.00037
duration,200.0
episode,1999.0
epsilon,0.05
test_avg_duration,200.0
test_std_duration,0.0



--- Starting Run 2/4: DQN_Low_LR_Low_Mem ---


--- Starting Training for: MountainCar-v0_DQN_DQN_Low_LR_Low_Mem ---
Episode 0/2000 | Duration: 200 | Epsilon: 0.7466
Episode 100/2000 | Duration: 200 | Epsilon: 0.0500
Episode 200/2000 | Duration: 200 | Epsilon: 0.0500
Episode 300/2000 | Duration: 200 | Epsilon: 0.0500
Run MountainCar-v0_DQN_DQN_Low_LR_Low_Mem failed: 'TimeLimit' object has no attribute 'goal_position'


0,1
avg_loss,▁▁▁▂▁▂▁▁▁▁▁▁▂▃▂▂▂▃▂▂▁▁▁▁▂▂▂▂▂▂▃▁▁█▆▄▂▂▃█
duration,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇██
epsilon,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
avg_loss,0.00102
duration,200.0
episode,326.0
epsilon,0.05



--- Starting Run 3/4: DDQN_High_LR_High_Mem ---


--- Starting Training for: MountainCar-v0_DDQN_DDQN_High_LR_High_Mem ---
Episode 0/2000 | Duration: 200 | Epsilon: 0.7466
Episode 100/2000 | Duration: 200 | Epsilon: 0.0500
Episode 200/2000 | Duration: 200 | Epsilon: 0.0500
Episode 300/2000 | Duration: 200 | Epsilon: 0.0500
Episode 400/2000 | Duration: 200 | Epsilon: 0.0500
Episode 500/2000 | Duration: 200 | Epsilon: 0.0500
Episode 600/2000 | Duration: 200 | Epsilon: 0.0500
Episode 700/2000 | Duration: 200 | Epsilon: 0.0500
Episode 800/2000 | Duration: 200 | Epsilon: 0.0500
Episode 900/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1000/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1100/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1200/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1300/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1400/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1500/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1600/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1700/2000 | Duration: 200 | Epsilon: 0.0500
Epi

  logger.warn(


Video saved to ./videos/MountainCar-v0_DDQN_DDQN_High_LR_High_Mem


0,1
avg_loss,▅▂▄▄▄▅▃▄▆▅▅▃▂▆▆▄▅▄▁▆▂█▅▅▃▆▃▆▃▅▁▅▄▄▆▄▄▄▄▄
duration,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
epsilon,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_avg_duration,▁
test_std_duration,▁

0,1
avg_loss,0.00039
duration,200.0
episode,1999.0
epsilon,0.05
test_avg_duration,200.0
test_std_duration,0.0



--- Starting Run 4/4: DDQN_Low_LR_Low_Mem ---


--- Starting Training for: MountainCar-v0_DDQN_DDQN_Low_LR_Low_Mem ---
Episode 0/2000 | Duration: 200 | Epsilon: 0.8195
Episode 100/2000 | Duration: 200 | Epsilon: 0.0500
Episode 200/2000 | Duration: 200 | Epsilon: 0.0500
Episode 300/2000 | Duration: 200 | Epsilon: 0.0500
Episode 400/2000 | Duration: 200 | Epsilon: 0.0500
Episode 500/2000 | Duration: 200 | Epsilon: 0.0500
Episode 600/2000 | Duration: 200 | Epsilon: 0.0500
Episode 700/2000 | Duration: 200 | Epsilon: 0.0500
Episode 800/2000 | Duration: 200 | Epsilon: 0.0500
Episode 900/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1000/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1100/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1200/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1300/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1400/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1500/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1600/2000 | Duration: 200 | Epsilon: 0.0500
Episode 1700/2000 | Duration: 200 | Epsilon: 0.0500
Episo

  logger.warn(


Video saved to ./videos/MountainCar-v0_DDQN_DDQN_Low_LR_Low_Mem


0,1
avg_loss,█▃▃▄▇▅▂▇▁▃▂▁▁▄▂▁▂▇▁▁▂▂▂▅▂▂▂▁▁▁▂▁▁▂▁▁▁▁▁▃
duration,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇█████
epsilon,█▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_avg_duration,▁
test_std_duration,▁

0,1
avg_loss,0.0
duration,200.0
episode,1999.0
epsilon,0.05
test_avg_duration,200.0
test_std_duration,0.0


--- All experiments complete. Check Weights & Biases dashboard! ---


##Pendoluim

In [3]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import math
from collections import deque, namedtuple
import time
import wandb

# Ensure the environment can be rendered (for video recording)
import os
os.environ['PYGAME_HIDE_SUPPORT_PROMPT'] = '1'


#################################################
#  1. Discretization Wrapper
#################################################

class DiscretizeActionWrapper(gym.ActionWrapper):
    """
    Wraps the Pendulum-v1 environment to discretize the continuous action space.
    """
    def __init__(self, env, n_actions):
        super().__init__(env)
        self.n_actions = n_actions
        # Create a discrete action space
        self.action_space = gym.spaces.Discrete(n_actions)
        # Map discrete actions (0, 1, ..., n_actions-1) to continuous values
        self.continuous_actions = np.linspace(
            env.action_space.low[0],
            env.action_space.high[0],
            n_actions
        )

    def action(self, action):
        # Map the discrete action index back to a continuous torque value
        return [self.continuous_actions[action]]

#################################################
#  2. Replay Buffer
#################################################

# Use namedtuple for a more readable transition structure
Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))

class ReplayBuffer:
    """A fixed-size buffer to store experience tuples."""
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        """Randomly sample a batch of transitions."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

#################################################
#  3. Q-Network (MLP Model)
#################################################

class QNetwork(nn.Module):
    """MLP model for Q-value approximation."""
    def __init__(self, state_dim, n_actions):
        super(QNetwork, self).__init__()
        self.layer1 = nn.Linear(state_dim, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)

#################################################
#  4. DQN/DDQN Agent
#################################################

class DQNAgent:
    def __init__(self, state_dim, n_actions, config, use_ddqn=False):
        self.state_dim = state_dim
        self.n_actions = n_actions
        self.config = config
        self.use_ddqn = use_ddqn

        self.gamma = config['gamma']
        self.epsilon_start = 1.0
        self.epsilon_end = 0.01
        self.epsilon_decay = config['epsilon_decay']
        self.learning_rate = config['learning_rate']
        self.batch_size = config['batch_size']
        self.target_update_freq = 100 # Update target net every 100 learning steps
        self.steps_done = 0

        # Use GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Initialize policy and target networks
        self.policy_net = QNetwork(state_dim, n_actions).to(self.device)
        self.target_net = QNetwork(state_dim, n_actions).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # Target network is only for evaluation

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)

        # *** Create memory with the capacity from config ***
        self.memory = ReplayBuffer(config['memory_size'])

        # Log model architecture to W&B
        if wandb.run:
             wandb.watch(self.policy_net)

    def select_action(self, state, exploration=True):
        """Selects an action using an epsilon-greedy policy."""
        # Calculate current epsilon
        if exploration:
            epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                      math.exp(-1. * self.steps_done / self.epsilon_decay)
            self.steps_done += 1
        else:
            epsilon = 0.0 # No exploration for testing

        # Epsilon-greedy selection
        if random.random() > epsilon:
            with torch.no_grad():
                state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=self.device, dtype=torch.long)

    def optimize_model(self):
        """Performs one step of optimization on the policy network."""
        if len(self.memory) < self.batch_size:
            return None  # Not enough samples in memory

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                              batch.next_state)), device=self.device, dtype=torch.bool)

        non_final_next_states = torch.cat([torch.tensor(s, dtype=torch.float32, device=self.device).unsqueeze(0)
                                           for s in batch.next_state if s is not None])

        state_batch = torch.cat([torch.tensor(s, dtype=torch.float32, device=self.device).unsqueeze(0) for s in batch.state])
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat([torch.tensor([r], dtype=torch.float32, device=self.device) for r in batch.reward])

        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        next_state_values = torch.zeros(self.batch_size, device=self.device)

        if self.use_ddqn:
            # --- DDQN ---
            best_actions = self.policy_net(non_final_next_states).max(1)[1].unsqueeze(1)
            next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(1, best_actions).squeeze()
        else:
            # --- Standard DQN ---
            next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1) # Gradient clipping
        self.optimizer.step()

        return loss.item()

    def update_target_net(self):
        """Hard update of the target network's weights."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save_model(self, path):
        torch.save(self.policy_net.state_dict(), path)

    def load_model(self, path):
        self.policy_net.load_state_dict(torch.load(path))
        self.target_net.load_state_dict(torch.load(path))


#################################################
#  5. Training and Evaluation Function
#################################################

def train_and_evaluate(config):
    """
    Main function to train and evaluate an agent based on a config dictionary.
    """

    # Initialize W&B
    run = wandb.init(
        project="DQN_DDQN_Pendulum",
        config=config,
        reinit=True # Allows multiple runs in the same script
    )

    # Create environment
    base_env = gym.make("Pendulum-v1")
    env = DiscretizeActionWrapper(base_env, n_actions=config['n_actions'])

    state_dim = env.observation_space.shape[0]
    n_actions = env.action_space.n

    agent = DQNAgent(
        state_dim,
        n_actions,
        config,
        use_ddqn=config['use_ddqn']
    )

    print(f"--- Starting Run: {run.name} ---")
    print(f"Config: {config}")

    total_learn_steps = 0

    # --- Training Phase ---
    for i_episode in range(config['num_episodes']):
        state, _ = env.reset()
        episode_reward = 0
        episode_loss = 0
        n_steps = 0

        for t in range(200): # Pendulum-v1 has a fixed 200-step duration
            action = agent.select_action(state, exploration=True)
            next_state, reward, terminated, truncated, _ = env.step(action.item())
            done = terminated or truncated
            episode_reward += reward

            agent.memory.push(state, action, reward, next_state if not done else None, done)

            state = next_state

            loss = agent.optimize_model()
            if loss is not None:
                episode_loss += loss
                n_steps += 1
                total_learn_steps += 1

            if total_learn_steps % agent.target_update_freq == 0:
                agent.update_target_net()

            if done:
                break

        avg_loss = (episode_loss / n_steps) if n_steps > 0 else 0
        current_epsilon = agent.epsilon_end + (agent.epsilon_start - agent.epsilon_end) * \
                          math.exp(-1. * agent.steps_done / agent.epsilon_decay)

        wandb.log({
            "episode": i_episode,
            "total_reward": episode_reward,
            "avg_loss": avg_loss,
            "epsilon": current_epsilon,
            "steps_done": agent.steps_done
        })

        if i_episode % 50 == 0:
            print(f"Episode {i_episode}: Reward = {episode_reward:.2f}, Avg Loss = {avg_loss:.4f}, Epsilon = {current_epsilon:.3f}")

    print("Training complete.")

    # --- Testing & Recording Phase ---
    print("Starting testing and video recording...")

    video_dir = f"./videos/{run.name}"
    test_env = DiscretizeActionWrapper(
        gym.make("Pendulum-v1", render_mode="rgb_array"),
        n_actions=config['n_actions']
    )
    test_env = RecordVideo(test_env, video_dir, episode_trigger=lambda e: e < 3)

    test_rewards = []
    for i_test in range(100):
        state, _ = test_env.reset()
        episode_reward = 0
        done = False
        while not done:
            action = agent.select_action(state, exploration=False)
            next_state, reward, terminated, truncated, _ = test_env.step(action.item())
            done = terminated or truncated
            episode_reward += reward
            state = next_state
        test_rewards.append(episode_reward)

    test_env.close()

    avg_test_reward = np.mean(test_rewards)
    std_test_reward = np.std(test_rewards)

    print(f"Test Results: Avg Reward = {avg_test_reward:.2f} +/- {std_test_reward:.2f}")

    wandb.log({
        "avg_test_reward": avg_test_reward,
        "std_test_reward": std_test_reward,
        "video": wandb.Video(os.path.join(video_dir, "rl-video-episode-0.mp4"), fps=4, format="mp4")
    })

    print(f"--- Run {run.name} Finished ---")
    run.finish()


#################################################
#  6. Main Execution
#################################################

if __name__ == "__main__":

    # Define the hyperparameter setups to test
    # All configurations now use memory_size = 6000

    # Config 1: Standard DQN
    config1 = {
        "run_name": "DQN_fast_decay",
        "use_ddqn": False,
        "num_episodes": 500,
        "n_actions": 5,
        "gamma": 0.99,
        "epsilon_decay": 5000,
        "learning_rate": 0.001,
        "memory_size": 6000, # <-- UPDATED
        "batch_size": 64
    }

    # Config 2: Standard DDQN
    config2 = {
        "run_name": "DDQN_fast_decay",
        "use_ddqn": True,
        "num_episodes": 500,
        "n_actions": 5,
        "gamma": 0.99,
        "epsilon_decay": 5000,
        "learning_rate": 0.001,
        "memory_size": 6000, # <-- UPDATED
        "batch_size": 64
    }

    # Config 3: DDQN with slower decay
    config3 = {
        "run_name": "DDQN_slow_decay",
        "use_ddqn": True,
        "num_episodes": 500,
        "n_actions": 5,
        "gamma": 0.99,
        "epsilon_decay": 20000,
        "learning_rate": 0.0005,
        "memory_size": 6000, # <-- UPDATED
        "batch_size": 128
    }

    # Config 4: DQN with slower decay
    config4 = {
        "run_name": "DQN_slow_decay",
        "use_ddqn": False,
        "num_episodes": 500,
        "n_actions": 5,
        "gamma": 0.99,
        "epsilon_decay": 20000,
        "learning_rate": 0.0005,
        "memory_size": 6000, # <-- UPDATED
        "batch_size": 128
    }

    all_configs = [config1, config2, config3, config4]

    for cfg in all_configs:
        train_and_evaluate(cfg)

    print("All experiments complete.")



--- Starting Run: unique-oath-1 ---
Config: {'run_name': 'DQN_fast_decay', 'use_ddqn': False, 'num_episodes': 500, 'n_actions': 5, 'gamma': 0.99, 'epsilon_decay': 5000, 'learning_rate': 0.001, 'memory_size': 6000, 'batch_size': 64}
Episode 0: Reward = -1304.40, Avg Loss = 1.5544, Epsilon = 0.961
Episode 50: Reward = -122.15, Avg Loss = 0.9430, Epsilon = 0.139
Episode 100: Reward = -127.98, Avg Loss = 0.2349, Epsilon = 0.027
Episode 150: Reward = -2.98, Avg Loss = 0.2917, Epsilon = 0.012
Episode 200: Reward = -242.20, Avg Loss = 0.2526, Epsilon = 0.010
Episode 250: Reward = -121.84, Avg Loss = 0.3145, Epsilon = 0.010
Episode 300: Reward = -126.20, Avg Loss = 0.2226, Epsilon = 0.010
Episode 350: Reward = -130.32, Avg Loss = 0.2190, Epsilon = 0.010
Episode 400: Reward = -122.15, Avg Loss = 0.3693, Epsilon = 0.010
Episode 450: Reward = -245.49, Avg Loss = 0.1958, Epsilon = 0.010
Training complete.
Starting testing and video recording...


  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"


Test Results: Avg Reward = -136.44 +/- 89.88
--- Run unique-oath-1 Finished ---


0,1
avg_loss,▃██▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁
avg_test_reward,▁
episode,▁▁▁▁▁▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇█████
epsilon,█▇▄▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
std_test_reward,▁
steps_done,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇████
total_reward,▁▆▇▇▇▇▅▇▅▄▇▇▇▇█▇▄▇▇▅▄▅▅██▅▇▇▇▅▇█▅▇▇█▇▇▇▇

0,1
avg_loss,0.24922
avg_test_reward,-136.44267
episode,499.0
epsilon,0.01
std_test_reward,89.87898
steps_done,100000.0
total_reward,-120.36094


--- Starting Run: splendid-oath-2 ---
Config: {'run_name': 'DDQN_fast_decay', 'use_ddqn': True, 'num_episodes': 500, 'n_actions': 5, 'gamma': 0.99, 'epsilon_decay': 5000, 'learning_rate': 0.001, 'memory_size': 6000, 'batch_size': 64}
Episode 0: Reward = -1700.73, Avg Loss = 2.9056, Epsilon = 0.961
Episode 50: Reward = -120.47, Avg Loss = 1.0065, Epsilon = 0.139
Episode 100: Reward = -246.38, Avg Loss = 0.2331, Epsilon = 0.027
Episode 150: Reward = -15.74, Avg Loss = 0.2742, Epsilon = 0.012
Episode 200: Reward = -8.26, Avg Loss = 0.2285, Epsilon = 0.010
Episode 250: Reward = -232.17, Avg Loss = 0.2603, Epsilon = 0.010
Episode 300: Reward = -132.79, Avg Loss = 0.2541, Epsilon = 0.010
Episode 350: Reward = -129.53, Avg Loss = 0.2486, Epsilon = 0.010
Episode 400: Reward = -242.29, Avg Loss = 0.1933, Epsilon = 0.010
Episode 450: Reward = -5.46, Avg Loss = 0.2798, Epsilon = 0.010
Training complete.
Starting testing and video recording...




Test Results: Avg Reward = -137.01 +/- 86.15
--- Run splendid-oath-2 Finished ---


0,1
avg_loss,██▃▂▂▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁
avg_test_reward,▁
episode,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
epsilon,██▆▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
std_test_reward,▁
steps_done,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
total_reward,▁▇▆▆▇█▇▇█▇▇▇▇█▇▇██▇█▇▇▇█▇▇▇█▇▇▆▇▇▆▇▇▇▇▇▇

0,1
avg_loss,0.32766
avg_test_reward,-137.01323
episode,499.0
epsilon,0.01
std_test_reward,86.14521
steps_done,100000.0
total_reward,-239.59003


--- Starting Run: worthy-snowflake-3 ---
Config: {'run_name': 'DDQN_slow_decay', 'use_ddqn': True, 'num_episodes': 500, 'n_actions': 5, 'gamma': 0.99, 'epsilon_decay': 20000, 'learning_rate': 0.0005, 'memory_size': 6000, 'batch_size': 128}
Episode 0: Reward = -919.29, Avg Loss = 1.8942, Epsilon = 0.990
Episode 50: Reward = -625.31, Avg Loss = 2.7915, Epsilon = 0.604
Episode 100: Reward = -253.81, Avg Loss = 1.2355, Epsilon = 0.371
Episode 150: Reward = -597.83, Avg Loss = 1.0375, Epsilon = 0.229
Episode 200: Reward = -242.47, Avg Loss = 0.6506, Epsilon = 0.143
Episode 250: Reward = -128.42, Avg Loss = 0.3756, Epsilon = 0.090
Episode 300: Reward = -5.17, Avg Loss = 0.4511, Epsilon = 0.059
Episode 350: Reward = -246.33, Avg Loss = 0.2286, Epsilon = 0.040
Episode 400: Reward = -10.62, Avg Loss = 0.2865, Epsilon = 0.028
Episode 450: Reward = -121.10, Avg Loss = 0.2008, Epsilon = 0.021
Training complete.
Starting testing and video recording...




Test Results: Avg Reward = -147.79 +/- 80.19
--- Run worthy-snowflake-3 Finished ---


0,1
avg_loss,▄▅▅▆██▇▆▆▆▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁
avg_test_reward,▁
episode,▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇██
epsilon,█▇▅▄▄▄▄▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
std_test_reward,▁
steps_done,▁▁▁▁▂▂▂▂▂▂▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
total_reward,▃▄▁▃▃▆▅▇▆▅▅▆▇█▆▇▇▇▆▇█▇▇▇▇▆██▇▇▇▆▇▇██▇▇▇▆

0,1
avg_loss,0.20418
avg_test_reward,-147.78772
episode,499.0
epsilon,0.01667
std_test_reward,80.18873
steps_done,100000.0
total_reward,-129.61092


--- Starting Run: lilac-shape-4 ---
Config: {'run_name': 'DQN_slow_decay', 'use_ddqn': False, 'num_episodes': 500, 'n_actions': 5, 'gamma': 0.99, 'epsilon_decay': 20000, 'learning_rate': 0.0005, 'memory_size': 6000, 'batch_size': 128}
Episode 0: Reward = -1085.51, Avg Loss = 2.2274, Epsilon = 0.990
Episode 50: Reward = -623.49, Avg Loss = 2.5838, Epsilon = 0.604
Episode 100: Reward = -357.75, Avg Loss = 1.2742, Epsilon = 0.371
Episode 150: Reward = -380.91, Avg Loss = 0.8445, Epsilon = 0.229
Episode 200: Reward = -250.83, Avg Loss = 0.7239, Epsilon = 0.143
Episode 250: Reward = -121.21, Avg Loss = 0.4615, Epsilon = 0.090
Episode 300: Reward = -246.68, Avg Loss = 0.2476, Epsilon = 0.059
Episode 350: Reward = -119.66, Avg Loss = 0.1587, Epsilon = 0.040
Episode 400: Reward = -230.01, Avg Loss = 0.2787, Epsilon = 0.028
Episode 450: Reward = -116.77, Avg Loss = 0.1353, Epsilon = 0.021
Training complete.
Starting testing and video recording...




Test Results: Avg Reward = -158.59 +/- 82.32
--- Run lilac-shape-4 Finished ---


0,1
avg_loss,▇███▄▄▄▄▄▄▃▃▄▄▃▃▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
avg_test_reward,▁
episode,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇█████
epsilon,█▇▅▅▄▄▄▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
std_test_reward,▁
steps_done,▁▁▁▁▁▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
total_reward,▁▄▄▃▄▅██▇▇▇▇▇▇▆▇▇█▇█▇▇███████▇▇▇██████▇█

0,1
avg_loss,0.17139
avg_test_reward,-158.58513
episode,499.0
epsilon,0.01667
std_test_reward,82.32308
steps_done,100000.0
total_reward,-127.1124


All experiments complete.


## Mountain car

In [None]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import math
from collections import deque, namedtuple
import time
import wandb
import os

# Ensure the environment can be rendered (for video recording)
os.environ['PYGAME_HIDE_SUPPORT_PROMPT'] = '1'

#################################################
#  1. Replay Buffer
#################################################

# Use namedtuple for a more readable transition structure
Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))

class ReplayBuffer:
    """A fixed-size buffer to store experience tuples."""
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        """Randomly sample a batch of transitions."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

#################################################
#  2. Q-Network (MLP Model)
#################################################

class QNetwork(nn.Module):
    """MLP model for Q-value approximation."""
    def __init__(self, state_dim, n_actions):
        super(QNetwork, self).__init__()
        self.layer1 = nn.Linear(state_dim, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)

#################################################
#  3. DQN/DDQN Agent
#################################################

class DQNAgent:
    def __init__(self, state_dim, n_actions, config, use_ddqn=False):
        self.state_dim = state_dim
        self.n_actions = n_actions
        self.config = config
        self.use_ddqn = use_ddqn

        self.gamma = config['gamma']
        self.epsilon_start = 1.0
        self.epsilon_end = 0.01
        self.epsilon_decay = config['epsilon_decay']
        self.learning_rate = config['learning_rate']
        self.batch_size = config['batch_size']
        self.target_update_freq = 100 # Update target net every 100 learning steps
        self.steps_done = 0

        # Use GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Initialize policy and target networks
        self.policy_net = QNetwork(state_dim, n_actions).to(self.device)
        self.target_net = QNetwork(state_dim, n_actions).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # Target network is only for evaluation

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
        self.memory = ReplayBuffer(config['memory_size'])

        # Log model architecture to W&B
        if wandb.run:
             wandb.watch(self.policy_net)

    def select_action(self, state, exploration=True):
        """Selects an action using an epsilon-greedy policy."""
        # Calculate current epsilon
        if exploration:
            epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                      math.exp(-1. * self.steps_done / self.epsilon_decay)
            self.steps_done += 1
        else:
            epsilon = 0.0 # No exploration for testing

        # Epsilon-greedy selection
        if random.random() > epsilon:
            with torch.no_grad():
                state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
                # t.max(1) returns (values, indices)
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=self.device, dtype=torch.long)

    def optimize_model(self):
        """Performs one step of optimization on the policy network."""
        if len(self.memory) < self.batch_size:
            return None  # Not enough samples in memory

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                              batch.next_state)), device=self.device, dtype=torch.bool)

        # We must check if there are any non-final states
        non_final_next_states_list = [torch.tensor(s, dtype=torch.float32, device=self.device).unsqueeze(0)
                                     for s in batch.next_state if s is not None]

        state_batch = torch.cat([torch.tensor(s, dtype=torch.float32, device=self.device).unsqueeze(0) for s in batch.state])
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat([torch.tensor([r], dtype=torch.float32, device=self.device) for r in batch.reward])

        # Compute Q(s_t, a)
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_state_values = torch.zeros(self.batch_size, device=self.device)

        # Only compute next_state_values if there are non-final states
        if len(non_final_next_states_list) > 0:
            non_final_next_states = torch.cat(non_final_next_states_list)

            if self.use_ddqn:
                # --- DDQN ---
                # 1. Select best action using policy_net
                best_actions = self.policy_net(non_final_next_states).max(1)[1].unsqueeze(1)
                # 2. Evaluate that action using target_net
                next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(1, best_actions).squeeze().detach()
            else:
                # --- Standard DQN ---
                next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()

        # Compute the expected Q values (Bellman equation)
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        # Compute loss (Smooth L1 Loss)
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 100) # Gradient clipping
        self.optimizer.step()

        return loss.item()

    def update_target_net(self):
        """Hard update of the target network's weights."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

#################################################
#  4. Training and Evaluation Function
#################################################

def train_and_evaluate(config):
    """
    Main function to train and evaluate an agent based on a config dictionary.
    """

    # Initialize W&B
    run = wandb.init(
        project="DQN_DDQN_MountainCar", # New project name
        config=config,
        reinit=True # Allows multiple runs in the same script
    )

    # Create environment
    env = gym.make("MountainCar-v0")

    state_dim = env.observation_space.shape[0]
    n_actions = env.action_space.n

    agent = DQNAgent(
        state_dim,
        n_actions,
        config,
        use_ddqn=config['use_ddqn']
    )

    print(f"--- Starting Run: {run.name} ---")
    print(f"Config: {config}")

    total_learn_steps = 0

    # --- Training Phase ---
    for i_episode in range(config['num_episodes']):
        state, _ = env.reset()
        episode_reward = 0
        episode_loss = 0
        n_steps = 0

        # MountainCar-v0 has a fixed 200-step duration
        for t in range(200):
            action = agent.select_action(state, exploration=True)
            next_state, reward, terminated, truncated, _ = env.step(action.item())
            done = terminated or truncated
            episode_reward += reward

            # Store transition (next_state is None if episode ended)
            agent.memory.push(state, action, reward, next_state if not done else None, done)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization
            loss = agent.optimize_model()
            if loss is not None:
                episode_loss += loss
                n_steps += 1
                total_learn_steps += 1

            # Update target network
            if total_learn_steps % agent.target_update_freq == 0:
                agent.update_target_net()

            if done:
                break

        avg_loss = (episode_loss / n_steps) if n_steps > 0 else 0
        current_epsilon = agent.epsilon_end + (agent.epsilon_start - agent.epsilon_end) * \
                          math.exp(-1. * agent.steps_done / agent.epsilon_decay)

        # Log metrics to W&B
        wandb.log({
            "episode": i_episode,
            "total_reward": episode_reward, # Will be e.g., -150
            "episode_duration": t + 1,      # Will be e.g., 150
            "avg_loss": avg_loss,
            "epsilon": current_epsilon,
            "steps_done": agent.steps_done
        })

        if i_episode % 100 == 0:
            print(f"Episode {i_episode}: Duration = {t+1}, Reward = {episode_reward:.2f}, Avg Loss = {avg_loss:.4f}")

    print("Training complete.")

    # --- Testing & Recording Phase ---
    print("Starting testing and video recording...")

    video_dir = f"./videos/{run.name}"
    test_env = gym.make("MountainCar-v0", render_mode="rgb_array")
    # Record only the first 3 test episodes
    test_env = RecordVideo(test_env, video_dir, episode_trigger=lambda e: e < 3)

    test_rewards = []
    test_durations = []
    for i_test in range(100):
        state, _ = test_env.reset()
        episode_reward = 0
        episode_duration = 0
        done = False
        while not done:
            # Select action greedily (no exploration)
            action = agent.select_action(state, exploration=False)
            next_state, reward, terminated, truncated, _ = test_env.step(action.item())
            done = terminated or truncated
            episode_reward += reward
            episode_duration += 1
            state = next_state
        test_rewards.append(episode_reward)
        test_durations.append(episode_duration)

    test_env.close() # Important to save the video

    avg_test_reward = np.mean(test_rewards)
    avg_test_duration = np.mean(test_durations)

    print(f"Test Results: Avg Duration = {avg_test_duration:.2f}, Avg Reward = {avg_test_reward:.2f}")

    # Log test results and video to W&B
    wandb.log({
        "avg_test_reward": avg_test_reward,
        "avg_test_duration": avg_test_duration,
        # Log the first recorded video
        "video": wandb.Video(os.path.join(video_dir, "rl-video-episode-0.mp4"), fps=30, format="mp4")
    })

    print(f"--- Run {run.name} Finished ---")
    run.finish()


#################################################
#  5. Main Execution
#################################################

if __name__ == "__main__":

    # Define the hyperparameter setups to test
    # We will test 4 different configurations

    # Config 1: Standard DQN
    config1 = {
        "run_name": "DQN_fast_decay_small_mem",
        "use_ddqn": False,
        "num_episodes": 1000,
        "gamma": 0.99,
        "epsilon_decay": 5000,
        "learning_rate": 0.001,
        "memory_size": 10000,
        "batch_size": 64
    }

    # Config 2: Standard DDQN (compare to Config 1)
    config2 = {
        "run_name": "DDQN_fast_decay_small_mem",
        "use_ddqn": True,
        "num_episodes": 1000,
        "gamma": 0.99,
        "epsilon_decay": 5000,
        "learning_rate": 0.001,
        "memory_size": 10000,
        "batch_size": 64
    }

    # Config 3: DDQN with slower decay and larger memory
    config3 = {
        "run_name": "DDQN_slow_decay_large_mem",
        "use_ddqn": True,
        "num_episodes": 1000,
        "gamma": 0.98, # Different gamma
        "epsilon_decay": 20000, # Slower decay
        "learning_rate": 0.0005, # Lower LR
        "memory_size": 50000, # Larger memory
        "batch_size": 128 # Larger batch
    }

    # Config 4: DQN with parameters from Config 3
    config4 = {
        "run_name": "DQN_slow_decay_large_mem",
        "use_ddqn": False,
        "num_episodes": 1000,
        "gamma": 0.98,
        "epsilon_decay": 20000,
        "learning_rate": 0.0005,
        "memory_size": 50000,
        "batch_size": 128
    }

    # List of all configurations to run
    all_configs = [config1, config2, config3, config4]

    # Run all experiments
    for cfg in all_configs:
        train_and_evaluate(cfg)

    print("All experiments complete.")

--- Starting Run: solar-donkey-1 ---
Config: {'run_name': 'DQN_fast_decay_small_mem', 'use_ddqn': False, 'num_episodes': 1000, 'gamma': 0.99, 'epsilon_decay': 5000, 'learning_rate': 0.001, 'memory_size': 10000, 'batch_size': 64}
Episode 0: Duration = 200, Reward = -200.00, Avg Loss = 0.0485
Episode 100: Duration = 200, Reward = -200.00, Avg Loss = 0.4394
Episode 200: Duration = 200, Reward = -200.00, Avg Loss = 0.5902
Episode 300: Duration = 153, Reward = -153.00, Avg Loss = 0.4309
Episode 400: Duration = 150, Reward = -150.00, Avg Loss = 5.1956
Episode 500: Duration = 200, Reward = -200.00, Avg Loss = 59.3080
Episode 600: Duration = 200, Reward = -200.00, Avg Loss = 35.3621
Episode 700: Duration = 200, Reward = -200.00, Avg Loss = 30.0643
Episode 800: Duration = 200, Reward = -200.00, Avg Loss = 11.2681
Episode 900: Duration = 153, Reward = -153.00, Avg Loss = 11.4898
Training complete.
Starting testing and video recording...




Test Results: Avg Duration = 156.74, Avg Reward = -156.74
--- Run solar-donkey-1 Finished ---


0,1
avg_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆█▇▅▃▂▂▂▅▄▂▂▂▂▂▂▂▂▃▆
avg_test_duration,▁
avg_test_reward,▁
episode,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
episode_duration,███████████████▅▂▁▅▅█▅█████████████████▇
epsilon,█▆▅▅▅▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
steps_done,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇█████
total_reward,▁▁▁▁▁▁▁▁▁▁▁▁▃▁█▁▁▁▄▄▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▄▃▄▃

0,1
avg_loss,61.4545
avg_test_duration,156.74
avg_test_reward,-156.74
episode,999.0
episode_duration,150.0
epsilon,0.01
steps_done,188100.0
total_reward,-150.0


--- Starting Run: sunny-durian-2 ---
Config: {'run_name': 'DDQN_fast_decay_small_mem', 'use_ddqn': True, 'num_episodes': 1000, 'gamma': 0.99, 'epsilon_decay': 5000, 'learning_rate': 0.001, 'memory_size': 10000, 'batch_size': 64}
Episode 0: Duration = 200, Reward = -200.00, Avg Loss = 0.0474
Episode 100: Duration = 200, Reward = -200.00, Avg Loss = 0.4371
Episode 200: Duration = 200, Reward = -200.00, Avg Loss = 0.3684
Episode 300: Duration = 200, Reward = -200.00, Avg Loss = 0.4343
Episode 400: Duration = 200, Reward = -200.00, Avg Loss = 0.4157
Episode 500: Duration = 200, Reward = -200.00, Avg Loss = 0.2366
Episode 600: Duration = 109, Reward = -109.00, Avg Loss = 0.0808
Episode 700: Duration = 107, Reward = -107.00, Avg Loss = 0.1074
Episode 800: Duration = 91, Reward = -91.00, Avg Loss = 0.1333
Episode 900: Duration = 93, Reward = -93.00, Avg Loss = 0.1042
Training complete.
Starting testing and video recording...




Test Results: Avg Duration = 119.30, Avg Reward = -119.30
--- Run sunny-durian-2 Finished ---


0,1
avg_loss,▁▁▃▄▄▄▅▆▆▅▆█▅▆▅▆▄▅▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
avg_test_duration,▁
avg_test_reward,▁
episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███
episode_duration,████████████████▃▇█▃▂█▄▂▂▄▂▂▂▂▆▂▅▁▂▅▁▅▅▂
epsilon,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
steps_done,▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇███
total_reward,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄█▅▁▁▅▄▆▄▆▇▇▇▇█▇▇▄▅▅▇▅▁█▄█

0,1
avg_loss,0.1496
avg_test_duration,119.3
avg_test_reward,-119.3
episode,999.0
episode_duration,114.0
epsilon,0.01
steps_done,152897.0
total_reward,-114.0


--- Starting Run: rich-tree-3 ---
Config: {'run_name': 'DDQN_slow_decay_large_mem', 'use_ddqn': True, 'num_episodes': 1000, 'gamma': 0.98, 'epsilon_decay': 20000, 'learning_rate': 0.0005, 'memory_size': 50000, 'batch_size': 128}
Episode 0: Duration = 200, Reward = -200.00, Avg Loss = 0.0732
