# Independent DQN - Heavy Weight Lunar Lander (Colab)

**Task:** Heavy Weight Lunar Lander (gravity_multiplier=1.5)

**Method:** Independent DQN (Phase 3 - Multi-Task Baseline)

This notebook is self-contained and designed to run on Google Colab with GPU acceleration.

In [None]:
# Cell 1: Install Dependencies
!pip install -q gymnasium[box2d]
!apt-get install -y swig > /dev/null 2>&1
!pip install -q box2d-py

print("‚úì Dependencies installed successfully!")

In [None]:
# Cell 2: Imports & GPU Configuration
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
import matplotlib.pyplot as plt
import time
import json
import pickle
import os
from collections import deque
import random
from typing import Tuple, Dict, List
import gymnasium as gym

# Device selection: use CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Device: {device}")

if device == 'cuda':
    # Report which accelerator and CUDA build PyTorch picked up.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"‚úì GPU acceleration enabled!")
else:
    print("‚ö†Ô∏è  Running on CPU (will be slower)")

In [None]:
# Cell 3: ReplayBuffer Class (Embedded)
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions used for experience replay."""

    def __init__(self, capacity: int):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of transitions kept; once full, each new
                transition pushes out the oldest one.
        """
        self.capacity = capacity
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size: int):
        """Draw ``batch_size`` transitions at random, without replacement.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
            arrays, each built from the matching field of the drawn transitions.
        """
        picked = random.sample(self.buffer, batch_size)

        # Build one array per transition field, in storage order.
        states, actions, rewards, next_states, dones = (
            np.array([transition[field] for transition in picked])
            for field in range(5)
        )
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

    def __repr__(self):
        size = len(self.buffer)
        return f"ReplayBuffer(capacity={self.capacity}, size={size})"

print("‚úì ReplayBuffer class defined")

In [None]:
# Cell 4: QNetwork & DQNAgent Classes (Embedded)
class QNetwork(nn.Module):
    """Fully connected network that maps a state to one Q-value per action."""

    def __init__(self, state_dim: int, action_dim: int):
        """Build the three linear layers.

        Args:
            state_dim: Size of the state vector fed to the first layer
            action_dim: Number of Q-values produced by the last layer
        """
        super().__init__()

        # The attribute names become the state_dict keys written by checkpoints.
        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, action_dim)

    def forward(self, x):
        """Return Q-values: two ReLU hidden layers followed by a linear head."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)


class DQNAgent:
    """DQN agent with an online Q-network, a target network and epsilon-greedy exploration.

    The agent does not decide *when* epsilon decays or when the target network
    is synchronised; the caller drives both through ``decay_epsilon`` and
    ``update_target_network``. ``target_update_freq`` is only stored here so
    the caller can read it back.
    """

    def __init__(
        self,
        state_dim: int,
        action_dim: int,
        learning_rate: float = 5e-4,
        gamma: float = 0.99,
        epsilon_start: float = 1.0,
        epsilon_end: float = 0.01,
        epsilon_decay: float = 0.995,
        target_update_freq: int = 10,
        device: str = 'cpu'
    ):
        """Initialize DQN Agent.

        Args:
            state_dim: Size of the state vector fed to the Q-network
            action_dim: Number of discrete actions
            learning_rate: Adam learning rate for the online network
            gamma: Discount factor applied to the bootstrapped next-state value
            epsilon_start: Initial exploration probability
            epsilon_end: Lower bound that epsilon decays towards
            epsilon_decay: Multiplicative factor applied by ``decay_epsilon``
            target_update_freq: Stored for the caller; not used internally
            device: Torch device string the networks and batches are moved to
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.target_update_freq = target_update_freq
        self.device = device

        # Networks: the target network starts as an exact copy of the online one.
        self.q_network = QNetwork(state_dim, action_dim).to(device)
        self.target_network = QNetwork(state_dim, action_dim).to(device)
        self.target_network.load_state_dict(self.q_network.state_dict())

        # Optimizer and loss (only the online network is optimised)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
        self.loss_fn = nn.MSELoss()

        # Training stats: episode counter, incremented by the caller
        self.episodes = 0

    def select_action(self, state, epsilon=None):
        """Select an action using an epsilon-greedy policy.

        Args:
            state: Single state as an array-like of length ``state_dim``
            epsilon: Exploration probability for this call; defaults to the
                agent's current ``self.epsilon``. Pass 0.0 for a greedy action.

        Returns:
            The chosen action index as an int.
        """
        if epsilon is None:
            epsilon = self.epsilon

        if np.random.random() < epsilon:
            return np.random.randint(self.action_dim)

        with torch.no_grad():
            # Explicit dtype keeps this independent of the input's NumPy dtype.
            state_tensor = torch.as_tensor(
                state, dtype=torch.float32, device=self.device
            ).unsqueeze(0)
            q_values = self.q_network(state_tensor)
            return q_values.argmax().item()

    def update(self, batch):
        """Run one gradient step on the online Q-network.

        Args:
            batch: Tuple ``(states, actions, rewards, next_states, dones)`` of
                array-likes with the batch along the first axis. ``dones`` may
                be boolean; it is cast to float and masks the bootstrap term.

        Returns:
            The scalar loss for this batch as a Python float.
        """
        states, actions, rewards, next_states, dones = batch

        # Convert to tensors with explicit dtypes (bool/float64 inputs are cast).
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.long, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(next_states, dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        # Current Q values of the actions that were actually taken
        current_q = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Target Q values: reward plus discounted max target-network value,
        # with the bootstrap term zeroed where done == 1.
        with torch.no_grad():
            next_q = self.target_network(next_states).max(dim=1).values
            target_q = rewards + (1 - dones) * self.gamma * next_q

        # Compute loss and update
        loss = self.loss_fn(current_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def decay_epsilon(self):
        """Multiply epsilon by ``epsilon_decay``, never going below ``epsilon_end``."""
        self.epsilon = max(self.epsilon_end, self.epsilon * self.epsilon_decay)

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.q_network.state_dict())

    def save(self, path):
        """Save both networks, the optimizer state, epsilon and the episode counter.

        Args:
            path: Destination passed straight to ``torch.save``
        """
        torch.save({
            'q_network': self.q_network.state_dict(),
            'target_network': self.target_network.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'epsilon': self.epsilon,
            'episodes': self.episodes
        }, path)

    def load(self, path):
        """Restore the state written by ``save``.

        Args:
            path: Source passed straight to ``torch.load``; tensors are mapped
                onto ``self.device``.
        """
        checkpoint = torch.load(path, map_location=self.device)
        self.q_network.load_state_dict(checkpoint['q_network'])
        self.target_network.load_state_dict(checkpoint['target_network'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.epsilon = checkpoint['epsilon']
        self.episodes = checkpoint['episodes']

    def __repr__(self):
        # The counter tracks episodes, so it is labelled as such.
        return f"DQNAgent(state_dim={self.state_dim}, action_dim={self.action_dim}, epsilon={self.epsilon:.4f}, episodes={self.episodes})"

print("‚úì QNetwork and DQNAgent classes defined")

In [None]:
# Cell 5: Environment Variants (Embedded)
from gymnasium.envs.box2d.lunar_lander import LunarLander

class StandardLunarLander(LunarLander):
    """Unmodified Lunar Lander, tagged with a task name for reporting."""

    def __init__(self, **kwargs):
        """Forward every keyword argument to ``LunarLander`` and label the task."""
        super(StandardLunarLander, self).__init__(**kwargs)
        self.task_name = "Standard"


class WindyLunarLander(LunarLander):
    """Lunar Lander variant that pushes the lander sideways with a random force each step."""

    def __init__(self, wind_power=20.0, **kwargs):
        """Create the environment.

        Args:
            wind_power: Bound of the uniform range the horizontal force is drawn from
            **kwargs: Forwarded unchanged to ``LunarLander``
        """
        super(WindyLunarLander, self).__init__(**kwargs)
        self.task_name = "Windy"
        self.wind_power = wind_power

    def step(self, action):
        """Apply a random horizontal force to the lander, then run the base step."""
        # The draw happens on every call, whether or not a lander body exists yet.
        gust = np.random.uniform(-self.wind_power, self.wind_power)
        lander = self.lander
        if lander is not None:
            lander.ApplyForceToCenter((gust, 0.0), True)

        return super(WindyLunarLander, self).step(action)


class HeavyWeightLunarLander(LunarLander):
    """Heavy weight variant with increased gravity.

    Gravity is ``-10.0 * gravity_multiplier``. It is re-applied around every
    ``reset`` rather than only once in ``__init__``.

    NOTE(review): this assumes that some Gymnasium releases build a fresh
    Box2D world inside ``reset()`` from ``self.gravity``, which would discard
    a gravity value set only on the world created in ``__init__``. Confirm
    against the installed Gymnasium version. The value is not passed through
    the base constructor's ``gravity`` argument because the base class is
    assumed to reject values outside its own allowed range.
    """

    def __init__(self, gravity_multiplier=1.5, **kwargs):
        """Create the environment.

        Args:
            gravity_multiplier: Factor applied to the base gravity of -10.0
            **kwargs: Forwarded unchanged to ``LunarLander``
        """
        super().__init__(**kwargs)
        self.gravity_multiplier = gravity_multiplier
        self.task_name = "Heavy Weight"
        # Increase gravity
        self._apply_heavy_gravity()

    def _apply_heavy_gravity(self):
        """Write the scaled gravity to the env attribute and to the live world."""
        heavy_gravity = -10.0 * self.gravity_multiplier
        # Presumably read by the base class when it rebuilds the world on reset.
        self.gravity = heavy_gravity
        world = getattr(self, 'world', None)
        if world is not None:
            world.gravity = (0, heavy_gravity)

    def reset(self, *, seed=None, options=None):
        """Reset the environment while keeping the increased gravity in force."""
        # Before: a world rebuilt from self.gravity picks up the heavy value.
        self._apply_heavy_gravity()
        result = super().reset(seed=seed, options=options)
        # After: covers a world that was rebuilt with some other gravity.
        self._apply_heavy_gravity()
        return result


def make_env(task: str, render_mode=None):
    """Build the Lunar Lander variant named by ``task``.

    Args:
        task: One of 'standard', 'windy', 'heavy' (case-insensitive)
        render_mode: Rendering mode passed to the environment (None for Colab)

    Returns:
        Environment instance

    Raises:
        ValueError: If ``task`` is not one of the known variants
    """
    task = task.lower()

    # Reject unknown names up front so the dispatch below only sees valid ones.
    if task not in ('standard', 'windy', 'heavy'):
        raise ValueError(f"Unknown task: {task}. Choose from 'standard', 'windy', 'heavy'")

    if task == 'heavy':
        return HeavyWeightLunarLander(render_mode=render_mode)
    if task == 'windy':
        return WindyLunarLander(render_mode=render_mode)
    return StandardLunarLander(render_mode=render_mode)

print("‚úì Environment variants defined (Standard, Windy, Heavy)")

In [None]:
# Cell 6: Output Directory Setup
# Root folder for every artifact this notebook writes (matching local structure)
output_base = '/content/results'
for _subdir in ('logs', 'models', 'plots'):
    os.makedirs(f'{output_base}/{_subdir}', exist_ok=True)

print(f"‚úì Output directories created at: {output_base}")
print(f"  - Logs: {output_base}/logs")
print(f"  - Models: {output_base}/models")
print(f"  - Plots: {output_base}/plots")
print("\nüìÅ Directory structure matches local setup!")
print("   Download /content/results/ after training to merge with local results/")

In [None]:
# Cell 7: Helper Functions
def count_parameters(model):
    """Return ``{'total': ..., 'trainable': ...}`` element counts for a PyTorch model.

    ``trainable`` only includes parameters whose ``requires_grad`` is true.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size
    return {'total': total, 'trainable': trainable}


def save_progress_checkpoint(rewards, losses, eval_rewards, eval_episodes, task_name):
    """Write the training curves collected so far to a JSON progress file.

    The file lives under ``{output_base}/logs`` and is overwritten on each call.

    Args:
        rewards: Per-episode training rewards so far
        losses: Per-episode losses so far
        eval_rewards: Evaluation rewards recorded so far
        eval_episodes: Episode indices at which the evaluations ran
        task_name: Task identifier, used in the file name and stored in the file
    """
    # Match baseline naming convention
    progress_path = f'{output_base}/logs/independent_dqn_{task_name}_progress.json'

    snapshot = dict(
        episode_rewards=rewards,
        episode_losses=losses,
        eval_rewards=eval_rewards,
        eval_episodes=eval_episodes,
        last_episode=len(rewards),
        task=task_name,
        timestamp=time.strftime("%Y%m%d_%H%M%S"),
    )

    with open(progress_path, 'w') as handle:
        json.dump(snapshot, handle, indent=2)

print("‚úì Helper functions defined")

In [None]:
# Cell 8: Hyperparameters
HYPERPARAMS = {
    # Training schedule and replay settings
    'num_episodes': 1000,
    'batch_size': 64,
    'replay_buffer_size': 100000,
    'min_replay_size': 1000,

    # DQN agent settings (forwarded to DQNAgent)
    'learning_rate': 5e-4,
    'gamma': 0.99,
    'epsilon_start': 1.0,
    'epsilon_end': 0.01,
    'epsilon_decay': 0.995,
    'target_update_freq': 10,

    # Evaluation cadence and size
    'eval_freq': 50,
    'eval_episodes': 5,

    # Model checkpoint cadence
    'save_freq': 100,

    # Task - HEAVY WEIGHT VARIANT
    'task': 'heavy',

    # Device chosen in the GPU configuration cell
    'device': device
}

# Echo the configuration as an aligned two-column table.
_hp_rule = "=" * 60
_hp_rows = [f"  {name:<25} {setting}" for name, setting in HYPERPARAMS.items()]
print("\n".join(["Hyperparameters:", _hp_rule, *_hp_rows, _hp_rule]))

In [None]:
# Cell 9: Environment & Agent Initialization
# Build the environment for the configured task (render_mode=None for Colab)
task_name = HYPERPARAMS['task']
env = make_env(task_name, render_mode=None)

# Network input/output sizes come straight from the environment spaces
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

print(f"Environment: {env.task_name} Lunar Lander")
print(f"State dimension: {state_dim}")
print(f"Action dimension: {action_dim}")

# Create DQN agent: agent settings are copied by name from HYPERPARAMS
_agent_settings = {
    name: HYPERPARAMS[name]
    for name in (
        'learning_rate',
        'gamma',
        'epsilon_start',
        'epsilon_end',
        'epsilon_decay',
        'target_update_freq',
    )
}
agent = DQNAgent(
    state_dim=state_dim,
    action_dim=action_dim,
    device=device,
    **_agent_settings
)

# Create replay buffer
replay_buffer = ReplayBuffer(capacity=HYPERPARAMS['replay_buffer_size'])

print(f"\n‚úì Created: {agent}")
print(f"‚úì Created: {replay_buffer}")

# Count parameters of both networks; the total adds the two together
q_network_params, target_network_params = (
    count_parameters(network)
    for network in (agent.q_network, agent.target_network)
)
total_model_params = q_network_params['total'] + target_network_params['total']

_param_rule = "=" * 80
print("\n" + _param_rule)
print("MODEL PARAMETER COUNTS")
print(_param_rule)
print(f"Q-Network:      {q_network_params['total']:,} total, {q_network_params['trainable']:,} trainable")
print(f"Target Network: {target_network_params['total']:,} total, {target_network_params['trainable']:,} trainable")
print(f"Total:          {total_model_params:,} parameters")
print(_param_rule)

In [None]:
# Cell 10: Training Loop
# Training statistics (one entry per training episode / per evaluation round)
episode_rewards = []
episode_losses = []
eval_rewards = []
eval_episodes = []

# Sample Efficiency Metrics
total_env_steps = 0
total_gradient_updates = 0
# Maps a reward level to the first point at which the running average reached
# it; a value stays None until that happens.
performance_thresholds = {50: None, 100: None, 150: None, 200: None}

# Best model tracking
best_eval_reward = -np.inf

# Start training (the timer also covers evaluation and checkpoint writes)
start_time = time.time()

print(f"\nüöÄ Starting training on {task_name.upper()} task...")
print("üí° Progress bar will update below.\n")

# Progress bar
pbar = tqdm(total=HYPERPARAMS['num_episodes'],
            desc=f"Training {task_name}",
            bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')

# Training loop (`episode` is zero-based; messages print `episode + 1`)
for episode in range(HYPERPARAMS['num_episodes']):
    state, info = env.reset()
    episode_reward = 0
    episode_loss = []
    done = False
    truncated = False
    steps = 0

    # Play one episode
    while not (done or truncated):
        # Select action (epsilon-greedy with the agent's current epsilon)
        action = agent.select_action(state)

        # Take action
        next_state, reward, done, truncated, info = env.step(action)

        # Store transition. Only `done` is stored, not `truncated`, so a
        # truncated episode's last transition is not marked as terminal.
        replay_buffer.push(state, action, reward, next_state, done)

        # Train if we have enough experiences (one gradient step per env step)
        if len(replay_buffer) >= HYPERPARAMS['min_replay_size']:
            batch = replay_buffer.sample(HYPERPARAMS['batch_size'])
            loss = agent.update(batch)
            episode_loss.append(loss)
            total_gradient_updates += 1

        episode_reward += reward
        state = next_state
        steps += 1
        total_env_steps += 1

    # Decay epsilon (once per episode, not per step)
    agent.decay_epsilon()
    agent.episodes += 1

    # Update target network. The check uses the zero-based index, so the sync
    # fires after the very first episode and then every target_update_freq episodes.
    if episode % agent.target_update_freq == 0:
        agent.update_target_network()

    # Store statistics
    episode_rewards.append(episode_reward)
    # Loss is recorded as 0 for episodes with no gradient step.
    avg_loss = np.mean(episode_loss) if episode_loss else 0
    episode_losses.append(avg_loss)
    # Average over at most the last 100 episodes (fewer early in training).
    avg_reward_100 = np.mean(episode_rewards[-100:]) if episode_rewards else 0

    # Check thresholds: record the first time the running average reaches each level
    for threshold, first_episode in performance_thresholds.items():
        if first_episode is None and avg_reward_100 >= threshold:
            performance_thresholds[threshold] = {
                'episode': episode + 1,
                'total_steps': total_env_steps,
                'gradient_updates': total_gradient_updates
            }
            pbar.write(f"üéØ Threshold {threshold} reached at episode {episode+1} (steps: {total_env_steps:,})")

    # Update progress bar
    pbar.set_postfix({
        'reward': f'{episode_reward:.1f}',
        'avg_100': f'{avg_reward_100:.1f}',
        'Œµ': f'{agent.epsilon:.3f}',
        'loss': f'{avg_loss:.2f}'
    })
    pbar.update(1)

    # Print summary every 100 episodes
    if (episode + 1) % 100 == 0:
        pbar.write(f"\n[Episode {episode+1:4d}] Reward: {episode_reward:7.2f} | Avg(100): {avg_reward_100:7.2f} | Loss: {avg_loss:6.4f}")
        pbar.write(f"               Steps: {total_env_steps:,} | Updates: {total_gradient_updates:,}")

    # Save progress checkpoint every 50 episodes (one-based count, so this is
    # written one episode before the evaluation below runs)
    if (episode + 1) % 50 == 0:
        save_progress_checkpoint(episode_rewards, episode_losses, eval_rewards, eval_episodes, task_name)

    # Evaluation
    # NOTE(review): this test uses the zero-based index, so evaluation runs
    # after episodes 51, 101, ... and, with the default settings, the policy
    # after the final episode is never evaluated — confirm this is intended.
    if episode % HYPERPARAMS['eval_freq'] == 0 and episode > 0:
        eval_reward_mean = 0
        eval_reward_list = []

        # Evaluation reuses the training `env`; the next training episode
        # starts with its own env.reset().
        for _ in range(HYPERPARAMS['eval_episodes']):
            eval_state, _ = env.reset()
            eval_reward = 0
            eval_done = False
            eval_truncated = False

            while not (eval_done or eval_truncated):
                # epsilon=0.0 makes the action purely greedy
                eval_action = agent.select_action(eval_state, epsilon=0.0)
                eval_state, r, eval_done, eval_truncated, _ = env.step(eval_action)
                eval_reward += r

            eval_reward_list.append(eval_reward)
            eval_reward_mean += eval_reward

        eval_reward_mean /= HYPERPARAMS['eval_episodes']
        eval_reward_std = np.std(eval_reward_list)
        eval_rewards.append(eval_reward_mean)
        # Stored as the zero-based index, while the log line prints episode + 1.
        eval_episodes.append(episode)

        pbar.write("-" * 80)
        # The "best" model file is overwritten only when the eval mean improves.
        if eval_reward_mean > best_eval_reward:
            best_eval_reward = eval_reward_mean
            model_path = f'{output_base}/models/independent_dqn_{task_name}.pth'
            agent.save(model_path)
            pbar.write(f"[EVAL @ Episode {episode+1}] Mean: {eval_reward_mean:7.2f} (¬±{eval_reward_std:5.2f}) ‚≠ê NEW BEST!")
        else:
            pbar.write(f"[EVAL @ Episode {episode+1}] Mean: {eval_reward_mean:7.2f} (¬±{eval_reward_std:5.2f})")
        pbar.write("-" * 80)

    # Checkpoint (same zero-based cadence as evaluation; the file name carries
    # the zero-based index)
    if episode % HYPERPARAMS['save_freq'] == 0 and episode > 0:
        checkpoint_path = f'{output_base}/models/independent_dqn_{task_name}_ep{episode}.pth'
        agent.save(checkpoint_path)
        pbar.write(f"[CHECKPOINT] Saved to {checkpoint_path}")

# Close progress bar
pbar.close()

# Training complete
training_time = time.time() - start_time

print("\n" + "=" * 80)
print(f"‚úÖ Training on {task_name.upper()} complete!")
print("=" * 80)
print(f"Training time: {training_time/60:.2f} minutes ({training_time:.1f} seconds)")
print(f"Best eval reward: {best_eval_reward:.2f}")
print(f"Final avg reward (last 100): {np.mean(episode_rewards[-100:]):.2f}")
print()
print("SAMPLE EFFICIENCY METRICS:")
print(f"  Total environment steps: {total_env_steps:,}")
print(f"  Total gradient updates: {total_gradient_updates:,}")
print(f"  Steps per episode (avg): {total_env_steps / len(episode_rewards):.1f}")
print()
print("PERFORMANCE THRESHOLDS REACHED:")
for threshold in sorted(performance_thresholds.keys()):
    milestone = performance_thresholds[threshold]
    if milestone:
        print(f"  Reward ‚â• {threshold:3d}: Episode {milestone['episode']:4d} | Steps: {milestone['total_steps']:,} | Updates: {milestone['gradient_updates']:,}")
    else:
        print(f"  Reward ‚â• {threshold:3d}: Not reached")
print("=" * 80)

# The environment is closed here, so re-running this cell needs a fresh env.
env.close()

In [None]:
# Cell 11: Save Metrics
# Summary statistics that appear more than once in the metrics record
_mean_reward_all = float(np.mean(episode_rewards))
_mean_reward_last_100 = float(np.mean(episode_rewards[-100:]))

# Assemble the full metrics record for this run
metrics = {
    "method": "Independent DQN",
    "task": task_name,
    "hyperparams": HYPERPARAMS,
    "episode_rewards": episode_rewards,
    "episode_losses": episode_losses,
    "eval_rewards": eval_rewards,
    "eval_episodes": eval_episodes,
    "training_time": training_time,
    "best_eval_reward": best_eval_reward,

    # Model size (the total counts the online and target networks together)
    "parameter_efficiency": {
        "q_network_params": q_network_params['total'],
        "target_network_params": target_network_params['total'],
        "total_params": total_model_params,
        "trainable_params": q_network_params['trainable'],
    },

    # Environment interaction and optimisation effort
    "sample_efficiency": {
        "total_env_steps": total_env_steps,
        "total_gradient_updates": total_gradient_updates,
        "steps_per_episode_avg": total_env_steps / len(episode_rewards),
        "performance_thresholds": performance_thresholds,
    },

    # Reward summaries for this single task
    "conflict_robustness": {
        "average_reward": _mean_reward_all,
        "average_reward_last_100": _mean_reward_last_100,
        "per_task_reward": {
            task_name: _mean_reward_last_100
        }
    }
}

# Write the same record twice: JSON and pickle
_metrics_stem = f'{output_base}/logs/independent_dqn_{task_name}_metrics'
json_path = f'{_metrics_stem}.json'
pickle_path = f'{_metrics_stem}.pkl'

with open(json_path, 'w') as f:
    json.dump(metrics, f, indent=2)

with open(pickle_path, 'wb') as f:
    pickle.dump(metrics, f)

_metrics_rule = "=" * 80
print(_metrics_rule)
print("‚úì Metrics saved successfully!")
print(_metrics_rule)
print(f"  JSON: {json_path}")
print(f"  Pickle: {pickle_path}")
print(_metrics_rule)

In [None]:
# Cell 12: Visualization
# Create visualization: a 2x3 dashboard of training curves and summary stats
fig, axes = plt.subplots(2, 3, figsize=(18, 10))


def _moving_average(values, width):
    """Return ``(x, y)`` for the trailing moving average of ``values``.

    ``y[k]`` is the mean of ``values[k:k + width]``. Each point is placed at
    the index of the LAST episode in its window, so the smoothed curve lines
    up with the raw curve instead of sitting ``width - 1`` episodes early.
    Both results are empty when there are fewer than ``width`` values.
    """
    if len(values) < width:
        return [], []
    averaged = np.convolve(np.asarray(values, dtype=float), np.ones(width) / width, mode='valid')
    return range(width - 1, len(values)), averaged


# Plot 1: Episode Rewards
rewards = episode_rewards
window = 20
smoothed_x, smoothed = _moving_average(rewards, window)

axes[0, 0].plot(rewards, alpha=0.3, color='green', label='Raw')
axes[0, 0].plot(smoothed_x, smoothed, color='green', linewidth=2, label='Smoothed')
axes[0, 0].axhline(200, color='red', linestyle='--', label='Success threshold')
axes[0, 0].set_title(f'{task_name.capitalize()} - Episode Rewards', fontweight='bold')
axes[0, 0].set_xlabel('Episode')
axes[0, 0].set_ylabel('Reward')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Training Loss
losses = episode_losses
smoothed_loss_x, smoothed_loss = _moving_average(losses, window)

axes[0, 1].plot(losses, alpha=0.3, color='red', label='Raw')
axes[0, 1].plot(smoothed_loss_x, smoothed_loss, color='red', linewidth=2, label='Smoothed')
axes[0, 1].set_title('Training Loss', fontweight='bold')
axes[0, 1].set_xlabel('Episode')
axes[0, 1].set_ylabel('Loss')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Plot 3: Evaluation Rewards (left empty when no evaluation was recorded)
if eval_rewards:
    axes[0, 2].plot(eval_episodes, eval_rewards, marker='o', color='blue', linewidth=2, markersize=6)
    axes[0, 2].axhline(200, color='red', linestyle='--', label='Success')
    axes[0, 2].axhline(best_eval_reward, color='gold', linestyle=':', linewidth=2, label=f'Best: {best_eval_reward:.1f}')
    axes[0, 2].set_title('Evaluation Rewards', fontweight='bold')
    axes[0, 2].set_xlabel('Episode')
    axes[0, 2].set_ylabel('Mean Reward')
    axes[0, 2].legend()
    axes[0, 2].grid(True, alpha=0.3)

# Plot 4: Running Average over the last (up to) 100 episodes at each point
running_avg = [np.mean(episode_rewards[max(0, i-99):i+1]) for i in range(len(episode_rewards))]
axes[1, 0].plot(running_avg, color='darkgreen', linewidth=2)
axes[1, 0].axhline(200, color='red', linestyle='--', label='Success')
axes[1, 0].set_title('Running Average (100 episodes)', fontweight='bold')
axes[1, 0].set_xlabel('Episode')
axes[1, 0].set_ylabel('Average Reward')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Plot 5: Success Rate: share of episodes in the trailing window scoring above 200
window_size = 50
success_rate = []
for i in range(len(episode_rewards)):
    start_idx = max(0, i - window_size + 1)
    window_rewards = episode_rewards[start_idx:i+1]
    rate = sum(r > 200 for r in window_rewards) / len(window_rewards) * 100
    success_rate.append(rate)

axes[1, 1].plot(success_rate, color='orange', linewidth=2)
axes[1, 1].set_title(f'Success Rate (window={window_size})', fontweight='bold')
axes[1, 1].set_xlabel('Episode')
axes[1, 1].set_ylabel('Success Rate (%)')
axes[1, 1].set_ylim([0, 105])
axes[1, 1].grid(True, alpha=0.3)

# Plot 6: Summary Stats (text-only panel)
final_avg = np.mean(episode_rewards[-100:])
axes[1, 2].text(0.5, 0.7, "Final Performance\n(Last 100 Episodes)",
                ha='center', va='center', fontsize=14, fontweight='bold')
axes[1, 2].text(0.5, 0.4, f"{final_avg:.1f}",
                ha='center', va='center', fontsize=24, fontweight='bold',
                color='green' if final_avg >= 200 else 'orange')
axes[1, 2].text(0.5, 0.2, f"Parameters: {total_model_params:,}",
                ha='center', va='center', fontsize=12)
axes[1, 2].axis('off')

plt.suptitle(f'Independent DQN - {task_name.capitalize()} Task (Colab)', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(f'{output_base}/plots/independent_dqn_{task_name}.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"‚úì Visualization saved to: {output_base}/plots/independent_dqn_{task_name}.png")