In [1]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo, RecordEpisodeStatistics
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from collections import deque, namedtuple
from torch.distributions import Categorical
import random
import wandb
import os
import itertools
from copy import deepcopy

GLOBAL_SEED = 100 # Use 42 for consistency, or any arbitrary fixed number
def set_seeds(seed_value):
    """Sets a fixed seed for reproducibility across all random components."""
    torch.manual_seed(seed_value)
    np.random.seed(seed_value)
    random.seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seeds(GLOBAL_SEED)

In [2]:
# Define a Transition named tuple for Experience Replay
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))

class ReplayBuffer:
    """A fixed-size buffer to store experience tuples."""
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        """Retrieve a random batch of transitions."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


In [3]:
# --- 1. A2C MODEL ARCHITECTURE ---
# A single network that branches into Actor (policy) and Critic (value) heads.
class A2CNet(nn.Module):
    """
    Neural Network model for the Advantage Actor-Critic (A2C) Agent.
    It takes the state as input and outputs the action probabilities (Actor)
    and the state value estimate (Critic).
    """
    def __init__(self, state_dim, action_dim, config , hidden_size=64):
        super(A2CNet, self).__init__()

        # Shared Feature Extractor
        self.shared_layer = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.ReLU()
        )

        # Actor Head (Policy: pi(a|s)) - Outputs logits for Categorical distribution
        self.actor = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_dim)
        )

        # Critic Head (Value function: V(s)) - Outputs a single state value estimate
        self.critic = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1) # Single output for the value V(s)
        )

    def forward(self, x):
      """Passes the state through the network."""
      shared_features = self.shared_layer(x)
      action_logits = self.actor(shared_features)
      state_value = self.critic(shared_features)
      return action_logits, state_value


In [4]:
# --- 2. A2C AGENT IMPLEMENTATION ---
class A2CAgent:
    """Handles interaction with the environment, storage, and the learning update."""
    def __init__(self, state_dim, action_dim, config):
        # FIX: Store the configuration dictionary for later access (e.g., in evaluate_agent)
        self.config = config 
        
        # Determine the device to use
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Extract hyperparameters
        self.gamma = config['gamma']
        self.lr = config['learning_rate']
        self.c_entropy = config.get('c_entropy', 0.01) # ADDED: Entropy Coefficient (use 0.01 default)
        self.epsilon_decay = config.get('epsilon_decay', 1.0) 
        self.replay_memory_size = config.get('replay_memory_size', 1000) 
        self.batch_size = config.get('batch_size', 64) 

        # Initialize Model and Optimizer
        self.model = A2CNet(state_dim, action_dim, config).to(self.device) 
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        # Lists to store episode trajectory data (A2C is an on-policy algorithm)
        self.log_probs = []
        self.values = []
        self.rewards = []
        self.dones = []
        self.distributions = [] # ADDED: To store distribution objects for entropy calculation

    def choose_action(self, state):
        """Selects an action based on the current policy (Actor)."""
        # Ensure state is a tensor on the correct device
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action_logits, value_tensor = self.model(state_tensor)

        # Create a categorical distribution over action logits
        dist = Categorical(logits=action_logits)
        action = dist.sample()

        # Store data for learning step 
        self.log_probs.append(dist.log_prob(action))
        self.values.append(value_tensor)
        self.distributions.append(dist) # ADDED: Store the distribution object
        
        return action.item()

    def update(self, next_state):
        """Performs the A2C update using the collected trajectory."""
        self.optimizer.zero_grad()

        # Convert lists to tensors, ensuring they are on the correct device
        rewards = torch.FloatTensor(self.rewards).to(self.device)
        log_probs = torch.cat(self.log_probs).to(self.device)
        values = torch.cat(self.values).squeeze().to(self.device)
        
        # 1. Calculate Discounted Future Rewards (R_t)
        # ... (Returns calculated as before) ...
        if next_state is None:
            R = 0
        else:
            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0).to(self.device)
            with torch.no_grad():
                _, R_tensor = self.model(next_state_tensor)
                R = R_tensor.item()

        returns = []
        for r in rewards.flip(dims=(0,)): 
            R = r.item() + self.gamma * R
            returns.insert(0, R)
        
        returns = torch.FloatTensor(returns).to(self.device)

        # 2. Calculate Advantage (A_t)
        advantage = returns - values
        
        # Calculate Entropy (H)
        entropy = torch.cat([d.entropy() for d in self.distributions]).mean()
        
        # 3. Calculate Actor Loss (Policy Loss) - FIXED: Added Entropy Regularization
        # Loss = - Policy_Gradient_Term - Entropy_Term
        actor_loss = -(log_probs * advantage.detach()).mean() - self.c_entropy * entropy

        # 4. Calculate Critic Loss (Value Function Loss)
        critic_loss = nn.MSELoss()(values, returns)

        # 5. Total Loss and Optimization Step
        total_loss = actor_loss + 0.5 * critic_loss 

        total_loss.backward()
        self.optimizer.step()

        # Clear trajectory buffers
        self.log_probs = []
        self.values = []
        self.rewards = []
        self.dones = []
        self.distributions = [] # ADDED: Clear distributions buffer

        return total_loss.item()

    def store_transition(self, reward):
        """Stores the reward for the current step."""
        self.rewards.append(reward)

In [5]:
def train_agent(env, agent, num_episodes, env_name, max_steps=500):
    """The main training loop."""
    print(f"\n--- Starting Training on {env_name} ---")

    for i_episode in range(1, num_episodes + 1):
        seed_arg = GLOBAL_SEED if i_episode == 1 else None

        # Reset the environment at the start of the episode loop
        state, _ = env.reset(seed=seed_arg)

        episode_reward = 0
        loss = 0

        # Loop for a single episode (t: timestep)
        for t in itertools.count():
            action = agent.choose_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            agent.store_transition(reward)
            episode_reward += reward

            if done:
                # Update the network when the episode is finished
                loss = agent.update(next_state if not terminated else None)
                break

            state = next_state

        # Log episode results to Wandb
        wandb.log({
            "episode": i_episode,
            "episode_reward": episode_reward,
            # Use the length of the episode (t) for average loss calculation
            "avg_step_loss": loss / t if t > 0 else 0,
            "episode_length": t,
        })

        if i_episode % 100 == 0:
            print(f"Episode: {i_episode}/{num_episodes} | Reward: {episode_reward:.2f}")

    print(f"--- Training finished for {agent.model.__class__.__name__} ---")

In [6]:
class MountainCarRewardWrapper(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self.min_position = -1.2 # The minimum position of the track

    def step(self, action):
        observation, reward, terminated, truncated, info = self.env.step(action)
        done = terminated or truncated

        # --- REWARD SHAPING LOGIC ---
        
        # 1. Base Reward (The default -1 per step is implicit in the environment's max_steps limit)
        # We can explicitly set it to a small penalty if we want
        shaped_reward = -1.0 
        
        # 2. Add Reward for progress towards the goal (position is between -1.2 and 0.5)
        # Position is s[0]. The goal is at 0.5.
        # Max reward is given when the position is at 0.5.
        
        # This gives a reward that encourages moving right (max position)
        shaped_reward += 100 * (observation[0] - self.min_position)
        
        # 3. Terminal Reward
        if terminated:
            shaped_reward += 1000 # Large reward for success
        
        # NOTE: The default environment reward is still in 'reward', so we replace it with 'shaped_reward'
        return observation, shaped_reward, terminated, truncated, info


In [7]:
# --- UPDATED: evaluate_agent Function (Cell 22) ---
def evaluate_agent(env_name, agent, num_tests=100, record_video=False):
    """Evaluates the trained agent for a number of episodes and logs duration."""
    print(f"\n--- Starting Evaluation for {agent.config.model} on {env_name} ({num_tests} tests) ---")

    # Determine render mode
    render_mode = "rgb_array" if record_video else None

    # 1. Create evaluation environment (must use render_mode='rgb_array' for video)
    eval_env = gym.make(env_name, render_mode=render_mode)
    
    # 2. APPLY WRAPPERS (Matching training environment setup)
    if env_name == "Pendulum-v1":
         class ContinuousActionWrapper(gym.ActionWrapper):
             def __init__(self, env):
                 super().__init__(env)
                 # Ensure this action_range matches your training setup: [-2.0, -1.0, 0.0, 1.0, 2.0]
                 self.action_range = [-2.0, -1.0, 0.0, 1.0, 2.0]
                 self.action_space = gym.spaces.Discrete(len(self.action_range))
             def action(self, action_idx):
                 return np.array([self.action_range[action_idx]], dtype=np.float32)

         eval_env = ContinuousActionWrapper(eval_env)
         
    elif env_name == "MountainCar-v0":
        # === CRITICAL FIX: RE-APPLY THE REWARD SHAPING WRAPPER HERE ===
        eval_env = MountainCarRewardWrapper(eval_env) 

    # Wrap for collecting episode statistics
    eval_env = RecordEpisodeStatistics(eval_env)

    # Wrap for video recording (if requested)
    if record_video:
        video_folder = f"./videos/{env_name}_{agent.config.model}"
        # Only record the first test episode
        eval_env = RecordVideo(
            eval_env,
            video_folder=video_folder,
            episode_trigger=lambda x: x == 0,
            name_prefix=f"best_agent"
        )
        print(f"Recording the first episode to: {video_folder}")

    # Run tests
    test_durations = []
    test_rewards = []

    # Put model in evaluation mode
    agent.model.eval()

    for i in range(num_tests):
        state, info = eval_env.reset()
        done = False
        current_episode_reward = 0 # Initialize episode reward for this test

        while not done:
            # Move state to the agent's device
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(agent.device)

            with torch.no_grad(): # No need to track gradients during evaluation
                action_logits, _ = agent.model(state_tensor)

            # Select deterministic action by taking the argmax
            action = torch.argmax(action_logits).item()

            # Note: The reward returned here is the WRAPPED reward if the wrapper is applied above
            state, reward, terminated, truncated, info = eval_env.step(action)
            done = terminated or truncated
            current_episode_reward += reward

            # RecordEpisodeStatistics wrapper adds episode stats to info on done=True
            if done:
                # The stats are typically stored in the info dictionary
                if 'episode' in info:
                    duration = info['episode']['l']
                    reward = info['episode']['r']

                    # Ensure duration and reward are floats
                    duration = float(duration)
                    reward = float(reward)

                    test_durations.append(duration)
                    test_rewards.append(reward)

                    # Log individual test result
                    wandb.log({
                        f"{env_name}/Test_Episode_Duration": duration,
                        f"{env_name}/Test_Episode_Reward": reward,
                        "test_episode_index": i
                    })
                break

    eval_env.close()

    # Put model back in training mode
    agent.model.train()

    if test_durations:
        avg_duration = np.mean(test_durations)
        std_duration = np.std(test_durations)
        avg_reward = np.mean(test_rewards)

        wandb.log({
            f"{env_name}/Avg_Test_Duration": avg_duration,
            f"{env_name}/Std_Test_Duration": std_duration,
            f"{env_name}/Avg_Test_Reward": avg_reward,
        })

        print(f"Evaluation complete. Avg Duration: {avg_duration:.2f} \u00b1 {std_duration:.2f} steps.")
        print(f"Avg Reward: {avg_reward:.2f}")

    return avg_reward, avg_duration

In [11]:
def main_run(config):
    """Initializes Wandb, environment, agent, trains, and evaluates."""
    # 1. Initialize Wandb Run
    run_name = config.get('name', f"{config['model']}_DF{config['gamma']}_NNLR{config['learning_rate']}")

    run = wandb.init(
        project="Cartpole-v1-problem-seed-right-value-v3",
        name=run_name,
        config=config
    )

    # 2. Setup Environment
    env_name = config['env_name']

    if env_name in ["Pendulum-v1", "MountainCar-v0"]:
        # Discretize continuous environments for Q-Learning
        if env_name == "Pendulum-v1":
             # Actions: 5 discrete actions: max_torque * [-2.0, -1.0, 0.0, 1.0, 2.0]
             env = gym.make(env_name, max_episode_steps=200) # Default
             class ContinuousActionWrapper(gym.ActionWrapper):
                 def __init__(self, env):
                     super().__init__(env)
                     self.action_range = [-2.0, -1.0, 0.0, 1.0, 2.0]
                     self.action_space = gym.spaces.Discrete(len(self.action_range))
                 def action(self, action_idx):
                     # Map the discrete index to the continuous action value
                     return np.array([self.action_range[action_idx]], dtype=np.float32)
             env = ContinuousActionWrapper(env)

        elif env_name == "MountainCar-v0":
            # Actions: 3 discrete actions: 0:push_left, 1:no_push, 2:push_right
            env = gym.make(env_name, max_episode_steps=200) # Default
            env = MountainCarRewardWrapper(env) 
            
    elif env_name in ["CartPole-v1", "Acrobot-v1"]:
        env = gym.make(env_name)
    else:
        raise ValueError(f"Environment {env_name} not supported by this script.")

    # Get environment specs
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # 3. Create Agent
    agent = A2CAgent(state_size, action_size, wandb.config)

    # 4. Train
    #Set episodes based on difficulty (you can adjust these)
    if env_name == "CartPole-v1":
        num_episodes = 500
    elif env_name == "Acrobot-v1":
        num_episodes = 1000
    elif env_name == "MountainCar-v0":
        num_episodes = 3000
    elif env_name == "Pendulum-v1":
        num_episodes = 1000

    train_agent(env, agent, num_episodes, env_name)
    env.close()

    # 5. Evaluate (100 tests)
    evaluate_agent(env_name, agent, num_tests=100, record_video=True)

    run.finish()

In [9]:
BASELINE_CONFIG = {
    "env_name": "CartPole-v1",       # Environment to run (change this for other envs)
    "model": "A2C",                 
    "num_episodes": 500,            # Training episodes (overwritten in main_run)
    "gamma": 0.999,                  # CHANGED: Increased from 0.99 for long horizon
    "learning_rate": 5e-4,           # CHANGED: Increased from 2e-4 for faster convergence
    "c_entropy": 0.01,               # ADDED: Entropy coefficient for exploration
    "epsilon_start": 1.0,           # (Ignored by A2C)
    "epsilon_end": 0.01,            # (Ignored by A2C)
    "epsilon_decay": 0.001,         # (Ignored by A2C)
    "memory_size": 50000,           # (Ignored by A2C)
    "batch_size": 64,               # (Ignored by A2C)
    "target_update_freq": 200,      # (Ignored by A2C)
    "seed": 100,
}

MOUNTAINCAR_BASELINE_CONFIG = {
        "env_name": "MountainCar-v0",
        "model": "A2C",
        "num_episodes": 5000,
        "gamma": 1.0,
        "learning_rate": 1e-4,
        "c_entropy": 0.2,
        "epsilon_start": 1.0,
        "epsilon_end": 0.01,
        "epsilon_decay": 0.0005,
        "memory_size": 50000,
        "batch_size": 32,
        "target_update_freq": 200,
        "seed": 100,
    }

ACROBOT_BASELINE_CONFIG = {
        "env_name": "Acrobot-v1",
        "model": "A2C",
        "num_episodes": 1000,
        "gamma": 0.999,
        "learning_rate": 1e-3,
        "c_entropy": 0.05,
        "seed": 100,
    }


In [12]:
if __name__ == '__main__':
    # Ensure videos folder exists
    os.makedirs("./videos", exist_ok=True)
    # R1: DQN Baseline (Control Group - Optimized for Sparse Reward)
    config_r1 = deepcopy(ACROBOT_BASELINE_CONFIG)
    config_r1['name'] = 'AB_R1_A2C_BASELINE_OPTM'

    # R2: DDQN Baseline (Model Comparison)
    config_r2 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    config_r2['model'] = 'DDQN'
    config_r2['name'] = 'MC_R2_DDQN_COMPARISON'

    # --- DISCOUNT FACTOR (GAMMA) ---
    # R3: Gamma Too Low (Myopic Agent - Expected to Fail)
    config_r3 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    config_r3['gamma'] = 0.95  # Significantly lower than 0.999
    config_r3['name'] = 'MC_R3_GAMMA_LOW_0.95'

    # R4: Gamma Very High (Testing Edge Case)
    config_r4 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    config_r4['gamma'] = 1.0 # Perfect Discount
    config_r4['name'] = 'MC_R4_GAMMA_PERFECT_1.0'

    # --- NN LEARNING RATE (LR) ---
    # R5: LR Too High (Divergence/Oscillation)
    config_r5 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    config_r5['learning_rate'] = 0.01
    config_r5['name'] = 'MC_R5_LR_HIGH_0.01'

    # R6: LR Too Low (Slow Convergence)
    config_r6 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    config_r6['learning_rate'] = 1e-5
    config_r6['name'] = 'MC_R6_LR_LOW_1e-5'

    # --- EPSILON DECAY RATE (ALPHA) ---
    # R7: Decay Too Slow (Persistent Exploration - Baseline is already slow, make it ultra-slow)
    config_r7 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    config_r7['epsilon_decay'] = 0.0001
    config_r7['name'] = 'MC_R7_DECAY_ULTRA_SLOW_0.0001'

    # R8: Decay Too Fast (Premature Exploitation - Expected to Fail)
    config_r8 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    config_r8['epsilon_decay'] = 0.01 # 10x faster than baseline
    config_r8['name'] = 'MC_R8_DECAY_FAST_0.01'

    # --- MEMORY SIZE & BATCH SIZE ---
    # R9: Small Replay Memory (High Correlation)
    config_r9 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    config_r9['memory_size'] = 5000
    config_r9['name'] = 'MC_R9_MEM_SMALL_5k'

    # R10: Large Batch Size (Smoother Gradients, but may hinder exploration on sparse rewards)
    config_r10 = deepcopy(MOUNTAINCAR_BASELINE_CONFIG)
    config_r10['batch_size'] = 128
    config_r10['name'] = 'MC_R10_BATCH_LARGE_128'


    # --- EXECUTION LOOP ---
    #experiment_configs = [
     #   config_r1, config_r2, config_r3, config_r4, config_r5,
      #  config_r6, config_r7, config_r8, config_r9, config_r10
    #]

    experiment_configs = [
        config_r1
    ]

    for i, config in enumerate(experiment_configs):
        print(f"\n========================================================")
        print(f"Starting Experiment {i+1}/{len(experiment_configs)}: {config['name']}")
        print(f"========================================================")

        # Log the specific config name to Wandb for easy identification
        config_to_run = deepcopy(config)

        # NOTE: If you are running this in a notebook, you may need to restart
        # the kernel between runs to ensure Wandb is initialized correctly.

        # Run the experiment
        main_run(config_to_run)

    print("\n\nALL 10 CARTPOLE EXPERIMENTS COMPLETE. CHECK WANDB FOR RESULTS.")


Starting Experiment 1/1: AB_R1_A2C_BASELINE_OPTM


[34m[1mwandb[0m: Currently logged in as: [33mamira-elgarf02[0m ([33mamira-elgarf02-cairo-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(



--- Starting Training on Acrobot-v1 ---
Episode: 100/1000 | Reward: -500.00
Episode: 200/1000 | Reward: -218.00
Episode: 300/1000 | Reward: -384.00
Episode: 400/1000 | Reward: -119.00
Episode: 500/1000 | Reward: -109.00
Episode: 600/1000 | Reward: -110.00
Episode: 700/1000 | Reward: -115.00
Episode: 800/1000 | Reward: -89.00
Episode: 900/1000 | Reward: -90.00
Episode: 1000/1000 | Reward: -73.00
--- Training finished for A2CNet ---

--- Starting Evaluation for A2C on Acrobot-v1 (100 tests) ---


  logger.warn(


Recording the first episode to: ./videos/Acrobot-v1_A2C
Evaluation complete. Avg Duration: 84.58 ± 16.43 steps.
Avg Reward: -83.58


0,1
Acrobot-v1/Avg_Test_Duration,▁
Acrobot-v1/Avg_Test_Reward,▁
Acrobot-v1/Std_Test_Duration,▁
Acrobot-v1/Test_Episode_Duration,▁▂▂▂▂▂▂▂▅▂█▂▂▃▂▂▃▃▃▂▂▁▂▂▂▁▂▂▂▂▂▂▁▂▂▂▂▂▂▁
Acrobot-v1/Test_Episode_Reward,▆▆▇▆▆▇▇▇▆▇▆▅▇▇▆▆▅▇▇▅█▄▄▇▆▇▆▇▄▆▇▆▆▆▇▆▆▁▇▆
avg_step_loss,▅███▇█▆▆▄▄▃▂▂▄▂▃▂▁▁▂▁▂▁▁▁▁▁▁▁▂▁▂▁▁▁▁▁▁▁▁
episode,▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█
episode_length,██▅███████▃▆▂▂▂▂▂▂▂▂▂▁▂▂▃▂▂▂▂▂▂▁▁▁▁▂▁▁▁▁
episode_reward,▁▁▂▁▁▁▁▁▅▆▆▇▆▇▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇██▆█▇▇██
test_episode_index,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███

0,1
Acrobot-v1/Avg_Test_Duration,84.58
Acrobot-v1/Avg_Test_Reward,-83.58
Acrobot-v1/Std_Test_Duration,16.43179
Acrobot-v1/Test_Episode_Duration,71.0
Acrobot-v1/Test_Episode_Reward,-70.0
avg_step_loss,1.46033
episode,1000.0
episode_length,73.0
episode_reward,-73.0
test_episode_index,99.0




ALL 10 CARTPOLE EXPERIMENTS COMPLETE. CHECK WANDB FOR RESULTS.
