In [7]:
# Environment setup: install SWIG first, then Gymnasium with its Box2D extra.
# NOTE(review): SWIG is presumably needed because box2d-py is built from source
# (see the "Building wheel for box2d-py" line in the output) — confirm.
!pip install swig
!pip install "gymnasium[box2d]"

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 374.4/374.4 kB 9.9 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... done
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2379375 sha256=d2c4c8ff8d3318fb715c191ad8cca583cd1886e09d01edce671416964b167b35
  Stored in directory: /root/.cache/pip/wheels/ab/f1/0c/d56f4a2bdd12bae0a0693ec33f2f0daadb5eb9753c78fa5308
Successfully built box2d-py
Installing collected packages: box2d-py
Successfully installed box2d-py-2.3.5


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import gymnasium as gym
import numpy as np

# --- Actor-Critic Network ---
# This class defines the neural network architecture for both the actor and the critic.
# For the more complex LunarLander environment, we'll use a slightly deeper network
# with two hidden layers to better capture the state features.

class ActorCritic(nn.Module):
    """
    Shared-trunk actor-critic network for a discrete action space.

    A two-hidden-layer MLP produces features that feed two linear heads:

    1. the actor head, which emits one logit per action (the policy), and
    2. the critic head, which emits a single scalar state-value estimate.

    Args:
        state_dim (int): Size of the observation vector.
        action_dim (int): Number of discrete actions.
        hidden_dim (int): Width of both hidden layers.
    """
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()

        # Feature extractor shared by the actor and the critic.
        self.shared_layers = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )

        # Actor head: raw (unnormalized) logits, one per action. They are
        # handed to Categorical as logits, which normalizes them internally.
        self.actor_head = nn.Linear(hidden_dim, action_dim)

        # Critic head: a single value, the estimated value of the state.
        self.critic_head = nn.Linear(hidden_dim, 1)

    def forward(self, state):
        """
        Performs a forward pass through the network.

        Args:
            state (torch.Tensor): Batch of states; the last dimension must be
                ``state_dim``.

        Returns:
            tuple: A tuple containing:
                - action_probs (torch.distributions.Categorical): A distribution over actions.
                - state_value (torch.Tensor): The estimated value of the input
                  state, with a trailing dimension of size 1.
        """
        shared_features = self.shared_layers(state)

        action_logits = self.actor_head(shared_features)
        # Build the distribution from logits rather than softmax probabilities.
        # The probs path clamps very small probabilities before taking the log,
        # which distorts log_prob (and zeroes its gradient) for unlikely
        # actions; the logits path uses a log-sum-exp normalization instead.
        action_probs = Categorical(logits=action_logits)

        state_value = self.critic_head(shared_features)

        return action_probs, state_value


# --- A2C Agent ---
# This class brings everything together. It contains the actor-critic network,
# the optimizer, and the logic for training the agent. The logic remains the same
# as it is a general implementation of A2C.

class A2CAgent:
    """
    The Advantage Actor-Critic (A2C) agent.

    Holds the actor-critic network, its optimizer, and a per-episode rollout
    buffer. ``select_action`` records the log-probability and value estimate of
    every step; the caller appends the matching entries to ``rewards`` and
    ``dones``; ``update`` then performs one gradient step and clears the buffer.

    Args:
        state_dim (int): Size of the observation vector.
        action_dim (int): Number of discrete actions.
        learning_rate (float): Adam learning rate.
        gamma (float): Discount factor used when computing returns.
    """
    def __init__(self, state_dim, action_dim, learning_rate=0.0005, gamma=0.99):
        self.gamma = gamma

        # Initialize the Actor-Critic network
        self.ac_network = ActorCritic(state_dim, action_dim)
        # Initialize the optimizer
        self.optimizer = optim.Adam(self.ac_network.parameters(), lr=learning_rate)

        # Per-episode rollout buffer (log_probs, values, rewards, dones).
        self._clear_memory()

    def _clear_memory(self):
        """Resets the rollout buffer so the next episode starts empty."""
        self.log_probs = []
        self.values = []
        self.rewards = []
        self.dones = []

    def select_action(self, state):
        """
        Selects an action based on the current policy.

        Also appends the sampled action's log-probability and the critic's
        value estimate to the rollout buffer for the later ``update`` call.

        Args:
            state (np.ndarray): The current state of the environment. Any
                array-like accepted by ``torch.as_tensor`` also works.

        Returns:
            int: The action to take.
        """
        # Add a leading batch dimension of 1 for the network.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        action_probs, state_value = self.ac_network(state)

        # Sample an action from the distribution
        action = action_probs.sample()

        # Keep the graph-attached tensors; update() backpropagates through them.
        self.log_probs.append(action_probs.log_prob(action))
        self.values.append(state_value)

        return action.item()

    def update(self):
        """
        Updates the actor and critic networks from the buffered rollout.

        Computes discounted returns (restarting the running sum wherever
        ``done`` is true), standardizes them, and minimizes
        ``actor_loss + 0.5 * critic_loss``, where the actor loss is the policy
        gradient loss weighted by the detached advantage ``returns - values``
        and the critic loss is the MSE between values and returns. The buffer
        is cleared afterwards.

        An empty buffer is a no-op. A single-step rollout skips the
        standardization, because the sample standard deviation of one element
        is NaN and would otherwise propagate into every network weight.

        Raises:
            ValueError: If the buffered lists do not all have the same length.
        """
        n_steps = len(self.rewards)
        if not (len(self.log_probs) == len(self.values) == len(self.dones) == n_steps):
            raise ValueError(
                "Rollout buffer is inconsistent: "
                f"{len(self.log_probs)} log_probs, {len(self.values)} values, "
                f"{n_steps} rewards, {len(self.dones)} dones"
            )
        if n_steps == 0:
            return

        log_probs = torch.cat(self.log_probs)
        # reshape(-1) keeps a 1-D tensor even for a single step, where a bare
        # squeeze() would collapse it to a 0-dim scalar.
        values = torch.cat(self.values).reshape(-1)

        # Discounted returns, accumulated backwards and reversed once at the end.
        returns = []
        running_return = 0.0
        for reward, done in zip(reversed(self.rewards), reversed(self.dones)):
            if done:
                running_return = 0.0
            running_return = reward + self.gamma * running_return
            returns.append(running_return)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float32, device=values.device)
        # Standardize returns for more stable training (needs >= 2 samples).
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)

        # Advantage = Returns - Values
        advantage = returns - values

        # Policy gradient loss; the advantage is detached so the actor term
        # does not push gradients into the critic head.
        actor_loss = -(log_probs * advantage.detach()).mean()

        # Critic loss (mean squared error between values and returns)
        critic_loss = F.mse_loss(values, returns)

        loss = actor_loss + 0.5 * critic_loss

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Clear the memory for the next episode
        self._clear_memory()

def main(num_episodes=2000, max_steps_per_episode=1000, learning_rate=5e-4,
         log_interval=50, seed=None):
    """
    Trains an A2C agent on LunarLander-v3 and prints periodic progress.

    One gradient update is performed per episode, after the episode ends or
    ``max_steps_per_episode`` steps have been taken.

    Args:
        num_episodes (int): Number of training episodes.
        max_steps_per_episode (int): Hard cap on the steps taken per episode.
        learning_rate (float): Learning rate passed to the agent.
        log_interval (int): Print the latest episode reward every this many
            episodes.
        seed (int | None): If given, seeds PyTorch and the environment's first
            reset so that a run can be repeated. ``None`` leaves both unseeded.
    """
    env = gym.make('LunarLander-v3')
    # try/finally so the environment is released even if training raises.
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        if seed is not None:
            # Seed before building the agent so weight init is repeatable too.
            torch.manual_seed(seed)

        agent = A2CAgent(state_dim, action_dim, learning_rate=learning_rate)

        print("Starting training on LunarLander-v3...")
        print(f"State space dimension: {state_dim}")
        print(f"Action space dimension: {action_dim}")

        for episode in range(num_episodes):
            # Only the first reset is seeded; later resets continue from the
            # environment's already-seeded random generator.
            state, _ = env.reset(seed=seed if episode == 0 else None)
            episode_reward = 0

            for step in range(max_steps_per_episode):
                action = agent.select_action(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                # Both ways of ending an episode are treated alike here.
                done = terminated or truncated

                # Store the experience alongside what select_action recorded.
                agent.rewards.append(reward)
                agent.dones.append(done)

                state = next_state
                episode_reward += reward

                if done:
                    break

            # Update the agent at the end of the episode
            agent.update()

            if (episode + 1) % log_interval == 0:
                print(f"Episode {episode + 1}, Reward: {episode_reward}")
    finally:
        env.close()

# Start training only when this code is executed directly (script run or
# notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    main()


Starting training on LunarLander-v3...
State space dimension: 8
Action space dimension: 4
Episode 50, Reward: -203.3598829224493
Episode 100, Reward: -483.3620857665338
Episode 150, Reward: -199.40160871888105
Episode 200, Reward: -180.3695319392446
Episode 250, Reward: -393.51382411637377
Episode 300, Reward: -316.79128414559534
Episode 350, Reward: -10.353742055773637
Episode 400, Reward: -213.50247951431436
Episode 450, Reward: -299.47945337662617
Episode 500, Reward: -61.6104767649923
Episode 550, Reward: -211.9034591873595
Episode 600, Reward: -6.9788440833922465
Episode 650, Reward: 39.67398080978191
Episode 700, Reward: -271.7822727420509
Episode 750, Reward: -15.593473725284554
Episode 800, Reward: -41.452652517768456
Episode 850, Reward: -55.644293561977435
Episode 900, Reward: -63.85697810361698
Episode 950, Reward: -166.29564709615906
Episode 1000, Reward: -17.530074554587326
Episode 1050, Reward: -39.65788343165494
Episode 1100, Reward: 38.906078556604186
Episode 1150, Rewa