In [1]:
# Use the %pip magic rather than !pip so the packages are installed into the
# environment of the running kernel, not whatever `pip` the subshell finds.
# swig is installed first, before gymnasium's box2d extra.
%pip install swig
%pip install "gymnasium[box2d]"

Collecting swig
  Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.1
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2379371 sha256=c7f305b1c67b4f314400c329e11d701ddb994914499c765b1399c43e0bb11016
  Stored in directory: /root/.cache/pip/wheels/ab

In [2]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

In [3]:
class Actor(nn.Module):
    """Policy network that turns a state vector into action probabilities.

    Architecture: two hidden linear layers of 64 units with ReLU activations,
    followed by a linear layer of ``action_dim`` units and a softmax over the
    last dimension.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 64
        self.layer_1 = nn.Linear(state_dim, hidden_units)
        self.layer_2 = nn.Linear(hidden_units, hidden_units)
        self.layer_3 = nn.Linear(hidden_units, action_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, state):
        """Return the probability of each action for ``state``."""
        hidden = self.layer_1(state).relu()
        hidden = self.layer_2(hidden).relu()
        logits = self.layer_3(hidden)
        return self.softmax(logits)

class Critic(nn.Module):
    """Value network that maps a state vector to a single scalar estimate.

    Architecture: two hidden linear layers of 64 units with ReLU activations,
    followed by a linear layer with one output unit (no activation).
    """

    def __init__(self, state_dim):
        super().__init__()
        hidden_units = 64
        self.layer_1 = nn.Linear(state_dim, hidden_units)
        self.layer_2 = nn.Linear(hidden_units, hidden_units)
        self.layer_3 = nn.Linear(hidden_units, 1)

    def forward(self, state):
        """Return the value estimate for ``state`` (last dimension has size 1)."""
        hidden = self.layer_1(state).relu()
        hidden = self.layer_2(hidden).relu()
        return self.layer_3(hidden)


In [4]:
class PPO:
    """Proximal Policy Optimization agent for a discrete action space.

    Holds an ``Actor`` (policy) and a ``Critic`` (state-value) network, each
    with its own Adam optimizer, plus a list buffer of transitions that
    ``update`` consumes and then clears.

    Args:
        state_dim: Size of the state vector fed to both networks.
        action_dim: Number of discrete actions produced by the actor.
        lr_actor: Learning rate of the actor's Adam optimizer.
        lr_critic: Learning rate of the critic's Adam optimizer.
        gamma: Discount factor used when accumulating returns.
        epsilon: Clipping range of the probability ratio (``1 +/- epsilon``).
        n_epochs: Number of optimisation passes over the buffer per update.
        max_grad_norm: Optional maximum gradient norm applied to each network
            before its optimizer step. ``None`` (the default) disables clipping.
    """

    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, epsilon, n_epochs,
                 max_grad_norm=None):
        self.gamma = gamma
        self.epsilon = epsilon
        self.n_epochs = n_epochs
        self.max_grad_norm = max_grad_norm

        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim)

        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)

        self.MseLoss = nn.MSELoss()
        self.buffer = []

    def select_action(self, state):
        """Sample an action from the current policy.

        Returns:
            ``(action, log_prob)`` where ``action`` is a Python ``int`` and
            ``log_prob`` is a detached tensor holding its log-probability.
        """
        state = torch.FloatTensor(state)
        # No gradient is needed here: only the sampled action and its
        # (detached) log-probability leave this method.
        with torch.no_grad():
            action_probs = self.actor(state)
        dist = Categorical(action_probs)
        action = dist.sample()
        action_logprob = dist.log_prob(action)
        # .item() yields a plain int rather than a 0-d array.
        return action.item(), action_logprob.detach()

    def add_to_buffer(self, state, action, reward, next_state, log_prob, done):
        """Append one transition tuple to the buffer."""
        self.buffer.append((state, action, reward, next_state, log_prob, done))

    def clear_buffer(self):
        """Drop all stored transitions."""
        self.buffer = []

    def update(self):
        """Run ``n_epochs`` clipped-surrogate updates on the buffer, then clear it.

        Does nothing when the buffer is empty.
        """
        if not self.buffer:
            return

        # next_state is stored in the buffer but not used by this update.
        states, actions, rewards, _next_states, old_log_probs, dones = zip(*self.buffer)

        # Stack through numpy first: building a tensor directly from a tuple
        # of arrays is slow.
        states = torch.as_tensor(np.array(states), dtype=torch.float32)
        # Categorical.log_prob indexes with the actions, so keep them integral.
        actions = torch.as_tensor(np.array(actions), dtype=torch.long)
        old_log_probs = torch.stack(
            [torch.as_tensor(lp, dtype=torch.float32).reshape(()) for lp in old_log_probs]
        ).detach()

        # Discounted returns, accumulated backwards and reset at episode ends.
        returns = []
        running_reward = 0.0
        for reward, done in zip(reversed(rewards), reversed(dones)):
            if done:
                running_reward = 0.0
            running_reward = float(reward) + self.gamma * running_reward
            returns.append(running_reward)
        returns.reverse()
        discounted_rewards = torch.tensor(returns, dtype=torch.float32)

        # Normalize the returns. The sample std of a single element is NaN and
        # would turn every network weight into NaN, so a one-element batch is
        # only centred (which leaves 0).
        centred = discounted_rewards - discounted_rewards.mean()
        if discounted_rewards.numel() > 1:
            discounted_rewards = centred / (discounted_rewards.std() + 1e-5)
        else:
            discounted_rewards = centred

        for _ in range(self.n_epochs):
            # squeeze(-1) keeps the batch dimension even for a single sample.
            state_values = self.critic(states).squeeze(-1)
            advantages = discounted_rewards - state_values.detach()

            # Log-probabilities of the stored actions under the current policy.
            new_action_probs = self.actor(states)
            new_dist = Categorical(new_action_probs)
            new_log_probs = new_dist.log_prob(actions)

            # Probability ratio between the current and the behaviour policy.
            ratio = torch.exp(new_log_probs - old_log_probs)

            # Clipped surrogate objective (negated because optimizers minimise).
            surrogate_1 = ratio * advantages
            surrogate_2 = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon) * advantages
            policy_loss = -torch.min(surrogate_1, surrogate_2).mean()

            value_loss = self.MseLoss(state_values, discounted_rewards)

            self.optimizer_actor.zero_grad()
            policy_loss.backward()
            if self.max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(self.actor.parameters(), max_norm=self.max_grad_norm)
            self.optimizer_actor.step()

            self.optimizer_critic.zero_grad()
            value_loss.backward()
            if self.max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(self.critic.parameters(), max_norm=self.max_grad_norm)
            self.optimizer_critic.step()

        self.clear_buffer()


In [11]:
# Add gradient clipping to the update method of the PPO class

class PPO:
    """Proximal Policy Optimization agent with gradient-norm clipping.

    Holds an ``Actor`` (policy) and a ``Critic`` (state-value) network, each
    with its own Adam optimizer, plus a list buffer of transitions that
    ``update`` consumes and then clears.

    Args:
        state_dim: Size of the state vector fed to both networks.
        action_dim: Number of discrete actions produced by the actor.
        lr_actor: Learning rate of the actor's Adam optimizer.
        lr_critic: Learning rate of the critic's Adam optimizer.
        gamma: Discount factor used when accumulating returns.
        epsilon: Clipping range of the probability ratio (``1 +/- epsilon``).
        n_epochs: Number of optimisation passes over the buffer per update.
        max_grad_norm: Maximum gradient norm applied to each network before
            its optimizer step (default 1.0). ``None`` disables clipping.
    """

    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, epsilon, n_epochs,
                 max_grad_norm=1.0):
        self.gamma = gamma
        self.epsilon = epsilon
        self.n_epochs = n_epochs
        self.max_grad_norm = max_grad_norm

        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim)

        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)

        self.MseLoss = nn.MSELoss()
        self.buffer = []

    def select_action(self, state):
        """Sample an action from the current policy.

        Returns:
            ``(action, log_prob)`` where ``action`` is a Python ``int`` and
            ``log_prob`` is a detached tensor holding its log-probability.
        """
        state = torch.FloatTensor(state)
        # No gradient is needed here: only the sampled action and its
        # (detached) log-probability leave this method.
        with torch.no_grad():
            action_probs = self.actor(state)
        dist = Categorical(action_probs)
        action = dist.sample()
        action_logprob = dist.log_prob(action)
        return action.item(), action_logprob.detach()

    def add_to_buffer(self, state, action, reward, next_state, log_prob, done):
        """Append one transition tuple to the buffer."""
        self.buffer.append((state, action, reward, next_state, log_prob, done))

    def clear_buffer(self):
        """Drop all stored transitions."""
        self.buffer = []

    def update(self):
        """Run ``n_epochs`` clipped-surrogate updates on the buffer, then clear it.

        Does nothing when the buffer is empty.
        """
        if not self.buffer:
            return

        # next_state is stored in the buffer but not used by this update.
        states, actions, rewards, _next_states, old_log_probs, dones = zip(*self.buffer)

        states = torch.as_tensor(np.array(states), dtype=torch.float32)
        # Categorical.log_prob indexes with the actions, so keep them integral.
        actions = torch.as_tensor(np.array(actions), dtype=torch.long)
        old_log_probs = torch.stack(
            [torch.as_tensor(lp, dtype=torch.float32).reshape(()) for lp in old_log_probs]
        ).detach()

        # Discounted returns, accumulated backwards and reset at episode ends.
        returns = []
        running_reward = 0.0
        for reward, done in zip(reversed(rewards), reversed(dones)):
            if done:
                running_reward = 0.0
            running_reward = float(reward) + self.gamma * running_reward
            returns.append(running_reward)
        returns.reverse()
        discounted_rewards = torch.tensor(returns, dtype=torch.float32)

        # Normalize the returns. The sample std of a single element is NaN and
        # would turn every network weight into NaN, so a one-element batch is
        # only centred (which leaves 0).
        centred = discounted_rewards - discounted_rewards.mean()
        if discounted_rewards.numel() > 1:
            discounted_rewards = centred / (discounted_rewards.std() + 1e-5)
        else:
            discounted_rewards = centred

        for _ in range(self.n_epochs):
            # squeeze(-1) keeps the batch dimension even for a single sample.
            state_values = self.critic(states).squeeze(-1)
            advantages = discounted_rewards - state_values.detach()

            # Log-probabilities of the stored actions under the current policy.
            new_action_probs = self.actor(states)
            new_dist = Categorical(new_action_probs)
            new_log_probs = new_dist.log_prob(actions)

            # Probability ratio between the current and the behaviour policy.
            ratio = torch.exp(new_log_probs - old_log_probs)

            # Clipped surrogate objective (negated because optimizers minimise).
            surrogate_1 = ratio * advantages
            surrogate_2 = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon) * advantages
            policy_loss = -torch.min(surrogate_1, surrogate_2).mean()

            value_loss = self.MseLoss(state_values, discounted_rewards)

            self.optimizer_actor.zero_grad()
            policy_loss.backward()
            # Gradient clipping for the actor
            if self.max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(self.actor.parameters(), max_norm=self.max_grad_norm)
            self.optimizer_actor.step()

            self.optimizer_critic.zero_grad()
            value_loss.backward()
            # Gradient clipping for the critic
            if self.max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(self.critic.parameters(), max_norm=self.max_grad_norm)
            self.optimizer_critic.step()

        self.clear_buffer()

# Increase max_timesteps further
max_timesteps = 1000000 # Increased to 1,000,000 timesteps

# Fix the random seeds so a fresh "Restart & Run All" reproduces the same run.
SEED = 0
torch.manual_seed(SEED)

# Re-run the training loop with the updated PPO class and max_timesteps
env = gym.make('LunarLander-v3')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

lr_actor = 0.0003
lr_critic = 0.001
gamma = 0.99
epsilon = 0.2
n_epochs = 10
# An update runs whenever the global timestep is a multiple of this value,
# and additionally at the end of every episode (see the condition below).
update_timestep = 2000

ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, epsilon, n_epochs)

timestep = 0
episode = 0
reward_sum = 0
reward_history = []

# try/finally so the environment is released even if the long run is
# interrupted or an update raises.
try:
    while timestep <= max_timesteps:
        episode += 1
        # Seed only the first reset: passing the same seed on every reset
        # would replay the identical initial state each episode.
        state, _ = env.reset(seed=SEED if episode == 1 else None)
        done = False
        episode_reward = 0

        while not done:
            timestep += 1

            action, log_prob = ppo_agent.select_action(state)

            next_state, reward, terminated, truncated, _ = env.step(action)
            # Truncation (time limit) is treated the same as termination.
            done = terminated or truncated

            ppo_agent.add_to_buffer(state, action, reward, next_state, log_prob, done)

            if (timestep % update_timestep == 0 or done) and len(ppo_agent.buffer) > 0:
                ppo_agent.update()

            state = next_state

            episode_reward += reward

        print(f'Episode: {episode}, Timestep: {timestep}, Episode Reward: {episode_reward}')
        reward_history.append(episode_reward)
finally:
    env.close()

Episode: 1, Timestep: 78, Episode Reward: -113.73573260038418
Episode: 2, Timestep: 184, Episode Reward: -135.7064565770491
Episode: 3, Timestep: 258, Episode Reward: -77.01228349271871
Episode: 4, Timestep: 345, Episode Reward: -76.19780241661974
Episode: 5, Timestep: 423, Episode Reward: -232.5721677100619
Episode: 6, Timestep: 480, Episode Reward: -117.78995861383109
Episode: 7, Timestep: 598, Episode Reward: -303.56405537759633
Episode: 8, Timestep: 687, Episode Reward: -117.58945015873759
Episode: 9, Timestep: 763, Episode Reward: -63.88207858725483
Episode: 10, Timestep: 843, Episode Reward: -118.34117748532647
Episode: 11, Timestep: 916, Episode Reward: -68.30547058700466
Episode: 12, Timestep: 1024, Episode Reward: -158.62161122407952
Episode: 13, Timestep: 1155, Episode Reward: -137.69312287806514
Episode: 14, Timestep: 1284, Episode Reward: -117.56327158059484
Episode: 15, Timestep: 1388, Episode Reward: -199.8116243525595
Episode: 16, Timestep: 1487, Episode Reward: -333.679

In [14]:
# 1. Initialize the LunarLander-v3 environment for evaluation
# Wrap the environment with RecordVideo to save a video of the agent's performance
eval_env = gym.make('LunarLander-v3', render_mode='rgb_array')
eval_env = gym.wrappers.RecordVideo(eval_env, 'lunar_lander_video')


# 2. Run a few episodes with the trained agent
num_eval_episodes = 10
eval_rewards = []

# try/finally so the wrapped environment is always closed, even if an
# episode raises part-way through.
try:
    # A dedicated loop variable keeps the training loop's `episode` counter intact.
    for eval_episode in range(num_eval_episodes):
        state, _ = eval_env.reset()
        done = False
        episode_reward = 0

        while not done:
            # Select action using the trained actor network (sampled from the
            # policy distribution). No need to track log_prob during evaluation.
            action, _ = ppo_agent.select_action(state)

            next_state, reward, terminated, truncated, _ = eval_env.step(action)
            done = terminated or truncated

            state = next_state
            episode_reward += reward

        eval_rewards.append(episode_reward)
        print(f'Evaluation Episode: {eval_episode + 1}, Episode Reward: {episode_reward}')
finally:
    eval_env.close()

# 3. Calculate and print the average evaluation reward
average_eval_reward = sum(eval_rewards) / num_eval_episodes
print(f'\nAverage Evaluation Reward over {num_eval_episodes} episodes: {average_eval_reward}')

  logger.warn(


Evaluation Episode: 1, Episode Reward: 175.76712747707268
Evaluation Episode: 2, Episode Reward: 94.94045143064001
Evaluation Episode: 3, Episode Reward: 120.25726239859053
Evaluation Episode: 4, Episode Reward: 138.83785764941996
Evaluation Episode: 5, Episode Reward: 132.8530919148773
Evaluation Episode: 6, Episode Reward: 228.8417566017576
Evaluation Episode: 7, Episode Reward: 19.15201491379196
Evaluation Episode: 8, Episode Reward: 8.680495376023117
Evaluation Episode: 9, Episode Reward: 240.1604234438188
Evaluation Episode: 10, Episode Reward: 119.12004753156575

Average Evaluation Reward over 10 episodes: 127.86105287375577
