# In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal

# Hyperparameters
env_name = 'BipedalWalkerHardcore-v3'
seed = 123
gamma = 0.99  # discount factor used for the bootstrapped value in GAE
lam = 0.95  # decay applied to the running advantage estimate in GAE
clip_epsilon = 0.2  # probability ratio is clamped to [1 - eps, 1 + eps]
learning_rate = 3e-4  # Adam learning rate
value_loss_coef = 0.5  # weight of the critic loss in the total loss
entropy_coef = 0.0  # weight of the entropy term (0.0 disables it)
max_grad_norm = 0.5  # gradient-norm clipping threshold
num_steps_per_update = 2048  # environment steps collected per policy update
num_epochs = 10  # passes over each rollout during one update
minibatch_size = 64  # samples per gradient step
total_timesteps = 1_000_000  # total environment steps across all updates

# Set random seeds
torch.manual_seed(seed)
np.random.seed(seed)

# Create the environment
env = gym.make(env_name)
# Seed the environment's own random generator. Seeding only the spaces
# (below) leaves the environment itself unseeded, so rollouts would differ
# between runs. Per the Gym API, later `reset()` calls without a seed keep
# using the generator initialised here.
env.reset(seed=seed)
# Seed action and observation spaces
env.action_space.seed(seed)
env.observation_space.seed(seed)

obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_high = env.action_space.high
action_low = env.action_space.low

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Actor-Critic Network
class ActorCritic(nn.Module):
    """Actor-critic network with a shared feature layer.

    A single Tanh feature layer feeds two heads: an actor head producing
    the mean of a Gaussian policy, and a critic head producing a scalar
    state value. The policy's log standard deviation is a learned
    parameter that does not depend on the observation.

    Args:
        obs_dim: Size of the observation vector.
        action_dim: Size of the action vector.
        hidden_size: Width of every hidden layer. Defaults to 256, the
            value that was previously hard-coded.
    """

    def __init__(self, obs_dim, action_dim, hidden_size=256):
        super().__init__()

        # NOTE: sub-module names and construction order are kept stable so
        # that saved state dicts and seeded initialisation stay compatible.

        # Common feature layer
        self.feature = nn.Sequential(
            nn.Linear(obs_dim, hidden_size),
            nn.Tanh(),
        )

        # Actor network
        self.actor_mean = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, action_dim),
        )

        # Log std parameter (state-independent)
        self.log_std = nn.Parameter(torch.zeros(action_dim))

        # Critic network
        self.critic = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, x):
        """Run the network on a batch of observations.

        Args:
            x: Tensor whose last dimension is ``obs_dim``.

        Returns:
            A tuple ``(action_mean, action_log_std, value)`` where
            ``action_log_std`` is the learned log-std expanded to the shape
            of ``action_mean`` and ``value`` has a trailing dimension of 1.
        """
        feature = self.feature(x)
        action_mean = self.actor_mean(feature)
        action_log_std = self.log_std.expand_as(action_mean)
        value = self.critic(feature)
        return action_mean, action_log_std, value

# Build the actor-critic network, move it to the selected device, and
# attach one Adam optimizer covering all of its parameters.
model = ActorCritic(obs_dim, action_dim)
model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Storage for training data
class RolloutBuffer:
    """Plain container of per-step lists gathered during one rollout.

    Every field is an ordinary Python list. The per-step fields are filled
    by the rollout loop; ``advantages`` and ``returns`` are filled
    afterwards by the GAE computation.
    """

    # Names of the list attributes, in the order they are created.
    _FIELDS = (
        "observations",
        "actions",
        "log_probs",
        "rewards",
        "masks",
        "values",
        "advantages",
        "returns",
    )

    def __init__(self):
        self.clear()

    def clear(self):
        """Replace every field with a fresh empty list."""
        for field_name in self._FIELDS:
            setattr(self, field_name, [])


# Single module-level buffer shared by the rollout and update code.
buffer = RolloutBuffer()

def collect_trajectories(model, env, num_steps):
    """Run the current policy for ``num_steps`` steps and fill the buffer.

    The environment is reset at the start of the call and again whenever an
    episode ends. Each step's observation, unclipped sampled action, log
    probability, reward, continuation mask and value estimate are appended
    to the module-level ``buffer``. After the loop, the value of the final
    observation is passed to ``compute_gae`` to fill in advantages and
    returns.

    Args:
        model: Network returning ``(action_mean, action_log_std, value)``.
        env: Environment using the five-value ``step`` API.
        num_steps: Number of environment steps to collect.
    """

    def as_batch(observation):
        # Single observation -> float tensor with a leading batch dimension.
        return torch.FloatTensor(observation).unsqueeze(0).to(device)

    obs, _ = env.reset()
    steps_taken = 0

    while steps_taken < num_steps:
        with torch.no_grad():
            # Sample an action from the Gaussian policy.
            mean, log_std, state_value = model(as_batch(obs))
            policy = Normal(mean, log_std.exp())
            sampled = policy.sample()
            sampled_log_prob = policy.log_prob(sampled).sum(-1)
            state_value = state_value.squeeze(-1)

        # Drop the batch dimension; only the action sent to the environment
        # is clipped, the buffer keeps the raw sample the log-prob refers to.
        raw_action = sampled.cpu().numpy()[0]
        next_obs, reward, done, truncated, _ = env.step(
            np.clip(raw_action, action_low, action_high)
        )
        episode_over = done or truncated

        # NOTE(review): truncation is masked exactly like termination, so
        # no value is bootstrapped past a truncated step - confirm intended.
        buffer.observations.append(obs)
        buffer.actions.append(raw_action)
        buffer.log_probs.append(sampled_log_prob.item())
        buffer.rewards.append(reward)
        buffer.masks.append(0 if episode_over else 1)
        buffer.values.append(state_value.item())

        steps_taken += 1
        if episode_over:
            obs, _ = env.reset()
        else:
            obs = next_obs

    # Value of the observation following the last collected step.
    with torch.no_grad():
        last_value = model(as_batch(obs))[2].squeeze(-1).item()

    # Compute advantages and returns
    compute_gae(last_value, buffer)

def compute_gae(last_value, buffer, discount=None, gae_lambda=None):
    """Fill ``buffer.advantages`` and ``buffer.returns`` using GAE.

    Walks the rollout backwards, accumulating the generalized advantage
    estimate. A mask of 0 at a step removes both the bootstrapped next
    value and the accumulated advantage carried over from later steps.

    Args:
        last_value: Value estimate for the observation after the last
            stored step.
        buffer: Object exposing ``rewards``, ``masks`` and ``values`` lists
            of equal length; ``advantages`` and ``returns`` are assigned on
            it. ``buffer.values`` itself is not modified.
        discount: Discount factor. Defaults to the module-level ``gamma``.
        gae_lambda: GAE decay. Defaults to the module-level ``lam``.
    """
    if discount is None:
        discount = gamma
    if gae_lambda is None:
        gae_lambda = lam

    rewards = buffer.rewards
    masks = buffer.masks
    values = buffer.values + [last_value]

    # Pre-size the result and fill by index: inserting at the front of a
    # list on every step made the backward pass quadratic in rollout length.
    advantages = [0.0] * len(rewards)
    gae = 0
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + discount * values[step + 1] * masks[step] - values[step]
        gae = delta + discount * gae_lambda * masks[step] * gae
        advantages[step] = gae

    buffer.advantages = advantages
    buffer.returns = [adv + val for adv, val in zip(advantages, buffer.values)]

def ppo_update(model, optimizer, buffer):
    """Run the PPO clipped-surrogate update on one collected rollout.

    The rollout is converted to tensors, advantages are normalised over
    the whole rollout, and the data is then iterated for ``num_epochs``
    passes in shuffled minibatches of ``minibatch_size``. Each minibatch
    minimises the clipped policy loss plus the weighted squared-error value
    loss and the weighted negative entropy, with gradient-norm clipping.

    Args:
        model: Network returning ``(action_mean, action_log_std, value)``.
        optimizer: Optimizer over ``model``'s parameters.
        buffer: Rollout storage exposing ``observations``, ``actions``,
            ``log_probs``, ``returns`` and ``advantages``.
    """

    def to_tensor(data):
        # Stack into one ndarray first: building a tensor directly from a
        # Python list of ndarrays is a slow path that PyTorch warns about.
        return torch.as_tensor(np.asarray(data), dtype=torch.float32, device=device)

    observations = to_tensor(buffer.observations)
    actions = to_tensor(buffer.actions)
    old_log_probs = to_tensor(buffer.log_probs)
    returns = to_tensor(buffer.returns)
    advantages = to_tensor(buffer.advantages)

    # Normalize advantages. With fewer than two samples the standard
    # deviation is undefined (NaN), so normalisation is skipped.
    if advantages.numel() > 1:
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # Create dataset
    dataset = torch.utils.data.TensorDataset(
        observations, actions, old_log_probs, returns, advantages
    )
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=minibatch_size, shuffle=True
    )

    for _ in range(num_epochs):
        for batch in data_loader:
            obs_batch, actions_batch, old_log_probs_batch, returns_batch, advantages_batch = batch

            # Forward pass under the current policy
            action_mean, action_log_std, value = model(obs_batch)
            action_std = action_log_std.exp()
            dist = Normal(action_mean, action_std)
            # Sum over action dimensions to get per-sample quantities.
            log_probs = dist.log_prob(actions_batch).sum(-1)
            entropy = dist.entropy().sum(-1)
            value = value.squeeze(-1)

            # Probability ratio between the current and the rollout policy
            ratios = torch.exp(log_probs - old_log_probs_batch)

            # Unclipped and clipped surrogate objectives
            surr1 = ratios * advantages_batch
            surr2 = torch.clamp(ratios, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages_batch

            # Actor loss takes the pessimistic (minimum) surrogate
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = (returns_batch - value).pow(2).mean()
            entropy_loss = -entropy.mean()

            # Total loss
            loss = actor_loss + value_loss_coef * critic_loss + entropy_coef * entropy_loss

            # Backward pass with gradient-norm clipping
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

def main():
    """Train the policy with alternating rollout and update phases.

    Runs ``total_timesteps // num_steps_per_update`` iterations. Each one
    clears the shared buffer, collects a fresh rollout and performs a PPO
    update. The mean per-step reward is printed every 10th iteration, a
    checkpoint is written every 50th, and a final checkpoint at the end.
    """
    for update in range(1, total_timesteps // num_steps_per_update + 1):
        # Fresh rollout, then policy update on it.
        buffer.clear()
        collect_trajectories(model, env, num_steps_per_update)
        ppo_update(model, optimizer, buffer)

        # Periodic logging
        if not update % 10:
            avg_reward = np.sum(buffer.rewards) / num_steps_per_update
            print(f"Update {update}, Average Reward per Step: {avg_reward:.2f}")

        # Periodic checkpoint
        if not update % 50:
            weights = model.state_dict()
            torch.save(weights, f"ppo_bipedalwalker_{update}.pth")

    # Final checkpoint
    torch.save(model.state_dict(), "ppo_bipedalwalker_final.pth")

if __name__ == "__main__":
    main()

    # Test the trained model
    model.load_state_dict(
        torch.load("ppo_bipedalwalker_final.pth", map_location=device)
    )

    # The training env was created without a render mode, so calling
    # `env.render()` on it displays nothing under the Gym API used in this
    # file. A separate env created with render_mode="human" is used here;
    # per the Gym documentation it draws a frame on every reset/step, so no
    # explicit render call is needed.
    eval_env = gym.make(env_name, render_mode="human")
    try:
        obs, _ = eval_env.reset()
        done = False
        while not done:
            obs_tensor = torch.FloatTensor(obs).unsqueeze(0).to(device)
            with torch.no_grad():
                # Act with the policy mean (no sampling) for evaluation.
                action_mean, _, _ = model(obs_tensor)
                action_np = action_mean.cpu().numpy()[0]
            action_clipped = np.clip(action_np, action_low, action_high)
            obs, reward, terminated, truncated, _ = eval_env.step(action_clipped)
            done = terminated or truncated
    finally:
        # Release both environments even if evaluation raises.
        eval_env.close()
        env.close()
