## **Proximal Policy Optimization (PPO)**

A policy-gradient algorithm that optimizes a clipped surrogate objective, which discourages each update from moving the policy far from the one that collected the data (an approximate trust region).

**Imports**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym


**Define the Policy Network**

In [None]:
class PolicyNetwork(nn.Module):
    """Two-layer perceptron that outputs a probability vector.

    The input passes through a linear layer with ReLU activation, then a
    second linear layer whose outputs are normalised with a softmax over
    the last dimension.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: Number of features in each input vector.
            hidden_size: Width of the hidden layer.
            output_size: Number of entries in the output distribution.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return softmax probabilities along the last dimension of ``x``."""
        hidden = torch.relu(self.fc1(x))
        logits = self.fc2(hidden)
        probabilities = self.softmax(logits)
        return probabilities


**Initialize Environment and Model**

In [None]:
# Environment whose observation/action spaces determine the network's shape.
env = gym.make("CartPole-v1")

# Network dimensions: input and output widths are read from the environment,
# the hidden width is chosen by hand.
hidden_size = 64
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

# Policy and the Adam optimiser that updates its parameters.
model = PolicyNetwork(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
optimizer = optim.Adam(model.parameters(), lr=1e-3)


**Define PPO Hyperparameters**

In [None]:
# Schedule
epochs = 10     # outer-loop iterations of the training loop (one episode each)
batch_size = 5  # number of update passes made over each collected episode

# Objective
gamma = 0.99    # discount factor applied when accumulating returns
epsilon = 0.2   # clip range: the ratio is clamped to [1 - epsilon, 1 + epsilon]


**Training Loop**

In [None]:
# PPO training loop: each outer iteration collects one episode with the
# current policy, then performs `batch_size` clipped-surrogate updates on it.
for epoch in range(epochs):
    # Gymnasium's reset() returns (observation, info); only the observation
    # is fed to the policy.
    state, _ = env.reset()
    done = False
    log_probs = []
    rewards = []
    states = []
    actions = []

    # --- Rollout -----------------------------------------------------------
    # No gradients are tracked here: the log-probabilities stored during the
    # rollout are the fixed "old policy" reference for the PPO ratio.
    while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32)
        with torch.no_grad():
            action_probs = model(state_tensor)
        dist = torch.distributions.Categorical(action_probs)
        action = dist.sample()
        next_state, reward, terminated, truncated, _ = env.step(action.item())
        # An episode is over when the task ends (terminated) OR when the
        # environment's time limit cuts it off (truncated).
        done = terminated or truncated

        log_probs.append(dist.log_prob(action))
        rewards.append(reward)
        states.append(state_tensor)
        actions.append(action)

        state = next_state

    # --- Discounted returns --------------------------------------------------
    # Accumulate backwards through the episode, then flip once; this avoids
    # the quadratic cost of inserting at the front of a list.
    returns = []
    R = 0.0
    for r in reversed(rewards):
        R = r + gamma * R
        returns.append(R)
    returns.reverse()

    # --- Batch the episode -----------------------------------------------------
    state_batch = torch.stack(states)
    action_batch = torch.stack(actions)
    old_log_probs = torch.stack(log_probs).detach()
    returns_tensor = torch.tensor(returns, dtype=torch.float32)

    # There is no value network, so the mean return serves as the baseline.
    # Dividing by the standard deviation keeps the gradient scale stable;
    # it is skipped for a one-step episode, whose std is undefined.
    advantages = returns_tensor - returns_tensor.mean()
    if returns_tensor.numel() > 1:
        advantages = advantages / (returns_tensor.std() + 1e-8)

    # --- Policy update -----------------------------------------------------------
    # `batch_size` is the number of optimisation passes over this episode.
    for _ in range(batch_size):
        # Re-evaluate the stored actions under the CURRENT policy parameters.
        new_dist = torch.distributions.Categorical(model(state_batch))
        new_log_probs = new_dist.log_prob(action_batch)

        # Probability ratio pi_new(a|s) / pi_old(a|s), computed in log space.
        ratio = torch.exp(new_log_probs - old_log_probs)

        # Clipped surrogate objective: take the pessimistic (smaller) of the
        # unclipped and clipped terms, negated because the optimiser minimises.
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * advantages
        loss = -torch.min(surr1, surr2).mean()

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
