<a href="https://colab.research.google.com/github/NavjyotDataScientist/kaggle_huggingface_universe_projects/blob/main/12_3_ppo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

PPO (Proximal Policy Optimization). No skipping. No rushing.

This is the final boss of core Reinforcement Learning.

In [None]:
# PPO (Proximal Policy Optimization) - Minimal Example

import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributions as dist

# -----------------------------
# Actor-Critic Network
# -----------------------------
class ActorCritic(nn.Module):
    """Actor-critic network with one shared hidden layer and two heads.

    The actor head outputs a probability distribution over actions and the
    critic head outputs a single state-value estimate.

    Args:
        state_dim: Number of input features per state (default 1).
        hidden_dim: Width of the shared hidden layer (default 16).
        n_actions: Number of discrete actions (default 2).
    """

    def __init__(self, state_dim=1, hidden_dim=16, n_actions=2):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
        )
        self.actor = nn.Sequential(
            nn.Linear(hidden_dim, n_actions),
            # Normalise over the last axis (the action axis) so the network
            # accepts both batched (N, state_dim) and unbatched (state_dim,)
            # inputs; a fixed dim=1 fails on 1-D input.
            nn.Softmax(dim=-1),
        )
        self.critic = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return ``(action_probs, state_value)`` for the state tensor ``x``.

        ``x`` has ``state_dim`` features on its last axis. ``action_probs``
        has ``n_actions`` entries on its last axis that sum to 1, and
        ``state_value`` has a last axis of size 1.
        """
        features = self.shared(x)
        return self.actor(features), self.critic(features)

# Discount factor applied to future rewards when computing returns.
gamma = 0.9
# Half-width of the PPO clipping interval around a probability ratio of 1.
eps_clip = 0.2

# A single network provides both the policy (actor) and value (critic)
# outputs, and one Adam optimizer updates all of its parameters.
model = ActorCritic()
optimizer = optim.Adam(params=model.parameters(), lr=1e-2)

# -----------------------------
# Environment
# -----------------------------
def env(state, action):
    """Advance the one-dimensional walk by a single step.

    Action 0 moves one position to the right; any other action moves one
    position to the left, with the position floored at 0. Landing on
    position 5 ends the episode with reward 10; every other step yields
    reward -1.

    Returns:
        A ``(next_state, reward, done)`` tuple.
    """
    delta = 1 if action == 0 else -1
    next_state = max(0, state + delta)
    if next_state != 5:
        return next_state, -1, False
    return next_state, 10, True

# -----------------------------
# Training
# -----------------------------
for episode in range(30):
    # ---- Rollout: play one full episode with the current policy ----------
    states, actions, rewards, log_probs = [], [], [], []

    state = 0
    done = False

    while not done:
        s = torch.tensor([[state]], dtype=torch.float32)
        # The rollout only records data. The stored log-probabilities are
        # the fixed "old policy" term of the PPO ratio, so they must not
        # carry gradients; no_grad keeps them as plain constants.
        with torch.no_grad():
            probs, _ = model(s)
            dist_action = dist.Categorical(probs)
            action = dist_action.sample()
            log_probs.append(dist_action.log_prob(action))

        states.append(s)
        actions.append(action)

        state, reward, done = env(state, action.item())
        rewards.append(reward)

    # ---- Discounted returns, accumulated from the last step backwards ----
    returns = []
    G = 0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()  # append + reverse avoids repeated insert(0, ...)
    returns = torch.tensor(returns, dtype=torch.float32)

    # ---- PPO update: one gradient step per collected transition ----------
    for state_t, action_t, old_log_prob, ret in zip(
        states, actions, log_probs, returns
    ):
        probs, value = model(state_t)
        value = value.squeeze()

        new_log_prob = dist.Categorical(probs).log_prob(action_t)
        # Probability ratio between the current policy and the policy that
        # generated the action during the rollout.
        ratio = torch.exp(new_log_prob - old_log_prob)

        # The advantage is a fixed weight for the policy term; detaching it
        # stops the policy objective from pushing gradients into the critic.
        advantage = (ret - value).detach()

        unclipped = ratio * advantage
        clipped = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage
        policy_loss = -torch.min(unclipped, clipped)

        # The critic is trained separately by regressing onto the return.
        value_loss = (ret - value).pow(2)

        # Sum to a scalar so backward() needs no explicit gradient argument.
        loss = (policy_loss + value_loss).sum()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Episode {episode+1}: Total Reward = {sum(rewards)}")
