<a href="https://colab.research.google.com/github/NavjyotDataScientist/kaggle_huggingface_universe_projects/blob/main/12_2_reinforcement_policygradient_baseline_critic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

WHY POLICY GRADIENT EXISTS

Q-learning problems:

Hard with continuous actions

Max operation is unstable

Overestimation bias

Policy Gradient:
✅ Works with continuous actions
✅ Learns smooth behavior
❌ More variance (noisy)

In [1]:
# Policy Gradient: Teach a child riding style (0 → 5)

import torch
import torch.nn as nn
import torch.optim as optim
import random

# -----------------------------
# Policy Network (Brain)
# -----------------------------
policy = nn.Sequential(
    nn.Linear(1, 16),
    nn.ReLU(),
    nn.Linear(16, 2),
    nn.Softmax(dim=1)  # probabilities
)

optimizer = optim.Adam(policy.parameters(), lr=0.01)
gamma = 0.9

# -----------------------------
# Environment
# -----------------------------
def environment(speed, action):
    speed += 1 if action == 0 else -1
    speed = max(0, speed)

    if speed == 5:
        return speed, 10, True
    return speed, -1, False

# -----------------------------
# Training
# -----------------------------
for episode in range(30):
    speed = 0
    log_probs = []
    rewards = []
    done = False

    while not done:
        state = torch.tensor([[speed]], dtype=torch.float32)

        probs = policy(state)
        dist = torch.distributions.Categorical(probs)
        action = dist.sample()

        log_probs.append(dist.log_prob(action))

        speed, reward, done = environment(speed, action.item())
        rewards.append(reward)

    # -----------------------------
    # Policy Gradient Update
    # -----------------------------
    G = 0
    loss = 0

    for log_prob, reward in zip(reversed(log_probs), reversed(rewards)):
        G = reward + gamma * G
        loss -= log_prob * G

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Episode {episode+1}: Total reward = {sum(rewards)}")


Episode 1: Total reward = -50
Episode 2: Total reward = -9
Episode 3: Total reward = -37
Episode 4: Total reward = 2
Episode 5: Total reward = 2
Episode 6: Total reward = 4
Episode 7: Total reward = 4
Episode 8: Total reward = -3
Episode 9: Total reward = 0
Episode 10: Total reward = 4
Episode 11: Total reward = -3
Episode 12: Total reward = 6
Episode 13: Total reward = 5
Episode 14: Total reward = -3
Episode 15: Total reward = 2
Episode 16: Total reward = 0
Episode 17: Total reward = 1
Episode 18: Total reward = 6
Episode 19: Total reward = 6
Episode 20: Total reward = 6
Episode 21: Total reward = 4
Episode 22: Total reward = 6
Episode 23: Total reward = 6
Episode 24: Total reward = 5
Episode 25: Total reward = 6
Episode 26: Total reward = 6
Episode 27: Total reward = 6
Episode 28: Total reward = 6
Episode 29: Total reward = 6
Episode 30: Total reward = 3


In [2]:
# Policy Gradient with Baseline (Variance Reduction)

import torch
import torch.nn as nn
import torch.optim as optim
import random

# -----------------------------
# Policy Network
# -----------------------------
policy = nn.Sequential(
    nn.Linear(1, 16),
    nn.ReLU(),
    nn.Linear(16, 2),
    nn.Softmax(dim=1)
)

optimizer = optim.Adam(policy.parameters(), lr=0.01)
gamma = 0.9

# -----------------------------
# Environment
# -----------------------------
def environment(speed, action):
    speed += 1 if action == 0 else -1
    speed = max(0, speed)

    if speed == 5:
        return speed, 10, True
    return speed, -1, False

# -----------------------------
# Training
# -----------------------------
for episode in range(30):
    speed = 0
    log_probs = []
    rewards = []
    done = False

    while not done:
        state = torch.tensor([[speed]], dtype=torch.float32)

        probs = policy(state)
        dist = torch.distributions.Categorical(probs)
        action = dist.sample()

        log_probs.append(dist.log_prob(action))
        speed, reward, done = environment(speed, action.item())
        rewards.append(reward)

    # --------- BASELINE ----------
    baseline = sum(rewards) / len(rewards)

    G = 0
    loss = 0
    for log_prob, reward in zip(reversed(log_probs), reversed(rewards)):
        G = reward + gamma * G
        advantage = G - baseline
        loss -= log_prob * advantage

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Episode {episode+1}: Reward={sum(rewards)} Baseline={baseline:.2f}")


Episode 1: Reward=-75 Baseline=-0.87
Episode 2: Reward=-150 Baseline=-0.93
Episode 3: Reward=-5 Baseline=-0.31
Episode 4: Reward=-5 Baseline=-0.31
Episode 5: Reward=3 Baseline=0.38
Episode 6: Reward=-15 Baseline=-0.58
Episode 7: Reward=5 Baseline=0.83
Episode 8: Reward=-15 Baseline=-0.58
Episode 9: Reward=-6 Baseline=-0.35
Episode 10: Reward=5 Baseline=0.83
Episode 11: Reward=0 Baseline=0.00
Episode 12: Reward=-6 Baseline=-0.35
Episode 13: Reward=3 Baseline=0.38
Episode 14: Reward=4 Baseline=0.57
Episode 15: Reward=3 Baseline=0.38
Episode 16: Reward=6 Baseline=1.20
Episode 17: Reward=2 Baseline=0.22
Episode 18: Reward=6 Baseline=1.20
Episode 19: Reward=3 Baseline=0.38
Episode 20: Reward=4 Baseline=0.57
Episode 21: Reward=6 Baseline=1.20
Episode 22: Reward=6 Baseline=1.20
Episode 23: Reward=6 Baseline=1.20
Episode 24: Reward=1 Baseline=0.10
Episode 25: Reward=6 Baseline=1.20
Episode 26: Reward=6 Baseline=1.20
Episode 27: Reward=6 Baseline=1.20
Episode 28: Reward=5 Baseline=0.83
Episode 

In [3]:
# Actor-Critic: Two brains working together

import torch
import torch.nn as nn
import torch.optim as optim
import random

# -----------------------------
# Actor Network (Policy)
# -----------------------------
actor = nn.Sequential(
    nn.Linear(1, 16),
    nn.ReLU(),
    nn.Linear(16, 2),
    nn.Softmax(dim=1)
)

# -----------------------------
# Critic Network (Value)
# -----------------------------
critic = nn.Sequential(
    nn.Linear(1, 16),
    nn.ReLU(),
    nn.Linear(16, 1)
)

actor_opt = optim.Adam(actor.parameters(), lr=0.01)
critic_opt = optim.Adam(critic.parameters(), lr=0.01)
gamma = 0.9

# -----------------------------
# Environment
# -----------------------------
def environment(speed, action):
    speed += 1 if action == 0 else -1
    speed = max(0, speed)

    if speed == 5:
        return speed, 10, True
    return speed, -1, False

# -----------------------------
# Training
# -----------------------------
for episode in range(30):
    speed = 0
    done = False

    while not done:
        state = torch.tensor([[speed]], dtype=torch.float32)

        probs = actor(state)
        dist = torch.distributions.Categorical(probs)
        action = dist.sample()

        value = critic(state)

        next_speed, reward, done = environment(speed, action.item())
        next_state = torch.tensor([[next_speed]], dtype=torch.float32)

        next_value = critic(next_state)

        # Advantage
        advantage = reward + gamma * next_value - value

        # Actor loss
        actor_loss = -dist.log_prob(action) * advantage.detach()

        # Critic loss
        critic_loss = advantage.pow(2)

        actor_opt.zero_grad()
        critic_opt.zero_grad()

        actor_loss.backward()
        critic_loss.backward()

        actor_opt.step()
        critic_opt.step()

        speed = next_speed

    print(f"Episode {episode+1}: Finished at speed {speed}")


Episode 1: Finished at speed 5
Episode 2: Finished at speed 5
Episode 3: Finished at speed 5
Episode 4: Finished at speed 5
Episode 5: Finished at speed 5
Episode 6: Finished at speed 5
Episode 7: Finished at speed 5
Episode 8: Finished at speed 5
Episode 9: Finished at speed 5
Episode 10: Finished at speed 5
Episode 11: Finished at speed 5
Episode 12: Finished at speed 5
Episode 13: Finished at speed 5
Episode 14: Finished at speed 5
Episode 15: Finished at speed 5
Episode 16: Finished at speed 5
Episode 17: Finished at speed 5
Episode 18: Finished at speed 5
Episode 19: Finished at speed 5
Episode 20: Finished at speed 5
Episode 21: Finished at speed 5
Episode 22: Finished at speed 5
Episode 23: Finished at speed 5
Episode 24: Finished at speed 5
Episode 25: Finished at speed 5
Episode 26: Finished at speed 5
Episode 27: Finished at speed 5
Episode 28: Finished at speed 5
Episode 29: Finished at speed 5
Episode 30: Finished at speed 5
