# Implementation: PPO Clipping

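**Goal**: Visualize the core constraint of PPO: the clipped surrogate objective, which caps how much the probability ratio between the new and old policy can contribute to the objective.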

In [None]:
import numpy as np

def ppo_loss(old_prob, new_prob, advantage, epsilon=0.2):
    """Return the PPO clipped-surrogate loss (the negated clipped objective).

    The probability ratio ``new_prob / old_prob`` is multiplied by the
    advantage twice: once as-is and once after clipping the ratio to
    ``[1 - epsilon, 1 + epsilon]``. The elementwise minimum of the two
    products is the pessimistic objective; it is negated so that minimizing
    the returned loss maximizes the objective.

    Args:
        old_prob: Action probability under the old policy. Scalar or
            array-like (list, tuple, ndarray). Must be non-zero; a zero
            follows NumPy division semantics (inf/nan with a warning).
        new_prob: Action probability under the updated policy. Scalar or
            array-like, broadcastable against ``old_prob``.
        advantage: Advantage estimate(s). Scalar or array-like, broadcastable
            against the ratio.
        epsilon: Half-width of the clip range around a ratio of 1.

    Returns:
        The loss, computed elementwise. A NumPy scalar for scalar inputs,
        otherwise an ndarray with the broadcast shape of the inputs.
    """
    # Coerce up front so plain Python lists/tuples work, not just scalars
    # and ndarrays (list / list would otherwise raise TypeError).
    old_prob = np.asarray(old_prob)
    new_prob = np.asarray(new_prob)
    advantage = np.asarray(advantage)

    ratio = new_prob / old_prob

    # 1. Unclipped objective
    unclipped = ratio * advantage

    # 2. Clipped objective: the ratio may not leave [1 - eps, 1 + eps]
    clipped = np.clip(ratio, 1 - epsilon, 1 + epsilon) * advantage

    # PPO takes the minimum (pessimistic bound) of the two objectives.
    # Negate it because optimizers minimize loss while we want to maximize
    # the objective.
    return -np.minimum(unclipped, clipped)

# Scenario: the action was good (advantage > 0), but the update raised its
# probability much further than the clip range allows.
old_p, new_p, adv = 0.5, 0.9, 1.0  # 0.5 -> 0.9 is a huge jump

loss = ppo_loss(old_p, new_p, adv)

print(
    f"Old Prob: {old_p}, New Prob: {new_p}",
    f"Ratio: {new_p/old_p}",
    f"Loss: {loss}",
    sep="\n",
)

# The ratio (1.8) exceeds 1 + epsilon = 1.2, so the clipped term is the one
# selected by the minimum. That term no longer depends on the ratio, so the
# model gets no extra 'credit' (no gradient) for moving beyond 1.2 and has
# no incentive to push the probability further.

## Conclusion
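This clipping mechanism removes the incentive to move the probability ratio outside the range [1 - ε, 1 + ε] in a single update. Keeping each new policy close to the old one guards against destructively large updates, often called "policy collapse".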