# Implementation: DPO Loss (Conceptual)

**Goal**: Optimize the policy directly on preference pairs (chosen vs. rejected), without training a separate reward model.

In [None]:
import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps, ref_chosen_logps, ref_rejected_logps, beta=0.1):
    """Return the mean DPO loss over a batch of preference pairs.

    Computes ``-logsigmoid(beta * margin)`` and averages it over every
    element, where::

        margin = (policy_chosen_logps - ref_chosen_logps)
               - (policy_rejected_logps - ref_rejected_logps)

    Args:
        policy_chosen_logps: Policy-model values for the chosen responses
            (presumably log-probabilities, going by the name).
        policy_rejected_logps: Policy-model values for the rejected responses.
        ref_chosen_logps: Reference-model values for the chosen responses.
        ref_rejected_logps: Reference-model values for the rejected responses.
        beta: Scale applied to the margin before the log-sigmoid.

    Returns:
        A scalar tensor holding the mean of the per-element losses.
    """
    # Implied rewards: how far the policy has moved from the reference
    # on each response.
    reward_chosen = policy_chosen_logps - ref_chosen_logps
    reward_rejected = policy_rejected_logps - ref_rejected_logps

    # The loss shrinks as the chosen reward pulls ahead of the rejected one
    # (sigmoid of the scaled margin approaches 1).
    scaled_margin = beta * (reward_chosen - reward_rejected)
    per_pair_loss = F.logsigmoid(scaled_margin).neg()
    return per_pair_loss.mean()

# Mock Data
# Chosen answer: the policy scores it slightly higher than the reference does.
pol_c, ref_c = torch.tensor([[-2.0]]), torch.tensor([[-2.1]])

# Rejected answer: the policy scores it far lower than the reference does.
pol_r, ref_r = torch.tensor([[-5.0]]), torch.tensor([[-3.0]])

# Keyword arguments make the chosen/rejected and policy/reference pairing explicit.
loss = dpo_loss(
    policy_chosen_logps=pol_c,
    policy_rejected_logps=pol_r,
    ref_chosen_logps=ref_c,
    ref_rejected_logps=ref_r,
)
print(f"DPO Loss: {loss.item():.4f}")
print("The model is rewarded for widening the gap between chosen and rejected.")

## Conclusion
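No Reward Model. No PPO. Just 4 forward passes per preference pair: the policy and the reference model, each scored on the chosen and the rejected response.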