# Implementation: Teacher-Student Loop

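**Goal**: Train a student model to mimic a teacher by minimizing the KL divergence between their temperature-softened output distributions. The cell below computes this distillation loss for a single mock example.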

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# 1. Hyperparameters
T = 2.0  # Softmax temperature applied to both teacher and student logits
alpha = 0.5  # Not used in this cell; presumably the soft/hard loss mixing weight -- TODO confirm

# 2. Mock teacher logits (soft-target source): one example, three classes.
# Plain softmax gives roughly [0.018, 0.965, 0.018]; softened with T = 2 it
# becomes roughly [0.107, 0.787, 0.107].
teacher_logits = torch.tensor([[1.0, 5.0, 1.0]])

# 3. Mock untrained student: all-zero logits, i.e. a uniform [1/3, 1/3, 1/3]
# prediction. requires_grad=True so the loss can be backpropagated into it.
student_logits = torch.zeros(1, 3, requires_grad=True)

# 4. Distillation loss
# The KL-divergence loss takes the input (student) as log-probabilities and
# the target (teacher) as plain probabilities.
teacher_probs = (teacher_logits / T).softmax(dim=1)
student_log_probs = (student_logits / T).log_softmax(dim=1)

# 'batchmean' divides the summed divergence by the batch size. The T ** 2
# factor compensates for soft-target gradients shrinking roughly as 1 / T ** 2.
distillation_loss = (T ** 2) * F.kl_div(
    student_log_probs,
    teacher_probs,
    reduction='batchmean',
)

print(f"Distillation Loss: {distillation_loss.item()}")

## Conclusion
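Minimizing this loss pushes the student's temperature-softened probability distribution toward the teacher's, so the student learns from the teacher's relative confidence across all classes rather than only from its top prediction.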