<a href="https://colab.research.google.com/github/Janindu-Muthunayaka/model-distillation/blob/main/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Packages

# Installments

In [262]:
!pip install numpy pandas scikit-learn torch




# Data Preparation

In [263]:
epochs=70


DataSets (Wine for Classification)

In [264]:
from sklearn.datasets import load_wine,fetch_california_housing

#classification  - Wine for type of wine (0,1,2)
wine=load_wine(as_frame=True)
X_Wine=wine.data
Y_Wine=wine.target



# Data Preparation for PyTorch

In [265]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#split
xTrain,xTest,yTrain,yTest=train_test_split(X_Wine,Y_Wine,test_size=0.2,random_state=42)

#scaling
scaler=StandardScaler()
xTrain=scaler.fit_transform(xTrain)
xTest=scaler.transform(xTest)

#convert top Pytorch tensors
xTrain=torch.tensor(xTrain,dtype=torch.float32)
yTrain=torch.tensor(yTrain.values,dtype=torch.long)
xTest=torch.tensor(xTest,dtype=torch.float32)
yTest=torch.tensor(yTest.values,dtype=torch.long)






# Teacher MLP Creation

Going to create a fresh teacher MLP

In [266]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [267]:
class wineTeacher(nn.Module):
  def __init__(self):
    super(wineTeacher,self).__init__()
    self.fc1=nn.Linear(13,64)
    self.fc2=nn.Linear(64,32)
    self.fc3=nn.Linear(32,16)
    self.fc4=nn.Linear(16,8)
    self.fc5=nn.Linear(8,3)

  def forward(self,x):
    x=F.relu(self.fc1(x))
    x=F.relu(self.fc2(x))
    x=F.relu(self.fc3(x))
    x=F.relu(self.fc4(x))
    x=self.fc5(x)
    return x

#Student MLP Creation


Going to Create fresh Student MLP

In [268]:
class wineStudent(nn.Module):
  def __init__(self):
    super(wineStudent,self).__init__()
    self.fc1=nn.Linear(13,16)
    self.fc2=nn.Linear(16,3)

  def forward(self,x):
    x=F.relu(self.fc1(x))
    x=self.fc2(x)
    return x

# Training Teacher

Training Loop


In [269]:
import torch.optim as optim

teacher = wineTeacher()  # teacher model
criterion = nn.CrossEntropyLoss()  # classification loss
optimizer = optim.Adam(teacher.parameters(), lr=0.001)
#epochs = already declare

for epoch in range(epochs):
    teacher.train()
    optimizer.zero_grad()
    outputs = teacher(xTrain)
    loss = criterion(outputs, yTrain)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:#print only every 10 eopich for cleaness
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')


Epoch [10/70], Loss: 1.1102
Epoch [20/70], Loss: 1.0607
Epoch [30/70], Loss: 0.9776
Epoch [40/70], Loss: 0.8352
Epoch [50/70], Loss: 0.6388
Epoch [60/70], Loss: 0.4233
Epoch [70/70], Loss: 0.2385


# Knowledge Distillation function

Since KD has a Hard loss (Real Label) and Soft Loss (Teachers Predictions)

In [270]:
def distillation_loss(student_logits, teacher_logits, labels, T=2.0, alpha=0.5):
    # Hard loss: student vs true labels
    hard_loss = F.cross_entropy(student_logits, labels)

    # Soft loss: student vs teacher (using softmax with temperature)
    soft_student = F.log_softmax(student_logits / T, dim=1)
    soft_teacher = F.softmax(teacher_logits / T, dim=1)
    soft_loss = F.kl_div(soft_student, soft_teacher, reduction="batchmean") * (T * T)

    # Weighted sum
    return alpha * hard_loss + (1 - alpha) * soft_loss


#Training Student

In [271]:
student = wineStudent()
optimizer_s = optim.Adam(student.parameters(), lr=0.001)

#epochs = already declare
for epoch in range(epochs):
    student.train()
    optimizer_s.zero_grad()

    # Teacher predictions (fixed, no gradients)
    with torch.no_grad():
        teacher_outputs = teacher(xTrain)

    # Student predictions
    student_outputs = student(xTrain)

    # Distillation loss
    loss = distillation_loss(student_outputs, teacher_outputs, yTrain, T=2.0, alpha=0.5)

    loss.backward()
    optimizer_s.step()

    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], KD Loss: {loss.item():.4f}")


Epoch [10/70], KD Loss: 1.0964
Epoch [20/70], KD Loss: 1.0275
Epoch [30/70], KD Loss: 0.9620
Epoch [40/70], KD Loss: 0.8976
Epoch [50/70], KD Loss: 0.8321
Epoch [60/70], KD Loss: 0.7645
Epoch [70/70], KD Loss: 0.6961


# Testing

In [272]:
import time

# 1. Accuracy function
def evaluate_accuracy(model, xTest, yTest):
    model.eval()
    with torch.no_grad():
        y_pred = model(xTest)
        predicted_classes = torch.argmax(y_pred, dim=1)
        accuracy = (predicted_classes == yTest).float().mean().item()
    return accuracy

import time

def evaluate_inference_time_stable(model, xTest, repeats=10):
    model.eval()
    # warm-up
    with torch.no_grad():
        _ = model(xTest)

    times = []
    with torch.no_grad():
        for _ in range(repeats):
            start = time.time()
            _ = model(xTest)
            end = time.time()
            times.append(end - start)
    total_time = sum(times) / repeats
    avg_time = total_time / xTest.size(0)
    return total_time, avg_time


# 3. Model size function
def evaluate_model_size(model):
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    param_size_MB = num_params * 4 / (1024**2)  # 4 bytes per float32
    return num_params, param_size_MB


# Results

In [273]:
# Teacher
teacher_acc = evaluate_accuracy(teacher, xTest, yTest)
teacher_time, teacher_avg_time = evaluate_inference_time(teacher, xTest)
teacher_params, teacher_size = evaluate_model_size(teacher)

print(f"Teacher Accuracy: {teacher_acc:.4f}")
print(f"Teacher Inference Time: {teacher_time:.6f}s ({teacher_avg_time:.6f}s per sample)")
print(f"Teacher Params: {teacher_params}, Size: {teacher_size:.6f} MB")

# Student
student_acc = evaluate_accuracy(student, xTest, yTest)
student_time, student_avg_time = evaluate_inference_time(student, xTest)
student_params, student_size = evaluate_model_size(student)

print(f"Student Accuracy: {student_acc:.4f}")
print(f"Student Inference Time: {student_time:.6f}s ({student_avg_time:.6f}s per sample)")
print(f"Student Params: {student_params}, Size: {student_size:.6f} MB")


Teacher Accuracy: 0.9722
Teacher Inference Time: 0.000290s (0.000008s per sample)
Teacher Params: 3667, Size: 0.013988 MB
Student Accuracy: 0.9444
Student Inference Time: 0.000123s (0.000003s per sample)
Student Params: 275, Size: 0.001049 MB


In [274]:
def percent_change(student_val, teacher_val):
    return ((student_val - teacher_val) / teacher_val) * 100 if teacher_val != 0 else float('inf')

acc_change = percent_change(student_acc, teacher_acc)
time_change = percent_change(student_time, teacher_time)
avg_time_change = percent_change(student_avg_time, teacher_avg_time)
params_change = percent_change(student_params, teacher_params)
size_change = percent_change(student_size, teacher_size)

print("\n--- Percentage Change (Student vs Teacher) ---")
print(f"Accuracy Change: {acc_change:.2f}%")
print(f"Total Inference Time Change: {time_change:.2f}%")
print(f"Avg Inference Time per Sample Change: {avg_time_change:.2f}%")
print(f"Params Change: {params_change:.2f}%")
print(f"Model Size Change: {size_change:.2f}%")


--- Percentage Change (Student vs Teacher) ---
Accuracy Change: -2.86%
Total Inference Time Change: -57.53%
Avg Inference Time per Sample Change: -57.53%
Params Change: -92.50%
Model Size Change: -92.50%


# Regular Teaching

In [275]:
class winePerson(nn.Module):
  def __init__(self):
    super(winePerson,self).__init__()
    self.fc1=nn.Linear(13,16)
    self.fc2=nn.Linear(16,3)

  def forward(self,x):
    x=F.relu(self.fc1(x))
    x=self.fc2(x)
    return x

In [276]:
person = winePerson()

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(person.parameters(), lr=0.001)



# Training loop
for epoch in range(epochs):
    person.train()
    optimizer.zero_grad()             # reset gradients
    outputs = person(xTrain)          # forward pass
    loss = criterion(outputs, yTrain) # compute loss
    loss.backward()                   # backpropagation
    optimizer.step()                  # update weights

    # print every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [10/70], Loss: 1.0486
Epoch [20/70], Loss: 0.9688
Epoch [30/70], Loss: 0.8937
Epoch [40/70], Loss: 0.8223
Epoch [50/70], Loss: 0.7548
Epoch [60/70], Loss: 0.6898
Epoch [70/70], Loss: 0.6281


In [277]:
# Person metrics
person_acc = evaluate_accuracy(person, xTest, yTest)
person_time, person_avg_time = evaluate_inference_time(person, xTest)
person_params, person_size = evaluate_model_size(person)

print(f"Person Accuracy: {person_acc:.4f}")
print(f"Person Inference Time: {person_time:.6f}s ({person_avg_time:.6f}s per sample)")
print(f"Person Params: {person_params}, Size: {person_size:.6f} MB")


Person Accuracy: 0.8611
Person Inference Time: 0.000075s (0.000002s per sample)
Person Params: 275, Size: 0.001049 MB


In [278]:
acc_change = -1*(percent_change(person_acc, student_acc))
time_change = percent_change(person_time, student_time)
avg_time_change = percent_change(person_avg_time, student_avg_time)
params_change = percent_change(person_params, student_params)
size_change = percent_change(person_size, student_size)

print("\n--- Percentage Change (Student against Person) ---")
print(f"Accuracy Change: {acc_change:.2f}%")
print(f"Total Inference Time Change: {time_change:.2f}%")
print(f"Avg Inference Time per Sample Change: {avg_time_change:.2f}%")
print(f"Params Change: {params_change:.2f}%")
print(f"Model Size Change: {size_change:.2f}%")



--- Percentage Change (Student against Person) ---
Accuracy Change: 8.82%
Total Inference Time Change: -38.76%
Avg Inference Time per Sample Change: -38.76%
Params Change: 0.00%
Model Size Change: 0.00%
