<a href="https://colab.research.google.com/github/Janindu-Muthunayaka/model-distillation/blob/main/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Packages

# Installments

In [1]:
!pip install numpy pandas scikit-learn torch




In [21]:
import torch
import numpy as np
import random

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# If using GPU
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False



# Data Preparation

In [2]:
epochs=10


DataSets (Wine for Classification)

In [3]:
from sklearn.datasets import load_wine,fetch_california_housing

#classification  - Wine for type of wine (0,1,2)
wine=load_wine(as_frame=True)
X_Wine=wine.data
Y_Wine=wine.target



# Data Preparation for PyTorch

In [22]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

#split
xTrain,xTest,yTrain,yTest=train_test_split(X_Wine,Y_Wine,test_size=0.2,random_state=42)

#scaling
scaler=StandardScaler()
xTrain=scaler.fit_transform(xTrain)
xTest=scaler.transform(xTest)

#convert to Pytorch tensors
xTrain=torch.tensor(xTrain,dtype=torch.float32)
yTrain=torch.tensor(yTrain.values,dtype=torch.long)
xTest=torch.tensor(xTest,dtype=torch.float32)
yTest=torch.tensor(yTest.values,dtype=torch.long)

# Create TensorDatasets
train_dataset = TensorDataset(xTrain, yTrain)
test_dataset = TensorDataset(xTest, yTest)

# Create DataLoaders
batch_size = 32
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(seed) # Use the defined seed
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False # No need to shuffle test data
)

# Teacher MLP Creation

Going to create a fresh teacher MLP

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [6]:
class wineTeacher(nn.Module):
  def __init__(self):
    super(wineTeacher,self).__init__()
    self.fc1=nn.Linear(13,64)
    self.fc2=nn.Linear(64,32)
    self.fc3=nn.Linear(32,16)
    self.fc4=nn.Linear(16,8)
    self.fc5=nn.Linear(8,3)

  def forward(self,x):
    x=F.relu(self.fc1(x))
    x=F.relu(self.fc2(x))
    x=F.relu(self.fc3(x))
    x=F.relu(self.fc4(x))
    x=self.fc5(x)
    return x

#Student MLP Creation


Going to Create fresh Student MLP

In [7]:
class wineStudent(nn.Module):
  def __init__(self):
    super(wineStudent,self).__init__()
    self.fc1=nn.Linear(13,16)
    self.fc2=nn.Linear(16,3)

  def forward(self,x):
    x=F.relu(self.fc1(x))
    x=self.fc2(x)
    return x

# Training Teacher

Training Loop


In [8]:
import torch.optim as optim

teacher = wineTeacher()  # teacher model
criterion = nn.CrossEntropyLoss()  # classification loss
optimizer = optim.Adam(teacher.parameters(), lr=0.001)
#epochs = already declare

for epoch in range(epochs):
    teacher.train()
    optimizer.zero_grad()
    outputs = teacher(xTrain)
    loss = criterion(outputs, yTrain)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:#print only every 10 eopich for cleaness
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')


Epoch [10/10], Loss: 1.0984


# Knowledge Distillation function

Since KD has a Hard loss (Real Label) and Soft Loss (Teachers Predictions)

In [9]:
def distillation_loss(student_logits, teacher_logits, labels, T=2.0, alpha=0.5):
    # Hard loss: student vs true labels
    hard_loss = F.cross_entropy(student_logits, labels)

    # Soft loss: student vs teacher (using softmax with temperature)
    soft_student = F.log_softmax(student_logits / T, dim=1)
    soft_teacher = F.softmax(teacher_logits / T, dim=1)
    soft_loss = F.kl_div(soft_student, soft_teacher, reduction="batchmean") * (T * T)

    # Weighted sum
    return alpha * hard_loss + (1 - alpha) * soft_loss


#Training Student

In [10]:
student = wineStudent()
optimizer_s = optim.Adam(student.parameters(), lr=0.001)

#epochs = already declare
for epoch in range(epochs):
    student.train()
    optimizer_s.zero_grad()

    # Teacher predictions (fixed, no gradients)
    with torch.no_grad():
        teacher_outputs = teacher(xTrain)

    # Student predictions
    student_outputs = student(xTrain)

    # Distillation loss
    loss = distillation_loss(student_outputs, teacher_outputs, yTrain, T=2.0, alpha=0.5)

    loss.backward()
    optimizer_s.step()

    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], KD Loss: {loss.item():.4f}")


Epoch [10/10], KD Loss: 0.5456


# Testing

In [11]:
import time

# 1. Accuracy function
def evaluate_accuracy(model, xTest, yTest):
    model.eval()
    with torch.no_grad():
        y_pred = model(xTest)
        predicted_classes = torch.argmax(y_pred, dim=1)
        accuracy = (predicted_classes == yTest).float().mean().item()
    return accuracy

import time

def evaluate_inference_time_stable(model, xTest, repeats=10):
    model.eval()
    # warm-up
    with torch.no_grad():
        _ = model(xTest)

    times = []
    with torch.no_grad():
        for _ in range(repeats):
            start = time.time()
            _ = model(xTest)
            end = time.time()
            times.append(end - start)
    total_time = sum(times) / repeats
    avg_time = total_time / xTest.size(0)
    return total_time, avg_time


# 3. Model size function
def evaluate_model_size(model):
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    param_size_MB = num_params * 4 / (1024**2)  # 4 bytes per float32
    return num_params, param_size_MB


# Results

In [12]:
# Teacher
teacher_acc = evaluate_accuracy(teacher, xTest, yTest)
teacher_time, teacher_avg_time = evaluate_inference_time_stable(teacher, xTest)
teacher_params, teacher_size = evaluate_model_size(teacher)

print(f"Teacher Accuracy: {teacher_acc:.4f}")
print(f"Teacher Inference Time: {teacher_time:.6f}s ({teacher_avg_time:.6f}s per sample)")
print(f"Teacher Params: {teacher_params}, Size: {teacher_size:.6f} MB")

# Student
student_acc = evaluate_accuracy(student, xTest, yTest)
student_time, student_avg_time = evaluate_inference_time_stable(student, xTest)
student_params, student_size = evaluate_model_size(student)

print(f"Student Accuracy: {student_acc:.4f}")
print(f"Student Inference Time: {student_time:.6f}s ({student_avg_time:.6f}s per sample)")
print(f"Student Params: {student_params}, Size: {student_size:.6f} MB")


Teacher Accuracy: 0.2222
Teacher Inference Time: 0.000374s (0.000010s per sample)
Teacher Params: 3667, Size: 0.013988 MB
Student Accuracy: 0.6111
Student Inference Time: 0.000050s (0.000001s per sample)
Student Params: 275, Size: 0.001049 MB


In [13]:
def percent_change(student_val, teacher_val):
    return ((student_val - teacher_val) / teacher_val) * 100 if teacher_val != 0 else float('inf')

acc_change_s = percent_change(student_acc, teacher_acc)
time_change_s = percent_change(student_time, teacher_time)
avg_time_change_s = percent_change(student_avg_time, teacher_avg_time)
params_change_s = percent_change(student_params, teacher_params)
size_change_s = percent_change(student_size, teacher_size)

print("\n--- Percentage Change (Student vs Teacher) ---")
print(f"Accuracy Change: {acc_change_s:.2f}%")
print(f"Total Inference Time Change: {time_change_s:.2f}%")
print(f"Avg Inference Time per Sample Change: {avg_time_change_s:.2f}%")
print(f"Params Change: {params_change_s:.2f}%")
print(f"Model Size Change: {size_change_s:.2f}%")


--- Percentage Change (Student vs Teacher) ---
Accuracy Change: 175.00%
Total Inference Time Change: -86.58%
Avg Inference Time per Sample Change: -86.58%
Params Change: -92.50%
Model Size Change: -92.50%


# Regular Teaching

In [14]:
class winePerson(nn.Module):
  def __init__(self):
    super(winePerson,self).__init__()
    self.fc1=nn.Linear(13,16)
    self.fc2=nn.Linear(16,3)

  def forward(self,x):
    x=F.relu(self.fc1(x))
    x=self.fc2(x)
    return x

In [15]:
person = winePerson()

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(person.parameters(), lr=0.001)



# Training loop
for epoch in range(epochs):
    person.train()
    optimizer.zero_grad()             # reset gradients
    outputs = person(xTrain)          # forward pass
    loss = criterion(outputs, yTrain) # compute loss
    loss.backward()                   # backpropagation
    optimizer.step()                  # update weights

    # print every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

Epoch [10/10], Loss: 1.0666


In [16]:
# Person metrics
person_acc = evaluate_accuracy(person, xTest, yTest)
person_time, person_avg_time = evaluate_inference_time_stable(person, xTest)
person_params, person_size = evaluate_model_size(person)

print(f"Person Accuracy: {person_acc:.4f}")
print(f"Person Inference Time: {person_time:.6f}s ({person_avg_time:.6f}s per sample)")
print(f"Person Params: {person_params}, Size: {person_size:.6f} MB")


Person Accuracy: 0.5833
Person Inference Time: 0.000103s (0.000003s per sample)
Person Params: 275, Size: 0.001049 MB


In [17]:
acc_change_p = percent_change(person_acc, teacher_acc)
time_change_p = percent_change(person_time, teacher_time)
avg_time_change_p = percent_change(person_avg_time, teacher_avg_time)
params_change_p = percent_change(person_params, teacher_params)
size_change_p = percent_change(person_size, teacher_size)

print("\n--- Percentage Change(Person vs Teacher) ---")
print(f"Accuracy Change: {acc_change_p:.2f}%")
print(f"Total Inference Time Change: {time_change_p:.2f}%")
print(f"Avg Inference Time per Sample Change: {avg_time_change_p:.2f}%")
print(f"Params Change: {params_change_p:.2f}%")
print(f"Model Size Change: {size_change_p:.2f}%")



--- Percentage Change(Person vs Teacher) ---
Accuracy Change: 162.50%
Total Inference Time Change: -72.48%
Avg Inference Time per Sample Change: -72.48%
Params Change: -92.50%
Model Size Change: -92.50%


# Model Comparism

In [18]:
print("Person & Student agains the Teacher\n")

print("Accuracy against Teacher\n")
print(f"Student: {acc_change_s:.2f}%")
print(f"Person: {acc_change_p:.2f}%")

print("\nInference Time against Teacher\n")
print(f"Student time: {time_change_s:.2f}%")
print(f"Student avg: {avg_time_change_s:.2f}%")
print(f"Perso time: {time_change_p:.2f}%")
print(f"Person avg: {avg_time_change_p:.2f}%")

print("\nModel Size against Teacher\n")
print(f"Model Size Change: {size_change_p:.2f}%")
print(f"Model Size Change: {size_change_s:.2f}%")

Person & Student agains the Teacher

Accuracy against Teacher

Student: 175.00%
Person: 162.50%

Inference Time against Teacher

Student time: -86.58%
Student avg: -86.58%
Perso time: -72.48%
Person avg: -72.48%

Model Size against Teacher

Model Size Change: -92.50%
Model Size Change: -92.50%
