<a href="https://colab.research.google.com/github/Janindu-Muthunayaka/model-distillation/blob/main/Regressionv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports and Dataset

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time

# Data Preparation
# Seed PyTorch's global RNG before any model is built: nn.Linear draws its
# initial weights from that RNG, so without a seed every "Restart & Run All"
# produces different losses, RMSEs and percentage comparisons.
SEED = 42
torch.manual_seed(SEED)

housing = fetch_california_housing(as_frame=True)
X_House = housing.data
Y_House = housing.target

# 80/20 train/test split; the same seed keeps the split fixed between runs.
xTrain, xTest, yTrain, yTest = train_test_split(
    X_House, Y_House, test_size=0.2, random_state=SEED
)

# Fit the scaler on the training features only, then reuse those statistics
# for the test features so no test information leaks into preprocessing.
scaler = StandardScaler()
xTrain = scaler.fit_transform(xTrain)
xTest = scaler.transform(xTest)

# Convert to float32 tensors. Targets are reshaped to a single column with
# view(-1, 1) so they have an explicit trailing dimension of size 1.
xTrain = torch.tensor(xTrain, dtype=torch.float32)
yTrain = torch.tensor(yTrain.values, dtype=torch.float32).view(-1, 1)
xTest = torch.tensor(xTest, dtype=torch.float32)
yTest = torch.tensor(yTest.values, dtype=torch.float32).view(-1, 1)


Teacher Model

In [2]:
# Teacher Model (significantly larger than the student defined later)
class HouseTeacher(nn.Module):
    """Fully connected regressor: 8 -> 512 -> 256 -> 128 -> 64 -> 32 -> 1.

    ReLU follows every layer except the last, which emits the raw prediction.
    """

    def __init__(self):
        super().__init__()
        # Attribute names fc1..fc6 are the keys used by state_dict().
        self.fc1 = nn.Linear(8, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 32)
        self.fc6 = nn.Linear(32, 1)

    def forward(self, x):
        """Map a batch of 8-feature rows to one prediction per row."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            hidden = F.relu(layer(hidden))
        # Output layer is linear: no activation on the regression value.
        return self.fc6(hidden)

teacher = HouseTeacher()
criterion = nn.MSELoss()
optimizer = optim.Adam(teacher.parameters(), lr=0.001)
epochs_teacher = 100  # number of full-batch gradient steps for the teacher

# Nothing inside the loop changes the module's mode, so enable training once.
teacher.train()
for epoch in range(epochs_teacher):
    # One full-batch step: clear old gradients, forward, backward, update.
    optimizer.zero_grad()
    outputs = teacher(xTrain)
    loss = criterion(outputs, yTrain)
    loss.backward()
    optimizer.step()

    # Report the training loss every tenth epoch.
    if not (epoch + 1) % 10:
        print(f"Epoch [{epoch+1}/{epochs_teacher}], Loss: {loss.item():.4f}")



Epoch [10/100], Loss: 3.6716
Epoch [20/100], Loss: 1.6253
Epoch [30/100], Loss: 0.8546
Epoch [40/100], Loss: 0.7525
Epoch [50/100], Loss: 0.6494
Epoch [60/100], Loss: 0.5904
Epoch [70/100], Loss: 0.5308
Epoch [80/100], Loss: 0.4826
Epoch [90/100], Loss: 0.4475
Epoch [100/100], Loss: 0.4226


Student Model

In [3]:
# Student Model
class HouseStudent(nn.Module):
    """Compact fully connected regressor: 8 -> 16 -> 8 -> 1 with ReLU in between."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(8, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)

    def forward(self, x):
        """Return one prediction per input row; the last layer has no activation."""
        return self.fc3(F.relu(self.fc2(F.relu(self.fc1(x)))))

def distillation_loss_regression(student_out, teacher_out, true_labels, alpha=0.5):
    """Blend a ground-truth MSE with a teacher-matching MSE for distillation.

    Args:
        student_out: Student predictions (the tensor gradients flow through).
        teacher_out: Teacher predictions used as soft targets.
        true_labels: Ground-truth regression targets.
        alpha: Weight of the ground-truth ("hard") term; the teacher ("soft")
            term is weighted by ``1 - alpha``.

    Returns:
        Scalar tensor ``alpha * hard_loss + (1 - alpha) * soft_loss``.
    """
    # The teacher is a fixed target. Detaching guarantees the soft loss never
    # back-propagates into the teacher, even if the caller did not compute
    # teacher_out under torch.no_grad().
    teacher_target = teacher_out.detach()
    hard_loss = F.mse_loss(student_out, true_labels)
    soft_loss = F.mse_loss(student_out, teacher_target)
    return alpha * hard_loss + (1 - alpha) * soft_loss

student = HouseStudent()
optimizer_s = optim.Adam(student.parameters(), lr=0.001)
epochs = 100

# The teacher is frozen and always sees the same full training batch, so its
# predictions are identical on every epoch. Compute them once instead of
# re-running the large network inside the loop.
teacher.eval()  # inference mode for the fixed teacher (it only has Linear/ReLU layers)
with torch.no_grad():
    teacher_outputs = teacher(xTrain)

student.train()
for epoch in range(epochs):
    optimizer_s.zero_grad()

    student_outputs = student(xTrain)

    # Default alpha=0.5: equal weight on ground truth and teacher targets.
    loss = distillation_loss_regression(student_outputs, teacher_outputs, yTrain)

    loss.backward()
    optimizer_s.step()

    # Report the combined distillation loss every tenth epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], KD Loss: {loss.item():.4f}")



Epoch [10/100], KD Loss: 3.7540
Epoch [20/100], KD Loss: 3.5454
Epoch [30/100], KD Loss: 3.3300
Epoch [40/100], KD Loss: 3.0981
Epoch [50/100], KD Loss: 2.8486
Epoch [60/100], KD Loss: 2.5852
Epoch [70/100], KD Loss: 2.3117
Epoch [80/100], KD Loss: 2.0335
Epoch [90/100], KD Loss: 1.7563
Epoch [100/100], KD Loss: 1.4864


Person Model (baseline: student architecture trained directly on the labels, without distillation)

In [4]:
# Person Model (same architecture as Student, trained directly)
class HousePerson(nn.Module):
    """Baseline regressor with layer sizes 8 -> 16 -> 8 -> 1 and ReLU activations."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(8, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)

    def forward(self, x):
        """Run the two hidden ReLU layers, then the linear output layer."""
        first_hidden = F.relu(self.fc1(x))
        second_hidden = F.relu(self.fc2(first_hidden))
        return self.fc3(second_hidden)

person = HousePerson()
optimizer_p = optim.Adam(person.parameters(), lr=0.001)

# Plain supervised training on the labels only, reusing the `criterion` and
# `epochs` values defined in earlier cells.
person.train()
for epoch in range(epochs):
    optimizer_p.zero_grad()
    outputs = person(xTrain)
    loss = criterion(outputs, yTrain)
    loss.backward()
    optimizer_p.step()

    # Report the training loss every tenth epoch.
    if not (epoch + 1) % 10:
        print(f"Epoch [{epoch+1}/{epochs}], Person Loss: {loss.item():.4f}")



Epoch [10/100], Person Loss: 4.6827
Epoch [20/100], Person Loss: 4.4368
Epoch [30/100], Person Loss: 4.1744
Epoch [40/100], Person Loss: 3.8878
Epoch [50/100], Person Loss: 3.5766
Epoch [60/100], Person Loss: 3.2454
Epoch [70/100], Person Loss: 2.9024
Epoch [80/100], Person Loss: 2.5614
Epoch [90/100], Person Loss: 2.2376
Epoch [100/100], Person Loss: 1.9443


Evaluation Functions

In [5]:
def evaluate_rmse(model, xTest, yTest):
    """Return the root-mean-squared error of `model` on (xTest, yTest) as a float.

    The model is switched to eval mode and the forward pass runs without
    gradient tracking.
    """
    model.eval()
    with torch.no_grad():
        predictions = model(xTest)
        return F.mse_loss(predictions, yTest).sqrt().item()

def evaluate_inference_time(model, xTest, warmup=1, repeats=1):
    """Time a forward pass of `model` over `xTest`.

    Args:
        model: Module to benchmark; it is switched to eval mode.
        xTest: Input batch; its first dimension is taken as the sample count.
        warmup: Untimed forward passes run first, so one-off first-call
            overhead is not attributed to the model.
        repeats: Timed forward passes; the reported time is their mean.
            Values below 1 are treated as 1.

    Returns:
        ``(elapsed_time, avg_time)``: seconds for one pass over the whole
        batch, and seconds per sample. ``avg_time`` is NaN for an empty batch,
        where a per-sample time is undefined.
    """
    repeats = max(1, int(repeats))
    model.eval()
    with torch.no_grad():
        for _ in range(max(0, int(warmup))):
            model(xTest)
        # perf_counter is the monotonic, highest-resolution clock for short
        # intervals; time.time() is wall-clock and can be coarse or jump.
        start_time = time.perf_counter()
        for _ in range(repeats):
            _ = model(xTest)
        end_time = time.perf_counter()
    elapsed_time = (end_time - start_time) / repeats
    num_samples = xTest.size(0)
    # Guard the division: an empty batch used to raise ZeroDivisionError.
    avg_time = elapsed_time / num_samples if num_samples else float('nan')
    return elapsed_time, avg_time

def evaluate_model_size(model):
    """Count trainable parameters and the memory they occupy.

    Only parameters with ``requires_grad=True`` are counted.

    Returns:
        ``(num_params, param_size_MB)``: the number of trainable scalar
        parameters and their total storage in MiB (bytes / 1024**2).
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    num_params = sum(p.numel() for p in trainable)
    # Use each tensor's real element width instead of assuming 4-byte float32,
    # so float16/float64 models are sized correctly.
    size_bytes = sum(p.numel() * p.element_size() for p in trainable)
    param_size_MB = size_bytes / (1024**2)
    return num_params, param_size_MB

Evaluations and Direct Output

In [7]:
# Evaluate
def _collect_metrics(model):
    """Return (rmse, total_time, avg_time, num_params, size_MB) for one model on the test split."""
    rmse = evaluate_rmse(model, xTest, yTest)
    total_time, avg_time = evaluate_inference_time(model, xTest)
    num_params, size_mb = evaluate_model_size(model)
    return rmse, total_time, avg_time, num_params, size_mb

# Gather every metric for all three models before printing anything.
teacher_rmse, teacher_time, teacher_avg_time, teacher_params, teacher_size = _collect_metrics(teacher)
student_rmse, student_time, student_avg_time, student_params, student_size = _collect_metrics(student)
person_rmse, person_time, person_avg_time, person_params, person_size = _collect_metrics(person)

# Teacher stats
print(f"Teacher RMSE: {teacher_rmse:.4f}")
print(f"Teacher Inference Time: {teacher_time:.6f}s ({teacher_avg_time:.6f}s per sample)")
print(f"Teacher Params: {teacher_params}, Size: {teacher_size:.6f} MB")

# Student stats
print(f"Student RMSE: {student_rmse:.4f}")
print(f"Student Inference Time: {student_time:.6f}s ({student_avg_time:.6f}s per sample)")
print(f"Student Params: {student_params}, Size: {student_size:.6f} MB")

# Person stats
print(f"Person RMSE: {person_rmse:.4f}")
print(f"Person Inference Time: {person_time:.6f}s ({person_avg_time:.6f}s per sample)")
print(f"Person Params: {person_params}, Size: {person_size:.6f} MB")


Teacher RMSE: 0.6593
Teacher Inference Time: 0.031752s (0.000008s per sample)
Teacher Params: 179201, Size: 0.683598 MB
Student RMSE: 1.2920
Student Inference Time: 0.000346s (0.000000s per sample)
Student Params: 289, Size: 0.001102 MB
Person RMSE: 1.3714
Person Inference Time: 0.000286s (0.000000s per sample)
Person Params: 289, Size: 0.001102 MB


Results: Percentage Comparisons

In [8]:
# Percentage change of a metric relative to the teacher baseline
def percent_change(student_val, teacher_val):
    """Return the change from `teacher_val` to `student_val` as a percentage.

    Computed as ``(student_val - teacher_val) / teacher_val * 100``.

    With a zero baseline the ratio is undefined, so the result follows the
    direction of the change: ``0.0`` if the values are equal, ``+inf`` if
    `student_val` is larger, ``-inf`` if it is smaller. Previously this case
    always returned ``+inf``, even for no change or a decrease.
    """
    difference = student_val - teacher_val
    if teacher_val == 0:
        if difference == 0:
            return 0.0
        return float('inf') if difference > 0 else float('-inf')
    return (difference / teacher_val) * 100

# Build each report as a list of lines and emit it with a single print; the
# text written to stdout is the same as printing the lines one by one.
student_report = [
    "\n--- Percentage Change (Student vs Teacher) ---",
    f"RMSE Change: {percent_change(student_rmse, teacher_rmse):.2f}%",
    f"Total Inference Time Change: {percent_change(student_time, teacher_time):.2f}%",
    f"Avg Inference Time per Sample Change: {percent_change(student_avg_time, teacher_avg_time):.2f}%",
    f"Params Change: {percent_change(student_params, teacher_params):.2f}%",
    f"Model Size Change: {percent_change(student_size, teacher_size):.2f}%",
]
print("\n".join(student_report))

# Person vs Teacher percentages
person_report = [
    "\n--- Percentage Change (Person vs Teacher) ---",
    f"RMSE Change: {percent_change(person_rmse, teacher_rmse):.2f}%",
    f"Total Inference Time Change: {percent_change(person_time, teacher_time):.2f}%",
    f"Avg Inference Time per Sample Change: {percent_change(person_avg_time, teacher_avg_time):.2f}%",
    f"Params Change: {percent_change(person_params, teacher_params):.2f}%",
    f"Model Size Change: {percent_change(person_size, teacher_size):.2f}%",
]
print("\n".join(person_report))


--- Percentage Change (Student vs Teacher) ---
RMSE Change: 95.96%
Total Inference Time Change: -98.91%
Avg Inference Time per Sample Change: -98.91%
Params Change: -99.84%
Model Size Change: -99.84%

--- Percentage Change (Person vs Teacher) ---
RMSE Change: 108.01%
Total Inference Time Change: -99.10%
Avg Inference Time per Sample Change: -99.10%
Params Change: -99.84%
Model Size Change: -99.84%
