<a href="https://colab.research.google.com/github/Janindu-Muthunayaka/model-distillation/blob/main/Regressionv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and DataSet

In [8]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time

# Data Preparation
# Load the California housing data as pandas objects (features frame + target series).
housing = fetch_california_housing(as_frame=True)
X_House, Y_House = housing.data, housing.target

# Hold out 20% of the rows for testing; random_state pins the split.
xTrain, xTest, yTrain, yTest = train_test_split(
    X_House, Y_House, random_state=42, test_size=0.2
)

# Standardise features: statistics are fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
xTrain, xTest = scaler.fit_transform(xTrain), scaler.transform(xTest)

# Convert everything to float32 tensors; targets become column vectors (N, 1)
# so they line up with the models' single-output predictions.
xTrain = torch.tensor(xTrain, dtype=torch.float32)
xTest = torch.tensor(xTest, dtype=torch.float32)
yTrain = torch.tensor(yTrain.values, dtype=torch.float32).unsqueeze(1)
yTest = torch.tensor(yTest.values, dtype=torch.float32).unsqueeze(1)


# Teacher Model

In [9]:
# Teacher Model (significantly larger)
class HouseTeacher(nn.Module):
    """Fully connected regressor: 8 -> 512 -> 256 -> 128 -> 64 -> 32 -> 1.

    ReLU is applied after every layer except the last, which returns the raw
    single-value prediction.
    """

    def __init__(self):
        super().__init__()
        # Layers are registered as fc1..fc6, in that order, from consecutive
        # pairs of widths.
        widths = (8, 512, 256, 128, 64, 32, 1)
        for idx, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"fc{idx}", nn.Linear(n_in, n_out))

    def forward(self, x):
        """Run ``x`` (last dimension of size 8) through the network."""
        # fc1..fc5 are each followed by ReLU; fc6 is the linear output head.
        for idx in range(1, 6):
            x = F.relu(getattr(self, f"fc{idx}")(x))
        return self.fc6(x)

# Seed PyTorch's RNG so the teacher's initial weights, and therefore the loss
# curve printed below, are the same on every Restart & Run All.
torch.manual_seed(42)

teacher = HouseTeacher()
criterion = nn.MSELoss()
optimizer = optim.Adam(teacher.parameters(), lr=0.001)
epochs_teacher = 500

# Full-batch training: each epoch is one optimiser step on the whole of xTrain.
# The mode never changes inside the loop, so train() is set once up front.
teacher.train()
for epoch in range(epochs_teacher):
    optimizer.zero_grad()
    outputs = teacher(xTrain)
    loss = criterion(outputs, yTrain)
    loss.backward()
    optimizer.step()

    # Report the training loss every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs_teacher}], Loss: {loss.item():.4f}")



Epoch [10/500], Loss: 3.1027
Epoch [20/500], Loss: 1.3881
Epoch [30/500], Loss: 0.8496
Epoch [40/500], Loss: 0.7184
Epoch [50/500], Loss: 0.6139
Epoch [60/500], Loss: 0.5431
Epoch [70/500], Loss: 0.4836
Epoch [80/500], Loss: 0.4437
Epoch [90/500], Loss: 0.4181
Epoch [100/500], Loss: 0.4000
Epoch [110/500], Loss: 0.3852
Epoch [120/500], Loss: 0.3722
Epoch [130/500], Loss: 0.3604
Epoch [140/500], Loss: 0.3490
Epoch [150/500], Loss: 0.3387
Epoch [160/500], Loss: 0.3291
Epoch [170/500], Loss: 0.3189
Epoch [180/500], Loss: 0.3188
Epoch [190/500], Loss: 0.3093
Epoch [200/500], Loss: 0.3011
Epoch [210/500], Loss: 0.2959
Epoch [220/500], Loss: 0.2910
Epoch [230/500], Loss: 0.2864
Epoch [240/500], Loss: 0.2849
Epoch [250/500], Loss: 0.2839
Epoch [260/500], Loss: 0.2793
Epoch [270/500], Loss: 0.2738
Epoch [280/500], Loss: 0.2706
Epoch [290/500], Loss: 0.2678
Epoch [300/500], Loss: 0.3049
Epoch [310/500], Loss: 0.2699
Epoch [320/500], Loss: 0.2629
Epoch [330/500], Loss: 0.2598
Epoch [340/500], Lo

# Student Model

In [10]:
# Student Model
class HouseStudent(nn.Module):
    """Compact fully connected regressor: 8 -> 16 -> 8 -> 1, ReLU between layers."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(8, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)

    def forward(self, x):
        """Return the raw single-value prediction for ``x`` (last dimension of size 8)."""
        first_hidden = F.relu(self.fc1(x))
        second_hidden = F.relu(self.fc2(first_hidden))
        return self.fc3(second_hidden)

def distillation_loss_regression(student_out, teacher_out, true_labels, alpha=0.5):
    """Blend ground-truth MSE and teacher-matching MSE for regression distillation.

    Args:
        student_out: predictions of the student model.
        teacher_out: predictions of the teacher model for the same inputs.
        true_labels: ground-truth targets.
        alpha: weight of the ground-truth ("hard") term; the teacher ("soft")
            term gets ``1 - alpha``. Must lie in [0, 1].

    Returns:
        Scalar tensor ``alpha * hard_loss + (1 - alpha) * soft_loss``.

    Raises:
        ValueError: if ``alpha`` is outside [0, 1].
    """
    if not 0.0 <= alpha <= 1.0:
        raise ValueError(f"alpha must be in [0, 1], got {alpha}")

    hard_loss = F.mse_loss(student_out, true_labels)
    # The teacher is a fixed target: detach so backward() never pushes
    # gradients into it, even if the caller forgot torch.no_grad().
    soft_loss = F.mse_loss(student_out, teacher_out.detach())
    return alpha * hard_loss + (1 - alpha) * soft_loss


# Seed PyTorch's RNG so the student's initial weights are reproducible when
# this cell is re-run.
torch.manual_seed(42)

student = HouseStudent()
optimizer_s = optim.Adam(student.parameters(), lr=0.001)
epochs = 500

# The teacher is not updated here and xTrain is constant, so its predictions
# are identical in every epoch: compute them once, in eval mode and without
# gradient tracking, instead of re-running the large network 500 times.
teacher.eval()
with torch.no_grad():
    teacher_outputs = teacher(xTrain)

# Full-batch distillation: one optimiser step per epoch on the whole of xTrain.
student.train()
for epoch in range(epochs):
    optimizer_s.zero_grad()

    student_outputs = student(xTrain)

    loss = distillation_loss_regression(student_outputs, teacher_outputs, yTrain)

    loss.backward()
    optimizer_s.step()

    # Report the combined distillation loss every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], KD Loss: {loss.item():.4f}")



Epoch [10/500], KD Loss: 5.0915
Epoch [20/500], KD Loss: 4.8967
Epoch [30/500], KD Loss: 4.7156
Epoch [40/500], KD Loss: 4.5344
Epoch [50/500], KD Loss: 4.3393
Epoch [60/500], KD Loss: 4.1189
Epoch [70/500], KD Loss: 3.8600
Epoch [80/500], KD Loss: 3.5461
Epoch [90/500], KD Loss: 3.1827
Epoch [100/500], KD Loss: 2.7883
Epoch [110/500], KD Loss: 2.3848
Epoch [120/500], KD Loss: 2.0009
Epoch [130/500], KD Loss: 1.6565
Epoch [140/500], KD Loss: 1.3638
Epoch [150/500], KD Loss: 1.1281
Epoch [160/500], KD Loss: 0.9484
Epoch [170/500], KD Loss: 0.8199
Epoch [180/500], KD Loss: 0.7337
Epoch [190/500], KD Loss: 0.6773
Epoch [200/500], KD Loss: 0.6393
Epoch [210/500], KD Loss: 0.6120
Epoch [220/500], KD Loss: 0.5912
Epoch [230/500], KD Loss: 0.5743
Epoch [240/500], KD Loss: 0.5599
Epoch [250/500], KD Loss: 0.5471
Epoch [260/500], KD Loss: 0.5355
Epoch [270/500], KD Loss: 0.5245
Epoch [280/500], KD Loss: 0.5139
Epoch [290/500], KD Loss: 0.5032
Epoch [300/500], KD Loss: 0.4908
Epoch [310/500], KD

# Person Model

In [11]:
# Person Model (same architecture as Student, trained directly)
class HousePerson(nn.Module):
    """Small fully connected regressor (8 -> 16 -> 8 -> 1) with ReLU hidden activations."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(8, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)

    def forward(self, x):
        """Map ``x`` (last dimension of size 8) to a single raw prediction."""
        # Both hidden layers are followed by ReLU; the output layer is linear.
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return self.fc3(x)

# Seed PyTorch's RNG so the initial weights of this directly-trained baseline
# are reproducible when the cell is re-run.
torch.manual_seed(42)

person = HousePerson()
optimizer_p = optim.Adam(person.parameters(), lr=0.001)

# Full-batch training on the ground-truth targets only (no teacher signal).
# The mode never changes inside the loop, so train() is set once up front.
person.train()
for epoch in range(epochs):
    optimizer_p.zero_grad()
    outputs = person(xTrain)
    loss = criterion(outputs, yTrain)
    loss.backward()
    optimizer_p.step()

    # Report the training loss every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Person Loss: {loss.item():.4f}")



Epoch [10/500], Person Loss: 5.0393
Epoch [20/500], Person Loss: 4.7923
Epoch [30/500], Person Loss: 4.5488
Epoch [40/500], Person Loss: 4.2977
Epoch [50/500], Person Loss: 4.0300
Epoch [60/500], Person Loss: 3.7399
Epoch [70/500], Person Loss: 3.4301
Epoch [80/500], Person Loss: 3.1093
Epoch [90/500], Person Loss: 2.7824
Epoch [100/500], Person Loss: 2.4559
Epoch [110/500], Person Loss: 2.1395
Epoch [120/500], Person Loss: 1.8447
Epoch [130/500], Person Loss: 1.5829
Epoch [140/500], Person Loss: 1.3637
Epoch [150/500], Person Loss: 1.1914
Epoch [160/500], Person Loss: 1.0646
Epoch [170/500], Person Loss: 0.9764
Epoch [180/500], Person Loss: 0.9163
Epoch [190/500], Person Loss: 0.8737
Epoch [200/500], Person Loss: 0.8422
Epoch [210/500], Person Loss: 0.8180
Epoch [220/500], Person Loss: 0.7982
Epoch [230/500], Person Loss: 0.7812
Epoch [240/500], Person Loss: 0.7660
Epoch [250/500], Person Loss: 0.7517
Epoch [260/500], Person Loss: 0.7378
Epoch [270/500], Person Loss: 0.7241
Epoch [280

# Evaluation Functions

In [12]:
def evaluate_rmse(model, xTest, yTest):
    """Return the root-mean-squared error of ``model`` on ``(xTest, yTest)``.

    The model is switched to eval mode and the forward pass runs without
    gradient tracking. The result is a plain Python float.
    """
    model.eval()
    with torch.no_grad():
        predictions = model(xTest)
        return F.mse_loss(predictions, yTest).sqrt().item()

def evaluate_inference_time(model, xTest):
    """Time a single forward pass of ``model`` over the whole of ``xTest``.

    The model is switched to eval mode and the pass runs under
    ``torch.no_grad()``.

    Args:
        model: the module to time.
        xTest: input batch; its first dimension is used as the sample count.

    Returns:
        ``(elapsed_time, avg_time)``: seconds for the whole batch, and seconds
        per sample. ``avg_time`` is NaN when the batch contains no samples.
    """
    model.eval()
    with torch.no_grad():
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring short durations; time.time() can jump with wall-clock
        # adjustments and may be too coarse for sub-millisecond passes.
        start_time = time.perf_counter()
        _ = model(xTest)
        end_time = time.perf_counter()
    elapsed_time = end_time - start_time

    # Avoid ZeroDivisionError on an empty batch: the average is undefined.
    n_samples = xTest.size(0)
    avg_time = elapsed_time / n_samples if n_samples else float("nan")
    return elapsed_time, avg_time

def evaluate_model_size(model):
    """Count trainable parameters and the memory their values occupy.

    Only parameters with ``requires_grad=True`` are counted. The size uses each
    parameter's actual element width, so it is correct for float16/float64
    models as well as float32 (4 bytes per value).

    Returns:
        ``(num_params, param_size_MB)``: number of trainable scalar parameters
        and their total size in MiB (bytes / 1024**2).
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    num_params = sum(p.numel() for p in trainable)
    size_bytes = sum(p.numel() * p.element_size() for p in trainable)
    param_size_MB = size_bytes / (1024**2)
    return num_params, param_size_MB

# Evaluations and Direct Output

In [13]:
# Evaluate each model on the held-out test set. The per-model variables are
# kept as-is because the percentage-comparison cell reads them by name.
teacher_rmse = evaluate_rmse(teacher, xTest, yTest)
teacher_time, teacher_avg_time = evaluate_inference_time(teacher, xTest)
teacher_params, teacher_size = evaluate_model_size(teacher)

student_rmse = evaluate_rmse(student, xTest, yTest)
student_time, student_avg_time = evaluate_inference_time(student, xTest)
student_params, student_size = evaluate_model_size(student)

person_rmse = evaluate_rmse(person, xTest, yTest)
person_time, person_avg_time = evaluate_inference_time(person, xTest)
person_params, person_size = evaluate_model_size(person)

# Print the same three-line report for every model.
for _label, _rmse, _total_time, _avg_time, _n_params, _size_mb in (
    ("Teacher", teacher_rmse, teacher_time, teacher_avg_time, teacher_params, teacher_size),
    ("Student", student_rmse, student_time, student_avg_time, student_params, student_size),
    ("Person", person_rmse, person_time, person_avg_time, person_params, person_size),
):
    print(f"{_label} RMSE: {_rmse:.4f}")
    # Per-sample latency can be well below 1e-6 s, which ".6f" rounds to
    # 0.000000; scientific notation keeps the value readable.
    print(f"{_label} Inference Time: {_total_time:.6f}s ({_avg_time:.3e}s per sample)")
    print(f"{_label} Params: {_n_params}, Size: {_size_mb:.6f} MB")


Teacher RMSE: 0.5270
Teacher Inference Time: 0.033269s (0.000008s per sample)
Teacher Params: 179201, Size: 0.683598 MB
Student RMSE: 0.6942
Student Inference Time: 0.000320s (0.000000s per sample)
Student Params: 289, Size: 0.001102 MB
Person RMSE: 0.6974
Person Inference Time: 0.000436s (0.000000s per sample)
Person Params: 289, Size: 0.001102 MB


# Results: Percentage Comparisons

In [14]:
# Student vs Teacher percentages
def percent_change(student_val, teacher_val):
    """Return the change of ``student_val`` relative to ``teacher_val``, in percent.

    A positive result means ``student_val`` is larger than the baseline.

    With a zero baseline the relative change is undefined, so the result is:
    ``0.0`` when both values are zero (nothing changed), otherwise an infinity
    carrying the sign of ``student_val``.
    """
    if teacher_val == 0:
        if student_val == 0:
            return 0.0
        return float('inf') if student_val > 0 else float('-inf')
    return ((student_val - teacher_val) / teacher_val) * 100

# Report every metric of each small model relative to the teacher baseline.
# The metric labels and the value tuples below share the same order.
_metric_labels = (
    "RMSE Change",
    "Total Inference Time Change",
    "Avg Inference Time per Sample Change",
    "Params Change",
    "Model Size Change",
)
_teacher_metrics = (teacher_rmse, teacher_time, teacher_avg_time, teacher_params, teacher_size)

for _model_name, _model_metrics in (
    ("Student", (student_rmse, student_time, student_avg_time, student_params, student_size)),
    ("Person", (person_rmse, person_time, person_avg_time, person_params, person_size)),
):
    print(f"\n--- Percentage Change ({_model_name} vs Teacher) ---")
    for _metric_label, _value, _baseline in zip(_metric_labels, _model_metrics, _teacher_metrics):
        print(f"{_metric_label}: {percent_change(_value, _baseline):.2f}%")


--- Percentage Change (Student vs Teacher) ---
RMSE Change: 31.73%
Total Inference Time Change: -99.04%
Avg Inference Time per Sample Change: -99.04%
Params Change: -99.84%
Model Size Change: -99.84%

--- Percentage Change (Person vs Teacher) ---
RMSE Change: 32.34%
Total Inference Time Change: -98.69%
Avg Inference Time per Sample Change: -98.69%
Params Change: -99.84%
Model Size Change: -99.84%
