<a href="https://colab.research.google.com/github/Janindu-Muthunayaka/model-distillation/blob/main/Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installments

In [1]:
!pip install numpy pandas scikit-learn torch

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



# Data Preparation

Splitting

In [2]:
housing = fetch_california_housing(as_frame=True)
X_House = housing.data
Y_House = housing.target

print("Feature columns:", X_House.columns.tolist())
print("\nTarget (what we predict): MedianHouseValue")

# Train-test split
xTrain, xTest, yTrain, yTest = train_test_split(
    X_House, Y_House, test_size=0.2, random_state=42
)


Feature columns: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']

Target (what we predict): MedianHouseValue


Scaling

In [3]:
# Scale features
scaler = StandardScaler()
xTrain = scaler.fit_transform(xTrain)
xTest = scaler.transform(xTest)

# Convert to torch tensors
xTrain = torch.tensor(xTrain, dtype=torch.float32)
yTrain = torch.tensor(yTrain.values, dtype=torch.float32).view(-1, 1)  # regression → float
xTest = torch.tensor(xTest, dtype=torch.float32)
yTest = torch.tensor(yTest.values, dtype=torch.float32).view(-1, 1)


# Training Teacher

In [5]:
class HouseTeacher(nn.Module):
    def __init__(self):
        super(HouseTeacher, self).__init__()
        self.fc1 = nn.Linear(8, 64)   # input: 8 features
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, 8)
        self.fc5 = nn.Linear(8, 1)    # output: 1 (house price)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = self.fc5(x)   # no activation → regression output
        return x


Training with MSE Loss

In [9]:
teacher = HouseTeacher()
criterion = nn.MSELoss()
optimizer = optim.Adam(teacher.parameters(), lr=0.001)
epochs = 20

for epoch in range(epochs):
    teacher.train()
    optimizer.zero_grad()
    outputs = teacher(xTrain)
    loss = criterion(outputs.squeeze(), yTrain)  # squeeze() to match shapes
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 2 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")


Epoch [2/20], Loss: 6.3683
Epoch [4/20], Loss: 6.3126
Epoch [6/20], Loss: 6.2576
Epoch [8/20], Loss: 6.2022
Epoch [10/20], Loss: 6.1458
Epoch [12/20], Loss: 6.0874
Epoch [14/20], Loss: 6.0265
Epoch [16/20], Loss: 5.9645
Epoch [18/20], Loss: 5.9059
Epoch [20/20], Loss: 5.8548


# Training Student

In [10]:
class HouseStudent(nn.Module):
    def __init__(self):
        super(HouseStudent, self).__init__()
        self.fc1 = nn.Linear(8, 16)   # input: 8 features
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)    # output: 1 (house price)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

Training with Distillation Loss

In [11]:
def distillation_loss_regression(student_out, teacher_out, true_labels, alpha=0.5):
    # Hard loss: student vs true labels
    hard_loss = F.mse_loss(student_out, true_labels)
    # Soft loss: student vs teacher
    soft_loss = F.mse_loss(student_out, teacher_out)
    # Weighted sum
    return alpha * hard_loss + (1 - alpha) * soft_loss

student = HouseStudent()
optimizer_s = optim.Adam(student.parameters(), lr=0.001)

for epoch in range(epochs):
    student.train()
    optimizer_s.zero_grad()

    with torch.no_grad():
        teacher_outputs = teacher(xTrain)

    student_outputs = student(xTrain)

    loss = distillation_loss_regression(student_outputs, teacher_outputs, yTrain)

    loss.backward()
    optimizer_s.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], KD Loss: {loss.item():.4f}")


Epoch [10/20], KD Loss: 3.4971
Epoch [20/20], KD Loss: 3.3592


# Testing

In [12]:
import time
import math

# 1. RMSE function
def evaluate_rmse(model, xTest, yTest):
    model.eval()
    with torch.no_grad():
        y_pred = model(xTest)
        mse = F.mse_loss(y_pred, yTest)
        rmse = torch.sqrt(mse).item()
    return rmse

# 2. Inference time function
def evaluate_inference_time(model, xTest):
    model.eval()
    with torch.no_grad():
        start_time = time.time()
        _ = model(xTest)
        end_time = time.time()
    elapsed_time = end_time - start_time
    avg_time = elapsed_time / xTest.size(0)
    return elapsed_time, avg_time

# 3. Model size function
def evaluate_model_size(model):
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    param_size_MB = num_params * 4 / (1024**2)  # 4 bytes per float32
    return num_params, param_size_MB


# Results

In [13]:
# Teacher
teacher_rmse = evaluate_rmse(teacher, xTest, yTest)
teacher_time, teacher_avg_time = evaluate_inference_time(teacher, xTest)
teacher_params, teacher_size = evaluate_model_size(teacher)

print(f"Teacher RMSE: {teacher_rmse:.4f}")
print(f"Teacher Inference Time: {teacher_time:.6f}s ({teacher_avg_time:.6f}s per sample)")
print(f"Teacher Params: {teacher_params}, Size: {teacher_size:.6f} MB")

# Student
student_rmse = evaluate_rmse(student, xTest, yTest)
student_time, student_avg_time = evaluate_inference_time(student, xTest)
student_params, student_size = evaluate_model_size(student)

print(f"Student RMSE: {student_rmse:.4f}")
print(f"Student Inference Time: {student_time:.6f}s ({student_avg_time:.6f}s per sample)")
print(f"Student Params: {student_params}, Size: {student_size:.6f} MB")


Teacher RMSE: 2.3947
Teacher Inference Time: 0.002627s (0.000001s per sample)
Teacher Params: 3329, Size: 0.012699 MB
Student RMSE: 2.5609
Student Inference Time: 0.000389s (0.000000s per sample)
Student Params: 289, Size: 0.001102 MB


In [14]:
def percent_change(student_val, teacher_val):
    return ((student_val - teacher_val) / teacher_val) * 100 if teacher_val != 0 else float('inf')

rmse_change = percent_change(student_rmse, teacher_rmse)
time_change = percent_change(student_time, teacher_time)
avg_time_change = percent_change(student_avg_time, teacher_avg_time)
params_change = percent_change(student_params, teacher_params)
size_change = percent_change(student_size, teacher_size)

print("\n--- Percentage Change (Student vs Teacher) ---")
print(f"RMSE Change: {rmse_change:.2f}%")
print(f"Total Inference Time Change: {time_change:.2f}%")
print(f"Avg Inference Time per Sample Change: {avg_time_change:.2f}%")
print(f"Params Change: {params_change:.2f}%")
print(f"Model Size Change: {size_change:.2f}%")



--- Percentage Change (Student vs Teacher) ---
RMSE Change: 6.94%
Total Inference Time Change: -85.19%
Avg Inference Time per Sample Change: -85.19%
Params Change: -91.32%
Model Size Change: -91.32%
