Pip Install

In [None]:
!pip install torch torchvision numpy matplotlib

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

Imports and Initialization

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import time
import copy
import os
import gc
import torch.utils.mobile_optimizer as mobile_optimizer
from torch.nn.utils import prune


# Seed NumPy and PyTorch so repeated runs draw the same random numbers
np.random.seed(42)
torch.manual_seed(42)


# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
   device = torch.device("cuda:0")
else:
   device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda:0


Model Definition

In [4]:
# Define a simple CNN model
class SimpleCNN(nn.Module):
   """Small convolutional classifier: two conv blocks followed by two linear layers.

   Each 3x3 convolution (padding 1) keeps the spatial size and is followed by
   ReLU and a 2x2 max-pool that halves it. ``fc1`` is sized for a 7x7 feature
   map, which is what two poolings produce from 28x28 inputs such as MNIST.

   Args:
       in_channels: Number of channels of the input images.
       conv1_channels: Output channels of the first convolution.
       conv2_channels: Output channels of the second convolution.
       fc1_units: Width of the hidden fully connected layer.
       fc2_units: Number of output logits.
   """

   def __init__(self, in_channels=1, conv1_channels=32, conv2_channels=64, fc1_units=128, fc2_units=10):
       super().__init__()
       self.conv1 = nn.Conv2d(in_channels, conv1_channels, kernel_size=3, padding=1)
       self.conv2 = nn.Conv2d(conv1_channels, conv2_channels, kernel_size=3, padding=1)
       self.pool = nn.MaxPool2d(2, 2)
       self.fc1 = nn.Linear(conv2_channels * 7 * 7, fc1_units)
       self.fc2 = nn.Linear(fc1_units, fc2_units)
       self.relu = nn.ReLU()


   def forward(self, x):
       """Return logits of shape ``(batch, fc2_units)`` for a batch of images.

       Raises:
           RuntimeError: If the pooled feature map does not match the input
               width expected by ``fc1`` (e.g. images that are not 28x28).
       """
       x = self.pool(self.relu(self.conv1(x)))
       x = self.pool(self.relu(self.conv2(x)))
       # Flatten everything except the batch axis. A ``view(-1, C * 7 * 7)``
       # would fold surplus spatial positions of wrongly sized inputs into the
       # batch dimension and silently change the batch size; keeping the batch
       # axis makes such inputs fail loudly in ``fc1`` instead.
       x = torch.flatten(x, 1)
       x = self.relu(self.fc1(x))
       x = self.fc2(x)
       return x


Loading Dataset

In [5]:
# Load MNIST dataset
def load_data(batch_size=64, num_workers=2, data_dir='./data'):
   """Build the MNIST train and test data loaders.

   The dataset is downloaded into ``data_dir`` when it is not already there.
   Images are converted to tensors and normalized with mean 0.1307 and
   standard deviation 0.3081.

   Args:
       batch_size: Number of samples per batch for both loaders.
       num_workers: Number of worker processes used by each loader.
       data_dir: Directory in which the dataset files are stored.

   Returns:
       A ``(trainloader, testloader)`` tuple; only the training loader
       shuffles its samples.
   """
   transform = transforms.Compose([
       transforms.ToTensor(),
       # NOTE(review): presumably the MNIST training-set mean / std - confirm
       transforms.Normalize((0.1307,), (0.3081,))
   ])


   trainset = torchvision.datasets.MNIST(root=data_dir, train=True,
                                        download=True, transform=transform)
   trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                             shuffle=True, num_workers=num_workers)


   testset = torchvision.datasets.MNIST(root=data_dir, train=False,
                                       download=True, transform=transform)
   # Evaluation order does not matter, so the test split is not shuffled
   testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                            shuffle=False, num_workers=num_workers)
   return trainloader, testloader




Utils for Evaluation

In [6]:
# Evaluate model accuracy
def evaluate_model(model, testloader):
   """Return the top-1 accuracy of ``model`` on ``testloader`` in percent.

   The model is moved to the module-level ``device`` and switched to eval
   mode; both changes remain in effect after the call.

   Args:
       model: Module mapping a batch of inputs to per-class scores.
       testloader: Iterable of batches where ``batch[0]`` holds the inputs
           and ``batch[1]`` the integer class labels.

   Returns:
       Accuracy as a float between 0 and 100, or ``0.0`` when the loader
       yields no samples.
   """
   model.to(device)
   model.eval()
   correct = 0
   total = 0
   with torch.no_grad():
       for data in testloader:
           images, labels = data[0].to(device), data[1].to(device)
           outputs = model(images)
           # Index of the highest score along the class dimension
           _, predicted = torch.max(outputs, 1)
           total += labels.size(0)
           correct += (predicted == labels).sum().item()


   # An empty loader would otherwise end in a ZeroDivisionError
   if total == 0:
       return 0.0
   return 100 * correct / total


# Measure inference time more accurately
def measure_inference_time(model, testloader, num_runs=3):
   """Return the average time in seconds of one inference pass over ``testloader``.

   The model is moved to the module-level ``device`` and put in eval mode.
   One untimed warm-up pass is run first; afterwards ``num_runs`` passes are
   timed, each including the host-to-device transfer of the inputs.

   Args:
       model: Module to time.
       testloader: Iterable of batches whose first element holds the inputs.
       num_runs: Number of timed passes to average over (at least 1).

   Returns:
       Mean duration of a full pass in seconds.

   Raises:
       ValueError: If ``num_runs`` is smaller than 1.
   """
   if num_runs < 1:
       raise ValueError("num_runs must be at least 1")

   model.to(device)
   model.eval()
   on_cuda = device.type == 'cuda'


   # Warm-up run
   with torch.no_grad():
       for data in testloader:
           images = data[0].to(device)
           _ = model(images)


   # Wait for pending GPU work so it is not billed to the first timed run
   if on_cuda:
       torch.cuda.synchronize()
   total_time = 0.0


   for _ in range(num_runs):
       # perf_counter is monotonic and high resolution, unlike wall-clock time.time()
       start_time = time.perf_counter()
       with torch.no_grad():
           for data in testloader:
               images = data[0].to(device)
               _ = model(images)
       # CUDA kernels run asynchronously; wait for them before stopping the clock
       if on_cuda:
           torch.cuda.synchronize()
       total_time += time.perf_counter() - start_time


   return total_time / num_runs


# Get model size in MB (more accurately)
def get_model_size(model, filename="temp_model.pth"):
   """Measure the on-disk size of ``model`` in two serialization formats.

   The state dict is written to ``filename`` and a mobile-optimized
   TorchScript export to a sibling file with a ``.pt`` extension. Both files
   are temporary and are deleted again, even when an export step fails.

   Args:
       model: Module to measure. It is neither moved nor modified; the
           TorchScript export works on a CPU copy.
       filename: Path of the temporary state-dict file.

   Returns:
       A ``(state_dict_size, script_size)`` tuple, both in MiB.
   """
   # Swap only the extension; a plain str.replace would also rewrite '.pth'
   # occurrences inside directory names
   root, _ = os.path.splitext(filename)
   script_filename = root + '.pt'
   if script_filename == filename:
       # Keep the two temporary files distinct when filename already ends in '.pt'
       script_filename = root + '.script.pt'

   bytes_per_mb = 1024 * 1024
   try:
       torch.save(model.state_dict(), filename)
       state_dict_size = os.path.getsize(filename) / bytes_per_mb


       # Script a CPU copy so the caller's model stays on its current device
       script_model = torch.jit.script(copy.deepcopy(model).cpu())
       optimized_script_model = mobile_optimizer.optimize_for_mobile(script_model)
       optimized_script_model.save(script_filename)
       script_size = os.path.getsize(script_filename) / bytes_per_mb
   finally:
       # Clean up files
       for path in (filename, script_filename):
           if os.path.exists(path):
               os.remove(path)


   return state_dict_size, script_size


# Count non-zero parameters
def count_parameters(model):
   """Return ``(total, nonzero)`` element counts over all parameters of ``model``."""
   tensors = list(model.parameters())
   total_params = sum(tensor.numel() for tensor in tensors)
   nonzero_params = sum((tensor != 0).sum().item() for tensor in tensors)
   return total_params, nonzero_params


Training

In [7]:
# Train the model
def train_model(model, trainloader, epochs=3):
   """Train ``model`` in place with Adam (lr 0.001) and cross-entropy loss.

   The model is moved to the module-level ``device`` and put into training
   mode. The mean loss of every 100 batches is printed.

   Args:
       model: Module producing class logits.
       trainloader: Iterable of batches where ``batch[0]`` holds the inputs
           and ``batch[1]`` the integer class labels.
       epochs: Number of passes over ``trainloader``.

   Returns:
       The same ``model`` object, after training.
   """
   model.to(device)
   # Make sure train-time behaviour (dropout, batch-norm statistics) is active
   # even when the model was previously switched to eval mode
   model.train()
   criterion = nn.CrossEntropyLoss()
   optimizer = optim.Adam(model.parameters(), lr=0.001)


   for epoch in range(epochs):
       running_loss = 0.0
       for i, data in enumerate(trainloader, 0):
           inputs, labels = data[0].to(device), data[1].to(device)
           optimizer.zero_grad()
           outputs = model(inputs)
           loss = criterion(outputs, labels)
           loss.backward()
           optimizer.step()
           running_loss += loss.item()


           # Report the mean loss of the last 100 batches, then restart the window
           if i % 100 == 99:
               print(f'Epoch {epoch + 1}, Batch {i + 1}, Loss: {running_loss / 100:.3f}')
               running_loss = 0.0


   print('Finished Training')
   return model


Unstructured Pruning

In [8]:
# Apply unstructured pruning with parameter removal
def apply_unstructured_pruning(model, prune_amount=0.5):
   """Return a CPU copy of ``model`` with L1 unstructured pruning applied.

   In every ``nn.Conv2d`` and ``nn.Linear`` layer of the copy, ``prune_amount``
   of the ``weight`` entries are pruned with ``prune.l1_unstructured`` and the
   pruning is then made permanent with ``prune.remove``. The tensors keep
   their dense shape: for actual model size reduction in a real system the
   result would still have to be converted to a sparse format or rebuilt as a
   model with fewer parameters.
   """
   print("Applying unstructured pruning...")
   # Prune a deep copy that lives on the CPU; the caller's model is untouched
   pruned_model = copy.deepcopy(model)
   pruned_model.cpu()

   prunable_layers = (nn.Conv2d, nn.Linear)
   for _, layer in pruned_model.named_modules():
       if not isinstance(layer, prunable_layers):
           continue
       prune.l1_unstructured(layer, name='weight', amount=prune_amount)
       # Fold the mask into the weight and drop the pruning re-parametrization
       prune.remove(layer, 'weight')

   return pruned_model




Structured Pruning

In [9]:
# Select the output units of a layer that survive structured pruning
def _strongest_unit_indices(weight, keep):
   """Return, in ascending order, the indices of the ``keep`` output units with the largest L2 norm.

   ``weight`` carries the output unit on dimension 0; the norm is taken over
   all remaining dimensions.
   """
   reduce_dims = list(range(1, weight.dim()))
   unit_norms = torch.norm(weight, p=2, dim=reduce_dims)
   return torch.topk(unit_norms, keep).indices.sort().values


# Implement actual structured pruning (removing channels)
def apply_structured_pruning(model, prune_amount=0.3):
   """Build a smaller ``SimpleCNN`` that keeps only the strongest units of ``model``.

   ``conv1``, ``conv2`` and ``fc1`` each keep ``int(n * (1 - prune_amount))``
   of their output units (at least one), chosen by the L2 norm of the unit's
   weights. The input side of each following layer is sliced to match, and
   ``fc2`` keeps all of its outputs.

   Args:
       model: Trained network exposing ``conv1``, ``conv2``, ``fc1`` and
           ``fc2`` as ``SimpleCNN`` does. It is neither moved nor modified.
       prune_amount: Fraction of output units to remove, in ``[0, 1)``.

   Returns:
       A new ``SimpleCNN`` on the CPU whose parameters are independent copies
       of the kept weights.

   Raises:
       ValueError: If ``prune_amount`` is outside ``[0, 1)``.
   """
   print("Applying structured pruning...")
   if not 0 <= prune_amount < 1:
       raise ValueError(f"prune_amount must be in [0, 1), got {prune_amount}")


   # Read detached CPU views of the weights instead of calling model.cpu(),
   # which would move the caller's model as a side effect
   conv1_weight = model.conv1.weight.detach().cpu()
   conv2_weight = model.conv2.weight.detach().cpu()
   fc1_weight = model.fc1.weight.detach().cpu()
   fc2_weight = model.fc2.weight.detach().cpu()

   original_conv2_out = conv2_weight.size(0)
   original_fc1_out = fc1_weight.size(0)


   # Calculate new dimensions after pruning (int() truncates, as before;
   # max() keeps at least one unit per layer)
   keep_fraction = 1 - prune_amount
   new_conv1_out = max(1, int(conv1_weight.size(0) * keep_fraction))
   new_conv2_out = max(1, int(original_conv2_out * keep_fraction))
   new_fc1_out = max(1, int(original_fc1_out * keep_fraction))


   # Create new model with reduced dimensions; input channels and number of
   # classes are taken from the original model
   pruned_model = SimpleCNN(
       in_channels=model.conv1.in_channels,
       conv1_channels=new_conv1_out,
       conv2_channels=new_conv2_out,
       fc1_units=new_fc1_out,
       fc2_units=model.fc2.out_features
   )


   # For each layer, determine which output units to keep based on L2 norm
   conv1_indices = _strongest_unit_indices(conv1_weight, new_conv1_out)
   conv2_indices = _strongest_unit_indices(conv2_weight, new_conv2_out)
   fc1_indices = _strongest_unit_indices(fc1_weight, new_fc1_out)


   # fc1 consumes the flattened conv2 output, so its columns are grouped per
   # conv2 channel; regroup them to drop the columns of removed channels
   fc1_per_channel = fc1_weight.reshape(original_fc1_out, original_conv2_out, -1)
   new_fc1_weight = fc1_per_channel[fc1_indices][:, conv2_indices].reshape(new_fc1_out, -1)


   # copy_ writes into the new model's own storage (no tensor is shared with
   # the original model) and raises on any shape mismatch
   with torch.no_grad():
       pruned_model.conv1.weight.copy_(conv1_weight[conv1_indices])
       pruned_model.conv1.bias.copy_(model.conv1.bias.detach().cpu()[conv1_indices])

       # Keep the selected conv2 filters and, within them, only the input
       # channels that survived in conv1
       pruned_model.conv2.weight.copy_(conv2_weight[conv2_indices][:, conv1_indices])
       pruned_model.conv2.bias.copy_(model.conv2.bias.detach().cpu()[conv2_indices])

       pruned_model.fc1.weight.copy_(new_fc1_weight)
       pruned_model.fc1.bias.copy_(model.fc1.bias.detach().cpu()[fc1_indices])

       # fc2 keeps every output but only the inputs coming from kept fc1 units
       pruned_model.fc2.weight.copy_(fc2_weight[:, fc1_indices])
       pruned_model.fc2.bias.copy_(model.fc2.bias.detach().cpu())


   return pruned_model


Fine-Tuning

In [10]:
# Fine-tune the pruned model
def fine_tune_model(model, trainloader, epochs=2, preserve_sparsity=True):
   """Fine-tune ``model`` in place with Adam (lr 0.0005) and cross-entropy loss.

   The model is moved to the module-level ``device`` and put into training
   mode. The mean loss of every 100 batches is printed.

   Args:
       model: Pruned module producing class logits.
       trainloader: Iterable of batches where ``batch[0]`` holds the inputs
           and ``batch[1]`` the integer class labels.
       epochs: Number of passes over ``trainloader``.
       preserve_sparsity: When true, ``nn.Conv2d`` / ``nn.Linear`` weight
           entries that are exactly zero on entry are reset to zero after
           every optimizer step, so fine-tuning does not undo unstructured
           pruning. Layers without zero weights are unaffected.

   Returns:
       The same ``model`` object, after fine-tuning.
   """
   model.to(device)
   # The model usually arrives here straight from evaluation (eval mode);
   # switch back so train-time behaviour is active
   model.train()
   criterion = nn.CrossEntropyLoss()
   optimizer = optim.Adam(model.parameters(), lr=0.0005)


   # Record which weights are currently pruned (exactly zero). Dense optimizer
   # updates would otherwise make them non-zero again.
   sparsity_masks = []
   if preserve_sparsity:
       for module in model.modules():
           if isinstance(module, (nn.Conv2d, nn.Linear)):
               kept = module.weight.detach() != 0
               if not bool(kept.all()):
                   sparsity_masks.append((module.weight, kept.to(module.weight.dtype)))


   print("Fine-tuning the pruned model...")
   for epoch in range(epochs):
       running_loss = 0.0
       for i, data in enumerate(trainloader, 0):
           inputs, labels = data[0].to(device), data[1].to(device)
           optimizer.zero_grad()
           outputs = model(inputs)
           loss = criterion(outputs, labels)
           loss.backward()
           optimizer.step()
           # Zero the pruned positions again after the update
           if sparsity_masks:
               with torch.no_grad():
                   for weight, mask in sparsity_masks:
                       weight.mul_(mask)
           running_loss += loss.item()


           # Report the mean loss of the last 100 batches, then restart the window
           if i % 100 == 99:
               print(f'Fine-tuning Epoch {epoch + 1}, Batch {i + 1}, Loss: {running_loss / 100:.3f}')
               running_loss = 0.0


   print('Finished Fine-tuning')
   return model


Main Function

In [11]:
# Main function to run the experiment
def main():
   """Run the full pruning experiment and print the metrics of every model variant.

   Steps: train a ``SimpleCNN`` on MNIST, measure it, derive an unstructured
   and a structured pruned model from it, measure both, fine-tune both,
   measure them again and print a summary table. Parameter counts are taken
   again after fine-tuning so the fine-tuned rows describe the fine-tuned
   weights rather than the counts measured before fine-tuning.
   """
   # Load data
   trainloader, testloader = load_data()


   # Initialize and train the model
   print("Training original model...")
   original_model = SimpleCNN()
   original_model = train_model(original_model, trainloader)


   # Evaluate original model
   print("\nEvaluating original model...")
   original_accuracy = evaluate_model(original_model, testloader)
   original_inference_time = measure_inference_time(original_model, testloader)
   original_state_dict_size, original_script_size = get_model_size(original_model)
   total_params, nonzero_params = count_parameters(original_model)


   print("\n--- Original Model Metrics ---")
   print(f"Accuracy: {original_accuracy:.2f}%")
   print(f"Inference Time: {original_inference_time:.4f} seconds")
   print(f"Model Size (state_dict): {original_state_dict_size:.2f} MB")
   print(f"Model Size (TorchScript): {original_script_size:.2f} MB")
   print(f"Total Parameters: {total_params}")
   print(f"Non-zero Parameters: {nonzero_params} ({nonzero_params/total_params*100:.2f}%)")


   # Clear memory
   gc.collect()
   if device.type == 'cuda':
       torch.cuda.empty_cache()


   # Apply unstructured pruning
   unstructured_model = apply_unstructured_pruning(original_model)


   # Evaluate unstructured pruned model
   print("\nEvaluating unstructured pruned model...")
   unstructured_accuracy = evaluate_model(unstructured_model, testloader)
   unstructured_inference_time = measure_inference_time(unstructured_model, testloader)
   unstructured_state_dict_size, unstructured_script_size = get_model_size(unstructured_model)
   total_params_u, nonzero_params_u = count_parameters(unstructured_model)


   print("\n--- Unstructured Pruned Model Metrics ---")
   print(f"Accuracy: {unstructured_accuracy:.2f}%")
   print(f"Inference Time: {unstructured_inference_time:.4f} seconds")
   print(f"Model Size (state_dict): {unstructured_state_dict_size:.2f} MB")
   print(f"Model Size (TorchScript): {unstructured_script_size:.2f} MB")
   print(f"Total Parameters: {total_params_u}")
   print(f"Non-zero Parameters: {nonzero_params_u} ({nonzero_params_u/total_params_u*100:.2f}%)")
   print(f"Accuracy Drop: {original_accuracy - unstructured_accuracy:.2f}%")
   print(f"Inference Speedup: {original_inference_time / unstructured_inference_time:.2f}x")
   print(f"Size Reduction (state_dict): {(1 - unstructured_state_dict_size / original_state_dict_size) * 100:.2f}%")
   print(f"Size Reduction (TorchScript): {(1 - unstructured_script_size / original_script_size) * 100:.2f}%")


   # Clear memory
   gc.collect()
   if device.type == 'cuda':
       torch.cuda.empty_cache()


   # Apply structured pruning
   structured_model = apply_structured_pruning(original_model)


   # Evaluate structured pruned model
   print("\nEvaluating structured pruned model...")
   structured_accuracy = evaluate_model(structured_model, testloader)
   structured_inference_time = measure_inference_time(structured_model, testloader)
   structured_state_dict_size, structured_script_size = get_model_size(structured_model)
   total_params_s, nonzero_params_s = count_parameters(structured_model)


   print("\n--- Structured Pruned Model Metrics ---")
   print(f"Accuracy: {structured_accuracy:.2f}%")
   print(f"Inference Time: {structured_inference_time:.4f} seconds")
   print(f"Model Size (state_dict): {structured_state_dict_size:.2f} MB")
   print(f"Model Size (TorchScript): {structured_script_size:.2f} MB")
   print(f"Total Parameters: {total_params_s}")
   print(f"Non-zero Parameters: {nonzero_params_s} ({nonzero_params_s/total_params_s*100:.2f}%)")
   print(f"Accuracy Drop: {original_accuracy - structured_accuracy:.2f}%")
   print(f"Inference Speedup: {original_inference_time / structured_inference_time:.2f}x")
   print(f"Size Reduction (state_dict): {(1 - structured_state_dict_size / original_state_dict_size) * 100:.2f}%")
   print(f"Size Reduction (TorchScript): {(1 - structured_script_size / original_script_size) * 100:.2f}%")


   # Fine-tune unstructured pruned model (updates unstructured_model in place;
   # all of its pre-fine-tuning metrics were captured above)
   fine_tuned_unstructured = fine_tune_model(unstructured_model, trainloader)


   # Evaluate fine-tuned unstructured pruned model
   print("\nEvaluating fine-tuned unstructured pruned model...")
   fine_tuned_unstructured_accuracy = evaluate_model(fine_tuned_unstructured, testloader)
   fine_tuned_unstructured_inference = measure_inference_time(fine_tuned_unstructured, testloader)
   # Count again: fine-tuning changes the weights, so the earlier count may be stale
   total_params_fu, nonzero_params_fu = count_parameters(fine_tuned_unstructured)


   print("\n--- Fine-tuned Unstructured Pruned Model Metrics ---")
   print(f"Accuracy: {fine_tuned_unstructured_accuracy:.2f}%")
   print(f"Inference Time: {fine_tuned_unstructured_inference:.4f} seconds")
   print(f"Non-zero Parameters: {nonzero_params_fu} ({nonzero_params_fu/total_params_fu*100:.2f}%)")
   print(f"Accuracy Recovery: {fine_tuned_unstructured_accuracy - unstructured_accuracy:.2f}%")


   # Fine-tune structured pruned model (updates structured_model in place)
   fine_tuned_structured = fine_tune_model(structured_model, trainloader)


   # Evaluate fine-tuned structured pruned model
   print("\nEvaluating fine-tuned structured pruned model...")
   fine_tuned_structured_accuracy = evaluate_model(fine_tuned_structured, testloader)
   fine_tuned_structured_inference = measure_inference_time(fine_tuned_structured, testloader)
   total_params_fs, nonzero_params_fs = count_parameters(fine_tuned_structured)


   print("\n--- Fine-tuned Structured Pruned Model Metrics ---")
   print(f"Accuracy: {fine_tuned_structured_accuracy:.2f}%")
   print(f"Inference Time: {fine_tuned_structured_inference:.4f} seconds")
   print(f"Non-zero Parameters: {nonzero_params_fs} ({nonzero_params_fs/total_params_fs*100:.2f}%)")
   print(f"Accuracy Recovery: {fine_tuned_structured_accuracy - structured_accuracy:.2f}%")


   # Summary comparison (the size column shows the TorchScript size; the
   # fine-tuned rows reuse the size measured before fine-tuning)
   print("\n--- Summary ---")
   print("Model              | Accuracy | Inference Time | Size (MB) | Non-zero/Total Params")
   print("--------------------|----------|----------------|-----------|--------------------")
   print(f"Original            | {original_accuracy:.2f}%   | {original_inference_time:.4f}s        | {original_script_size:.2f}    | {nonzero_params}/{total_params} ({nonzero_params/total_params*100:.1f}%)")
   print(f"Unstructured Pruned | {unstructured_accuracy:.2f}%   | {unstructured_inference_time:.4f}s        | {unstructured_script_size:.2f}    | {nonzero_params_u}/{total_params_u} ({nonzero_params_u/total_params_u*100:.1f}%)")
   print(f"+ Fine-tuned        | {fine_tuned_unstructured_accuracy:.2f}%   | {fine_tuned_unstructured_inference:.4f}s        | {unstructured_script_size:.2f}    | {nonzero_params_fu}/{total_params_fu} ({nonzero_params_fu/total_params_fu*100:.1f}%)")
   print(f"Structured Pruned   | {structured_accuracy:.2f}%   | {structured_inference_time:.4f}s        | {structured_script_size:.2f}    | {nonzero_params_s}/{total_params_s} ({nonzero_params_s/total_params_s*100:.1f}%)")
   print(f"+ Fine-tuned        | {fine_tuned_structured_accuracy:.2f}%   | {fine_tuned_structured_inference:.4f}s        | {structured_script_size:.2f}    | {nonzero_params_fs}/{total_params_fs} ({nonzero_params_fs/total_params_fs*100:.1f}%)")



In [12]:
# Run the experiment only when this code is executed as the main program
# (true for a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
   main()


100%|██████████| 9.91M/9.91M [00:00<00:00, 18.2MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 481kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 4.47MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 6.81MB/s]


Training original model...
Epoch 1, Batch 100, Loss: 0.570
Epoch 1, Batch 200, Loss: 0.162
Epoch 1, Batch 300, Loss: 0.098
Epoch 1, Batch 400, Loss: 0.087
Epoch 1, Batch 500, Loss: 0.088
Epoch 1, Batch 600, Loss: 0.064
Epoch 1, Batch 700, Loss: 0.065
Epoch 1, Batch 800, Loss: 0.065
Epoch 1, Batch 900, Loss: 0.048
Epoch 2, Batch 100, Loss: 0.049
Epoch 2, Batch 200, Loss: 0.041
Epoch 2, Batch 300, Loss: 0.053
Epoch 2, Batch 400, Loss: 0.036
Epoch 2, Batch 500, Loss: 0.045
Epoch 2, Batch 600, Loss: 0.040
Epoch 2, Batch 700, Loss: 0.033
Epoch 2, Batch 800, Loss: 0.038
Epoch 2, Batch 900, Loss: 0.036
Epoch 3, Batch 100, Loss: 0.030
Epoch 3, Batch 200, Loss: 0.034
Epoch 3, Batch 300, Loss: 0.031
Epoch 3, Batch 400, Loss: 0.030
Epoch 3, Batch 500, Loss: 0.023
Epoch 3, Batch 600, Loss: 0.034
Epoch 3, Batch 700, Loss: 0.021
Epoch 3, Batch 800, Loss: 0.027
Epoch 3, Batch 900, Loss: 0.026
Finished Training

Evaluating original model...

--- Original Model Metrics ---
Accuracy: 99.02%
Inference Ti