In [1]:
import torch
import numpy as np
from torchvision import datasets
import torchvision
import torchvision.transforms as transforms

# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 20

# convert data to torch.FloatTensor

use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu") 

PATH = "save/trained_cnn_model.pt"

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.1307), (0.3081))])


# choose the training and test datasets
train_data = datasets.MNIST(root='data', train=True, download=False,  transform=transform)
test_data  = datasets.MNIST(root='data', train=False, download=False, transform=transform)

# prepare data loaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
    num_workers=num_workers)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, 
    num_workers=num_workers)


import torch.nn as nn
import torch.nn.functional as F

## Define the NN architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

model = Net().to(device)
print(model)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

Net(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (dropout1): Dropout(p=0.25, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=9216, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)


In [2]:

# number of epochs to train the model
n_epochs = 10  
# per epoch, all the training data set is used once
model.train() # prep model for training


for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    
    ###################
    # train the model #
    ###################
    for data, target in train_loader:
        data, target = data.to(device), target.to(device) # loading to GPU
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()*data.size(0) # as loss is tensor, .item() needed to get the value
        
    # print training statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)

    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))
    
# see following link for details of state_dict   
# https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html
torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_loss,
            }, PATH)

Epoch: 1 	Training Loss: 0.407418
Epoch: 2 	Training Loss: 0.156157
Epoch: 3 	Training Loss: 0.107206
Epoch: 4 	Training Loss: 0.084939
Epoch: 5 	Training Loss: 0.070567
Epoch: 6 	Training Loss: 0.062605
Epoch: 7 	Training Loss: 0.057260
Epoch: 8 	Training Loss: 0.050419
Epoch: 9 	Training Loss: 0.046523
Epoch: 10 	Training Loss: 0.042122


In [3]:
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
train_loss = checkpoint['loss']

model.eval()

test_loss = 0
correct = 0

with torch.no_grad():
    for data, target in test_loader:
        data, target = data.to(device), target.to(device) # loading to GPU
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True)  
        correct += pred.eq(target.view_as(pred)).sum().item()

test_loss /= len(test_loader.dataset)

print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


Test set: Accuracy: 9897/10000 (99%)



In [4]:

import torch.nn.utils.prune as prune


## sparsity= 0.9, L1-norm, keep the maximum values
prune.l1_unstructured(model.conv1, name='weight', amount=0.9)
prune.l1_unstructured(model.conv2, name='weight', amount=0.9)
prune.l1_unstructured(model.fc1, name='weight',   amount=0.9)
prune.l1_unstructured(model.fc2, name='weight',   amount=0.9)



'''
parameters_to_prune = (
    (model.conv1, 'weight'),
    (model.conv2, 'weight'),
    (model.fc1, 'weight'),
    (model.fc2, 'weight'),
)

prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.9,
)
'''

"\nparameters_to_prune = (\n    (model.conv1, 'weight'),\n    (model.conv2, 'weight'),\n    (model.fc1, 'weight'),\n    (model.fc2, 'weight'),\n)\n\nprune.global_unstructured(\n    parameters_to_prune,\n    pruning_method=prune.L1Unstructured,\n    amount=0.9,\n)\n"

In [8]:
list(model.named_parameters())   ## print original weights and bias

[('conv1.bias',
  Parameter containing:
  tensor([ 0.1767, -0.0641,  0.1383, -0.1277, -0.2425,  0.3985,  0.2801, -0.1825,
           0.1816,  0.2958,  0.2951,  0.2997,  0.1704,  0.2194, -0.1753,  0.3444,
           0.2839, -0.2019, -0.2521,  0.3300, -0.0445,  0.3421,  0.0340, -0.2177,
          -0.1981,  0.1312,  0.3961,  0.2942, -0.2997,  0.0696, -0.2515, -0.0492],
         device='cuda:0', requires_grad=True)),
 ('conv1.weight_orig',
  Parameter containing:
  tensor([[[[-0.1543, -0.3897, -0.2129],
            [-0.1677, -0.0549, -0.4141],
            [ 0.4592,  0.3294, -0.0461]]],
  
  
          [[[-0.3311, -0.0720, -0.1613],
            [-0.2683, -0.3370,  0.0393],
            [ 0.0042, -0.1206, -0.1392]]],
  
  
          [[[ 0.0348,  0.3495,  0.0787],
            [ 0.1876, -0.2082,  0.2398],
            [ 0.1089, -0.3401,  0.3614]]],
  
  
          [[[-0.2217,  0.0509,  0.2880],
            [-0.1665, -0.1999, -0.3127],
            [ 0.0784,  0.0379,  0.0676]]],
  
  
          [[

In [9]:
model.conv1.weight  ## weights after pruning   weight = weight_orig*weight_mask

tensor([[[[-0.0000, -0.0000, -0.0000],
          [-0.0000, -0.0000, -0.4141],
          [ 0.4592,  0.0000, -0.0000]]],


        [[[-0.0000, -0.0000, -0.0000],
          [-0.0000, -0.0000,  0.0000],
          [ 0.0000, -0.0000, -0.0000]]],


        [[[ 0.0000,  0.0000,  0.0000],
          [ 0.0000, -0.0000,  0.0000],
          [ 0.0000, -0.0000,  0.0000]]],


        [[[-0.0000,  0.0000,  0.0000],
          [-0.0000, -0.0000, -0.0000],
          [ 0.0000,  0.0000,  0.0000]]],


        [[[-0.0000, -0.0000,  0.0000],
          [-0.0000, -0.0000, -0.0000],
          [ 0.0000,  0.0000,  0.0000]]],


        [[[ 0.0000, -0.0000, -0.0000],
          [-0.0000,  0.0000,  0.0000],
          [ 0.5285,  0.5707,  0.0000]]],


        [[[-0.0000, -0.4092,  0.0000],
          [ 0.0000, -0.0000,  0.0000],
          [-0.0000, -0.0000,  0.0000]]],


        [[[-0.0000,  0.0000,  0.0000],
          [-0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.4177, -0.0000]]],


        [[[-0.0000, -0.0000,  0.

In [10]:
model.conv1.weight_mask  ## pruning position=0   non-pruning position = 1

tensor([[[[0., 0., 0.],
          [0., 0., 1.],
          [1., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [1., 1., 0.]]],


        [[[0., 1., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [0., 1., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[1., 0., 0.],
          [1., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 1., 1.],
          [1., 0., 0.],
          [1., 1., 0.]]],


        [[[0., 0., 1.],
          [0., 0., 1.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [1., 0., 0.]]],


        [[[0

In [11]:
print(model.conv1.weight_orig) ## original weights

Parameter containing:
tensor([[[[-0.1543, -0.3897, -0.2129],
          [-0.1677, -0.0549, -0.4141],
          [ 0.4592,  0.3294, -0.0461]]],


        [[[-0.3311, -0.0720, -0.1613],
          [-0.2683, -0.3370,  0.0393],
          [ 0.0042, -0.1206, -0.1392]]],


        [[[ 0.0348,  0.3495,  0.0787],
          [ 0.1876, -0.2082,  0.2398],
          [ 0.1089, -0.3401,  0.3614]]],


        [[[-0.2217,  0.0509,  0.2880],
          [-0.1665, -0.1999, -0.3127],
          [ 0.0784,  0.0379,  0.0676]]],


        [[[-0.0700, -0.2863,  0.0509],
          [-0.2318, -0.0868, -0.1835],
          [ 0.0735,  0.2243,  0.2917]]],


        [[[ 0.0571, -0.3867, -0.2179],
          [-0.1670,  0.3274,  0.0442],
          [ 0.5285,  0.5707,  0.2665]]],


        [[[-0.2328, -0.4092,  0.1658],
          [ 0.1244, -0.1309,  0.2966],
          [-0.3802, -0.0626,  0.2041]]],


        [[[-0.1673,  0.1645,  0.0460],
          [-0.0482,  0.0922,  0.1061],
          [ 0.0688,  0.4177, -0.0077]]],


        [[

In [12]:
mask1 = model.conv1.weight_mask
mask2 = model.conv2.weight_mask
mask3 = model.fc1.weight_mask
mask4 = model.fc2.weight_mask
sparsity_mask1 = (mask1 == 0).sum() / mask1.nelement()  ## calculate percentage of zeros
sparsity_mask2 = (mask2 == 0).sum() / mask2.nelement()
sparsity_mask3 = (mask3 == 0).sum() / mask3.nelement()
sparsity_mask4 = (mask4 == 0).sum() / mask4.nelement()
print("Conv1: ", sparsity_mask1)
print("Conv2: ", sparsity_mask2)
print("FC1:   ", sparsity_mask3)
print("FC2:   ", sparsity_mask4)

total_zeros = (mask1 == 0).sum() + (mask2 == 0).sum() + (mask3 == 0).sum() + (mask4 == 0).sum()
total_elements = mask1.nelement() + mask2.nelement() + mask3.nelement() + mask4.nelement()

print("total: ", total_zeros / total_elements)

Conv1:  tensor(0.8993, device='cuda:0')
Conv2:  tensor(0.9000, device='cuda:0')
FC1:    tensor(0.9000, device='cuda:0')
FC2:    tensor(0.9000, device='cuda:0')
total:  tensor(0.9000, device='cuda:0')


In [13]:

model.eval()

test_loss = 0
correct = 0

with torch.no_grad():
    for data, target in test_loader:
        data, target = data.to(device), target.to(device) # loading to GPU
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True)  
        correct += pred.eq(target.view_as(pred)).sum().item()

test_loss /= len(test_loader.dataset)

print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


Test set: Accuracy: 4942/10000 (49%)



In [None]:
### This cell removes the weight_orig and weight_mask, but only store the pruned weight
### Note that if you want to fine-tune with the next cell, you should not run this cell
### If you run this cell and finetune, the pruned weight will be updated again.
prune.remove(model.conv1, 'weight')
prune.remove(model.conv2, 'weight')
prune.remove(model.fc1, 'weight')
prune.remove(model.fc2, 'weight')

In [14]:

# number of epochs to train the model
n_epochs = 10  
# per epoch, all the training data set is used once
model.train() # prep model for training


for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    
    ###################
    # train the model #
    ###################
    for data, target in train_loader:
        data, target = data.to(device), target.to(device) # loading to GPU
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()*data.size(0) # as loss is tensor, .item() needed to get the value
        
    # print training statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)

    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))
    


Epoch: 1 	Training Loss: 0.337096
Epoch: 2 	Training Loss: 0.195101
Epoch: 3 	Training Loss: 0.164772
Epoch: 4 	Training Loss: 0.145192
Epoch: 5 	Training Loss: 0.132416
Epoch: 6 	Training Loss: 0.126836
Epoch: 7 	Training Loss: 0.115685
Epoch: 8 	Training Loss: 0.111675
Epoch: 9 	Training Loss: 0.105740
Epoch: 10 	Training Loss: 0.100351


In [15]:
# see following link for details of state_dict   
# https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html
PATH_prune = "save/trained_cnn_model_pruned.pt"

torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_loss,
            }, PATH_prune)

In [None]:
# After above finetuning, run the 7th cell to check the sparsity.
# Then, check the accuracy by running the 8th cell

In [16]:
## Now, after running the first cell, let's try to load the stored model.
## It won't work because the named_parameters has been changed.
## Thus, run 4th cell to make a pruned model.
## Then, run this cell.

PATH_prune = "save/trained_cnn_model_pruned.pt"
checkpoint = torch.load(PATH_prune)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
train_loss = checkpoint['loss']

model.eval()

test_loss = 0
correct = 0

with torch.no_grad():
    for data, target in test_loader:
        data, target = data.to(device), target.to(device) # loading to GPU
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True)  
        correct += pred.eq(target.view_as(pred)).sum().item()

test_loss /= len(test_loader.dataset)

print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


Test set: Accuracy: 9860/10000 (99%)

