[文章参考来源](https://pytorch.org/tutorials/intermediate/pruning_tutorial.html#serializing-a-pruned-model)

# 导入函数库

In [1]:
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
import torch.nn.functional as F

# 定义网络结构

In [2]:
class LeNet(nn.Module):
    """LeNet-style CNN: two conv + max-pool stages followed by three linear layers.

    ``fc1`` expects 16 * 5 * 5 = 400 flattened features. With the unpadded 3x3
    convolutions and 2x2 pooling used here, that corresponds to single-channel
    inputs such as 1x28x28 images.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 3)  # 1 input channel, 6 output channels, kernel_size=3
        self.conv2 = nn.Conv2d(6, 16, 3)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)  # 10 output units, one per class

    def forward(self, x):
        """Return the logits of shape [batch_size, 10] for a batch ``x`` of shape [batch_size, 1, H, W]."""
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
        # The linear layers take 2-D input, so collapse [batch_size, C, H, W]
        # into [batch_size, C*H*W]. Flattening from dim 1 keeps the batch
        # dimension as is and, unlike dividing nelement() by the batch size,
        # also works for an empty batch.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)  # logits, no softmax applied
        return x

# 声明一个LeNet网络的实例

In [3]:
# Instantiate the network; the layer weights are randomly initialised.
model = LeNet()

## 查看网络结构

In [4]:
# NOTE: `parameters` is not called here, so the cell displays the bound method itself;
# its repr embeds the module repr, which is what lists the layers in the output.
model.parameters

<bound method Module.parameters of LeNet(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)>

In [5]:
# NOTE: `modules` is not called here either; the displayed bound-method repr
# embeds the same module repr (the layer summary) as the previous cell.
model.modules

<bound method Module.modules of LeNet(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)>

## 检测conv1,其中包含了weight和bias

In [6]:
# Keep a handle on the first conv layer; the following cells inspect and prune it.
module = model.conv1

先说一下后面会用到的几个概念，model.conv1.weight这里的weight是属性，运行prune操作之后产生的稀疏权重会存储在这里，然后我们需要运行prune.remove()操作，才能让model.conv1.weight和model.conv1.named_parameters()的显示结果变成一样

运行prune操作后，生成的mask会存储在model.conv1.named_buffers()里

model.conv1.named_parameters()模型参数的存储位置，这里面的值会跟随torch.save存储到本地的.pth

## 观察module或者说是conv1,我们发现了weight和bias

In [7]:
list(module.named_parameters())

[('weight',
  Parameter containing:
  tensor([[[[ 0.0436,  0.2702,  0.0559],
            [ 0.1413, -0.0787, -0.1531],
            [ 0.0558,  0.2458, -0.0896]]],
  
  
          [[[-0.2921,  0.1019,  0.0539],
            [ 0.1019,  0.2754, -0.1386],
            [-0.2756, -0.1292,  0.1475]]],
  
  
          [[[ 0.2188,  0.0133, -0.1130],
            [-0.1287,  0.2355, -0.1248],
            [ 0.2215,  0.1693,  0.0800]]],
  
  
          [[[-0.1820, -0.0322,  0.1641],
            [-0.1137, -0.1801, -0.1784],
            [-0.2672,  0.3220, -0.2941]]],
  
  
          [[[-0.3005,  0.1728,  0.1805],
            [ 0.0071,  0.1560, -0.0921],
            [ 0.0193, -0.2779,  0.0085]]],
  
  
          [[[ 0.0596,  0.2006, -0.0119],
            [ 0.2035, -0.0278,  0.2438],
            [-0.3046, -0.1751,  0.2437]]]], requires_grad=True)),
 ('bias',
  Parameter containing:
  tensor([-0.0568, -0.0444,  0.1789,  0.2256,  0.0646, -0.1924],
         requires_grad=True))]

## 调用weight属性

In [8]:
module.weight

Parameter containing:
tensor([[[[ 0.0436,  0.2702,  0.0559],
          [ 0.1413, -0.0787, -0.1531],
          [ 0.0558,  0.2458, -0.0896]]],


        [[[-0.2921,  0.1019,  0.0539],
          [ 0.1019,  0.2754, -0.1386],
          [-0.2756, -0.1292,  0.1475]]],


        [[[ 0.2188,  0.0133, -0.1130],
          [-0.1287,  0.2355, -0.1248],
          [ 0.2215,  0.1693,  0.0800]]],


        [[[-0.1820, -0.0322,  0.1641],
          [-0.1137, -0.1801, -0.1784],
          [-0.2672,  0.3220, -0.2941]]],


        [[[-0.3005,  0.1728,  0.1805],
          [ 0.0071,  0.1560, -0.0921],
          [ 0.0193, -0.2779,  0.0085]]],


        [[[ 0.0596,  0.2006, -0.0119],
          [ 0.2035, -0.0278,  0.2438],
          [-0.3046, -0.1751,  0.2437]]]], requires_grad=True)

## 因为还没有对conv1进行剪枝,所以这里的buffer为空，调用prune之后你会发现差别

In [9]:
list(module.named_buffers())

[]

# 对conv1里的weight进行random_unstructured剪枝

In [10]:
# NOTE(review): the entries to prune are picked at random and no seed is set in this
# notebook, so the masks printed below will presumably differ from run to run.
prune.random_unstructured(module, name='weight', amount=0.3) # pruning ratio set to 0.3

Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))

## 你可以发现这里的weight被重命名为weight_orig

In [11]:
list(module.named_parameters())

[('bias',
  Parameter containing:
  tensor([-0.0568, -0.0444,  0.1789,  0.2256,  0.0646, -0.1924],
         requires_grad=True)),
 ('weight_orig',
  Parameter containing:
  tensor([[[[ 0.0436,  0.2702,  0.0559],
            [ 0.1413, -0.0787, -0.1531],
            [ 0.0558,  0.2458, -0.0896]]],
  
  
          [[[-0.2921,  0.1019,  0.0539],
            [ 0.1019,  0.2754, -0.1386],
            [-0.2756, -0.1292,  0.1475]]],
  
  
          [[[ 0.2188,  0.0133, -0.1130],
            [-0.1287,  0.2355, -0.1248],
            [ 0.2215,  0.1693,  0.0800]]],
  
  
          [[[-0.1820, -0.0322,  0.1641],
            [-0.1137, -0.1801, -0.1784],
            [-0.2672,  0.3220, -0.2941]]],
  
  
          [[[-0.3005,  0.1728,  0.1805],
            [ 0.0071,  0.1560, -0.0921],
            [ 0.0193, -0.2779,  0.0085]]],
  
  
          [[[ 0.0596,  0.2006, -0.0119],
            [ 0.2035, -0.0278,  0.2438],
            [-0.3046, -0.1751,  0.2437]]]], requires_grad=True))]

## model.conv1.weight这里的weight是属性，运行prune操作之后产生的稀疏权重会存储在这里

In [12]:
module.weight   # some entries are now 0

tensor([[[[ 0.0436,  0.0000,  0.0559],
          [ 0.1413, -0.0787, -0.1531],
          [ 0.0558,  0.0000, -0.0000]]],


        [[[-0.0000,  0.1019,  0.0000],
          [ 0.1019,  0.2754, -0.1386],
          [-0.2756, -0.0000,  0.0000]]],


        [[[ 0.0000,  0.0133, -0.1130],
          [-0.1287,  0.2355, -0.1248],
          [ 0.2215,  0.1693,  0.0800]]],


        [[[-0.1820, -0.0000,  0.1641],
          [-0.1137, -0.1801, -0.1784],
          [-0.2672,  0.3220, -0.2941]]],


        [[[-0.3005,  0.1728,  0.1805],
          [ 0.0071,  0.0000, -0.0921],
          [ 0.0000, -0.2779,  0.0085]]],


        [[[ 0.0000,  0.0000, -0.0000],
          [ 0.2035, -0.0278,  0.2438],
          [-0.0000, -0.0000,  0.2437]]]], grad_fn=<MulBackward0>)

## model.conv1.named_buffers()运行prune操作后，生成的mask会在这里

In [13]:
list(module.named_buffers())  # only `weight` has been pruned so far, so weight_mask is the only buffer

[('weight_mask',
  tensor([[[[1., 0., 1.],
            [1., 1., 1.],
            [1., 0., 0.]]],
  
  
          [[[0., 1., 0.],
            [1., 1., 1.],
            [1., 0., 0.]]],
  
  
          [[[0., 1., 1.],
            [1., 1., 1.],
            [1., 1., 1.]]],
  
  
          [[[1., 0., 1.],
            [1., 1., 1.],
            [1., 1., 1.]]],
  
  
          [[[1., 1., 1.],
            [1., 0., 1.],
            [0., 1., 1.]]],
  
  
          [[[0., 0., 0.],
            [1., 1., 1.],
            [0., 0., 1.]]]]))]

# 进行L1_unstructured剪枝操作,不过这里的剪枝对象是conv1里的bias

In [14]:
prune.l1_unstructured(module, name='bias', amount=0.3) # pruning ratio set to 0.3

Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))

## 我们可以发现bias变成了bias_orig

In [15]:
list(module.named_parameters())

[('weight_orig',
  Parameter containing:
  tensor([[[[ 0.0436,  0.2702,  0.0559],
            [ 0.1413, -0.0787, -0.1531],
            [ 0.0558,  0.2458, -0.0896]]],
  
  
          [[[-0.2921,  0.1019,  0.0539],
            [ 0.1019,  0.2754, -0.1386],
            [-0.2756, -0.1292,  0.1475]]],
  
  
          [[[ 0.2188,  0.0133, -0.1130],
            [-0.1287,  0.2355, -0.1248],
            [ 0.2215,  0.1693,  0.0800]]],
  
  
          [[[-0.1820, -0.0322,  0.1641],
            [-0.1137, -0.1801, -0.1784],
            [-0.2672,  0.3220, -0.2941]]],
  
  
          [[[-0.3005,  0.1728,  0.1805],
            [ 0.0071,  0.1560, -0.0921],
            [ 0.0193, -0.2779,  0.0085]]],
  
  
          [[[ 0.0596,  0.2006, -0.0119],
            [ 0.2035, -0.0278,  0.2438],
            [-0.3046, -0.1751,  0.2437]]]], requires_grad=True)),
 ('bias_orig',
  Parameter containing:
  tensor([-0.0568, -0.0444,  0.1789,  0.2256,  0.0646, -0.1924],
         requires_grad=True))]

## 产生了bias_mask，这样子的话conv1里的weight和bias都进行了剪枝

In [16]:
list(module.named_buffers())

[('weight_mask',
  tensor([[[[1., 0., 1.],
            [1., 1., 1.],
            [1., 0., 0.]]],
  
  
          [[[0., 1., 0.],
            [1., 1., 1.],
            [1., 0., 0.]]],
  
  
          [[[0., 1., 1.],
            [1., 1., 1.],
            [1., 1., 1.]]],
  
  
          [[[1., 0., 1.],
            [1., 1., 1.],
            [1., 1., 1.]]],
  
  
          [[[1., 1., 1.],
            [1., 0., 1.],
            [0., 1., 1.]]],
  
  
          [[[0., 0., 0.],
            [1., 1., 1.],
            [0., 0., 1.]]]])),
 ('bias_mask', tensor([0., 0., 1., 1., 1., 1.]))]

# 进行ln_structured剪枝，注意看输出的weight,都是一整个channel为0的，剪枝的话，是可以连续剪枝的，这里是第二次对conv1里的weight进行剪枝

In [17]:
# Structured pruning of `weight` along dim 0 (one slice per output channel) with n=2:
# amount=0.5 zeroes out all connections of 50% (3 out of 6) of the channels,
# removing the (currently unpruned) channels with the lowest L``n``-norm
# while preserving the effect of the previous mask.
prune.ln_structured(module, name='weight', amount=0.5, n=2, dim=0)

Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))

For the forward pass to work without modification, the weight attribute needs to exist. The pruning techniques implemented in torch.nn.utils.prune compute the pruned version of the weight (by combining the mask with the original parameter) and store it in the attribute weight. Note that this is no longer a parameter of the module; it is now simply an attribute.

In [18]:
module.weight

tensor([[[[ 0.0000,  0.0000,  0.0000],
          [ 0.0000, -0.0000, -0.0000],
          [ 0.0000,  0.0000, -0.0000]]],


        [[[-0.0000,  0.1019,  0.0000],
          [ 0.1019,  0.2754, -0.1386],
          [-0.2756, -0.0000,  0.0000]]],


        [[[ 0.0000,  0.0000, -0.0000],
          [-0.0000,  0.0000, -0.0000],
          [ 0.0000,  0.0000,  0.0000]]],


        [[[-0.1820, -0.0000,  0.1641],
          [-0.1137, -0.1801, -0.1784],
          [-0.2672,  0.3220, -0.2941]]],


        [[[-0.3005,  0.1728,  0.1805],
          [ 0.0071,  0.0000, -0.0921],
          [ 0.0000, -0.2779,  0.0085]]],


        [[[ 0.0000,  0.0000, -0.0000],
          [ 0.0000, -0.0000,  0.0000],
          [-0.0000, -0.0000,  0.0000]]]], grad_fn=<MulBackward0>)

In [19]:
# For the pruned conv1, the state_dict lists the *_orig parameters and *_mask buffers
# instead of plain weight / bias entries.
model.state_dict().keys()

odict_keys(['conv1.weight_orig', 'conv1.bias_orig', 'conv1.weight_mask', 'conv1.bias_mask', 'conv2.weight', 'conv2.bias', 'fc1.weight', 'fc1.bias', 'fc2.weight', 'fc2.bias', 'fc3.weight', 'fc3.bias'])

# prune.remove之后，我们发现weight_orig变成了weight, 其实就是把module.weight的值赋值给了weight_orig

运行prune.remove的作用：To make the pruning permanent, remove the re-parametrization in terms of weight_orig and weight_mask, and remove the forward_pre_hook, we can use the remove functionality from torch.nn.utils.prune. Note that this doesn't undo the pruning, as if it never happened. It simply makes it permanent instead, by reassigning the parameter weight to the model parameters, in its pruned version.

In [20]:
# Make the pruning of `weight` permanent: afterwards `weight` is a regular parameter again
# and weight_mask is gone, while `bias` keeps its bias_orig / bias_mask re-parametrization.
prune.remove(module, 'weight')

Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))

## 我们发现weight_orig变成了weight，而且数值也发生了变化

In [21]:
list(module.named_parameters())

[('bias_orig',
  Parameter containing:
  tensor([-0.0568, -0.0444,  0.1789,  0.2256,  0.0646, -0.1924],
         requires_grad=True)),
 ('weight',
  Parameter containing:
  tensor([[[[ 0.0000,  0.0000,  0.0000],
            [ 0.0000, -0.0000, -0.0000],
            [ 0.0000,  0.0000, -0.0000]]],
  
  
          [[[-0.0000,  0.1019,  0.0000],
            [ 0.1019,  0.2754, -0.1386],
            [-0.2756, -0.0000,  0.0000]]],
  
  
          [[[ 0.0000,  0.0000, -0.0000],
            [-0.0000,  0.0000, -0.0000],
            [ 0.0000,  0.0000,  0.0000]]],
  
  
          [[[-0.1820, -0.0000,  0.1641],
            [-0.1137, -0.1801, -0.1784],
            [-0.2672,  0.3220, -0.2941]]],
  
  
          [[[-0.3005,  0.1728,  0.1805],
            [ 0.0071,  0.0000, -0.0921],
            [ 0.0000, -0.2779,  0.0085]]],
  
  
          [[[ 0.0000,  0.0000, -0.0000],
            [ 0.0000, -0.0000,  0.0000],
            [-0.0000, -0.0000,  0.0000]]]], requires_grad=True))]

## buffers里的weight_mask不在了

In [22]:
list(module.named_buffers())

[('bias_mask', tensor([0., 0., 1., 1., 1., 1.]))]

# Pruning multiple parameters in a model

In [23]:
new_model = LeNet()

In [24]:
# Prune only the `weight` of every layer of new_model, with a per-layer-type ratio.
# The loop variable is deliberately not called `module`, so the notebook-level
# `module` handle (model.conv1) is not silently rebound to a layer of new_model.
for layer in new_model.modules():
    if isinstance(layer, torch.nn.Conv2d):  # convolutional layers
        prune.l1_unstructured(layer, name='weight', amount=0.2)  # pruning ratio 0.2
    elif isinstance(layer, torch.nn.Linear):  # fully connected layers
        prune.l1_unstructured(layer, name='weight', amount=0.4)  # pruning ratio 0.4

In [25]:
dict(new_model.named_buffers()).keys()

dict_keys(['conv1.weight_mask', 'conv2.weight_mask', 'fc1.weight_mask', 'fc2.weight_mask', 'fc3.weight_mask'])

# Global Pruning

So far, we only looked at what is usually referred to as "local" pruning, i.e. the practice of pruning tensors in a model one by one, by comparing the statistics (weight magnitude, activation, gradient) of each entry exclusively to the other entries in that tensor. However, a common and perhaps more powerful technique is to prune the model all at once, by removing (for example) the lowest 20% of connections across the whole model, instead of removing the lowest 20% of connections in each layer. This is likely to result in different pruning percentages per layer. Let's see how to do that using global_unstructured from torch.nn.utils.prune.

In [26]:
# Start over with a fresh, unpruned network (this rebinds `model`).
model = LeNet()

## 设定要剪枝的对象

In [27]:
# (module, parameter name) pairs that global pruning will consider together:
# the `weight` of each of the five layers.
parameters_to_prune = tuple(
    (getattr(model, layer_name), 'weight')
    for layer_name in ('conv1', 'conv2', 'fc1', 'fc2', 'fc3')
)

In [28]:
prune.global_unstructured(
    parameters_to_prune,  # passed as a tuple of (module, parameter name) pairs
    pruning_method=prune.L1Unstructured, # L1 unstructured pruning
    amount=0.7 # pruning ratio set to 0.7
)

## 查看每层剪枝比例

In [29]:
# Report, for each pruned layer, the percentage of weight entries that are exactly zero.
# One loop replaces five copy-pasted print statements; the printed text is unchanged.
for layer_name in ('conv1', 'conv2', 'fc1', 'fc2', 'fc3'):
    layer_weight = getattr(model, layer_name).weight
    print("Sparsity in {}.weight: {:.2f}%".format(
        layer_name,
        100.0 * float(torch.sum(layer_weight == 0)) / float(layer_weight.nelement()),
    ))

Sparsity in conv1.weight: 12.96%
Sparsity in conv2.weight: 26.97%
Sparsity in fc1.weight: 77.06%
Sparsity in fc2.weight: 43.20%
Sparsity in fc3.weight: 36.19%


## 统计全局剪枝比例是否和开始设置的值对上了

In [30]:
# Overall sparsity: zero-valued entries summed over every pruned tensor,
# divided by the total number of entries in those tensors.
print("Global sparsity: {:.2f}%".format(
    100.0
    * float(sum(torch.sum(getattr(layer, attr) == 0) for layer, attr in parameters_to_prune))
    / float(sum(getattr(layer, attr).nelement() for layer, attr in parameters_to_prune))
))

Global sparsity: 70.00%


# 计算0值的个数，结果看来是吻合的，计算结果为12.96%稀疏度，在conv1层的weight上

In [31]:
torch.sum(model.conv1.weight == 0)

tensor(7)

In [32]:
model.conv1.weight.nelement()

54

In [33]:
# Compute the conv1 sparsity from the tensor itself rather than from literals copied
# out of the two previous outputs: the weights are randomly initialised, so
# hard-coded counts would go stale on a re-run.
float(torch.sum(model.conv1.weight == 0)) / model.conv1.weight.nelement() * 100

12.962962962962962

# 在保存模型之前，对每个layer运行prune.remove操作

In [34]:
# Make the pruning permanent on every globally pruned layer before saving.
# Loop names are chosen so they do not rebind the notebook-level `module` / `name`.
for layer, param_name in parameters_to_prune:
    print(layer, param_name)
    prune.remove(layer, param_name)

Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1)) weight
Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1)) weight
Linear(in_features=400, out_features=120, bias=True) weight
Linear(in_features=120, out_features=84, bias=True) weight
Linear(in_features=84, out_features=10, bias=True) weight


## 我们发现weight_orig变成了weight,以及buffers消失了

In [35]:
list(model.conv1.named_parameters())

[('bias',
  Parameter containing:
  tensor([ 0.2713, -0.2962,  0.1615, -0.0092,  0.0532,  0.2505],
         requires_grad=True)),
 ('weight',
  Parameter containing:
  tensor([[[[-0.2913, -0.0000, -0.0000],
            [-0.2775, -0.0000, -0.2242],
            [ 0.2880,  0.1896,  0.0476]]],
  
  
          [[[-0.2580,  0.2894, -0.2938],
            [ 0.0835,  0.0000,  0.1750],
            [ 0.1294,  0.1813, -0.2590]]],
  
  
          [[[ 0.3096,  0.2377, -0.3243],
            [ 0.2653,  0.2832, -0.0856],
            [-0.0602,  0.2314,  0.1457]]],
  
  
          [[[ 0.0613, -0.2724, -0.0000],
            [ 0.0730, -0.1033, -0.1826],
            [-0.2681,  0.2517, -0.0915]]],
  
  
          [[[ 0.2669, -0.1155, -0.0000],
            [-0.1073, -0.3000,  0.2573],
            [ 0.0000, -0.0541, -0.0402]]],
  
  
          [[[-0.2933,  0.3057, -0.1105],
            [ 0.2823,  0.1902, -0.1146],
            [ 0.0639,  0.2808, -0.2625]]]], requires_grad=True))]

In [36]:
list(model.conv1.named_buffers())

[]

# 保存模型以及比较.pth压缩后.zip格式模型的大小

In [37]:
torch.save(model.state_dict(), 'sparse_model.pth') # this way only the weights (the state_dict) are saved

## 计算下剪枝后模型压缩比例

In [38]:
# NOTE(review): both sizes were typed in by hand and their unit is not stated — TODO confirm.
(241.8 - 91.1) / 241.8  # the first value is the size before compression, the second after

0.6232423490488007

In [39]:
model.conv1.weight

Parameter containing:
tensor([[[[-0.2913, -0.0000, -0.0000],
          [-0.2775, -0.0000, -0.2242],
          [ 0.2880,  0.1896,  0.0476]]],


        [[[-0.2580,  0.2894, -0.2938],
          [ 0.0835,  0.0000,  0.1750],
          [ 0.1294,  0.1813, -0.2590]]],


        [[[ 0.3096,  0.2377, -0.3243],
          [ 0.2653,  0.2832, -0.0856],
          [-0.0602,  0.2314,  0.1457]]],


        [[[ 0.0613, -0.2724, -0.0000],
          [ 0.0730, -0.1033, -0.1826],
          [-0.2681,  0.2517, -0.0915]]],


        [[[ 0.2669, -0.1155, -0.0000],
          [-0.1073, -0.3000,  0.2573],
          [ 0.0000, -0.0541, -0.0402]]],


        [[[-0.2933,  0.3057, -0.1105],
          [ 0.2823,  0.1902, -0.1146],
          [ 0.0639,  0.2808, -0.2625]]]], requires_grad=True)