1. train되어 있는 model에 s_j 기준 내림차순 정렬
2. 하위 channel 제거
3. 40 epoch retraining
4. Test Accuracy, #FLOPs

In [1]:
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import torch.nn as nn
import tensorflow as tf

import torch.optim as optim
from torch.optim import lr_scheduler
from pytorch_model_summary import summary
import torch.nn.init as init
import kornia
import math
import torch.profiler as profiler

import sys
sys.path.append('/home/hslee/Desktop/Embedded_AI/PyTorch_Tutorials/03_Pruning_Filters_for_Efficient_Convnets/')
from architecture import *
from utils import *

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

2024-02-06 16:51:11.341888: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-06 16:51:11.543223: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


cuda:0


# 1. One shot pruning

In [2]:
model = VGG16_BN()
checkpoint = torch.load('../00_vgg16_baseline_exp1/checkpoint/best_model.pth')
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [3]:
layer_idx = 1
for layer in model.modules():
    if isinstance(layer, nn.Conv2d):
        print(f"conv {layer_idx} : {layer.weight.data.shape}")
        layer_idx += 1
    if isinstance(layer, nn.Linear):
        print(f"fc : {layer.weight.data.shape}")
        layer_idx += 1

conv 1 : torch.Size([64, 3, 3, 3])
conv 2 : torch.Size([64, 64, 3, 3])
conv 3 : torch.Size([128, 64, 3, 3])
conv 4 : torch.Size([128, 128, 3, 3])
conv 5 : torch.Size([256, 128, 3, 3])
conv 6 : torch.Size([256, 256, 3, 3])
conv 7 : torch.Size([256, 256, 3, 3])
conv 8 : torch.Size([512, 256, 3, 3])
conv 9 : torch.Size([512, 512, 3, 3])
conv 10 : torch.Size([512, 512, 3, 3])
conv 11 : torch.Size([512, 512, 3, 3])
conv 12 : torch.Size([512, 512, 3, 3])
conv 13 : torch.Size([512, 512, 3, 3])
fc : torch.Size([512, 512])
fc : torch.Size([10, 512])


In [4]:
layer = getattr(model, f"fc1")
print(layer)

fc_layer = layer[1]
print(fc_layer.weight.data.shape)


Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=512, out_features=512, bias=True)
  (2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (3): ReLU(inplace=True)
)
torch.Size([512, 512])


In [5]:
new_pruned_model = getOneShotPrunedModel(model)

pruned_rate : 50%
num_prune_channels : 32
weight.shape : torch.Size([64, 3, 3, 3])
bias.shape : torch.Size([64])
bn_gamma.shape : torch.Size([64])
bn_beta.shape : torch.Size([64])
bn_running_mean.shape : torch.Size([64])
bn_running_var.shape : torch.Size([64])
sorted_weight_indices : tensor([30, 24, 32, 22, 47, 41, 49, 53, 51, 56, 23,  0, 15, 57, 50,  7, 58, 36,
         2,  4, 12, 31, 17, 16, 63, 21,  1, 33, 48,  9, 43, 13, 37, 52, 34, 27,
        10, 62, 60, 46, 59, 39, 18, 40, 55, 29,  3,  8, 19, 54, 35, 20, 28,  6,
        11, 44, 14, 61, 25, 38, 45,  5, 26, 42])
saving_filter_idices : tensor([30, 24, 32, 22, 47, 41, 49, 53, 51, 56, 23,  0, 15, 57, 50,  7, 58, 36,
         2,  4, 12, 31, 17, 16, 63, 21,  1, 33, 48,  9, 43, 13])
pruned_weight.shape : torch.Size([32, 3, 3, 3])
pruned_bias.shape : torch.Size([32])
pruned_bn_gamma.shape : torch.Size([32])
pruned_bn_beta.shape : torch.Size([32])
pruned_bn_running_mean.shape : torch.Size([32])
pruned_bn_running_var.shape : torch.Size([32

In [6]:
new_pruned_model.to(device)
showNewPrunedModel(new_pruned_model)

------------------------------------------------------------------------
      Layer (type)          Input Shape         Param #     Tr. Param #
          Conv2d-1       [1, 3, 32, 32]             896             896
     BatchNorm2d-2      [1, 32, 32, 32]              64              64
            ReLU-3      [1, 32, 32, 32]               0               0
          Conv2d-4      [1, 32, 32, 32]          18,496          18,496
     BatchNorm2d-5      [1, 64, 32, 32]             128             128
            ReLU-6      [1, 64, 32, 32]               0               0
       MaxPool2d-7      [1, 64, 32, 32]               0               0
          Conv2d-8      [1, 64, 16, 16]          73,856          73,856
     BatchNorm2d-9     [1, 128, 16, 16]             256             256
           ReLU-10     [1, 128, 16, 16]               0               0
         Conv2d-11     [1, 128, 16, 16]         147,584         147,584
    BatchNorm2d-12     [1, 128, 16, 16]             256        

# 3. Test Accuracy, #FLOPs

In [7]:

val_loader, tesize = loadValDataset()

# load best model
model = VGG16_BN_PRUNE_FOR_SCRATCH()
checkpoint = torch.load('checkpoint/best_model.pth')
model.load_state_dict(checkpoint['model_state_dict'])
top1_acc, top5_acc = testAccuracy(model, val_loader)
print(f"Top1 Accuracy : {top1_acc:.2f}, Top5 Accuracy : {top5_acc:.2f}")

Files already downloaded and verified
# test datas : 10000
Top1 Accuracy : 91.47, Top5 Accuracy : 99.28


In [8]:
# check the number of parameters
num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of Parameters : {num_parameters}")

Number of Parameters : 5399690


In [9]:
from torch.profiler import profile, record_function, ProfilerActivity
import torch

# Define your model and device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Sample input data
input_data = torch.randn(1, 3, 32, 32).to(device)

# Profiler를 사용하여 FLOPS 계산
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("model_inference"):
        model(input_data)
        
# Print the profile results
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
# flops
print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference        17.93%     899.000us        88.24%       4.425ms       4.425ms       0.000us         0.00%     328.000us     328.000us             1  
                                      aten::convolution         1.66%      83.000us        43.35%       2.174ms     167.231us       0.000us         0.00%     270.000us      20.769us            13  
         

STAGE:2024-02-06 16:51:24 2771716:2771716 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-02-06 16:51:24 2771716:2771716 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-02-06 16:51:24 2771716:2771716 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [10]:
print("the number of flops before pruning")
! flopth -m VGG16_BN -p ../architecture -i 3 32 32

the number of flops before pruning
Op Flatten is not supported at now, set FLOPs of it to zero.
+---------------+---------------+-------------+-------------+------------------------------+----------+------------------+----------------------+----------+-----------------+---------------------+
| module_name   | module_type   | in_shape    | out_shape   | kernel_size,padding,stride   | params   | params_percent   | params_percent_vis   | flops    | flops_percent   | flops_percent_vis   |
| conv1.0       | Conv2d        | (3,32,32)   | (64,32,32)  | k=(3, 3), p=(1, 1), s=(1, 1) | 1.792K   | 0.0119531%       |                      | 1.83501M | 0.583105%       |                     |
+---------------+---------------+-------------+-------------+------------------------------+----------+------------------+----------------------+----------+-----------------+---------------------+
| conv1.1       | BatchNorm2d   | (64,32,32)  | (64,32,32)  |                              | 128.0    | 0.000853792%

In [11]:
print("the number of flops after pruning")
! flopth -m VGG16_BN_PRUNE_FOR_SCRATCH -p ../architecture -i 3 32 32

the number of flops after pruning
Op Flatten is not supported at now, set FLOPs of it to zero.
+---------------+---------------+-------------+-------------+------------------------------+----------+------------------+----------------------+----------+-----------------+---------------------+
| module_name   | module_type   | in_shape    | out_shape   | kernel_size,padding,stride   | params   | params_percent   | params_percent_vis   | flops    | flops_percent   | flops_percent_vis   |
| conv1.0       | Conv2d        | (3,32,32)   | (32,32,32)  | k=(3, 3), p=(1, 1), s=(1, 1) | 896.0    | 0.0165935%       |                      | 917.504K | 0.442566%       |                     |
+---------------+---------------+-------------+-------------+------------------------------+----------+------------------+----------------------+----------+-----------------+---------------------+
| conv1.1       | BatchNorm2d   | (32,32,32)  | (32,32,32)  |                              | 64.0     | 0.00118525%  

In [12]:
num_flops_before_pruning = 314696000
num_flops_after_pruning = 207314000

print(f"reduced percentage : {(num_flops_before_pruning - num_flops_after_pruning) / num_flops_before_pruning * 100:.2f}%")

reduced percentage : 34.12%
