# Inference Optimization for Convolutional Networks
### Part 1: Model fusion, quantization

In [1]:
# Import packages 
from torch import nn
from torchsummary import summary
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


### Notebook overview 
- Create a CNN model and a quantized version of the same model
- Compare the size and latency of the two models
- Fuse several layers into single blocks
- Compare the fused and quantized version with the fused-only version

### Create simple CNN

In [2]:
class Net(nn.Module):
    """Small CNN: two conv -> ReLU -> max-pool blocks followed by two fully connected layers.

    Args:
        input_size: ``(height, width)`` of the input images. The default
            ``(224, 224)`` reproduces the original flattened feature size of
            ``50 * 53 * 53``.
        num_classes: Number of outputs of the last linear layer (default 10).

    Raises:
        ValueError: If ``input_size`` is too small to survive both conv/pool blocks.
    """

    def __init__(self, input_size=(224, 224), num_classes=10):
        super().__init__()

        # Convolutional Block 1
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=20, kernel_size=(5, 5))
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        # Convolutional Block 2
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=(5, 5))
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        # Fully connected 1 -- input width is derived from the image size
        # instead of being hard-coded for 224x224 inputs
        self.fc1 = nn.Linear(in_features=self._flattened_features(input_size), out_features=500)
        self.relu3 = nn.ReLU()

        # Fully connected 2
        self.fc2 = nn.Linear(in_features=500, out_features=num_classes)
        self.Softmax = nn.Softmax(1)

    @staticmethod
    def _flattened_features(input_size):
        """Return the number of features reaching ``fc1`` for images of ``input_size``."""

        def reduced(side):
            # Each block: a 5x5 convolution without padding removes 4 pixels,
            # then 2x2 pooling with stride 2 halves the side (rounding down).
            for _ in range(2):
                side = (side - 4) // 2
            return side

        height, width = (reduced(int(side)) for side in input_size)
        if height < 1 or width < 1:
            raise ValueError(
                f"input_size {tuple(input_size)} is too small for two 5x5 conv + 2x2 pool blocks"
            )
        # 50 = number of output channels of conv2
        return 50 * height * width

    def forward(self, x):
        """Run a batch of images through the network and return softmax probabilities over dim 1."""
        # pass the input through block 1
        x = self.maxpool1(self.relu1(self.conv1(x)))

        # pass the input through block 2
        x = self.maxpool2(self.relu2(self.conv2(x)))

        # flatten everything except the batch dimension and pass it through fully connected 1
        x = torch.flatten(x, 1)
        x = self.relu3(self.fc1(x))

        # pass the input through fully connected 2 and Softmax
        return self.Softmax(self.fc2(x))

In [3]:
# Same layers as Net, plus QuantStub/DeQuantStub marking where tensors enter and leave the quantized region

class NetQuant(nn.Module):
    """Quantization-ready variant of the CNN: same layers, wrapped by quant/dequant stubs.

    The input passes through ``QuantStub`` first and the output of ``fc2``
    passes through ``DeQuantStub``, so the final softmax runs on the
    dequantized tensor.

    Args:
        input_size: ``(height, width)`` of the input images. The default
            ``(224, 224)`` reproduces the original flattened feature size of
            ``50 * 53 * 53``.
        num_classes: Number of outputs of the last linear layer (default 10).

    Raises:
        ValueError: If ``input_size`` is too small to survive both conv/pool blocks.
    """

    def __init__(self, input_size=(224, 224), num_classes=10):
        super().__init__()
        # Prepare for quantization
        self.quant = torch.quantization.QuantStub()

        # Convolutional Block 1
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=20, kernel_size=(5, 5))
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        # Convolutional Block 2
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=(5, 5))
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        # Fully connected 1 -- input width is derived from the image size
        # instead of being hard-coded for 224x224 inputs
        self.fc1 = nn.Linear(in_features=self._flattened_features(input_size), out_features=500)
        self.relu3 = nn.ReLU()

        # Fully connected 2
        self.fc2 = nn.Linear(in_features=500, out_features=num_classes)
        self.Softmax = nn.Softmax(1)

        # Prepare for dequantization
        self.dequant = torch.quantization.DeQuantStub()

    @staticmethod
    def _flattened_features(input_size):
        """Return the number of features reaching ``fc1`` for images of ``input_size``."""

        def reduced(side):
            # Each block: a 5x5 convolution without padding removes 4 pixels,
            # then 2x2 pooling with stride 2 halves the side (rounding down).
            for _ in range(2):
                side = (side - 4) // 2
            return side

        height, width = (reduced(int(side)) for side in input_size)
        if height < 1 or width < 1:
            raise ValueError(
                f"input_size {tuple(input_size)} is too small for two 5x5 conv + 2x2 pool blocks"
            )
        # 50 = number of output channels of conv2
        return 50 * height * width

    def forward(self, x):
        """Run a batch of images through the network and return softmax probabilities over dim 1."""
        x = self.quant(x)

        # pass the input through block 1
        x = self.maxpool1(self.relu1(self.conv1(x)))

        # pass the input through block 2
        x = self.maxpool2(self.relu2(self.conv2(x)))

        # flatten everything except the batch dimension and pass it through fully connected 1
        x = torch.flatten(x, 1)
        x = self.relu3(self.fc1(x))

        # pass the input through fully connected 2, leave the quantized region, then Softmax
        x = self.dequant(self.fc2(x))
        return self.Softmax(x)

In [4]:
# Instantiate the original and the quantization-ready model and switch both to evaluation mode

net = Net().eval()
net_quant = NetQuant().eval()

# Last expression of the cell: displays the layer summary of the quantization-ready model
net_quant

NetQuant(
  (quant): QuantStub()
  (conv1): Conv2d(3, 20, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxpool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxpool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=140450, out_features=500, bias=True)
  (relu3): ReLU()
  (fc2): Linear(in_features=500, out_features=10, bias=True)
  (Softmax): Softmax(dim=1)
  (dequant): DeQuantStub()
)

In [5]:
# Post-training static quantization: attach a qconfig, insert observers, calibrate, convert
torch.backends.quantized.engine = "fbgemm"
net_quant.qconfig = torch.quantization.get_default_qconfig("fbgemm")
net_quant = torch.quantization.prepare(net_quant.cpu(), inplace=False)

# Calibration pass: the observers inserted by `prepare` need to see activations
# before `convert`; without it they have no statistics to derive scale/zero-point from.
# Random data is sufficient here because only size and latency are compared, not accuracy.
with torch.no_grad():
    net_quant(torch.rand(8, 3, 224, 224))

net_quant = torch.quantization.convert(net_quant, inplace=False)



### Check size

In [6]:
# Check model size
def print_model_size(mdl):
    """Return the on-disk size of ``mdl.state_dict()`` in megabytes.

    Despite its name the function prints nothing: it serializes the state
    dict with ``torch.save`` and returns the file size in units of 10**6
    bytes, rounded to the nearest integer.

    The checkpoint is written into a private temporary directory, so an
    existing ``tmp.pt`` in the working directory is never overwritten or
    deleted, and the scratch file is cleaned up even if measuring fails.

    Args:
        mdl: Any object exposing ``state_dict()`` (e.g. an ``nn.Module``).

    Returns:
        int: Serialized size in MB.
    """
    import tempfile  # local import keeps this cell self-contained

    with tempfile.TemporaryDirectory() as scratch_dir:
        checkpoint_path = os.path.join(scratch_dir, "tmp.pt")
        torch.save(mdl.state_dict(), checkpoint_path)
        size_bytes = os.path.getsize(checkpoint_path)
    return round(size_bytes / 1e6)

# Measure both serialized models, then report the absolute sizes and their ratio
net_size, quant_size = print_model_size(net), print_model_size(net_quant)
size_ratio = round(net_size / quant_size, 2)

print(f'Size whitout quantization: {net_size} MB \n Size whit quantization: {quant_size} MB')
print(f'Size ratio: {size_ratio}')

Size whitout quantization: 281 MB 
 Size whit quantization: 70 MB
Size ratio: 4.01


## Latency

In [7]:
# input for the model
inpp = torch.rand(32, 3, 224, 224)
# compare the performance
print("Floating point FP32")
%timeit net(inpp)

print("Quantized INT8")
%timeit net_quant(inpp)


Floating point FP32
162 ms ± 6.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Quantized INT8
94.8 ms ± 15.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Fusion

In [8]:
# Start from fresh, unquantized instances of both models (evaluation mode) for the fusion experiment

net = Net().eval()
net_quant = NetQuant().eval()

# Last expression of the cell: displays the layer summary of the quantization-ready model
net_quant

NetQuant(
  (quant): QuantStub()
  (conv1): Conv2d(3, 20, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxpool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxpool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=140450, out_features=500, bias=True)
  (relu3): ReLU()
  (fc2): Linear(in_features=500, out_features=10, bias=True)
  (Softmax): Softmax(dim=1)
  (dequant): DeQuantStub()
)

In [9]:
# Prepare the layer groups for fusion: each conv / linear layer is paired with the ReLU that follows it

moduls_to_fuse = [
    ['conv1', 'relu1'],
    ['conv2', 'relu2'],
    ['fc1', 'relu3'],
]

# Fuse the same groups in the quantization-ready model and in the plain float model
net_quant_fused, net_fused = (
    torch.quantization.fuse_modules(model, moduls_to_fuse)
    for model in (net_quant, net)
)

In [10]:
# Prepare, calibrate and quantize the fused model

torch.backends.quantized.engine = "fbgemm"
net_quant_fused.qconfig = torch.quantization.get_default_qconfig("fbgemm")
net_quant_fused = torch.quantization.prepare(net_quant_fused.cpu(), inplace=False)

# Calibration pass: the observers inserted by `prepare` need to see activations
# before `convert`; without it they have no statistics to derive scale/zero-point from.
# Random data is sufficient here because only latency is compared, not accuracy.
with torch.no_grad():
    net_quant_fused(torch.rand(8, 3, 224, 224))

net_quant_fused = torch.quantization.convert(net_quant_fused, inplace=False)

In [11]:
print("Fused and quantized model latency")
%timeit net_quant_fused(inpp)

print("Fused model latency")
%timeit net_fused(inpp)

Fused and quantized model latency
91.7 ms ± 15.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Fused model latency
178 ms ± 9.98 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
