# Inference Optimization for Convolutional Netwroks
### Model fusion, quantization

### TODO: Pruning

In [50]:
from torchvision.models import resnet50, quantization
import torch
from torch import nn
from torchsummary import summary
import os
from torchvision.datasets import CIFAR10

### To try
- Create model, fuse and quantizaiton, test on speed and size
- Explanation and test of accuracy on resnet50

### Create model, fuse and quantizaiton, test on speed and size

In [5]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        # initialize first set of CONV => RELU => POOL layers
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=20,kernel_size=(5, 5))
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        
        # initialize second set of CONV => RELU => POOL layers
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=(5, 5))
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        
        # initialize first (and only) set of FC => RELU layers
        self.fc1 = nn.Linear(in_features=50*53*53, out_features=500)
        self.relu3 = nn.ReLU()
        
        # initialize our softmax classifier
        self.fc2 = nn.Linear(in_features=500, out_features=10)
        self.Softmax = nn.Softmax(1)
        
    def forward(self, x):
        # pass the input through our first set of CONV => RELU => POOL layers
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)
        
        # pass the output from the previous layer through the second set of CONV => RELU => POOL layers
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.maxpool2(x)
        
        # flatten the output from the previous layer and pass it through our only set of FC => RELU layers
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = self.relu3(x)
        
        # pass the output to our softmax classifier to get our output predictions
        x = self.fc2(x)
        output = self.Softmax(x)
        return output

In [24]:
# changes in network

class NetQuant(nn.Module):
    def __init__(self):
        super(NetQuant, self).__init__()
        self.quant = torch.quantization.QuantStub()
        # initialize first set of CONV => RELU => POOL layers
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=20,kernel_size=(5, 5))
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        
        # initialize second set of CONV => RELU => POOL layers
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=(5, 5))
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        
        # initialize first (and only) set of FC => RELU layers
        self.fc1 = nn.Linear(in_features=50*53*53, out_features=500)
        self.relu3 = nn.ReLU()
        
        # initialize our softmax classifier
        self.fc2 = nn.Linear(in_features=500, out_features=10)
        self.dequant = torch.quantization.DeQuantStub()
        self.Softmax = nn.Softmax(1)
        self.dequant = torch.quantization.DeQuantStub()
        
    def forward(self, x):
        #x = x.contiguous(memory_format=torch.channels_last)
        x = self.quant(x)
        # pass the input through our first set of CONV => RELU => POOL layers
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)
        
        # pass the output from the previous layer through the second set of CONV => RELU => POOL layers
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.maxpool2(x)
        
        # flatten the output from the previous layer and pass it through our only set of FC => RELU layers
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = self.relu3(x)
        
        # pass the output to our softmax classifier to get our output predictions
        x = self.fc2(x)
        x = self.dequant(x)
        x = self.Softmax(x)
        
        return x

In [35]:
net = Net()
net.eval()
net_quant = NetQuant()
net_quant.eval()

NetQuant(
  (quant): QuantStub()
  (conv1): Conv2d(3, 20, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxpool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxpool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=140450, out_features=500, bias=True)
  (relu3): ReLU()
  (fc2): Linear(in_features=500, out_features=10, bias=True)
  (dequant): DeQuantStub()
  (Softmax): Softmax(dim=1)
)

In [26]:
net_quant.qconfig = torch.quantization.get_default_qconfig("fbgemm")
torch.backends.quantized.engine = "fbgemm"
net_quant = torch.quantization.prepare(net_quant.cpu(), inplace=False)
net_quant = torch.quantization.convert(net_quant, inplace=False)

### Check size

In [15]:
summary(net.cuda(), (3, 224,224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 20, 220, 220]           1,520
              ReLU-2         [-1, 20, 220, 220]               0
         MaxPool2d-3         [-1, 20, 110, 110]               0
            Conv2d-4         [-1, 50, 106, 106]          25,050
              ReLU-5         [-1, 50, 106, 106]               0
         MaxPool2d-6           [-1, 50, 53, 53]               0
            Linear-7                  [-1, 500]      70,225,500
              ReLU-8                  [-1, 500]               0
            Linear-9                   [-1, 10]           5,010
          Softmax-10                   [-1, 10]               0
Total params: 70,257,080
Trainable params: 70,257,080
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 26.27
Params size (MB): 268.01
Es

In [9]:
summary(net_quant.cuda(), (3, 224,224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
         QuantStub-1          [-1, 3, 224, 224]               0
            Conv2d-2         [-1, 20, 220, 220]           1,520
              ReLU-3         [-1, 20, 220, 220]               0
         MaxPool2d-4         [-1, 20, 110, 110]               0
            Conv2d-5         [-1, 50, 106, 106]          25,050
              ReLU-6         [-1, 50, 106, 106]               0
         MaxPool2d-7           [-1, 50, 53, 53]               0
            Linear-8                  [-1, 500]      70,225,500
              ReLU-9                  [-1, 500]               0
           Linear-10                   [-1, 10]           5,010
          Softmax-11                   [-1, 10]               0
      DeQuantStub-12                   [-1, 10]               0
Total params: 70,257,080
Trainable params: 70,257,080
Non-trainable params: 0
-------------------------

In [27]:
# Check model size

def print_model_size(mdl):
    torch.save(mdl.state_dict(), "tmp.pt")
    print("%.2f MB" %(os.path.getsize("tmp.pt")/1e6))
    os.remove('tmp.pt')

print_model_size(net)

281.03 MB


In [28]:
print_model_size(net_quant)

70.28 MB


In [59]:
281.03/70.26

3.999857671505835

## Latency

In [22]:
inpp = torch.rand(32, 3, 224, 224)

In [30]:
print("Quantized INT8")
%timeit net_quant(inpp)

Quantized INT8
85.3 ms ± 2.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
# compare the performance
print("Floating point FP32")
%timeit net(inpp)

Floating point FP32
151 ms ± 4.67 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Fusion

Finally, to get the most performance out of static quantization, you need to also use module fusion. Module fusion is the technique of combining ("fusing") sequences of high-level layers, e.g. Conv2d + Batchnorm, into a single combined layer. This improves performance by pushing the combined sequence of operations into the low-level library, allowing it to be computed in one shot, e.g. without having to surface an intermediate representation back to the PyTorch Python process. This speeds things up and leads to more accurate results, albeit at the cost of debuggability.

In [42]:
net = Net()
net.eval()
net_quant = NetQuant()
net_quant.eval()

NetQuant(
  (quant): QuantStub()
  (conv1): Conv2d(3, 20, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxpool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxpool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=140450, out_features=500, bias=True)
  (relu3): ReLU()
  (fc2): Linear(in_features=500, out_features=10, bias=True)
  (dequant): DeQuantStub()
  (Softmax): Softmax(dim=1)
)

In [45]:
moduls_to_fuse =  [['conv1', 'relu1'], 
                   ['conv2', 'relu2'], 
                   ['fc1', 'relu3']]

net_quant_fused = torch.quantization.fuse_modules(net_quant, moduls_to_fuse)

net_fused = torch.quantization.fuse_modules(net, moduls_to_fuse)

In [46]:
net_quant_fused.qconfig = torch.quantization.get_default_qconfig("fbgemm")
torch.backends.quantized.engine = "fbgemm"
net_quant_fused = torch.quantization.prepare(net_quant_fused.cpu(), inplace=False)
net_quant_fused = torch.quantization.convert(net_quant_fused, inplace=False)



In [47]:
print("Fused and quantized model latency")
%timeit net_quant_fused(inpp)

Fused and quantized model latency
81 ms ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [48]:
print("Fused model latency")
%timeit net_fused(inpp)

Fused model latency
146 ms ± 3.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


yeah it is not supported on CUDA, quantized::linear_dynamic is only supported in CPU. We do not have immediate plans to support CUDA but we plan to publish a doc for custom backends which will make the extension easier.

https://pytorch.org/blog/introduction-to-quantization-on-pytorch/

### only static is supported

diff between dinamic and static?


# Accuracy

In [49]:
model_resnet50 = resnet50(pretrained=True)
model_resnet50_quant = quantization.resnet50(pretrained=True)

In [52]:
dataset = CIFAR10('.', train=False, download=True)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar-10-python.tar.gz


170499072it [00:19, 8590075.64it/s]                                             


Extracting ./cifar-10-python.tar.gz to .


In [54]:
data_loader = torch.utils.data.DataLoader(dataset,
                                          batch_size=64,
                                          shuffle=True)

In [55]:
for i in data_loader:
    print(i)
    break

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.Image.Image'>