# Static Quantization

Quantization

In [1]:
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2

import warnings
warnings.filterwarnings('ignore')

import os    
os.environ['KMP_DUPLICATE_LIB_OK']='True'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def print_model_size(model):
    torch.save(model.state_dict(), "tmp.pt")
    print("%.2f MB" %(os.path.getsize("tmp.pt")/1e6))
    os.remove('tmp.pt')

In [3]:
import torchvision
import torch
import torch.nn as nn
from torchvision.models.detection import SSD300_VGG16_Weights


model = torchvision.models.detection.ssd300_vgg16(weights=SSD300_VGG16_Weights)
model
# model.eval()
# x = [torch.rand(3, 300, 300), torch.rand(3, 500, 400)]
# predictions = model(x)
# predictions

SSD(
  (backbone): SSDFeatureExtractorVGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=

In [4]:
class CoCo_TestDataset(Dataset):
    def __init__(self, rootDir, folder, tf=None):
        """Dataset class for CoCo data

        Args:
            rootDir (str): path to directory containing CoCo image data
            folder (str) : 'train' or 'val' folder
            tf (optional): transformation to apply. Defaults to None
        """        
        self.rootDir = rootDir
        self.folder = folder
        self.transform = tf

        # read rgb image list
        sourceImgFolder =  os.path.join(self.rootDir, self.folder)
        self.sourceImgFiles  = [os.path.join(sourceImgFolder, x) for x in sorted(os.listdir(sourceImgFolder))]
    
    def __len__(self):
        return len(self.sourceImgFiles)
  
    def __getitem__(self, index):
        # read source image and convert to RGB, apply transform
        sourceImage = cv2.imread(f"{self.sourceImgFiles[index]}", -1)
        sourceImage = cv2.cvtColor(sourceImage, cv2.COLOR_BGR2RGB)
        if self.transform is not None:
            sourceImage = self.transform(sourceImage)

        return sourceImage

In [5]:
tf = transforms.Compose([
        transforms.ToPILImage(),
        # transforms.Resize((320, 320)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.56, 0.406), std=(0.229, 0.224, 0.225))
        ])

TEST_BATCH_SIZE = 1
num_workers = 8

# Creating Test set and Test Dataloaders
test_set = CoCo_TestDataset(rootDir= 'data/coco/', folder='test2017_sub', tf = tf)
# test_set = torch.utils.data.Subset(test_set, indices=np.arange(32))
# test_loader  = DataLoader(test_set, batch_size=TEST_BATCH_SIZE)
test_sampler = torch.utils.data.SequentialSampler(test_set)


# train_sampler = torch.utils.data.RandomSampler(train_set)
# test_sampler = torch.utils.data.SequentialSampler(test_set)

# train_loader = torch.utils.data.DataLoader(
#     dataset=train_set, batch_size=32,
#     sampler=train_sampler, num_workers=num_workers)

test_loader = torch.utils.data.DataLoader(
    dataset=test_set, batch_size=TEST_BATCH_SIZE,
    sampler=test_sampler, num_workers=num_workers)

## 3. Fusion layer

In [6]:
import copy

model_fp32 = copy.deepcopy(model)
model_fp32.eval()

SSD(
  (backbone): SSDFeatureExtractorVGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=

In [7]:
# Fuse the activations to preceding layers, where applicable.
# This needs to be done manually depending on the model architecture.
# Common fusions include `conv + relu` and `conv + batchnorm + relu`
modules_to_fuse = [ 
    ["backbone.features.0", "backbone.features.1"],
    ["backbone.features.2", "backbone.features.3"],
    ["backbone.features.5", "backbone.features.6"],
    ["backbone.features.7", "backbone.features.8"],
    ["backbone.features.10", "backbone.features.11"],
    ["backbone.features.12", "backbone.features.13"],
    ["backbone.features.14", "backbone.features.15"],
    ["backbone.features.17", "backbone.features.18"],
    ["backbone.features.19", "backbone.features.20"],
    ["backbone.features.21", "backbone.features.22"],
    
    ["backbone.extra.0.1", "backbone.extra.0.2"],
    ["backbone.extra.0.3", "backbone.extra.0.4"],
    ["backbone.extra.0.5", "backbone.extra.0.6"],
    ["backbone.extra.0.7.1", "backbone.extra.0.7.2"],
    ["backbone.extra.0.7.3", "backbone.extra.0.7.4"],
    
    ["backbone.extra.1.0", "backbone.extra.1.1"],
    ["backbone.extra.1.2", "backbone.extra.1.3"],
    
    ["backbone.extra.2.0", "backbone.extra.2.1"],
    ["backbone.extra.2.2", "backbone.extra.2.3"],

    ["backbone.extra.3.0", "backbone.extra.3.1"],
    ["backbone.extra.3.2", "backbone.extra.3.3"],

    ["backbone.extra.4.0", "backbone.extra.4.1"],
    ["backbone.extra.4.2", "backbone.extra.4.3"]

    ]


torch.quantization.fuse_modules(model_fp32, modules_to_fuse, inplace=True)
model_fp32

SSD(
  (backbone): SSDFeatureExtractorVGG(
    (features): Sequential(
      (0): ConvReLU2d(
        (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (1): Identity()
      (2): ConvReLU2d(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (3): Identity()
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): ConvReLU2d(
        (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (6): Identity()
      (7): ConvReLU2d(
        (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (8): Identity()
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): ConvReLU2d(
        (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1

## fused model equivalence

In [8]:
model.eval()
model_fp32.eval()

# assert helper.model_equivalence(model_1=model_fp32, model_2=fused_model_fp32, device=cuda_device, rtol=1e-05, atol=1e-05, num_tests=100, input_size=(1,3,320,320)), "Fused model is not equivalent to the original model!"

SSD(
  (backbone): SSDFeatureExtractorVGG(
    (features): Sequential(
      (0): ConvReLU2d(
        (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (1): Identity()
      (2): ConvReLU2d(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (3): Identity()
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): ConvReLU2d(
        (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (6): Identity()
      (7): ConvReLU2d(
        (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (8): Identity()
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): ConvReLU2d(
        (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1

## 4. Static Quantization

In [9]:
from torch.quantization import QuantStub, DeQuantStub

class Quantizedmodel(nn.Module):
    def __init__(self, model_fp32):
        super(Quantizedmodel, self).__init__()
        self.quant = QuantStub()
        self.dequant = DeQuantStub()
        self.model_fp32 = model_fp32
    
    def forward(self, x):        
        # x = self.quant(x)
        
        # x = self.dequant(x)
        x = self.model_fp32(x)
        return x

In [10]:
# def calibrate_model(model, loader, device=torch.device("cpu:0")):

#     model.to(device)
#     model.eval()

#     for inputs, labels in loader:
#         inputs = inputs.to(device)
#         labels = labels.to(device)
#         _ = model(inputs)

In [11]:
def calibrate_model(model, loader, device=torch.device("cpu:0")):
    
    model.to(device)
    model.eval()

    for inputs in tqdm(loader):

      inputs = inputs[0].to(device)
      inputs = inputs.unsqueeze(0) # create a mini-batch as expected by the model
      _ = model(inputs)

In [12]:
model_fp32

SSD(
  (backbone): SSDFeatureExtractorVGG(
    (features): Sequential(
      (0): ConvReLU2d(
        (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (1): Identity()
      (2): ConvReLU2d(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (3): Identity()
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): ConvReLU2d(
        (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (6): Identity()
      (7): ConvReLU2d(
        (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (8): Identity()
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): ConvReLU2d(
        (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1

In [13]:
# Quantization Model Define
quantized_model = Quantizedmodel(model_fp32=model_fp32)

# Quantization Configuration Define
quantized_model.qconfig = torch.quantization.get_default_qconfig('fbgemm')     # 'qnnpack' for NVIDIA
torch.quantization.prepare(quantized_model, inplace=True)

Quantizedmodel(
  (quant): QuantStub(
    (activation_post_process): HistogramObserver()
  )
  (dequant): DeQuantStub()
  (model_fp32): SSD(
    (backbone): SSDFeatureExtractorVGG(
      (features): Sequential(
        (0): ConvReLU2d(
          (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (1): ReLU(inplace=True)
          (activation_post_process): HistogramObserver()
        )
        (1): Identity()
        (2): ConvReLU2d(
          (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (1): ReLU(inplace=True)
          (activation_post_process): HistogramObserver()
        )
        (3): Identity()
        (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (5): ConvReLU2d(
          (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (1): ReLU(inplace=True)
          (activation_post_process): HistogramObserver()
        )
        (6): Identity()
        

In [14]:
cuda_device = torch.device("cuda:0")
cpu_device = torch.device("cpu:0")

In [15]:
# Calibration
calibrate_model(model=quantized_model, loader=test_loader, device=cpu_device)
quantized_model = quantized_model.to(cpu_device)

100%|██████████| 20/20 [00:04<00:00,  4.77it/s]


In [16]:
# Quantization Completed
quantized_model_int8 = torch.quantization.convert(quantized_model, inplace=True)
quantized_model_int8.eval()
print(quantized_model_int8)

Quantizedmodel(
  (quant): Quantize(scale=tensor([1.]), zero_point=tensor([0]), dtype=torch.quint8)
  (dequant): DeQuantize()
  (model_fp32): SSD(
    (backbone): SSDFeatureExtractorVGG(
      (features): Sequential(
        (0): QuantizedConvReLU2d(3, 64, kernel_size=(3, 3), stride=(1, 1), scale=18.992958068847656, zero_point=0, padding=(1, 1))
        (1): Identity()
        (2): QuantizedConvReLU2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=132.3282012939453, zero_point=0, padding=(1, 1))
        (3): Identity()
        (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (5): QuantizedConvReLU2d(64, 128, kernel_size=(3, 3), stride=(1, 1), scale=127.04178619384766, zero_point=0, padding=(1, 1))
        (6): Identity()
        (7): QuantizedConvReLU2d(128, 128, kernel_size=(3, 3), stride=(1, 1), scale=109.63899230957031, zero_point=0, padding=(1, 1))
        (8): Identity()
        (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1,

---

In [17]:
print_model_size(model)
print_model_size(quantized_model_int8)

142.59 MB
35.92 MB


---

In [19]:
from data.util import  read_image

model.eval()
quantized_model_int8.eval()

imgs = read_image('misc/demo2.jpg')
imgs = torch.from_numpy(imgs)[None]
imgs.to(cpu_device)

quantized_model_int8(imgs)

NotImplementedError: Could not run 'quantized::conv2d_relu.new' with arguments from the 'CPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'quantized::conv2d_relu.new' is only available for these backends: [Negative, UNKNOWN_TENSOR_TYPE_ID, QuantizedXPU, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, SparseCPU, SparseCUDA, SparseHIP, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, SparseVE, UNKNOWN_TENSOR_TYPE_ID, NestedTensorCUDA, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID, UNKNOWN_TENSOR_TYPE_ID].

QuantizedCPU: registered at ../aten/src/ATen/native/quantized/cpu/qconv.cpp:1423 [kernel]
BackendSelect: fallthrough registered at ../aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:133 [backend fallback]
Named: registered at ../aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at ../aten/src/ATen/ConjugateFallback.cpp:18 [backend fallback]
Negative: registered at ../aten/src/ATen/native/NegateFallback.cpp:18 [backend fallback]
ZeroTensor: registered at ../aten/src/ATen/ZeroTensorFallback.cpp:86 [backend fallback]
ADInplaceOrView: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:64 [backend fallback]
AutogradOther: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:35 [backend fallback]
AutogradCPU: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:39 [backend fallback]
AutogradCUDA: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:47 [backend fallback]
AutogradXLA: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:51 [backend fallback]
AutogradMPS: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:59 [backend fallback]
AutogradXPU: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:43 [backend fallback]
AutogradHPU: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:68 [backend fallback]
AutogradLazy: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:55 [backend fallback]
Tracer: registered at ../torch/csrc/autograd/TraceTypeManual.cpp:295 [backend fallback]
AutocastCPU: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:481 [backend fallback]
Autocast: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:324 [backend fallback]
Batched: registered at ../aten/src/ATen/BatchingRegistrations.cpp:1064 [backend fallback]
VmapMode: fallthrough registered at ../aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
Functionalize: registered at ../aten/src/ATen/FunctionalizeFallbackKernel.cpp:89 [backend fallback]
PythonTLSSnapshot: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:137 [backend fallback]


---

In [21]:
x = [torch.rand(3, 300, 300), torch.rand(3, 500, 400)]
predictions = quantized_model_int8(x)
predictions

[{'boxes': tensor([[  5.5650,   1.2811, 297.4328, 294.3029],
          [ 68.5410,   2.1247, 223.9397,  89.3615],
          [ 27.9329,   7.7210, 121.7162, 144.6143],
          [ 70.2444,   5.3549, 258.7443, 179.7451],
          [103.4997,  34.8510, 220.3155, 276.7411],
          [  9.5837,   1.7827, 295.3752, 296.3570],
          [  8.9762,   1.8686, 154.3551,  92.3048],
          [ 68.7176,  10.4747, 154.2110, 144.5562],
          [  6.8083,   0.0000, 296.3820, 295.5557],
          [  0.9061,  28.9066,  92.6611, 285.9233],
          [107.3213,   6.1443, 181.6549, 158.8449],
          [139.8574,  45.8880, 300.0000, 261.7556],
          [132.3773,  10.6286, 219.7238, 146.5825],
          [  9.2775,  53.1377, 179.9628, 254.6997],
          [ 78.8346,  32.2352, 216.1662, 119.8140],
          [168.7235,   3.3175, 246.2312, 158.1704],
          [  6.8083,   0.0000, 296.3820, 295.5557],
          [ 44.3411,  60.8006, 113.4774, 221.1278],
          [115.8170,  19.1909, 190.4655,  93.6851],
   