# Static Quantization

## 1. Import packages

In [1]:
import torch
import resnet
import copy
import torch.nn as nn
import helper
import torchvision
import torchvision.transforms as transforms
import os

# Device used for FP32 evaluation and for calibration. Fall back to the CPU when
# no CUDA device is present so the notebook still runs end-to-end on CPU-only hosts.
cuda_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu:0")
# The converted INT8 model is evaluated on this device later in the notebook.
cpu_device = torch.device("cpu:0")

## 2. Create a model instance

In [2]:
# Build the project-local ResNet-18 with a 10-way classifier head. No pretrained
# weights are requested here; the trained weights come from the checkpoint that is
# loaded in the following cell.
model_fp32 = resnet.resnet18(pretrained=False, num_classes=10)
# Bare last expression: display the module tree.
model_fp32

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (skip_add): FloatFunctional(
        (activation_post_process): Identity()
      )
      (relu2): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
     

In [3]:
# Deserialize onto the CPU so the checkpoint loads even if it was saved from a GPU
# and no CUDA device is available; load_state_dict then copies the values into the
# model's own parameters.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
state_dict = torch.load('resnet18.pt', map_location=cpu_device)
# Last expression: displays the key-matching report returned by load_state_dict.
model_fp32.load_state_dict(state_dict)

<All keys matched successfully>

# CIFAR10 DataSet

In [4]:
# Per-channel statistics passed to Normalize(mean, std).
# NOTE(review): these look like the commonly used CIFAR-10 values — confirm they
# match the statistics used when the checkpoint was trained.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2023, 0.1994, 0.2010)

# Resize to 32x32, convert to a tensor, then normalize each channel.
transform = transforms.Compose(
    [
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
    ]
)

In [5]:
num_workers = 8

# Both splits are read from the local "data" directory (download=False, so the
# dataset must already be present) and share the same preprocessing pipeline.
train_set = torchvision.datasets.CIFAR10(
    root="data", train=True, download=False, transform=transform
)
test_set = torchvision.datasets.CIFAR10(
    root="data", train=False, download=False, transform=transform
)

# Training batches are drawn in random order; test batches in dataset order.
train_sampler = torch.utils.data.RandomSampler(train_set)
test_sampler = torch.utils.data.SequentialSampler(test_set)

# Settings shared by both loaders.
loader_kwargs = dict(batch_size=32, num_workers=num_workers)

train_loader = torch.utils.data.DataLoader(
    dataset=train_set, sampler=train_sampler, **loader_kwargs
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_set, sampler=test_sampler, **loader_kwargs
)

In [6]:
# for a, b in test_loader:
#     print(a.size())
#     print(b.size())

Evaluate the FP32 model with `helper.evaluate_model`; the quantized run later reports `int8_eval_loss` and `int8_eval_accuracy` in the same way.

In [7]:
# Baseline: score the FP32 model on the test split.
# NOTE(review): with criterion=None the recorded loss is 0.0, so presumably
# helper.evaluate_model skips the loss when no criterion is given — confirm.
fp32_eval_loss_tmp, fp32_eval_accuracy_tmp, len_test_dataset = helper.evaluate_model(
    model=model_fp32,
    test_loader=test_loader,
    device=cuda_device,
    criterion=None,
)

# The helper's totals are divided by the dataset size to obtain per-sample values;
# the accuracy total is unwrapped to a Python number with .item() first.
fp32_eval_accuracy_int = fp32_eval_accuracy_tmp.item()
fp32_eval_loss = fp32_eval_loss_tmp / len_test_dataset
fp32_eval_accuracy = fp32_eval_accuracy_int / len_test_dataset

for report_line in (
    'fp32_eval_loss = {}'.format(fp32_eval_loss),
    'fp32_eval_accuracy = {}'.format(fp32_eval_accuracy),
    'len_test_dataset = {}'.format(len_test_dataset),
):
    print(report_line)

100%|██████████| 313/313 [00:01<00:00, 163.62it/s]

fp32_eval_loss = 0.0
fp32_eval_accuracy = 0.9259
len_test_dataset = 10000





## 3. Fusion layer

In [8]:
# Display the unfused module tree for comparison with the fused model shown below.
model_fp32

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (skip_add): FloatFunctional(
        (activation_post_process): Identity()
      )
      (relu2): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
     

In [9]:
# Fuse on an independent copy so `model_fp32` stays available as the reference
# model. eval() returns the module itself, so the copy can be switched to
# inference mode in the same statement.
fused_model_fp32 = copy.deepcopy(model_fp32).eval()
fused_model_fp32

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (skip_add): FloatFunctional(
        (activation_post_process): Identity()
      )
      (relu2): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
     

In [10]:
# Fuse the activations to preceding layers, where applicable.
# This needs to be done manually depending on the model architecture.
# Common fusions include `conv + relu` and `conv + batchnorm + relu`
# Fusion has to be listed by hand because it depends on the architecture.
# Stem: fold conv1 + bn1 + relu into a single module.
fused_model_fp32 = torch.quantization.fuse_modules(fused_model_fp32, [["conv1", "bn1", "relu"]], inplace=True)

# Every top-level child whose name contains "layer" is a stage made of basic blocks.
residual_stages = [
    stage
    for stage_name, stage in fused_model_fp32.named_children()
    if "layer" in stage_name
]

for stage in residual_stages:
    for _, basic_block in stage.named_children():
        # Main branch: conv1 + bn1 + relu, and conv2 + bn2 (the final ReLU comes
        # after the skip addition, so it is not part of this fusion).
        torch.quantization.fuse_modules(basic_block, [["conv1", "bn1", "relu"], ["conv2", "bn2"]], inplace=True)

        # Shortcut branch: only blocks that register a "downsample" child have one.
        # Its children "0" and "1" are fused as a pair — presumably conv + batch
        # norm; verify against resnet.py.
        downsample = dict(basic_block.named_children()).get("downsample")
        if downsample is not None:
            torch.quantization.fuse_modules(downsample, [["0", "1"]], inplace=True)

fused_model_fp32

ResNet(
  (conv1): ConvReLU2d(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
  )
  (bn1): Identity()
  (relu): Identity()
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): ConvReLU2d(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (bn1): Identity()
      (relu): Identity()
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn2): Identity()
      (skip_add): FloatFunctional(
        (activation_post_process): Identity()
      )
      (relu2): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): ConvReLU2d(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (bn1): Identity()
      (relu): Identity()
      (conv2): Conv2d(64, 64, kernel_size=(3

## fused model equivalence

In [11]:
# Both models must be in inference mode before their outputs are compared.
for candidate in (model_fp32, fused_model_fp32):
    candidate.eval()

# Compare the two models on the CPU over 100 test inputs of shape (1, 3, 32, 32),
# using the tolerances below.
fusion_preserves_outputs = helper.model_equivalence(
    model_1=model_fp32,
    model_2=fused_model_fp32,
    device=cpu_device,
    rtol=1e-05,
    atol=1e-05,
    num_tests=100,
    input_size=(1,3,32,32),
)
assert fusion_preserves_outputs, "Fused model is not equivalent to the original model!"

## 4. Static Quantization

In [None]:
class QuantizedResNet18(nn.Module):
    """Wrap a float model between a quantize stub and a dequantize stub.

    The stubs mark where tensors enter and leave the quantized region, so the
    wrapped model can be prepared, calibrated and converted as one unit.
    """

    def __init__(self, model_fp32):
        """Store ``model_fp32`` (by reference, not copied) between the two stubs."""
        super().__init__()

        # Sub-module names and registration order are kept stable because they
        # appear in the printed module tree and in state_dict keys.
        self.quant = torch.quantization.QuantStub()
        self.model_fp32 = model_fp32
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x):
        """Apply the quant stub, the wrapped model and the dequant stub in turn."""
        stub_input = self.quant(x)
        model_output = self.model_fp32(stub_input)
        return self.dequant(model_output)

In [None]:
def calibrate_model(model, loader, device=torch.device("cpu:0")):
    """Run forward passes over ``loader`` without tracking gradients.

    In this notebook it is called on a prepared model so that the inserted
    observers can record activation statistics before conversion.

    Args:
        model: Module to run; it is moved to ``device`` and switched to eval
            mode in place.
        loader: Iterable yielding ``(inputs, labels)`` pairs. Only ``inputs``
            is used; the labels are ignored.
        device: Device the forward passes run on (defaults to the CPU).

    Returns:
        None. The function is called for its side effects on ``model``.
    """
    model.to(device)
    model.eval()

    # Calibration only needs forward passes, so autograd bookkeeping is disabled
    # to save memory and time. Labels are never read, so they are not transferred.
    with torch.no_grad():
        for inputs, _ in loader:
            _ = model(inputs.to(device))

In [None]:
# Quantization Model Define
quantized_model = QuantizedResNet18(model_fp32=fused_model_fp32)

# Quantization Configuration Define
quantized_model.qconfig = torch.quantization.get_default_qconfig('fbgemm')     # 'qnnpack' for NVIDIA
torch.quantization.prepare(quantized_model, inplace=True)

# Calibration
calibrate_model(model=quantized_model, loader=test_loader, device=cuda_device)
quantized_model = quantized_model.to(cpu_device)

# Quantization Completed
quantized_model_int8 = torch.quantization.convert(quantized_model, inplace=True)
quantized_model_int8.eval()
print(quantized_model_int8)

QuantizedResNet18(
  (quant): Quantize(scale=tensor([0.0408]), zero_point=tensor([60]), dtype=torch.quint8)
  (model_fp32): ResNet(
    (conv1): QuantizedConvReLU2d(3, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.005240552127361298, zero_point=0, padding=(1, 1))
    (bn1): Identity()
    (relu): Identity()
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): QuantizedConvReLU2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.0032231544610112906, zero_point=0, padding=(1, 1))
        (bn1): Identity()
        (relu): Identity()
        (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.007309970445930958, zero_point=79, padding=(1, 1))
        (bn2): Identity()
        (skip_add): QFunctional(
          scale=0.009357315488159657, zero_point=56
          (activation_post_process): Identity()
        )
        (relu2): QuantizedReLU(inplace=True)
      )
  

### MINI TEST

In [None]:
# Re-score the FP32 model on the test split as the reference for the INT8 result below.
fp32_eval_loss_tmp, fp32_eval_accuracy_tmp, len_test_dataset = helper.evaluate_model(
    model=model_fp32,
    test_loader=test_loader,
    device=cuda_device,
    criterion=None,
)

# Divide the helper's totals by the dataset size to obtain per-sample values.
fp32_eval_accuracy_int = fp32_eval_accuracy_tmp.item()
fp32_eval_loss = fp32_eval_loss_tmp / len_test_dataset
fp32_eval_accuracy = fp32_eval_accuracy_int / len_test_dataset

for report_line in (
    '\nfp32_eval_loss = {}'.format(fp32_eval_loss),
    'fp32_eval_accuracy = {}'.format(fp32_eval_accuracy),
    'len_test_dataset = {}'.format(len_test_dataset),
):
    print(report_line)

100%|██████████| 313/313 [00:01<00:00, 160.54it/s]


fp32_eval_loss = 0.0
fp32_eval_accuracy = 0.9259
len_test_dataset = 10000





In [None]:
# Score the converted INT8 model on the same test split; it is run on the CPU.
int8_eval_loss_tmp, int8_eval_accuracy_tmp, len_test_dataset = helper.evaluate_model(
    model=quantized_model_int8,
    test_loader=test_loader,
    device=cpu_device,
    criterion=None,
)

# Divide the helper's totals by the dataset size to obtain per-sample values.
int8_eval_accuracy_int = int8_eval_accuracy_tmp.item()
int8_eval_loss = int8_eval_loss_tmp / len_test_dataset
int8_eval_accuracy = int8_eval_accuracy_int / len_test_dataset

for report_line in (
    '\nint8_eval_loss = {}'.format(int8_eval_loss),
    'int8_eval_accuracy = {}'.format(int8_eval_accuracy),
    'len_test_dataset = {}'.format(len_test_dataset),
):
    print(report_line)

100%|██████████| 313/313 [00:03<00:00, 93.40it/s] 


int8_eval_loss = 0.0
int8_eval_accuracy = 0.9244
len_test_dataset = 10000





In [None]:
# Display the converted module tree (quantized modules with their scale / zero_point).
quantized_model_int8

QuantizedResNet18(
  (quant): Quantize(scale=tensor([0.0408]), zero_point=tensor([60]), dtype=torch.quint8)
  (model_fp32): ResNet(
    (conv1): QuantizedConvReLU2d(3, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.005240552127361298, zero_point=0, padding=(1, 1))
    (bn1): Identity()
    (relu): Identity()
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): QuantizedConvReLU2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.0032231544610112906, zero_point=0, padding=(1, 1))
        (bn1): Identity()
        (relu): Identity()
        (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.007309970445930958, zero_point=79, padding=(1, 1))
        (bn2): Identity()
        (skip_add): QFunctional(
          scale=0.009357315488159657, zero_point=56
          (activation_post_process): Identity()
        )
        (relu2): QuantizedReLU(inplace=True)
      )
  

# Test

Performance comparison: non-quantized vs. quantized model

Define the inference input (used for both the non-quantized and the quantized model)

In [None]:
import time
from PIL import Image
from torchvision import transforms

# Force three channels: a grayscale or RGBA file would otherwise break the
# 3-channel Normalize below. convert() returns a fully loaded copy, so the file
# handle can be closed as soon as the `with` block ends.
with Image.open("dog.jpg") as image_file:
    input_image = image_file.convert("RGB")

# Resize the shorter side to 32, crop the central 32x32 patch, then apply the
# same per-channel normalization that is used for the dataset loaders.
preprocess = transforms.Compose([
    transforms.Resize(32),
    transforms.CenterCrop(32),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
])

input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model

Run inference with the non-quantized and the quantized model

In [None]:
# Run both models on the CPU so they see the same input on the same device.
model_fp32.to(cpu_device)
# Make inference mode explicit rather than relying on an earlier cell having set it.
model_fp32.eval()

# Pure inference: disable autograd so no graph is built (otherwise the FP32
# output carries a grad_fn and keeps intermediate buffers alive).
with torch.no_grad():
    output_notQuantized = model_fp32(input_batch)
    output_Quantized = quantized_model_int8(input_batch)

In [None]:
# Row 0 of each output holds the 10 unnormalized class scores (logits) for the
# single image in the batch; the model was built with num_classes=10.
logits_notQuantized = output_notQuantized[0]
logits_Quantized = output_Quantized[0]

print(logits_notQuantized)
print(logits_Quantized)

# Softmax over the class dimension turns the raw scores into probabilities.
probabilities_notQuantized = torch.nn.functional.softmax(logits_notQuantized, dim=0)
probabilities_Quantized = torch.nn.functional.softmax(logits_Quantized, dim=0)

tensor([-0.9417, -1.0729, -0.6361,  1.9892, -0.7011,  4.9936, -0.8382, -0.5704,
        -1.1279, -1.0947], grad_fn=<SelectBackward>)
tensor([-0.9542, -1.0905, -0.6134,  2.0447, -0.6816,  4.9754, -0.8179, -0.5453,
        -1.1587, -1.0905])


In [None]:
# Class names indexed by the model's output position.
categories = ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"]

# Five most probable classes for each model: (probabilities, class indices).
top5_prob_notQuantized, top5_catid_notQuantized = torch.topk(probabilities_notQuantized, 5)
top5_prob_Quantized, top5_catid_Quantized = torch.topk(probabilities_Quantized, 5)

print("not Quantized Model's Perfomance:")
for prob, catid in zip(top5_prob_notQuantized, top5_catid_notQuantized):
    print(categories[catid], prob.item())

print("\nQuantized Model's Perfomance:")
for prob, catid in zip(top5_prob_Quantized, top5_catid_Quantized):
    print(categories[catid], prob.item())

not Quantized Model's Perfomance:
dog 0.9322130680084229
cat 0.046208709478378296
horse 0.0035733398981392384
bird 0.003346235491335392
deer 0.0031356869731098413

Quantized Model's Perfomance:
dog 0.9284108281135559
cat 0.049537982791662216
horse 0.003716521430760622
bird 0.0034716553054749966
deer 0.0032429220154881477


Inference speed comparison: non-quantized vs. quantized model

In [None]:
# Measure single-sample CPU latency for both models with identical settings.
# The helper's result is multiplied by 1000 so the figures print as ms/sample.
# NOTE(review): this assumes the helper returns seconds per sample — confirm in helper.py.
model_fp32_cpu_inference_latency = helper.measure_inference_latency(
    model=model_fp32,
    device=cpu_device,
    input_size=(1,3,32,32),
    num_samples=100,
) * 1000
quantized_model_int8_cpu_inference_latency = helper.measure_inference_latency(
    model=quantized_model_int8,
    device=cpu_device,
    input_size=(1,3,32,32),
    num_samples=100,
) * 1000

print(f"model_fp32_cpu_inference_latency : {model_fp32_cpu_inference_latency}  ms/sample")
print(f"quantized_model_int8_cpu_inference_latency : {quantized_model_int8_cpu_inference_latency}  ms/sample")

model_fp32_cpu_inference_latency : 0.006166379451751709
quantized_model_int8_cpu_inference_latency : 0.002060849666595459


Model size comparison: non-quantized vs. quantized model

In [None]:
def print_model_size(model):
    """Print the size of ``model``'s serialized state_dict in megabytes.

    The state_dict is written to a file inside a private temporary directory,
    so nothing in the working directory is created, overwritten or deleted,
    and the scratch file is removed even if saving or measuring fails.

    Args:
        model: Object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).

    Returns:
        None. The size is printed as ``"<size> MB"`` with two decimals,
        where 1 MB = 1e6 bytes.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        checkpoint_path = os.path.join(scratch_dir, "tmp.pt")
        torch.save(model.state_dict(), checkpoint_path)
        size_bytes = os.path.getsize(checkpoint_path)

    print("%.2f MB" % (size_bytes / 1e6))
    
    
# Compare serialized state_dict sizes: FP32 baseline first, then the INT8 model.
print_model_size(model_fp32)
print_model_size(quantized_model_int8)

44.78 MB
11.31 MB
