## Floating-point model

In [1]:
import torch

class M(torch.nn.Module):
    """Minimal conv -> relu -> conv network bracketed by quantization stubs.

    The stubs mark the points where tensors are converted between floating
    point and quantized representations once the model has been converted to
    a quantized model.
    """

    def __init__(self):
        super().__init__()
        # Entry stub: floating point -> quantized.
        self.quant = torch.quantization.QuantStub()
        self.conv1 = torch.nn.Conv2d(in_channels=1, out_channels=1, kernel_size=1)
        self.relu = torch.nn.ReLU()
        self.conv2 = torch.nn.Conv2d(in_channels=1, out_channels=1, kernel_size=1)
        # Exit stub: quantized -> floating point.
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x):
        """Run ``x`` through quant -> conv1 -> relu -> conv2 -> dequant."""
        # Everything between the two stubs runs on quantized tensors in the
        # quantized model.
        quantized_in = self.quant(x)
        hidden = self.relu(self.conv1(quantized_in))
        quantized_out = self.conv2(hidden)
        return self.dequant(quantized_out)

# Build the float reference model and put it in inference mode; eval() is the
# last expression, so the cell displays the module repr it returns.
model_fp32 = M()
model_fp32.eval()

M(
  (quant): QuantStub()
  (conv1): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
  (relu): ReLU()
  (conv2): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
  (dequant): DeQuantStub()
)

## 量化

"fbgemm"对activation采用HistogramObserver，对weight采用PerChannelMinMaxObserver：

In [2]:
# Attach the default 'fbgemm' qconfig to the float model, then display it to
# see which activation / weight observers it selects.
model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm')
model_fp32.qconfig

QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})

融合conv1 + relu：

In [3]:
# Fuse conv1 + relu into one module; in the fused model the `relu` slot
# becomes an Identity placeholder.
model_fp32_fused = torch.quantization.fuse_modules(model_fp32, [['conv1', 'relu']])
# Insert the observers that will record statistics during calibration.
model_fp32_prepared = torch.quantization.prepare(model_fp32_fused)
model_fp32_prepared



M(
  (quant): QuantStub(
    (activation_post_process): HistogramObserver()
  )
  (conv1): ConvReLU2d(
    (0): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
    (1): ReLU()
    (activation_post_process): HistogramObserver()
  )
  (relu): Identity()
  (conv2): Conv2d(
    1, 1, kernel_size=(1, 1), stride=(1, 1)
    (activation_post_process): HistogramObserver()
  )
  (dequant): DeQuantStub()
)

### 校准模型

In [4]:
# Calibration: feed one random batch through the prepared model so the
# observers can record statistics.
# NOTE(review): no torch.manual_seed() is set anywhere above, so the numbers
# shown in the outputs below will differ on every fresh run.
input_fp32 = torch.randn(1, 1, 2, 3)
model_fp32_prepared(input_fp32)

# Swap the observed modules for their quantized counterparts.
model_int8 = torch.quantization.convert(model_fp32_prepared)
model_int8

  src_bin_begin // dst_bin_width, 0, self.dst_nbins - 1
  src_bin_end // dst_bin_width, 0, self.dst_nbins - 1


M(
  (quant): Quantize(scale=tensor([0.0011]), zero_point=tensor([0]), dtype=torch.quint8)
  (conv1): QuantizedConvReLU2d(1, 1, kernel_size=(1, 1), stride=(1, 1), scale=0.006616627331823111, zero_point=0)
  (relu): Identity()
  (conv2): QuantizedConv2d(1, 1, kernel_size=(1, 1), stride=(1, 1), scale=0.012025130912661552, zero_point=0)
  (dequant): DeQuantize()
)

### 对比浮点和定点模型
输出浮点模型和定点模型结果：

In [5]:
# Run the same float input through the quantized model and the float model.
model_int8(input_fp32), model_fp32(input_fp32)

(tensor([[[[1.2747, 1.3228, 1.3228],
           [1.3228, 1.2747, 1.2747]]]]),
 tensor([[[[1.2804, 1.5274, 1.3441],
           [1.3480, 1.1642, 1.2445]]]], grad_fn=<SlowConv2DBackward0>))

对比两个模型conv1的weight：

In [6]:
# The quantized conv exposes its weight through a method call, while the float
# conv stores it as a Parameter attribute.
print(model_int8.conv1.weight())
print(model_fp32.conv1.weight)

tensor([[[[0.3052]]]], size=(1, 1, 1, 1), dtype=torch.qint8,
       quantization_scheme=torch.per_channel_affine,
       scale=tensor([0.0024], dtype=torch.float64), zero_point=tensor([0]),
       axis=0)
Parameter containing:
tensor([[[[0.3064]]]], requires_grad=True)


model_int8实际运行时其conv1的weight_int为`clamp(round(weight/scale) + zero_point, -128, 127)`（此处zero_point为0）：

In [7]:
# int_repr() shows the raw int8 values stored inside the quantized weight.
print(model_int8.conv1.weight().int_repr())

tensor([[[[127]]]], dtype=torch.int8)


model_int8实际运行时送入conv1的数据为：

In [8]:
# Quantize the float input with the converted `quant` module and show the raw
# uint8 values that conv1 actually receives.
model_int8.quant(input_fp32).int_repr()

tensor([[[[  0, 255, 255],
          [255,   0,   0]]]], dtype=torch.uint8)

qx反量化后为dqx，和input_fp32对比：

In [9]:
# qx: quantized input tensor; dqx: the same values mapped back to float.
qx = model_int8.quant(input_fp32)
dqx = torch.dequantize(qx)
# Show the dequantized input next to the original float input.
dqx, input_fp32

(tensor([[[[0.0000, 0.2748, 0.2748],
           [0.2748, 0.0000, 0.0000]]]]),
 tensor([[[[-0.0087,  1.4414,  0.3655],
           [ 0.3885, -0.6908, -0.2191]]]]))

### 构建一个伪定点模型

In [10]:
# Work on a copy so that replacing weights later does not modify model_fp32.
from copy import deepcopy
fake_qmodel = deepcopy(model_fp32)
fake_qmodel

M(
  (quant): QuantStub()
  (conv1): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
  (relu): ReLU()
  (conv2): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
  (dequant): DeQuantStub()
)

替换conv1.weight为model_int8.conv1.weight的反量化值：

In [11]:
# Dequantize the int8 conv1 weight back to float and install it in the fake
# model, so both models hold the same weight value.
dq_weight = torch.dequantize(model_int8.conv1.weight())
fake_qmodel.conv1.weight = torch.nn.Parameter(dq_weight)
fake_qmodel.conv1.weight

Parameter containing:
tensor([[[[0.3052]]]], requires_grad=True)

对比三个模型conv1的weight：

In [12]:
# Print the conv1 weight of the quantized, float, and fake-quantized models
# one after another for comparison.
print('model_int8:', model_int8.conv1.weight())
print('\nmodel_fp32:', model_fp32.conv1.weight)
print('\nfake_qmodel:', fake_qmodel.conv1.weight)

model_int8: tensor([[[[0.3052]]]], size=(1, 1, 1, 1), dtype=torch.qint8,
       quantization_scheme=torch.per_channel_affine,
       scale=tensor([0.0024], dtype=torch.float64), zero_point=tensor([0]),
       axis=0)

model_fp32: Parameter containing:
tensor([[[[0.3064]]]], requires_grad=True)

fake_qmodel: Parameter containing:
tensor([[[[0.3052]]]], requires_grad=True)


下面对比fake_qmodel.conv1和model_int8.conv1的结果，注意：
1. qx（torch.quint8）和dqx（torch.float）的值是一样的，只是类型不同
2. 由于model_int8的conv1融合了relu，因此我们也对fake_qmodel.conv1输出结果加上F.relu：

In [13]:
from torch.nn import functional as F

# Float path (conv1 on dqx followed by an explicit ReLU) versus the quantized
# fused conv+ReLU applied to qx.
F.relu(fake_qmodel.conv1(dqx)), model_int8.conv1(qx)

(tensor([[[[0.5894, 0.6733, 0.6733],
           [0.6733, 0.5894, 0.5894]]]], grad_fn=<ReluBackward0>),
 tensor([[[[0.5889, 0.6749, 0.6749],
           [0.6749, 0.5889, 0.5889]]]], size=(1, 1, 2, 3), dtype=torch.quint8,
        quantization_scheme=torch.per_tensor_affine, scale=0.006616627331823111,
        zero_point=0))

从结果可以看出，虽然输入（qx、dqx）和conv1.weight的值都一样，但是结果仍然有差异，**这是因为model_int8.conv1计算时，用的是weight.int_repr()和qx.int_repr()做卷积，中间结果用int型保存，只是在输出时除以了浮点数scale**，而fake_qmodel.conv1始终以浮点数计算。

为了验证这个结论，我们可以构造一个全定点模型。

### 全定点计算模型

In [14]:
# Build a model whose conv1 works on raw integers: copy the float model and
# replace the conv1 weight with the int8 representation widened to int32.
full_qmodel = deepcopy(model_fp32)
full_qmodel.conv1.weight = torch.nn.Parameter(model_int8.conv1.weight().int_repr().int(), requires_grad=False)
# Raw integer values of the quantized input, also widened to int32.
int_x = qx.int_repr().int()

full_qmodel.conv1.weight, int_x

(Parameter containing:
 tensor([[[[127]]]], dtype=torch.int32),
 tensor([[[[  0, 255, 255],
           [255,   0,   0]]]], dtype=torch.int32))

为了计算方便，将conv1.bias设为0：

In [15]:
# Replace the conv1 bias with an int32 zero so the integer convolution adds
# nothing; the float bias B is added back later when Y is reconstructed.
full_qmodel.conv1.bias = torch.nn.Parameter(torch.tensor([0], dtype=torch.int), requires_grad=False)
full_qmodel.conv1.bias

Parameter containing:
tensor([0], dtype=torch.int32)

标准卷积计算公式：

$Y = XW + B$

由于将conv1.bias设为了0，那么量化后的卷积公式为：

$Y_q = (\frac{X}{s_x} + z)\cdot \frac{W}{s_w}$

$\quad = \frac{XW}{s_xs_w} + \frac{zW}{s_w}$

$Y \approx s_xs_wY_q - s_xWz + B$

> 实际运行时 $- s_xWz + B$ 可以提前计算出。

In [16]:
# Integer-domain convolution: int32 input values times int32 weight, zero bias.
Y_q = full_qmodel.conv1(int_x)
z = qx.q_zero_point()  # zero point of the quantized input
sw = model_int8.conv1.weight().q_per_channel_scales()  # per-channel weight scales
sx = qx.q_scale()  # scale of the quantized input
w = torch.dequantize(model_int8.conv1.weight())  # dequantized weight
B = model_fp32.conv1.bias  # float bias of the original model

# Reconstruct the float output: Y ~= s_x * s_w * Y_q - s_x * W * z + B.
# NOTE(review): `sw` and `w` are broadcast as-is, which lines up here because
# conv1 is Conv2d(1, 1, 1); a larger kernel or more channels would presumably
# need per-channel reshaping and a sum over the weights - confirm before reuse.
Y_a = Y_q * sw * sx - sx*w*z + B
# ReLU is applied because conv1 in model_int8 is the fused conv+ReLU module.
F.relu(Y_a)

tensor([[[[0.5894, 0.6733, 0.6733],
          [0.6733, 0.5894, 0.5894]]]], dtype=torch.float64,
       grad_fn=<ReluBackward0>)

此时结果和model_int8.conv1(qx)还是有差异：

In [19]:
# Quantized conv1 output shown as a quantized tensor and as raw uint8 values.
model_int8.conv1(qx), model_int8.conv1(qx).int_repr()

(tensor([[[[0.5889, 0.6749, 0.6749],
           [0.6749, 0.5889, 0.5889]]]], size=(1, 1, 2, 3), dtype=torch.quint8,
        quantization_scheme=torch.per_tensor_affine, scale=0.006616627331823111,
        zero_point=0),
 tensor([[[[ 89, 102, 102],
           [102,  89,  89]]]], dtype=torch.uint8))

这是因为model_int8.conv1在输出时又进行了一次量化，我们需要进行一次伪量化：

$\text{round}(\frac{Y}{s_a})s_a = \text{round}(\frac{s_xs_w}{s_a}Y_q - \frac{s_xWz}{s_a} + \frac{B}{s_a}) \cdot s_a$

In [18]:
# Output scale of the quantized conv1 module.
out_scale = model_int8.conv1.scale
# Fake-quantize the reconstructed output: snap it to the output grid by
# dividing by the scale, rounding, and multiplying back.
torch.round(Y_a / out_scale) * out_scale

tensor([[[[0.5889, 0.6749, 0.6749],
          [0.6749, 0.5889, 0.5889]]]], dtype=torch.float64,
       grad_fn=<MulBackward0>)