In [84]:
import numpy as np

# 8bit对称量化 映射到[-128,127]

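参考：[LMDeploy KV Cache INT8 量化文档](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/kv_int8.md)

注：这里的“对称”指量化后的整数范围（有符号 int8）大致关于 0 对称；由于仍然保留了非零的 zero_point，按通常的术语它属于仿射（非对称）量化，严格的对称量化通常取 zero_point = 0、scale = max(|x|) / 127。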

$$
\begin{aligned}
quant &= \mathrm{clip}\left(\mathrm{round}\left(\frac{x - zero\_point}{scale}\right), -128, 127\right) \\
dequant &= quant \times scale + zero\_point
\end{aligned}
$$

In [85]:
# Reproducible sample: ten integers drawn from [0, 10) plus the cosine of
# ten uniform draws from [0, 10). The integer draw happens first.
np.random.seed(0)
x = np.random.randint(low=0, high=10, size=(10,)) + np.cos(
    np.random.uniform(low=0, high=10, size=(10,))
)
x

array([4.01378717, 0.84344637, 2.08489566, 3.06421797, 6.73548049,
       9.08727158, 2.29349957, 4.51461278, 1.0268764 , 4.98035689])

In [86]:
# Value range of the sample; the zero point and the scale are derived from it.
np.min(x), np.max(x)

(0.8434463715450559, 9.087271580440916)

In [87]:
# Zero point: midpoint of the value range. It has to be kept, because
# dequantization adds it back.
zero_point = 0.5 * (x.min() + x.max())
zero_point

4.965358975992986

In [88]:
# Scale: the value range (max - min) split into 255 steps. It has to be kept,
# because dequantization multiplies by it.
scale = np.ptp(x) / 255
scale

0.03232872630939553

In [89]:
# Quantize: shift by the zero point, divide by the scale, round to the nearest
# integer, clamp to [-128, 127] and store as int8.
quant = ((x - zero_point) / scale).round().clip(-128, 127).astype(np.int8)
quant

array([ -29, -127,  -89,  -59,   55,  127,  -83,  -14, -122,    0],
      dtype=int8)

In [90]:
# Smallest and largest quantized codes actually produced.
np.min(quant), np.max(quant)

(-127, 127)

In [91]:
# Dequantize: multiply the integer codes by the scale and add the zero point
# back, then show the reconstructed range.
dequant = zero_point + scale * quant
np.min(dequant), np.max(dequant)

(0.8596107346997535, 9.071107217286219)

In [92]:
# Display the reconstructed (dequantized) values.
dequant

array([4.02782591, 0.85961073, 2.08810233, 3.05796412, 6.74343892,
       9.07110722, 2.28207469, 4.51275681, 1.02125437, 4.96535898])

In [93]:
# Display the original values for a side-by-side comparison with dequant.
x

array([4.01378717, 0.84344637, 2.08489566, 3.06421797, 6.73548049,
       9.08727158, 2.29349957, 4.51461278, 1.0268764 , 4.98035689])

In [94]:
# Element-wise check that each dequantized value reproduces the original within
# the combined tolerance atol + rtol * |dequant| (0.01 absolute, 1% relative).
# NOTE(review): the original note here said symmetric quantization appears to
# work better than asymmetric quantization; this check cannot show that, since
# the [0, 255] variant later in the notebook also passes for every element.
np.isclose(x, dequant, rtol=0.01, atol=0.01)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

# 8bit非对称量化 映射到[0,255]

$$
\begin{aligned}
quant &= \mathrm{clip}\left(\mathrm{round}\left(\frac{x - \min(x)}{scale}\right), 0, 255\right) \\
dequant &= quant \times scale + \min(x)
\end{aligned}
$$

In [95]:
# Re-seed and redraw the sample: ten integers from [0, 10) plus the cosine of
# ten uniform draws from [0, 10), integer draw first.
np.random.seed(0)
x = np.random.randint(low=0, high=10, size=(10,)) + np.cos(
    np.random.uniform(low=0, high=10, size=(10,))
)
x

array([4.01378717, 0.84344637, 2.08489566, 3.06421797, 6.73548049,
       9.08727158, 2.29349957, 4.51461278, 1.0268764 , 4.98035689])

In [96]:
# Offset: the minimum of the sample. It has to be kept, because dequantization
# adds it back. Shown together with the maximum.
x_min = np.min(x)
x_min, np.max(x)

(0.8434463715450559, 9.087271580440916)

In [97]:
# Scale: the value range (max - min) split into 255 steps. It has to be kept,
# because dequantization multiplies by it.
scale = (np.max(x) - np.min(x)) / 255
scale

0.03232872630939553

In [98]:
# Quantize: subtract the minimum, divide by the scale, round to the nearest
# integer, clamp to [0, 255] and store as uint8.
quant = ((x - x_min) / scale).round().clip(0, 255).astype(np.uint8)
quant

array([ 98,   0,  38,  69, 182, 255,  45, 114,   6, 128], dtype=uint8)

In [99]:
# Smallest and largest quantized codes actually produced.
np.min(quant), np.max(quant)

(0, 255)

In [100]:
# Dequantize: multiply the integer codes by the scale and add the minimum
# back, then show the reconstructed range.
dequant = x_min + scale * quant
np.min(dequant), np.max(dequant)

(0.8434463715450559, 9.087271580440916)

In [101]:
# Display the reconstructed (dequantized) values.
dequant

array([4.01166155, 0.84344637, 2.07193797, 3.07412849, 6.72727456,
       9.08727158, 2.29823906, 4.52892117, 1.03741873, 4.98152334])

In [102]:
# Display the original values for a side-by-side comparison with dequant.
x

array([4.01378717, 0.84344637, 2.08489566, 3.06421797, 6.73548049,
       9.08727158, 2.29349957, 4.51461278, 1.0268764 , 4.98035689])

In [103]:
# Element-wise check that each dequantized value reproduces the original within
# the combined tolerance atol + rtol * |dequant| (0.01 absolute, 1% relative).
np.isclose(x, dequant, rtol=0.01, atol=0.01)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])