# TD10a - Quantization

In [4]:
import torch

# define a random model that takes 4000 input features and outputs 4 features with an intermediate layer of 4000 features
class M(torch.nn.Module):
    """Toy model made of two stacked linear layers, used for the quantization demo.

    Args:
        in_features: size of each input sample (default: 4000).
        hidden_features: output size of the first linear layer (default: 4000).
        out_features: size of each output sample (default: 4).
    """

    def __init__(self, in_features=4000, hidden_features=4000, out_features=4):
        super().__init__()
        # defaults reproduce the 4000 -> 4000 -> 4 architecture of the demo
        self.fc = torch.nn.Linear(in_features, hidden_features)
        self.fc2 = torch.nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Apply ``fc`` then ``fc2`` to ``x``; there is no activation in between."""
        return self.fc2(self.fc(x))

# instantiate the float32 reference model
model_fp32 = M()

# build its dynamically quantized counterpart
model_int8 = torch.ao.quantization.quantize_dynamic(
    model=model_fp32,  # the float32 model to quantize
    qconfig_spec={torch.nn.Linear},  # set of layer types to dynamically quantize
    dtype=torch.qint8,  # target dtype for the quantized weights: 8-bit integers, a fairly aggressive choice
)

In [5]:
# run the model with and without quantization on a batch of four random inputs - let's see the time difference
input_ = torch.randn(4, 4000)
with torch.no_grad():
    %timeit res_32 = model_fp32(input_)
    %timeit res_16 = model_int8(input_)

8.7 ms ± 113 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
500 µs ± 6.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [6]:
# Make sure the results are roughly the same because if it's faster but outputs stupid stuff, it's not really worth it
with torch.no_grad():
    res_fp32 = model_fp32(input_)
    res_int8 = model_int8(input_)  # named after the int8 model that produced it
print(res_int8, "\n", res_fp32)
# one number summarising how far the quantized outputs are from the float32 ones
print("max abs difference:", (res_fp32 - res_int8).abs().max().item())

tensor([[-0.0023,  0.1657, -0.4194, -0.1135],
        [ 0.2793, -0.6509, -0.3346,  0.0237],
        [-0.2458,  0.3721,  0.3461, -0.6264],
        [ 0.3397,  0.0511,  0.3700,  0.0050]]) 
 tensor([[ 0.0053,  0.1750, -0.4322, -0.0996],
        [ 0.2767, -0.6634, -0.3338,  0.0279],
        [-0.2463,  0.3696,  0.3532, -0.6312],
        [ 0.3414,  0.0432,  0.3672,  0.0082]])


In [7]:
# write the state dict of each model to its own file
for path, net in (("model_fp32.pth", model_fp32), ("model_int8.pth", model_int8)):
    torch.save(net.state_dict(), path)

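Now compare the sizes of the two saved files, `model_fp32.pth` and `model_int8.pth`, on your disk: the int8 file should be noticeably smaller, since its linear-layer weights are stored as 8-bit integers instead of 32-bit floats.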