In [1]:
import sys
# Add the parent directory to the import path (once) so that the project-local
# `source` package imported by the next cell can be resolved.
# NOTE(review): this relies on the kernel's working directory being the
# notebook's own folder — confirm when running from elsewhere.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
import torch 
from torchvision.models import resnet18, ResNet18_Weights
from torch.quantization import MovingAverageMinMaxObserver,HistogramObserver

from tqdm import tqdm
import numpy as np

from source.data import get_imagenet_test_loader, get_imagenet_train_val_loaders
from source.models import ResNet18Quant

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def accuracy(model, dataset_loader, device='cuda', num_classes=1000):
    """Compute the top-1 accuracy of ``model`` over all batches of a loader.

    Args:
        model: module called as ``model(x)`` and expected to return per-class
            scores with the class axis at dimension 1. It is switched to eval
            mode here but is NOT moved; it must already live on ``device``.
        dataset_loader: iterable yielding ``(inputs, labels)`` batches, where
            ``labels`` is a tensor of integer class indices.
        device: device the inputs are moved to before the forward pass.
        num_classes: kept only for backward compatibility. Labels are compared
            directly with the arg-max prediction, so it is no longer needed.

    Returns:
        float: number of correct predictions divided by the number of samples
        actually seen.

    Raises:
        ValueError: if the loader yields no samples.
    """
    # Set BatchNorm and Dropout layers to their inference behaviour.
    model.eval()

    total_correct = 0
    total_seen = 0

    with torch.no_grad():
        for x, y in tqdm(dataset_loader):
            scores = model(x.to(device))
            predicted_class = scores.argmax(dim=1).cpu()
            target_class = y.cpu()

            total_correct += int((predicted_class == target_class).sum())
            # Count real samples instead of len(loader) * batch_size: the last
            # batch may be smaller, which would otherwise deflate the result.
            total_seen += target_class.numel()

    if total_seen == 0:
        raise ValueError("dataset_loader yielded no samples; accuracy is undefined")

    return total_correct / total_seen

In [4]:
# Number of samples per batch; passed to the data-loader factory calls below.
batch_size = 100

In [5]:
# Build the ImageNet train / validation loaders with the project helper.
# NOTE(review): `val_perc` is presumably the fraction of the training set held
# out for validation, and `random_seed` fixes that split — confirm in source.data.
# NOTE(review): the dataset root is an absolute, machine-specific path.
train_loader, val_loader = get_imagenet_train_val_loaders(
    data_root='/gpfs/gpfs0/k.sobolev/ILSVRC-12/',
    batch_size=batch_size,
    num_workers=4,
    pin_memory=True,
    val_perc=0.04,
    shuffle=True,
    random_seed=5,
)

In [6]:
# Build the ImageNet evaluation loader; order is kept fixed (shuffle=False).
# NOTE(review): the dataset root is an absolute, machine-specific path.
test_loader = get_imagenet_test_loader(
    data_root='/gpfs/gpfs0/k.sobolev/ILSVRC-12/',
    batch_size=batch_size,
    num_workers=4,
    pin_memory=True,
    shuffle=False,
)

In [4]:
# Float32 reference model: torchvision ResNet18 with its default pretrained
# weights, put in eval mode and moved to the GPU.
model = resnet18(weights=ResNet18_Weights.DEFAULT).eval().cuda()



In [6]:
%%time
# Top-1 accuracy of the current `model` on the test loader, inputs sent to the GPU.
accuracy(model, test_loader, device='cuda', num_classes=1000)

100%|██████████| 100/100 [02:34<00:00,  1.55s/it]

CPU times: user 21.3 s, sys: 8.14 s, total: 29.4 s
Wall time: 2min 34s





0.6976

## Dynamic Quantization

In [30]:
# Move the model to the CPU before quantizing it; the quantized model is
# evaluated with device='cpu' below.
model = model.cpu()

In [31]:
# Dynamic quantization: weights are stored as int8, activations are quantized
# on the fly at inference time.
#
# Only module types that have a dynamically quantized counterpart in PyTorch's
# default dynamic mapping (e.g. nn.Linear and the recurrent layers) are swapped.
# Conv2d, BatchNorm2d, ReLU, MaxPool2d and AdaptiveAvgPool2d have none, so
# listing them was a silent no-op. For ResNet18 this means only the final `fc`
# layer is quantized, which is why accuracy barely moves. Use static
# quantization (next section) to quantize the convolutional backbone.
model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8
)

In [34]:
%%time
# Top-1 accuracy of the dynamically quantized model, evaluated on the CPU.
accuracy(model, test_loader, device='cpu', num_classes=1000)

100%|██████████| 100/100 [21:07<00:00, 12.67s/it]

CPU times: user 40min 15s, sys: 6min 42s, total: 46min 57s
Wall time: 21min 7s





0.6976

## Static Quantization

In [7]:
# Build the quantization-ready ResNet18 variant and load the pretrained
# torchvision ImageNet-1k (V1) weights into it.
weights = ResNet18_Weights.verify(ResNet18_Weights.IMAGENET1K_V1)
category_count = len(weights.meta["categories"])

model = ResNet18Quant(num_classes=category_count)
pretrained_state = weights.get_state_dict(progress=True)
model.load_state_dict(pretrained_state)

# Eval mode; as the cell's last expression this also displays the architecture.
model.eval()

ResNet18Quant(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicQuantBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu1): ReLU(inplace=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (ff): FloatFunctional(
        (activation_post_process): Identity()
      )
    )
    (1): BasicQuantBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=F

In [8]:
# Custom static-quantization config (the stock alternative would be
# torch.quantization.get_default_qconfig('fbgemm')):
#   * activations: histogram observer with reduce_range=True
#   * weights: moving-average min/max observer, per-tensor affine, signed int8
activation_observer = HistogramObserver.with_args(reduce_range=True)
weight_observer = MovingAverageMinMaxObserver.with_args(
    qscheme=torch.per_tensor_affine,
    dtype=torch.qint8,
)
model.qconfig = torch.quantization.QConfig(
    activation=activation_observer,
    weight=weight_observer,
)
model.qconfig

QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MovingAverageMinMaxObserver'>, qscheme=torch.per_tensor_affine, dtype=torch.qint8){})

In [9]:
# Groups of sub-module names (dotted paths) to fuse before quantization.
stage_ids = (1, 2, 3, 4)
block_paths = [f'layer{i}.{j}' for i in stage_ids for j in (0, 1)]

# Network stem: conv + bn + relu.
modules_to_fuse = [['conv1', 'bn1', 'relu']]
# First conv of every residual block: conv + bn + relu.
modules_to_fuse += [[f'{path}.conv1', f'{path}.bn1', f'{path}.relu1'] for path in block_paths]
# Second conv of every residual block: conv + bn only.
modules_to_fuse += [[f'{path}.conv2', f'{path}.bn2'] for path in block_paths]
# Downsample branch, listed for the first block of layers 2-4: conv + bn.
modules_to_fuse += [[f'layer{i}.0.downsample.{k}' for k in (0, 1)] for i in stage_ids[1:]]
modules_to_fuse

[['conv1', 'bn1', 'relu'],
 ['layer1.0.conv1', 'layer1.0.bn1', 'layer1.0.relu1'],
 ['layer1.1.conv1', 'layer1.1.bn1', 'layer1.1.relu1'],
 ['layer2.0.conv1', 'layer2.0.bn1', 'layer2.0.relu1'],
 ['layer2.1.conv1', 'layer2.1.bn1', 'layer2.1.relu1'],
 ['layer3.0.conv1', 'layer3.0.bn1', 'layer3.0.relu1'],
 ['layer3.1.conv1', 'layer3.1.bn1', 'layer3.1.relu1'],
 ['layer4.0.conv1', 'layer4.0.bn1', 'layer4.0.relu1'],
 ['layer4.1.conv1', 'layer4.1.bn1', 'layer4.1.relu1'],
 ['layer1.0.conv2', 'layer1.0.bn2'],
 ['layer1.1.conv2', 'layer1.1.bn2'],
 ['layer2.0.conv2', 'layer2.0.bn2'],
 ['layer2.1.conv2', 'layer2.1.bn2'],
 ['layer3.0.conv2', 'layer3.0.bn2'],
 ['layer3.1.conv2', 'layer3.1.bn2'],
 ['layer4.0.conv2', 'layer4.0.bn2'],
 ['layer4.1.conv2', 'layer4.1.bn2'],
 ['layer2.0.downsample.0', 'layer2.0.downsample.1'],
 ['layer3.0.downsample.0', 'layer3.0.downsample.1'],
 ['layer4.0.downsample.0', 'layer4.0.downsample.1']]

In [10]:
# Fuse each listed group of modules (conv+bn or conv+bn+relu) ahead of quantization.
model = torch.quantization.fuse_modules(model, modules_to_fuse)

In [11]:
# Prepare the model for calibration using the qconfig assigned to it earlier.
model = torch.quantization.prepare(model)



In [12]:
# model

In [13]:
# The prepared model can be moved to the GPU for faster quantization calibration.
model = model.cuda()

In [14]:
# Display the training batch size; the calibration loop below uses it to
# decide how many batches to consume.
train_loader.batch_size

100

In [15]:
# Quantization calibration: run forward passes over about 1000 training samples.
num_calibration_samples = 1000
# Ceiling division: the smallest number of batches covering the sample budget.
num_calibration_batches = -(-num_calibration_samples // train_loader.batch_size)

model.eval()
with torch.no_grad():
    # zip() against range() stops after exactly `num_calibration_batches`
    # batches. The previous `idx * batch_size >= 1000` check ran after the
    # forward pass with a zero-based index, so it consumed one batch too many
    # (11 batches / 1100 samples at batch size 100).
    # Passing `total` lets tqdm render a real progress bar.
    batches = zip(range(num_calibration_batches), train_loader)
    for _step, (train_x, _labels) in tqdm(batches, total=num_calibration_batches):
        model(train_x.cuda())

10it [00:13,  1.38s/it]


In [16]:
# The model has to be moved back to the CPU before quantization conversion.
model = model.cpu()

In [17]:
# Convert the calibrated model into its quantized form.
model = torch.quantization.convert(model)

In [18]:
%%time
# Top-1 accuracy of the statically quantized model, evaluated on the CPU.
accuracy(model, test_loader, device='cpu', num_classes=1000)

100%|██████████| 500/500 [15:27<00:00,  1.86s/it]

CPU times: user 49min 45s, sys: 12.5 s, total: 49min 57s
Wall time: 15min 27s





0.69326

## Quantization Aware Training

In [None]:
# Assign the default quantization-aware-training config for the 'fbgemm' backend.
# NOTE(review): when cells run top to bottom, `model` here is the already
# converted quantized model from the static section. QAT presumably needs a
# fresh float ResNet18Quant in train mode — confirm before extending this section.
model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
