In [1]:
import sys
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
from flopco import FlopCo

In [3]:
import torch
from torchvision.models import resnet18

from tqdm import tqdm
import numpy as np
import os
import random

from torchvision.models import resnet18, ResNet18_Weights
from torch.quantization import MovingAverageMinMaxObserver,HistogramObserver

from source.data import get_imagenet_test_loader, get_imagenet_train_val_loaders
from source.eval import accuracy
from source.admm import build_cp_layer
from source.utils import bncalibrate_model
from source.models import ResNet18Quant
from source.rank_map import get_rank_map

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
def pass_calibration_data(sim_model, use_cuda):
    batch_size = train_loader.batch_size

    if use_cuda:
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    sim_model.eval()
    samples = 1000

    batch_cntr = 0
    with torch.no_grad():
        for input_data, target_data in train_loader:

            inputs_batch = input_data.to(device)
            sim_model(inputs_batch)

            batch_cntr += 1
            print(batch_cntr * batch_size)
            if (batch_cntr * batch_size) >= samples:
                break

In [5]:
train_loader, val_loader = get_imagenet_train_val_loaders(data_root='/gpfs/gpfs0/k.sobolev/ILSVRC-12/',
                                       batch_size=500,
                                       num_workers=4,
                                       pin_memory=True,
                                       val_perc=0.04,
                                       shuffle=True,
                                       random_seed=seed)

In [6]:
test_loader = get_imagenet_test_loader(data_root='/gpfs/gpfs0/k.sobolev/ILSVRC-12/', 
                                       batch_size=500,
                                       num_workers=4,
                                       pin_memory=True,
                                       shuffle=False)

In [7]:
method = 'admm'
qscheme = 'tensor_affine'
bits = 8
eps = 0.003
decomp = 'cp3'
rank_map = get_rank_map(eps, decomp)

In [8]:
model = resnet18(pretrained=True).to(device)
# model = torch.load('eps0.003_calibrated')
# weights = ResNet18_Weights.verify(ResNet18_Weights.IMAGENET1K_V1)
# model = ResNet18Quant(num_classes=len(weights.meta["categories"]))
# model.load_state_dict(weights.get_state_dict(progress=True))
model.eval()
model = model.to(device)



In [9]:
model_stats = FlopCo(model.to(device), img_size=(1, 3, 224, 224), device=device)
orig_macs = 0
for x in model_stats.macs.values():
    orig_macs += x[0]
orig_macs

1814073344

In [9]:
for module in ['layer1', 'layer2', 'layer3', 'layer4']:
    for layer_path in [f'{module}.0.conv1', f'{module}.0.conv2', 
#                        f'{module}.0.downsample',
                       f'{module}.1.conv1', f'{module}.1.conv2']:
        # there is no layer1.0.downsample layer
        if layer_path == 'layer1.0.downsample': continue
            
        lname, lidx, ltype = layer_path.split('.')
        lidx = int(lidx)
        layer = model.__getattr__(lname)[lidx].__getattr__(ltype)
        kernel_size = layer.kernel_size
        stride = layer.stride
        padding = layer.padding
        cin = layer.in_channels
        cout = layer.out_channels
        rank = rank_map[layer_path]
        bias = layer.bias
        if bias is not None: bias = bias.detach()
        
        print(f'loading {bits}bit_{qscheme}/factors_{method}_seed{seed}/{layer_path}_{method}_random_rank_{rank}')
        A = torch.load(f'../{bits}bit_{qscheme}/factors_{method}_seed{seed}/{layer_path}_{method}_random_rank_{rank}_mode_0.pt').to(device)
        assert A.dtype == torch.float 
        B = torch.load(f'../{bits}bit_{qscheme}/factors_{method}_seed{seed}/{layer_path}_{method}_random_rank_{rank}_mode_1.pt').to(device)
        C = torch.load(f'../{bits}bit_{qscheme}/factors_{method}_seed{seed}/{layer_path}_{method}_random_rank_{rank}_mode_2.pt').to(device)

        model.__getattr__(lname)[lidx].__setattr__(
            ltype, build_cp_layer(rank, [A,B,C], bias, cin, cout, kernel_size, padding, stride).to(device))

loading 8bit_tensor_affine/factors_admm_seed42/layer1.0.conv1_admm_random_rank_64
loading 8bit_tensor_affine/factors_admm_seed42/layer1.0.conv2_admm_random_rank_61
loading 8bit_tensor_affine/factors_admm_seed42/layer1.1.conv1_admm_random_rank_78
loading 8bit_tensor_affine/factors_admm_seed42/layer1.1.conv2_admm_random_rank_73
loading 8bit_tensor_affine/factors_admm_seed42/layer2.0.conv1_admm_random_rank_133
loading 8bit_tensor_affine/factors_admm_seed42/layer2.0.conv2_admm_random_rank_146
loading 8bit_tensor_affine/factors_admm_seed42/layer2.1.conv1_admm_random_rank_173
loading 8bit_tensor_affine/factors_admm_seed42/layer2.1.conv2_admm_random_rank_131
loading 8bit_tensor_affine/factors_admm_seed42/layer3.0.conv1_admm_random_rank_159
loading 8bit_tensor_affine/factors_admm_seed42/layer3.0.conv2_admm_random_rank_248
loading 8bit_tensor_affine/factors_admm_seed42/layer3.1.conv1_admm_random_rank_268
loading 8bit_tensor_affine/factors_admm_seed42/layer3.1.conv2_admm_random_rank_249
loading 

In [11]:
model_stats = FlopCo(model.to(device), img_size=(1, 3, 224, 224), device=device)
redc_macs = 0
for x in model_stats.macs.values():
    redc_macs += x[0]
redc_macs / orig_macs

0.3318769249276836

In [12]:
%time 
accuracy(model, test_loader, device=device)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.48 µs


100%|██████████| 100/100 [01:48<00:00,  1.08s/it]


0.37086

In [12]:
%%time
num_samples = 10000
model = bncalibrate_model(model, train_loader, num_samples=num_samples, device=device)

  1%|          | 21/2459 [01:48<3:30:33,  5.18s/it]

CPU times: user 7.12 s, sys: 2.89 s, total: 10 s
Wall time: 1min 48s





In [13]:
%time 
accuracy(model, test_loader, device=device)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.72 µs


100%|██████████| 100/100 [01:47<00:00,  1.08s/it]


0.6274

In [14]:
torch.save(model, f"e={eps}_d={decomp}_{qscheme.split('_')[-1]}.calibrated_{num_samples}")

# Torch Static Quantization

In [21]:
# model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
model.qconfig = torch.quantization.QConfig(
  activation=HistogramObserver.with_args(reduce_range=True, dtype=torch.quint8, qscheme=torch.per_tensor_affine),
  weight=MovingAverageMinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.qint8)
)
model.qconfig

QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MovingAverageMinMaxObserver'>, qscheme=torch.per_tensor_affine, dtype=torch.qint8){})

In [28]:
modules_to_fuse = [
    ['conv1', 'bn1', 'relu'],
    *([f'layer{i}.{j}.conv1.conv3', f'layer{i}.{j}.bn1', f'layer{i}.{j}.relu1'] for i in (1,2,3,4) for j in (0,1)),
    *([f'layer{i}.{j}.conv2.conv3', f'layer{i}.{j}.bn2'] for i in (1,2,3,4) for j in (0,1)),
    *([f'layer{i}.0.downsample.0', f'layer{i}.0.downsample.1'] for i in (2,3,4))
]
modules_to_fuse

[['conv1', 'bn1', 'relu'],
 ['layer1.0.conv1.conv3', 'layer1.0.bn1', 'layer1.0.relu1'],
 ['layer1.1.conv1.conv3', 'layer1.1.bn1', 'layer1.1.relu1'],
 ['layer2.0.conv1.conv3', 'layer2.0.bn1', 'layer2.0.relu1'],
 ['layer2.1.conv1.conv3', 'layer2.1.bn1', 'layer2.1.relu1'],
 ['layer3.0.conv1.conv3', 'layer3.0.bn1', 'layer3.0.relu1'],
 ['layer3.1.conv1.conv3', 'layer3.1.bn1', 'layer3.1.relu1'],
 ['layer4.0.conv1.conv3', 'layer4.0.bn1', 'layer4.0.relu1'],
 ['layer4.1.conv1.conv3', 'layer4.1.bn1', 'layer4.1.relu1'],
 ['layer1.0.conv2.conv3', 'layer1.0.bn2'],
 ['layer1.1.conv2.conv3', 'layer1.1.bn2'],
 ['layer2.0.conv2.conv3', 'layer2.0.bn2'],
 ['layer2.1.conv2.conv3', 'layer2.1.bn2'],
 ['layer3.0.conv2.conv3', 'layer3.0.bn2'],
 ['layer3.1.conv2.conv3', 'layer3.1.bn2'],
 ['layer4.0.conv2.conv3', 'layer4.0.bn2'],
 ['layer4.1.conv2.conv3', 'layer4.1.bn2'],
 ['layer2.0.downsample.0', 'layer2.0.downsample.1'],
 ['layer3.0.downsample.0', 'layer3.0.downsample.1'],
 ['layer4.0.downsample.0', 'layer4.

In [29]:
model = torch.quantization.fuse_modules(model, modules_to_fuse)

In [30]:
model = torch.quantization.prepare(model)



In [33]:
# model

In [34]:
# can move to gpu for faster quantization calbration
model = model.cuda()

In [37]:
# quantiation calibration on 2000 samples of train dataset
model.eval()
with torch.no_grad():
    for idx, (train_x, _) in tqdm(enumerate(train_loader)):
        _ = model(train_x.cuda())
        if idx * train_loader.batch_size >= 2000: break

4it [00:28,  7.23s/it]


In [38]:
# have to move to cpu for quantization conversion
model = model.cpu()

In [39]:
model = torch.quantization.convert(model)

In [40]:
%time 
accuracy(model, test_loader, device='cpu')

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 9.06 µs


100%|██████████| 100/100 [1:44:08<00:00, 62.49s/it]


0.52988