## Accuracy

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  
os.environ["CUDA_VISIBLE_DEVICES"]="3" 

In [2]:
from catalyst.dl.callbacks import Callback, AccuracyCallback
from catalyst.dl.experiments import SupervisedRunner

from collections import OrderedDict, defaultdict

import torch
import numpy as np

In [3]:
loaders = OrderedDict()

In [6]:
import sys
sys.path.append('../')
import dataloaders

import torchvision.datasets as datasets
from torchvision import transforms

# DATA_ROOT = "/workspace/raid/data/datasets"
DATA_ROOT = "/gpfs/gpfs0/e.ponomarev/"
dataset_name = 'imagenet'

bs = 128
num_workers = 32

if dataset_name == 'cifar10':
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    DATA_PATH = "{}/cifar10".format(DATA_ROOT)

    loaders["valid"] = torch.utils.data.DataLoader(
        datasets.CIFAR10(root=DATA_PATH, train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]),  download = True),
        batch_size=bs, shuffle=False,
        num_workers=num_workers, pin_memory=True)
elif dataset_name == 'imagenet':
    loaders["valid"] = dataloaders.get_loader(batch_size=bs,
                                        data_name = 'imagenet',
                                        data_root = DATA_ROOT,
                                        num_workers = num_workers, 
                                        pin_memory = True)['val']

Building imagenet data loader with 32 workers


In [7]:
class MyInferCallback(Callback):
    
    
    def __init__(self, out_dir=None, out_prefix=None):
        self.out_dir = out_dir
        self.out_prefix = out_prefix
        self.predictions = defaultdict(lambda: [])
        self._keys_from_state = ["out_dir", "out_prefix"]

    def on_stage_start(self, state):
        for key in self._keys_from_state:
            value = getattr(state, key, None)
            if value is not None:
                setattr(self, key, value)
        # assert self.out_prefix is not None
        if self.out_dir is not None:
            self.out_prefix = str(self.out_dir) + "/" + str(self.out_prefix)
        if self.out_prefix is not None:
            os.makedirs(os.path.dirname(self.out_prefix), exist_ok=True)

    def on_loader_start(self, state):
        self.predictions = defaultdict(lambda: [])
        self.inputs = defaultdict(lambda: [])

    def on_batch_end(self, state):
        dct = state.output
        dct = {key: value.detach().cpu().numpy() for key, value in dct.items()}
        for key, value in dct.items():
            self.predictions[key].append(value)
            
        dct = state.input
        dct = {key: value.detach().cpu().numpy() for key, value in dct.items()}
        for key, value in dct.items():
            self.inputs[key].append(value)

    def on_loader_end(self, state):
        from catalyst.dl.metrics import accuracy
        self.predictions = {
            key: np.concatenate(value, axis=0)
            for key, value in self.predictions.items()
        }
        self.inputs = {
            key: np.concatenate(value, axis=0)
            for key, value in self.inputs.items()
        }
        y_true = torch.tensor(self.inputs['targets'])
        y_pred = torch.tensor(self.predictions['logits'])
        self.accuracy_score = torch.stack(accuracy(y_pred,y_true,topk=(1, 3, 5))).numpy().astype('float32')
        self.accuracy_score = np.squeeze(self.accuracy_score)

In [8]:
def run(model, device = 'cuda'):

    runner = SupervisedRunner()
    runner.infer(
        model=model.to(device),
        verbose = True,
        loaders=OrderedDict([("infer", loaders["valid"])]),
        callbacks=[
            MyInferCallback(),
            AccuracyCallback(accuracy_args=[1, 3, 5])
        ],
    )
    score = runner.callbacks[0].accuracy_score
    del model
    del runner
    torch.cuda.empty_cache()
    return score

In [9]:
import sys
sys.path.append('../')

from model_utils import load_model
from torchvision.models import resnet34, resnet18

# MODEL_NAME = 'vgg16_imagenet'
# MODEL_NAME = 'resnet50_imagenet'
MODEL_NAME = 'resnet18_imagenet'
# MODEL_NAME = 'resnet34_imagenet'

# MODEL_NAME = 'faster_rcnn_vgg16
# MODEL_NAME = 'faster_rcnn_resnet50'


# # Uncomment if MODEL_NAME = 'faster_rcnn_resnet50'
# sys.path.append('/workspace/home/jgusak/maxvol_objects/facebook_frcnn/')
# import maskrcnn_benchmark

# model = load_model(MODEL_NAME)

model = resnet18(pretrained = True)
model = model.eval()

In [10]:
for p in model.parameters():
#     print(p.requires_grad)
    p.requires_grad = False

In [11]:
scores0 = run(model)

0/1 * Epoch (infer): 100% 391/391 [00:50<00:00, 10.77it/s, _timers/_fps=863.038, accuracy01=42.500, accuracy03=73.750, accuracy05=77.500]  


In [12]:
scores0

array([69.758   , 84.95801 , 89.076004], dtype=float32)

### Get  all  layers

Function  **get_layer_names()** returns names of model layers (convolutional and fully connected) and boolean mask for convolutional layers. 

In [13]:
from model_utils import get_layer_names

layer_names, conv_layer_mask = get_layer_names(model)


fc_layer_mask = (1 - conv_layer_mask).astype(bool)

print(layer_names[conv_layer_mask])
print(layer_names[fc_layer_mask])

['conv1' 'layer1.0.conv1' 'layer1.0.conv2' 'layer1.1.conv1'
 'layer1.1.conv2' 'layer2.0.conv1' 'layer2.0.conv2'
 'layer2.0.downsample.0' 'layer2.1.conv1' 'layer2.1.conv2'
 'layer3.0.conv1' 'layer3.0.conv2' 'layer3.0.downsample.0'
 'layer3.1.conv1' 'layer3.1.conv2' 'layer4.0.conv1' 'layer4.0.conv2'
 'layer4.0.downsample.0' 'layer4.1.conv1' 'layer4.1.conv2']
['fc']


In [14]:
# auxiliary function
import numpy as np
def split_resnet_layers_by_blocks(lnames):
#     starts = ['body.stem.conv1'] + ['body.layer{}'.format(i) for i in range(1,5)]
    starts = ['conv1'] + ['layer{}'.format(i) for i in range(1,5)]


    start_idx = 0
    blocks_idxs = []
    layer_names_by_blocks = []

    for s in starts:
        curr_block =  [l for l in lnames if l.startswith(s)]
        layer_names_by_blocks.append(curr_block)

        blocks_idxs.append(np.arange(start_idx, start_idx+len(curr_block)))
        start_idx += len(curr_block)

    return blocks_idxs

### Compress selected layers

For **convolutional** layers
- Set **decomposition**: 'tucker2', 'cp3' or 'cp4'
- Set  decomposition **ranks** for convolutional layers (namely, ranks we use to decompose convolutional weight tensors). 
  - In Tucker2 case, for one layer 
      - If **rank = None**, the layer won't be decomposed.
      - Elif **rank = 0**, then  VBMF method with **vbmf_weaken_factor**  will be used to select (rank_cout, rank_cin).
      - Elif **rank = (-scalar) < 0**, then values (rank_cout, rank_cin) will be choosen as maximal values which allow **(sacalar x) layer parameter reduction**.
      - Else **rank = tuple** and determines absolute ranks values (rank_cout, rank_cin)
  - In CP case, rank for one layer is a scalar
      - If **rank = None**, the layer won't be decomposed.
      - Elif **rank = (-scalar) < 0** then value for rank will be choosen as maximal rank which allows **(sacalar x) layer parameter reduction**.
      - Else **rank = scalar > 0** and determines absolute rank value.
      
For **fully connected** layers
- Set **decomposition** = 'svd'
- Set decomposition for linear layers (namely, ranks we use to factorize weight matrices)
    - In SVD case, rank for one layer is a scalar
      - If **rank = None**, the layer won't be decomposed.
      - Elif **rank = 0**, then  VBMF method with **vbmf_weaken_factor**  will be used to select rank.
      - Elif **rank = (-scalar) < 0** then value for rank will be choosen as maximal rank which allows **(sacalar x) layer parameter reduction**.
      - Else **rank = scalar > 0** and determines absolute rank value.

In [19]:
from tensor_compression import get_compressed_model
import copy
import numpy as np

# decomposition_conv = 'cp4'
# decomposition_conv = 'cp3'
decomposition_conv = 'tucker2'

decomposition_fc = 'svd'

RANK_SELECTION = 'vbmf'
# RANK_SELECTION = 'nx'
# RANK_SELECTION = 'custom'

if RANK_SELECTION == 'vbmf':
    WEAKEN_FACTOR = 1.
    X_FACTOR = 0
    rank_selection_suffix = "/wf:{}".format(WEAKEN_FACTOR)
elif RANK_SELECTION == 'nx':
    WEAKEN_FACTOR = None  
    X_FACTOR = 10
    rank_selection_suffix = "/{}x".format(X_FACTOR)
    
    
if MODEL_NAME == 'vgg16_imagenet':
    ranks_conv = [None] + [-X_FACTOR]*(len(layer_names[conv_layer_mask])-1)

elif MODEL_NAME == 'resnet50_imagenet':
    ranks_conv = [None if not name.endswith('conv2') else -X_FACTOR
                  for name in layer_names[conv_layer_mask]]

elif MODEL_NAME in ['resnet18_imagenet', 'resnet34_imagenet']:
    ranks_conv = [None if name == 'conv1' or not (name.endswith('conv2') or
                                                  name.endswith('conv1')) else -X_FACTOR
              for name in layer_names[conv_layer_mask]]

elif MODEL_NAME ==  'faster_rcnn_resnet50':
    ranks_conv = [None if not name.endswith('body.conv2') else -X_FACTOR
                  for name in layer_names[conv_layer_mask]]

# ranks_fc = [-X_FACTOR]*(len(layer_names[fc_layer_mask]))
ranks_fc = [None]*(len(layer_names[fc_layer_mask]))

    
    

ranks = np.array([None]*len(layer_names))
ranks[conv_layer_mask] = ranks_conv
ranks[fc_layer_mask] = ranks_fc

decompositions = np.array([None]*len(layer_names))
decompositions[conv_layer_mask] = decomposition_conv
decompositions[fc_layer_mask] = decomposition_fc

CONV_SPLIT = 20
FC_SPLIT = 1
n_layers = len(layer_names)

RESNET_SPLIT = False
if MODEL_NAME in ['resnet50_imagenet', 'resnet34_imagenet', 'resnet18_imagenet',  'faster_rcnn_resnet50'] and RESNET_SPLIT:
    split_tuples = split_resnet_layers_by_blocks(layer_names[conv_layer_mask])[::-1]
else:
    split_tuples = np.array_split(np.arange(n_layers)[conv_layer_mask], CONV_SPLIT)[::-1]
split_tuples.append(np.array_split(np.arange(n_layers)[fc_layer_mask], FC_SPLIT))

In [20]:
for tupl in split_tuples:
    print(layer_names[tupl])

['layer4.1.conv2']
['layer4.1.conv1']
['layer4.0.downsample.0']
['layer4.0.conv2']
['layer4.0.conv1']
['layer3.1.conv2']
['layer3.1.conv1']
['layer3.0.downsample.0']
['layer3.0.conv2']
['layer3.0.conv1']
['layer2.1.conv2']
['layer2.1.conv1']
['layer2.0.downsample.0']
['layer2.0.conv2']
['layer2.0.conv1']
['layer1.1.conv2']
['layer1.1.conv1']
['layer1.0.conv2']
['layer1.0.conv1']
['conv1']
['fc']



Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.



In [21]:
import time

scores = defaultdict(list)

compressed_model = copy.deepcopy(model.cpu())
for tupl in split_tuples:
    lname, rank, decomposition = layer_names[tupl], ranks[tupl], decompositions[tupl]
    print(lname, rank)
    start = time.time()
    compressed_model = get_compressed_model(compressed_model,
                                            ranks=rank,
                                            layer_names=lname,
                                            decompositions = decomposition,
                                            vbmf_weaken_factor=WEAKEN_FACTOR)
    end = time.time()
    print('================ time : {}'.format(end - start))
    score = run(copy.deepcopy(compressed_model))
    print(score)
#     scores[tupl[0]] = score
    scores[layer_names[tupl[0]][0]] = score
    

['layer4.1.conv2'] [0]
Decompose layer layer4.1.conv2
-------------------- SVD time:  10.672295570373535
------------------- VBMF time :  10.674672603607178
-------------------- SVD time:  3.7167229652404785
------------------- VBMF time :  3.719432830810547
	 new rank:  [307, 279]
0/1 * Epoch (infer): 100% 391/391 [01:00<00:00, 11.34it/s, _timers/_fps=2610.072, accuracy01=37.500, accuracy03=71.250, accuracy05=76.250] 
[69.324005 84.86201  88.996   ]
['layer4.1.conv1'] [0]
Decompose layer layer4.1.conv1
-------------------- SVD time:  2.7068915367126465
------------------- VBMF time :  2.708742380142212
-------------------- SVD time:  2.610800266265869
------------------- VBMF time :  2.612825632095337
	 new rank:  [114, 127]
0/1 * Epoch (infer): 100% 391/391 [01:02<00:00,  6.22it/s, _timers/_fps=3996.449, accuracy01=52.500, accuracy03=78.750, accuracy05=85.000] 
[54.640003 73.816    80.204   ]
['layer4.0.downsample.0'] [None]
Skip layer layer4.0.downsample.0
0/1 * Epoch (infer): 100% 

0/1 * Epoch (infer): 100% 391/391 [01:00<00:00,  9.98it/s, _timers/_fps=5071.758, accuracy01=12.500, accuracy03=20.000, accuracy05=22.500] 
[0.76000005 1.9020001  2.9180002 ]
['layer1.0.conv1'] [0]
Decompose layer layer1.0.conv1
-------------------- SVD time:  0.0765542984008789
------------------- VBMF time :  0.07866263389587402
-------------------- SVD time:  0.03251314163208008
------------------- VBMF time :  0.03909182548522949
	 new rank:  [29, 40]
0/1 * Epoch (infer): 100% 391/391 [01:00<00:00, 12.42it/s, _timers/_fps=4706.504, accuracy01=10.000, accuracy03=17.500, accuracy05=23.750] 
[0.804 2.138 3.19 ]
['conv1'] [None]
Skip layer conv1
0/1 * Epoch (infer): 100% 391/391 [01:03<00:00,  6.18it/s, _timers/_fps=15746.786, accuracy01=10.000, accuracy03=17.500, accuracy05=23.750]
[0.804 2.138 3.19 ]
['fc'] [None]



Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.



Skip layer fc
0/1 * Epoch (infer): 100% 391/391 [01:00<00:00,  6.48it/s, _timers/_fps=15148.300, accuracy01=10.000, accuracy03=17.500, accuracy05=23.750]
[0.804 2.138 3.19 ]


In [22]:
scores

defaultdict(list,
            {'l': array([0.804, 2.138, 3.19 ], dtype=float32),
             'c': array([0.804, 2.138, 3.19 ], dtype=float32),
             'fc': array([0.804, 2.138, 3.19 ], dtype=float32)})

# Count parameters

In [28]:
from collections import defaultdict

def count_params(model):
    n_params = 0
    
    for name, param in model.named_parameters():
        n_params += param.numel()
    return n_params

In [29]:
params_count_dict_m = count_params(model)
params_count_dict_cm = count_params(compressed_model)

params_count_dict_m / params_count_dict_cm

7.232299048353415

In [None]:
split_tuples

# Compute FLOPS

In [30]:
# compressed_model = model

In [15]:
import sys
sys.path.append("../")

from flopco import FlopCo

In [32]:
model.to('cpu')
compressed_model.to('cpu')

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [33]:
flopco_m = FlopCo(model)
flopco_cm = FlopCo(compressed_model)

In [34]:
flopco_m.total_flops / flopco_cm.total_flops

3.2223035384295993

In [35]:
flopco_m.flops

defaultdict(list,
            {'conv1': [236027904],
             'bn1': [3211264],
             'relu': [802816],
             'maxpool': [1806336],
             'layer1.0.conv1': [231211008],
             'layer1.0.bn1': [802816],
             'layer1.0.relu': [200704, 200704],
             'layer1.0.conv2': [231211008],
             'layer1.0.bn2': [802816],
             'layer1.1.conv1': [231211008],
             'layer1.1.bn1': [802816],
             'layer1.1.relu': [200704, 200704],
             'layer1.1.conv2': [231211008],
             'layer1.1.bn2': [802816],
             'layer1.2.conv1': [231211008],
             'layer1.2.bn1': [802816],
             'layer1.2.relu': [200704, 200704],
             'layer1.2.conv2': [231211008],
             'layer1.2.bn2': [802816],
             'layer2.0.conv1': [115605504],
             'layer2.0.bn1': [401408],
             'layer2.0.relu': [100352, 100352],
             'layer2.0.conv2': [231211008],
             'layer2.0.bn2': [401

3x3xcxc

params: 9cc
flops: 9cchw

1x1xcxr, 3x3x1xr, 1x1xrxc

params: cr + 9r + cr= r(2c + 9)
flops: crhw + 9rhw + crhw = r(2c + 9)hw

paramx_nx = 9cc / r(2c + 9)
flops_nx = 9cchw / r(2c + 9)hw

# Time

In [36]:
model.to('cuda')
compressed_model.to('cuda')

x = torch.randn(32, 3, 224, 224).cuda()

In [37]:
%timeit y = model(x)

14.7 ms ± 7.41 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [38]:
%timeit y = compressed_model(x)

10.4 ms ± 284 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [39]:
model.to('cpu')
compressed_model.to('cpu')

x = torch.randn(1, 3, 224, 224)

In [40]:
%timeit y = model(x)

4.47 s ± 113 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [41]:
%timeit y = compressed_model(x)

20.2 s ± 1.94 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
