In [1]:
import sys
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torchvision.models import resnet18
import numpy as np
import random

from flopco import FlopCo
from musco.pytorch.compressor.config_gen import generate_model_compr_kwargs
from musco.pytorch import Compressor
from musco.pytorch.compressor.rank_estimation.estimator import estimate_rank_for_compression_rate

import copy
import gc
from collections import defaultdict
import argparse
from tqdm.notebook import tqdm

from source.data import get_imagenet_train_val_loaders, get_imagenet_test_loader


seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
def accuracy(model, dataset_loader, device='cuda', num_classes=1000):
    def one_hot(x, K):
        return np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    
    # Set BN and Droupout to eval regime
    model.eval()

    total_correct = 0

    for (x, y) in tqdm(dataset_loader):
        x = x.to(device)
        y = one_hot(np.array(y.numpy()), num_classes)
        target_class = np.argmax(y, axis=1)

        with torch.no_grad():
            out = model(x).cpu().detach().numpy()
            predicted_class = np.argmax(out, axis=1)
            total_correct += np.sum(predicted_class == target_class)

    total = len(dataset_loader) * dataset_loader.batch_size
    return total_correct / total


def get_layer_by_name(model, mname):
    '''
    Extract layer using layer name
    '''
    module = model
    mname_list = mname.split('.')
    for mname in mname_list:
        module = module._modules[mname]

    return module


def batchnorm_callibration(model, train_loader, n_batches = 200000//256, 
                           layer_name = None, device="cuda:0"):
    '''
    Update batchnorm statistics for layers after layer_name
    Parameters:
    model                   -   Pytorch model
    train_loader            -   Training dataset dataloader, Pytorch Dataloader
    n_callibration_batches  -   Number of batchnorm callibration iterations, int
    layer_name              -   Name of layer after which to update BN statistics, string or None
                                (if None updates statistics for all BN layers)
    device                  -   Device to store the model, string
    '''
    
    # switch batchnorms into the mode, in which its statistics are updated
    model.to(device).eval() 
    layer_passed = False
    
    if layer_name is not None:
        #freeze batchnorms before replaced layer
        for lname, l in model.named_modules():

            if lname == layer_name:
                layer_passed = True
            
            if (isinstance(l, nn.BatchNorm2d)) and layer_passed:
                if layer_passed:
                    l.train()
                else:
                    l.eval()

    with torch.no_grad():            

        for i, (data, _) in enumerate(train_loader):
            _ = model(data.to(device))

            if i > n_batches:
                break
            
        del data
        torch.cuda.empty_cache()
        
    model.eval()
    return model

In [4]:
def find_best_rank_for_layer(model, lname, decomposition, train_loader, val_loader, 
                             eval_func, bn_cal_func, bn_cal_n_iters, score_eps, 
                             max_rank, min_rank=3, grid_step=1, nx=1, device='cuda'):
    '''
    Find minimal decomposition rank for given acceptable target metric drop (uses binary search)
    Parameters:
    model           -   Initial model
    lname           -   Name of layer to find decomposition rank, String
    decomposition   -   Decomposition algorithm name, Options: (cp3, tucker2, svd), String
    score_eps       -   Acceptable target metric drop, float
    train_loader    -   Training dataset dataloader, Pytorch Dataloader
    val_loader      -   Validation dataset dataloader, Pytorch Dataloader
    eval_func       -   Function for model evaluation (returns target metric score,
                        args: temp_model, val_loader, device), Python function
    bn_cal_func     -   Function for batchnorm statistics calibration
                        (args: emp_model, train_loader, lname, bn_cal_n_iters, device), Python function
    bn_cal_n_iters  -   Number of batchnorm callibration iterations, int
    max_rank        -   Upper bound of rank search, int
    min_rank        -   Lower bound of rank search, int
    grid_step       -   Rank search grid step (search for ranks multiple of grid_step)
    nx              -   Minimal compression ratio for layer FLOPs, float
    device          -   Device to store the model
    
    Output:
    best_rank       -   Best rank for compression of given layer, int or None
                        (if layer can not be compressed with given settings)
    '''
    
    if decomposition not in ['cp3', 'tucker2', 'svd', 'cp3-epc']:
        raise ValueError('Wrong decomposition name. Correct options: (cp3, tucker2, svd, cp3-epc)')
    
    curr_rank = max_rank // grid_step if max_rank // grid_step != 0 else 1
    curr_max = max_rank // grid_step if max_rank // grid_step != 0 else 1
    curr_min = min_rank // grid_step if min_rank // grid_step != 0 else 1
    best_rank = None

    n = int(np.log2(curr_max)) + 1
    score_init = eval_func(model.to(device), val_loader, device=device)
    
    init_layer = get_layer_by_name(model, lname)
    ch_ratio = init_layer.in_channels / init_layer.out_channels
    
    if curr_max < curr_min:
        print("Layer can not be compressed with given grid step")

    for i in range(n):
        print("Search iter {}: ranks (min, curr, max): ({}, {}, {})".format(i, curr_min, curr_rank, 
                                                                            curr_max))

        print("-------------------------\n Compression step")
        
        manual_rank = (int(curr_rank * ch_ratio), curr_rank) if decomposition=='tucker2' else curr_rank
        
        model_compr_kwargs = {lname: {'decomposition': decomposition,
                                      'rank_selection': 'manual',
                                      'manual_rank': [manual_rank],
                                      'curr_compr_iter': 0}
                              }
        model_stats = FlopCo(model.to(device), img_size=(1, 3, 224, 224), device=device)

        compressor = Compressor(copy.deepcopy(model.cpu()),
                                model_stats,
                                ft_every=3,
                                nglobal_compress_iters=1,
                                model_compr_kwargs = model_compr_kwargs,
                               )
        compressor.compression_step()

        print("-------------------------\n Calibration step")
        # calibrate batch norm statistics

        compressor.compressed_model.to(device)
        bn_cal_func(compressor.compressed_model, train_loader, layer_name=lname,
                    n_batches=bn_cal_n_iters, device=device)

        print("-------------------------\n Test step")

        # eval model
        score = eval_func(compressor.compressed_model, val_loader, device=device)
        print('Current score: {}'.format(score))

        # clear memory
        del compressor
        gc.collect()
        torch.cuda.empty_cache()

        if score + score_eps < score_init:

            if i == 0:
                print("Bad layer to compress")
                if nx > 1:
                    best_rank = curr_rank
                break
            else:
                curr_min = curr_rank
                curr_rank = curr_rank + (curr_max - curr_rank) // 2
        else:
            best_rank = curr_rank

            curr_max = curr_rank
            curr_rank = curr_rank - (curr_rank - curr_min) // 2

    if best_rank is not None:
        return best_rank * grid_step
    else:
        return best_rank


In [5]:
def estimate_macs(model, layer_name, rank):
    """Returns original and reduced macs based on reduction rank
    original macs = C_i * W_k * H_k * C_o * W_o * H_o
    reduced macs = rank * C_i * W_i * H_i + rank**2 * W_k * H_k * W_o * H_o + rank * C_o * W_o * H_o
    where:
        C_i - number of input channels
        C_o - number of output channels
        W_o - width of the output image
        H_o - height of the output image
        W_i - width of the input image
        H_i - height of the input image
        W_k - width of the kernel
        H_k - height of the kernel
    """
    input_shape = output_shape = (1, 3, 224, 224)
    layer = None
    x = torch.rand(*input_shape)
    model.eval()
    with torch.no_grad():
        for lname, layer in model.named_modules():
            if not (isinstance(layer, nn.Conv2d) 
                    or isinstance(layer, nn.BatchNorm2d) 
                    or isinstance(layer, nn.MaxPool2d) 
                    or isinstance(layer, nn.ReLU)): continue
            input_shape = x.shape
            x = layer(x)
            output_shape = x.shape
            if lname == layer_name: break
                
    if not isinstance(layer, nn.Conv2d):
        raise NotImplementedError('Function estimate_macs works only for Conv2d layers')
        
    orig_macs = layer.in_channels * layer.kernel_size[-1] * layer.kernel_size[-2] \
                * layer.out_channels * output_shape[-1] * output_shape[-2]
    redc_macs = rank * layer.in_channels * input_shape[-1] * input_shape[-2] \
                + rank**2 * layer.kernel_size[-1] * layer.kernel_size[-2] * output_shape[-1] * output_shape[-2] \
                + rank * layer.out_channels * output_shape[-1] * output_shape[-2]
    
    return orig_macs, redc_macs

In [6]:
train_loader, val_loader = get_imagenet_train_val_loaders(data_root='/gpfs/gpfs0/k.sobolev/ILSVRC-12/',
                                       batch_size=500,
                                       num_workers=4,
                                       pin_memory=True,
                                       val_perc=0.04,
                                       shuffle=True,
                                       random_seed=5)

In [7]:
test_loader = get_imagenet_test_loader(data_root='/gpfs/gpfs0/k.sobolev/ILSVRC-12/', 
                                       batch_size=500,
                                       num_workers=4,
                                       pin_memory=True,
                                       shuffle=False)

In [8]:
model = resnet18(pretrained=True).to(device)
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [24]:
%time 
accuracy(model, test_loader, device=device)

CPU times: user 1 µs, sys: 1 µs, total: 2 µs
Wall time: 5.01 µs


  0%|          | 0/100 [00:00<?, ?it/s]

0.6976

In [9]:
model_stats = FlopCo(model, img_size=(1, 3, 224, 224), device=device)
all_lnames = list(model_stats.flops.keys())

In [10]:
# create list of layer names to be compressed
lnames_to_compress = [k for k in all_lnames if model_stats.ltypes[k]['type'] == nn.Conv2d \
                      and k != 'conv1' and 'downsample' not in k]

In [11]:
max_ranks = {}

for lname in lnames_to_compress:
    layer_shape = get_layer_by_name(model, lname).weight.shape
    print('Layer:', lname)
    print('Shape:', layer_shape)
    rank = estimate_rank_for_compression_rate(layer_shape, rate=2,
                                              tensor_format='cp3')
    print('Rank:', rank)
    print()
    max_ranks[lname] = rank
    
saved_ranks = {k: None for k in all_lnames}
min_ranks = {k: 10 for k in max_ranks.keys()}
curr_ranks = copy.deepcopy(max_ranks)

Layer: layer1.0.conv1
Shape: torch.Size([64, 64, 3, 3])
Rank: 134

Layer: layer1.0.conv2
Shape: torch.Size([64, 64, 3, 3])
Rank: 134

Layer: layer1.1.conv1
Shape: torch.Size([64, 64, 3, 3])
Rank: 134

Layer: layer1.1.conv2
Shape: torch.Size([64, 64, 3, 3])
Rank: 134

Layer: layer2.0.conv1
Shape: torch.Size([128, 64, 3, 3])
Rank: 183

Layer: layer2.0.conv2
Shape: torch.Size([128, 128, 3, 3])
Rank: 278

Layer: layer2.1.conv1
Shape: torch.Size([128, 128, 3, 3])
Rank: 278

Layer: layer2.1.conv2
Shape: torch.Size([128, 128, 3, 3])
Rank: 278

Layer: layer3.0.conv1
Shape: torch.Size([256, 128, 3, 3])
Rank: 375

Layer: layer3.0.conv2
Shape: torch.Size([256, 256, 3, 3])
Rank: 566

Layer: layer3.1.conv1
Shape: torch.Size([256, 256, 3, 3])
Rank: 566

Layer: layer3.1.conv2
Shape: torch.Size([256, 256, 3, 3])
Rank: 566

Layer: layer4.0.conv1
Shape: torch.Size([512, 256, 3, 3])
Rank: 759

Layer: layer4.0.conv2
Shape: torch.Size([512, 512, 3, 3])
Rank: 1141

Layer: layer4.1.conv1
Shape: torch.Size([5

In [15]:
%%time
find_best_rank_for_layer(model, 
                         lname='layer1.0.conv1', 
                         decomposition='cp3-epc', 
                         train_loader=train_loader, 
                         val_loader=val_loader, 
                         eval_func=accuracy,
                         bn_cal_func=batchnorm_callibration, 
                         bn_cal_n_iters=1, 
                         score_eps=0.003,
                         max_rank=max_ranks['layer1.0.conv1'], 
                         min_rank=min_ranks['layer1.0.conv1'],
                         grid_step=1, 
                         device=device)

  0%|          | 0/102 [00:00<?, ?it/s]

Search iter 0: ranks (min, curr, max): (10, 134, 134)
-------------------------
 Compression step
layer1.0.conv1 {'decomposition': 'cp3-epc', 'rank_selection': 'manual', 'manual_rank': [134], 'curr_compr_iter': 0}


Use numpy backend
Use numpy backend
Use numpy backend
Use numpy backend
Use numpy backend


-------------------------
 Calibration step
-------------------------
 Test step


  0%|          | 0/102 [00:00<?, ?it/s]

Current score: 0.7946470588235294
Search iter 1: ranks (min, curr, max): (10, 72, 134)
-------------------------
 Compression step
layer1.0.conv1 {'decomposition': 'cp3-epc', 'rank_selection': 'manual', 'manual_rank': [72], 'curr_compr_iter': 0}


Use numpy backend
Use numpy backend
Use numpy backend


-------------------------
 Calibration step


Use numpy backend


-------------------------
 Test step


  0%|          | 0/102 [00:00<?, ?it/s]

Current score: 0.7894705882352941
Search iter 2: ranks (min, curr, max): (72, 103, 134)
-------------------------
 Compression step
layer1.0.conv1 {'decomposition': 'cp3-epc', 'rank_selection': 'manual', 'manual_rank': [103], 'curr_compr_iter': 0}


Use numpy backend
Use numpy backend
Use numpy backend
Use numpy backend
Use numpy backend


-------------------------
 Calibration step
-------------------------
 Test step


  0%|          | 0/102 [00:00<?, ?it/s]

Current score: 0.793078431372549
Search iter 3: ranks (min, curr, max): (72, 88, 103)
-------------------------
 Compression step
layer1.0.conv1 {'decomposition': 'cp3-epc', 'rank_selection': 'manual', 'manual_rank': [88], 'curr_compr_iter': 0}


Use numpy backend
Use numpy backend
Use numpy backend
Use numpy backend


-------------------------
 Calibration step
-------------------------
 Test step


  0%|          | 0/102 [00:00<?, ?it/s]

Current score: 0.792843137254902
Search iter 4: ranks (min, curr, max): (72, 80, 88)
-------------------------
 Compression step
layer1.0.conv1 {'decomposition': 'cp3-epc', 'rank_selection': 'manual', 'manual_rank': [80], 'curr_compr_iter': 0}


Use numpy backend
Use numpy backend
Use numpy backend


-------------------------
 Calibration step


Use numpy backend


-------------------------
 Test step


  0%|          | 0/102 [00:00<?, ?it/s]

Current score: 0.787
Search iter 5: ranks (min, curr, max): (80, 84, 88)
-------------------------
 Compression step
layer1.0.conv1 {'decomposition': 'cp3-epc', 'rank_selection': 'manual', 'manual_rank': [84], 'curr_compr_iter': 0}


Use numpy backend
Use numpy backend
Use numpy backend


-------------------------
 Calibration step


Use numpy backend


-------------------------
 Test step


  0%|          | 0/102 [00:00<?, ?it/s]

Current score: 0.7919607843137255
Search iter 6: ranks (min, curr, max): (80, 82, 84)
-------------------------
 Compression step
layer1.0.conv1 {'decomposition': 'cp3-epc', 'rank_selection': 'manual', 'manual_rank': [82], 'curr_compr_iter': 0}


Use numpy backend
Use numpy backend
Use numpy backend
Use numpy backend


-------------------------
 Calibration step
-------------------------
 Test step


  0%|          | 0/102 [00:00<?, ?it/s]

Current score: 0.7904901960784314
Search iter 7: ranks (min, curr, max): (82, 83, 84)
-------------------------
 Compression step
layer1.0.conv1 {'decomposition': 'cp3-epc', 'rank_selection': 'manual', 'manual_rank': [83], 'curr_compr_iter': 0}


Use numpy backend
Use numpy backend
Use numpy backend
Use numpy backend


-------------------------
 Calibration step
-------------------------
 Test step


  0%|          | 0/102 [00:00<?, ?it/s]

Current score: 0.7907058823529411
CPU times: user 31min 5s, sys: 1min 3s, total: 32min 8s
Wall time: 48min 56s


84

In [64]:
orig_macs, redc_macs = estimate_macs(model, 'layer1.0.conv1', 65)
redc_macs / orig_macs

1.2571885850694444

In [16]:
orig_macs, redc_macs = estimate_macs(model, 'layer1.0.conv1', 84)
redc_macs / orig_macs

2.0143229166666665

In [None]:
%%time
lname = 'layer4.1.conv2'
find_best_rank_for_layer(model, 
                         lname=lname, 
                         decomposition='cp3', 
                         train_loader=train_loader, 
                         val_loader=val_loader, 
                         eval_func=accuracy,
                         bn_cal_func=batchnorm_callibration, 
                         bn_cal_n_iters=1, 
                         score_eps=0.003,
                         max_rank=max_ranks[lname], 
                         min_rank=min_ranks[lname],
                         grid_step=1, 
                         device=device)

  0%|          | 0/102 [00:00<?, ?it/s]

Search iter 0: ranks (min, curr, max): (10, 1141, 1141)
-------------------------
 Compression step
layer4.1.conv2 {'decomposition': 'cp3', 'rank_selection': 'manual', 'manual_rank': [1141], 'curr_compr_iter': 0}


Use numpy backend


In [None]:
orig_macs, redc_macs = estimate_macs(model, 'layer4.1.conv2', 65)
redc_macs / orig_macs