In [1]:
import sys

# Put the parent directory on the import path exactly once, so that re-running
# this cell never appends a duplicate entry.
# NOTE(review): presumably needed so the local `source` package imported below
# resolves - confirm against the repository layout.
if not any(entry == '..' for entry in sys.path):
    sys.path.append('..')

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torchvision.models import resnet18
import numpy as np
import random

from flopco import FlopCo
from musco.pytorch.compressor.config_gen import generate_model_compr_kwargs
from musco.pytorch import Compressor
from musco.pytorch.compressor.rank_estimation.estimator import estimate_rank_for_compression_rate

import copy
import gc
from collections import defaultdict
import argparse
from tqdm.notebook import tqdm

from source.data import get_imagenet_train_val_loaders, get_imagenet_test_loader
from source.eval import accuracy, estimate_macs
from source.utils import get_layer_by_name, bncalibrate_layer

# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch) with the same value.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [6]:
def find_best_rank_for_layer(model, lname, decomposition, train_loader, val_loader,
                             eval_func, bn_cal_func, bn_cal_n_iters, score_eps,
                             max_rank, min_rank=3, grid_step=1, nx=1, device='cuda'):
    '''
    Find minimal decomposition rank for given acceptable target metric drop (uses binary search)

    The search is performed over ranks that are multiples of grid_step. Every
    candidate rank is evaluated by compressing a copy of the model with that
    rank, calibrating batchnorm statistics and scoring the result; a candidate
    is accepted when `score + score_eps >= score_init`, where score_init is
    the score of the uncompressed model.

    Parameters:
    model           -   Initial model
    lname           -   Name of layer to find decomposition rank, String
    decomposition   -   Decomposition algorithm name, Options: (cp3, tucker2, svd, cp3-epc), String
    score_eps       -   Acceptable target metric drop, float
    train_loader    -   Training dataset dataloader, Pytorch Dataloader
    val_loader      -   Validation dataset dataloader, Pytorch Dataloader
    eval_func       -   Function for model evaluation (returns target metric score,
                        called as eval_func(temp_model, val_loader, device=device)), Python function
    bn_cal_func     -   Function for batchnorm statistics calibration
                        (called as bn_cal_func(temp_model, train_loader, layer_name=lname,
                        n_batches=bn_cal_n_iters, device=device)), Python function
    bn_cal_n_iters  -   Number of batchnorm callibration iterations, int
    max_rank        -   Upper bound of rank search, int
    min_rank        -   Lower bound of rank search, int
    grid_step       -   Rank search grid step (search for ranks multiple of grid_step)
    nx              -   Minimal compression ratio for layer FLOPs, float. In this function it is
                        only used as a switch: when nx > 1 and even the largest rank exceeds the
                        acceptable metric drop, the largest rank is still returned
    device          -   Device to store the model

    Output:
    best_rank       -   Best rank for compression of given layer, int or None
                        (if layer can not be compressed with given settings)

    Raises:
    ValueError      -   If decomposition is not one of the supported names

    Side effects:
    Prints progress information. The model is moved with model.to(device) / model.cpu();
    for a torch.nn.Module these calls act in place, so the passed model is left on the CPU
    once at least one candidate rank has been evaluated.
    '''

    if decomposition not in ['cp3', 'tucker2', 'svd', 'cp3-epc']:
        raise ValueError('Wrong decomposition name. Correct options: (cp3, tucker2, svd, cp3-epc)')

    # The search bounds are kept in units of grid_step; a bound that rounds
    # down to zero is clamped to one grid cell.
    curr_max = max_rank // grid_step or 1
    curr_min = min_rank // grid_step or 1
    curr_rank = curr_max
    best_rank = None

    if curr_max < curr_min:
        # Empty search interval: stop before paying for any model evaluation.
        print("Layer can not be compressed with given grid step")
        return None

    # Number of halving steps needed to cover the interval.
    n = int(np.log2(curr_max)) + 1
    score_init = eval_func(model.to(device), val_loader, device=device)

    # Only Tucker-2 needs a pair of ranks (scaled by the channel ratio), so the
    # channel attributes are read only in that case; other decompositions may
    # be applied to layers that do not expose in_channels / out_channels.
    ch_ratio = None
    if decomposition == 'tucker2':
        init_layer = get_layer_by_name(model, lname)
        ch_ratio = init_layer.in_channels / init_layer.out_channels

    for i in range(n):
        # Convert from grid units to the actual rank, so that the rank that is
        # evaluated is the same rank that is eventually returned.
        rank = curr_rank * grid_step

        print("Search iter {}: ranks (min, curr, max): ({}, {}, {})".format(i, curr_min * grid_step, rank,
                                                                            curr_max * grid_step))

        print("-------------------------\n Compression step")

        manual_rank = (int(rank * ch_ratio), rank) if decomposition == 'tucker2' else rank

        model_compr_kwargs = {lname: {'decomposition': decomposition,
                                      'rank_selection': 'manual',
                                      'manual_rank': [manual_rank],
                                      'curr_compr_iter': 0}
                              }
        # Layer statistics are collected for a single 3x224x224 input.
        model_stats = FlopCo(model.to(device), img_size=(1, 3, 224, 224), device=device)

        # Compress a deep copy so that the original weights stay untouched.
        compressor = Compressor(copy.deepcopy(model.cpu()),
                                model_stats,
                                ft_every=3,
                                nglobal_compress_iters=1,
                                model_compr_kwargs=model_compr_kwargs,
                                )
        compressor.compression_step()

        print("-------------------------\n Calibration step")
        # calibrate batch norm statistics

        compressor.compressed_model.to(device)
        bn_cal_func(compressor.compressed_model, train_loader, layer_name=lname,
                    n_batches=bn_cal_n_iters, device=device)

        print("-------------------------\n Test step")

        # eval model
        score = eval_func(compressor.compressed_model, val_loader, device=device)
        print('Current score: {}'.format(score))

        # clear memory
        del compressor
        gc.collect()
        torch.cuda.empty_cache()

        if score + score_eps < score_init:
            # Metric drop is too large for this rank.
            if i == 0:
                # Even the largest allowed rank is not good enough.
                print("Bad layer to compress")
                if nx > 1:
                    best_rank = curr_rank
                break
            # Move towards larger ranks.
            curr_min = curr_rank
            next_rank = curr_rank + (curr_max - curr_rank) // 2
        else:
            # Rank is acceptable: remember it and try a smaller one.
            best_rank = curr_rank
            curr_max = curr_rank
            next_rank = curr_rank - (curr_rank - curr_min) // 2

        if next_rank == curr_rank:
            # The interval has collapsed; further iterations would only
            # re-compress and re-evaluate the very same rank.
            break
        curr_rank = next_rank

    if best_rank is not None:
        return best_rank * grid_step
    else:
        return best_rank

In [7]:
# Build the ImageNet train / validation dataloaders.
# NOTE(review): val_perc presumably is the share of the training set held out
# for validation - confirm in source.data.
train_loader, val_loader = get_imagenet_train_val_loaders(
    data_root='/gpfs/gpfs0/k.sobolev/ILSVRC-12/',
    batch_size=500,
    num_workers=4,
    pin_memory=True,
    val_perc=0.04,
    shuffle=True,
    random_seed=5,
)

In [8]:
# Build the ImageNet test dataloader (no shuffling).
test_loader = get_imagenet_test_loader(
    data_root='/gpfs/gpfs0/k.sobolev/ILSVRC-12/',
    batch_size=500,
    num_workers=4,
    pin_memory=True,
    shuffle=False,
)

In [9]:
# Load the pretrained ResNet-18, place it on the selected device and switch it
# to evaluation mode; the final expression displays the architecture.
model = resnet18(pretrained=True)
model = model.to(device)
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [10]:
# %time 
# accuracy(model, test_loader, device=device)

In [11]:
# Collect per-layer statistics for a single 3x224x224 input; the keys of
# `flops` are used as the list of layer names.
model_stats = FlopCo(model, img_size=(1, 3, 224, 224), device=device)
all_lnames = [layer_name for layer_name in model_stats.flops.keys()]

In [12]:
# Layers to be compressed: every Conv2d layer except the one named 'conv1'
# and those whose name contains 'downsample'.
lnames_to_compress = [
    name for name in all_lnames
    if model_stats.ltypes[name]['type'] == nn.Conv2d
    and not (name == 'conv1' or 'downsample' in name)
]

In [13]:
# Upper bound of the rank search for every layer: the rank returned by
# estimate_rank_for_compression_rate for rate=4 in 'cp3' format.
max_ranks = {}

for lname in lnames_to_compress:
    layer_shape = get_layer_by_name(model, lname).weight.shape
    print('Layer:', lname)
    print('Shape:', layer_shape)

    rank = estimate_rank_for_compression_rate(
        layer_shape,
        rate=4,
        tensor_format='cp3',
    )
    max_ranks[lname] = rank
    print('Rank:', rank)
    print()

# No rank is saved yet for any layer.
saved_ranks = dict.fromkeys(all_lnames)
# Lower bound of the rank search, identical for every compressible layer.
min_ranks = dict.fromkeys(max_ranks, 10)
# Independent copy of the upper bounds.
curr_ranks = copy.deepcopy(max_ranks)

Layer: layer1.0.conv1
Shape: torch.Size([64, 64, 3, 3])
Rank: 67

Layer: layer1.0.conv2
Shape: torch.Size([64, 64, 3, 3])
Rank: 67

Layer: layer1.1.conv1
Shape: torch.Size([64, 64, 3, 3])
Rank: 67

Layer: layer1.1.conv2
Shape: torch.Size([64, 64, 3, 3])
Rank: 67

Layer: layer2.0.conv1
Shape: torch.Size([128, 64, 3, 3])
Rank: 91

Layer: layer2.0.conv2
Shape: torch.Size([128, 128, 3, 3])
Rank: 139

Layer: layer2.1.conv1
Shape: torch.Size([128, 128, 3, 3])
Rank: 139

Layer: layer2.1.conv2
Shape: torch.Size([128, 128, 3, 3])
Rank: 139

Layer: layer3.0.conv1
Shape: torch.Size([256, 128, 3, 3])
Rank: 187

Layer: layer3.0.conv2
Shape: torch.Size([256, 256, 3, 3])
Rank: 283

Layer: layer3.1.conv1
Shape: torch.Size([256, 256, 3, 3])
Rank: 283

Layer: layer3.1.conv2
Shape: torch.Size([256, 256, 3, 3])
Rank: 283

Layer: layer4.0.conv1
Shape: torch.Size([512, 256, 3, 3])
Rank: 379

Layer: layer4.0.conv2
Shape: torch.Size([512, 512, 3, 3])
Rank: 570

Layer: layer4.1.conv1
Shape: torch.Size([512, 51

In [14]:
%%time
lname = 'layer1.0.conv2'
best_rank = find_best_rank_for_layer(model, 
                         lname=lname, 
                         decomposition='cp3', 
                         train_loader=train_loader, 
                         val_loader=val_loader, 
                         eval_func=accuracy,
                         bn_cal_func=bncalibrate_layer, 
                         bn_cal_n_iters=1, 
                         score_eps=0.003,
                         max_rank=max_ranks[lname], 
                         min_rank=min_ranks[lname],
                         grid_step=1, 
                         device=device)
best_rank

100%|██████████| 102/102 [05:13<00:00,  3.07s/it]
Use numpy backend


Search iter 0: ranks (min, curr, max): (10, 67, 67)
-------------------------
 Compression step
layer1.0.conv2 {'decomposition': 'cp3', 'rank_selection': 'manual', 'manual_rank': [67], 'curr_compr_iter': 0}
-------------------------
 Calibration step
-------------------------
 Test step


100%|██████████| 102/102 [01:59<00:00,  1.17s/it]
Use numpy backend


Current score: 0.792235294117647
Search iter 1: ranks (min, curr, max): (10, 39, 67)
-------------------------
 Compression step
layer1.0.conv2 {'decomposition': 'cp3', 'rank_selection': 'manual', 'manual_rank': [39], 'curr_compr_iter': 0}
-------------------------
 Calibration step
-------------------------
 Test step


100%|██████████| 102/102 [01:53<00:00,  1.11s/it]
Use numpy backend


Current score: 0.784921568627451
Search iter 2: ranks (min, curr, max): (39, 53, 67)
-------------------------
 Compression step
layer1.0.conv2 {'decomposition': 'cp3', 'rank_selection': 'manual', 'manual_rank': [53], 'curr_compr_iter': 0}
-------------------------
 Calibration step
-------------------------
 Test step


100%|██████████| 102/102 [01:56<00:00,  1.14s/it]
Use numpy backend


Current score: 0.7883137254901961
Search iter 3: ranks (min, curr, max): (53, 60, 67)
-------------------------
 Compression step
layer1.0.conv2 {'decomposition': 'cp3', 'rank_selection': 'manual', 'manual_rank': [60], 'curr_compr_iter': 0}
-------------------------
 Calibration step
-------------------------
 Test step


100%|██████████| 102/102 [01:51<00:00,  1.09s/it]
Use numpy backend


Current score: 0.7895686274509804
Search iter 4: ranks (min, curr, max): (60, 63, 67)
-------------------------
 Compression step
layer1.0.conv2 {'decomposition': 'cp3', 'rank_selection': 'manual', 'manual_rank': [63], 'curr_compr_iter': 0}
-------------------------
 Calibration step
-------------------------
 Test step


100%|██████████| 102/102 [02:05<00:00,  1.23s/it]
Use numpy backend


Current score: 0.7914313725490196
Search iter 5: ranks (min, curr, max): (60, 62, 63)
-------------------------
 Compression step
layer1.0.conv2 {'decomposition': 'cp3', 'rank_selection': 'manual', 'manual_rank': [62], 'curr_compr_iter': 0}
-------------------------
 Calibration step
-------------------------
 Test step


100%|██████████| 102/102 [01:48<00:00,  1.07s/it]
Use numpy backend


Current score: 0.7916274509803921
Search iter 6: ranks (min, curr, max): (60, 61, 62)
-------------------------
 Compression step
layer1.0.conv2 {'decomposition': 'cp3', 'rank_selection': 'manual', 'manual_rank': [61], 'curr_compr_iter': 0}
-------------------------
 Calibration step
-------------------------
 Test step


100%|██████████| 102/102 [01:54<00:00,  1.12s/it]

Current score: 0.7904705882352941
CPU times: user 4min 2s, sys: 51.8 s, total: 4min 54s
Wall time: 23min 11s





62

In [16]:
# NOTE(review): estimate_macs presumably returns the (original, reduced) MAC
# counts of the layer for the found rank - confirm in source.eval.
orig_macs, redc_macs = estimate_macs(
    model, lname, best_rank, device='cpu'
)
# Ratio of the second returned value to the first.
redc_macs / orig_macs

0.2304144965277778

In [10]:
# %%time
# lname = 'layer4.1.conv2'
# find_best_rank_for_layer(model, 
#                          lname=lname, 
#                          decomposition='cp3', 
#                          train_loader=train_loader, 
#                          val_loader=val_loader, 
#                          eval_func=accuracy,
#                          bn_cal_func=batchnorm_callibration, 
#                          bn_cal_n_iters=1, 
#                          score_eps=0.005,
#                          max_rank=max_ranks[lname], 
#                          min_rank=min_ranks[lname],
#                          grid_step=1, 
#                          device=device)

In [11]:
# orig_macs, redc_macs = estimate_macs(model, 'layer4.1.conv2', 65)
# redc_macs / orig_macs