In [1]:
import torch,torchvision,os,time
import torchvision.transforms as transforms
import numpy as np
from utils.util import get_loader,evaluate
from utils.layer import qConv2d,qLinear
from utils.train import QAVAT_train
import matplotlib.pyplot as plt
import torchvision.models as models             

              # for example model
from mqbench.prepare_by_platform import prepare_by_platform   # add quant nodes for specific Backend
from mqbench.prepare_by_platform import BackendType           # contain various Backend, like TensorRT, NNIE, etc.
from mqbench.utils.state import enable_calibration            # turn on calibration algorithm, determine scale, zero_point, etc.
from mqbench.utils.state import enable_quantization           # turn on actually quantization, like FP32 -> INT8
from mqbench.utils.state import disable_all           # turn on actually quantization, like FP32 -> INT8
from copy import deepcopy

# Ask the CUDA runtime to launch kernels synchronously (easier to localise CUDA errors).
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Pretrained CIFAR-10 ResNet-56 fetched through torch.hub and moved to the GPU.
model = torch.hub.load("chenyaofo/pytorch-cifar-models", "cifar10_resnet56", pretrained=True).cuda()
# model = torch.hub.load("chenyaofo/pytorch-cifar-models", "cifar100_mobilenetv2_x0_5", pretrained=True).cuda()

Using cache found in /home/zihao/.cache/torch/hub/chenyaofo_pytorch-cifar-models_master


In [2]:
# Build the CIFAR-10 train/test loaders (batch size 128 each).
train,test = get_loader('cifar10'.upper(),batch_size=128,test_batch_size=128)
# NOTE(review): assumes get_loader returns torch DataLoader objects — confirm in utils.util.
for loader in (train,test):
    loader.num_workers = 2
    # DataLoader reads `pin_memory`; the former `pin_in_memory` attribute was
    # never consulted, so page-locked host memory was silently not enabled.
    loader.pin_memory = True

Files already downloaded and verified
Files already downloaded and verified


In [3]:
# Keep the first 8 training batches as calibration data for PTQ and MPQ.
calib_data = []
i = 0
for i,(img,label) in enumerate(train,start=1):
    calib_data.append((img,label))
    if i == 8:
        break

In [4]:
# Candidate bit-widths considered for every layer by the mixed-precision search.
MPQ_scheme = (2,4,8)
# Put the model in evaluation mode; as the last expression, its repr is the cell output.
model.eval()

CifarResNet(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias

In [None]:
def getModuleByName(model,moduleName):
    '''
        Return the object reached from `model` by following the dotted
        attribute path `moduleName` (e.g. 'layer1.0.conv1').

        Each path component is resolved with getattr, so an unknown
        component raises AttributeError.
    '''
    current = model
    for attr_name in moduleName.split('.'):
        current = getattr(current,attr_name)
    return current

# Build, calibrate and evaluate one uniformly quantized copy of `model` per
# candidate bit-width. Every prepared model is published as the notebook global
# `mqb_{b}bits_model` (e.g. mqb_4bits_model) because later cells use those names.
backend = BackendType.Academic
for b in MPQ_scheme:
    mqb_fp_model = deepcopy(model)

    # MSE calibration on model parameters
    extra_config = {
        'extra_qconfig_dict': {
            'w_observer': 'MSEObserver',            # weight observer
            'a_observer': 'EMAMSEObserver',         # activation observer
            'w_fakequantize': 'FixedFakeQuantize',  # weight fake-quantize function
            'a_fakequantize': 'FixedFakeQuantize',  # activation fake-quantize function
            'w_qscheme': {
                'bit': b,              # weight bit-width
                'symmetry': True,      # symmetric weight quantization
                'per_channel': False,  # per-tensor weight quantization
                'pot_scale': False,    # scale is not restricted to powers of two
            },
            'a_qscheme': {
                'bit': b,              # activation bit-width
                'symmetry': False,     # asymmetric activation quantization
                'per_channel': False,  # per-tensor activation quantization
                'pot_scale': False,    # scale is not restricted to powers of two
            }
        }
    }
    print(f'Prepare {b}bits model using MQBench')

    quant_model = prepare_by_platform(mqb_fp_model, backend,extra_config).cuda()
    # Bind the dynamic name directly instead of exec()-ing generated source.
    globals()[f'mqb_{b}bits_model'] = quant_model

    # calibration loop: forward passes only, so no autograd graph is needed
    enable_calibration(quant_model)
    with torch.no_grad():
        for img,label in calib_data:
            quant_model(img.cuda())
    disable_all(quant_model)
    # evaluation loop
    enable_quantization(quant_model)
    print('evaluate mqb quantized model')
    evaluate(test,quant_model)

Prepare 2bits model using MQBench
[MQBENCH] INFO: Quantize model Scheme: BackendType.Academic Mode: Eval
[MQBENCH] INFO: Weight Qconfig:
    FakeQuantize: FixedFakeQuantize Params: {}
    Oberver:      MSEObserver Params: Symmetric: True / Bitwidth: 2 / Per channel: False / Pot scale: False / Extra kwargs: {}
[MQBENCH] INFO: Activation Qconfig:
    FakeQuantize: FixedFakeQuantize Params: {}
    Oberver:      EMAMSEObserver Params: Symmetric: False / Bitwidth: 2 / Per channel: False / Pot scale: False / Extra kwargs: {}
[MQBENCH] INFO: Replace module to qat module.
[MQBENCH] INFO: Set layer conv1 to 8 bit.
[MQBENCH] INFO: Set layer fc to 8 bit.
[MQBENCH] INFO: Set x post act quantize to 8 bit.
[MQBENCH] INFO: Insert act quant x_post_act_fake_quantizer
[MQBENCH] INFO: Insert act quant relu_post_act_fake_quantizer
[MQBENCH] INFO: Insert act quant layer1_0_relu_post_act_fake_quantizer
[MQBENCH] INFO: Insert act quant layer1_0_relu_1_post_act_fake_quantizer
[MQBENCH] INFO: Insert act quant 

In [None]:
# Evaluate the 2-, 4- and 8-bit models on the test set; the tuple of results is the cell output.
evaluate(test,mqb_2bits_model),evaluate(test,mqb_4bits_model), evaluate(test,mqb_8bits_model)

In [None]:
# Reference model: a copy of the 8-bit graph with MQBench's disable_all applied.
mqb_fp_model = deepcopy(mqb_8bits_model)
disable_all(mqb_fp_model)
# Working copy whose quantizers perturb() swaps in and out.
mqb_mix_model = deepcopy(mqb_fp_model)

# 1. record all modules we want to consider
types_to_quant = (torch.nn.Conv2d,torch.nn.Linear)

# layer name -> target of the graph node that feeds the layer's first argument
layer_input_map = {}

for node in mqb_8bits_model.graph.nodes:
    # Only call_module nodes have a dotted module path as their target.
    if node.op != 'call_module':
        continue
    try:
        node_target = getModuleByName(mqb_mix_model,node.target)
    except AttributeError:
        # Target cannot be resolved on the mixed model: not a layer we can perturb.
        continue
    if not isinstance(node_target,types_to_quant):
        continue
    # Skip layers whose first argument is not a graph node (nothing to map to).
    if not node.args or not hasattr(node.args[0],'target'):
        continue
    node_args = node.args[0]
    print('input of ',node.target,' is ',node_args)
    layer_input_map[node.target] = str(node_args.target)

In [None]:
# Test-set accuracy of the reference model, kept under its own name for inspection.
# It used to be stored in `ref_metric` and was overwritten by the next line.
ref_acc_metric = ('mean_acc',evaluate(test,mqb_fp_model)['mean_acc'])
# Reference used by perturb_loss: mean loss of the reference model on the calibration batches.
ref_metric = ('mean_loss',evaluate(calib_data,mqb_fp_model)['mean_loss'])

In [None]:
# Display the reference (metric name, value) pair.
ref_metric

In [None]:
def perturb(perturb_scheme):
    '''
    Apply a mixed-precision scheme to the global `mqb_mix_model` in place.

    perturb_scheme: {layer_name: (act_bits, weight_bits)}. Either entry may be
    None to leave that quantizer untouched.

    For each layer:
      * weight_bits selects `mqb_{weight_bits}bits_model`, whose
        `weight_fake_quant` for that layer is installed on the mixed model;
      * act_bits selects `mqb_{act_bits}bits_model`, from which the module
        feeding the layer (named by `layer_input_map[layer_name]`) is installed
        on the mixed model.

    The quantizer modules are shared with the source models, not copied.
    Raises KeyError if no model was prepared for a requested bit-width.
    '''
    for layer_name,(a_bits,w_bits) in perturb_scheme.items():

        if w_bits is not None:
            # replace weight quant to use w_bits quantization
            w_source = globals()[f'mqb_{w_bits}bits_model']
            mix_module = getModuleByName(mqb_mix_model,layer_name)
            tar_module = getModuleByName(w_source,layer_name)
            mix_module.weight_fake_quant = tar_module.weight_fake_quant

        if a_bits is not None:
            # replace input quant to use a_bits quantization
            a_source = globals()[f'mqb_{a_bits}bits_model']
            parent_path,_,leaf = layer_input_map[layer_name].rpartition('.')
            # An empty parent path means the quantizer is a direct attribute of the model.
            mix_parent = getModuleByName(mqb_mix_model,parent_path) if parent_path else mqb_mix_model
            src_parent = getModuleByName(a_source,parent_path) if parent_path else a_source
            setattr(mix_parent,leaf,getattr(src_parent,leaf))


In [None]:
# perturb functionality test: every mapped layer gets 4-bit activations and 8-bit weights
perturb_scheme = dict.fromkeys(layer_input_map,(4,8))
perturb(perturb_scheme)

In [None]:
# Evaluate the perturbed mixed-precision model on the test set.
evaluate(test,mqb_mix_model)

In [None]:
# Rebuild the mixed model from the 8-bit graph, apply disable_all to it, and evaluate that baseline.
mqb_mix_model = deepcopy(mqb_8bits_model)
disable_all(mqb_mix_model)
evaluate(test,mqb_mix_model)

## CLADO

In [None]:
def perturb_loss(perturb_scheme,ref_metric=ref_metric,eval_data=calib_data,printInfo=False):
    '''
    Measure how much a perturbation scheme changes a metric relative to a reference.

    perturb_scheme: passed unchanged to perturb().
    ref_metric:     (metric_name, reference_value); metric_name indexes the dict
                    returned by evaluate().
    eval_data:      data handed to evaluate().
    printInfo:      print the full evaluate() result when True.

    Returns evaluate(...)[metric_name] - reference_value.

    Note: the defaults for `ref_metric` and `eval_data` are captured when this
    cell runs; re-run the cell after changing those globals.
    '''
    global mqb_mix_model
    try:
        with torch.no_grad():
            # perturb layers
            perturb(perturb_scheme)
            # do evaluation
            res = evaluate(eval_data,mqb_mix_model)
    finally:
        # recover layers even when perturbation or evaluation fails, so a
        # half-perturbed model never leaks into the next call
        mqb_mix_model = deepcopy(mqb_fp_model)

    if printInfo:
        print(res)

    return res[ref_metric[0]] - ref_metric[1]

In [None]:
# perturb loss functionality check
# The reference tuple is passed explicitly and the output is labelled with the
# metric actually used; the old label always claimed "accuracy degradation %".
# NOTE(review): ref_metric is computed on calib_data while this loop evaluates on
# `test`, so the difference mixes two datasets — confirm this is intended.
for layer in layer_input_map:
    for a_bits in MPQ_scheme:
        for w_bits in MPQ_scheme:
            p = perturb_loss({layer:(a_bits,w_bits)},ref_metric=ref_metric,eval_data=test,printInfo=True)
            print(f'{layer} (a:{a_bits} bits,w:{w_bits} bits), {ref_metric[0]} change vs reference: {p:.4f}')

## Build Cached Grad if not done before

In [None]:
# Build `cached`: the perturbation loss for every pair of (layer, bit-width) choices,
# keyed by (layer_n, layer_m, bits_n, bits_m) and stored symmetrically.
# NOTE(review): `layers_to_quant` is not defined anywhere in the visible notebook.
# NOTE(review): the visible perturb_loss has no `mix_perturb` parameter, and perturb()
#               unpacks each scheme value as (act_bits, weight_bits); the calls below
#               look like leftovers from an earlier weight-only API — confirm before running.
import time
s_time = time.time()
cached = {}
for n in layers_to_quant:
    for m in layers_to_quant:
        for s1 in MPQ_scheme:
            for s2 in MPQ_scheme:
                # the symmetric entry may already have been filled by an earlier iteration
                if (n,m,s1,s2) not in cached:
                    if n == m:
                        print(f'mix perturb layer {n} to {s1}bits and {s2}bits')
                        p = perturb_loss({n:(s1,s2)},ref_metric,calib_data,mix_perturb=True)
                    else:
                        print(f'perturb layer {n} to {s1}bits and layer {m} to {s2}bits')
                        p = perturb_loss({n:s1,m:s2},ref_metric,calib_data,mix_perturb=False)
                    
                    cached[(n,m,s1,s2)] = cached[(m,n,s2,s1)] = p
                    
print(f'{time.time()-s_time:.2f} seconds elapsed')

In [None]:
# Give every (layer, bit-width) candidate a consecutive index, keyed by '<layer><bits>bits'.
candidates = [(layer,s) for layer in layers_to_quant for s in MPQ_scheme]
layer_index = {}
for cnt,(layer,s) in enumerate(candidates):
    layer_index[layer+f'{s}bits'] = cnt
# total number of candidates
cnt = len(candidates)
L = cnt

In [None]:
import numpy as np
# Dense L x L matrix of cached perturbation losses; rows and columns follow layer_index.
hm = np.zeros(shape=(L,L))
for n in layers_to_quant:
    for s1 in MPQ_scheme:
        row = layer_index[n+f'{s1}bits']
        for m in layers_to_quant:
            for s2 in MPQ_scheme:
                col = layer_index[m+f'{s2}bits']
                hm[row,col] = cached[(n,m,s1,s2)]

In [None]:
# Zero matrix with the same shape and dtype as hm.
cached_grad = np.zeros_like(hm)

In [None]:
import pickle
# Persist the loss matrix and its index mapping so the expensive cache build can be skipped later.
with open('generalw248_c10resnet56_calib','wb') as f:
    pickle.dump({'Ltilde':hm,'layer_index':layer_index},f)

In [None]:
# NOTE(review): perturb_loss forwards its first argument to perturb(), which indexes it as
#               {layer_name: (act_bits, weight_bits)}; a list looks like a leftover from an
#               earlier API — confirm before running.
perturb_loss(['conv1',],ref_metric,eval_data=calib_data)

## Load Cached Grad

In [None]:
import pickle
# Reload the cached {'Ltilde': matrix, 'layer_index': mapping} dict written by the save cell.
# From here on `hm` is that dict, not the bare matrix built in the cache-building section.
# pickle.load executes arbitrary code from the file: only load files you produced yourself.
with open('generalw248_c10resnet56_calib','rb') as f:
    hm = pickle.load(f)

In [None]:
# Invert hm['layer_index']: position i holds the (layer_name, scheme) pair stored at index i.
a = hm['Ltilde']
index2layerscheme = [None] * a.shape[0]

for name,index in hm['layer_index'].items():
    # keys look like '<layer><d>bits'; the last five characters are the scheme, e.g. '4bits'
    layer_name,scheme = name[:-5],name[-5:]
    print(f'index {index} layer {layer_name} scheme {scheme} Ltilde {a[index,index].item():.6f}')
    index2layerscheme[index] = (layer_name,scheme)
    

In [None]:
# Heat map of the cached loss matrix.
plt.imshow(hm['Ltilde'],cmap='hot')

In [None]:
# Turn the cached losses into the pairwise matrix used by lossfunc:
#   same layer, same scheme   -> 2 * Ltilde[i,j]
#   same layer, other scheme  -> 0
#       (alternative kept for reference: 4*Ltilde[i,j] - Ltilde[i,i] - Ltilde[j,j])
#   different layers          -> Ltilde[i,j] - Ltilde[i,i] - Ltilde[j,j]
# An older variant scaled the diagonal by 0.5 and the off-diagonal terms by 0.25.
ltilde = hm['Ltilde']
L = ltilde.shape[0]
cached_grad = np.zeros_like(ltilde)
for i in range(L):
    layer_i,scheme_i = index2layerscheme[i]
    for j in range(L):
        layer_j,scheme_j = index2layerscheme[j]
        if layer_i != layer_j:
            entry = ltilde[i,j] - ltilde[i,i] - ltilde[j,j]
        elif scheme_i == scheme_j:
            entry = 2*ltilde[i,j]
        else:
            entry = 0
        # both mirrored positions are written on every visit, as before
        cached_grad[i,j] = cached_grad[j,i] = entry

In [None]:
# Optional: clip negative entries before plotting.
# cached_grad[cached_grad<0]=0
# Visualise the pairwise matrix.
plt.imshow(cached_grad)

### Define a naive cost function: model size

In [None]:
# layer_size[i] = number of weights of the layer at index i times its bit-width (size in bits).
layer_size = np.array([0] * L)
for l,index in hm['layer_index'].items():
    layer_name,scheme = index2layerscheme[index]
    n_weights = torch.numel(layers_to_quant[layer_name]['fp'].weight)
    # scheme looks like '4bits'; its first character is the bit-width
    layer_size[index] = n_weights * int(scheme[0])

In [None]:
# v holds one free logit per (layer, bit-width) candidate
# alpha is obtained from v with a per-layer softmax (see lossfunc)
# the regulariser is sum(1-|1-2*alpha|**beta); beta is annealed over the iterations (see optimize)

# Convert the NumPy matrix to a torch tensor once, so re-running the cell is harmless.
if not isinstance(cached_grad,torch.Tensor):
    cached_grad = torch.Tensor(cached_grad)

# Per-candidate size in bits as a torch tensor.
layer_size_tensor = torch.Tensor(layer_size)

def lossfunc(v,beta,lambda1,lambda2,printInfo=False,naive=False,b=None):
    '''
    Relaxed mixed-precision objective for the logits `v`.

    v is reshaped to (-1, len(MPQ_scheme)) and soft-maxed per row to get alpha.
    The returned total is the sum of
      * netloss: sum(outer(alpha,alpha) * cached_grad), or with naive=True only
        the diagonal of cached_grad weighted by alpha;
      * regloss: lambda1 * sum(1 - |1 - 2*alpha|**beta);
      * closs:   lambda2 * model_size, or lambda2 * max(model_size - b, 0) when a
        budget b is given. model_size is sum(layer_size_tensor * alpha)/8/1024/1024.

    Reads the globals MPQ_scheme, cached_grad and layer_size_tensor.
    printInfo=True prints the individual terms and alpha.
    '''
    n_schemes = len(MPQ_scheme)
    alpha = torch.softmax(v.reshape(-1,n_schemes),dim=1).reshape(-1,)

    if naive:
        netloss = torch.sum(torch.diagonal(cached_grad) * alpha)
    else:
        netloss = torch.sum(torch.outer(alpha,alpha) * cached_grad)

    model_size = torch.sum(layer_size_tensor * alpha)/8/1024/1024 # model size in MB

    regloss = torch.sum(1-(torch.abs(1-2*alpha))**beta) * lambda1

    # without a budget the size itself is penalised, otherwise only the excess over b
    size_penalty = model_size if b is None else torch.clamp(model_size-b,min=0)
    closs = lambda2 * size_penalty

    totloss = netloss + regloss + closs

    if printInfo:
        print(f'netloss {netloss.item():.4f} regloss {regloss.item():.4f}(beta={beta:.4f}) closs{closs.item():.4f}(model size: {model_size.item():.4f}MB constraint:{b})')
        print('alpha:\n',alpha)

    return totloss
    

In [None]:
def optimize(n_iteration,lr,beta,lambda1,lambda2,b=None,naive=False):
    '''
    Minimise lossfunc over randomly initialised logits with Adam.

    n_iteration: number of Adam steps.
    lr:          Adam learning rate.
    beta:        (start, end); the exponent is interpolated linearly over the steps.
    lambda1, lambda2, b, naive: forwarded to lossfunc.

    Progress is printed on the first step and on every 1000th step.
    Returns the optimised parameter v (length L, read from the global L).
    '''
    v = torch.nn.Parameter(torch.randn(L))
    optimizer = torch.optim.Adam([v,],lr=lr)
    beta_schedule = np.linspace(beta[0],beta[1],n_iteration)

    for step,cur_beta in enumerate(beta_schedule,start=1):
        verbose = step == 1 or step % 1000 == 0
        if verbose:
            print(f'Iter {step}')

        optimizer.zero_grad()
        loss = lossfunc(v,cur_beta,lambda1,lambda2,printInfo=verbose,b=b,naive=naive)
        loss.backward()
        optimizer.step()

    return v

def evaluate_decision(v,printInfo=False,test=test):
    '''
    Evaluate the discrete bit-width assignment encoded by the logits `v`.

    v is reshaped to (-1, len(MPQ_scheme)); for each row (layer) the arg-max
    column is the selected scheme.

    Returns (res, modelsize): the evaluate() result of the mixed model and the
    summed layer_size of the selected candidates divided by 8*1024*1024.

    The selected weights are written into the 'mix' modules for the evaluation
    and the 'fp' weights are restored afterwards, even if evaluation raises.
    `printInfo` is accepted for signature compatibility and is not used.

    NOTE(review): relies on the globals `layers_to_quant` and `torch_mix_model`,
    which are not defined in the visible notebook cells — confirm where they come from.
    '''
    n_schemes = len(MPQ_scheme)
    logits = v.detach().reshape(-1,n_schemes)
    # flat candidate index = layer_row * n_schemes + selected column
    select = (logits.argmax(dim=1) + torch.arange(logits.shape[0]) * n_schemes).numpy()

    modelsize = (layer_size[select]).sum()/8/1024/1024

    decisions = {}
    for scheme_id in select:
        layer,scheme = index2layerscheme[scheme_id]
        decisions[layer] = scheme

    print("evaluate_decision\n",decisions)

    with torch.no_grad():
        try:
            # perturb layers
            for n in decisions:
                layers_to_quant[n]['mix'].weight.data = layers_to_quant[n][decisions[n]].weight.data
            # do evaluation
            res = evaluate(test,torch_mix_model)
        finally:
            # recover layers, so a failed evaluation cannot leave the model perturbed
            for n in decisions:
                layers_to_quant[n]['mix'].weight.data = layers_to_quant[n]['fp'].weight.data
    return res,modelsize

In [None]:
# One optimisation run with the diagonal-only (naive) objective; beta is annealed from 20 down to 2.
v = optimize(n_iteration=5000,lr=2e-3,beta=[20,2],lambda1=1e-6,lambda2=1e-4,naive=True)

In [None]:
# Evaluate the discrete assignment encoded by v; displays (result, model size).
evaluate_decision(v)

## Restrict the candidate bit-widths (e.g. use only 4 and 8 bits)

In [None]:
# Bit-widths to keep; the cells below delete every cached candidate whose bit-width is not listed here.
# With (2,4,8) nothing is deleted; set e.g. (4,8) to actually restrict the search space.
MPQ_scheme = (2,4,8)

In [None]:
# Shape of the pairwise matrix before filtering.
cached_grad.size()

In [None]:
# Indices of the candidates whose bit-width (scheme string without its 'bits' suffix) is not in MPQ_scheme.
del_index = [
    index
    for index in range(len(index2layerscheme))
    if int(index2layerscheme[index][1][:-4]) not in MPQ_scheme
]

In [None]:
# Remove the rows and columns of the excluded candidates from the pairwise matrix,
# and the matching entries from the index table and the size vector.
# This cell overwrites its own inputs: re-running it without rebuilding them deletes again.
cached_grad = np.delete(cached_grad,[del_index],axis=0)
cached_grad = np.delete(cached_grad,[del_index],axis=1)
index2layerscheme = np.delete(index2layerscheme,[del_index],axis=0)
layer_size = np.delete(layer_size,[del_index],axis=0)

In [None]:
# Number of (layer, bit-width) candidates left after filtering.
L = len(layer_size)

In [None]:
# Display the candidate count.
L

In [None]:
# Rebuild the torch views of the (possibly filtered) pairwise matrix and size vector.
if not isinstance(cached_grad,torch.Tensor):
    cached_grad = torch.Tensor(cached_grad)

layer_size_tensor = torch.Tensor(layer_size)

def lossfunc(v,beta,lambda1,lambda2,printInfo=False,naive=False,b=None):
    '''
    Relaxed mixed-precision objective for the logits `v`.
    (A function with this name is also defined in an earlier cell; this
    definition replaces it when the cell runs.)

    v is reshaped to (-1, len(MPQ_scheme)) and soft-maxed per row to get alpha.
    The returned total is the sum of
      * netloss: sum(outer(alpha,alpha) * cached_grad), or with naive=True only
        the diagonal of cached_grad weighted by alpha;
      * regloss: lambda1 * sum(1 - |1 - 2*alpha|**beta);
      * closs:   lambda2 * model_size, or lambda2 * max(model_size - b, 0) when a
        budget b is given. model_size is sum(layer_size_tensor * alpha)/8/1024/1024.

    Reads the globals MPQ_scheme, cached_grad and layer_size_tensor.
    printInfo=True prints the individual terms and alpha.
    '''
    n_schemes = len(MPQ_scheme)
    alpha = torch.softmax(v.reshape(-1,n_schemes),dim=1).reshape(-1,)

    if naive:
        netloss = torch.sum(torch.diagonal(cached_grad) * alpha)
    else:
        netloss = torch.sum(torch.outer(alpha,alpha) * cached_grad)

    model_size = torch.sum(layer_size_tensor * alpha)/8/1024/1024 # model size in MB

    regloss = torch.sum(1-(torch.abs(1-2*alpha))**beta) * lambda1

    # without a budget the size itself is penalised, otherwise only the excess over b
    size_penalty = model_size if b is None else torch.clamp(model_size-b,min=0)
    closs = lambda2 * size_penalty

    totloss = netloss + regloss + closs

    if printInfo:
        print(f'netloss {netloss.item():.4f} regloss {regloss.item():.4f}(beta={beta:.4f}) closs{closs.item():.4f}(model size: {model_size.item():.4f}MB constraint:{b})')
        print('alpha:\n',alpha)

    return totloss

In [None]:
def optimize(n_iteration,lr,beta,lambda1,lambda2,b=None,naive=False):
    '''
    Minimise lossfunc over randomly initialised logits with Adam.
    (A function with this name is also defined in an earlier cell; this
    definition replaces it when the cell runs.)

    n_iteration: number of Adam steps.
    lr:          Adam learning rate.
    beta:        (start, end); the exponent is interpolated linearly over the steps.
    lambda1, lambda2, b, naive: forwarded to lossfunc.

    Progress is printed on the first step and on every 1000th step.
    Returns the optimised parameter v (length L, read from the global L).
    '''
    v = torch.nn.Parameter(torch.randn(L))
    optimizer = torch.optim.Adam([v,],lr=lr)
    beta_schedule = np.linspace(beta[0],beta[1],n_iteration)

    for step,cur_beta in enumerate(beta_schedule,start=1):
        verbose = step == 1 or step % 1000 == 0
        if verbose:
            print(f'Iter {step}')

        optimizer.zero_grad()
        loss = lossfunc(v,cur_beta,lambda1,lambda2,printInfo=verbose,b=b,naive=naive)
        loss.backward()
        optimizer.step()

    return v

def evaluate_decision(v,printInfo=False,test=test):
    '''
    Evaluate the discrete bit-width assignment encoded by the logits `v`.
    (A function with this name is also defined in an earlier cell; this
    definition replaces it when the cell runs.)

    v is reshaped to (-1, len(MPQ_scheme)); for each row (layer) the arg-max
    column is the selected scheme.

    Returns (res, modelsize): the evaluate() result of the mixed model and the
    summed layer_size of the selected candidates divided by 8*1024*1024.

    The selected weights are written into the 'mix' modules for the evaluation
    and the 'fp' weights are restored afterwards, even if evaluation raises.
    `printInfo` is accepted for signature compatibility and is not used.

    NOTE(review): relies on the globals `layers_to_quant` and `torch_mix_model`,
    which are not defined in the visible notebook cells — confirm where they come from.
    '''
    n_schemes = len(MPQ_scheme)
    logits = v.detach().reshape(-1,n_schemes)
    # flat candidate index = layer_row * n_schemes + selected column
    select = (logits.argmax(dim=1) + torch.arange(logits.shape[0]) * n_schemes).numpy()

    modelsize = (layer_size[select]).sum()/8/1024/1024

    decisions = {}
    for scheme_id in select:
        layer,scheme = index2layerscheme[scheme_id]
        decisions[layer] = scheme

    print("evaluate_decision\n",decisions)

    with torch.no_grad():
        try:
            # perturb layers
            for n in decisions:
                layers_to_quant[n]['mix'].weight.data = layers_to_quant[n][decisions[n]].weight.data
            # do evaluation
            res = evaluate(test,torch_mix_model)
        finally:
            # recover layers, so a failed evaluation cannot leave the model perturbed
            for n in decisions:
                layers_to_quant[n]['mix'].weight.data = layers_to_quant[n]['fp'].weight.data
    return res,modelsize

## Random MPQ

In [None]:
# Random baseline: evaluate 500 random bit-width assignments.
# The accumulators are created here so the cell also runs on a fresh kernel;
# previously they were commented out and the loop raised NameError.
random_size = []
random_acc = []
for i in range(500):
    v = torch.randn(L)
    res,size = evaluate_decision(v)
    random_size.append(size)
    random_acc.append(res['mean_acc'])

In [None]:
# Display the collected sizes and accuracies of the random assignments.
random_size,random_acc

In [None]:
# Persist the random baseline; the visualisation section reloads this file.
with open('resnet56_random_baseline.pkl','wb') as f:
    pickle.dump({'size':random_size,'acc':random_acc},f)
    

In [None]:
# Distribution of the model sizes reached by random assignments.
plt.hist(random_size)

## Pareto-Frontier of FeintLady vs Inter-Layer Dependency Unaware Optimization (Naive)

In [None]:
# Hyper-parameter sweep with the full pairwise objective (naive=False).
# Each (n_iter, lambda1, lambda2) setting is optimised `sample_size` times from random starts.
n_iters = (5000,10000)
lambda1s = np.logspace(-6,-3,3)
lambda2s = np.logspace(-3,1,50)
sample_size = 5
results = {}
for n_iter in n_iters:
    for lambda1 in lambda1s:
        for lambda2 in lambda2s:
            trial_name = f'{MPQ_scheme}bits_CLADO_lambda1{lambda1}_lambda2{lambda2}_{n_iter}iters'
            print(trial_name)
            runs = []
            for repeat in range(sample_size):
                v = optimize(n_iteration=n_iter,lr=2e-3,beta=[20,2],lambda1=lambda1,lambda2=lambda2,naive=False)
                # evaluate_decision returns (result, model size)
                runs.append(evaluate_decision(v))
            feint_loss = [run[0] for run in runs]
            feint_size = [run[1] for run in runs]
            results[trial_name] = {'size':feint_size,'perf':feint_loss}

In [None]:
# Display the sweep results collected so far.
results

In [None]:
# Same sweep with the diagonal-only objective (naive=True); results are added to `results`.
n_iters = (5000,10000)
lambda1s = np.logspace(-6,-3,3)
lambda2s = np.logspace(-6,-1,50) #lambda1=1e-3,n=5000,lr=1e-3,beta=[20,2] for resnet20 on cifar10
sample_size = 5

for n_iter in n_iters:
    for lambda1 in lambda1s:
        for lambda2 in lambda2s:
            print('lambda2:',lambda2)
            trial_name = f'{MPQ_scheme}bits_NAIVE_lambda1{lambda1}_lambda2{lambda2}_{n_iter}iters'
            runs = []
            for repeat in range(sample_size):
                v = optimize(n_iteration=n_iter,lr=2e-3,beta=[20,2],lambda1=lambda1,lambda2=lambda2,naive=True)
                # evaluate_decision returns (result, model size)
                runs.append(evaluate_decision(v))
            naive_loss = [run[0] for run in runs]
            naive_size = [run[1] for run in runs]
            results[trial_name] = {'size':naive_size,'perf':naive_loss}

In [None]:
# Persist both sweeps (the CLADO and NAIVE trials share the `results` dict).
with open('general248c10resnet56results.pkl','wb') as f:
    pickle.dump(results,f)

In [None]:
# Load previously saved sweeps for the (4,8) and (2,4,8) search spaces.
# NOTE(review): these are read from 'saved/', while the cell above writes to the working
#               directory — presumably the files are moved by hand; confirm.
# pickle.load executes arbitrary code from the file: only load files you produced yourself.
with open('saved/general48c10resnet56results.pkl','rb') as f:
    c48 = pickle.load(f)
with open('saved/general248c10resnet56results.pkl','rb') as f:
    c248 = pickle.load(f)

In [None]:
def getPF(xs,ys):
    '''
    Pareto front of (cost, performance) points where larger y is better.

    The points are sorted by x, and every y is replaced by the largest y seen
    at that x or any smaller x (a running maximum).

    xs, ys: equally long sequences of numbers; the inputs are not modified.
    Returns (sorted_xs, running_max_ys) as NumPy arrays.
    '''
    xs = np.array(xs)
    ys = np.array(ys)

    order = np.argsort(xs)

    xs = xs[order]
    # np.maximum.accumulate is the running maximum. The former hand-written loop
    # seeded the maximum with -1 and therefore returned -1 for values below -1.
    ys = np.maximum.accumulate(ys[order])

    return xs,ys
# Default figure size for the plots that follow.
plt.rcParams['figure.figsize'] = (12,8)

In [None]:
def _split_trials(trials):
    '''
    Concatenate the sizes and accuracies of all trials, split by method.

    trials: {trial_name: {'size': [...], 'perf': [result dicts with 'mean_acc']}}.
    A trial counts as naive when 'NAIVE' is in its name and as CLADO when 'CLADO' is.
    Returns (naive_size, naive_acc, clado_size, clado_acc) as lists.
    '''
    naive_size,naive_acc = [], []
    clado_size,clado_acc = [], []
    for trial,record in trials.items():
        size = record['size']
        perf = [x['mean_acc'] for x in record['perf']]
        if 'NAIVE' in trial:
            naive_size,naive_acc = naive_size+size,naive_acc+perf
        if 'CLADO' in trial:
            clado_size,clado_acc = clado_size+size,clado_acc+perf
    return naive_size,naive_acc,clado_size,clado_acc

# (4,8)-bit search space
naive_size,naive_acc,clado_size,clado_acc = _split_trials(c48)
c48_naive_pf = getPF(np.array(naive_size),np.array(naive_acc))
c48_clado_pf = getPF(np.array(clado_size),np.array(clado_acc))
plt.plot(c48_naive_pf[0],c48_naive_pf[1],label='(4,8)bits naive MPQ')
plt.plot(c48_clado_pf[0],c48_clado_pf[1],label='(4,8)bits clado MPQ')

# (2,4,8)-bit search space; the four lists keep these values after the cell has run
naive_size,naive_acc,clado_size,clado_acc = _split_trials(c248)
c248_naive_pf = getPF(np.array(naive_size),np.array(naive_acc))
c248_clado_pf = getPF(np.array(clado_size),np.array(clado_acc))
plt.plot(c248_naive_pf[0],c248_naive_pf[1],label='(2,4,8)bits naive MPQ')
plt.plot(c248_clado_pf[0],c248_clado_pf[1],label='(2,4,8)bits clado MPQ')

plt.xlim([0.4,0.7])
plt.ylim([0.88,0.95])
plt.xlabel('Hardware Cost (Model Size in MB)',fontsize=20)
plt.ylabel('Performance (Accuracy)',fontsize=20)
plt.legend()
plt.savefig('c10resnet56_w248.pdf',transparent=True, bbox_inches='tight', pad_inches=0)

In [None]:
# Unpack results stored in the flat format {'naive_size', 'naive_loss', 'feint_size', 'feint_loss'}.
# NOTE(review): the sweep cells in this notebook store results keyed by trial name instead;
#               this cell presumably targets an older file layout — confirm before running.
c48_naive_size = np.array(c48['naive_size'])
c48_naive_loss = c48['naive_loss']
c48_feint_size = np.array(c48['feint_size'])
c48_feint_loss = c48['feint_loss']

# each *_loss entry is a result dict; keep only its accuracy
c48_naive_acc = [entry['mean_acc'] for entry in c48_naive_loss]
c48_feint_acc = [entry['mean_acc'] for entry in c48_feint_loss]

c248_naive_size = np.array(c248['naive_size'])
c248_naive_loss = c248['naive_loss']
c248_feint_size = np.array(c248['feint_size'])
c248_feint_loss = c248['feint_loss']

c248_naive_acc = [entry['mean_acc'] for entry in c248_naive_loss]
c248_feint_acc = [entry['mean_acc'] for entry in c248_feint_loss]

In [None]:
def getPF(xs,ys):
    '''
    Pareto front of (cost, performance) points where larger y is better.

    The points are sorted by x, and every y is replaced by the largest y seen
    at that x or any smaller x (a running maximum).

    xs, ys: equally long sequences of numbers; the inputs are not modified.
    Returns (sorted_xs, running_max_ys) as NumPy arrays.
    '''
    xs = np.array(xs)
    ys = np.array(ys)

    order = np.argsort(xs)

    xs = xs[order]
    # np.maximum.accumulate is the running maximum. The former hand-written loop
    # seeded the maximum with -1 and therefore returned -1 for values below -1.
    ys = np.maximum.accumulate(ys[order])

    return xs,ys

plt.rcParams["figure.figsize"] = (12,10)

# Replace each (size, accuracy) series by its Pareto front before plotting.
c48_feint_size,c48_feint_acc = getPF(c48_feint_size,c48_feint_acc)

c48_naive_size,c48_naive_acc = getPF(c48_naive_size,c48_naive_acc)

c248_feint_size,c248_feint_acc = getPF(c248_feint_size,c248_feint_acc)

c248_naive_size,c248_naive_acc = getPF(c248_naive_size,c248_naive_acc)

# Only the (4,8)-bit fronts are drawn; the (2,4,8) ones are computed but their plots are commented out.
plt.scatter(c48_naive_size,c48_naive_acc,color='lightcoral',alpha=0.5,label='c48 Inter-Layer Depedency Unaware Optimization')
plt.scatter(c48_feint_size,c48_feint_acc,color='lightblue',alpha=0.5,label='c48 FeintLady Optimization')
# plt.scatter(c248_naive_size,c248_naive_acc,color='red',alpha=0.5,label='c248 CLADO Used')
# plt.scatter(c248_feint_size,c248_feint_acc,color='blue',alpha=0.5,label='c248 CLADO Not Used')

plt.xlabel('Hardware cost')
plt.ylabel('Performance')
plt.legend()
#plt.savefig('c100resnet56FeintEffecacy.pdf',transparent=True, bbox_inches='tight', pad_inches=0)

In [None]:
# Scatter of whatever `naive_size` / `naive_acc` currently hold in the kernel.
# NOTE(review): both names are assigned by several earlier cells, so the content depends
#               on which of them ran last.
plt.scatter(naive_size,naive_acc,color='red',alpha=0.5,label='naive')
# plt.scatter(naive_size,naive_acc,color='blue',alpha=0.5,label='feint')
plt.xlabel('hardware cost')
plt.ylabel('performance')
plt.legend()
plt.show()

## Visualization

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt

def _load_result(fname):
    '''Unpickle and return the result object stored in `fname` (trusted local files only).'''
    with open(fname,'rb') as f:
        return pickle.load(f)

# CIFAR-100 results, one pickle per architecture
res_sfn20 = _load_result('result_cifar100_shufflenetv2_x2_0_mode0_useaccFalse.pkl')
res_sfn15 = _load_result('result_cifar100_shufflenetv2_x1_5_mode0_useaccFalse.pkl')
res_mbn14 = _load_result('result_cifar100_mobilenetv2_x1_4_mode0_useaccFalse.pkl')
res_mbn075 = _load_result('result_cifar100_mobilenetv2_x0_75_mode0_useaccFalse.pkl')

fname = 'result_cifar100_resnet56_mode0_useaccFalse.pkl'
res_rsn56 = _load_result(fname)
    
    

In [None]:
# List the keys available in a result file.
for k in res_rsn56: print(k)

In [None]:
def getPF_(xs,ys,mode='max',roundtoprecision=1):
    '''
    Bucket the points by rounded x and keep one y per bucket.

    xs, ys:           equally long sequences of numbers.
    mode:             'max' or 'min' (or a two-argument callable) used to combine
                      the y values that fall into the same bucket.
    roundtoprecision: number of decimals passed to round() when bucketing x.

    Returns (bucket_xs, bucket_ys) as NumPy arrays, in order of first appearance.
    Raises ValueError for an unknown mode string.
    '''
    # Explicit dispatch instead of eval(mode): no arbitrary code is evaluated.
    reducers = {'max': max, 'min': min}
    if callable(mode):
        combine = mode
    elif mode in reducers:
        combine = reducers[mode]
    else:
        raise ValueError(f"mode must be 'max', 'min' or a callable, got {mode!r}")

    pf = {}
    for x,y in zip(xs,ys):
        new_x = round(x,roundtoprecision)
        pf[new_x] = combine(pf[new_x],y) if new_x in pf else y

    pf_x = np.array(list(pf.keys()))
    pf_y = np.array(list(pf.values()))

    return pf_x,pf_y

def getPF(xs,ys):
    '''
    Pareto front of (cost, performance) points where larger y is better.

    The points are sorted by x, and every y is replaced by the largest y seen
    at that x or any smaller x (a running maximum).

    xs, ys: equally long sequences of numbers; the inputs are not modified.
    Returns (sorted_xs, running_max_ys) as NumPy arrays.
    '''
    xs = np.array(xs)
    ys = np.array(ys)

    order = np.argsort(xs)

    xs = xs[order]
    # np.maximum.accumulate is the running maximum. The former hand-written loop
    # seeded the maximum with -1 and therefore returned -1 for values below -1.
    ys = np.maximum.accumulate(ys[order])

    return xs,ys
        

In [None]:

# Pareto fronts per architecture: the x_1/y_1 series come from the 'naive_*' entries,
# the x_2/y_2 series from the 'feint_*' entries of each result file.
x_1_mbn075,y_1_mbn075 = getPF(res_mbn075['naive_size'],res_mbn075['naive_acc'])
x_2_mbn075,y_2_mbn075 = getPF(res_mbn075['feint_size'],res_mbn075['feint_acc'])

x_1_mbn14,y_1_mbn14 = getPF(res_mbn14['naive_size'],res_mbn14['naive_acc'])
x_2_mbn14,y_2_mbn14 = getPF(res_mbn14['feint_size'],res_mbn14['feint_acc'])

x_1_sfn20,y_1_sfn20 = getPF(res_sfn20['naive_size'],res_sfn20['naive_acc'])
x_2_sfn20,y_2_sfn20 = getPF(res_sfn20['feint_size'],res_sfn20['feint_acc'])

x_1_sfn15,y_1_sfn15 = getPF(res_sfn15['naive_size'],res_sfn15['naive_acc'])
x_2_sfn15,y_2_sfn15 = getPF(res_sfn15['feint_size'],res_sfn15['feint_acc'])

x_1_rsn56,y_1_rsn56 = getPF(res_rsn56['naive_size'],res_rsn56['naive_acc'])
x_2_rsn56,y_2_rsn56 = getPF(res_rsn56['feint_size'],res_rsn56['feint_acc'])

#x_random,y_random = getPF(random_size,random_acc)

In [None]:
# random baseline vs use/not use gradient on resnet56
# Loads the CIFAR-10 ResNet-56 optimisation results and the random baseline saved earlier.
# pickle.load executes arbitrary code from the file: only load files you produced yourself.
# plt.rcParams['figure.figsize'] = (12,8)
fname = 'result_cifar10_resnet56_mode0_useaccFalse.pkl'
with open(fname,'rb') as f:
    res_rsn = pickle.load(f)
fname = 'resnet56_random_baseline.pkl'
with open(fname,'rb') as f:
    rand_rsn = pickle.load(f)

In [None]:
# Raw (size, accuracy) points: 'feint_*' entries in blue, 'naive_*' entries in red, random baseline in black.
plt.scatter(res_rsn['feint_size'][:],res_rsn['feint_acc'][:],color='blue',
            marker='o',s=20,alpha=0.5,label='Cross-layer Gradients Used')

plt.scatter(res_rsn['naive_size'][:],res_rsn['naive_acc'][:],color='red',
            marker='o',s=20,alpha=0.5,label='Cross-layer Gradients Ignored')

plt.scatter(rand_rsn['size'],rand_rsn['acc'],color='black',marker='o',s=20,alpha=0.5,
            label='Random Guess')
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('Hardware Cost (Model Size in MB)',fontsize=20)
plt.ylabel('Performance (Accuracy)',fontsize=20)
plt.legend()
# Written to the working directory as a tightly cropped, transparent PDF.
plt.savefig('c10resnet.pdf',transparent=True, bbox_inches='tight', pad_inches=0)

In [None]:
plt.rcParams['figure.figsize'] = (12,8)

# One entry per curve: (x, y, colour, legend label).
# The x_1/y_1 series (label suffix N) are the fronts built from the 'naive_*' results,
# the x_2/y_2 series (label suffix A) those built from the 'feint_*' results.
pareto_curves = [
    (x_1_mbn14,y_1_mbn14,'red','mobilenetv2_x1_4(N)'),
    (x_2_mbn14,y_2_mbn14,'blue','mobilenetv2_x1_4(A)'),
    # (x_1_mbn075,y_1_mbn075,'red','mobilenetv2_x0_75(N)'),
    # (x_2_mbn075,y_2_mbn075,'blue','mobilenetv2_x0_75(A)'),
    (x_1_sfn20,y_1_sfn20,'lightcoral','shufflenetv2_x2_0(N)'),
    (x_2_sfn20,y_2_sfn20,'lightblue','shufflenetv2_x2_0(A)'),   # label typo 'sufflenetv2' fixed
    (x_1_sfn15,y_1_sfn15,'orangered','shufflenetv2_x1_5(N)'),
    (x_2_sfn15,y_2_sfn15,'cyan','shufflenetv2_x1_5(A)'),        # label typo 'sufflenetv2' fixed
    (x_1_rsn56,y_1_rsn56,'darkred','resnet56(N)'),
    (x_2_rsn56,y_2_rsn56,'darkblue','resnet56(A)'),
]
for curve_x,curve_y,curve_color,curve_label in pareto_curves:
    plt.plot(curve_x,curve_y,color=curve_color,linewidth=1,label=curve_label)

plt.ylim([0.68,0.76])
plt.xlim([0.,4])
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('Hardware Cost (Model Size in MB)',fontsize=20)
plt.ylabel('Performance (Accuracy)',fontsize=20)
plt.legend()

# plt.ylim([0.7,0.755])
# plt.xlim([2.7,4.0])
plt.savefig('c100pareto_3nets.pdf',transparent=True, bbox_inches='tight', pad_inches=0)

In [None]:
# Pareto curves against the natural log of the model size.
# NOTE(review): x_1_mbn / x_1_sfn / x_1_rsn (and the matching y_* and *_2_* names) are not
#               defined in the visible notebook; the fronts computed above are named
#               x_1_mbn14, x_1_sfn20, x_1_rsn56, ... — confirm which series is meant.
# NOTE(review): here the x_1 series is labelled (A) and x_2 (N), the opposite of the
#               previous figure — confirm the intended labelling.
plt.rcParams['figure.figsize'] = (12,6)
plt.plot(np.log(x_1_mbn)[100:],y_1_mbn[100:],color='red',marker='^',markersize=3,alpha=0.5,linewidth=1,label='mobilenet(A)')
plt.plot(np.log(x_2_mbn)[100:],y_2_mbn[100:],color='blue',marker='v',markersize=3,alpha=0.5,linewidth=1,label='mobilenet(N)')

plt.plot(np.log(x_1_sfn)[200:],y_1_sfn[200:],color='lightcoral',marker='^',markersize=3,alpha=0.5,linewidth=1,label='shufflenet(A)')
plt.plot(np.log(x_2_sfn)[200:],y_2_sfn[200:],color='lightblue',marker='v',markersize=3,alpha=0.5,linewidth=1,label='sufflenet(N)')

plt.plot(np.log(x_1_rsn)[0:],y_1_rsn[0:],color='darkred',marker='^',markersize=3,alpha=0.5,linewidth=1,label='resnet(A)')
plt.plot(np.log(x_2_rsn)[0:],y_2_rsn[0:],color='darkblue',marker='v',markersize=3,alpha=0.5,linewidth=1,label='resnet(N)')
# plt.scatter(x_1,y_1,color='red',marker='^',s=10,alpha=0.5)
# plt.scatter(x_2,y_2,color='blue',marker='v',s=10,alpha=0.5)
# plt.ylim([0.66,0.76])
# plt.xlim([0.5,3.5])
plt.ylim([0.65,0.76])
plt.xlim([-1,1.8])
plt.xlabel('Log Model Size (in MB)',fontsize=20)
plt.ylabel('Test Accuracy',fontsize=20)
plt.legend()
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
# plt.ylim([0.7,0.755])
# plt.xlim([2.7,4.0])
#plt.savefig('c100pareto.pdf',transparent=True, bbox_inches='tight', pad_inches=0)

