In [1]:
!pip3 install brevitas -q

In [10]:

import sys, os, shutil, json
import cv2 as cv
import matplotlib.pyplot as plt
import torch
sys.path.append('..')
# from .. 
import metrics, preprocessing, utils, training
import numpy as np

preprocessing.BaseGenerator.MAX_NUMBER_OF_THREADS = 2
preprocessing.YoloDataGenerator.NUMBER_OF_THREADS = 1

metrics.CONSTANTS.OLD_TORCH = True
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
dataset_local_path = '../../DATASETS/Merged_dataset'
folds = training.load_folds('../folds_state_path_bbox.pkl')
batch_size = 16


In [5]:
class DWConv2d(torch.nn.Module):
    
    def __init__(self, in_ch, intermediate_channels=1, bias=False, use_bn=True, use_relu=False, device=None):
        super().__init__()
        self.conv = torch.nn.Conv2d(in_ch,in_ch*intermediate_channels,3,padding=1, groups=in_ch, bias=bias)
        self.mul = intermediate_channels
        self.reordered = False
        self.use_bn = use_bn
        self.use_relu = use_relu
        self.use_bias = bias
        
        if use_bn:
            self.bn = torch.nn.BatchNorm2d(in_ch*intermediate_channels)
        else:
            self.bn = None
        if use_relu:
            self.relu = torch.nn.ReLU(True)
        else:
            self.relu = None
    
    def reorder(self, order:torch.tensor):
        ch_in = self.conv.in_channels
        ch_out = self.conv.out_channels
        mul = self.mul
        
        layers = []
        indeces = []
        for i in range(mul):
            ind = torch.arange(0,ch_in)*mul+i
            ind = ind[order]
            
            w = self.conv.weight[ind,...]
            b = self.conv.bias[ind,...] if self.use_bias else None
            L = DWConv2d(ch_in,intermediate_channels=1, 
                         bias=self.use_bias,
                         use_bn=self.use_bn, 
                         use_relu=self.use_relu)
            
            with torch.no_grad():
                L.conv.weight[...] = w
                if self.use_bias:
                    L.conv.bias[...] = b
            
            if self.use_bn is not None:
                with torch.no_grad():
                    L.bn.weight[...] = self.bn.weight[ind,...]
                    L.bn.bias[...] = self.bn.bias[ind,...]
                    L.bn.running_mean[...] = self.bn.running_mean[ind,...]
                    L.bn.running_var[...] = self.bn.running_var[ind,...]
            
            indeces.append(ind)
            layers.append(L)
            self.add_module("sub_dw_"+str(i),L)
        
        del self.conv
        del self.bn
        del self.relu
        
        indeces = torch.cat(indeces)
        self.reordered = True
        self.layers = layers
        
        return indeces

    def forward(self, x):
        
        if self.reordered:
            y = []
            for L in self.layers:
                y.append(L(x))
            
            x = torch.cat(y,dim=1) if len(y) > 1 else y[0]
            
        else:
            x = self.conv(x)
            if self.bn:
                x = self.bn(x)
            if self.relu:
                x = self.relu(x)
        
        return x


class PWConv2d(torch.nn.Module):
    def __init__(self,in_ch, out_ch, bias=False, use_bn=True, use_relu=False, use_mp=False, device=None):
        super().__init__()
        self.conv = torch.nn.Conv2d(in_ch,out_ch,1,padding=0, bias=bias)
    
        if use_bn:
            self.bn = torch.nn.BatchNorm2d(out_ch)
        else:
            self.bn = None
        if use_relu:
            self.relu = torch.nn.ReLU(True)
        else:
            self.relu = None
        if use_mp:
            self.mp = torch.nn.MaxPool2d(2,2)
        else:
            self.mp = None
    
    def reorder(self, order:torch.tensor):
        conv = self.conv
        ch_in = self.conv.in_channels
        ch_out = self.conv.out_channels
        
        with torch.no_grad():
            self.conv.weight[...] = self.conv.weight[:,order,...] 
        
        return torch.arange(0,ch_out)
    
    def forward(self, x):
        x = self.conv(x)
        
        if self.bn:
            x = self.bn(x)
        if self.relu:
            x = self.relu(x)
        if self.mp:
            x = self.mp(x)
        
        return x


class AnchorMul(torch.nn.Module):
    def __init__(self, num_of_anchors, device=torch.device('cpu')):
        super().__init__()
        self.noa = num_of_anchors
        self.anchors = torch.nn.Parameter(data=torch.Tensor(1,2*self.noa,1,1), requires_grad=True)
        self.anchors.data.uniform_(-1,1)
        self.register_parameter('anchors', self.anchors)
        
        self.to(device)

    def forward(self, x):
        xvc = x[:,:-2*self.noa,:,:]
        xwh = x[:,-2*self.noa:,:,:]
        ywh = xwh*torch.exp(self.anchors)
        y = torch.cat((xvc,ywh), dim=1)

        return y

# float LN7
net = torch.nn.Sequential(
            DWConv2d(3, intermediate_channels=2, bias=False, use_bn=True, use_relu=True, device=device),
            PWConv2d(6,8, bias=True, use_bn=True, use_relu=False, use_mp=True, device=device),
            DWConv2d(8, bias=True, use_bn=True, use_relu=False, device=device),
            DWConv2d(8, intermediate_channels=2, bias=False, use_bn=True, use_relu=True, device=device),
            PWConv2d(16,32, bias=True, use_bn=True, use_relu=False, use_mp=True, device=device),
            DWConv2d(32, bias=True, use_bn=True, use_relu=False, device=device),
            DWConv2d(32, intermediate_channels=2, bias=False, use_bn=True, use_relu=True, device=device),
            PWConv2d(64,64, bias=False, use_bn=True, use_relu=False, use_mp=True, device=device),
            DWConv2d(64, bias=True, use_bn=True, use_relu=False, device=device),
            DWConv2d(64, intermediate_channels=2, bias=True, use_bn=True, use_relu=True, device=device),
            PWConv2d(128,128, bias=False, use_bn=True, use_relu=True, use_mp=True, device=device),
            DWConv2d(128, bias=True, use_bn=True, use_relu=False, device=device),
            DWConv2d(128, intermediate_channels=2, bias=True, use_bn=True, use_relu=True, device=device),
            PWConv2d(256,256, bias=True, use_bn=True, use_relu=True, device=device),
            DWConv2d(256, bias=True, use_bn=True, use_relu=False, device=device),
            DWConv2d(256, intermediate_channels=2, bias=True, use_bn=True, use_relu=True, device=device),
            PWConv2d(512,256, bias=True, use_bn=True, use_relu=True, device=device),
            DWConv2d(256, intermediate_channels=1, bias=False, use_bn=True, use_relu=True, device=device),
            PWConv2d(256,5*3, bias=True, use_bn=False, use_relu=False, device=device)
).to(device)
anchor_mul = AnchorMul(3,device).to(device)

sd = torch.load('weights_float_gciou.pt',map_location=device)
k = list(sd.keys())
v = list(sd.values())
# load anchor mul
am_sd = {list(anchor_mul.state_dict().keys())[0]:v[-1]}
anchor_mul.load_state_dict(am_sd)
# load LN7 weights
k = list(net.state_dict().keys()) # net keys
net_sd = {k:v for k,v in zip(k,v[:-1])}
net.load_state_dict(net_sd)

net = net.eval()
anchor_mul = anchor_mul.eval()
net.train(False)
anchor_mul.train(False)

# for k,v in net.state_dict().items():
#     print(k,v.shape)


AnchorMul()

In [6]:
net = net
order = torch.arange(0,3)
for n,m in net.named_children():
    order = m.reorder(order)
net = net.eval().train(False)

In [7]:
image_shape = (112, 208, 3)

after_load = preprocessing.numpy_to_torch_iou_params(device)
to_anchors_single = lambda *x: preprocessing.to_anchors_for_iou_loss(*x,False,False)
# to_anchors_multi = lambda *x: preprocessing.to_anchors_for_iou_loss(*x,True,True)
to_anchors_multi = lambda *x: preprocessing.to_anchors_for_iou_loss(*x,True,False)
# to_anchors_single = to_anchors_multi

anchors = [22,33,
            5,10,
            15,5
          ]
anchors = np.array(anchors, np.float32).reshape((-1,2))
anchors *= np.array([[image_shape[0]/340, image_shape[1]/640]])

# pass example tensor
torch.cuda.empty_cache()
tensor = torch.rand((8,3,)+image_shape[:2]).to(device)
print("Input shape =",tensor.shape)
with torch.no_grad():
    result = net(tensor)

print("Result shape =",result.shape)

# get yolo paremeters
# output_sizes = net.output_sizes(input_size=image_shape[:2][::-1])[-1,:]
output_sizes = np.array(result.shape[2:][::-1])
del tensor
del result

print("Anchors: ")
print(anchors) 
print("Output sizes: ")
print(output_sizes) 

# CREATE GENERATORS
def numpy_to_tensor(X,y,device=device):
    return utils.data_to_tensor_v3(X,y,device)
(None,None,None,None)
grid_WH2 = image_shape[:2][::-1] // (2*output_sizes)

val_generator = preprocessing.YoloDataGenerator(
                            dataset_local_path,
                            input_shape=image_shape,
                            anchors=anchors,
                            images_labes=[], 
                            batch_size=batch_size,
                            name='ValGenerator', 
                            augmentator=None,
                            output_size=output_sizes,
                            after_load=after_load,
                            # bbox_to_anchors=to_anchors_single,
                            bbox_to_anchors=to_anchors_multi,
                            )
test_generator = preprocessing.YoloDataGenerator(
                            dataset_local_path,
                            input_shape=image_shape,
                            anchors=anchors,
                            images_labes=[], 
                            batch_size=batch_size,
                            name='TestGenerator', 
                            augmentator=None,
                            output_size=output_sizes,
                            after_load=after_load,
                            # bbox_to_anchors=to_anchors_single,
                            bbox_to_anchors=to_anchors_multi,
                            )

_, val_set = folds.__getitem__(0, train_folds=4)
val_generator.images_labes = val_set
test_generator.images_labes = folds.test_set

# decorator -> aplly anchor mul before metric calculation 
metric_iou = metrics.SingleObjectIOUsBasedMetrics(anchors, image_shape, device)
def mean_iou(y_pred, y_ref, metric_iou=metric_iou, anchor_mul=anchor_mul):
    y_pred = anchor_mul(y_pred)
    return metric_iou(y_pred, y_ref)



Input shape = torch.Size([8, 3, 112, 208])
Result shape = torch.Size([8, 15, 7, 13])
Anchors: 
[[ 7.247059  10.725    ]
 [ 1.6470588  3.25     ]
 [ 4.9411764  1.625    ]]
Output sizes: 
[13  7]


In [8]:
import torch
import torch.nn as nn


def evaluate(model,
             dataloader,
             evaluator, fl_model=net
             ):
    with torch.no_grad():
        score = 0.0
        cntr = 0
        for i in range(len(dataloader)):
            XY = dataloader[i]
            X = XY[0]
            Y = XY[1]
            L = X.shape[0]
            y_pred = model(X)
            score = score*cntr + X.shape[0]*evaluator(y_pred, Y)
            cntr += X.shape[0]
            score /= cntr
            print("\rEvaluation {}/{}. Score = {}".format(i,len(dataloader), score),end='')
        
        print("\rEvaluation {}/{}. Score = {}".format(len(dataloader),len(dataloader), score),end='\n')


def quantize(float_model:torch.nn.Module, 
             input_shape:tuple,
             quant_dir:str, 
             quant_mode:str, 
             device:torch.device,
             dataloader,
             evaluator):
    """
    :param float_model: float model with loaded weights
    :param input_shape: shape of input(CH,W,H)
    :param quant_dir: path to directory with quantized model components
    :param quant_mode: quant_mode in ['calib', 'test'] 
    :param data_loader: data_loader target is not needed - for 'calib' must be batch_size == 1
    :param evaluator: fcn/obj like: fcn(y_pred, y_ref) -> float 
    """
    # available in docker or after packaging 
    # vitis-AI-tools/..../pytorch../pytorch_nndct
    # and installing the package
    from pytorch_nndct.apis import torch_quantizer, dump_xmodel
    # model to device
    model = float_model.to(device)
    
    # That was present in vai tutorial.
    # I don't know if it affects to anything?
    # Force to merge BN with CONV for better quantization accuracy
    optimize = 1

    rand_in = torch.randn((1,)+input_shape[-1:]+input_shape[:2])
    print("get qunatizer start")
    try:
        quantizer = torch_quantizer(
            quant_mode, model, rand_in, output_dir=quant_dir, device=device)
    except Exception as e:
        print("exception:")
        print(e)
        return
    print("get qunatizer end")
        
    print("get quantized model start")
    quantized_model = quantizer.quant_model
    print("get quantized model end")

    # evaluate
    print("testing st")
    evaluate(quantized_model, dataloader, evaluator)
    print("testing end")

    # export config
    if quant_mode == 'calib':
        print("export config")
        quantizer.export_quant_config()
        print("export config end")
    # export model
    if quant_mode == 'test':
        print("export xmodel")
        quantizer.export_xmodel(deploy_check=False, output_dir=quant_dir)
        print("export xmodel end")

    return

"""
Needed to run of quantize.
first with quant_mode = 'calib'
second with quant_mode = 'test'
"""


"\nNeeded to run of quantize.\nfirst with quant_mode = 'calib'\nsecond with quant_mode = 'test'\n"

In [None]:
# Evaluate float model on test dataset
# evaluate(net,test_generator,evaluator=mean_iou)
# Evaluation 6123/6124. Score = 0.6773354439616692
# Evaluate float model on val dataset
# evaluate(net,val_generator,evaluator=mean_iou)
# Evaluation 279/1839. Score = 0.6722009609852523

In [203]:
# Use only subset of val set
# set whole dataset
val_generator.images_labes = val_set
# shuffle samples
val_generator.on_epoch_end()
# get subset (100) of samples
# subset = val_generator.images_labes[:200]
val_generator.images_labes = subset
# process only one image per forward
val_generator.batch_size = 1

In [202]:
# Quantize model - calib
quantize(net, 
         image_shape,
         quant_dir='quant_dir',
         quant_mode='calib',
         device=device,
         dataloader=val_generator,
         evaluator=mean_iou)

get qunatizer start

[0;32m[NNDCT_NOTE]: Quantization calibration process start up...[0m

[0;32m[NNDCT_NOTE]: =>Quant Module is in 'cpu'.[0m

[0;32m[NNDCT_NOTE]: =>Parsing Sequential...[0m

[0;32m[NNDCT_NOTE]: =>Doing weights equalization...[0m

[0;32m[NNDCT_NOTE]: =>Quantizable module is generated.(quant_dir/Sequential.py)[0m
get qunatizer end
get quantized model start

[0;32m[NNDCT_NOTE]: =>Get module with quantization.[0m
get quantized model end
testing st
Evaluation 200/200. Score = 0.6620479585044089
testing end
export config

[0;32m[NNDCT_NOTE]: =>Exporting quant config.(quant_dir/quant_info.json)[0m
export config end


In [206]:
try:
    # Quantize model - test
    quantize(net, 
             image_shape,
             quant_dir='quant_dir',
             quant_mode='test',
             device=device,
             dataloader=val_generator,
             evaluator=mean_iou)
except Exception as e:
    print(e)
except:
    print("XD")

get qunatizer start

[0;32m[NNDCT_NOTE]: Quantization test process start up...[0m

[0;32m[NNDCT_NOTE]: =>Quant Module is in 'cpu'.[0m

[0;32m[NNDCT_NOTE]: =>Parsing Sequential...[0m

[0;32m[NNDCT_NOTE]: =>Doing weights equalization...[0m

[0;32m[NNDCT_NOTE]: =>Quantizable module is generated.(quant_dir/Sequential.py)[0m
get qunatizer end
get quantized model start

[0;32m[NNDCT_NOTE]: =>Get module with quantization.[0m
get quantized model end
testing st
Evaluation 6614/97969. Score = 0.6549693154283055XD


In [205]:
!vai_c_xir --xmodel quant_dir/Sequential_int.xmodel --arch arch.json --net_name LN7_VAI --output_dir  build

**************************************************
* VITIS_AI Compilation - Xilinx Inc.
**************************************************
[UNILOG][INFO] The compiler log will be dumped at "/tmp/vitis-ai-user/log/xcompiler-20220123-181535-70874"
[UNILOG][INFO] Compile mode: dpu
[UNILOG][INFO] Debug mode: function
[UNILOG][INFO] Target architecture: DPUCZDX8G_CUSTOMIZED
[UNILOG][INFO] Graph name: Sequential, with op num: 188
[UNILOG][INFO] Begin to compile...
[UNILOG][INFO] Total device subgraph number 3, DPU subgraph number 1
[UNILOG][INFO] Compile done.
[UNILOG][INFO] The meta json is saved to "/workspace/TRAIN/Vitis_AI/build/meta.json"
[UNILOG][INFO] The compiled xmodel is saved to "/workspace/TRAIN/Vitis_AI/build/LN7_VAI.xmodel"
[UNILOG][INFO] The compiled xmodel's md5sum is bb935c9a20bd88579cc1e298a8f0fe1a, and been saved to "/workspace/TRAIN/Vitis_AI/build/md5sum.txt"


In [11]:
# CREATE TEST EVAL SET
subset = folds.test_set[:3000]
d = {}
generator = preprocessing.YoloDataGenerator(
                            dataset_local_path,
                            input_shape=image_shape,
                            anchors=anchors,
                            images_labes=subset, 
                            batch_size=1,
                            name='Generator', 
                            augmentator=None,
                            output_size=output_sizes,
                            after_load=after_load,
                            # bbox_to_anchors=to_anchors_single,
                            bbox_to_anchors=to_anchors_single,
                            )

# for i,(path,bbox) in enumerate(subset):
for i in range(len(generator)):
    img,Y = generator[i]
    img = img.reshape(3,-1).numpy().T.reshape((112,208,3))
    img = (img*255).astype(np.uint8)
    bbox = np.round(Y[0].reshape(-1).numpy()).astype(int).flatten().tolist()
    
    new_path = 'images/img_'+(str(i).zfill(4))+'.png'
    dst = os.path.join('eval_images/'+new_path)
#     bbox = np.round(utils.xcycwh_to_ltrb(bbox.copy().reshape((1,4)))).astype(int).flatten().tolist()
    bbox = {k:v for k,v in zip('ltrb',bbox)}
    d[str(i)] = {'path':new_path,'bbox':bbox}
    cv.imwrite(dst, img)
    
# print(d)
with open('eval_images/gt.json','w') as f:
    f.write(json.dumps(d, indent=4,))