In [1]:
import numpy as np
import os, time, json, sys
import cv2 as cv
import pynq_dpu, pynq
from threading import Thread
import numpy as np
# import matplotlib.pyplot as plt
# add path
sys.path.append('../custom_lib')
from utils import iou, sigmoid, PowerRecorder

In [2]:
def load_img(path, dst_list, in_sh, shift=1):
    """Read an image, quantise it to int8 and append it to ``dst_list``.

    The result is delivered by appending to ``dst_list`` rather than by
    returning it, so the function can be used directly as a thread target.

    Args:
        path: Image file path passed to ``cv.imread``.
        dst_list: List that receives the prepared ``np.int8`` array.
        in_sh: Shape the pixel data is reshaped to.
        shift: Number of bits each pixel value is shifted right before the
            cast to ``np.int8``.

    Raises:
        OSError: If ``cv.imread`` could not read the file (it returns
            ``None`` instead of raising for missing/undecodable images).
    """
    img = cv.imread(path)
    if img is None:
        # Without this check the failure surfaces as an opaque
        # "unsupported operand type(s) for >>" TypeError.
        raise OSError("cv.imread could not read image: {!r}".format(path))

    pre_buff = img >> shift
    pre_buff = pre_buff.reshape(in_sh).astype(np.int8)
    dst_list.append(pre_buff)

    
class VAI:
    """Pipelined wrapper around a DPU runner for a single-box detector.

    Every call overlaps three stages:

    * the DPU runs inference on the image loaded by the previous call,
    * a background thread loads and quantises the image given to this call,
    * the host decodes the DPU output produced by the previous call.

    The value returned by a call therefore belongs to the image that was
    passed in ``delay`` calls earlier. The first ``delay`` results are
    decoded from uninitialised buffers and must be discarded by the caller.
    """

    def __init__(self, dpu_path, xmodel, anchors, anchors_mul):
        """Load the overlay and model, then allocate the pipeline buffers.

        Args:
            dpu_path: Path passed to ``pynq_dpu.DpuOverlay``.
            xmodel: Model file passed to ``DpuOverlay.load_model``.
            anchors: Array indexed as ``anchors[anchor_index, :]`` holding
                the two size factors (w, h) of each anchor.
            anchors_mul: Array indexed like ``anchors`` holding the scale
                applied to the raw w/h outputs before ``exp``.
        """
        self.ov = pynq_dpu.DpuOverlay(dpu_path)
        self.ov.load_model(xmodel)

        self.dpu = self.ov.runner

        inputTensors = self.dpu.get_input_tensors()
        outputTensors = self.dpu.get_output_tensors()

        shapeIn = tuple(inputTensors[0].dims)
        shapeOut = tuple(outputTensors[0].dims)
        in_bw = inputTensors[0].get_attr('bit_width')
        in_fp = inputTensors[0].get_attr('fix_point')
        out_bw = outputTensors[0].get_attr('bit_width')
        out_fp = outputTensors[0].get_attr('fix_point')

        # (bit width, fix point) of the input / output tensors
        self.in_repr = (in_bw, in_fp)
        self.out_repr = (out_bw, out_fp)

        # buffers
        # pre_buff is filled by the image loading thread (one entry per call)
        self.pre_buff = []
        self.dpu_buff_in = np.empty(shapeIn, dtype=np.int8, order="C")
        self.dpu_buff_out = np.empty(shapeOut, dtype=np.int8, order="C")
        self.post_buff = np.empty(shapeOut, dtype=np.int8, order="C")

        self.in_sh = shapeIn
        self.out_sh = shapeOut

        # output grid geometry; the last axis carries 5 values per anchor
        self.rows = shapeOut[-3]
        self.cols = shapeOut[-2]
        self.map_size = self.rows * self.cols
        self.anchors_num = shapeOut[-1] // 5

        input_shape = np.array(shapeIn[-3:-1][::-1], dtype=np.float32)  # colsxrows
        output_shape = np.array(shapeOut[-3:-1][::-1], dtype=np.float32)  # colsxrows
        # position at the output grid to pos at the original img
        self.xy_mul = input_shape / output_shape
        self.anchors = anchors
        self.anchors_mul = anchors_mul
        # converts raw fixed-point output values to real numbers
        self.to_int_mul = 1 / (2**self.out_repr[1])

        # number of calls between submitting an image and getting its result
        self.delay = 2

    def find_max(self, a):
        """Decode the highest-scoring box from a raw DPU output buffer.

        The last axis of ``a`` is interpreted as 5 consecutive groups
        (score, x, y, w, h), each group holding one value per anchor.

        Args:
            a: Raw output array whose last axis has ``5 * anchors_num``
                entries.

        Returns:
            ``[left, top, right, bottom]`` as a list of Python ints, in
            input-image coordinates.
        """
        # rows of `a` become: score, x, y, w, h; columns run anchor-major
        # over the flattened output grid
        a = a.reshape((-1, 5 * self.anchors_num)).T.reshape((5, -1))
        # first occurrence of the highest score
        pos = np.argmax(a[0, :].flatten())

        # get position in 3d
        anchor_pos = pos // self.map_size
        col_row_pos = np.array([pos % self.cols,
                                pos // self.cols - anchor_pos * self.rows])
        # anchor wh for given anchor pos
        anchor = self.anchors[anchor_pos, :]
        anchors_mul = self.anchors_mul[anchor_pos, :]
        # get most probable bbox params
        # np.float64 is what the removed np.float alias used to resolve to
        xywh_int = a[1:, pos].flatten().astype(np.float64)
        xywh_f = xywh_int * self.to_int_mul
        # get position of center
        xy = col_row_pos + sigmoid(xywh_f[:2])
        xy = xy * self.xy_mul
        # get sizes
        wh = np.exp(xywh_f[-2:] * anchors_mul) * anchor
        # XcYcWH to LTRB
        LT = xy - wh / 2
        RB = LT + wh
        # to int values
        ltrb = np.concatenate([LT, RB])
        ltrb = np.round(ltrb).astype(int).tolist()

        return ltrb

    def load_img(self, path):
        """Start a thread that loads ``path`` into ``self.pre_buff``.

        Returns:
            The started ``Thread``; join it before reading ``self.pre_buff``.
        """
        # shift = bw - fraq  <- align fractional part to LSB
        shift = self.in_repr[0] - self.in_repr[1]
        th = Thread(target=load_img, args=(path, self.pre_buff, self.in_sh, shift))
        th.start()
        return th

    def __call__(self, path):
        """Advance the pipeline by one step.

        Args:
            path: Image to load during this step; it is fed to the DPU by
                the next call.

        Returns:
            The box decoded from the DPU output of the previous call, i.e.
            the result for the image passed ``delay`` calls ago.

        Raises:
            IndexError: If the loader thread did not deliver an image.
        """
        # start dpu processing
        job_id = self.dpu.execute_async([self.dpu_buff_in], [self.dpu_buff_out])

        # run img loading thread
        img_th = self.load_img(path)

        # post process prev result
        result = self.find_max(self.post_buff)

        # sync
        self.dpu.wait(job_id)
        img_th.join()

        if not self.pre_buff:
            # the loader thread died before appending its buffer
            raise IndexError(
                "image loading thread produced no buffer for {!r}".format(path))

        # swap buffers
        # out buffs
        self.post_buff, self.dpu_buff_out = self.dpu_buff_out, self.post_buff
        # in buffs
        self.dpu_buff_in = self.pre_buff[0]
        self.pre_buff = []

        return result


def get_dataset(path):
    """Read ``gt.json`` from ``path`` and return image paths with their boxes.

    ``gt.json`` must hold a JSON object; each of its values needs a
    ``'path'`` entry (joined onto ``path``) and a ``'bbox'`` object with the
    keys ``'l'``, ``'t'``, ``'r'`` and ``'b'``.

    Returns:
        A tuple ``(paths, ltrb)`` of two equally long lists: the image file
        paths and the matching ``[l, t, r, b]`` boxes.
    """
    gt_file = os.path.join(path, 'gt.json')
    with open(gt_file) as fh:
        records = json.load(fh)

    entries = list(records.values())
    img_paths = [os.path.join(path, rec['path']) for rec in entries]
    boxes = [[rec['bbox'][side] for side in ('l', 't', 'r', 'b')]
             for rec in entries]

    return img_paths, boxes


def predict(paths, vai_obj: "VAI"):
    """Run every image through the pipelined detector and align the results.

    ``vai_obj(p)`` returns the result for the image submitted
    ``vai_obj.delay`` calls earlier, so after all paths have been submitted
    the pipeline is flushed with ``vai_obj.delay`` extra calls (re-using the
    last path) and the first ``vai_obj.delay`` results are dropped.

    Args:
        paths: Iterable of image paths.
        vai_obj: Callable detector exposing an integer ``delay`` attribute.

    Returns:
        A list with one result per entry of ``paths``, in the same order.
        An empty ``paths`` yields ``[]`` without invoking ``vai_obj``.
    """
    paths = list(paths)
    if not paths:
        # nothing to submit, and no "last path" to flush the pipeline with
        return []

    results = [vai_obj(p) for p in paths]

    # flush: push the remaining in-flight results out of the pipeline
    last_path = paths[-1]
    for _ in range(vai_obj.delay):
        results.append(vai_obj(last_path))

    return results[vai_obj.delay:]

In [3]:
# Per-anchor scale factors (one row per anchor): VAI.find_max multiplies the
# raw w/h network outputs by these before applying exp().
anchor_mul = [[0.23019284009933472, 0.23658646643161774], 
              [0.19149231910705566, 0.19143685698509216], 
              [0.20761309564113617, 0.18979156017303467]]
# Per-anchor sizes (one row per anchor): VAI.find_max multiplies
# exp(raw * anchor_mul) by these to obtain the box width/height.
anchors = [[7.247058868408203, 10.725000381469727], 
           [1.6470588445663452, 3.25], 
           [4.941176414489746, 1.625]]
anchor_mul = np.array(anchor_mul, dtype=np.float32)
anchors = np.array(anchors, dtype=np.float32)
# Image paths and reference [l, t, r, b] boxes read from ../eval_images/gt.json
paths, bbox_ref = get_dataset('../eval_images')

# Loads the DPU bitstream and the compiled model, and allocates the buffers
vai = VAI('../VAI/dpu.bit','LN7_VAI.xmodel', anchors, anchor_mul)

# Repeat the whole benchmark 5 times; each pass processes the full dataset
# and prints its own statistics.
for i in range(5):
    print()
    print(i)
    # start time measurement
    # NOTE(review): the timed window also covers creating the PowerRecorder
    # and entering/leaving its recording context.
    start = time.time()
    # record power every 0.05[s]
    recorder = PowerRecorder()
    with recorder.record(0.05):
        bbox_pred = predict(paths,vai)

    # stop time measurement
    end = time.time()
    t = end - start
        
    # Energy measurements
    # energy = mean power * elapsed time; it is printed in joules, so
    # mean_power is presumably in watts -- confirm against PowerRecorder.
    energy = recorder.mean_power * t    

    total_time = t
    fps = len(paths) / total_time
    total_energy = energy
    energy_per_sample = energy / len(paths)

    # iou() comes from the project's utils module; presumably it yields one
    # IoU value per (prediction, reference) pair -- confirm.
    IOU = iou(bbox_pred, bbox_ref)
    mean_iou = np.mean(IOU)

    print("Number of images:", len(paths))
    print("Total time:", total_time, "[s]")
    print("Throughput:", fps, "[fps]")
    print("Total energy:", total_energy, "[J]")
    print("Energy per sample:", energy_per_sample, "[J/sample]")
    print("Mean IoU:", mean_iou)
