In [1]:
import numpy as np
import os, time, json, sys
import cv2 as cv
import pynq_dpu, pynq
from threading import Thread
import numpy as np
# import matplotlib.pyplot as plt
# add path
sys.path.append('../custom_lib')
from utils import iou, sigmoid, PowerRecorder

In [2]:
def load_img(path, dst_list, in_sh):
    """Read an image, convert it to signed 8-bit input and append it to ``dst_list``.

    Every 8-bit pixel value ``p`` is mapped to ``floor((p - 128) / 2)``, which
    lies in ``[-64, 63]``, and the result is stored as ``int8``.
    (Presumably this matches the fixed-point input format of the model --
    TODO confirm against the model's input ``fix_point``.)

    Args:
        path: Image file path handed to ``cv.imread``.
        dst_list: List that receives the preprocessed array.  The function is
            used as a thread target, so the result is delivered through this
            list rather than a return value.
        in_sh: Shape the pixel data is reshaped to before it is appended.

    Raises:
        OSError: If ``cv.imread`` could not read ``path`` (it returns ``None``
            instead of raising in that case).
    """
    img = cv.imread(path)
    if img is None:
        # Without this check the failure shows up as an opaque TypeError on
        # the arithmetic below.
        raise OSError(f"cv.imread could not read image: {path!r}")

    # uint8 addition wraps modulo 256: adding 128 flips the top bit, so the
    # byte read as two's complement now equals p - 128.
    pre_buff = img + np.uint8(128)
    # Arithmetic shift right by one done on unsigned bytes: logical shift,
    # then put the sign bit back.
    pre_buff = (pre_buff >> 1) | (pre_buff & 128)
    pre_buff = pre_buff.reshape(in_sh).astype(np.int8)
    dst_list.append(pre_buff)

    
class VAI_FINN:
    """Pipelined bounding-box predictor built on a ``pynq_dpu`` overlay runner.

    Each call to the instance overlaps three stages that work on different
    images: the DPU job for the previously loaded image, a background thread
    that loads the image given to this call, and post-processing of the DPU
    output produced one call earlier.  The value returned by a call therefore
    belongs to the image passed ``delay`` calls before it.
    """

    def __init__(self, dpu_path, xmodel, anchors):
        """Load the overlay and model and allocate the pipeline buffers.

        Args:
            dpu_path: Path given to ``pynq_dpu.DpuOverlay``.
            xmodel: Model file given to ``DpuOverlay.load_model``.
            anchors: Array with one row per anchor; each row is multiplied
                element-wise with the decoded (w, h) pair in ``find_max``.
        """
        self.ov = pynq_dpu.DpuOverlay(dpu_path)
        self.ov.load_model(xmodel)
        # Number of calls between feeding an image and getting its result.
        self.delay = 2

        self.dpu = self.ov.runner

        in_tensor = self.dpu.get_input_tensors()[0]
        out_tensor = self.dpu.get_output_tensors()[0]

        shape_in = tuple(in_tensor.dims)
        shape_out = tuple(out_tensor.dims)

        # (bit_width, fix_point) of the input and output tensors.
        self.in_repr = (in_tensor.get_attr('bit_width'),
                        in_tensor.get_attr('fix_point'))
        self.out_repr = (out_tensor.get_attr('bit_width'),
                         out_tensor.get_attr('fix_point'))

        # Buffers.  They are zero-filled rather than left uninitialised so
        # that the warm-up calls (whose results are discarded) post-process
        # well-defined data instead of arbitrary memory.
        self.pre_buff = []
        self.dpu_buff_in = np.zeros(shape_in, dtype=np.int8, order="C")
        self.dpu_buff_out = np.zeros(shape_out, dtype=np.int8, order="C")
        self.post_buff = np.zeros(shape_out, dtype=np.int8, order="C")

        self.in_sh = shape_in
        self.out_sh = shape_out

        # Output grid geometry: dims[-3] is used as rows, dims[-2] as columns.
        self.rows = shape_out[-3]
        self.cols = shape_out[-2]
        self.map_size = self.rows * self.cols
        self.anchors_num = anchors.shape[0]

        # (cols, rows) of input and output; their ratio converts a position on
        # the output grid into a position in the input image.
        input_shape = np.array(shape_in[-3:-1][::-1], dtype=np.float32)
        output_shape = np.array(shape_out[-3:-1][::-1], dtype=np.float32)
        self.xy_mul = input_shape / output_shape
        self.anchors = anchors
        # Scale that turns raw fixed-point output integers into real values.
        self.to_int_mul = 1 / (2**self.out_repr[1])

    def find_max(self, a):
        """Decode the highest-scoring box from a raw output tensor.

        ``a`` is viewed as ``(rows, cols, anchors, features)``.  Feature 4 is
        the score that is maximised and features 0..3 are the raw
        ``x, y, w, h`` values; any further features are ignored.  A
        single-image tensor is expected.

        Args:
            a: Raw integer output whose size equals
                ``rows * cols * anchors_num * features``.

        Returns:
            ``[left, top, right, bottom]`` as a list of Python ints, in input
            image coordinates.
        """
        # Derive the grid from the stored dimensions instead of hard-coding
        # one model's output shape; the feature count is inferred.
        grid = a.reshape((self.rows, self.cols, self.anchors_num, -1))
        scores = grid[..., 4]
        row, col, anchor_pos = np.unravel_index(np.argmax(scores), scores.shape)

        col_row_pos = np.array([col, row])
        # anchor size for the winning anchor
        anchor = self.anchors[anchor_pos, :]

        # np.float64 replaces the ``np.float`` alias removed from NumPy.
        xywh_f = grid[row, col, anchor_pos, :4].astype(np.float64) * self.to_int_mul

        # centre: cell index plus sigmoid offset, scaled to input pixels
        xy = (col_row_pos + sigmoid(xywh_f[:2])) * self.xy_mul
        # size: exponential of the raw value times the anchor size
        wh = np.exp(xywh_f[-2:]) * anchor

        # centre/size -> left-top / right-bottom corners
        LT = xy - wh / 2
        RB = LT + wh
        return np.round(np.concatenate([LT, RB])).astype(int).tolist()

    def load_img(self, path):
        """Start a thread that loads ``path`` into ``self.pre_buff``.

        Returns:
            The started ``Thread``; join it before reading ``self.pre_buff``.
        """
        th = Thread(target=load_img, args=(path, self.pre_buff, self.in_sh))
        th.start()
        return th

    def __call__(self, path):
        """Advance the pipeline by one step.

        Args:
            path: Image to load during this step; it is processed by the DPU
                on the next call.

        Returns:
            The box decoded from the DPU output of the previous call, i.e.
            the prediction for the image passed ``self.delay`` calls ago.

        Raises:
            RuntimeError: If the loader thread did not deliver an image.
        """
        # start dpu processing of the image loaded by the previous call
        job_id = self.dpu.execute_async([self.dpu_buff_in], [self.dpu_buff_out])

        # load the next image while the DPU is busy
        img_th = self.load_img(path)

        # post-process the previous DPU result in the meantime
        result = self.find_max(self.post_buff)

        # sync
        self.dpu.wait(job_id)
        img_th.join()

        if not self.pre_buff:
            # The loader thread died (its traceback only goes to stderr);
            # fail here with the offending path instead of a bare IndexError.
            raise RuntimeError(f"image loading failed for {path!r}")

        # swap output buffers: the fresh DPU output is post-processed next call
        self.post_buff, self.dpu_buff_out = self.dpu_buff_out, self.post_buff
        # the freshly loaded image becomes the next DPU input
        self.dpu_buff_in = self.pre_buff[0]
        self.pre_buff = []

        return result


def get_dataset(path):
    """Read ``gt.json`` from ``path`` and return image paths with their boxes.

    The JSON file is a mapping whose values each hold a ``'path'`` (relative
    to ``path``) and a ``'bbox'`` mapping with the keys ``l``, ``t``, ``r``
    and ``b``.

    Returns:
        ``(paths, ltrb)``: a list of joined image paths and a parallel list of
        ``[l, t, r, b]`` boxes, both in file order.
    """
    with open(os.path.join(path, 'gt.json')) as gt_file:
        entries = json.load(gt_file)

    records = [
        (os.path.join(path, entry['path']),
         [entry['bbox'][side] for side in 'ltrb'])
        for entry in entries.values()
    ]

    paths = [img_path for img_path, _ in records]
    ltrb = [box for _, box in records]
    return paths, ltrb


def predict(paths, vai_obj: "VAI_FINN"):
    """Run the pipelined detector over ``paths`` and return aligned results.

    ``vai_obj`` returns each prediction ``vai_obj.delay`` calls after the
    image was passed in.  After all paths have been fed, the last path is
    passed ``delay`` more times to flush the pipeline, and the first ``delay``
    results (which belong to no input) are dropped.

    Args:
        paths: Iterable of image paths.
        vai_obj: Callable detector with a ``delay`` attribute.

    Returns:
        One result per input path, in input order.  An empty input yields an
        empty list without calling the detector.
    """
    paths = list(paths)
    if not paths:
        # Nothing to feed, hence nothing to flush.
        return []

    results = [vai_obj(p) for p in paths]

    # Flush: the image passed here is irrelevant to the returned values, the
    # calls only push the remaining results out of the pipeline.
    last_path = paths[-1]
    results.extend(vai_obj(last_path) for _ in range(vai_obj.delay))

    return results[vai_obj.delay:]

In [3]:
# Anchor sizes, one row per anchor; VAI_FINN.find_max multiplies the selected
# row element-wise with the decoded (w, h) pair.
anchors = np.array(
    [
        [10.762251, 13.063103],
        [25.158768, 42.200066],
        [19.567272, 25.438337],
        [91.87796, 35.945087],
        [38.639523, 69.15513],
    ],
    dtype=np.float32,
).reshape((-1, 2))

# Evaluation image paths and their reference boxes.
paths, bbox_ref = get_dataset('../eval_images_finn')

# Detector: overlay bitstream, compiled model and anchors.
vai = VAI_FINN('../VAI/dpu.bit', 'FINN_VAI.xmodel', anchors)

# Benchmark: run the whole dataset five times and report time, throughput,
# energy and accuracy for each pass.  In the recorded output the first pass
# took roughly twice as long as the later ones, so treat pass 0 as a warm-up
# (presumably cold file caches -- not verified).
for i in range(5):
    print()
    print(i)
    # start time measurement
    # NOTE: the timed window also covers creating the PowerRecorder and
    # entering/leaving its recording context.
    start = time.time()
    # record power every 0.05[s]
    recorder = PowerRecorder()
    with recorder.record(0.05):
        bbox_pred = predict(paths,vai)

    # stop time measurement
    end = time.time()
    t = end - start

    # Energy measurements    
    # mean power times elapsed time; reported in joules below, so mean_power
    # is presumably in watts -- confirm in PowerRecorder.
    energy = recorder.mean_power * t    

    total_time = t
    fps = len(paths) / total_time
    total_energy = energy
    energy_per_sample = energy / len(paths)

    # Accuracy: IoU of each predicted box against its reference, then the mean.
    IOU = iou(bbox_pred, bbox_ref)
    mean_iou = np.mean(IOU)

    # Keep these as plain print() arguments: a NumPy float32 is formatted
    # differently inside an f-string than by print().
    print("Number of images:", len(paths))
    print("Total time:", total_time, "[s]")
    print("Throughput:", fps, "[fps]")
    print("Total energy:", total_energy, "[J]")
    print("Energy per sample:", energy_per_sample, "[J/sample]")
    print("Mean IoU:", mean_iou)



0
Number of images: 3000
Total time: 124.19387817382812 [s]
Throughput: 24.155780011966822 [fps]
Total energy: 57.35938569349525 [J]
Energy per sample: 0.019119795231165082 [J/sample]
Mean IoU: 0.5757448

1
Number of images: 3000
Total time: 59.15351581573486 [s]
Throughput: 50.71549777945741 [fps]
Total energy: 26.041145468306265 [J]
Energy per sample: 0.008680381822768755 [J/sample]
Mean IoU: 0.5757448

2
Number of images: 3000
Total time: 59.05522012710571 [s]
Throughput: 50.799912243880236 [fps]
Total energy: 24.43766921590413 [J]
Energy per sample: 0.00814588973863471 [J/sample]
Mean IoU: 0.5757448

3
Number of images: 3000
Total time: 58.89796018600464 [s]
Throughput: 50.935550068724815 [fps]
Total energy: 25.014225233264586 [J]
Energy per sample: 0.008338075077754862 [J/sample]
Mean IoU: 0.5757448

4
Number of images: 3000
Total time: 59.00492596626282 [s]
Throughput: 50.843212678807646 [fps]
Total energy: 24.807730610383683 [J]
Energy per sample: 0.008269243536794561 [J/sample