In [1]:
import numpy as np
import os, time, json, sys
import cv2 as cv
import pynq
from threading import Thread
import numpy as np
# import matplotlib.pyplot as plt
# add the custom library directory to the import path
sys.path.append('../custom_lib')
from utils import iou, sigmoid, PowerRecorder

In [2]:
def load_img(path, dst_buffer:list):
    """Read an image from disk and copy its pixels into a pre-allocated buffer.

    Args:
        path: Path of the image file handed to ``cv.imread``.
        dst_buffer: One-element list whose item is the destination array.
            The array is overwritten in place, so its shape must be
            compatible with the decoded image.

    Raises:
        FileNotFoundError: If ``cv.imread`` could not read the file (it
            signals a missing or undecodable file by returning ``None``
            instead of raising).
    """
    # read from disk
    image = cv.imread(path)
    if image is None:
        raise FileNotFoundError(f"could not read image: {path!r}")
    # copy to buffer; copy=False skips the extra copy when the dtype already matches
    dst_buffer[0][:] = image.astype(np.uint8, copy=False)
    

class LittleNet:
    """Host-side driver for the LittleNet detector overlay.

    Loads the bitstream, streams one input frame per call to the accelerator
    through ``axi_dma_0`` and decodes the 8-element result into a bounding box.

    Both sides of the accelerator are double-buffered: while the DMA moves
    ``acc_buff_in`` / ``acc_buff_out``, a worker thread fills ``pre_buff`` with
    the next image and the host decodes ``post_buff``. The buffers are
    exchanged at the end of every call, so the value returned by a call
    belongs to an earlier input. ``delay`` is the number of leading results
    that ``predict`` discards to realign outputs with inputs.
    """

    def __init__(self, bit_path, anchors, anchors_mul, in_shape=(112,208,3),out_fraq_bits = 4):
        """Download the overlay and allocate the DMA buffers.

        Args:
            bit_path: Path of the bitstream passed to ``pynq.Overlay``.
            anchors: Array-like of anchor sizes; reshaped to ``(-1, 2)``.
            anchors_mul: Array-like of per-anchor multipliers applied to the
                raw size outputs before ``exp``; reshaped to ``(-1, 2)``.
            in_shape: Input image shape as (rows, cols, channels).
            out_fraq_bits: Number of fractional bits in the accelerator's
                fixed-point outputs.
        """
        self.ov = pynq.Overlay(bit_path, download=True)
        self.ov.reset()
        self.dma: pynq.lib.DMA = self.ov.axi_dma_0
        self.sender = self.dma.sendchannel
        self.receiver = self.dma.recvchannel
        # Number of calls between feeding an image and reading its result;
        # predict() drops this many leading results.
        # NOTE(review): the 2+6-1 breakdown is not explained in this file.
        self.delay = 2 + 6 - 1

        # np.asarray lets callers pass plain nested lists as well as ndarrays
        anchors = np.asarray(anchors).reshape((-1, 2))
        anchors_mul = np.asarray(anchors_mul).reshape((-1, 2))

        self.in_sh = in_shape
        # channels x rows x cols: 5 values per anchor on a grid 16x coarser than the input
        self.out_sh = (5 * anchors.shape[0], in_shape[0] // 16, in_shape[1] // 16)
        # the accelerator returns a single 8-element record per frame
        self.out_buff_size = (8,)

        # buffers
        # NOTE(review): these are not initialised here, and the first call
        # decodes post_buff before any DMA transfer has written to it.
        self.pre_buff = pynq.allocate(self.in_sh, dtype=np.uint8)
        self.acc_buff_in = pynq.allocate(self.in_sh, dtype=np.uint8)
        self.acc_buff_out = pynq.allocate(self.out_buff_size, dtype=np.int8)
        self.post_buff = pynq.allocate(self.out_buff_size, dtype=np.int8)

        self.map_size = self.out_sh[1] * self.out_sh[2]
        self.anchors_num = self.out_sh[0] // 5
        self.cols = self.out_sh[2]
        self.rows = self.out_sh[1]

        # both sizes expressed as (cols, rows)
        input_size = np.array(self.in_sh[0:2][::-1], dtype=np.float32)
        output_size = np.array(self.out_sh[1:][::-1], dtype=np.float32)

        # position at the output grid to pos at the original img
        self.xy_mul = input_size / output_size
        self.anchors = anchors
        self.anchors_mul = anchors_mul
        # scale factor turning the fixed-point integers into floats
        self.to_int_mul = 1 / (2 ** out_fraq_bits)

    def get_max(self, result:np.ndarray):
        """Decode one accelerator result record into a bounding box.

        Args:
            result: Array holding at least 8 values. Elements 0-3 are the raw
                x, y, w, h outputs, element 5 is the anchor index and
                elements 6-7 are the grid position (reversed here to obtain
                column, row). Element 4 (validity) is ignored.

        Returns:
            ``[left, top, right, bottom]`` as a list of Python ints, in input
            image coordinates.
        """
        raw = result.flatten().astype(np.int8)
        # object desc
        xywh = raw[:4]
        anchor_pos = raw[5].astype(np.uint8)
        col_row_pos = raw[6:8][::-1].astype(np.uint8)
        # anchor wh for given anchor pos
        anchor = self.anchors[anchor_pos, :]
        anchor_mul = self.anchors_mul[anchor_pos, :]
        # make float representation
        xywh_f = xywh.astype(np.float32) * self.to_int_mul
        # centre: grid cell plus in-cell offset, scaled to input coordinates
        xy = (col_row_pos + sigmoid(xywh_f[:2])) * self.xy_mul
        # get sizes
        wh = np.exp(xywh_f[-2:] * anchor_mul) * anchor
        # XcYcWH to LTRB
        left_top = xy - wh / 2
        right_bottom = left_top + wh
        box = np.concatenate([left_top, right_bottom])
        return np.round(box).astype(int).tolist()

    def load_img(self, path):
        """Start a thread that loads ``path`` into ``pre_buff``; return the thread."""
        th = Thread(target=load_img, args=(path, [self.pre_buff]))
        th.start()
        return th

    def swap_buffers(self):
        """Exchange the host-side and accelerator-side buffers (input and output)."""
        # out buffs
        self.post_buff, self.acc_buff_out = self.acc_buff_out, self.post_buff
        # in buffs
        self.acc_buff_in, self.pre_buff = self.pre_buff, self.acc_buff_in

    def __call__(self, path):
        """Run one pipeline step.

        Loads the image at ``path`` for a later step, sends the previously
        loaded image to the accelerator, and decodes the result received by
        the previous step.

        Returns:
            The box decoded from ``post_buff`` (see ``get_max``); it belongs
            to an earlier input, not to ``path``.
        """
        # run img loading thread
        img_th = self.load_img(path)
        # start sending data to acc
        self.sender.transfer(self.acc_buff_in)
        # start receiving data from acc
        self.receiver.transfer(self.acc_buff_out)
        try:
            # post process prev result while the transfers are in flight
            result = self.get_max(self.post_buff)
        finally:
            # sync even when decoding fails, so no thread or DMA transfer is
            # left running and the channels are not busy on the next call
            img_th.join()
            self.receiver.wait()
            self.sender.wait()

        # swap buffers
        self.swap_buffers()

        return result

    def clear(self):
        """Free the four buffers allocated in ``__init__``."""
        self.pre_buff.freebuffer()
        self.post_buff.freebuffer()
        self.acc_buff_in.freebuffer()
        self.acc_buff_out.freebuffer()


def get_dataset(path):
    """Load the ground-truth index stored in ``<path>/gt.json``.

    Every value of the top-level JSON object must provide a ``path`` entry
    (image file name relative to ``path``) and a ``bbox`` object with the
    keys ``l``, ``t``, ``r`` and ``b``.

    Returns:
        A pair ``(paths, ltrb)``: the list of image paths joined onto
        ``path`` and the list of ``[l, t, r, b]`` boxes, in file order.
    """
    with open(os.path.join(path, 'gt.json')) as gt_file:
        records = json.load(gt_file)

    # one (image path, box) pair per record, built in a single pass
    pairs = [
        (os.path.join(path, rec['path']),
         [rec['bbox'][side] for side in ('l', 't', 'r', 'b')])
        for rec in records.values()
    ]

    img_paths = [img for img, _ in pairs]
    boxes = [box for _, box in pairs]
    return img_paths, boxes


def predict(paths, ln_obj: "LittleNet"):
    """Run every image in ``paths`` through ``ln_obj`` and return aligned results.

    ``ln_obj`` is pipelined: the value returned by a call belongs to the
    input fed ``ln_obj.delay`` calls earlier. After all images have been fed,
    the last path is fed ``ln_obj.delay`` more times to flush the pipeline,
    and the same number of leading results is discarded.

    Args:
        paths: Iterable of image paths.
        ln_obj: Callable taking a path and returning one result, with an
            integer ``delay`` attribute.

    Returns:
        A list with one result per entry of ``paths``, in the same order.
        An empty ``paths`` yields ``[]`` without invoking ``ln_obj``.
    """
    paths = list(paths)
    if not paths:
        # nothing to feed and no "last path" to flush the pipeline with
        return []

    results = [ln_obj(p) for p in paths]

    # flush: keep feeding the last image until its own result has come out
    last_path = paths[-1]
    results.extend(ln_obj(last_path) for _ in range(ln_obj.delay))

    return results[ln_obj.delay:]

In [3]:
# Benchmark of the 'base' bitstream: five timed passes over the evaluation set
# at a single clock setting, printing time, energy and mean IoU for each pass.
# The names defined here (anchors, paths, bbox_ref, LN, f, ...) are notebook
# globals; the model-comparison cell further down reads anchors, paths and bbox_ref.

# One pair of multipliers per anchor; LittleNet.get_max applies them to the raw
# size outputs before the exponential.
anchor_mul = [[0.33497440814971924, 0.2894629240036011], 
             [0.5097968578338623, 0.41656744480133057], 
             [0.3774974048137665, 0.4998015761375427]]
# One size pair per anchor; LittleNet.get_max multiplies the exponentiated
# size outputs by these.
anchors = [[7.247058868408203, 10.725000381469727], 
           [1.6470588445663452, 3.25], 
           [4.941176414489746, 1.625]]
anchor_mul = np.array(anchor_mul,dtype=np.float32)
anchors = np.array(anchors,dtype=np.float32)

# image paths and reference boxes ([l, t, r, b]) read from gt.json
paths, bbox_ref = get_dataset('../eval_images')

# downloads the bitstream and allocates the DMA buffers
LN = LittleNet('build/base/LN7.bit',
               anchors, 
               anchor_mul,
               in_shape=(112,208,3), 
               out_fraq_bits=4)

# requested fclk0 frequency, reported below as [MHz]
f = 300
pynq.ps.Clocks.fclk0_mhz = f

# five repetitions of the same measurement
for i in range(5):
    print()
    print(i)
    # start time measurement
    # (the timed window also covers creating and entering the recorder)
    start = time.time()
    # record power every 0.05[s]
    recorder = PowerRecorder()
    with recorder.record(0.05):
        bbox_pred = predict(paths,LN)

    # stop time measurement
    end = time.time()
    t = end - start

    # Energy measurements
    # energy is estimated as mean recorded power times elapsed wall-clock time
    energy = recorder.mean_power * t

    total_time = t
    # throughput counts only the dataset images, although predict() also makes
    # LN.delay extra calls to flush the pipeline
    fps = len(paths) / total_time
    total_energy = energy
    energy_per_sample = energy / len(paths)

    IOU = iou(bbox_pred, bbox_ref)
    mean_iou = np.mean(IOU)
    print("Frequency:", f,"[MHz]")
    print("Number of images:", len(paths))
    print("Total time:", total_time, "[s]")
    print("Throughput:", fps, "[fps]")
    print("Total energy:", total_energy, "[J]")
    print("Energy per sample:", energy_per_sample, "[J/sample]")
    print("Mean IoU:", mean_iou)

# request 1 MHz on fclk0 once the benchmark is done
# NOTE(review): presumably to idle the fabric between experiments - confirm
pynq.ps.Clocks.fclk0_mhz = 1

In [10]:
# Sweep every bitstream found under build/ over a range of fclk0 frequencies
# and record time, energy and mean IoU for each (model, frequency) pair.
# Relies on `anchors`, `paths` and `bbox_ref` defined by the benchmark cell above.

# get all models (sorted so that the run order is reproducible)
models = [os.path.join('build', d, 'LN7.bit')
          for d in sorted(os.listdir('build')) if '.' not in d]

records = {}
anchor_mul_round = np.array([[0.33369001746177673, 0.2886362671852112],
                             [0.508991539478302, 0.41514432430267334],
                             [0.3766272962093353, 0.4983048737049103]],
                            dtype=np.float32)
anchor_mul_floor = np.array([[0.33497440814971924, 0.2894629240036011],
                             [0.5097968578338623, 0.41656744480133057],
                             [0.3774974048137665, 0.4998015761375427]],
                            dtype=np.float32)

try:
    for m in models:
        print(m)
        # load model; 'base' and 'sl_2' builds use the floor multipliers,
        # every other build the rounded ones
        LN = LittleNet(m,
                       anchors,
                       anchor_mul_floor if 'base' in m or 'sl_2' in m else anchor_mul_round,
                       in_shape=(112,208,3),
                       out_fraq_bits=4)
        f_records = {}
        # highest frequency first
        for f in [300, 250, 200, 150, 100, 50]:
            print("started with f =", f,'[MHz]')
            pynq.ps.Clocks.fclk0_mhz = f
            # start time measurement
            start = time.time()

            # record power every 0.05[s]
            recorder = PowerRecorder()
            with recorder.record(0.05):
                bbox_pred = predict(paths, LN)

            # stop time measurement
            end = time.time()
            t = end - start

            # Energy measurements: mean recorded power times elapsed time
            energy = recorder.mean_power * t

            total_time = t
            fps = len(paths) / total_time
            total_energy = energy
            energy_per_sample = energy / len(paths)

            IOU = iou(bbox_pred, bbox_ref)
            mean_iou = np.mean(IOU)

            print(m)
            print("Frequency:", f,"[MHz]")
            print("Number of images:", len(paths))
            print("Total time:", total_time, "[s]")
            print("Throughput:", fps, "[fps]")
            print("Total energy:", total_energy, "[J]")
            print("Energy per sample:", energy_per_sample, "[J/sample]")
            print("Mean IoU:", mean_iou)
            print()

            f_records[str(f)] = {
                'f_expected': f,
                # read back the frequency the clock actually reports
                'f_real': pynq.ps.Clocks.fclk0_mhz,
                'imgs_number': len(paths),
                'time': total_time,
                'fps': fps,
                'energy': total_energy,
                'energy_per_sample': energy_per_sample,
                'mean_iou': mean_iou,
            }

        records[m] = f_records
        # free this model's DMA buffers before the next overlay allocates its own
        LN.clear()
finally:
    # leave the clock at the low setting even when a run fails part-way
    pynq.ps.Clocks.fclk0_mhz = 1

print(records)

# Convert every value to a built-in float: np.mean may return a NumPy scalar,
# which json.dump cannot serialise.
data = records
new_data = {
    model: {
        freq: {name: float(value) for name, value in entry.items()}
        for freq, entry in per_freq.items()
    }
    for model, per_freq in data.items()
}
print(new_data)

# timestamped file name so successive runs do not overwrite each other
tt = int(time.time())
out_name = "comparison_data_"+str(tt)+".json"
with open(out_name, "w") as out_file:
    json.dump(new_data, out_file, indent=4)
print("Results saved under:", out_name)


build/P32_sl_2/LN7.bit
started with f = 300 [MHz]
build/P32_sl_2/LN7.bit
Frequency: 300 [MHz]
Number of images: 3000
Total time: 28.25994372367859 [s]
Throughput: 106.1573239258203 [fps]
Total energy: 87.24890929914262 [J]
Energy per sample: 0.029082969766380876 [J/sample]
Mean IoU: 0.66160923

{'build/P32_sl_2/LN7.bit': {'300': {'f_expected': 300, 'f_real': 299.997, 'imgs_number': 3000, 'time': 28.25994372367859, 'fps': 106.1573239258203, 'energy': 87.24890929914262, 'energy_per_sample': 0.029082969766380876, 'mean_iou': 0.66160923}}}
{'build/P32_sl_2/LN7.bit': {'300': {'f_expected': 300.0, 'f_real': 299.997, 'imgs_number': 3000.0, 'time': 28.25994372367859, 'fps': 106.1573239258203, 'energy': 87.24890929914262, 'energy_per_sample': 0.029082969766380876, 'mean_iou': 0.6616092324256897}}}
Results saved under: comparison_data_1645485416.json
