## Convolutional Operation Demo<br>PYNQ on Ultra96v2 board  

H. Nakahara (Tokyo Tech.) 19th/Mar./2020  
Copyright all rights reserved.

### Setup
Load the bitstream file.

In [1]:
from pynq import Overlay
import pynq

# Bitstream for the convolution design; Overlay() is given this path directly.
bitstream_path = '/home/xilinx/pynq/overlays/base/pynq_ultra96_conv_l0_r1.bit'
overlay = Overlay(bitstream_path)

# List the overlay's attributes to see which IP blocks it exposes.
dir(overlay)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_deepcopy_dict_from',
 '_ip_map',
 'axi_dma_0',
 'binfile_name',
 'bit_data',
 'bitfile_name',
 'clock_dict',
 'device',
 'download',
 'dtbo',
 'firmware_path',
 'gpio_dict',
 'hierarchy_dict',
 'ignore_version',
 'insert_dtbo',
 'interrupt_controllers',
 'interrupt_pins',
 'ip_dict',
 'is_loaded',
 'kernel_0',
 'load_ip_data',
 'mem_dict',
 'parser',
 'partial',
 'pr_dict',
 'pr_download',
 'remove_dtbo',
 'reset',
 'timestamp',
 'zynq_ultra_ps_e_0']

In [2]:
# Grab the kernel IP first, then its register map; the convolution loop
# later writes CTRL.AP_START through this object.
kernel_ip = overlay.kernel_0
registers = kernel_ip.register_map
print(registers)

RegisterMap {
  CTRL = Register(AP_START=0, AP_DONE=0, AP_IDLE=1, AP_READY=0, RESERVED_1=0, AUTO_RESTART=0, RESERVED_2=0),
  GIER = Register(Enable=0, RESERVED=0),
  IP_IER = Register(CHAN0_INT_EN=0, CHAN1_INT_EN=0, RESERVED=0),
  IP_ISR = Register(CHAN0_INT_ST=0, CHAN1_INT_ST=0, RESERVED=0)
}


Load the testbench input file.

In [3]:
import numpy as np

# Text file holding the testbench input values; parsed by np.loadtxt.
testbench_input_path = '/home/xilinx/data/testbench_input.txt'
inimg = np.loadtxt(testbench_input_path)

In [4]:
# Reshape the loaded values to (3, 416, 416), move the channel axis last
# (Y, X, CH), scale by 1024 and cast to int32 (the cast drops the fraction).
# NOTE: this rebinds `inimg` from itself, so run it once per load of the
# input file; running the cell twice would scale the data a second time.
inimg = (inimg.reshape((3, 416, 416)).transpose(1, 2, 0) * 1024.0).astype(np.int32)

Set up the DMA buffers.

In [5]:
import pynq.lib.dma

# Handle to the overlay's axi_dma_0 block; its sendchannel/recvchannel are
# used in the convolution loop to move data to and from the buffers.
dma = overlay.axi_dma_0

In [6]:
from pynq import Xlnk

# Word counts of the two DMA buffers.
#   send: 416 columns x 11 rows x 3 channels (one input window)
#   recv: 102 columns x 64 channels, plus one extra word that the
#         convolution loop discards when it reads the buffer back
inimg_size = 416 * 11 * 3
outfmap_size = 102 * 64 + 1

xlnk = Xlnk()

# One-dimensional int32 buffers allocated through Xlnk for the DMA engine.
send_buf = xlnk.cma_array(shape=inimg_size, dtype=np.int32)
recv_buf = xlnk.cma_array(shape=outfmap_size, dtype=np.int32)

In [7]:
# Host-side staging arrays, zero-filled int32.
#   inimg_buf:   one 11-row input window, laid out Y, X, CH
#   outfmap_buf: 102 output rows, each stored as CH, X
inimg_buf = np.zeros((11, 416, 3), dtype=np.int32)
outfmap_buf = np.zeros((102, 64, 102), dtype=np.int32)

### Perform Convolutional Operation (...but too slow)

In [9]:
%%time
for line in range(102):
    # load input image
    for i in range(11):
        inimg_buf[i] = inimg[i+line*4]
    
    tmp = inimg_buf.copy().transpose((2,0,1)).reshape(-1,) # CH,Y,X
    send_buf[0:inimg_size] = tmp[0:inimg_size]

    # activate DMA
    registers.CTRL.AP_START = 1

    # DMA access
    dma.sendchannel.transfer(send_buf)
    dma.recvchannel.transfer(recv_buf)

    # wait DMA
    dma.sendchannel.wait()
    dma.recvchannel.wait()
    
    # store output buffer
    tmp2 = recv_buf[0:outfmap_size - 1]
    tmp2 = tmp2.reshape((64,102)) # CH, X
    outfmap_buf[line] = tmp2

CPU times: user 22.5 s, sys: 6.85 ms, total: 22.5 s
Wall time: 22.5 s


### Verification with C++ testbench

In [10]:
# Swap the first two axes (Y,CH,X -> CH,Y,X) and divide by 1024.0, which
# turns the int32 results into floats.
# NOTE: this rebinds `outfmap_buf` from itself; re-running the cell without
# re-running the convolution loop would transpose and rescale it again.
outfmap_buf = np.transpose(outfmap_buf, (1, 0, 2)) / 1024.0

In [11]:
# Reference output values from the testbench, parsed by np.loadtxt.
testbench_output_path = '/home/xilinx/data/testbench_output.txt'
bench_outfmap = np.loadtxt(testbench_output_path)

In [12]:
# Compare the flattened accelerator result with the reference values,
# element by element, and keep the largest absolute difference.
flat_outfmap = outfmap_buf.reshape(-1,)
error = np.abs(bench_outfmap - flat_outfmap)
max_error = np.max(error)

print('max error',max_error)

# The run passes when the largest difference is below 0.1.
verdict = 'TEST_PASS' if max_error < 0.1 else 'TEST_FAILURE'
print(verdict)

max error 0.010464375
TEST_PASS


### Appendix
Inference on the ARM processor with PyTorch, for comparison.

In [13]:
import torch

In [21]:
# Random input tensor: batch of 1, 3 channels, 416 x 416.
x = torch.randn(1, 3, 416, 416)

# Bias-free convolution using the same numbers as the accelerator loop:
# 3 input channels, 64 output channels, kernel size 11, stride 4.
conv = torch.nn.Conv2d(3, 64, kernel_size=11, stride=4, bias=False)

In [22]:
%%time
# Time a single forward pass of the PyTorch convolution defined above.
# NOTE(review): the call is not wrapped in torch.no_grad(); consider doing so
# if this is meant as an inference-only timing - confirm intent.
y = conv(x)

CPU times: user 259 ms, sys: 7.96 ms, total: 267 ms
Wall time: 93.2 ms
