In [2]:
import numpy as np
import pynq
from pynq import MMIO
import time

In [3]:
batchSize = 1
numInputChannels = 3
numOutputChannels = 48
strideSize = 1
kernelSize = 3
inputHeight = 160
inputWidth = 320
outputHeight = 160
outputWidth = 320
paddingSize = 1
data_type = np.float32

In [4]:
overlay = pynq.Overlay('design_1.bit')
top_ip = overlay.dw_conv_0
top_ip.signature

In [5]:
def float32_to_fixed16_3(value):
    scale_factor = 2**13  # for 16,3 fixed point
    return np.int16(np.round(value * scale_factor))
    return fixed_point_value

def int16_to_fixed16_3(value):
    scale_factor = 2 ** 13
    fixed_point_as_float = value / scale_factor
    return fixed_point_as_float

In [6]:
inBuffer = pynq.allocate((batchSize, numInputChannels, inputHeight, inputWidth),np.int16)
wBuffer0 = pynq.allocate((numInputChannels,1, kernelSize , kernelSize),np.int16)
biasBuffer0 = pynq.allocate((numInputChannels ),np.int16)
wBuffer1 = pynq.allocate((numOutputChannels , numInputChannels, 1, 1),np.int16)
biasBuffer1 = pynq.allocate((numOutputChannels),np.int16)
outBuffer = pynq.allocate((batchSize, numOutputChannels , outputHeight , outputWidth),np.int16)

In [7]:
size_input3x3 = batchSize * numInputChannels * inputHeight * inputWidth
size_weight0 =  numInputChannels * 1 * kernelSize * kernelSize
size_bias0 = numInputChannels 

size_weight1 = numOutputChannels * numInputChannels * 1 * 1
size_bias1 = numOutputChannels
size_outGolden = batchSize * numOutputChannels * outputHeight * outputWidth

with open('dw_conv_input3x3.bin', 'rb') as file:
    input_read = np.fromfile(file, dtype=data_type, count=size_input3x3)
my_input = input_read.reshape((batchSize, numInputChannels, inputHeight, inputWidth))

with open('dw_conv_weight3x3.bin', 'rb') as file:
    weight0_read = np.fromfile(file, dtype=data_type, count=size_weight0)
weight0 = weight0_read.reshape((numInputChannels, 1, kernelSize, kernelSize))

with open('dw_conv_bias3x3.bin', 'rb') as file:
    bias0_read = np.fromfile(file, dtype=data_type, count=size_bias0)
bias0 = bias0_read.reshape((numInputChannels))

with open('conv_weight1x1.bin', 'rb') as file:
    weight1_read = np.fromfile(file, dtype=data_type, count=size_weight1)
weight1 = weight1_read.reshape((numOutputChannels, numInputChannels, 1, 1))

with open('conv_bias1x1.bin', 'rb') as file:
    bias1_read = np.fromfile(file, dtype=data_type, count=size_bias1)
bias1 = bias1_read.reshape((numOutputChannels))

with open('conv_output_torch1x1.bin', 'rb') as file:
    outGolden_read = np.fromfile(file, dtype=data_type, count=size_outGolden)
outGolden = outGolden_read.reshape((batchSize, numOutputChannels, outputHeight, outputWidth))

In [8]:
#initialize input
for b in range (0, batchSize):
    for i in range (0, numInputChannels):
        for j in range (0, inputHeight):
            for k in range (0, inputWidth):
                inBuffer[b][i][j][k] = int(float32_to_fixed16_3(my_input[b][i][j][k]))

for i in range (0, numInputChannels):
    for j in range (0, kernelSize):
        for k in range (0, kernelSize):
            wBuffer0[i][0][j][k] = int(float32_to_fixed16_3(weight0[i][0][j][k]))

for i in range (0, numInputChannels):
    biasBuffer0[i] = int(float32_to_fixed16_3(bias0[i]))


for i in range (0, numOutputChannels):
    for j in range (0, numInputChannels):
        wBuffer1[i][j][0][0] = int(float32_to_fixed16_3(weight1[i][j][0][0]))

for i in range (0, numOutputChannels):
    biasBuffer1[i] = int(float32_to_fixed16_3(bias1[i]))

In [9]:
#specify the address
inptr = inBuffer.physical_address
w0ptr = wBuffer0.physical_address
b0ptr = biasBuffer0.physical_address
w1ptr = wBuffer1.physical_address
b1ptr = biasBuffer1.physical_address
outptr = outBuffer.physical_address

top_ip.write(0x10, inptr)
top_ip.write(0x1c, w0ptr)
top_ip.write(0x28, b0ptr)
top_ip.write(0x34, w1ptr)
top_ip.write(0x40, b1ptr)
top_ip.write(0x4c, outptr)

In [10]:
#start the HLS kernel
start_time = time.time()
top_ip.write(0x00, 1)
isready = top_ip.read(0x00)
while (isready == 1):
    isready = top_ip.read(0x00)
end_time = time.time()

In [11]:
my_output = np.full((batchSize,numOutputChannels,outputHeight,outputWidth),1, dtype=np.float32) 
for b in range (0, batchSize):
    for i in range (0, numOutputChannels):
        for j in range (0, outputHeight):
            for k in range (0, outputWidth):
                 my_output[b][i][j][k] = int16_to_fixed16_3(outBuffer[b][i][j][k])
            

In [18]:
# Analyze the result
overlay.free()
# calculate time latency
start_time1 = time.time()
end_time1 = time.time()
internet_latency = end_time1 - start_time1
print('internet_latency:',internet_latency)
print('runtime:',end_time - start_time - internet_latency)
#print and compare your outputs
print('expected:')
print(outGolden[0][0][0])
print('my output:')
print(my_output[0][0][0])
# calculate MSE
difference = outGolden - my_output
squared_difference = np.square(difference)
mse = np.mean(squared_difference)
print("MSE:", mse)

internet_latency: 0.000171661376953125
runtime: 0.0015718936920166016
expected:
[ 0.25689793 -0.0879821  -0.29876578 -0.7450676   0.2707423  -0.39328253
 -0.36761388 -0.51023257 -0.5128461   0.18865699 -0.5157582  -0.57080376
 -0.706433   -0.4558599  -0.19102935 -0.2935251  -0.7222746  -0.13186099
  0.9887995  -0.17377673 -1.2284567  -0.37220484 -0.85755765  0.05055475
 -0.08165509 -0.7155271  -1.1902595  -0.08883794 -0.01253189 -0.2753939
 -0.4159099  -0.18728805]
my output:
[ 0.11230469  0.22949219 -0.21508789 -0.55578613 -0.28430176 -0.5235596
 -0.92663574 -0.26342773 -0.02880859  0.4609375  -0.24853516 -0.40551758
  1.0994873  -0.7928467  -0.08044434 -0.50805664 -0.9577637  -0.7352295
  0.6072998  -0.15124512 -1.1711426  -0.5004883  -0.16210938 -0.65771484
  0.4071045   0.06408691 -0.68359375  0.29858398 -0.41479492 -0.5690918
 -0.31762695 -0.26489258]
MSE: 0.49252725
