# MLP IP implemented in HLS via DMA.


Better features (instead of the raw accelerometer data) and better training data could probably be used.  

In [1]:
from pynq import Overlay
import pynq.lib.dma
from pynq import DefaultIP
from pynq import allocate
import numpy as np

import time
# Sample feature vector fed to every implementation below: 40 integer features,
# written four per row for readability.
# This sample is a zig-zag feature set (index 2 in the output array).
inputs = [
    -43, -35, -300, 300,
    -41, -30, -295, 305,
    -42, -32, -297, 302,
    0, 3, 2, 2,
    0, 1, 1, 1,
    0, 0, 0, 0,
    42, 32, 297, 302,
    1, 1, 12, 12,
    70, 39, 3386, 3496,
    8, 6, 58, 59,
]

## MLP CPU IMPLEMENTATION(Quantized ints)

In [2]:
start_time = time.time()
! ./MLPInference << ./input_weights.txt
print("Overhead" ,(time.time() - start_time) * 1000 , 'ms') #overhead in launching program through notebook

1502470 976993 -499376 

8.601ms
Overhead 654.2484760284424 ms


## HLS IMPLEMENTATION(Area/ Power optimal on PL)
The host communicates with the custom IP via DMA.

The IP is used to accelerate the calculation of the Euclidean distance of the acceleration data.

The first result is the distance for ZigZag, the second for Rocket, and the third for Head.

The 4th number is padding.
<img src="./files/MLP_No_Optimizations_util.png">
<img src="./files/MLP_No_Optimizations_power.png">

In [3]:
# Load the overlay for this section's design.
# Overlay raises OSError when the bitstream file is not present on the board.
bitstream_path = '/home/xilinx/impl_2/design_1.bit'
overlay = Overlay(bitstream_path)

OSError: Bitstream file /home/xilinx/impl_2/design_1.bit does not exist.

In [None]:
# Development aid: uncomment the next line to show IPython help for the loaded overlay.
#overlay?

In [None]:
# Handle to the overlay's `axi_dma_0` block; the transfer cells use its
# send/receive channels to exchange data with the custom IP.
dma = overlay.axi_dma_0

In [None]:

# Input buffer for the 40 int32 features that are sent to the IP.
in_buffer = allocate(shape=(40,), dtype=np.int32)
# Copy all features in one slice assignment instead of an element-wise Python loop.
in_buffer[:] = inputs
# Output buffer: 4 int32 words (three results plus one padding word).
out_buffer = allocate(shape=(4,), dtype=np.int32)

In [None]:
# Time one round trip: send the features to the IP and read the results back.
start_time = time.time()
dma.sendchannel.transfer(in_buffer)
dma.recvchannel.transfer(out_buffer)
# Wait for both directions to finish before stopping the clock.
dma.sendchannel.wait()
dma.recvchannel.wait()
print((time.time() - start_time) * 1000, 'ms')
# A bare `out_buffer` expression in the middle of a cell displays nothing,
# so print the result explicitly (after timing, so printing is not measured).
print(out_buffer)

## HLS IMPLEMENTATION(Dataflow optimization on all layers)


In [None]:
# Load the overlay built with the dataflow optimization.
bitstream_path = '/home/xilinx/impl_dataflow/design_1.bit'
overlay = Overlay(bitstream_path)

In [None]:
# Handle to the `axi_dma_0` block of the currently loaded overlay.
dma = overlay.axi_dma_0

In [None]:
# Input buffer for the 40 int32 features that are sent to the IP.
in_buffer = allocate(shape=(40,), dtype=np.int32)
# Copy all features in one slice assignment instead of an element-wise Python loop.
in_buffer[:] = inputs
# Output buffer: 4 int32 words returned by the IP.
out_buffer = allocate(shape=(4,), dtype=np.int32)

In [None]:
# Time one round trip through the IP: queue both transfers, then wait for each.
start_time = time.time()
tx_channel = dma.sendchannel
rx_channel = dma.recvchannel
tx_channel.transfer(in_buffer)
rx_channel.transfer(out_buffer)
tx_channel.wait()
rx_channel.wait()
elapsed_ms = (time.time() - start_time) * 1000
print(elapsed_ms, 'ms')
print(out_buffer)


## HLS IMPLEMENTATION(Unroll directive on first(128 factor) and last layers(32))

<img src="./files/MLP_Unroll_128_0_full.png">


<img src="./files/MLP_Unroll_0_128_Full_power.png">

8% more LUTs and 2.6 W vs 2.4 W (~8% power increase).
Not Pareto-optimal, because of the next implementation.

In [None]:
# Load the unrolled design and get its DMA block.
overlay = Overlay('/home/xilinx/impl_unroll/design_1.bit')
dma = overlay.axi_dma_0

# Buffers: 40 int32 features in, 4 int32 words out.
in_buffer = allocate(shape=(40,), dtype=np.int32)
# Copy all features in one slice assignment instead of an element-wise Python loop.
in_buffer[:] = inputs
out_buffer = allocate(shape=(4,), dtype=np.int32)

# Time one round trip: queue both transfers, then wait for each to finish.
start_time = time.time()
dma.sendchannel.transfer(in_buffer)
dma.recvchannel.transfer(out_buffer)
dma.sendchannel.wait()
dma.recvchannel.wait()
print((time.time() - start_time) * 1000, 'ms')
print(out_buffer)


## HLS IMPLEMENTATION(Unroll directive on first(32 factor) second(64) and last layers(32))

<img src="./files/MLP_Unroll_32_64_32.png">
<img src="./files/MLP_Unroll_32_64_32_Power.png">

Power and area fall between the minimal-area implementation and the previous implementation (128 factor on the first layer).

In [None]:
# Load the 32/64/32 unrolled design and get its DMA block.
overlay = Overlay('/home/xilinx/impl_32_64_32/design_1.bit')
dma = overlay.axi_dma_0

# Buffers: 40 int32 features in, 4 int32 words out.
in_buffer = allocate(shape=(40,), dtype=np.int32)
# Copy all features in one slice assignment instead of an element-wise Python loop.
in_buffer[:] = inputs
out_buffer = allocate(shape=(4,), dtype=np.int32)

# Time one round trip: queue both transfers, then wait for each to finish.
start_time = time.time()
dma.sendchannel.transfer(in_buffer)
dma.recvchannel.transfer(out_buffer)
dma.sendchannel.wait()
dma.recvchannel.wait()
print((time.time() - start_time) * 1000, 'ms')
print(out_buffer)


## HLS IMPLEMENTATION(Unroll directive on first(64 factor) second(128) and last layers(32))

<img src="./files/MLP_Unroll_64_128_32.png">
<img src="./files/MLP_Unroll_64_128_32_Power.png">

Much higher resource usage (LUTs) for not much performance increase, but nonetheless still Pareto-optimal.

In [None]:
# Load the 64/128/32 unrolled design and get its DMA block.
overlay = Overlay('/home/xilinx/impl_64_128_32/design_1.bit')
dma = overlay.axi_dma_0

# Buffers: 40 int32 features in, 4 int32 words out.
in_buffer = allocate(shape=(40,), dtype=np.int32)
# Copy all features in one slice assignment instead of an element-wise Python loop.
in_buffer[:] = inputs
out_buffer = allocate(shape=(4,), dtype=np.int32)

# Time one round trip: queue both transfers, then wait for each to finish.
start_time = time.time()
dma.sendchannel.transfer(in_buffer)
dma.recvchannel.transfer(out_buffer)
dma.sendchannel.wait()
dma.recvchannel.wait()
print((time.time() - start_time) * 1000, 'ms')
print(out_buffer)

## KNN Implementation with 40 features

The outputs are the distances to the closest neighbour of each type, i.e. k = 1.
The closest type is the prediction for the feature set.

## CPU Implementation

In [None]:
start_time = time.time()
! ./KNNInference < input_weights.txt
print("Overhead" ,(time.time() - start_time) * 1000 , 'ms') #overhead in launching program through notebook

## HLS Implementation of KNN (No optimisations)
<img src="./files/KNN_No_Optimisation.png">

Without optimisations, KNN has a lower area usage and power usage as compared to MLP

In [None]:
# Load the unoptimised KNN design and get its DMA block.
overlay = Overlay('/home/xilinx/impl_knn_no_optimisations/design_1.bit')
dma = overlay.axi_dma_0

# Buffers: 40 int32 features in, 4 int32 words out.
in_buffer = allocate(shape=(40,), dtype=np.int32)
# Copy all features in one slice assignment instead of an element-wise Python loop.
in_buffer[:] = inputs
out_buffer = allocate(shape=(4,), dtype=np.int32)

# Time one round trip: queue both transfers, then wait for each to finish.
start_time = time.time()
dma.sendchannel.transfer(in_buffer)
dma.recvchannel.transfer(out_buffer)
dma.sendchannel.wait()
dma.recvchannel.wait()
print((time.time() - start_time) * 1000, 'ms')
print(out_buffer)

## HLS Implementation of KNN (unroll 32)
<img src="./files/KNN_32.png">


In [None]:
# Load the KNN design for this section and get its DMA block.
overlay = Overlay('/home/xilinx/impl_knn/design_1.bit')
dma = overlay.axi_dma_0

# Buffers: 40 int32 features in, 4 int32 words out.
in_buffer = allocate(shape=(40,), dtype=np.int32)
# Copy all features in one slice assignment instead of an element-wise Python loop.
in_buffer[:] = inputs
out_buffer = allocate(shape=(4,), dtype=np.int32)

# Time one round trip: queue both transfers, then wait for each to finish.
start_time = time.time()
dma.sendchannel.transfer(in_buffer)
dma.recvchannel.transfer(out_buffer)
dma.sendchannel.wait()
dma.recvchannel.wait()
print((time.time() - start_time) * 1000, 'ms')
print(out_buffer)