-
Notifications
You must be signed in to change notification settings - Fork 1
/
run_deep_clean.py
150 lines (134 loc) · 5.96 KB
/
run_deep_clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import pynq
import numpy as np
from pynq import allocate
from pynq import Overlay
from datetime import datetime
class NeuralNetworkOverlay(Overlay):
def __init__(self, xclbin_name, dtbo=None, download=True, ignore_version=False, device=None):
super().__init__(xclbin_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
def allocate_mem(self, X_shape, y_shape, dtype=np.float32, trg_in=None, trg_out=None):
"""
Buffer allocation in the card memory
Parameters
----------
X_shape : input buffer shape.
y_shape : output buffer shape.
dtype : the data type of the elements of the input/output vectors.
Note: it should be set depending on the interface of the accelerator; if it uses 'float'
types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use.
Instead if it uses 'ap_fixed<A,B>', 'np.intA' is the correct one to use (note that A cannot
any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy`
doc for more info).
In this case the encoding/decoding has to be computed by the host machine. For example for
'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode
'float' -> 'ap_fixed<16,6>':
def encode(xi):
return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
def decode(yi):
return yi * 2**-10
encode_v = np.vectorize(encode) # to apply them element-wise
decode_v = np.vectorize(decode)
trg_in : input buffer target memory, alveo-u50 has 32 bank of HBM 256 MB each. By default the v++ command
set it to HBM[0].
trg_out : output buffer target memory, alveo-u50 has 32 bank of HBM 256 MB each. By default the v++ command
set it to HBM[0].
Returns
-------
input_buffer, output_buffer : input and output PYNQ buffers
"""
input_buffer = allocate(shape=X_shape, dtype=dtype, target=trg_in )
output_buffer = allocate(shape=y_shape, dtype=dtype, target=trg_out)
return input_buffer, output_buffer
def predict(self, X, y_shape, input_buffer, output_buffer, debug=None, profile=False, encode=None,
decode=None):
"""
Obtain the predictions of the NN implemented in the FPGA.
Parameters:
- X : the input vector. Should be numpy ndarray.
- y_shape : the shape of the output vector. Needed to the accelerator to set the TLAST bit properly and
for sizing the output vector shape.
- input_buffer : input PYNQ buffer, must be allocated first and just once.
- output_buffer : output PYNQ buffer, must be allocated first and just once.
- debug : boolean, if set the function will print information about the data transfers status.
- profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`.
- encode/decode: function pointers. See `dtype` section for more information.
- return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to
the namesake parameter.
"""
if profile:
timea = datetime.now()
if encode is not None:
print('pre-encoded:', X)
X = encode(X)
print('post-encoded:', X)
in_size = np.prod(X.shape)
out_size = np.prod(y_shape)
input_buffer[:] = X
input_buffer.sync_to_device()
if debug:
print("Send OK")
print('input buffer:', input_buffer)
print('in size and out size:', in_size, out_size)
self.krnl_rtl_1.call(input_buffer, output_buffer, in_size, out_size)
if debug:
print("Kernel call OK")
output_buffer.sync_from_device()
if debug:
print("Recieve OK")
print('output buffer:', output_buffer)
result = output_buffer.copy()
print('result:', result)
if profile:
timeb = datetime.now()
dts, rate = self._print_dt(timea, timeb, len(X))
input_buffer.flush()
output_buffer.flush()
del input_buffer
del output_buffer
self.free()
return result, dts, rate
input_buffer.flush()
output_buffer.flush()
del input_buffer
del output_buffer
return result
def free_overlay(self):
self.free()
def _print_dt(self, timea, timeb, N):
dt = (timeb - timea)
dts = dt.seconds + dt.microseconds * 10 ** -6
rate = N / dts
print("Ran {} inputs in {} seconds ({} runs / s)".format(N, dts, rate))
print("Or {} us / run".format(1 / rate * 1e6))
return dts, rate
ol = NeuralNetworkOverlay(xclbin_name="deep_clean.xclbin")
# set up encoding to put it into the 16 bits
def enc_data(xi):
return np.int32(round(xi*2**8))
def dec_data(yi):
return yi * 2**-8
encode_v = np.vectorize(enc_data)
decode_v = np.vectorize(dec_data)
print("ol maybe created successfully")
print(ol.krnl_rtl_1.register_map)
print(ol.krnl_rtl_1.signature)
np.random.seed(13) # seed input to make sure output matches
n_batch = 1
x_test = np.random.normal(size=(n_batch, 8192, 21))
# below line sets up deep clean with float inputs
i_buff, o_buff = ol.allocate_mem(X_shape=x_test.shape, y_shape=(n_batch, 8192), dtype=np.float32)
y_pred = ol.predict(
X=x_test,
y_shape=(n_batch, 8192), # use for deep clean
input_buffer=i_buff,
output_buffer=o_buff,
profile=True,
debug=False,
#encode=encode_v,
#decode=decode_v,
)
print('x test:', x_test)
print('non decoded output:', y_pred)
#y_dc = decode_v(y_pred[0][0])
#print('decoded output:', y_dc)
ol.free_overlay()