In [None]:
from time import time

import cffi
import numpy as np
from pynq import MMIO, Overlay, allocate

# cffi handle; later cells use ffi.cast / ffi.memmove to copy the raw bytes of
# numpy arrays into the DMA send buffer.
ffi = cffi.FFI()

# Classifier Dimensions
BATCH = 8192  # number of samples taken from the test set and classified per run
FEAT = 256  # values per sample (a sample is viewed as a 16x16 image further down)
CLASSES = 10  # number of output scores produced per sample

In [None]:
# Prepare the custom overlay and DMA.
# Overlay() programs the bitstream itself (download=True is the default), so a
# separate ol.download() call would only reprogram the fabric a second time.
ol = Overlay("/home/xilinx/classifier.bit")

dma_mm2s = ol.axi_dma_0
dma_s2mm = ol.axi_dma_1

# Send buffer layout (bytes): CLASSES int32 offsets, then CLASSES * FEAT weights,
# then BATCH * FEAT input values.
mm2s_buffer = allocate(shape=(CLASSES * 4 + CLASSES * FEAT + BATCH * FEAT,), dtype=np.uint8)
# Receive buffer: one int32 score per (sample, class) pair.
s2mm_buffer = allocate(shape=(BATCH * CLASSES,), dtype=np.int32)

# Accelerator Base Address
ACCEL_CTRL = 0x43C00000

# Initialize HLS IP (64 KiB register window)
mmult_ip = MMIO(ACCEL_CTRL, 0x10000)

# Start the accelerator: keep bit 3 of the control register at offset 0x00,
# set bits 0 and 7, then read the register back for display.
# NOTE(review): presumably the standard HLS ap_ctrl register (bit 0 = ap_start,
# bit 7 = auto_restart) -- confirm against the IP's generated driver header.
ctrl = mmult_ip.read(0x00) & 0x08
mmult_ip.write(0x00, (ctrl | 0x81))
ctrl = mmult_ip.read(0x00)
hex(ctrl)


In [None]:
# Initialize offsets, weights and inputs.
# order="C" forces a row-major memory layout for the arrays whose raw bytes are
# later copied into the DMA buffer, independent of how the .npy files were saved.
o = np.load("model_offsets_fixed.npy").astype(np.int32, order="C")
w = np.load("model_weights_fixed.npy").astype(np.int8, order="C")
i = np.load("test_data.npy").astype(np.uint8, order="C")[0:BATCH]
l = np.load("test_labels.npy").astype(np.int32)[0:BATCH]

# Fail fast on mismatched files: the DMA staging step copies fixed byte counts
# from raw pointers, so a short array would otherwise be read out of bounds.
assert o.size == CLASSES, "expected {} offsets, got {}".format(CLASSES, o.size)
assert w.shape == (CLASSES, FEAT), "expected weights of shape {}, got {}".format((CLASSES, FEAT), w.shape)
assert i.shape == (BATCH, FEAT), "expected inputs of shape {}, got {}".format((BATCH, FEAT), i.shape)
assert l.ndim == 2 and l.shape[0] == BATCH, "expected {} rows of 2D labels, got shape {}".format(BATCH, l.shape)

In [None]:
# Move offset, weight and input data to DMA buffer.
# Byte layout: [CLASSES int32 offsets][CLASSES * FEAT weights][BATCH * FEAT inputs]
ffi.memmove(mm2s_buffer[:], ffi.cast("uint8_t *", o.ctypes.data), CLASSES * 4)
ffi.memmove(mm2s_buffer[CLASSES * 4 :], ffi.cast("uint8_t *", w.ctypes.data), CLASSES * FEAT)
ffi.memmove(mm2s_buffer[CLASSES * 4 + CLASSES * FEAT :], ffi.cast("uint8_t *", i.ctypes.data), BATCH * FEAT)
# Write CPU-side data back to memory so the DMA engine sees it
mm2s_buffer.flush()

# Perform FPGA offloading: start both channels, then wait for both to finish
start_t = time()
dma_mm2s.sendchannel.transfer(mm2s_buffer)
dma_s2mm.recvchannel.transfer(s2mm_buffer)
dma_mm2s.sendchannel.wait()
dma_s2mm.recvchannel.wait()
fpga_time = time() - start_t

# Counterpart of the flush above: discard any cached copy of the receive buffer
# so the CPU reads what the DMA engine wrote. Kept outside the timed section,
# like the flush.
s2mm_buffer.invalidate()

# Dump FPGA result to a numpy array (a copy, one row of CLASSES scores per sample)
c = np.array(s2mm_buffer).reshape(BATCH, CLASSES)


In [None]:
# Build the CPU-side operands: a leading column of ones on the inputs pairs with
# the offsets placed as the leading column of the weights, so a single matrix
# product yields offsets + inputs . weights.
ones = np.ones((BATCH, 1))
i_p = np.concatenate((ones, i), axis=1)
w_p = np.concatenate((o.reshape(CLASSES, 1), w), axis=1)

# Time the reference matrix product on the CPU
start_t = time()
c_ref = i_p.dot(w_p.T)
cpu_time = time() - start_t

In [None]:
# Evaluate validation accuracy: the class of a sample is the index of the
# largest entry in its row, for the labels as well as for both score matrices.
actual_label = np.argmax(l, axis=1)
fpga_label = np.argmax(c, axis=1)
cpu_label = np.argmax(c_ref, axis=1)

# Count the samples whose predicted class differs from the label
fpga_errors = (fpga_label != actual_label).sum()
cpu_errors = (cpu_label != actual_label).sum()

# Report error rates as a percentage of the batch, then the relative timing
print("FPGA accuracy: {0:.2f}% validation error".format(fpga_errors / BATCH * 100))
print("CPU accuracy:  {0:.2f}% validation error".format(cpu_errors / BATCH * 100))
if fpga_time > cpu_time:
    print("FPGA has a {0:.2f}x slowdown".format(fpga_time / cpu_time))
else:
    print("FPGA has a {0:.2f}x speedup".format(cpu_time / fpga_time))


In [None]:
# Render a given numpy 2D array of pixel data.
def show(image):
    """Display a 2D array as an image using the "Greys" colormap.

    The image is drawn without smoothing (nearest-neighbour interpolation),
    with x-axis ticks on the top edge and y-axis ticks on the left edge.

    Args:
        image: 2D array of pixel values to draw.
    """
    import matplotlib as mpl
    from matplotlib import pyplot

    _, ax = pyplot.subplots()
    ax.imshow(image, cmap=mpl.cm.Greys, interpolation="nearest")
    ax.xaxis.set_ticks_position("top")
    ax.yaxis.set_ticks_position("left")
    pyplot.show()


# Inspect one of the digit images classified by the FPGA: draw the sample as a
# 16x16 picture and print the class with the highest FPGA score.
idx = 1
show(np.reshape(i[idx], (16, 16)))
print("Classified as {} by the FPGA".format(c[idx].argmax()))