In [None]:
from pynq import Overlay, allocate
import numpy as np

# Bitstream produced by the hardware build; Overlay() is constructed from it.
BITSTREAM_FILE = 'mlp_fast.bit'
overlay = Overlay(BITSTREAM_FILE)

# Handle to the accelerator, exposed as an attribute named after the IP
# instance in the block design.
mlp_ip = overlay.mlp_fast_0


In [None]:
INPUT_SIZE = 784
OUTPUT_SIZE = 10

# AXI-Lite offsets of the buffer pointers. The register listing printed in this
# notebook shows each pointer as a pair of 32-bit registers:
#   im_1 @ 16 (0x10), im_2 @ 20 (0x14), out_r_1 @ 28 (0x1C), out_r_2 @ 32 (0x20)
# so the output pointer starts at 0x1C, not 0x18.
# NOTE(review): assumes that listing was produced from this bitstream -- confirm.
IM_ADDR_OFFSET = 0x10
OUT_ADDR_OFFSET = 0x1C


def _write_buffer_address(ip, low_offset, address):
    """Write `address` as two 32-bit words at `low_offset` and `low_offset + 4`."""
    ip.write(low_offset, address & 0xFFFFFFFF)
    ip.write(low_offset + 4, (address >> 32) & 0xFFFFFFFF)


# Allocate input/output buffers
input_buffer = allocate(shape=(INPUT_SIZE,), dtype=np.float32)
output_buffer = allocate(shape=(OUTPUT_SIZE,), dtype=np.float32)

# Hand the buffers' physical addresses to the IP (low word, then high word)
_write_buffer_address(mlp_ip, IM_ADDR_OFFSET, input_buffer.physical_address)
_write_buffer_address(mlp_ip, OUT_ADDR_OFFSET, output_buffer.physical_address)


299.997
374.99625
1333.32


In [None]:
def mlp_inference_fpga(image, timeout=5.0):
    """Run one inference on the accelerator and return its output vector.

    Parameters
    ----------
    image : array-like
        Input sample; it is copied into the module-level ``input_buffer``
        (values are cast to that buffer's dtype by the assignment).
    timeout : float or None, optional
        Maximum number of seconds to wait for the done bit. ``None`` waits
        indefinitely, which is the behavior of the original busy loop.

    Returns
    -------
    numpy.ndarray
        A copy of ``output_buffer`` taken after the IP reports completion.

    Raises
    ------
    TimeoutError
        If the done bit is not observed within ``timeout`` seconds.
    """
    # Local import so the function works even if no earlier cell imported time.
    import time

    # Copy data into input buffer and make it visible to the device
    input_buffer[:] = image[:]
    input_buffer.sync_to_device()

    # Start inference: write 1 to the control register at offset 0x00
    mlp_ip.write(0x00, 1)

    # Poll bit 1 of the control register (treated as the done flag), but give
    # up after `timeout` seconds instead of hanging the kernel forever.
    deadline = None if timeout is None else time.monotonic() + timeout
    while (mlp_ip.read(0x00) & 0x2) == 0:
        if deadline is not None and time.monotonic() > deadline:
            raise TimeoutError(
                "MLP accelerator did not signal completion within "
                "{} s".format(timeout)
            )

    # Pull the device-written results back before copying them out
    output_buffer.sync_from_device()
    return np.array(output_buffer)


In [None]:
# Load the MNIST test images and labels; pixel values are divided by 255.0.
x_test_raw = np.load('x_test.npy')
x_test = x_test_raw / 255.0
y_test = np.load('y_test.npy')

# Smoke test: push the first sample through the accelerator.
idx = 0
sample = x_test[idx].flatten()
result = mlp_inference_fpga(sample)

print(f"FPGA inference result: {result}")
print(f"Predicted digit: {np.argmax(result)}")
print(f"Actual digit: {y_test[idx]}")


CTRL (<class 'pynq.registers.RegisterCTRL'>, 0, 32, None, None, 'read-write')
GIER (<class 'pynq.registers.RegisterGIER'>, 4, 32, None, None, 'read-write')
IP_IER (<class 'pynq.registers.RegisterIP_IER'>, 8, 32, None, None, 'read-write')
IP_ISR (<class 'pynq.registers.RegisterIP_ISR'>, 12, 32, None, None, 'read-write')
im_1 (<class 'pynq.registers.Registerim_1'>, 16, 32, None, None, 'write-only')
im_2 (<class 'pynq.registers.Registerim_2'>, 20, 32, None, None, 'write-only')
out_r_1 (<class 'pynq.registers.Registerout_r_1'>, 28, 32, None, None, 'write-only')
out_r_2 (<class 'pynq.registers.Registerout_r_2'>, 32, 32, None, None, 'write-only')


In [None]:
# Run every test sample through the accelerator and count the samples whose
# arg-max output equals the label.
correct = sum(
    1
    for i in range(len(x_test))
    if np.argmax(mlp_inference_fpga(x_test[i].flatten())) == y_test[i]
)

accuracy = correct / len(x_test)
print("FPGA inference accuracy:", accuracy)


In [None]:
import time

# Time up to the first 1000 samples; clamp so a shorter test set cannot be
# indexed past its end.
n_timed = min(1000, len(x_test))

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
for i in range(n_timed):
    mlp_inference_fpga(x_test[i].flatten())
end_time = time.perf_counter()

fps = n_timed / (end_time - start_time)

print("FPGA inference FPS:", fps)


In [None]:
import time

# Time up to the first 1000 samples; clamp so a shorter test set cannot be
# indexed past its end.
n_timed = min(1000, len(x_test))

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
for i in range(n_timed):
    mlp_inference_fpga(x_test[i].flatten())
end_time = time.perf_counter()

fps = n_timed / (end_time - start_time)

print("FPGA inference FPS:", fps)


acc hls 0.9764
acc py 0.9764


In [10]:
hw_time = %timeit -n 1 -r 10 -o mnist_hw(x_test.flatten())
sw_time = %timeit -n 1 -r 10 -o mnist_sw(x_test)

print('Performance gain:', sw_time.average / hw_time.average) 

43.2 ms ± 381 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)
7.4 s ± 18.6 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)
Performance gain: 171.33220449705442


In [11]:
# The timed calls were handed the full x_test, so derive the sample count from
# the data instead of hard-coding 10000.
# NOTE(review): assumes mnist_hw / mnist_sw process every sample they are given -- confirm.
n_samples = len(x_test)

# Frames per second = samples per call / mean seconds per call.
print("hw fps = {:.1f}".format((hw_time.average / n_samples) ** -1))
print("sw fps = {:.1f}".format((sw_time.average / n_samples) ** -1))

hw fps = 231630.8
sw fps = 1351.9
