# Timing Analysis of the NumPy FFT implementation compared to an implementation on an FPGA board with $\textit{NFFT} = 2^{14}$
Author: David Schulz

## 0. Setup

In [1]:
import numpy as np
from time import perf_counter_ns
from pynq import Overlay, allocate
from typing import Callable, Optional

In [2]:
def timeit(
    func: Callable[[np.ndarray], np.ndarray],
    values: np.ndarray,
    func_setup: Optional[Callable[[np.ndarray], None]] = None,
) -> None:
    """
    timeit times one call of the input function per row of the given values
    and prints the mean and standard deviation of the measured durations

    params:
        - func: the function that is supposed to be timed; it is called as
          func(values[i]) and its return value is discarded
        - values: values that are used as parameters while calling the function (func);
          the length of the first axis is the number of runs
        - func_setup: optional function that is called as func_setup(values[i])
          directly before each timed call; the time it needs is NOT part of the
          measurement. None (the default) means that no setup is done

    return: None (the result is printed to the console)
    """

    # the annotation allows None, so map it to a no-op once, outside the loop
    setup: Callable[[np.ndarray], None] = (
        func_setup if func_setup is not None else (lambda _sample: None)
    )

    # the number of rows inside the numpy array is the number of runs
    runs: int = values.shape[0]

    # one slot per run for the measured duration in nano seconds
    times: np.ndarray = np.zeros(runs)

    for i, sample in enumerate(values):
        # preparation work happens before the clock is started
        setup(sample)

        # only the call of func itself sits between the two counter reads;
        # the nano second counter is used to get the highest precision
        start_time: int = perf_counter_ns()
        func(sample)
        end_time: int = perf_counter_ns()

        times[i] = end_time - start_time

    # statistics in nano seconds
    mean_ns: np.float64 = np.mean(times)
    std_ns: np.float64 = np.std(times)

    # convert to milli seconds and round to 2 decimal places for the output
    mean_milli_round: np.float64 = np.round(mean_ns * 10**-6, decimals=2)
    std_milli_round: np.float64 = np.round(std_ns * 10**-6, decimals=2)

    # the output is supposed to be similar to the output of the built-in package timeit
    print(f'{mean_milli_round} ms ± {std_milli_round} ms per loop (mean ± std. dev. of {runs} runs)')

In [3]:
# transform length: 2^14 = 16384 points per FFT
NFFT: int = 1 << 14

# input for the fft functions: one row of NFFT random samples per timing run
values: np.ndarray = np.random.random_sample(size=(1_000, NFFT))

## 1. Numpy Timing

In [4]:
# give numpy's fft a name of its own so that it can be handed to timeit
# in the same way as the FPGA functions further down
numpy_fft: Callable[[np.ndarray], np.ndarray] = np.fft.fft

In [5]:
# time numpy's fft with one transform per row of the predefined values (no setup step)
timeit(func=numpy_fft, values=values)

9.14 ms ± 0.85 ms per loop (mean ± std. dev. of 1000 runs)


## 2. FPGA FFT Timing

In [6]:
# load the overlay of the fft block
overlay: Overlay = Overlay('FFT_test_16k.bit')
    
# DMA blocks of the overlay (judging by the names: one for the sample data,
# one for the fft configuration)
fft_data_dma = overlay.fft_data_dma
fft_config_dma = overlay.fft_config_dma

# send/receive channels of the data DMA; these are the handles the timing cells use
fft_data_send_channel = fft_data_dma.sendchannel
fft_data_receive_channel = fft_data_dma.recvchannel
# NOTE(review): the config send channel is never used in the visible part of this
# file - presumably the fft block runs with its default configuration; confirm
fft_config_send_channel = fft_config_dma.sendchannel

In [7]:
# buffers handed to the DMA channels: NFFT single precision complex samples each
# input_buffer is filled by the setup functions, output_buffer is the target of
# the receive channel
input_buffer = allocate(NFFT, dtype=np.csingle)
output_buffer = allocate(NFFT, dtype=np.csingle)

In [8]:
def fpga_fft_setup(y: np.ndarray) -> None:
    """
    copies the samples of one run into the module level input_buffer

    it is passed to timeit as func_setup, so the copy happens before the
    clock is started and is not part of the measured time

    params:
        - y: the samples of the current run

    return: None
    """
    # input_buffer was allocated with dtype np.csingle, so the values are
    # converted to that dtype while copying
    np.copyto(input_buffer, y) 

In [9]:
def fpga_fft(y: np.ndarray) -> None:
    """
    runs one complete DMA round trip: starts the send and the receive
    transfer and waits for both of them

    params:
        - y: unused; it only exists so that the signature matches what timeit
          calls. The samples are taken from input_buffer, which the setup
          function has to fill beforehand

    return: None (the receive transfer targets output_buffer)
    """
    # start both transfers first ...
    fft_data_send_channel.transfer(input_buffer)
    fft_data_receive_channel.transfer(output_buffer)
    # ... and only then wait for each of them
    fft_data_send_channel.wait()
    fft_data_receive_channel.wait()

In [10]:
# the buffer copy runs untimed in the setup; the timed part starts both DMA
# transfers and waits for them
timeit(func=fpga_fft, values=values, func_setup=fpga_fft_setup)

1.36 ms ± 0.08 ms per loop (mean ± std. dev. of 1000 runs)


## 3. FPGA Timing without Transfer 

In [11]:
# NOTE(review): this rebinds the name fpga_fft_setup that section 2 already
# defined with a different body; re-running the timing call of section 2 after
# this cell would therefore use this version. The later variants use the
# suffixes _2 and _3 - consider a distinct name here as well
def fpga_fft_setup(y: np.ndarray) -> None:
    """
    copies the samples of one run into input_buffer and already starts the
    send transfer, so that both steps happen before the clock is started

    params:
        - y: the samples of the current run

    return: None
    """
    np.copyto(input_buffer, y)
    # started here (untimed); the timed function only waits for it
    fft_data_send_channel.transfer(input_buffer)

In [12]:
def fpga_fft_without_transfer(y: np.ndarray) -> None:
    """
    starts the receive transfer and waits for the send and the receive channel

    the send transfer is not started here; the setup function has to start it
    beforehand

    params:
        - y: unused; only present to match the signature that timeit calls

    return: None (the receive transfer targets output_buffer)
    """
    fft_data_receive_channel.transfer(output_buffer)
    fft_data_send_channel.wait()
    fft_data_receive_channel.wait()

In [13]:
# fpga_fft_setup is the version of this section here: the buffer copy and the
# start of the send transfer run untimed, the rest is measured
timeit(func=fpga_fft_without_transfer, values=values, func_setup=fpga_fft_setup)

0.99 ms ± 0.02 ms per loop (mean ± std. dev. of 1000 runs)


## 3. FPGA Timing without Transfer 2

In [14]:
def fpga_fft_setup_2(y: np.ndarray) -> None:
    """
    copies the samples of one run into input_buffer and starts both the send
    and the receive transfer, all before the clock is started

    params:
        - y: the samples of the current run

    return: None
    """
    np.copyto(input_buffer, y)
    # both transfers are started here (untimed); the timed function only waits
    fft_data_send_channel.transfer(input_buffer)
    fft_data_receive_channel.transfer(output_buffer)

In [15]:
def fpga_fft_without_transfer_2(y: np.ndarray) -> None:
    """
    only waits for the send and the receive channel; no transfer is started
    here, the setup function has to start both beforehand

    params:
        - y: unused; only present to match the signature that timeit calls

    return: None
    """
    fft_data_send_channel.wait()
    fft_data_receive_channel.wait()

In [16]:
# the buffer copy and the start of both transfers run untimed; only the two
# waits are measured
timeit(func=fpga_fft_without_transfer_2, values=values, func_setup=fpga_fft_setup_2)

0.87 ms ± 0.11 ms per loop (mean ± std. dev. of 1000 runs)


## 3. FPGA Timing without Transfer 3

In [17]:
def fpga_fft_setup_3(y: np.ndarray) -> None:
    """
    copies the samples of one run into input_buffer, starts both transfers and
    already waits for the send channel, all before the clock is started

    params:
        - y: the samples of the current run

    return: None
    """
    np.copyto(input_buffer, y)
    fft_data_send_channel.transfer(input_buffer)
    fft_data_receive_channel.transfer(output_buffer)
    # the wait on the send channel is done here (untimed); only the wait on the
    # receive channel is left for the timed function
    fft_data_send_channel.wait()

In [18]:
def fpga_fft_without_transfer_3(y: np.ndarray) -> None:
    """
    only waits for the receive channel; the setup function has to start both
    transfers and wait for the send channel beforehand

    params:
        - y: unused; only present to match the signature that timeit calls

    return: None
    """
    fft_data_receive_channel.wait()

In [19]:
# everything up to and including the wait on the send channel runs untimed;
# only the wait on the receive channel is measured
timeit(func=fpga_fft_without_transfer_3, values=values, func_setup=fpga_fft_setup_3)

0.76 ms ± 0.04 ms per loop (mean ± std. dev. of 1000 runs)
