In [1]:
import math
import numpy as np
from numba import cuda

***
# FIR filter
***
<font size="4">
The finite impulse response (FIR) filter is given by equation \eqref{fir}

\begin{equation}
    y[n] = \sum_{k=0}^{N-1} h[k]x[n-k] 
    \label{fir} \tag{1}
\end{equation}

where *x* is the input signal, *h* is the impulse response of the filter (of length *N*), and *y* is the output signal.
Filtering is performed by convolving the input signal with the impulse response of the filter.

  
    
***    
</font>

### Example: 

In [2]:
@cuda.jit
def convolve_kernel(y, x, coeffs):
    """Compute one output sample of the FIR filter per thread.

    Thread ``i`` evaluates ``y[i] = sum_j coeffs[j] * x[i-j]``. Terms whose
    input index ``i - j`` falls outside ``x`` are skipped, i.e. the signal is
    treated as zero outside its bounds.

    Parameters
    ----------
    y : 1-D array
        Output buffer; element ``i`` is overwritten by thread ``i``.
    x : 1-D array
        Input signal.
    coeffs : 1-D array
        Filter coefficients (impulse response).
    """
    # Global thread index along the (1-D) grid.
    i = cuda.blockIdx.x*cuda.blockDim.x + cuda.threadIdx.x
    # The grid may contain more threads than output samples.
    if i < y.shape[0]:
        value = 0.0
        # Keep i-j < x.shape[0]: matters only when y is longer than x
        # (otherwise the lower bound is always 0).
        j_begin = max(0, i - x.shape[0] + 1)
        # Keep i-j >= 0 and j inside the coefficient array.
        j_end = min(coeffs.shape[0], i + 1)
        for j in range(j_begin, j_end):
            value += x[i-j]*coeffs[j]
        y[i] = value

In [3]:
def convolve(y, x, h):
    """Launch ``convolve_kernel`` so that every element of ``y`` gets a thread.

    Parameters
    ----------
    y : array
        Output buffer; its length determines the number of threads launched.
    x : array
        Input signal, passed through to the kernel.
    h : array
        Filter coefficients, passed through to the kernel.

    Returns
    -------
    None
        The result is written into ``y`` by the kernel.
    """
    n_samples = len(y)
    if n_samples == 0:
        # Nothing to compute; a grid with zero blocks is not a valid
        # launch configuration, so skip the launch entirely.
        return
    threads_per_block = 256
    # Integer ceil-division: enough blocks to cover all output samples.
    n_blocks = (n_samples + threads_per_block - 1) // threads_per_block
    convolve_kernel[(n_blocks, ), (threads_per_block, )](y, x, h)

```

x = [0, 1, 2, 3, 4]
h = [0, 1, 2]

```

```
y[0] = h[0]*x[0]                     = 0  
y[1] = h[0]*x[1]+h[1]*x[0]           = 0
y[2] = h[0]*x[2]+h[1]*x[1]+h[2]*x[0] = 1
y[3] = h[0]*x[3]+h[1]*x[2]+h[2]*x[1] = 4
y[4] = h[0]*x[4]+h[1]*x[3]+h[2]*x[2] = 7
```

```
 x = [0, 1, 2, 3, 4]  y = 
     [0]                  [0]
     [1, 0]               [0]
     [2, 1, 0]            [1]
        [2, 1, 0]         [4]
           [2, 1, 0]      [7]
```

In [4]:
# Input signal and filter coefficients of the worked example.
x = np.array([0, 1, 2, 3, 4])
h = np.array([0, 1, 2])

# Output buffer allocated on the device, one sample per input sample.
y_gpu = cuda.device_array(x.shape)
convolve(y_gpu, x, h)

# Bring the result back to the host and compare with the expected values.
y_host = y_gpu.copy_to_host()
np.testing.assert_equal(y_host, [0, 0, 1, 4, 7])
y_host

array([0., 0., 1., 4., 7.])

In [None]:
def test_convolve():
    """Check the GPU FIR filter against NumPy's reference convolution."""
    x = np.arange(16, dtype=np.float64)
    h = np.array([0.0, 1.0, 2.0])
    y_gpu = cuda.device_array(len(x))
    convolve(y_gpu, x, h)
    # np.convolve returns the full convolution (len(x)+len(h)-1 samples);
    # only the first len(x) samples are computed here.
    np.testing.assert_allclose(y_gpu.copy_to_host(), np.convolve(x, h)[:len(x)])

### Profiling the kernel

In [5]:
%%writefile 1.4-convolve-1d.py

import math
import numpy as np
from numba import cuda

# CUDA kernel.

@cuda.jit
def convolve_kernel(y, x, coeffs):
    """Compute one output sample of the FIR filter per thread.

    Thread ``i`` evaluates ``y[i] = sum_j coeffs[j] * x[i-j]``. Terms whose
    input index ``i - j`` falls outside ``x`` are skipped, i.e. the signal is
    treated as zero outside its bounds.

    Parameters
    ----------
    y : 1-D array
        Output buffer; element ``i`` is overwritten by thread ``i``.
    x : 1-D array
        Input signal.
    coeffs : 1-D array
        Filter coefficients (impulse response).
    """
    # Global thread index along the (1-D) grid.
    i = cuda.blockIdx.x*cuda.blockDim.x + cuda.threadIdx.x
    # The grid may contain more threads than output samples.
    if i < y.shape[0]:
        value = 0.0
        # Keep i-j < x.shape[0]: matters only when y is longer than x
        # (otherwise the lower bound is always 0).
        j_begin = max(0, i - x.shape[0] + 1)
        # Keep i-j >= 0 and j inside the coefficient array.
        j_end = min(coeffs.shape[0], i + 1)
        for j in range(j_begin, j_end):
            value += x[i-j]*coeffs[j]
        y[i] = value
        
        
def convolve(y, x, h):
    """Launch ``convolve_kernel`` so that every element of ``y`` gets a thread.

    Parameters
    ----------
    y : array
        Output buffer; its length determines the number of threads launched.
    x : array
        Input signal, passed through to the kernel.
    h : array
        Filter coefficients, passed through to the kernel.

    Returns
    -------
    None
        The result is written into ``y`` by the kernel.
    """
    n_samples = len(y)
    if n_samples == 0:
        # Nothing to compute; a grid with zero blocks is not a valid
        # launch configuration, so skip the launch entirely.
        return
    threads_per_block = 256
    # Integer ceil-division: enough blocks to cover all output samples.
    n_blocks = (n_samples + threads_per_block - 1) // threads_per_block
    convolve_kernel[(n_blocks, ), (threads_per_block, )](y, x, h)
        
# Test data: a random float32 signal of n samples and a random 64-tap filter.
n = 100000
x_host = np.random.rand(n).astype(np.float32)
h_host = np.random.rand(64).astype(np.float32)
# NOTE(review): despite its name, y_gpu is a host NumPy array (np.zeros), not
# a device array. Presumably it is transferred to and from the device on
# every kernel launch -- confirm whether this is intentional for the
# profiling demonstration.
y_gpu = np.zeros(n, dtype=np.float32)

# Copy the input signal and the filter coefficients to the device once,
# before the benchmark loop.
x_gpu = cuda.to_device(x_host)
h_gpu = cuda.to_device(h_host)

# Run the convolution 100 times so the profiler collects repeated launches.
for i in range(100):
    convolve(y_gpu, x_gpu, h_gpu)

Overwriting 1.4-convolve-1d.py


In [6]:
! nsys profile --stats=true --export none --trace cuda --output 1.4-convolve-1d python 1.4-convolve-1d.py

Collecting data...
Processing events...
Saving temporary "/tmp/nsys-report-863a-d311-77c3-3b2e.qdstrm" file to disk...
Creating final output files...

Saved report file to "/tmp/nsys-report-863a-d311-77c3-3b2e.qdrep"

Exported successfully to
/tmp/nsys-report-863a-d311-77c3-3b2e.sqlite

Generating CUDA API Statistics...
CUDA API Statistics (nanoseconds)

Time(%)      Total Time       Calls         Average         Minimum         Maximum  Name                                                                            
-------  --------------  ----------  --------------  --------------  --------------  --------------------------------------------------------------------------------
   88.9        82247572         100        822475.7          189232         1085006  cuMemcpyDtoH_v2                                                                 
    5.3         4892326         102         47964.0            7038           80255  cuMemcpyHtoD_v2                                             