In [1]:
%%writefile 2.1-convolve-1d-global-memory.py

import time
import math
import numpy as np
from numba import cuda, float32

@cuda.jit
def convolve_kernel(y, x, coeffs):
    """Compute one output sample of a 1-D causal convolution per thread.

    Each thread derives a flat index from its block and thread position and
    writes ``y[idx] = sum(x[idx - k] * coeffs[k])`` for
    ``k`` in ``[0, min(len(coeffs), idx + 1))``, so samples before the start
    of ``x`` are never read.

    Parameters
    ----------
    y : 1-D array written by the kernel (one element per thread).
    x : 1-D input array; indexed up to ``idx``, so it is assumed to be at
        least as long as ``y`` (not checked here).
    coeffs : 1-D array of filter coefficients.
    """
    # Flat 1-D thread index: blockIdx.x * blockDim.x + threadIdx.x.
    out_idx = cuda.grid(1)

    # Threads past the end of the output have nothing to do.
    if out_idx < y.shape[0]:
        # Near the start of the signal only the first out_idx + 1 taps apply.
        n_taps = min(coeffs.shape[0], out_idx + 1)
        acc = float32(0.0)
        for tap in range(n_taps):
            acc += x[out_idx - tap] * coeffs[tap]
        y[out_idx] = acc
        
        
def convolve(y, x, h):
    """Launch ``convolve_kernel`` with one thread per element of ``y``.

    Parameters
    ----------
    y : output array; its length determines how many threads are launched.
    x : input signal, passed through to the kernel.
    h : filter coefficients, passed through to the kernel.

    Returns
    -------
    None. The result is written into ``y`` by the kernel.
    """
    n_outputs = len(y)
    if n_outputs == 0:
        # Nothing to compute; this also avoids requesting a grid with zero
        # blocks, which is not a usable launch configuration.
        return
    block_size = (256, )
    # Integer ceiling division: the smallest number of blocks whose threads
    # cover every output element (the kernel bounds-checks the remainder).
    grid_size = ((n_outputs + block_size[0] - 1) // block_size[0], )
    convolve_kernel[grid_size, block_size](y, x, h)
    
# Tests

# Test 1
# x = np.array([0, 1, 2, 3, 4])
# h = np.array([0, 1, 2])
# y_gpu = cuda.device_array(len(x))

# convolve(y_gpu, x, h)
# np.testing.assert_equal(y_gpu.copy_to_host(), [0, 0, 1, 4, 7])

# # Test 2
# x = np.random.rand(1000)
# h = np.random.rand(30)
# y_gpu = cuda.device_array(len(x))
# convolve(y_gpu, x, h)
# np.testing.assert_allclose(y_gpu.copy_to_host(), np.convolve(x, h)[:len(x)])

        
# Benchmark: repeatedly time a full round trip of the convolution.
# Number of float32 samples in the input signal (2**24).
n = 1024*64*16*16 

for i in range(100):
    # Fresh random signal and 256-tap filter for every run; generated
    # outside the timed region.
    x_host = np.random.rand(n).astype(np.float32)
    h_host = np.random.rand(256).astype(np.float32)
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can jump if the system clock
    # is adjusted.
    start = time.perf_counter()
    # The timed region covers: output allocation on the device,
    # host-to-device copies, the kernel launch, and the copy of the result
    # back to the host.
    y_gpu = cuda.device_array(shape=(n,), dtype=np.float32) # np.zeros(n, dtype=np.float32)
    x_gpu = cuda.to_device(x_host)
    h_gpu = cuda.to_device(h_host)
    convolve(y_gpu, x_gpu, h_gpu)
    y_host = y_gpu.copy_to_host()
    end = time.perf_counter()
    # NOTE(review): the first iteration is presumably slower because it
    # also includes kernel compilation — confirm before averaging results.
    print(f"Execution time: {end-start}")

Overwriting 2.1-convolve-1d-global-memory.py


In [2]:
! nsys profile --stats=true -t cuda python 2.1-convolve-1d-global-memory.py

Collecting data...
Execution time: 0.39854979515075684
Execution time: 0.0722038745880127
Execution time: 0.07133960723876953
Execution time: 0.07091784477233887
Execution time: 0.08097481727600098
Execution time: 0.0719461441040039
Execution time: 0.07051634788513184
Execution time: 0.07268738746643066
Execution time: 0.09375500679016113
Execution time: 0.0707712173461914
Execution time: 0.07194352149963379
Execution time: 0.07742857933044434
Execution time: 0.07236385345458984
Execution time: 0.0714104175567627
Execution time: 0.07010054588317871
Execution time: 0.07824563980102539
Execution time: 0.07202672958374023
Execution time: 0.07043814659118652
Execution time: 0.07079887390136719
Execution time: 0.09495377540588379
Execution time: 0.07022786140441895
Execution time: 0.07021021842956543
Execution time: 0.07788276672363281
Execution time: 0.07030820846557617
Execution time: 0.07108521461486816
Execution time: 0.07120561599731445
Execution time: 0.07826113700866699
Execution tim

In [3]:
# ! ncu --section MemoryWorkloadAnalysis python 2.1-convolve-1d-global-memory.py

In [4]:
# ! nsys profile --stats=true -t cuda python 2.1-convolve-1d-global-memory.py
# ! ncu python 2.1-convolve-1d-global-memory.py