In [1]:
%%writefile 2.1-convolve-1d-global-memory.py

import time
import math
import numpy as np
from numba import cuda, float32

@cuda.jit
def convolve_kernel(y, x, coeffs):
    """Compute one output sample of a 1-D convolution per thread.

    Thread ``idx`` writes
    ``y[idx] = sum(x[idx - k] * coeffs[k] for k in range(taps))`` where
    ``taps = min(len(coeffs), idx + 1)``, so the first outputs use only the
    input samples that exist (no reads before ``x[0]``).

    Parameters
    ----------
    y : 1-D device array receiving the result; its length bounds the work.
    x : 1-D device array with the input signal (indexed up to ``len(y) - 1``).
    coeffs : 1-D device array with the filter coefficients.
    """
    # Absolute position of this thread in the 1-D launch grid.
    idx = cuda.grid(1)

    # The grid is rounded up to whole blocks, so surplus threads do nothing.
    if idx < y.shape[0]:
        # Clamp the number of taps so that idx - k never goes below zero.
        taps = min(coeffs.shape[0], idx + 1)
        acc = float32(0.0)
        for k in range(taps):
            acc += x[idx - k] * coeffs[k]
        y[idx] = acc
        
        
def convolve(y, x, h):
    """Launch ``convolve_kernel`` with one thread per element of ``y``.

    Uses 256 threads per block and enough blocks to cover ``len(y)``; the
    result is written into ``y`` and nothing is returned.

    Parameters
    ----------
    y : output array, passed to the kernel as its first argument.
    x : input signal array.
    h : filter coefficient array.
    """
    threads_per_block = 256
    # Round up so a partially filled last block still covers the tail of y.
    n_blocks = math.ceil(len(y) / threads_per_block)
    convolve_kernel[(n_blocks,), (threads_per_block,)](y, x, h)
    
# Tests

# Test 1
# x = np.array([0, 1, 2, 3, 4])
# h = np.array([0, 1, 2])
# y_gpu = cuda.device_array(len(x))

# convolve(y_gpu, x, h)
# np.testing.assert_equal(y_gpu.copy_to_host(), [0, 0, 1, 4, 7])

# # Test 2
# x = np.random.rand(1000)
# h = np.random.rand(30)
# y_gpu = cuda.device_array(len(x))
# convolve(y_gpu, x, h)
# np.testing.assert_allclose(y_gpu.copy_to_host(), np.convolve(x, h)[:len(x)])

        
# Test data.
# Number of float32 samples in the input signal:
# 1024*64*16*16 = 16,777,216 elements (64 MiB per array).
n = 1024*64*16*16 

# Repeat the measurement 100 times with fresh random data; each iteration
# prints the elapsed seconds for the whole GPU round trip.
for i in range(100):
    x_host = np.random.rand(n).astype(np.float32)
    h_host = np.random.rand(256).astype(np.float32)
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can jump if the system clock
    # is adjusted and may have coarser resolution.
    start = time.perf_counter()
    # The timed region covers output allocation, both host-to-device
    # transfers, the kernel launch, and the copy of the result back to host.
    y_gpu = cuda.device_array(shape=(n,), dtype=np.float32) # np.zeros(n, dtype=np.float32)
    x_gpu = cuda.to_device(x_host)
    h_gpu = cuda.to_device(h_host)
    convolve(y_gpu, x_gpu, h_gpu)
    y_host = y_gpu.copy_to_host()
    end = time.perf_counter()
    print(f"Execution time: {end-start}")

Overwriting 2.1-convolve-1d-global-memory.py


In [5]:
! nvprof python 2.1-convolve-1d-global-memory.py

==39065== NVPROF is profiling process 39065, command: python 2.1-convolve-1d-global-memory.py
Execution time: 0.7839598655700684
Execution time: 0.3149600028991699
Execution time: 0.2945418357849121
Execution time: 0.29302167892456055
Execution time: 0.3063042163848877
Execution time: 0.2941744327545166
Execution time: 0.2939951419830322
Execution time: 0.3050954341888428
Execution time: 0.2931020259857178
Execution time: 0.29428863525390625
Execution time: 0.31705212593078613
Execution time: 0.30550646781921387
Execution time: 0.2946469783782959
Execution time: 0.31102538108825684
Execution time: 0.3362107276916504
Execution time: 0.3303670883178711
Execution time: 0.3096504211425781
Execution time: 0.31031131744384766
Execution time: 0.33253049850463867
Execution time: 0.39071059226989746
Execution time: 0.3184206485748291
Execution time: 0.32593202590942383
Execution time: 0.30050134658813477
Execution time: 0.317415714263916
Execution time: 0.325054407119751
Execution time: 0.31627

In [3]:
# ! ncu --section MemoryWorkloadAnalysis python 2.1-convolve-1d-global-memory.py

In [4]:
# ! nsys profile --stats=true -t cuda python 2.1-convolve-1d-global-memory.py
# ! ncu python 2.1-convolve-1d-global-memory.py