In [1]:
%%writefile 2.3-convolve-1d-shared-constant-memory.py

import time
import math
import numpy as np
from numba import cuda, float32

# Upper bound on the number of threads per block used by convolve().
# NOTE: the name is misspelled ("deafult"); it is kept as-is because
# convolve() refers to it by this exact spelling.
thread_block_size_deafult = 256

# Filter coefficients: 256 random float32 values. The kernel embeds them in
# constant memory through cuda.const.array_like(h_host).
h_host = np.random.rand(256).astype(np.float32)


@cuda.jit
def convolve_kernel(y, x):
    """1D convolution of ``x`` with the module-level filter ``h_host``.

    Each thread computes one output sample
    ``y[i] = sum_j x[i - j] * h[j]`` for ``j`` in ``[0, len(h_host))``,
    where samples of ``x`` outside ``[0, x.shape[0])`` are treated as zero.

    The filter is placed in constant memory; the slice of ``x`` needed by a
    thread block (``blockDim.x + filter_size - 1`` samples) is staged in
    dynamically sized shared memory, so the launch must provide at least
    ``(blockDim.x + len(h_host) - 1) * 4`` bytes of shared memory.

    Parameters
    ----------
    y : device array (1D)
        Output buffer; ``y[i]`` is written for every ``i < y.shape[0]``.
    x : device array (1D)
        Input signal.
    """
    i = cuda.blockIdx.x*cuda.blockDim.x + cuda.threadIdx.x

    # Constant memory.
    h_gpu_const = cuda.const.array_like(h_host)
    filter_size = len(h_gpu_const)

    # Shared memory (shape=0: the size is supplied at kernel launch).
    x_sm = cuda.shared.array(shape=0, dtype=float32)
    sm_size = cuda.blockDim.x+filter_size-1

    # Copy a portion of global memory data to shared memory.
    # Every thread of the block takes part in the copy, including threads
    # whose output index falls past the end of y: the tile is filled
    # cooperatively (stride blockDim.x), so a thread that left early would
    # leave holes in it and would also skip the barrier below.
    x_size = x.shape[0]
    k = i - (filter_size-1)  # The current position in the global memory.
    k_sm = cuda.threadIdx.x  # The current position in the shared memory.
    while k_sm < sm_size:
        if k < 0 or k >= x_size:
            # Outside the input signal: zero padding (also prevents an
            # out-of-bounds read in a partially filled last block).
            x_sm[k_sm] = float32(0.0)
        else:
            x_sm[k_sm] = x[k]
        k_sm += cuda.blockDim.x
        k    += cuda.blockDim.x

    # All threads of the block must reach this barrier.
    cuda.syncthreads()

    # Only now drop the threads that have no output sample to produce.
    if i >= y.shape[0]:
        return

    k_sm = cuda.threadIdx.x+filter_size-1
    value = float32(0.0)
    for j in range(filter_size):
        value += x_sm[k_sm-j]*h_gpu_const[j]
    y[i] = value

    
def convolve(y, x):
    """Launch ``convolve_kernel`` to compute ``y = x * h_host`` on the GPU.

    One thread is started per output sample, using blocks of at most
    ``thread_block_size_deafult`` threads on the default CUDA stream. The
    dynamic shared memory requested per block holds
    ``block size + len(h_host) - 1`` float32 samples.

    Parameters
    ----------
    y : device array (1D)
        Output buffer, filled by the kernel.
    x : device array (1D)
        Input signal.

    Returns
    -------
    None. Returns immediately, without launching anything, when ``y`` is
    empty.
    """
    n_outputs = len(y)
    if n_outputs == 0:
        # A zero-sized block is not a valid launch configuration (and the
        # grid size computation below would divide by zero).
        return
    thread_block_size = min(thread_block_size_deafult, n_outputs)
    block_size = (thread_block_size, )
    # Integer ceiling division: exact for any length, unlike float division.
    grid_size = ((n_outputs + thread_block_size - 1)//thread_block_size, )
    filter_size = len(h_host)
    shared_memory_size = thread_block_size+filter_size-1
    # The kernel declares its shared buffer as float32, so the byte count
    # must be based on float32 regardless of the dtype of y.
    shared_memory_size_bytes = shared_memory_size*np.dtype(np.float32).itemsize
    convolve_kernel[grid_size, block_size, cuda.default_stream(), shared_memory_size_bytes](y, x)
    
# Test data.
# Number of float32 samples per run: 1024*64*16*16 = 16,777,216.
n = 1024*64*16*16

# Run the benchmark 20 times and print the wall-clock time of each run.
for i in range(20):
    # Fresh random input and an output buffer on the device; both are
    # created before the clock starts, so they are not part of the timing.
    x_host = np.random.rand(n).astype(np.float32)
    y_gpu = cuda.device_array(shape=(n,), dtype=np.float32)
    # Timed region: host-to-device copy of the input, the convolution
    # launch, and the device-to-host copy of the result.
    # NOTE(review): the first iteration presumably also includes the kernel
    # JIT compilation, so it should not be compared with the others.
    start = time.time()
    x_gpu = cuda.to_device(x_host)
    convolve(y_gpu, x_gpu)
    y_host = y_gpu.copy_to_host()
    end = time.time()
    print(f"Execution time: {end-start}")

Overwriting 2.3-convolve-1d-shared-constant-memory.py


In [2]:
! nvprof python 2.3-convolve-1d-shared-constant-memory.py

==39938== NVPROF is profiling process 39938, command: python 2.3-convolve-1d-shared-constant-memory.py
Execution time: 0.5726428031921387
Execution time: 0.2206273078918457
Execution time: 0.21988773345947266
Execution time: 0.2207186222076416
Execution time: 0.22198104858398438
Execution time: 0.2215731143951416
Execution time: 0.22063207626342773
Execution time: 0.23209428787231445
Execution time: 0.2208080291748047
Execution time: 0.21992778778076172
Execution time: 0.22011971473693848
Execution time: 0.22240877151489258
Execution time: 0.22060918807983398
Execution time: 0.22048616409301758
Execution time: 0.23507213592529297
Execution time: 0.25647950172424316
Execution time: 0.2404470443725586
Execution time: 0.22742605209350586
Execution time: 0.22945713996887207
Execution time: 0.3387610912322998
==39938== Profiling application: python 2.3-convolve-1d-shared-constant-memory.py
==39938== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Ma

In [3]:
! ncu python 2.3-convolve-1d-shared-constant-memory.py

==PROF== Connected to process 39975 (/home/pjarosik/bin/miniconda3/envs/ius2021sc/bin/python3.8)
==ERROR== Error: ERR_NVGPUCTRPERM - The user does not have permission to access NVIDIA GPU Performance Counters on the target device 0. For instructions on enabling permissions and to get more information see https://developer.nvidia.com/ERR_NVGPUCTRPERM
==PROF== Disconnected from process 39975
==ERROR== An error occurred while trying to profile.
