In [1]:
# import math
# import numpy as np
# from numba import cuda, float32

# thread_block_size_deafult = 256

In [2]:
# @cuda.jit
# def convolve_kernel(y, x, h):
#     i = cuda.blockIdx.x*cuda.blockDim.x + cuda.threadIdx.x
    
#     if i >= y.shape[0]:
#         return
    
#     filter_size = len(h)
#     x_sm = cuda.shared.array(shape=0, dtype=float32)
#     sm_size = cuda.blockDim.x+filter_size-1
    
#     # Copy a portion global memory data to shared memory.
#     k = i - (filter_size-1) # The current position in the global memory.
#     k_sm = cuda.threadIdx.x # The current position in the shared memory.  
#     while k_sm < sm_size:
#         if k < 0:
#             x_sm[k_sm] = 0.0
#         else:
#             x_sm[k_sm] = x[k]
#         k_sm += cuda.blockDim.x
#         k    += cuda.blockDim.x

#     cuda.syncthreads()
    
#     k_sm = cuda.threadIdx.x+filter_size-1
#     value = 0.0
#     for j in range(filter_size):
#         value += x_sm[k_sm-j]*h[j]
#     y[i] = value

    
# def convolve(y, x, h):
#     thread_block_size = min(thread_block_size_deafult, len(y))
#     block_size = (thread_block_size, )
#     grid_size = (math.ceil(len(y)/block_size[0]), )
#     filter_size = len(h)
#     shared_memory_size = thread_block_size+filter_size-1
#     shared_memory_size_bytes = shared_memory_size * y.dtype.itemsize
#     convolve_kernel[grid_size, block_size, 0, shared_memory_size_bytes](y, x, h)  
    
    
# x = np.array([0, 1, 2, 3, 4])
# h = np.array([0, 1, 2])
# y_gpu = cuda.device_array(len(x))

# convolve(y_gpu, x, h)

# y_host = y_gpu.copy_to_host()
# np.testing.assert_equal(y_host, [0, 0, 1, 4, 7])
# y_host

In [3]:
%%writefile 2.2-convolve-1d-shared-memory.py

import math
import numpy as np
from numba import cuda, float32

thread_block_size_deafult = 256


@cuda.jit
def convolve_kernel(y, x, h):
    """Causal 1-D convolution: ``y[i] = sum_j x[i - j] * h[j]``.

    Each thread block first stages the slice of ``x`` it needs into
    dynamically sized shared memory (``blockDim.x + len(h) - 1`` float32
    elements, supplied by the launch configuration), then every thread
    computes one output sample from that tile.

    Parameters
    ----------
    y : 1-D array
        Output; one thread writes one element.
    x : 1-D array
        Input signal. Positions outside ``[0, len(x))`` are treated as 0.
    h : 1-D array
        Filter taps.
    """
    i = cuda.blockIdx.x*cuda.blockDim.x + cuda.threadIdx.x

    filter_size = len(h)
    x_sm = cuda.shared.array(shape=0, dtype=float32)
    sm_size = cuda.blockDim.x+filter_size-1

    # Copy a portion of the global memory data to shared memory.
    # NOTE: every thread of the block takes part in this copy, including the
    # ones whose output index is out of range. In a partially filled last
    # block the in-range threads read tile slots that are loaded by the
    # out-of-range threads, so those must not exit before the barrier.
    k = i - (filter_size-1) # The current position in the global memory.
    k_sm = cuda.threadIdx.x # The current position in the shared memory.
    while k_sm < sm_size:
        # Zero-pad on both sides: before the start of the signal and past
        # its end (the latter happens in a partially filled last block).
        if k < 0 or k >= x.shape[0]:
            x_sm[k_sm] = float32(0.0)
        else:
            x_sm[k_sm] = x[k]
        k_sm += cuda.blockDim.x
        k    += cuda.blockDim.x

    # All threads of the block reach this barrier; the tile is complete after it.
    cuda.syncthreads()

    # Only now drop the threads that have no output element to produce.
    if i >= y.shape[0]:
        return

    # Tile slot holding x[i]; walking backwards from it yields x[i - j].
    k_sm = cuda.threadIdx.x+filter_size-1
    value = float32(0.0)
    for j in range(filter_size):
        value += x_sm[k_sm-j]*h[j]
    y[i] = value

    
def convolve(y, x, h):
    """Launch ``convolve_kernel`` to compute the convolution of ``x`` with ``h`` into ``y``.

    One thread is launched per element of ``y``. The block size is capped at
    ``thread_block_size_deafult`` and the dynamic shared memory is sized for
    a tile of ``block size + len(h) - 1`` float32 elements.

    Parameters
    ----------
    y : 1-D array
        Output buffer, written by the kernel.
    x : 1-D array
        Input signal.
    h : 1-D array
        Filter taps.

    Returns
    -------
    None
        The result is written into ``y``. An empty ``y`` is a no-op.
    """
    n_outputs = len(y)
    if n_outputs == 0:
        # Nothing to compute. Without this guard the block size would be 0
        # and the grid size computation below would divide by zero.
        return

    thread_block_size = min(thread_block_size_deafult, n_outputs)
    block_size = (thread_block_size, )
    # Integer ceil-division: enough blocks to cover every output element.
    grid_size = ((n_outputs + thread_block_size - 1)//thread_block_size, )
    filter_size = len(h)
    shared_memory_size = thread_block_size+filter_size-1
    # The kernel declares its shared tile as float32, so the byte count must
    # use the float32 item size regardless of the dtype of y.
    shared_memory_size_bytes = shared_memory_size*np.dtype(np.float32).itemsize
    convolve_kernel[grid_size, block_size, cuda.default_stream(), shared_memory_size_bytes](y, x, h)
    
# Benchmark workload: the convolution is repeated on fresh random data so
# that a profiler run over this script sees several kernel launches and
# host<->device transfers.
n = 2**24  # number of signal samples (equal to 1024*64*16*16)

for i in range(20):
    # Host-side inputs: random float32 signal and a 256-tap random filter.
    x_host = np.random.rand(n).astype(np.float32)
    h_host = np.random.rand(256).astype(np.float32)
    # Device-side output buffer, then the input transfers (signal, filter).
    y_gpu = cuda.device_array(n, dtype=np.float32)
    x_gpu = cuda.to_device(x_host)
    h_gpu = cuda.to_device(h_host)
    # Run the kernel and bring the result back to the host.
    convolve(y_gpu, x_gpu, h_gpu)
    y_host = y_gpu.copy_to_host()

Overwriting 2.2-convolve-1d-shared-memory.py


In [4]:
! nvprof python 2.2-convolve-1d-shared-memory.py

==39273== NVPROF is profiling process 39273, command: python 2.2-convolve-1d-shared-memory.py
==39273== Profiling application: python 2.2-convolve-1d-shared-memory.py
==39273== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   82.35%  4.33889s        20  216.94ms  215.59ms  220.54ms  cudapy::__main__::convolve_kernel$241(Array<float, int=1, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)
                    9.30%  490.15ms        40  12.254ms     928ns  24.752ms  [CUDA memcpy HtoD]
                    8.34%  439.63ms        20  21.981ms  21.596ms  22.360ms  [CUDA memcpy DtoH]
      API calls:   86.91%  4.78460s        20  239.23ms  237.76ms  243.08ms  cuMemcpyDtoH
                    8.79%  483.79ms        40  12.095ms  11.866us  24.378ms  cuMemcpyHtoD
                    2.27%  125.20ms         1  125.20ms  125.20ms  125.20ms  cuDevicePrimaryCtxRetain
        

In [5]:
! ncu --section MemoryWorkloadAnalysis python 2.2-convolve-1d-shared-memory.py

==PROF== Connected to process 39309 (/home/pjarosik/bin/miniconda3/envs/ius2021sc/bin/python3.8)
==ERROR== Error: ERR_NVGPUCTRPERM - The user does not have permission to access NVIDIA GPU Performance Counters on the target device 0. For instructions on enabling permissions and to get more information see https://developer.nvidia.com/ERR_NVGPUCTRPERM
==PROF== Disconnected from process 39309
==ERROR== An error occurred while trying to profile.


In [None]:
# ! nsys profile --stats=true -t cuda python 2.2-convolve-1d-shared-memory.py
! ncu python 2.2-convolve-1d-shared-memory.py

==PROF== Connected to process 39352 (/home/pjarosik/bin/miniconda3/envs/ius2021sc/bin/python3.8)
==ERROR== Error: ERR_NVGPUCTRPERM - The user does not have permission to access NVIDIA GPU Performance Counters on the target device 0. For instructions on enabling permissions and to get more information see https://developer.nvidia.com/ERR_NVGPUCTRPERM
