In [1]:
# import math
# import numpy as np
# from numba import cuda, float32

# thread_block_size_deafult = 256

In [2]:
# @cuda.jit
# def convolve_kernel(y, x, h):
#     i = cuda.blockIdx.x*cuda.blockDim.x + cuda.threadIdx.x
    
#     if i >= y.shape[0]:
#         return
    
#     filter_size = len(h)
#     x_sm = cuda.shared.array(shape=0, dtype=float32)
#     sm_size = cuda.blockDim.x+filter_size-1
    
#     # Copy a portion of the global memory data into shared memory.
#     k = i - (filter_size-1) # The current position in the global memory.
#     k_sm = cuda.threadIdx.x # The current position in the shared memory.  
#     while k_sm < sm_size:
#         if k < 0:
#             x_sm[k_sm] = 0.0
#         else:
#             x_sm[k_sm] = x[k]
#         k_sm += cuda.blockDim.x
#         k    += cuda.blockDim.x

#     cuda.syncthreads()
    
#     k_sm = cuda.threadIdx.x+filter_size-1
#     value = 0.0
#     for j in range(filter_size):
#         value += x_sm[k_sm-j]*h[j]
#     y[i] = value

    
# def convolve(y, x, h):
#     thread_block_size = min(thread_block_size_deafult, len(y))
#     block_size = (thread_block_size, )
#     grid_size = (math.ceil(len(y)/block_size[0]), )
#     filter_size = len(h)
#     shared_memory_size = thread_block_size+filter_size-1
#     shared_memory_size_bytes = shared_memory_size * y.dtype.itemsize
#     convolve_kernel[grid_size, block_size, 0, shared_memory_size_bytes](y, x, h)  
    
    
# x = np.array([0, 1, 2, 3, 4])
# h = np.array([0, 1, 2])
# y_gpu = cuda.device_array(len(x))

# convolve(y_gpu, x, h)

# y_host = y_gpu.copy_to_host()
# np.testing.assert_equal(y_host, [0, 0, 1, 4, 7])
# y_host

In [3]:
%%writefile 2.2-convolve-1d-shared-memory.py

import math
import numpy as np
from numba import cuda, float32

thread_block_size_deafult = 256


@cuda.jit
def convolve_kernel(y, x, h):
    """Causal 1D convolution: ``y[i] = sum_j x[i-j] * h[j]`` for j in [0, len(h)).

    Each thread block first stages the ``blockDim.x + len(h) - 1`` input
    samples it needs into a float32 shared-memory tile, then every thread
    computes one output sample from that tile.

    Parameters
    ----------
    y : 1D device array
        Output; one element is written per thread with a valid index.
    x : 1D device array
        Input signal. Samples outside ``[0, x.shape[0])`` are treated as 0.
    h : 1D device array
        Filter coefficients.

    Notes
    -----
    The shared array is declared with ``shape=0``, so its size must be given
    in bytes in the launch configuration and must hold at least
    ``blockDim.x + len(h) - 1`` float32 values.
    """
    tid = cuda.threadIdx.x
    block_dim = cuda.blockDim.x
    i = cuda.blockIdx.x*block_dim + tid

    filter_size = len(h)
    n_x = x.shape[0]
    x_sm = cuda.shared.array(shape=0, dtype=float32)
    sm_size = block_dim+filter_size-1

    # Copy a portion of the global memory data into shared memory.
    # Every thread of the block takes part in the copy, including threads
    # whose output index is past the end of y: the tile is filled in strides
    # of blockDim.x, so skipping those threads would leave slots unwritten
    # that the remaining threads read below.
    k = i - (filter_size-1) # The current position in the global memory.
    k_sm = tid              # The current position in the shared memory.
    while k_sm < sm_size:
        if k < 0 or k >= n_x:
            # Zero-pad on both sides of the input signal.
            x_sm[k_sm] = float32(0.0)
        else:
            x_sm[k_sm] = x[k]
        k_sm += block_dim
        k    += block_dim

    # All threads of the block reach this barrier before anyone reads the tile.
    cuda.syncthreads()

    # Only now drop the threads that have no output element to produce.
    if i >= y.shape[0]:
        return

    k_sm = tid+filter_size-1
    value = float32(0.0)
    for j in range(filter_size):
        value += x_sm[k_sm-j]*h[j]
    y[i] = value

    
def convolve(y, x, h):
    """Launch ``convolve_kernel`` to compute the causal convolution of x with h.

    Parameters
    ----------
    y : 1D array
        Output array; its length determines how many threads are launched.
    x : 1D array
        Input signal.
    h : 1D array
        Filter coefficients.

    The launch uses one thread per output element, at most
    ``thread_block_size_deafult`` threads per block, the default CUDA stream,
    and enough dynamic shared memory for ``block size + len(h) - 1`` float32
    values.
    """
    n_out = len(y)
    if n_out == 0:
        # Nothing to compute; also avoids a zero-sized block and a division
        # by zero in the grid size computation below.
        return

    thread_block_size = min(thread_block_size_deafult, n_out)
    block_size = (thread_block_size, )
    # Integer ceiling division: number of blocks needed to cover all outputs.
    grid_size = ((n_out + thread_block_size - 1)//thread_block_size, )
    filter_size = len(h)
    shared_memory_size = thread_block_size+filter_size-1
    # The kernel declares its shared array as float32 regardless of y's dtype,
    # so the byte count must be based on float32, not on y.dtype.
    shared_memory_size_bytes = shared_memory_size*np.dtype(np.float32).itemsize
    convolve_kernel[grid_size, block_size, cuda.default_stream(), shared_memory_size_bytes](y, x, h)
    
# Profiling workload: every round draws fresh random float32 inputs on the
# host, moves them to the device, runs the convolution and copies the result
# back to the host.
n = 2**24  # == 1024*64*16*16 samples per signal

for _ in range(20):
    x_host = np.random.rand(n).astype(np.float32)
    h_host = np.random.rand(256).astype(np.float32)
    y_gpu = cuda.device_array(shape=(n,), dtype=np.float32)
    x_gpu, h_gpu = cuda.to_device(x_host), cuda.to_device(h_host)
    convolve(y_gpu, x_gpu, h_gpu)
    y_host = y_gpu.copy_to_host()

Overwriting 2.2-convolve-1d-shared-memory.py


In [4]:
! nsys profile --stats=true -t cuda python 2.2-convolve-1d-shared-memory.py

Collecting data...
Processing events...
Saving temporary "/tmp/nsys-report-7ebb-93a4-c60e-749d.qdstrm" file to disk...
Creating final output files...

Saved report file to "/tmp/nsys-report-7ebb-93a4-c60e-749d.qdrep"

Exported successfully to
/tmp/nsys-report-7ebb-93a4-c60e-749d.sqlite

Generating CUDA API Statistics...
CUDA API Statistics (nanoseconds)

Time(%)      Total Time       Calls         Average         Minimum         Maximum  Name                                                                            
-------  --------------  ----------  --------------  --------------  --------------  --------------------------------------------------------------------------------
   80.2       733124781          20      36656239.0        35987419        38240690  cuMemcpyDtoH_v2                                                                 
   14.6       133835705          40       3345892.6            7751        13451342  cuMemcpyHtoD_v2                                             

In [5]:
! ncu --section MemoryWorkloadAnalysis python 2.2-convolve-1d-shared-memory.py

==PROF== Connected to process 13207 (/opt/conda/envs/rapids/bin/python3.7)
==PROF== Profiling "convolve_kernel$241" - 1: 0%....50%....100% - 7 passes
==PROF== Profiling "convolve_kernel$241" - 2: 0%....50%....100% - 7 passes
==PROF== Profiling "convolve_kernel$241" - 3: 0%....50%....100% - 7 passes
==PROF== Profiling "convolve_kernel$241" - 4: 0%....50%....100% - 7 passes
==PROF== Profiling "convolve_kernel$241" - 5: 0%....50%....100% - 7 passes
==PROF== Profiling "convolve_kernel$241" - 6: 0%....50%....100% - 7 passes
==PROF== Profiling "convolve_kernel$241" - 7: 0%....50%....100% - 7 passes
==PROF== Profiling "convolve_kernel$241" - 8: 0%....50%....100% - 7 passes
==PROF== Profiling "convolve_kernel$241" - 9: 0%....50%....100% - 7 passes
==PROF== Profiling "convolve_kernel$241" - 10: 0%....50%....100% - 7 passes
==PROF== Profiling "convolve_kernel$241" - 11: 0%....50%....100% - 7 passes
==PROF== Profiling "convolve_kernel$241" - 12: 0%....50%....100% - 7 passes
==PROF== Profiling "co

In [6]:
# ! nsys profile --stats=true -t cuda python 2.2-convolve-1d-shared-memory.py
! ncu python 2.2-convolve-1d-shared-memory.py

==PROF== Connected to process 13231 (/opt/conda/envs/rapids/bin/python3.7)
==PROF== Profiling "convolve_kernel$241" - 1: 0%....50%....100% - 8 passes
==PROF== Profiling "convolve_kernel$241" - 2: 0%....50%....100% - 8 passes
==PROF== Profiling "convolve_kernel$241" - 3: 0%....50%....100% - 8 passes
==PROF== Profiling "convolve_kernel$241" - 4: 0%....50%....100% - 8 passes
==PROF== Profiling "convolve_kernel$241" - 5: 0%....50%....100% - 8 passes
==PROF== Profiling "convolve_kernel$241" - 6: 0%....50%....100% - 8 passes
==PROF== Profiling "convolve_kernel$241" - 7: 0%....50%....100% - 8 passes
==PROF== Profiling "convolve_kernel$241" - 8: 0%....50%....100% - 8 passes
==PROF== Profiling "convolve_kernel$241" - 9: 0%....50%....100% - 8 passes
==PROF== Profiling "convolve_kernel$241" - 10: 0%....50%....100% - 8 passes
==PROF== Profiling "convolve_kernel$241" - 11: 0%....50%....100% - 8 passes
==PROF== Profiling "convolve_kernel$241" - 12: 0%....50%....100% - 8 passes
==PROF== Profiling "co