# 2.2. Memory model: shared memory

In [1]:
import math
from numba import cuda, float32, int32
import cupy as cp
from tests import test_convolve, benchmark_convolve

## How much shared memory do we have?

Maximum shared memory size per thread block (bytes).

Note: CuPy seems to provide much more information about the device attributes than Numba does.

In [2]:
# Query the runtime properties of CUDA device 0 through CuPy. The result is
# indexed by property name below and is reused later in the notebook to
# validate the requested shared memory size.
device_props = cp.cuda.runtime.getDeviceProperties(0)

# The device name is printed exactly as stored (the recorded output shows it as bytes).
print(f"Device name: {device_props['name']}")
print(f"Shared memory per thread block: {device_props['sharedMemPerBlock']} [bytes]")

Device name: b'GeForce MX250'
Shared memory per thread block: 49152 [bytes]


A complete description of the device properties is available here:
https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html

The thread block size below, and the shared memory size that follows from it, are small enough to run on this GPU:

In [3]:
# Upper bound on the number of threads per block used when launching the kernel;
# it also determines how much shared memory is requested per block.
THREAD_BLOCK_SIZE = 256

## How to use shared memory?

In [4]:
@cuda.jit
def convolve_gpu_kernel(y, x, h):
    """Compute a zero-padded 1D convolution of ``x`` with ``h`` using a shared-memory tile.

    Each thread produces one output sample::

        y[i] = sum_{j=0}^{N-1} x[i + OFFSET - j] * h[j],   OFFSET = ceil(N/2) - 1

    where ``N = len(h)`` and samples of ``x`` outside ``[0, len(x))`` count as zero.

    The kernel must be launched with a 1D grid and with a dynamic shared
    memory size of at least ``(blockDim.x + N - 1)`` float32 elements.

    Parameters
    ----------
    y : 1D device array, output.
    x : 1D device array, input signal.
    h : 1D device array, filter coefficients.
    """
    i = cuda.grid(1)

    M = len(x)
    N = len(h)

    # Dynamically sized shared array: its actual size is given at kernel launch.
    x_shared = cuda.shared.array(shape=0, dtype=float32)
    SHARED_SIZE = cuda.blockDim.x+N-1
    OFFSET = int32(math.ceil(N/2)-1)

    # Copy a portion of data from global memory to shared memory.
    #
    # NOTE: every thread of the block takes part in the copy, including the
    # threads of the last block that have no output sample to compute
    # (i >= len(y)). The tile is filled cooperatively with a stride of
    # blockDim.x, so a thread that left early would leave "its" tile elements
    # uninitialized, and it would also skip the barrier below.

    # The current position in the global memory.
    k = i-(N-1)+OFFSET
    # The current position in the shared memory.
    k_shared = cuda.threadIdx.x
    while k_shared < SHARED_SIZE:
        if k >= 0 and k < M:
            x_shared[k_shared] = x[k]
        else:
            # Zero padding outside of the input signal.
            x_shared[k_shared] = float32(0.0)
        k_shared += cuda.blockDim.x
        k        += cuda.blockDim.x

    # All threads of the block must reach this barrier before the tile is read.
    cuda.syncthreads()

    # Only threads that map to an existing output sample compute and store.
    if i < y.shape[0]:
        k_shared = cuda.threadIdx.x+N-1
        value = float32(0.0)
        for j in range(N):
            value += x_shared[k_shared-j]*h[j]

        y[i] = value

In [5]:
def convolve_gpu(y, x, h):
    """Run ``convolve_gpu_kernel`` on ``x`` and ``h`` and return the result on the host.

    Parameters
    ----------
    y : device array or None
        Output buffer. When ``None``, a device array with the shape and dtype
        of ``x`` is allocated.
    x : 1D array, input signal.
    h : 1D array, filter coefficients.

    Returns
    -------
    The content of ``y`` copied to host memory.

    Raises
    ------
    ValueError
        If the required shared memory exceeds ``sharedMemPerBlock`` of the device.
    """
    if y is None:
        y = cuda.device_array(x.shape, dtype=x.dtype)

    # Nothing to compute; also avoids a division by zero in the grid size below.
    if len(y) == 0:
        return y.copy_to_host()

    # Determine thread and block size.
    n_threads = min(THREAD_BLOCK_SIZE, len(y))
    block_size = (n_threads, )
    grid_size = (math.ceil(len(y)/n_threads), )

    # Determine shared memory size.
    # The kernel stores blockDim.x+N-1 elements in its tile and always declares
    # the tile as float32, so the size must be computed for 4-byte elements
    # regardless of the dtype of the input/output arrays.
    FLOAT32_ITEMSIZE = 4
    N = len(h)
    SHARED_SIZE = n_threads+N-1
    SHARED_SIZE_BYTES = SHARED_SIZE*FLOAT32_ITEMSIZE

    if SHARED_SIZE_BYTES > device_props['sharedMemPerBlock']:
        raise ValueError("Declared shared memory size exceeds the amount available for the device.")

    # Execute the kernel.
    convolve_gpu_kernel[grid_size, block_size, cuda.default_stream(), SHARED_SIZE_BYTES](y, x, h)
    return y.copy_to_host()

In [6]:
# Run the project's convolution tests against the GPU implementation;
# passing y=None makes convolve_gpu allocate the output array itself.
test_convolve(lambda x, h: convolve_gpu(None, x, h))

All tests passed.


## How much improvement does using shared memory give us?

In [7]:
%%writefile 2_2_convolve_shared_memory.py

import math
from numba import cuda, float32, int32
import cupy as cp
from tests import test_convolve, benchmark_convolve

# Upper bound on the number of threads per block used when launching the kernel.
THREAD_BLOCK_SIZE = 256

@cuda.jit
def convolve_gpu_kernel(y, x, h):
    """Compute a zero-padded 1D convolution of ``x`` with ``h`` using a shared-memory tile.

    Each thread produces one output sample::

        y[i] = sum_{j=0}^{N-1} x[i + OFFSET - j] * h[j],   OFFSET = ceil(N/2) - 1

    where ``N = len(h)`` and samples of ``x`` outside ``[0, len(x))`` count as zero.

    The kernel must be launched with a 1D grid and with a dynamic shared
    memory size of at least ``(blockDim.x + N - 1)`` float32 elements.

    Parameters
    ----------
    y : 1D device array, output.
    x : 1D device array, input signal.
    h : 1D device array, filter coefficients.
    """
    i = cuda.grid(1)

    M = len(x)
    N = len(h)

    # Dynamically sized shared array: its actual size is given at kernel launch.
    x_shared = cuda.shared.array(shape=0, dtype=float32)
    SHARED_SIZE = cuda.blockDim.x+N-1
    OFFSET = int32(math.ceil(N/2)-1)

    # Copy a portion of data from global memory to shared memory.
    #
    # NOTE: every thread of the block takes part in the copy, including the
    # threads of the last block that have no output sample to compute
    # (i >= len(y)). The tile is filled cooperatively with a stride of
    # blockDim.x, so a thread that left early would leave "its" tile elements
    # uninitialized, and it would also skip the barrier below.

    # The current position in the global memory.
    k = i-(N-1)+OFFSET
    # The current position in the shared memory.
    k_shared = cuda.threadIdx.x
    while k_shared < SHARED_SIZE:
        if k >= 0 and k < M:
            x_shared[k_shared] = x[k]
        else:
            # Zero padding outside of the input signal.
            x_shared[k_shared] = float32(0.0)
        k_shared += cuda.blockDim.x
        k        += cuda.blockDim.x

    # All threads of the block must reach this barrier before the tile is read.
    cuda.syncthreads()

    # Only threads that map to an existing output sample compute and store.
    if i < y.shape[0]:
        k_shared = cuda.threadIdx.x+N-1
        value = float32(0.0)
        for j in range(N):
            value += x_shared[k_shared-j]*h[j]

        y[i] = value
    
def convolve_gpu(y, x, h):
    """Run ``convolve_gpu_kernel`` on ``x`` and ``h`` and return the result on the host.

    Parameters
    ----------
    y : device array or None
        Output buffer. When ``None``, a device array with the shape and dtype
        of ``x`` is allocated.
    x : 1D array, input signal.
    h : 1D array, filter coefficients.

    Returns
    -------
    The content of ``y`` copied to host memory.

    Notes
    -----
    No check against the device's shared-memory limit is performed in this script.
    """
    if y is None:
        y = cuda.device_array(x.shape, dtype=x.dtype)

    # Nothing to compute; also avoids a division by zero in the grid size below.
    if len(y) == 0:
        return y.copy_to_host()

    # Determine thread and block size.
    n_threads = min(THREAD_BLOCK_SIZE, len(y))
    block_size = (n_threads, )
    grid_size = (math.ceil(len(y)/n_threads), )

    # Determine shared memory size.
    # The kernel stores blockDim.x+N-1 elements in its tile and always declares
    # the tile as float32, so the size must be computed for 4-byte elements
    # regardless of the dtype of the input/output arrays.
    FLOAT32_ITEMSIZE = 4
    N = len(h)
    SHARED_SIZE = n_threads+N-1
    SHARED_SIZE_BYTES = SHARED_SIZE*FLOAT32_ITEMSIZE

    # Execute the kernel.
    convolve_gpu_kernel[grid_size, block_size, cuda.default_stream(), SHARED_SIZE_BYTES](y, x, h)
    return y.copy_to_host()


# Benchmark the GPU implementation when the script is executed;
# passing y=None makes convolve_gpu allocate the output array on every call.
benchmark_convolve(lambda x, h: convolve_gpu(None, x, h))

Overwriting 2_2_convolve_shared_memory.py


In [8]:
! nvprof python 2_2_convolve_shared_memory.py

==25307== NVPROF is profiling process 25307, command: python 2_2_convolve_shared_memory.py
Benchmark result: 
Average processing time: 0.0255 seconds (+/- 0.0601), median: 0.0192
==25307== Profiling application: python 2_2_convolve_shared_memory.py
==25307== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   76.80%  1.36432s       100  13.643ms  13.227ms  15.811ms  cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=1, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)
                   14.84%  263.65ms       300  878.84us  1.1200us  2.5858ms  [CUDA memcpy DtoH]
                    8.36%  148.52ms       200  742.58us     896ns  1.5442ms  [CUDA memcpy HtoD]
      API calls:   77.61%  1.53060s       200  7.6530ms  12.728us  17.509ms  cuMemcpyDtoHAsync
                    7.45%  146.87ms       100  1.4687ms  1.4370ms  2.7823ms  cuMemcpyDtoH
                   