# 2.3 Memory model: constant memory

In [1]:
import math
from numba import cuda, float32, int32
import cupy as cp
import numpy as np
from tests import test_convolve_const

## How much constant memory we have?

In [2]:
device_props = cp.cuda.runtime.getDeviceProperties(0)

print(f"Device name: {device_props['name']}")
print(f"Shared memory per thread block: {device_props['totalConstMem']} [bytes]")

Device name: b'GeForce MX250'
Shared memory per thread block: 65536 [bytes]


## How to use constant memory?

In [3]:
THREAD_BLOCK_SIZE = 256

- if we assume that filter coefficients doesn't change in the runtime, we can put them into the constant memory

In [4]:
h_host_const = np.random.rand(5).astype(np.float32)

@cuda.jit
def convolve_gpu_kernel(y, x):
    i = cuda.grid(1)
    
    if i >= y.shape[0]:
        return
    
    # Constant memory
    h_gpu_const = cuda.const.array_like(h_host_const)
    
    M = len(x)
    N = len(h_gpu_const)
    OFFSET = int32(math.ceil(N/2)-1)
    
    
    # Shared memory
    x_shared = cuda.shared.array(shape=0, dtype=float32)
    SHARED_SIZE = cuda.blockDim.x+N-1
    
    
    # Copy a portion of data from global memory to shared memory.
    # The current position in the global memory.
    k = i-(N-1)+OFFSET 
    # The current position in the shared memory.
    k_shared = cuda.threadIdx.x 
    while k_shared < SHARED_SIZE:
        if k >= 0 and k < M:
            x_shared[k_shared] = x[k]
        else:
            x_shared[k_shared] = float32(0.0)
        k_shared += cuda.blockDim.x
        k        += cuda.blockDim.x

    cuda.syncthreads()
    
    k_shared = cuda.threadIdx.x+N-1
    value = float32(0.0)
    for j in range(N):
        value += x_shared[k_shared-j]*h_gpu_const[j]
        
    y[i] = value

In [5]:
def convolve_gpu(y, x):
    if y is None:
        y = cuda.device_array(x.shape, dtype=x.dtype)
    
    # Determine thread and block size.
    n_threads = min(THREAD_BLOCK_SIZE, len(y))
    block_size = (n_threads, )
    grid_size = (math.ceil(len(y)/block_size[0]), )
    
    # Determine shared memory size.
    N = len(h_host_const)
    SHARED_SIZE = THREAD_BLOCK_SIZE+N-1
    SHARED_SIZE_BYTES = SHARED_SIZE*y.dtype.itemsize
    
    if SHARED_SIZE_BYTES > device_props['sharedMemPerBlock']:    
        raise ValueError("Declared shared memory size exceeds the amount available for the device.")
    
    # Execute the kernel.
    convolve_gpu_kernel[grid_size, block_size, cuda.default_stream(), SHARED_SIZE_BYTES](y, x)
    return y.copy_to_host()

In [6]:
test_convolve_const(lambda x: convolve_gpu(None, x), h_host_const)

All tests passed.


In [7]:
%%writefile 2_3_convolve_const_memory.py


import math
from numba import cuda, float32, int32
import cupy as cp
import numpy as np
from tests import benchmark_convolve_const, DEFAULT_BENCHMARK_H_SIZE


THREAD_BLOCK_SIZE = 256


h_host_const = np.random.rand(DEFAULT_BENCHMARK_H_SIZE).astype(np.float32)


@cuda.jit
def convolve_gpu_kernel(y, x):
    i = cuda.grid(1)
    
    if i >= y.shape[0]:
        return
    
    # Constant memory
    h_gpu_const = cuda.const.array_like(h_host_const)
    
    M = len(x)
    N = len(h_gpu_const)
    OFFSET = int32(math.ceil(N/2)-1)
    
    
    # Shared memory
    x_shared = cuda.shared.array(shape=0, dtype=float32)
    SHARED_SIZE = cuda.blockDim.x+N-1
    
    
    # Copy a portion of data from global memory to shared memory.
    # The current position in the global memory.
    k = i-(N-1)+OFFSET 
    # The current position in the shared memory.
    k_shared = cuda.threadIdx.x 
    while k_shared < SHARED_SIZE:
        if k >= 0 and k < M:
            x_shared[k_shared] = x[k]
        else:
            x_shared[k_shared] = float32(0.0)
        k_shared += cuda.blockDim.x
        k        += cuda.blockDim.x

    cuda.syncthreads()
    
    k_shared = cuda.threadIdx.x+N-1
    value = float32(0.0)
    for j in range(N):
        value += x_shared[k_shared-j]*h_gpu_const[j]
        
    y[i] = value

    
def convolve_gpu(y, x):
    if y is None:
        y = cuda.device_array(x.shape, dtype=x.dtype)
    
    # Determine thread and block size.
    n_threads = min(THREAD_BLOCK_SIZE, len(y))
    block_size = (n_threads, )
    grid_size = (math.ceil(len(y)/block_size[0]), )
    
    # Determine shared memory size.
    N = len(h_host_const)
    SHARED_SIZE = THREAD_BLOCK_SIZE+N-1
    SHARED_SIZE_BYTES = SHARED_SIZE*y.dtype.itemsize
    
    # Execute the kernel.
    convolve_gpu_kernel[grid_size, block_size, cuda.default_stream(), SHARED_SIZE_BYTES](y, x)
    return y.copy_to_host()    

benchmark_convolve_const(lambda x: convolve_gpu(None, x), h_host_const)

Overwriting 2_3_convolve_const_memory.py


In [8]:
! nvprof python 2_3_convolve_const_memory.py

==28454== NVPROF is profiling process 28454, command: python 2_3_convolve_const_memory.py
Benchmark result: 
Average processing time: 0.0229 seconds (+/- 0.0634), median: 0.0159
==28454== Profiling application: python 2_3_convolve_const_memory.py
==28454== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   72.65%  1.08956s       100  10.896ms  10.618ms  13.389ms  cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=1, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)
                   17.42%  261.21ms       200  1.3060ms  1.2894ms  2.1076ms  [CUDA memcpy DtoH]
                    9.94%  149.02ms       100  1.4902ms  1.3706ms  1.5719ms  [CUDA memcpy HtoD]
      API calls:   73.87%  1.28092s       100  12.809ms  12.462ms  15.842ms  cuMemcpyDtoHAsync
                    8.42%  146.07ms       100  1.4607ms  1.4382ms  1.7524ms  cuMemcpyDtoH
                    6.22%  107.95ms       100  1.0795ms  984.27