# 3.1. Performance guidelines: memory coalescing

Let's go back for a while to the basic CUDA convolution kernel implementation, without shared or constant memory (for the sake of simplicity).

Let's extend the example so that it can perform a 1-D convolution along one axis of a 2-D array.

In [32]:
import math
import numpy as np
from numba import cuda, float32, int32
import cupy as cp

In [33]:
@cuda.jit
def convolve_gpu_kernel(y, x, h):
    """Convolve every row of ``x`` with the 1-D filter ``h``.

    Each thread produces a single output element ``y[row, col]``. The
    thread's x coordinate selects the column and its y coordinate selects
    the row, so the convolution runs along the second axis of ``x``.
    Input samples that would fall outside a row are skipped, i.e. they
    contribute nothing to the sum.

    Parameters
    ----------
    y : 2-D array written by the kernel, indexed as ``y[row, col]``.
    x : 2-D input array, indexed as ``x[row, col]``.
    h : 1-D array of filter coefficients.
    """
    # Global thread coordinates inside the launch grid.
    col = cuda.threadIdx.x + cuda.blockDim.x*cuda.blockIdx.x
    row = cuda.threadIdx.y + cuda.blockDim.y*cuda.blockIdx.y

    n_rows = x.shape[0]
    n_cols = x.shape[1]

    # The grid may be larger than the array; surplus threads do nothing.
    if row < n_rows and col < n_cols:
        n_taps = len(h)
        # Shift applied to the input index so that the filter is roughly
        # centred on the output sample.
        shift = int32(math.ceil(n_taps/2)-1)

        acc = float32(0.0)
        for tap in range(n_taps):
            src = col + shift - tap
            # Samples beyond either end of the row are ignored.
            if src < 0 or src >= n_cols:
                continue
            acc += x[row, src]*h[tap]

        y[row, col] = acc
    
    
def convolve_gpu(y, x, h):
    """Launch ``convolve_gpu_kernel`` on a grid covering every element of ``x``.

    Parameters
    ----------
    y : output array handed to the kernel; the kernel writes one value for
        each element position of ``x``, so it is expected to be at least
        as large as ``x``.
    x : 2-D input array; its ``shape`` determines the grid size.
    h : 1-D filter, passed through to the kernel unchanged.
    """
    block_size = (32, 32)
    height, width = x.shape
    # The left most index is the most quickly changing one. The kernel
    # compares its x coordinate against the width and its y coordinate
    # against the height, so grid axis 0 spans the width and grid axis 1
    # the height. Each grid axis is rounded up with the block extent of the
    # *same* axis, which keeps the coverage correct for non-square blocks.
    grid_size = (
        math.ceil(width/block_size[0]),
        math.ceil(height/block_size[1]),
    )
    convolve_gpu_kernel[grid_size, block_size](y, x, h)

For example:

In [34]:
# Small, hand-checkable input: three rows of five samples each.
x_host = np.array(
    [[ 0,  1,  2,  3,  4],
     [10, 20, 30, 40, 50],
     [-1, -2, -3, -4, -5]],
    dtype=np.float32,
)

# Three-tap filter, stored as float32 like the input.
h_host = np.array([0, 1, 2], dtype=np.float32)

# Upload the input and the filter, and allocate an output array with the
# same shape and dtype as the input on the device.
x_gpu = cuda.to_device(x_host)
h_gpu = cuda.to_device(h_host)
y_gpu = cuda.device_array(shape=x_gpu.shape, dtype=x_gpu.dtype)
print(y_gpu.shape)

# Run the convolution and bring the result back to the host for display.
convolve_gpu(y_gpu, x_gpu, h_gpu)
print(y_gpu.copy_to_host())

(3, 5)
[[  0.   1.   4.   7.  10.]
 [ 10.  40.  70. 100. 130.]
 [ -1.  -4.  -7. -10. -13.]]


Depending on the axis along which we would like to perform the convolution, we will get two different performance results.

### I. Convolve along the first axis

In [35]:
%%writefile 3_1_memory_coalescing_axis_1.py


import math
import numpy as np
from numba import cuda, float32, int32
import cupy as cp


@cuda.jit
def convolve_gpu_kernel(y, x, h):
    """Convolve every column of ``x`` with the 1-D filter ``h``.

    Each thread produces a single output element ``y[row, col]``. Here the
    thread's x coordinate selects the row and its y coordinate selects the
    column, so the convolution runs along the first axis of ``x`` and
    threads that are neighbours in x touch elements of different rows.
    Input samples that would fall outside a column are skipped, i.e. they
    contribute nothing to the sum.

    Parameters
    ----------
    y : 2-D array written by the kernel, indexed as ``y[row, col]``.
    x : 2-D input array, indexed as ``x[row, col]``.
    h : 1-D array of filter coefficients.
    """
    # Global thread coordinates inside the launch grid.
    row = cuda.threadIdx.x + cuda.blockDim.x*cuda.blockIdx.x
    col = cuda.threadIdx.y + cuda.blockDim.y*cuda.blockIdx.y

    n_rows = x.shape[0]
    n_cols = x.shape[1]

    # The grid may be larger than the array; surplus threads do nothing.
    if row < n_rows and col < n_cols:
        n_taps = len(h)
        # Shift applied to the input index so that the filter is roughly
        # centred on the output sample.
        shift = int32(math.ceil(n_taps/2)-1)

        acc = float32(0.0)
        for tap in range(n_taps):
            src = row + shift - tap
            # Samples beyond either end of the column are ignored.
            if src < 0 or src >= n_rows:
                continue
            acc += x[src, col]*h[tap]

        y[row, col] = acc
    
    
def convolve_gpu(y, x, h):
    """Launch ``convolve_gpu_kernel`` on a grid covering every element of ``x``.

    Parameters
    ----------
    y : output array handed to the kernel; the kernel writes one value for
        each element position of ``x``, so it is expected to be at least
        as large as ``x``.
    x : 2-D input array; its ``shape`` determines the grid size.
    h : 1-D filter, passed through to the kernel unchanged.
    """
    block_size = (32, 32)
    height, width = x.shape
    # The left most index is the most quickly changing one. In this variant
    # the kernel uses its x coordinate as the row index (checked against
    # the height) and its y coordinate as the column index (checked against
    # the width). Grid axis 0 therefore has to span the height and grid
    # axis 1 the width; sizing them the other way round leaves part of a
    # non-square input unprocessed.
    grid_size = (
        math.ceil(height/block_size[0]),
        math.ceil(width/block_size[1]),
    )
    convolve_gpu_kernel[grid_size, block_size](y, x, h)
    
    
# Repeat the whole round trip (upload, convolve, download) ten times so the
# profiler reports statistics over several kernel launches and transfers.
for _ in range(10):
    # Fresh random float32 input and a 32-tap filter for every run.
    x_host = np.random.rand(256, 256).astype(np.float32)
    h_host = np.random.rand(32).astype(np.float32)

    x_gpu = cuda.to_device(x_host)
    h_gpu = cuda.to_device(h_host)
    # Output array on the device with the same shape and dtype as the input.
    y_gpu = cuda.device_array(shape=x_gpu.shape, dtype=x_gpu.dtype)

    convolve_gpu(y_gpu, x_gpu, h_gpu)
    y_host = y_gpu.copy_to_host()

Overwriting 3_1_memory_coalescing_axis_1.py


In [36]:
! nvprof --trace gpu python 3_1_memory_coalescing_axis_1.py

==8889== NVPROF is profiling process 8889, command: python 3_1_memory_coalescing_axis_1.py
==8889== Profiling application: python 3_1_memory_coalescing_axis_1.py
==8889== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   69.06%  3.7648ms        10  376.48us  371.98us  378.80us  cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)
                   16.17%  881.24us        20  44.062us     864ns  91.644us  [CUDA memcpy HtoD]
                   14.77%  805.21us        10  80.521us  79.292us  89.597us  [CUDA memcpy DtoH]
No API activities were profiled.


### II. Convolve along the second axis

In [37]:
%%writefile 3_1_memory_coalescing_axis_2.py


import math
import numpy as np
from numba import cuda, float32, int32
import cupy as cp


@cuda.jit
def convolve_gpu_kernel(y, x, h):
    """Convolve every row of ``x`` with the 1-D filter ``h``.

    Each thread produces a single output element ``y[row, col]``. The
    thread's x coordinate selects the column and its y coordinate selects
    the row, so the convolution runs along the second axis of ``x`` and
    threads that are neighbours in x touch neighbouring elements of the
    same row. Input samples that would fall outside a row are skipped,
    i.e. they contribute nothing to the sum.

    Parameters
    ----------
    y : 2-D array written by the kernel, indexed as ``y[row, col]``.
    x : 2-D input array, indexed as ``x[row, col]``.
    h : 1-D array of filter coefficients.
    """
    # Global thread coordinates inside the launch grid.
    col = cuda.threadIdx.x + cuda.blockDim.x*cuda.blockIdx.x
    row = cuda.threadIdx.y + cuda.blockDim.y*cuda.blockIdx.y

    n_rows = x.shape[0]
    n_cols = x.shape[1]

    # The grid may be larger than the array; surplus threads do nothing.
    if row < n_rows and col < n_cols:
        n_taps = len(h)
        # Shift applied to the input index so that the filter is roughly
        # centred on the output sample.
        shift = int32(math.ceil(n_taps/2)-1)

        acc = float32(0.0)
        for tap in range(n_taps):
            src = col + shift - tap
            # Samples beyond either end of the row are ignored.
            if src < 0 or src >= n_cols:
                continue
            acc += x[row, src]*h[tap]

        y[row, col] = acc
    
    
def convolve_gpu(y, x, h):
    """Launch ``convolve_gpu_kernel`` on a grid covering every element of ``x``.

    Parameters
    ----------
    y : output array handed to the kernel; the kernel writes one value for
        each element position of ``x``, so it is expected to be at least
        as large as ``x``.
    x : 2-D input array; its ``shape`` determines the grid size.
    h : 1-D filter, passed through to the kernel unchanged.
    """
    block_size = (32, 32)
    height, width = x.shape
    # The left most index is the most quickly changing one. The kernel
    # compares its x coordinate against the width and its y coordinate
    # against the height, so grid axis 0 spans the width and grid axis 1
    # the height. Each grid axis is rounded up with the block extent of the
    # *same* axis, which keeps the coverage correct for non-square blocks.
    grid_size = (
        math.ceil(width/block_size[0]),
        math.ceil(height/block_size[1]),
    )
    convolve_gpu_kernel[grid_size, block_size](y, x, h)
    
# Repeat the whole round trip (upload, convolve, download) ten times so the
# profiler reports statistics over several kernel launches and transfers.
for _ in range(10):
    # Fresh random float32 input and a 32-tap filter for every run.
    x_host = np.random.rand(256, 256).astype(np.float32)
    h_host = np.random.rand(32).astype(np.float32)

    x_gpu = cuda.to_device(x_host)
    h_gpu = cuda.to_device(h_host)
    # Output array on the device with the same shape and dtype as the input.
    y_gpu = cuda.device_array(shape=x_gpu.shape, dtype=x_gpu.dtype)

    convolve_gpu(y_gpu, x_gpu, h_gpu)
    y_host = y_gpu.copy_to_host()

Overwriting 3_1_memory_coalescing_axis_2.py


In [38]:
! nvprof --trace gpu python 3_1_memory_coalescing_axis_2.py

==8917== NVPROF is profiling process 8917, command: python 3_1_memory_coalescing_axis_2.py
==8917== Profiling application: python 3_1_memory_coalescing_axis_2.py
==8917== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   55.58%  2.2042ms        10  220.42us  218.23us  222.68us  cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)
                   22.29%  883.87us        20  44.193us     863ns  97.532us  [CUDA memcpy HtoD]
                   22.13%  877.69us        10  87.769us  79.261us  158.62us  [CUDA memcpy DtoH]
No API activities were profiled.


In [39]:
! nvprof --profile-api-trace none --metrics gld_efficiency,gst_efficiency python 3_1_memory_coalescing_axis_1.py

==8946== NVPROF is profiling process 8946, command: python 3_1_memory_coalescing_axis_1.py
==8946== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
==8946== Replaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (1 of 4)... 
	4 internal events
==8946== [1A
[K[2A[K
[K
[2A[KReplaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (2 of 4)... 
	2 internal events
==8946== [1A
[K[2A[K
[K
[2A[KReplaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (3 of 4)... 
	1 internal events
==8946== [1A
[K[2A[K
[K
[2A[KReplaying kernel "cudapy::__main__::convol

==8946== [1A
[K[2A[K
[K
[2A[KReplaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (2 of 4)... 
	2 internal events
==8946== [1A
[K[2A[K
[K
[2A[KReplaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (3 of 4)... 
	4 internal events
==8946== [1A
[K[2A[K
[K
[2A[KReplaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (4 of 4)... 
	1 internal events
==8946== [1A
[K[2A[K
[1A[KReplaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (done)
==8946== 

In [40]:
! nvprof --profile-api-trace none --metrics gld_efficiency,gst_efficiency python 3_1_memory_coalescing_axis_2.py

==8976== NVPROF is profiling process 8976, command: python 3_1_memory_coalescing_axis_2.py
==8976== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
==8976== Replaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (1 of 4)... 
	1 internal events
==8976== [1A
[K[2A[K
[K
[2A[KReplaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (2 of 4)... 
	2 internal events
==8976== [1A
[K[2A[K
[K
[2A[KReplaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (3 of 4)... 
	4 internal events
==8976== [1A
[K[2A[K
[K
[2A[KReplaying kernel "cudapy::__main__::convol

==8976== [1A
[K[2A[K
[K
[2A[KReplaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (2 of 4)... 
	1 internal events
==8976== [1A
[K[2A[K
[K
[2A[KReplaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (3 of 4)... 
	2 internal events
==8976== [1A
[K[2A[K
[K
[2A[KReplaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (4 of 4)... 
	4 internal events
==8976== [1A
[K[2A[K
[1A[KReplaying kernel "cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=2, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)" (done)
==8976== 