# 1. CUDA basics: threads and kernels.

In [1]:
from numba import cuda
import numpy as np
import cupy as cp
import math

In [2]:
# Seed CuPy's generator so the random input below is the same on every
# Restart & Run All.
cp.random.seed(0)

# Large random input, later compared against cp.sum along the last axis.
# Requesting float32 directly avoids first building a float64 array of the
# same shape and then casting it.
a = cp.random.randn(192, 4096, 64, dtype=cp.float32)

# Tiny hand-written input whose per-row sums are easy to verify by eye.
a_small = cp.array([
    [[1,  2,  3],
     [4,  5,  6]],
    [[7,  8,  9],
     [10, 11, 12]]
], dtype=cp.float32)

In [15]:
# Allocate a CUDA array described by a float channel format.
# NOTE(review): the positional meanings annotated below follow my reading of
# CuPy's ChannelFormatDescriptor(x, y, z, w, kind) and
# CUDAarray(desc, width, height, depth, flags) signatures -- confirm.
a_texture = cp.cuda.texture.CUDAarray(
    cp.cuda.texture.ChannelFormatDescriptor(
        32, 0, 0, 0,  # presumably bits per channel: only the first is used
        cp.cuda.runtime.cudaChannelFormatKindFloat,
    ),
    3,  # presumably width
    2,  # presumably height
    2,  # presumably depth
    1,  # presumably flags -- TODO confirm which CUDA array flag 1 selects
)


## 1.1. Running a CUDA kernel with Numba.

### How to implement the kernel

In [3]:
@cuda.jit
def gpu_sum_rows(result, array):
    """Sum ``array`` over its last axis, one thread per output element.

    Each thread takes its 2D grid position ``(row, col)``, adds up
    ``array[row, col, k]`` for every ``k`` and stores the total in
    ``result[row, col]``.

    Parameters
    ----------
    result : array indexed as ``result[row, col]``; receives the sums.
    array : array indexed as ``array[row, col, k]``; the values to add.
    """
    row, col = cuda.grid(2)
    n_rows = array.shape[0]
    n_cols = array.shape[1]
    depth = array.shape[2]
    # The launch grid may be larger than the data; threads that fall outside
    # the first two dimensions of ``array`` must not touch memory.
    if row < n_rows and col < n_cols:
        total = 0
        for k in range(depth):
            total += array[row, col, k]
        result[row, col] = total

### How to run the kernel

In [10]:
def sum_rows(array, block_size=(16, 16)):
    """Sum a 3D device array over its last axis using ``gpu_sum_rows``.

    Parameters
    ----------
    array : 3D array accepted by the kernel (e.g. a CuPy array).
    block_size : tuple of two ints, optional
        Threads per block along the first and second dimension of ``array``.
        Defaults to ``(16, 16)``.

    Returns
    -------
    CuPy float32 array of shape ``array.shape[:2]`` holding the sums.
    """
    n_rows, n_cols = array.shape[:2]
    output_array = cp.zeros((n_rows, n_cols), dtype=np.float32)

    # Nothing to compute, and a grid dimension of 0 is not a valid launch
    # configuration, so return the (empty) result without launching.
    if output_array.size == 0:
        return output_array

    # Integer ceiling division: enough blocks to cover every output element,
    # including a partially filled last block in each dimension.
    grid_size = (
        (n_rows + block_size[0] - 1) // block_size[0],
        (n_cols + block_size[1] - 1) // block_size[1],
    )
    gpu_sum_rows[grid_size, block_size](output_array, array)
    return output_array
    
# Hand-checkable case: small integer-valued inputs, so the sums are exact.
result_small = sum_rows(a_small)
np.testing.assert_allclose(
    result_small.get(),
    np.array([[6, 15], [24, 33]], dtype=np.float32),
)
print("Result for small array:")
print(result_small)

# Large random case: compare with CuPy's own reduction. The two
# implementations may add the float32 values in a different order, so they
# can differ by rounding; a relative tolerance alongside the absolute one
# keeps the check stable for sums of larger magnitude.
result = sum_rows(a)
np.testing.assert_allclose(
    result.get(),
    cp.sum(a, axis=-1).get(),
    rtol=1e-5,
    atol=1e-5,
)

Result for small array:
[[ 6. 15.]
 [24. 33.]]


## 1.2. Passing data to/from a CUDA kernel.