In [1]:
from numba import cuda
import time
import numpy as np

In [2]:
##Some GPU metrics

device = cuda.get_current_device()

print(f"Max threads per block - {device.MAX_THREADS_PER_BLOCK}") ##Work with this as this 
                                            #is required for locating idx

Max threads per block - 1024


#### Basic 1st GPU kernel using cuda jit from numbba - Square tthe elements of array


In [3]:

@cuda.jit
def square_kernel(ip, op):
    idx = cuda.grid(1) #ip is 1D

    if idx < ip.size: ## bounds check
        op[idx] = ip[idx] ** 2   ## Note that this is not a looping thing


##Data on CPU
ip = np.arange(1000000, dtype=np.int64)
op = np.zeros_like(ip)

##Copy data to GPU
d_arr = cuda.to_device(ip)
d_result = cuda.to_device(op)

threads_per_block = 1024
blocks_req = (ip.size + threads_per_block - 1) // threads_per_block

square_kernel[blocks_req, threads_per_block](d_arr, d_result)

result = d_result.copy_to_host()



In [4]:
del  d_result, ip, op, result, d_arr

#### Element-wise ops -start with addition

In [5]:
@cuda.jit
def add_arrays(a, b, arr_sum):
    idx = cuda.grid(1)

    if idx < arr_sum.size: ## this should probably do bounds check for all i/o
        arr_sum[idx] = a[idx] + b[idx]


##Create arrays on CPU
ip1 = np.random.rand(1000000)
ip2 = np.random.rand(1000000)

#Given this is size check its easier to assert this on CPU itself
assert ip1.size == ip2.size

res = np.zeros_like(ip1)
arr_sum = cuda.to_device(res)


threads_per_block = 1024
blocks = (ip1.size + threads_per_block - 1) // threads_per_block
add_arrays[blocks, threads_per_block](cuda.to_device(ip1), 
                                      cuda.to_device(ip2),
                                      arr_sum)


print((arr_sum.copy_to_host()[:10]))



[1.59339559 0.79031274 1.47804362 0.53435669 1.34245197 0.05528387
 1.59969826 1.76894895 0.6627682  1.60367803]


In [6]:
del ip1, ip2, res, arr_sum

In [8]:
cuda.close()

##### 2 D matrix addition - This is a lot easier than reduction operations in cuda jit

In [9]:
@cuda.jit(fastmath=True)  ##allow fastmath
def add_2d_matrices(A, B, C):

    i,j = cuda.grid(2)

    if i < A.shape[0] and j < A.shape[1]:
        C[i,j] = A[i,j] + B[i,j]


A = np.random.rand(10000, 10000)
B = np.random.rand(10000, 10000)
C = np.zeros_like(A)


assert A.shape == B.shape 

threads_per_block = (32, 32)  ##32*32=1024
blocks_x = (A.shape[0] + threads_per_block[0] - 1) // threads_per_block[0] # Blocks along 2 axes - #This is a visual illusion. In reality this doesnot exist 
blocks_y = (A.shape[1] + threads_per_block[1] - 1) // threads_per_block[1]

res = cuda.to_device(C)

add_2d_matrices[(blocks_x, blocks_y), threads_per_block](cuda.to_device(A),
                                                        cuda.to_device(B),
                                                        res)

print(res.copy_to_host()[0][:10])
    

CudaAPIError: [2] Call to cuMemAlloc results in CUDA_ERROR_OUT_OF_MEMORY

In [None]:
del A, B, C, res

##### Reduction type operations - This felt a bit difficult to understand
##### Honestly cupy is a lot more easier for this, if builtin ones are not helpful

##### Numba atomic operations - 
Usually used to prevent race conditions when you have want to read or write updates correctly. Mostly used when doing reduction type operations. Key tip is try to leverage shared memory to do this as opposed to global memory on GPU to facilitate faster read/write

In [None]:
@cuda.jit
def sum_array(a, op):

    idx = cuda.grid(1)
    if idx < a.size:
        cuda.atomic.add(op, 0, a[idx])



In [None]:
a = np.random.random(1000000).astype(np.float64)
d_result = cuda.to_device(np.array([0.0], dtype=np.float64))

tpb = 1024
blocks = (a.size + tpb - 1) // tpb
sum_array[blocks, tpb](cuda.to_device(a),d_result)
print(d_result.copy_to_host())

##Note play around with floats -> using float32 in the kernel and np.sum is internally
##float64 and given that these are floats due to roudning you will get a different answer

##In this example use float32 in kernel you will get a different answer with a larg diffewrnce with np.sum

In [None]:
np.sum(a)