In [1]:
import numpy as np
from numba import cuda

In [2]:
@cuda.jit
def vec_add_kernel(arr1, arr2, output):
    """CUDA kernel: write ``arr1[i] + arr2[i]`` into ``output[i]``.

    Each thread handles the single element whose index equals the thread's
    global 1-D position in the launch grid. Threads whose index falls outside
    any of the three arrays do nothing, so the grid may be larger than the
    data and a too-short ``arr2`` or ``output`` is never indexed out of range.
    """
    # cuda.grid(1) is the global 1-D thread index:
    # threadIdx.x + blockIdx.x * blockDim.x
    i = cuda.grid(1)
    if i < arr1.size and i < arr2.size and i < output.size:
        output[i] = arr1[i] + arr2[i]

In [12]:
def vec_add(arr1, arr2, output, threadsperblock=128):
    """Add two arrays element-wise on the GPU and store the sum in ``output``.

    All three arrays are copied to the device, ``vec_add_kernel`` is launched
    with enough blocks to cover ``arr1.size`` elements, and the device result
    is copied back into ``output`` in place.

    Parameters
    ----------
    arr1, arr2 : array with a ``.size`` attribute (e.g. ``numpy.ndarray``)
        Operands. ``arr2`` must hold at least ``arr1.size`` elements.
    output : array with a ``.size`` attribute
        Host buffer that receives the result. It must hold at least
        ``arr1.size`` elements; elements past ``arr1.size`` keep their
        current values because the buffer is round-tripped through the device.
    threadsperblock : int, optional
        Number of threads per block used for the launch. Defaults to 128,
        a multiple of the 32-thread warp size.

    Returns
    -------
    None
        The result is written into ``output``.

    Raises
    ------
    ValueError
        If ``threadsperblock`` is not positive, or if ``arr2`` or ``output``
        is shorter than ``arr1``.
    """
    n = arr1.size

    if threadsperblock < 1:
        raise ValueError("threadsperblock must be a positive integer")
    if arr2.size < n or output.size < n:
        # The kernel indexes all three arrays up to arr1.size; reject inputs
        # that would be accessed out of range on the device.
        raise ValueError(
            "arr2 and output must each hold at least arr1.size elements"
        )
    if n == 0:
        # Nothing to compute; also avoids launching a grid with zero blocks.
        return

    arr1_d = cuda.to_device(arr1)
    arr2_d = cuda.to_device(arr2)
    # output is uploaded (not just allocated) so that elements beyond
    # arr1.size survive the copy back to the host unchanged.
    output_d = cuda.to_device(output)

    # Integer ceil-division: enough blocks to cover every element, without
    # going through floating point.
    blockspergrid = (n + threadsperblock - 1) // threadsperblock

    vec_add_kernel[blockspergrid, threadsperblock](arr1_d, arr2_d, output_d)

    output_d.copy_to_host(output)

In [13]:
# Inputs: two independent integer ramps 0..9999, plus a zero-filled buffer
# (np.zeros default dtype) of the same shape to receive the sum.
arr1 = np.arange(10000)
arr2 = arr1.copy()
output = np.zeros(arr1.shape)

In [14]:
vec_add(arr1, arr2, output)
# Check the GPU result against the same sum computed on the host with NumPy,
# so a broken kernel or launch configuration fails loudly instead of just
# printing wrong numbers.
np.testing.assert_array_equal(output, arr1 + arr2)
print(output)

[0.0000e+00 2.0000e+00 4.0000e+00 ... 1.9994e+04 1.9996e+04 1.9998e+04]
