In [None]:
@cuda.jit
def add_experiment(a, b, out, stride, coalesced):
    """Add ``a`` and ``b`` element-wise into ``out``, one element per thread.

    Args:
        a, b: Input arrays. When ``coalesced`` is truthy they are read at
            index ``i``; otherwise at index ``stride * i``, so they must be
            large enough for the largest index read (not checked here).
        out: Output array; thread ``i`` writes ``out[i]``.
        stride: Distance between the elements read by neighbouring threads
            when ``coalesced`` is falsy.
        coalesced: Selects between the ``[i]`` and ``[stride * i]`` reads.
    """
    # Equivalent to cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    i = cuda.grid(1)

    # A grid is usually rounded up to whole blocks; threads that fall past
    # the end of `out` must not touch memory.
    if i >= out.size:
        return

    if coalesced:
        out[i] = a[i] + b[i]
    else:
        src = stride * i
        out[i] = a[src] + b[src]

In [None]:
@cuda.jit
def row_sums(a, sums, n):
    """Sum the first ``n`` entries of each row of ``a``, one row per thread.

    Args:
        a: 2D input array; thread ``idx`` reads ``a[idx, 0]`` .. ``a[idx, n-1]``.
            Assumed to have at least ``sums.size`` rows and ``n`` columns
            (not checked here).
        sums: 1D output array; thread ``idx`` writes its total to ``sums[idx]``.
        n: Number of elements accumulated per row.
    """
    idx = cuda.grid(1)

    # Surplus threads (grid larger than the output) have no row to process.
    if idx >= sums.size:
        return

    # `total` rather than `sum` so the builtin is not shadowed.
    total = 0.0
    for col in range(n):
        total += a[idx, col]

    sums[idx] = total
@cuda.jit
def col_sums(a, sums, ds):
    """Sum the first ``ds`` entries of each column of ``a``, one column per thread.

    Args:
        a: 2D input array; thread ``idx`` reads ``a[0, idx]`` .. ``a[ds-1, idx]``.
            Assumed to have at least ``ds`` rows and ``sums.size`` columns
            (not checked here).
        sums: 1D output array; thread ``idx`` writes its total to ``sums[idx]``.
        ds: Number of elements accumulated per column.
    """
    idx = cuda.grid(1)

    # Surplus threads (grid larger than the output) have no column to process.
    if idx >= sums.size:
        return

    # `total` rather than `sum` so the builtin is not shadowed.
    total = 0.0
    # The bound is the `ds` parameter; the previous code looped over an
    # undefined/global `n` and ignored `ds` entirely.
    for row in range(ds):
        total += a[row, idx]

    sums[idx] = total

In [None]:
# 2x2 blocks of 2x2 threads give 4x4 threads in total, one per element of A.
# Presumably these are the launch configuration for get_2D_indices — confirm
# against the launching cell.
threads_per_block = (2, 2)
blocks = (2, 2)

# 4x4 host array of zeros and its copy on the device.
A = np.zeros(shape=(4, 4))
d_A = cuda.to_device(A)
@cuda.jit
def get_2D_indices(A):
    """Store each thread's 2D grid position in ``A`` as ``x + y / 10``.

    Thread ``(x, y)`` writes ``A[x, y]``; for example thread ``(2, 3)``
    stores ``2.3``.

    Args:
        A: 2D output array, written in place.
    """
    x, y = cuda.grid(2)

    # Only threads that map onto an element of A may write; a grid larger
    # than A would otherwise write out of bounds.
    if x < A.shape[0] and y < A.shape[1]:
        A[x, y] = x + y / 10

In [None]:
@cuda.jit
def swap_with_shared(vector, swapped):
    """Reverse ``vector`` into ``swapped`` in groups of 4 via shared memory.

    Each thread stages one element of ``vector`` in a 4-slot shared buffer,
    then writes the element held in the mirrored slot to ``swapped``.

    The buffer has 4 slots and the mirror index is ``3 - threadIdx.x``, so
    the kernel expects exactly 4 threads per block. It also assumes every
    global index is valid for both arrays (not checked here).

    Args:
        vector: 1D input array.
        swapped: 1D output array, written in place.
    """
    temp = cuda.shared.array(4, dtype=types.int32)

    idx = cuda.grid(1)
    tx = cuda.threadIdx.x

    # Index the per-block buffer with the thread's position inside the block.
    # Using the global index overruns the 4 slots in every block but the first.
    temp[tx] = vector[idx]

    # Every slot must be filled before any thread reads its mirror slot.
    cuda.syncthreads()

    swapped[idx] = temp[3 - tx]

In [None]:
@cuda.jit
def tile_transpose(a, transposed):
    """Write the transpose of ``a`` into ``transposed`` through a shared tile.

    Each block copies its patch of ``a`` into a shared buffer, synchronizes,
    then writes the patch back with row and column swapped, both inside the
    tile and in the block offsets.

    No bounds checks are performed. Assumes blocks of at most 32x32 threads
    and array shapes that the grid covers exactly — TODO confirm at the
    launch site.

    Args:
        a: 2D input array, indexed as ``a[row, col]``.
        transposed: 2D output array, written in place.
    """
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    # Origin of this block along each grid axis.
    block_x0 = cuda.blockIdx.x * cuda.blockDim.x
    block_y0 = cuda.blockIdx.y * cuda.blockDim.y

    # 32 rows by 33 columns; the extra column is presumably padding against
    # shared-memory bank conflicts — confirm.
    staging = cuda.shared.array((32, 33), numba_types.float32)

    # Load: x runs along the columns of `a`, y along its rows.
    staging[ty, tx] = a[block_y0 + ty, block_x0 + tx]

    # The whole tile must be loaded before any thread reads across it.
    cuda.syncthreads()

    # Store: block offsets trade places and the tile is read transposed.
    transposed[block_x0 + ty, block_y0 + tx] = staging[tx, ty]