In [1]:
from numba import cuda

# Note the use of an `out` array. CUDA kernels written with `@cuda.jit` do not return values,
# just like their C counterparts. Also, no explicit type signature is required with @cuda.jit
@cuda.jit
def add_kernel(x, y, out):
    
    # The actual values of the following CUDA-provided variables for thread and block indices,
    # like function parameters, are not known until the kernel is launched.
    
    # This calculation gives a unique thread index within the entire grid (see the slides above for more)
    idx = cuda.grid(1)          # 1 = one dimensional thread grid, returns a single value.
                                # This Numba-provided convenience function is equivalent to
                                # `cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x`

    # This thread will do the work on the data element with the same index as its own
    # unique index within the grid.
    out[idx] = x[idx] + y[idx]

In [2]:
import numpy as np

n = 4096
x = np.arange(n).astype(np.int32) # [0...4095] on the host
y = np.ones_like(x)               # [1...1] on the host

d_x = cuda.to_device(x) # Copy of x on the device
d_y = cuda.to_device(y) # Copy of y on the device
d_out = cuda.device_array_like(d_x) # Like np.array_like, but for device arrays

# Because of how we wrote the kernel above, we need to have a 1 thread to one data element mapping,
# therefore we define the number of threads in the grid (128*32) to equal n (4096).
threads_per_block = 128
blocks_per_grid = 32

In [3]:
add_kernel[blocks_per_grid, threads_per_block](d_x, d_y, d_out)
cuda.synchronize()
print(d_out.copy_to_host()) # Should be [1...4096]



[   1    2    3 ... 4094 4095 4096]


In [53]:
import numpy as np
from numba import vectorize, cuda, float32
import math
import time

# CUDA @vectorize 데코레이터를 사용한 arctan 함수
@vectorize([float32(float32)], target='cuda')
def arctan_cuda_vectorize(x):
    return math.atan(x)

# CUDA @cuda.jit 데코레이터를 사용한 arctan 함수
@cuda.jit
def arctan_cuda_jit(x, result):
    i = cuda.grid(1)  # 스레드 인덱스 계산
    if i < x.size:
        result[i] = math.atan(x[i])

# Numpy로 arctan 계산
def arctan_numpy(x):
    return np.arctan(x)

# 입력 데이터 생성 (큰 배열로 성능 테스트)
x = np.linspace(-1, 1, 100000000, dtype=np.float32)

# CUDA @vectorize 시간 측정
start_cuda_vectorize = time.time()
arctan_cuda_vectorize_result = arctan_cuda_vectorize(x)
end_cuda_vectorize = time.time()

# CUDA @cuda.jit 시간 측정
arctan_cuda_jit_result = np.empty_like(x)
threads_per_block = 512
blocks_per_grid = (x.size + (threads_per_block - 1)) // threads_per_block
start_cuda_jit = time.time()
arctan_cuda_jit[blocks_per_grid, threads_per_block](x, arctan_cuda_jit_result)
cuda.synchronize()  # CUDA 커널 동기화
end_cuda_jit = time.time()

# Numpy 시간 측정
start_numpy = time.time()
arctan_numpy_result = arctan_numpy(x)
end_numpy = time.time()

# 시간 출력
print("CUDA @vectorize 계산 시간: {:.8f}초".format(end_cuda_vectorize - start_cuda_vectorize))
print("CUDA @cuda.jit 계산 시간: {:.8f}초".format(end_cuda_jit - start_cuda_jit))
print("Numpy 계산 시간: {:.8f}초".format(end_numpy - start_numpy))




CUDA @vectorize 계산 시간: 0.39950538초
CUDA @cuda.jit 계산 시간: 0.34300160초
Numpy 계산 시간: 0.52499819초


In [54]:
import numpy as np
from numba import cuda

n = 2048*2048 # 4M

# 2D blocks
threads_per_block = (32, 32)
# 2D grid
blocks = (64, 64)

# 2048x2048 input matrices
a = np.arange(n).reshape(2048,2048).astype(np.float32)
b = a.copy().astype(np.float32)

# 2048x2048 0-initialized output matrix
out = np.zeros_like(a).astype(np.float32)

d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_out = cuda.to_device(out)