In [1]:
import numpy as np
from numba import cuda, float32, int32

In [2]:
@cuda.jit
def find_nearest_point_shared(points, query, min_idx, min_dist):
    """Find, per thread block, the point closest to a single 3-D query.

    Each thread computes the squared Euclidean distance between one row of
    ``points`` and ``query``. The threads of a block then reduce their
    candidates in shared memory, and thread 0 writes the block's winner to
    slot ``blockIdx.x`` of the output arrays. The caller still has to reduce
    the per-block results to a single answer.

    Parameters
    ----------
    points : device array read as ``points[i, 0]``, ``points[i, 1]``, ``points[i, 2]``
    query : device array read as ``query[0]``, ``query[1]``, ``query[2]``
    min_idx : device array with one slot per launched block; receives the
        index of the block's nearest point (stored via an int32 shared buffer)
    min_dist : device array with one slot per launched block; receives the
        corresponding *squared* distance (stored via a float32 shared buffer)

    Notes
    -----
    * The shared buffers hold 1024 entries, so the kernel must be launched
      with at most 1024 threads per block.
    * The block size does not have to be a power of two: the reduction
      halves the active range rounding up, so no slot is ever skipped.
    * Threads whose global index falls past the end of ``points`` contribute
      the sentinel pair ``(1e20, -1)``.
    """
    shared_min_dist = cuda.shared.array(1024, dtype=float32)
    shared_min_idx = cuda.shared.array(1024, dtype=int32)

    tid = cuda.threadIdx.x
    i = cuda.grid(1)
    n = points.shape[0]

    # Sentinel candidate for threads that have no point to examine.
    local_dist = float32(1e20)
    local_idx = int32(-1)

    if i < n:
        dx = points[i, 0] - query[0]
        dy = points[i, 1] - query[1]
        dz = points[i, 2] - query[2]
        # Squared distance is enough for ranking; the sqrt is never needed.
        local_dist = dx*dx + dy*dy + dz*dz
        local_idx = i

    shared_min_dist[tid] = local_dist
    shared_min_idx[tid] = local_idx
    cuda.syncthreads()

    # Intra-block reduction in shared memory.
    # `active` is the number of slots that still hold live candidates. Each
    # round folds the upper part [half, active) onto the lower part; rounding
    # `half` up keeps the middle slot alive when `active` is odd, which is what
    # makes non-power-of-two block sizes reduce correctly.
    active = cuda.blockDim.x
    while active > 1:
        half = (active + 1) // 2
        if tid + half < active:
            if shared_min_dist[tid + half] < shared_min_dist[tid]:
                shared_min_dist[tid] = shared_min_dist[tid + half]
                shared_min_idx[tid] = shared_min_idx[tid + half]
        # Every thread reaches this barrier: the loop bounds depend only on
        # blockDim.x, never on tid.
        cuda.syncthreads()
        active = half

    # Write per-block result to global memory
    if tid == 0:
        block_id = cuda.blockIdx.x
        min_dist[block_id] = shared_min_dist[0]
        min_idx[block_id] = shared_min_idx[0]

def final_reduce(min_dist_d, min_idx_d):
    """Collapse the per-block results into the single best point index.

    Both arguments must provide ``copy_to_host()``; the two host copies are
    expected to be aligned slot by slot (distance ``k`` belongs to index ``k``).

    Returns the entry of the index array located at the position of the
    smallest distance (the first such position if there are ties).
    """
    block_dists = min_dist_d.copy_to_host()
    block_idxs = min_idx_d.copy_to_host()
    # Position of the winning block, then the point index that block reported.
    winner = block_dists.argmin()
    return block_idxs[winner]

def find_nearest_point_gpu(points_device, query_device, min_idx, min_distance,
                           threads_per_block=1024):
    """Return the index of the point in ``points_device`` nearest to the query.

    Launches ``find_nearest_point_shared`` with one thread per point and then
    reduces the per-block results on the host with ``final_reduce``.

    The launch configuration is derived from the inputs rather than read from
    notebook-level globals, so the function works regardless of the order in
    which cells were executed and cannot disagree with the size of the data.

    Parameters
    ----------
    points_device : device array whose first dimension is the number of points
    query_device : device array holding the query coordinates
    min_idx, min_distance : device scratch buffers with at least one slot per
        launched block; they are overwritten by the kernel
    threads_per_block : int, optional
        Block size for the launch. Must be a power of two no larger than 1024
        (the size of the kernel's shared-memory buffers). Power-of-two sizes
        are the conventional, safe choice for block-level tree reductions.

    Raises
    ------
    ValueError
        If there are no points, the block size is invalid, or the scratch
        buffers are too small for the required number of blocks.
    """
    n_points = points_device.shape[0]
    if n_points == 0:
        raise ValueError("points_device must contain at least one point")

    is_power_of_two = threads_per_block >= 1 and (threads_per_block & (threads_per_block - 1)) == 0
    if not is_power_of_two or threads_per_block > 1024:
        raise ValueError("threads_per_block must be a power of two between 1 and 1024")

    # One thread per point, rounded up to a whole number of blocks.
    blocks = (n_points + threads_per_block - 1) // threads_per_block
    if min_idx.shape[0] < blocks or min_distance.shape[0] < blocks:
        raise ValueError(
            "min_idx and min_distance need at least %d slots (one per block)" % blocks
        )

    # Call the kernel to compute the closest point and minimum distance per block
    find_nearest_point_shared[blocks, threads_per_block](
        points_device, query_device, min_idx, min_distance
    )

    # Only the first `blocks` slots were written by this launch; ignore any
    # extra (possibly uninitialised) capacity in the scratch buffers.
    return final_reduce(min_distance[:blocks], min_idx[:blocks])

In [3]:
SEED = 0  # Fixed seed so the random point cloud (and the reported index) is reproducible
N = 1024**2  # Number of points (1,048,576)

# Launch configuration: one thread per point, rounded up to whole blocks
threads_per_block = 1024
blocks = (N + threads_per_block - 1) // threads_per_block

# N random 3-D points, uniform in [0, 1), generated directly as float32
rng = np.random.default_rng(SEED)
points_host = rng.random((N, 3), dtype=np.float32)
points_device = cuda.to_device(points_host)  # Upload once; reused by every query

# Query point; already float32, so it is uploaded without an extra host copy
query_host = np.array([0.1, 0.5, 0.9], dtype=np.float32)
query_device = cuda.to_device(query_host)

# Device scratch buffers for the per-block results (one slot per block)
min_idx = cuda.device_array(blocks, dtype=np.int32)  # index of each block's nearest point
min_distance = cuda.device_array(blocks, dtype=np.float32)  # matching squared distance

In [4]:
# Run the search once; the cell output is the returned index into the points array.
find_nearest_point_gpu(points_device, query_device, min_idx, min_distance)

370408

In [8]:
# Time the whole call: the kernel launch plus the device-to-host copies and argmin in final_reduce.
%timeit find_nearest_point_gpu(points_device, query_device, min_idx, min_distance)

1.36 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
