In [1]:
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

In [2]:
from common import read_file_str, show_formatted_cpp, load_numpy

In [3]:
# Run the MSVC compiler driver with no arguments so that it prints its banner.
# NOTE(review): presumably a sanity check that cl.exe is on PATH, which nvcc
# needs as its host compiler on Windows — confirm.
!cl

usage: cl [ option... ] filename... [ /link linkoption... ]


Microsoft (R) C/C++ Optimizing Compiler Version 19.43.34810 for x64
Copyright (C) Microsoft Corporation.  All rights reserved.



In [4]:
# Load the candidate point set and the query point (in that order) through the
# project's load_numpy helper.
# NOTE(review): the result displayed later in this notebook is a 3-element
# float32 row, so `points` presumably has shape (N, 3) — confirm.
points, query = map(
    load_numpy,
    ("nearest_neighbour_points.npy", "nearest_neighbour_query.npy"),
)

In [5]:
# Load the nearest-neighbour kernel source file via the project's read_file_str helper.
cuda_code = read_file_str("./kernels/nearest_neighbour.cu")

In [6]:
# Show the kernel source to the reader; rendering is delegated to the project's
# show_formatted_cpp helper.
show_formatted_cpp(cuda_code)

In [7]:
# Compile the CUDA source with PyCUDA's SourceModule; individual kernels are
# later fetched from `mod` by name.
mod = SourceModule(cuda_code)

In [8]:
def get_closest_point(points_gpu, query_point_gpu,
                      num_points, points,
                      block_size, grid_size,
                      min_idx, min_idx_gpu, min_distances, min_distance_gpu):
    """Run the nearest-neighbour kernel and return the winning row of ``points``.

    The module-level ``find_min_distance_index`` kernel is looked up when this
    function is called, so it must already be defined at that time.

    Args:
        points_gpu: Device buffer passed to the kernel as its first argument.
        query_point_gpu: Device buffer passed to the kernel as its second argument.
        num_points: Point count passed to the kernel as its last argument.
        points: Host-side array that the final index is looked up in.
        block_size: Threads per block; launched as ``(block_size, 1, 1)``.
        grid_size: Number of blocks; launched as ``(grid_size, 1)``.
        min_idx: Host array overwritten with the contents of ``min_idx_gpu``.
        min_idx_gpu: Device buffer the kernel receives for candidate indices.
        min_distances: Host array overwritten with the contents of
            ``min_distance_gpu``.
        min_distance_gpu: Device buffer the kernel receives for candidate
            distances.

    Returns:
        ``points[min_idx[k]]`` where ``k`` is the position of the smallest
        value in ``min_distances`` after the copy-back.

    NOTE(review): presumably the kernel writes one (index, distance) candidate
    per block and the host finishes the reduction here — confirm against the
    kernel source.
    """
    kernel_args = (points_gpu, query_point_gpu,
                   min_idx_gpu, min_distance_gpu, num_points)
    launch_config = {"block": (block_size, 1, 1), "grid": (grid_size, 1)}
    find_min_distance_index(*kernel_args, **launch_config)

    # Bring the candidate indices back first, then the candidate distances.
    copy_back = ((min_idx, min_idx_gpu), (min_distances, min_distance_gpu))
    for host_buffer, device_buffer in copy_back:
        cuda.memcpy_dtoh(host_buffer, device_buffer)

    # Final reduction on the host: smallest candidate distance wins.
    best_slot = np.argmin(min_distances)
    winner_index = min_idx[best_slot]
    return points[winner_index]

In [9]:
# Point count handed to the kernel, stored as a NumPy int32 scalar.
# NOTE(review): hard-coded rather than derived from the loaded data — confirm
# it matches the number of rows in `points`.
num_points = np.int32(1024 ** 2)

# Zero-initialised host buffers that receive the kernel's candidate distances
# and indices. NOTE(review): the length 1024 presumably equals the number of
# blocks launched — confirm.
min_distances = np.zeros(1024, dtype=np.float32)
min_idx = np.zeros(1024, dtype=np.int32)

In [10]:
# Allocate one device buffer per host array, each sized to that array's byte
# count. Allocation order: points, query, min_idx, min_distances.
points_gpu, query_point_gpu, min_idx_gpu, min_distance_gpu = (
    cuda.mem_alloc(host_array.nbytes)
    for host_array in (points, query, min_idx, min_distances)
)

In [11]:
# Upload every host array into its matching device buffer, in the order:
# points, query, min_idx, min_distances.
for _device_buffer, _host_array in (
    (points_gpu, points),
    (query_point_gpu, query),
    (min_idx_gpu, min_idx),
    (min_distance_gpu, min_distances),
):
    cuda.memcpy_htod(_device_buffer, _host_array)

In [12]:
# Fetch the compiled kernel from the module by name. get_closest_point reads
# this module-level name at call time, so this cell must run before that
# function is called.
find_min_distance_index = mod.get_function("find_min_distance_index")

In [13]:
# Launch geometry: threads per block, and enough blocks to cover every point.
block_size = 1024
# Ceiling division written as negated floor division: -(-a // b) == ceil(a / b)
# for positive integers.
grid_size = -(-int(num_points) // block_size)

In [14]:
# Run the search once; as the cell's last expression, the returned row is displayed.
get_closest_point(
    points_gpu=points_gpu,
    query_point_gpu=query_point_gpu,
    num_points=num_points,
    points=points,
    block_size=block_size,
    grid_size=grid_size,
    min_idx=min_idx,
    min_idx_gpu=min_idx_gpu,
    min_distances=min_distances,
    min_distance_gpu=min_distance_gpu,
)

array([0.72436523, 0.10307071, 0.2642327 ], dtype=float32)

In [15]:
# Benchmark the full round trip: kernel launch, two device-to-host copies, and
# the host-side argmin lookup.
%timeit get_closest_point(points_gpu, query_point_gpu, num_points, points, block_size, grid_size, min_idx, min_idx_gpu, min_distances, min_distance_gpu)

295 µs ± 66.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
