In [1]:
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

In [2]:
# Run the MSVC compiler driver with no arguments; the usage banner shows that `cl` is on PATH.
# NOTE(review): presumably a sanity check because nvcc (invoked by PyCUDA's SourceModule)
# needs a host C++ compiler on Windows — confirm.
!cl

usage: cl [ option... ] filename... [ /link linkoption... ]


Microsoft (R) C/C++ Optimizing Compiler Version 19.43.34810 for x64
Copyright (C) Microsoft Corporation.  All rights reserved.



In [8]:
def load_numpy(filename: str, base_dir: str = './numpy') -> np.ndarray:
    """Load a saved NumPy array from ``base_dir``.

    Parameters
    ----------
    filename : str
        Name of the file (extension included) inside ``base_dir``.
    base_dir : str, optional
        Directory that holds the array files. Defaults to ``'./numpy'``, the
        location that was previously hard-coded, so existing one-argument
        calls resolve to exactly the same path as before.

    Returns
    -------
    np.ndarray
        The object returned by ``np.load`` for that file.

    Raises
    ------
    FileNotFoundError
        Propagated from ``np.load`` when the file does not exist.
    """
    return np.load(f'{base_dir}/{filename}')

In [9]:
# Read the three mesh arrays from disk, in the same order as before:
# piece vertices, per-triangle vertices, per-triangle normals.
vertices, triangles, normals = (
    load_numpy(npy_name)
    for npy_name in (
        'piece_vertices.npy',
        'triangle_vertices.npy',
        'triangle_normals.npy',
    )
)

In [4]:
# Compile the CUDA source below into `mod`. It contains two functions:
#   * ray_intersects_triangle (device helper): Moller-Trumbore-style ray/triangle test.
#     Returns true when the ray from `orig` along `dir` hits triangle (v0, v1, v2) at a
#     ray parameter t > EPSILON. A determinant with |a| < EPSILON (ray nearly parallel
#     to the triangle plane) is treated as a miss, as are barycentric coordinates
#     outside the triangle (u < 0, u > 1, v < 0 or u + v > 1).
#   * point_in_mesh (kernel): one thread per query point; threads whose index is
#     >= num_points return immediately. Each thread casts a ray along +X from its
#     point, tests it against every triangle, and writes
#     output_flags[i] = true when the number of hits is odd
#     (presumably meaning "inside" for a closed mesh — confirm).
# Memory layout the kernel indexes:
#   * `triangles` is read as 9 consecutive floats per triangle (v0.xyz, v1.xyz, v2.xyz);
#   * `points` is read as 3 consecutive floats per point (x, y, z).
# NOTE(review): host arrays presumably have to be C-contiguous float32 to match the
# `float *` parameters — confirm before launching.
# NOTE(review): this source defines only `point_in_mesh`; a later cell calls
# mod.get_function("find_closest_point") and mod.get_function("find_min_distance_index"),
# which are not part of this module as written.
mod = SourceModule("""
__device__ bool ray_intersects_triangle(float3 orig, float3 dir,
                                        float3 v0, float3 v1, float3 v2) {
    const float EPSILON = 1e-6f;
    float3 edge1 = make_float3(v1.x - v0.x, v1.y - v0.y, v1.z - v0.z);
    float3 edge2 = make_float3(v2.x - v0.x, v2.y - v0.y, v2.z - v0.z);
    float3 h = make_float3(
        dir.y * edge2.z - dir.z * edge2.y,
        dir.z * edge2.x - dir.x * edge2.z,
        dir.x * edge2.y - dir.y * edge2.x
    );
    float a = edge1.x * h.x + edge1.y * h.y + edge1.z * h.z;
    if (fabs(a) < EPSILON)
        return false;
    float f = 1.0f / a;
    float3 s = make_float3(orig.x - v0.x, orig.y - v0.y, orig.z - v0.z);
    float u = f * (s.x * h.x + s.y * h.y + s.z * h.z);
    if (u < 0.0f || u > 1.0f)
        return false;
    float3 q = make_float3(
        s.y * edge1.z - s.z * edge1.y,
        s.z * edge1.x - s.x * edge1.z,
        s.x * edge1.y - s.y * edge1.x
    );
    float v = f * (dir.x * q.x + dir.y * q.y + dir.z * q.z);
    if (v < 0.0f || u + v > 1.0f)
        return false;
    float t = f * (edge2.x * q.x + edge2.y * q.y + edge2.z * q.z);
    return t > EPSILON;
}

__global__ void point_in_mesh(float *triangles, int num_triangles,
                              float *points, int num_points,
                              bool *output_flags) {
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pt_idx >= num_points) return;

    float3 query = make_float3(
        points[pt_idx * 3 + 0],
        points[pt_idx * 3 + 1],
        points[pt_idx * 3 + 2]
    );
    float3 ray_dir = make_float3(1.0f, 0.0f, 0.0f); // +X ray

    int hit_count = 0;

    for (int tri_idx = 0; tri_idx < num_triangles; ++tri_idx) {
        float3 v0 = make_float3(
            triangles[tri_idx * 9 + 0],
            triangles[tri_idx * 9 + 1],
            triangles[tri_idx * 9 + 2]
        );
        float3 v1 = make_float3(
            triangles[tri_idx * 9 + 3],
            triangles[tri_idx * 9 + 4],
            triangles[tri_idx * 9 + 5]
        );
        float3 v2 = make_float3(
            triangles[tri_idx * 9 + 6],
            triangles[tri_idx * 9 + 7],
            triangles[tri_idx * 9 + 8]
        );

        if (ray_intersects_triangle(query, ray_dir, v0, v1, v2)) {
            hit_count++;
        }
    }

    output_flags[pt_idx] = (hit_count % 2 == 1);
}
""")

NameError: name 'SourceModule' is not defined

In [3]:
# Look up the compiled `point_in_mesh` kernel in `mod` so it can be launched from Python.
# Requires the cell that builds `mod` to have run successfully first.
point_in_mesh = mod.get_function("point_in_mesh")

NameError: name 'SourceModule' is not defined

In [9]:
def get_closest_point(points_gpu, query_point_gpu, distances_gpu,
                      num_points, points,
                      block_size, grid_size,
                      min_idx, min_idx_gpu):
    """Run the two-kernel nearest-point search and return the winning row of ``points``.

    The function launches the module-level kernels ``find_closest_point`` and
    ``find_min_distance_index`` (both must already be bound in the notebook
    namespace), copies the resulting index from ``min_idx_gpu`` into
    ``min_idx`` and uses it to index ``points`` on the host.

    Parameters
    ----------
    points_gpu, query_point_gpu, distances_gpu
        Device buffers handed to ``find_closest_point``.
    num_points
        Element count passed to both kernels.
    points
        Host-side array; the returned value is ``points[min_idx[0]]``.
    block_size : int
        Threads per block (x dimension) for both launches.
    grid_size : int
        Number of blocks (x dimension) for the first launch; the second
        launch always uses a single block.
    min_idx
        Host buffer that is overwritten with the index read back from the GPU.
    min_idx_gpu
        Device buffer written by ``find_min_distance_index``.

    Returns
    -------
    The entry of ``points`` selected by the index copied back from the device.
    """
    threads_per_block = (block_size, 1, 1)

    # Stage 1: launched over the whole grid, writes into distances_gpu.
    find_closest_point(
        points_gpu, query_point_gpu, distances_gpu, num_points,
        block=threads_per_block, grid=(grid_size, 1),
    )

    # Stage 2: a single block reads distances_gpu and writes min_idx_gpu.
    find_min_distance_index(
        distances_gpu, min_idx_gpu, num_points,
        block=threads_per_block, grid=(1, 1),
    )

    # Bring the winning index back to the host buffer supplied by the caller.
    cuda.memcpy_dtoh(min_idx, min_idx_gpu)

    nearest_row = min_idx[0]
    return points[nearest_row]

In [10]:
# Nearest-point search: upload the data, bind the kernels, run one search and report it.
# NOTE(review): `points` and `query` are not defined in any visible cell; judging by the
# indexing and printed output they are presumably an (N, 3) float32 array and a
# length-3 float32 vector — confirm and define them above this cell.
# NOTE(review): the SourceModule shown earlier only contains `point_in_mesh`; the two
# kernels looked up below must be compiled into `mod` for this cell to run.

# Number of 3D points, taken from the data itself. It used to be a hard-coded
# 1024 * 1024, which could disagree with the buffers sized from `points` below
# and let the kernels index past the end of them.
num_points = np.int32(points.shape[0])

# Allocate memory on GPU
points_gpu = cuda.mem_alloc(points.nbytes)
query_point_gpu = cuda.mem_alloc(query.nbytes)
# One float32 distance per point, and a single int32 slot for the winning index.
distances_gpu = cuda.mem_alloc(int(num_points) * np.dtype(np.float32).itemsize)
min_idx_gpu = cuda.mem_alloc(np.dtype(np.int32).itemsize)

# Copy data to GPU
cuda.memcpy_htod(points_gpu, points)
cuda.memcpy_htod(query_point_gpu, query)

# Prepare the kernel functions
find_closest_point = mod.get_function("find_closest_point")
find_min_distance_index = mod.get_function("find_min_distance_index")

# Define block and grid sizes (ceil-divide so every point gets a thread)
block_size = 1024
grid_size = (int(num_points) + block_size - 1) // block_size

# Host buffer that receives the index copied back from the device
min_idx = np.zeros(1, dtype=np.int32)

# Run both kernels and fetch the closest point through the shared helper
# instead of repeating its body here.
closest_point = get_closest_point(
    points_gpu, query_point_gpu, distances_gpu,
    num_points, points,
    block_size, grid_size,
    min_idx, min_idx_gpu,
)

print(f"The closest point to {query} is {closest_point} at index {min_idx[0]}")

The closest point to [0.8656291  0.7443568  0.20227067] is [0.55593413 0.16710657 0.67367786] at index 0


In [11]:
# Repeat the search through the helper; the returned row is displayed as the cell output.
get_closest_point(
    points_gpu=points_gpu,
    query_point_gpu=query_point_gpu,
    distances_gpu=distances_gpu,
    num_points=num_points,
    points=points,
    block_size=block_size,
    grid_size=grid_size,
    min_idx=min_idx,
    min_idx_gpu=min_idx_gpu,
)

array([0.55593413, 0.16710657, 0.67367786], dtype=float32)

In [15]:
# Benchmark one complete search: two kernel launches plus the device-to-host copy of the index.
%timeit get_closest_point(points_gpu, query_point_gpu, distances_gpu, num_points, points, block_size, grid_size, min_idx, min_idx_gpu)

269 µs ± 46.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
