In [1]:
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from pycuda.compiler import SourceModule

This does not work yet; it needs to be worked out and tested against Trimesh.

In [4]:
# Define CUDA kernel to calculate closest point on a triangle
kernel_code = """
__device__ float3 cross_product(float3 a, float3 b) {
    return make_float3(a.y * b.z - a.z * b.y,
                       a.z * b.x - a.x * b.z,
                       a.x * b.y - a.y * b.x);
}

__device__ float dot_product(float3 a, float3 b) {
    return a.x * b.x + a.y * b.y + a.z * b.z;
}

__device__ float3 subtract(float3 a, float3 b) {
    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
}

__device__ float3 add(float3 a, float3 b) {
    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}

__device__ float3 scale(float3 v, float s) {
    return make_float3(v.x * s, v.y * s, v.z * s);
}

__device__ bool point_in_triangle(float3 p, float3 v0, float3 v1, float3 v2) {
    // Barycentric coordinates method to check if the point is inside the triangle
    float3 v0v1 = subtract(v1, v0);
    float3 v0v2 = subtract(v2, v0);
    float3 v0p = subtract(p, v0);
    
    float d00 = dot_product(v0v1, v0v1);
    float d01 = dot_product(v0v1, v0v2);
    float d11 = dot_product(v0v2, v0v2);
    float d20 = dot_product(v0p, v0v1);
    float d21 = dot_product(v0p, v0v2);
    
    float denom = d00 * d11 - d01 * d01;
    float v = (d11 * d20 - d01 * d21) / denom;
    float w = (d00 * d21 - d01 * d20) / denom;
    float u = 1.0f - v - w;
    
    return (u >= 0.0f) && (v >= 0.0f) && (w >= 0.0f);
}

__device__ float3 closest_point_on_triangle(float3 p, float3 v0, float3 v1, float3 v2) {
    // Compute the closest point using perpendicular projection
    float3 v0v1 = subtract(v1, v0);
    float3 v0v2 = subtract(v2, v0);
    float3 v0p = subtract(p, v0);
    
    float d00 = dot_product(v0v1, v0v1);
    float d01 = dot_product(v0v1, v0v2);
    float d11 = dot_product(v0v2, v0v2);
    float d20 = dot_product(v0p, v0v1);
    float d21 = dot_product(v0p, v0v2);
    
    float denom = d00 * d11 - d01 * d01;
    float v = (d11 * d20 - d01 * d21) / denom;
    float w = (d00 * d21 - d01 * d20) / denom;
    float u = 1.0f - v - w;
    
    float3 closest_point = add(v0, add(scale(v0v1, u), scale(v0v2, v)));
    return closest_point;
}

__global__ void find_closest_point_and_normal(float3 *vertices, int3 *indices, float3 query_point, 
                                               float3 *closest_point, float3 *normals, int num_triangles) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_triangles) return;
    
    int3 triangle = indices[idx];
    
    float3 v0 = vertices[triangle.x];
    float3 v1 = vertices[triangle.y];
    float3 v2 = vertices[triangle.z];
    
    // Find closest point on triangle
    float3 point_on_triangle = closest_point_on_triangle(query_point, v0, v1, v2);
    
    // Check if this point is closer than the current closest point
    float3 closest = *closest_point;
    float dist_new = dot_product(subtract(point_on_triangle, query_point), subtract(point_on_triangle, query_point));
    float dist_old = dot_product(subtract(closest, query_point), subtract(closest, query_point));
    
    if (dist_new < dist_old || dist_old == 0.0f) {
        *closest_point = point_on_triangle;
        
        // Compute normal of the triangle
        float3 edge1 = subtract(v1, v0);
        float3 edge2 = subtract(v2, v0);
        float3 normal = cross_product(edge1, edge2);
        float norm_length = sqrt(dot_product(normal, normal));
        normal = scale(normal, 1.0f / norm_length);  // Normalize the normal
        
        *normals = normal;
    }
}
"""

# Compile the CUDA kernel
mod = SourceModule(kernel_code)

In [15]:
# Initialize GPU memory
def closest_point_on_mesh(vertices, triangles, query_point):
    num_triangles = len(triangles)
    
    # Allocate memory on GPU
    vertex_gpu = cuda.mem_alloc(vertices.nbytes)
    triangle_gpu = cuda.mem_alloc(triangles.nbytes)
    query_gpu = cuda.mem_alloc(query_point.nbytes)
    closest_point_gpu = cuda.mem_alloc(query_point.nbytes)
    normals_gpu = cuda.mem_alloc(query_point.nbytes)

    # Transfer data to GPU
    cuda.memcpy_htod(vertex_gpu, vertices)
    cuda.memcpy_htod(triangle_gpu, triangles)
    cuda.memcpy_htod(query_gpu, query_point)
    
    # Initialize closest point to a large value
    initial_point = np.array([float('inf'), float('inf'), float('inf')], dtype=np.float32)
    cuda.memcpy_htod(closest_point_gpu, initial_point)
    
    # Launch the kernel
    block_size = 256
    grid_size = (num_triangles + block_size - 1) // block_size
    func = mod.get_function("find_closest_point_and_normal")
    func(vertex_gpu, triangle_gpu, query_gpu, closest_point_gpu, normals_gpu, np.int32(num_triangles), 
         block=(block_size, 1, 1), grid=(grid_size, 1))

    # Retrieve results from GPU
    closest_point_result = np.empty_like(query_point)
    cuda.memcpy_dtoh(closest_point_result, closest_point_gpu)
    normals_result = np.empty_like(query_point)
    cuda.memcpy_dtoh(normals_result, normals_gpu)

    return closest_point_result, normals_result

# Example usage
vertices = np.array([
    [0.0, 0.0, 0.0],  # Vertex 0
    [1.0, 0.0, 0.0],  # Vertex 1
    [0.0, 1.0, 0.0],  # Vertex 2
    [0.0, 0.0, 1.0],  # Vertex 3
], dtype=np.float32)

triangles = np.array([
    [0, 1, 2],  # Triangle 1
    [0, 1, 3],  # Triangle 2
], dtype=np.int32)

query_point = np.array([0.1, 0.1, 0.1], dtype=np.float32)

closest_point, normal = closest_point_on_mesh(vertices, triangles, query_point)
print("Closest Point:", closest_point)
print("Normal:", normal)

Closest Point: [inf inf inf]
Normal: [0. 0. 0.]


In [13]:
from pygments import highlight
from pygments.lexers import CppLexer
from pygments.formatters import HtmlFormatter
from IPython.core.display import HTML

In [14]:
formatter = HtmlFormatter(style="colorful", full=True, noclasses=True)
highlighted_code = highlight(kernel_code, CppLexer(), formatter)

HTML(highlighted_code)

## Gemini attempt

Gemini doesn't give working code: the cell below fails at runtime (see the `cuMemcpyDtoH` error in its output).

In [18]:
# Kernel code
kernel_code = """
__device__ float dot_product(float *a, float *b) {
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}

__device__ float point_triangle_distance_squared(float *p, float *v0, float *v1, float *v2) {
    // Compute vectors
    float e0[3], e1[3], e2[3];
    for (int i = 0; i < 3; ++i) {
        e0[i] = v1[i] - v0[i];
        e1[i] = v2[i] - v0[i];
        e2[i] = v0[i] - p[i];
    }

    // Compute cross products
    float a[3], b[3], c[3];
    a[0] = e0[1] * e2[2] - e0[2] * e2[1];
    a[1] = e0[2] * e2[0] - e0[0] * e2[2];
    a[2] = e0[0] * e2[1] - e0[1] * e2[0];

    b[0] = e1[1] * a[2] - e1[2] * a[1];
    b[1] = e1[2] * a[0] - e1[0] * a[2];
    b[2] = e1[0] * a[1] - e1[1] * a[0];

    c[0] = e2[1] * b[2] - e2[2] * b[1];
    c[1] = e2[2] * b[0] - e2[0] * b[2];
    c[2] = e2[0] * b[1] - e2[1] * b[0];

    // Compute squared lengths
    float a_dot_a = dot_product(a, a);
    float b_dot_b = dot_product(b, b);
    float c_dot_c = dot_product(c, c);

    // Compute signed volumes
    float det = dot_product(a, e1);
    float s = dot_product(a, e2);
    float t = dot_product(e0, c);

    if (det > 0.0f) {
        if (s < 0.0f || s > det) {
            if (t < 0.0f) {
                if (s < 0.0f) { // Region 4
                    float dist_sq = dot_product(p, p) + dot_product(v0, v0) - 2.0f * dot_product(p, v0);
                    return dist_sq;
                } else { // Region 3
                    float b_dot_e0 = dot_product(b, e0);
                    float dist_sq = (b_dot_b * dot_product(p, p) + c_dot_c * dot_product(v1, v1) + a_dot_a * dot_product(v0, v0) -
                                     2.0f * (b_dot_b * dot_product(p, v1) + c_dot_c * dot_product(v1, v0) + a_dot_a * dot_product(v0, p)) +
                                     2.0f * (dot_product(b, c) * dot_product(v1, v0) + dot_product(c, a) * dot_product(v0, p) + dot_product(a, b) * dot_product(p, v1))) / (b_dot_b + c_dot_c + a_dot_a - 2.0f * (dot_product(b, c) + dot_product(c, a) + dot_product(a, b)));
                    return dist_sq;
                }
            } else if (t > det) {
                if (s < 0.0f) { // Region 5
                    float a_dot_e1 = dot_product(a, e1);
                    float dist_sq = (a_dot_a * dot_product(p, p) + c_dot_c * dot_product(v2, v2) + b_dot_b * dot_product(v0, v0) -
                                     2.0f * (a_dot_a * dot_product(p, v2) + c_dot_c * dot_product(v2, v0) + b_dot_b * dot_product(v0, p)) +
                                     2.0f * (dot_product(a, c) * dot_product(v2, v0) + dot_product(c, b) * dot_product(v0, p) + dot_product(b, a) * dot_product(p, v2))) / (a_dot_a + c_dot_c + b_dot_b - 2.0f * (dot_product(a, c) + dot_product(c, b) + dot_product(b, a)));
                    return dist_sq;
                } else { // Region 2
                    float dist_sq = dot_product(p, p) + dot_product(v2, v2) - 2.0f * dot_product(p, v2);
                    return dist_sq;
                }
            } else { // Region 0: Inside the triangle
                return 0.0f;
            }
        } else {
            if (t < 0.0f) { // Region 6
                float dist_sq = dot_product(p, p) + dot_product(v0, v0) - 2.0f * dot_product(p, v0);
                return dist_sq;
            } else if (t > det) { // Region 1
                float dist_sq = dot_product(p, p) + dot_product(v1, v1) - 2.0f * dot_product(p, v1);
                return dist_sq;
            } else { // Region 0: Inside the triangle
                return 0.0f;
            }
        }
    } else { // det <= 0.0f (treat as degenerate)
        float dist_sq1 = dot_product(p, p) + dot_product(v0, v0) - 2.0f * dot_product(p, v0);
        float dist_sq2 = dot_product(p, p) + dot_product(v1, v1) - 2.0f * dot_product(p, v1);
        float dist_sq3 = dot_product(p, p) + dot_product(v2, v2) - 2.0f * dot_product(p, v2);
        return min(min(dist_sq1, dist_sq2), dist_sq3);
    }
}

__global__ void find_closest_point_kernel(float *points, int num_points,
                                           float *triangles, int num_triangles,
                                           float *closest_points, float *min_distances_sq) {
    int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (point_idx < num_points) {
        float px = points[point_idx * 3 + 0];
        float py = points[point_idx * 3 + 1];
        float pz = points[point_idx * 3 + 2];
        float p[3] = {px, py, pz};

        float min_dist_sq = 1e20;
        float closest_point[3] = {0.0f, 0.0f, 0.0f};

        for (int tri_idx = 0; tri_idx < num_triangles; ++tri_idx) {
            float v0[3] = {triangles[tri_idx * 9 + 0], triangles[tri_idx * 9 + 1], triangles[tri_idx * 9 + 2]};
            float v1[3] = {triangles[tri_idx * 9 + 3], triangles[tri_idx * 9 + 4], triangles[tri_idx * 9 + 5]};
            float v2[3] = {triangles[tri_idx * 9 + 6], triangles[tri_idx * 9 + 7], triangles[tri_idx * 9 + 8]};

            float dist_sq = point_triangle_distance_squared(p, v0, v1, v2);

            if (dist_sq < min_dist_sq) {
                min_dist_sq = dist_sq;
                // For now, we are only calculating the distance.
                // Finding the actual closest point on the triangle is more involved
                // and would require barycentric coordinates or projection onto edges/faces.
                // A simplified approach here is to just store one of the triangle's vertices.
                // A more accurate implementation would require further calculations within the kernel.
                closest_point[0] = (v0[0] + v1[0] + v2[0]) / 3.0f; // Approximate closest point (centroid)
                closest_point[1] = (v0[1] + v1[1] + v2[1]) / 3.0f;
                closest_point[2] = (v0[2] + v1[2] + v2[2]) / 3.0f;
            }
        }
        min_distances_sq[point_idx] = min_dist_sq;
        closest_points[point_idx * 3 + 0] = closest_point[0];
        closest_points[point_idx * 3 + 1] = closest_point[1];
        closest_points[point_idx * 3 + 2] = closest_point[2];
    }
}
"""

def find_closest_points_on_triangles(points, triangles, block_size=32):
    """
    Finds the closest point on a set of triangles for each given point using a PyCUDA kernel.

    Args:
        points (numpy.ndarray): Array of points (N x 3).
        triangles (numpy.ndarray): Array of triangles (M x 9), where each row contains
                                   the coordinates of the three vertices (x0, y0, z0, x1, y1, z1, x2, y2, z2).
        block_size (int): Size of the CUDA thread block.

    Returns:
        tuple: A tuple containing two numpy arrays:
               - closest_points (numpy.ndarray): Array of the closest points on the triangles (N x 3).
               - min_distances (numpy.ndarray): Array of the minimum squared distances (N x 1).
    """
    num_points = points.shape[0]
    num_triangles = triangles.shape[0]

    # Allocate device memory
    points_gpu = cuda.mem_alloc(points.nbytes)
    triangles_gpu = cuda.mem_alloc(triangles.nbytes)
    closest_points_gpu = cuda.mem_alloc(np.float32(num_points * 3).nbytes)
    min_distances_sq_gpu = cuda.mem_alloc(np.float32(num_points).nbytes)

    # Copy data to device
    cuda.memcpy_htod(points_gpu, points)
    cuda.memcpy_htod(triangles_gpu, triangles)

    # Compile the kernel
    mod = SourceModule(kernel_code)
    find_closest_point_kernel = mod.get_function("find_closest_point_kernel")

    # Set grid and block dimensions
    grid_size = ( (num_points + block_size - 1) // block_size, 1, 1)
    block = (block_size, 1, 1)

    # Call the kernel
    find_closest_point_kernel(points_gpu, np.int32(num_points),
                              triangles_gpu, np.int32(num_triangles),
                              closest_points_gpu, min_distances_sq_gpu,
                              grid=grid_size, block=block)

    # Allocate host memory for results
    closest_points_host = np.empty_like(points)
    min_distances_host = np.empty((num_points,), dtype=np.float32)

    # Copy results back to host
    cuda.memcpy_dtoh(closest_points_host, closest_points_gpu)
    cuda.memcpy_dtoh(min_distances_host, min_distances_sq_gpu)

    return closest_points_host, np.sqrt(min_distances_host)

if __name__ == '__main__':
    # Example usage
    points = np.array([[1.0, 1.0, 1.0],
                       [0.0, 0.0, 0.0],
                       [2.0, 2.0, 2.0]], dtype=np.float32)
    triangles = np.array([[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0],
                          [1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, 1.0]], dtype=np.float32)

    closest_points, min_distances = find_closest_points_on_triangles(points, triangles)

    print("Points:")
    print(points)
    print("\nTriangles:")
    print(triangles.reshape(-1, 3, 3))
    print("\nClosest Points on Triangles:")
    print(closest_points)
    print("\nMinimum Distances:")
    print(min_distances)

kernel.cu

  mod = SourceModule(kernel_code)


LogicError: cuMemcpyDtoH failed: invalid argument