## Imports

In [1]:
import numpy as np
from time import perf_counter
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

In [2]:
from common import read_file_str, show_formatted_cpp, load_numpy

In [3]:
!cl

usage: cl [ option... ] filename... [ /link linkoption... ]


Microsoft (R) C/C++ Optimizing Compiler Version 19.43.34810 for x64
Copyright (C) Microsoft Corporation.  All rights reserved.



## Load in data from mesh

In [4]:
vertices = load_numpy('piece_vertices.npy')
triangles = load_numpy('triangle_vertices.npy')
normals = load_numpy('triangle_normals.npy')

## Parameters

In [5]:
BLOCK_SIZE = 64
NR_BLOCKS = (len(vertices) + BLOCK_SIZE - 1) // BLOCK_SIZE

## Compile cuda kernel

In [6]:
cuda_code = read_file_str("./kernels/adjust_points_in_mesh.cu")

In [7]:
show_formatted_cpp(cuda_code)

In [8]:
mod = SourceModule(cuda_code)

## Set-up memory for running kernel

In [9]:
point_in_mesh = mod.get_function("adjust_point_in_mesh")

In [10]:
nr_triangles = np.int32(len(triangles))
nr_vertices = np.int32(len(vertices))

We can reduce a lot of the workload by using bounding spheres. Other techniques is to spatially order the input so the search can be narrowed to a certain window.

### Pre-compute bounding spheres around body mesh triangles

In [11]:
triangle_centers = (triangles[:, 0] + triangles[:, 1] + triangles[:, 2]) / 3

In [12]:
distances_to_center = np.expand_dims(np.max(
    np.linalg.norm(triangles - np.expand_dims(triangle_centers, 1), axis=2),
axis=1), axis=-1)

In [13]:
triangle_centers = np.hstack([triangle_centers, distances_to_center])

In [14]:
triangles[:, 1] -= triangles[:, 0]
triangles[:, 2] -= triangles[:, 0]

### Allocate memory to gpu

In [15]:
assert vertices.flatten().flags['C_CONTIGUOUS']
assert triangles.flatten().flags['C_CONTIGUOUS']
assert normals.flatten().flags['C_CONTIGUOUS']
assert triangle_centers.flatten().flags['C_CONTIGUOUS']
assert len(normals) == len(triangles)

In [16]:
triangles_gpu = cuda.mem_alloc(triangles.nbytes)
vertices_gpu = cuda.mem_alloc(vertices.nbytes)
normals_gpu = cuda.mem_alloc(normals.nbytes)
centers_gpu = cuda.mem_alloc(triangle_centers.nbytes)

In [17]:
cuda.memcpy_htod(triangles_gpu, triangles.flatten())
cuda.memcpy_htod(vertices_gpu, vertices.flatten())
cuda.memcpy_htod(normals_gpu, normals.flatten())
cuda.memcpy_htod(centers_gpu, triangle_centers.flatten())

## Profile function

This function does not need to return anything if the points are copied to a rendering context.

In [18]:
def get_points_in_mesh():
    point_in_mesh(triangles_gpu, nr_triangles,
                  vertices_gpu, nr_vertices,
                  normals_gpu, centers_gpu,
                  block=(BLOCK_SIZE, 1, 1), grid=(NR_BLOCKS, 1, 1))

We can only run this once. Once it is run a second time, all threads exit after ray tracing.

In [19]:
start = perf_counter()
get_points_in_mesh()
print(f"Time taken {(perf_counter() - start)*1000:.3f} ms")

Time taken 1.971 ms


We can only run this once. Once it is run a second time, all threads exit after ray tracing.

In [26]:
start = perf_counter()
get_points_in_mesh()
print(f"Time taken {(perf_counter() - start)*1000:.3f} ms")

Time taken 0.383 ms


## Check which points were moved

In [20]:
original_vertices = vertices.copy()

In [21]:
cuda.memcpy_dtoh(vertices, vertices_gpu)

In [22]:
np.where(np.any((original_vertices - vertices) != 0., axis=1))

(array([1680, 1683, 1741, 1798, 2245, 2298, 2299, 2300, 2301, 2302, 2303,
        2345, 2346, 2347, 2348, 2349, 2350, 2351, 2352, 2353, 2354, 2355,
        2356, 2357, 2358, 2359, 2360, 2361, 2362, 2406, 2407, 2408, 2456,
        2457, 2458, 2459, 2498, 2499, 2501, 2549, 2600, 2609, 2651, 2655,
        2690, 2701, 2741, 2742, 2793, 2945], dtype=int64),)

In [23]:
vertices_changed_mask = np.any((original_vertices - vertices) != 0., axis=1)

In [24]:
vertices_changed_mask.sum()

50

Getting vertices from device is also feasible

In [27]:
%timeit cuda.memcpy_dtoh(vertices, vertices_gpu)

69.9 µs ± 15.7 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
