-
Notifications
You must be signed in to change notification settings - Fork 30
/
mesh.cu
90 lines (75 loc) · 3.75 KB
/
mesh.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include <cuda.h>
#include "common.h"
#include "mesh.h"
//------------------------------------------------------------------------
// Kernels
//------------------------------------------------------------------------
// Forward kernel: transforms one 3D input per thread by a per-minibatch
// 4x4 matrix. Launch layout: x dimension indexes points, z dimension
// indexes the minibatch. When p.isPoints is set, inputs are treated as
// positions (implicit w = 1) and a 4-channel homogeneous result is
// written; otherwise they are treated as direction vectors and only 3
// channels are written (no translation term).
__global__ void xfmPointsFwdKernel(XfmKernelParams p)
{
    unsigned int pointIdx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int batchIdx = blockIdx.z * blockDim.z + threadIdx.z;

    // Stage the 4x4 matrix in shared memory, transposed: mtx[k][c] holds the
    // element at row c, column k of the fetched matrix. The first 16 threads
    // of the block each load one element.
    // NOTE(review): one tile is shared by the whole block, which assumes
    // blockDim.z == 1 (a single batch index per block) — confirm launch config.
    __shared__ float mtx[4][4];
    if (threadIdx.x < 16)
        mtx[threadIdx.x % 4][threadIdx.x / 4] = p.matrix.fetch(p.matrix.nhwcIndex(batchIdx, threadIdx.x / 4, threadIdx.x % 4, 0));
    __syncthreads(); // barrier sits before the bounds-check return so every thread reaches it

    if (pointIdx >= p.gridSize.x)
        return;

    // Input components (x, y, z).
    float in[3];
    for (int k = 0; k < 3; ++k)
        in[k] = p.points.fetch(p.points.nhwcIndex(batchIdx, pointIdx, k, 0));

    // out_c = sum_k M[c][k] * in_k, plus the translation column M[c][3]
    // only for points. Channels are stored in ascending order, matching
    // the original per-channel store sequence.
    int numOut = p.isPoints ? 4 : 3;
    for (int c = 0; c < numOut; ++c)
    {
        float v = in[0] * mtx[0][c] + in[1] * mtx[1][c] + in[2] * mtx[2][c];
        if (p.isPoints)
            v += mtx[3][c];
        p.out.store(p.out.nhwcIndex(batchIdx, pointIdx, c, 0), v);
    }
}
//------------------------------------------------------------------------
// Backward kernel: given gradients w.r.t. the transformed output in p.out,
// accumulates the input gradient (M^T * d_out, restricted to the first
// three rows) into p.points via store_grad. Same launch layout as the
// forward pass: x dimension indexes points, z dimension indexes the
// minibatch.
// Fix: the original fetched the input position (vec3f pos) from global
// memory but never used it — three dead global loads per thread, removed.
__global__ void xfmPointsBwdKernel(XfmKernelParams p)
{
    unsigned int pointIdx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int batchIdx = blockIdx.z * blockDim.z + threadIdx.z;

    // Stage the 4x4 matrix in shared memory, transposed: mtx[k][c] holds the
    // element at row c, column k of the fetched matrix (same layout as the
    // forward kernel).
    // NOTE(review): one tile is shared by the whole block, which assumes
    // blockDim.z == 1 (a single batch index per block) — confirm launch config.
    __shared__ float mtx[4][4];
    if (threadIdx.x < 16)
        mtx[threadIdx.x % 4][threadIdx.x / 4] = p.matrix.fetch(p.matrix.nhwcIndex(batchIdx, threadIdx.x / 4, threadIdx.x % 4, 0));
    __syncthreads(); // barrier sits before the bounds-check return so every thread reaches it

    if (pointIdx >= p.gridSize.x)
        return;

    // Incoming output gradient. All four channels are fetched unconditionally,
    // as in the original. NOTE(review): in the !isPoints case the forward pass
    // wrote only 3 channels — confirm p.out still has a 4th channel here,
    // otherwise this read is out of bounds.
    float dOut[4];
    for (int c = 0; c < 4; ++c)
        dOut[c] = p.out.fetch(p.out.nhwcIndex(batchIdx, pointIdx, c, 0));

    // d_in_r = sum_c dOut_c * M[c][r]  (mtx[r][c] == M[c][r]); the w-gradient
    // term contributes only when the forward pass produced homogeneous points.
    // NOTE(review): store_grad uses _nhwcIndex while the reads above use
    // nhwcIndex — preserved as-is; presumably the gradient tensor bypasses
    // broadcast indexing, verify against the accessor definitions.
    for (int r = 0; r < 3; ++r)
    {
        float g = dOut[0] * mtx[r][0] + dOut[1] * mtx[r][1] + dOut[2] * mtx[r][2];
        if (p.isPoints)
            g += dOut[3] * mtx[r][3];
        p.points.store_grad(p.points._nhwcIndex(batchIdx, pointIdx, r, 0), g);
    }
}