-
Notifications
You must be signed in to change notification settings - Fork 119
/
nv_target.cu
145 lines (128 loc) · 5.21 KB
/
nv_target.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
// This file demonstrates how to use <nv/target> and how to avoid common
// pitfalls regarding compiler/dialect compatibility.
//=======================================================================================================================
#include <nv/target>
// The below are part of libcu++ and are exposed for users that would like a simpler method of targeting host/device code
// on NVCC, NVC++ or GCC/Clang/MSVC even when the NVCC compiler isn't present.
// These macros are to be used in lieu of common #if defined(__CUDA_ARCH__) statements and
// are only to be used inside of function scopes
/* Macros defined when including <nv/target> or virtually any libcu++ header
NV_IF_TARGET(query, true, ...) | Queries compilation mode and emits code if true |
NV_IF_ELSE_TARGET(query, true, false) | As above, but can also emit different code when false |
NV_DISPATCH_TARGET(...) | Similar to a switch statement emitting code for multiple modes |
*/
/* Queryable properties defined by <nv/target>
Can be imagined as __CUDA_ARCH__ >= SM_XX
NV_PROVIDES_SM_35
NV_PROVIDES_SM_37
NV_PROVIDES_SM_50
NV_PROVIDES_SM_52
NV_PROVIDES_SM_53
NV_PROVIDES_SM_60
NV_PROVIDES_SM_61
NV_PROVIDES_SM_62
NV_PROVIDES_SM_70
NV_PROVIDES_SM_72
NV_PROVIDES_SM_75
NV_PROVIDES_SM_80
NV_PROVIDES_SM_86
NV_PROVIDES_SM_87
Similar to above, but instead __CUDA_ARCH__ == SM_XX
NV_IS_EXACTLY_SM_35
NV_IS_EXACTLY_SM_37
NV_IS_EXACTLY_SM_50
NV_IS_EXACTLY_SM_52
NV_IS_EXACTLY_SM_53
NV_IS_EXACTLY_SM_60
NV_IS_EXACTLY_SM_61
NV_IS_EXACTLY_SM_62
NV_IS_EXACTLY_SM_70
NV_IS_EXACTLY_SM_72
NV_IS_EXACTLY_SM_75
NV_IS_EXACTLY_SM_80
NV_IS_EXACTLY_SM_86
NV_IS_EXACTLY_SM_87
Queries whether if host or device code is being compiled
NV_IS_HOST
NV_IS_DEVICE
Static true/false values for fallbacks or user manipulation
NV_ANY_TARGET
NV_NO_TARGET
*/
//=======================================================================================================================
// NV_IF_ELSE_TARGET(query, true statement, false statement)
__host__ __device__ int my_popc(unsigned int v) {
// NV_IF_ELSE_TARGET accepts three arguments, a query and two statement.
// Here we check if we're compiling for device code. This function acts as a backend for both CUDA and host CPU popc.
NV_IF_ELSE_TARGET(
NV_IS_DEVICE,
return __popc(v);, // Is false, use CUDA intrinsic
return __builtin_popc(v); // Is host, use GCC builtin
)
}
// Note the commas seperating statements, if preprocessed code is written out the macro will be preprocessed into the below:
/*
my_popc(unsigned int v) {
{return __popc(v);}
}
*/
//=======================================================================================================================
// NV_IF_TARGET(query, true statement) OR NV_IF_TARGET(q, t, ...)
__host__ __device__ void some_algorithm() {
// NV_IF_TARGET accepts two arguments, a query and a statement. (and an optional false statement in >=C++11)
NV_IF_TARGET(
NV_IS_DEVICE,
do_device_specific_work(); // Code only emitted if compiling for device
)
}
//=======================================================================================================================
// NV_DISPATCH_TARGET(...) - Available only in C++11 and up due to variadic macros
__host__ __device__ void my_memset(void *p, uint8_t v, uint64_t c) {
// Target dispatch accepts pairs of queries and statements.
// The first postive query encountered will be emitted while others are ignored.
NV_DISPATCH_TARGET(
NV_PROVIDES_SM_80,
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
if (v == 0) zero_fill(p, 0, c); // zero fill using cp.async available on SM_80
else memset(p, v, c);, // Notice comma signifying end of block
NV_ANY_TARGET, // Uncoditionally use memset in other cases
memset(p, v, c);
)
}
//***********************************************************************************************************************
// # Common pitfalls:
//=======================================================================================================================
// Embedding preprocessor statements as an argument. Perform textual manipulation outside of the macro.
/*
NV_IF_TARGET(
NV_IS_DEVICE,
// This will break immediately on most compilers
# if defined(ENABLE_SM_80_FEATURE)
sm80_function();
# else
device_function();
# endif
)
// Instead one could write the above as:
#if defined(ENABLE_SM_80_FEATURE)
# define OPTIMAL_DEVICE_FUNCTION() sm80_function()
#else
# define OPTIMAL_DEVICE_FUNCTION() device_function()
#endif
NV_IF_TARGET(
NV_IS_DEVICE,
OPTIMAL_DEVICE_FUNCTION();
)
*/
//=======================================================================================================================
// Some statements may have unguarded commas, e.g. lambdas or aggregate assignment
// Supported with C++11 and up ONLY, as it requires variadic macro processing
/*
NV_IF_TARGET(
NV_IS_DEVICE,
( // You may wrap a statement or series of statements with a parenthesis to guard commas from any macro machinery
int input[] = {x, y, z...};
my_algorithm(input);
)
)
*/