diff --git a/docs/html/block__discontinuity_8cuh_source.html b/docs/html/block__discontinuity_8cuh_source.html new file mode 100644 index 0000000000..d9423b274f --- /dev/null +++ b/docs/html/block__discontinuity_8cuh_source.html @@ -0,0 +1,407 @@ + + + + + + + +CUB: block_discontinuity.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
block_discontinuity.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include "../util_type.cuh"
+
37 #include "../util_namespace.cuh"
+
38 
+
40 CUB_NS_PREFIX
+
41 
+
43 namespace cub {
+
44 
+
94 template <
+
95  typename T,
+
96  int BLOCK_THREADS>
+ +
98 {
+
99 private:
+
100 
+
101  /******************************************************************************
+
102  * Type definitions
+
103  ******************************************************************************/
+
104 
+
106  typedef T _TempStorage[BLOCK_THREADS];
+
107 
+
108 
+
109  /******************************************************************************
+
110  * Utility methods
+
111  ******************************************************************************/
+
112 
+
/// Hands back a reference to a function-scope __shared__ _TempStorage object.
/// Used by the constructors that are not given caller-provided storage.
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
    __shared__ _TempStorage internal_storage;
    _TempStorage &storage_ref = internal_storage;
    return storage_ref;
}
+
119 
+
120 
+
/// Dispatch helper for invoking a flag functor.  This primary template is the
/// one used when HAS_PARAM is true: the index is forwarded as a third argument.
template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
struct ApplyOp
{
    /// Evaluates flag_op(a, b, idx) and returns the result as a bool.
    static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
    {
        const bool is_flagged = flag_op(a, b, idx);
        return is_flagged;
    }
};
+
131 
+
/// Specialization used when HAS_PARAM is false: the functor is called with the
/// two items only and the idx argument is ignored.
template <typename FlagOp>
struct ApplyOp<FlagOp, false>
{
    /// Evaluates flag_op(a, b) and returns the result as a bool (idx is unused).
    static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
    {
        const bool is_flagged = flag_op(a, b);
        return is_flagged;
    }
};
+
142 
+
143 
+
144  /******************************************************************************
+
145  * Thread fields
+
146  ******************************************************************************/
+
147 
+
149  _TempStorage &temp_storage;
+
150 
+
152  int linear_tid;
+
153 
+
154 
+
155 public:
+
156 
+
158  struct TempStorage : Uninitialized<_TempStorage> {};
+
159 
+
160 
+
161  /******************************************************************/
+
165 
+
/// Default constructor: binds temp_storage to the allocation returned by
/// PrivateStorage() and takes linear_tid from threadIdx.x.
__device__ __forceinline__ BlockDiscontinuity()
    : temp_storage(PrivateStorage()), linear_tid(threadIdx.x)
{
}
+
174 
+
175 
+
/// Constructor taking caller-provided storage: binds temp_storage to
/// temp_storage.Alias() and takes linear_tid from threadIdx.x.
__device__ __forceinline__ BlockDiscontinuity(TempStorage &temp_storage)
    : temp_storage(temp_storage.Alias()), linear_tid(threadIdx.x)
{
}
+
185 
+
186 
+
/// Constructor taking an explicit thread rank: binds temp_storage to the
/// allocation returned by PrivateStorage() and stores the given linear_tid.
__device__ __forceinline__ BlockDiscontinuity(int linear_tid)
    : temp_storage(PrivateStorage()), linear_tid(linear_tid)
{
}
+
196 
+
197 
+
/// Constructor taking both caller-provided storage and an explicit thread
/// rank: binds temp_storage to temp_storage.Alias() and stores linear_tid.
__device__ __forceinline__ BlockDiscontinuity(TempStorage &temp_storage, int linear_tid)
    : temp_storage(temp_storage.Alias()), linear_tid(linear_tid)
{
}
+
208 
+
209 
+
210 
+
212  /******************************************************************/
+
216 
+
217 
+
/// Sets head_flags[i] for each of this thread's items by applying flag_op to
/// the item and the one preceding it.  The predecessor of input[0] is the last
/// item of thread (linear_tid - 1), exchanged through temp_storage; thread 0's
/// first item is always flagged with 1.
///
/// Contains a __syncthreads(), so every thread of the block must call it.
template <
    int         ITEMS_PER_THREAD,
    typename    FlagT,
    typename    FlagOp>
__device__ __forceinline__ void FlagHeads(
    FlagT       (&head_flags)[ITEMS_PER_THREAD],
    T           (&input)[ITEMS_PER_THREAD],
    FlagOp      flag_op)
{
    // Index passed to flag_op for this thread's first item
    const int thread_base = linear_tid * ITEMS_PER_THREAD;

    // Publish this thread's last item so the next thread can compare against it
    temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1];

    __syncthreads();

    // First item: unconditionally flagged in thread 0, otherwise compared
    // against the previous thread's last item
    const int first_flag = (linear_tid == 0) ?
        1 :
        ApplyOp<FlagOp>::Flag(flag_op, temp_storage[linear_tid - 1], input[0], thread_base);
    head_flags[0] = first_flag;

    // Remaining items: compared against their in-thread predecessor
    #pragma unroll
    for (int i = 1; i < ITEMS_PER_THREAD; ++i)
    {
        head_flags[i] = ApplyOp<FlagOp>::Flag(flag_op, input[i - 1], input[i], thread_base + i);
    }
}
+
301 
+
302 
+
/// Sets head_flags[i] for each of this thread's items by applying flag_op to
/// the item and the one preceding it.  The predecessor of input[0] is the last
/// item of thread (linear_tid - 1), exchanged through temp_storage; for thread
/// 0 it is the caller-supplied tile_predecessor_item.
///
/// Contains a __syncthreads(), so every thread of the block must call it.
///
/// \param head_flags             Output flags, one per item
/// \param input                  This thread's items
/// \param flag_op                Binary (optionally index-taking) flag functor
/// \param tile_predecessor_item  Item compared against thread 0's input[0]
template <
    int         ITEMS_PER_THREAD,
    typename    FlagT,
    typename    FlagOp>
__device__ __forceinline__ void FlagHeads(
    FlagT       (&head_flags)[ITEMS_PER_THREAD],
    T           (&input)[ITEMS_PER_THREAD],
    FlagOp      flag_op,
    T           tile_predecessor_item)
{
    // Share last item
    temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1];

    __syncthreads();

    // Select the predecessor of this thread's first item.  It must be held as
    // a T: storing it in an int would truncate floating-point / 64-bit values
    // and would not compile for class-type T.
    T predecessor = (linear_tid == 0) ?
        tile_predecessor_item :             // First thread
        temp_storage[linear_tid - 1];

    // Set flag for first item
    head_flags[0] = ApplyOp<FlagOp>::Flag(
        flag_op,
        predecessor,
        input[0],
        linear_tid * ITEMS_PER_THREAD);

    // Set flag for remaining items
    #pragma unroll
    for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        head_flags[ITEM] = ApplyOp<FlagOp>::Flag(
            flag_op,
            input[ITEM - 1],
            input[ITEM],
            (linear_tid * ITEMS_PER_THREAD) + ITEM);
    }
}
+
394 
+
395 
+
397  /******************************************************************/
+
401 
+
402 
+
/// Sets tail_flags[i] for each of this thread's items by applying flag_op to
/// the item and the one following it.  The successor of the thread's last item
/// is the first item of thread (linear_tid + 1), exchanged through
/// temp_storage; the last item of thread (BLOCK_THREADS - 1) is always flagged
/// with 1.
///
/// Contains a __syncthreads(), so every thread of the block must call it.
template <
    int         ITEMS_PER_THREAD,
    typename    FlagT,
    typename    FlagOp>
__device__ __forceinline__ void FlagTails(
    FlagT       (&tail_flags)[ITEMS_PER_THREAD],
    T           (&input)[ITEMS_PER_THREAD],
    FlagOp      flag_op)
{
    // Index passed to flag_op for this thread's first item
    const int thread_base = linear_tid * ITEMS_PER_THREAD;

    // Publish this thread's first item so the previous thread can compare against it
    temp_storage[linear_tid] = input[0];

    __syncthreads();

    // Last item: unconditionally flagged in the last thread, otherwise
    // compared against the next thread's first item
    const int last_flag = (linear_tid == BLOCK_THREADS - 1) ?
        1 :
        ApplyOp<FlagOp>::Flag(
            flag_op,
            input[ITEMS_PER_THREAD - 1],
            temp_storage[linear_tid + 1],
            thread_base + (ITEMS_PER_THREAD - 1));
    tail_flags[ITEMS_PER_THREAD - 1] = last_flag;

    // Remaining items: compared against their in-thread successor
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD - 1; ++i)
    {
        tail_flags[i] = ApplyOp<FlagOp>::Flag(flag_op, input[i], input[i + 1], thread_base + i);
    }
}
+
486 
+
487 
+
/// Sets tail_flags[i] for each of this thread's items by applying flag_op to
/// the item and the one following it.  The successor of the thread's last item
/// is the first item of thread (linear_tid + 1), exchanged through
/// temp_storage; for thread (BLOCK_THREADS - 1) it is the caller-supplied
/// tile_successor_item.
///
/// Contains a __syncthreads(), so every thread of the block must call it.
///
/// \param tail_flags           Output flags, one per item
/// \param input                This thread's items
/// \param flag_op              Binary (optionally index-taking) flag functor
/// \param tile_successor_item  Item compared against the last thread's last item
template <
    int         ITEMS_PER_THREAD,
    typename    FlagT,
    typename    FlagOp>
__device__ __forceinline__ void FlagTails(
    FlagT       (&tail_flags)[ITEMS_PER_THREAD],
    T           (&input)[ITEMS_PER_THREAD],
    FlagOp      flag_op,
    T           tile_successor_item)
{
    // Share first item
    temp_storage[linear_tid] = input[0];

    __syncthreads();

    // Select the successor of this thread's last item.  It must be held as a
    // T: storing it in an int would truncate floating-point / 64-bit values
    // and would not compile for class-type T.
    T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
        tile_successor_item :               // Last thread
        temp_storage[linear_tid + 1];

    // Set flag for last item
    tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag(
        flag_op,
        input[ITEMS_PER_THREAD - 1],
        successor_item,
        (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1));

    // Set flags for remaining items
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++)
    {
        tail_flags[ITEM] = ApplyOp<FlagOp>::Flag(
            flag_op,
            input[ITEM],
            input[ITEM + 1],
            (linear_tid * ITEMS_PER_THREAD) + ITEM);
    }
}
+
580 
+
582 
+
583 };
+
584 
+
585 
+
586 } // CUB namespace
+
587 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + + diff --git a/docs/html/block__exchange_8cuh_source.html b/docs/html/block__exchange_8cuh_source.html new file mode 100644 index 0000000000..6612c803d7 --- /dev/null +++ b/docs/html/block__exchange_8cuh_source.html @@ -0,0 +1,745 @@ + + + + + + + +CUB: block_exchange.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
block_exchange.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include "../util_arch.cuh"
+
37 #include "../util_macro.cuh"
+
38 #include "../util_type.cuh"
+
39 #include "../util_namespace.cuh"
+
40 
+
42 CUB_NS_PREFIX
+
43 
+
45 namespace cub {
+
46 
+
105 template <
+
106  typename T,
+
107  int BLOCK_THREADS,
+
108  int ITEMS_PER_THREAD,
+
109  bool WARP_TIME_SLICING = false>
+ +
111 {
+
112 private:
+
113 
+
114  /******************************************************************************
+
115  * Constants
+
116  ******************************************************************************/
+
117 
+
/// Compile-time constants derived from the template parameters and PtxArchProps.
enum
{
    // Warp geometry
    LOG_WARP_THREADS            = PtxArchProps::LOG_WARP_THREADS,
    WARP_THREADS                = 1 << LOG_WARP_THREADS,
    WARPS                       = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS,

    // Shared-memory bank geometry
    LOG_SMEM_BANKS              = PtxArchProps::LOG_SMEM_BANKS,
    SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,

    // Items held by the whole thread block
    TILE_ITEMS                  = BLOCK_THREADS * ITEMS_PER_THREAD,

    // Threads/items of one warp-sized portion (never more than the block itself)
    WARP_TIME_SLICED_THREADS    = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
    WARP_TIME_SLICED_ITEMS      = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,

    // With time slicing there is one slice per warp; otherwise a single
    // slice spanning the whole block
    TIME_SLICES                 = (WARP_TIME_SLICING) ? WARPS : 1,
    TIME_SLICED_THREADS         = (WARP_TIME_SLICING) ? WARP_TIME_SLICED_THREADS : BLOCK_THREADS,
    TIME_SLICED_ITEMS           = TIME_SLICED_THREADS * ITEMS_PER_THREAD,

    // Padding is inserted when ITEMS_PER_THREAD is a power of two: one extra
    // element per SMEM_BANKS elements of a slice
    INSERT_PADDING              = ((ITEMS_PER_THREAD & (ITEMS_PER_THREAD - 1)) == 0),
    PADDING_ITEMS               = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0
};
+
141 
+
142  /******************************************************************************
+
143  * Type definitions
+
144  ******************************************************************************/
+
145 
+
147  typedef T _TempStorage[TIME_SLICED_ITEMS + PADDING_ITEMS];
+
148 
+
149 public:
+
150 
+
152  struct TempStorage : Uninitialized<_TempStorage> {};
+
153 
+
154 private:
+
155 
+
156 
+
157  /******************************************************************************
+
158  * Thread fields
+
159  ******************************************************************************/
+
160 
+
162  _TempStorage &temp_storage;
+
163 
+
165  int linear_tid;
+
166  int warp_lane;
+
167  int warp_id;
+
168  int warp_offset;
+
169 
+
170 
+
171  /******************************************************************************
+
172  * Utility methods
+
173  ******************************************************************************/
+
174 
+
/// Hands back a reference to a function-scope __shared__ _TempStorage object.
/// Used by the constructors that are not given caller-provided storage.
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
    __shared__ _TempStorage internal_storage;
    _TempStorage &storage_ref = internal_storage;
    return storage_ref;
}
+
181 
+
182 
+
/// Non-time-sliced exchange.  Each thread stores items[i] at shared offset
/// (linear_tid * ITEMS_PER_THREAD + i), the block synchronizes, and each
/// thread then loads items[i] from offset (i * BLOCK_THREADS + linear_tid).
/// Offsets are padded when INSERT_PADDING is set.
__device__ __forceinline__ void BlockedToStriped(
    T               items[ITEMS_PER_THREAD],
    Int2Type<false> time_slicing)
{
    const int blocked_base = linear_tid * ITEMS_PER_THREAD;

    // Store phase
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int smem_idx = blocked_base + i;
        if (INSERT_PADDING)
        {
            smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
        }
        temp_storage[smem_idx] = items[i];
    }

    __syncthreads();

    // Load phase
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int smem_idx = int(i * BLOCK_THREADS) + linear_tid;
        if (INSERT_PADDING)
        {
            smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
        }
        items[i] = temp_storage[smem_idx];
    }
}
+
208 
+
209 
+
/// Time-sliced exchange.  For each slice, the warp whose warp_id equals the
/// slice index stores its items at (warp_lane * ITEMS_PER_THREAD + i); after a
/// barrier every thread picks up those of its outputs whose offset
/// (i * BLOCK_THREADS + linear_tid) lies inside the current slice.  Results
/// are staged in a local array and copied back to items at the end.
__device__ __forceinline__ void BlockedToStriped(
    T               items[ITEMS_PER_THREAD],
    Int2Type<true>  time_slicing)
{
    T staged[ITEMS_PER_THREAD];

    #pragma unroll
    for (int slice = 0; slice < TIME_SLICES; ++slice)
    {
        // Half-open range [slice_begin, slice_end) of tile offsets in this slice
        const int slice_begin = slice * TIME_SLICED_ITEMS;
        const int slice_end   = slice_begin + TIME_SLICED_ITEMS;

        // Previous slice's readers must finish before storage is overwritten
        __syncthreads();

        if (warp_id == slice)
        {
            #pragma unroll
            for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            {
                int smem_idx = (warp_lane * ITEMS_PER_THREAD) + i;
                if (INSERT_PADDING)
                {
                    smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
                }
                temp_storage[smem_idx] = items[i];
            }
        }

        __syncthreads();

        #pragma unroll
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        {
            // Half-open range of tile offsets covered by strip i
            const int strip_begin = i * BLOCK_THREADS;
            const int strip_end   = strip_begin + BLOCK_THREADS;

            const bool strip_overlaps_slice = (slice_begin < strip_end) && (slice_end > strip_begin);

            // Offset of this thread's element relative to the slice start
            int smem_idx = strip_begin + linear_tid - slice_begin;

            if (strip_overlaps_slice && (smem_idx >= 0) && (smem_idx < TIME_SLICED_ITEMS))
            {
                if (INSERT_PADDING)
                {
                    smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
                }
                staged[i] = temp_storage[smem_idx];
            }
        }
    }

    // Write the staged results back to the caller's array
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        items[i] = staged[i];
    }
}
+
266 
+
267 
+
/// Non-time-sliced exchange within each warp's region (starting at
/// warp_offset).  Each thread stores items[i] at
/// (warp_offset + warp_lane * ITEMS_PER_THREAD + i) and then loads items[i]
/// from (warp_offset + i * WARP_TIME_SLICED_THREADS + warp_lane).
///
/// NOTE(review): no barrier separates the store and load phases; presumably
/// this relies on warp-synchronous execution — confirm for the target
/// architectures.
__device__ __forceinline__ void BlockedToWarpStriped(
    T               items[ITEMS_PER_THREAD],
    Int2Type<false> time_slicing)
{
    // Store phase
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int smem_idx = warp_offset + i + (warp_lane * ITEMS_PER_THREAD);
        if (INSERT_PADDING)
        {
            smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
        }
        temp_storage[smem_idx] = items[i];
    }

    // Load phase
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int smem_idx = warp_offset + (i * WARP_TIME_SLICED_THREADS) + warp_lane;
        if (INSERT_PADDING)
        {
            smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
        }
        items[i] = temp_storage[smem_idx];
    }
}
+
291 
+
/// Time-sliced exchange.  Slices are processed one at a time, each preceded
/// by a block-wide barrier; only the warp whose warp_id equals the slice index
/// does work in that slice: it stores items[i] at
/// (warp_lane * ITEMS_PER_THREAD + i) and loads items[i] from
/// (i * WARP_TIME_SLICED_THREADS + warp_lane).
///
/// NOTE(review): no barrier separates the store and load phases inside a
/// slice; presumably this relies on warp-synchronous execution — confirm.
__device__ __forceinline__ void BlockedToWarpStriped(
    T               items[ITEMS_PER_THREAD],
    Int2Type<true>  time_slicing)
{
    #pragma unroll
    for (int slice = 0; slice < TIME_SLICES; ++slice)
    {
        // All threads reach this barrier, whether or not they own the slice
        __syncthreads();

        if (warp_id == slice)
        {
            // Store phase
            #pragma unroll
            for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            {
                int smem_idx = i + (warp_lane * ITEMS_PER_THREAD);
                if (INSERT_PADDING)
                {
                    smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
                }
                temp_storage[smem_idx] = items[i];
            }

            // Load phase
            #pragma unroll
            for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            {
                int smem_idx = (i * WARP_TIME_SLICED_THREADS) + warp_lane;
                if (INSERT_PADDING)
                {
                    smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
                }
                items[i] = temp_storage[smem_idx];
            }
        }
    }
}
+
324 
+
325 
+
/// Non-time-sliced exchange.  Each thread stores items[i] at shared offset
/// (i * BLOCK_THREADS + linear_tid), the block synchronizes, and each thread
/// then loads items[i] from offset (linear_tid * ITEMS_PER_THREAD + i).
/// Offsets are padded when INSERT_PADDING is set.
__device__ __forceinline__ void StripedToBlocked(
    T               items[ITEMS_PER_THREAD],
    Int2Type<false> time_slicing)
{
    // Store phase
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int smem_idx = int(i * BLOCK_THREADS) + linear_tid;
        if (INSERT_PADDING)
        {
            smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
        }
        temp_storage[smem_idx] = items[i];
    }

    __syncthreads();

    // Load phase
    const int blocked_base = linear_tid * ITEMS_PER_THREAD;

    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int smem_idx = blocked_base + i;
        if (INSERT_PADDING)
        {
            smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
        }
        items[i] = temp_storage[smem_idx];
    }
}
+
352 
+
353 
+
/// Time-sliced exchange.  For each slice, every thread stores those of its
/// items whose offset (i * BLOCK_THREADS + linear_tid) lies inside the slice;
/// after a barrier the warp whose warp_id equals the slice index loads its
/// items from (warp_lane * ITEMS_PER_THREAD + i).  Results are staged in a
/// local array and copied back to items at the end.
__device__ __forceinline__ void StripedToBlocked(
    T               items[ITEMS_PER_THREAD],
    Int2Type<true>  time_slicing)
{
    T staged[ITEMS_PER_THREAD];

    #pragma unroll
    for (int slice = 0; slice < TIME_SLICES; ++slice)
    {
        // Half-open range [slice_begin, slice_end) of tile offsets in this slice
        const int slice_begin = slice * TIME_SLICED_ITEMS;
        const int slice_end   = slice_begin + TIME_SLICED_ITEMS;

        // Previous slice's readers must finish before storage is overwritten
        __syncthreads();

        #pragma unroll
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        {
            // Half-open range of tile offsets covered by strip i
            const int strip_begin = i * BLOCK_THREADS;
            const int strip_end   = strip_begin + BLOCK_THREADS;

            const bool strip_overlaps_slice = (slice_begin < strip_end) && (slice_end > strip_begin);

            // Offset of this thread's element relative to the slice start
            int smem_idx = strip_begin + linear_tid - slice_begin;

            if (strip_overlaps_slice && (smem_idx >= 0) && (smem_idx < TIME_SLICED_ITEMS))
            {
                if (INSERT_PADDING)
                {
                    smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
                }
                temp_storage[smem_idx] = items[i];
            }
        }

        __syncthreads();

        if (warp_id == slice)
        {
            #pragma unroll
            for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            {
                int smem_idx = (warp_lane * ITEMS_PER_THREAD) + i;
                if (INSERT_PADDING)
                {
                    smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
                }
                staged[i] = temp_storage[smem_idx];
            }
        }
    }

    // Write the staged results back to the caller's array
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        items[i] = staged[i];
    }
}
+
411 
+
412 
+
/// Non-time-sliced exchange within each warp's region (starting at
/// warp_offset).  Each thread stores items[i] at
/// (warp_offset + i * WARP_TIME_SLICED_THREADS + warp_lane) and then loads
/// items[i] from (warp_offset + warp_lane * ITEMS_PER_THREAD + i).
///
/// NOTE(review): no barrier separates the store and load phases; presumably
/// this relies on warp-synchronous execution — confirm for the target
/// architectures.
__device__ __forceinline__ void WarpStripedToBlocked(
    T               items[ITEMS_PER_THREAD],
    Int2Type<false> time_slicing)
{
    // Store phase
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int smem_idx = warp_offset + (i * WARP_TIME_SLICED_THREADS) + warp_lane;
        if (INSERT_PADDING)
        {
            smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
        }
        temp_storage[smem_idx] = items[i];
    }

    // Load phase
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int smem_idx = warp_offset + i + (warp_lane * ITEMS_PER_THREAD);
        if (INSERT_PADDING)
        {
            smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
        }
        items[i] = temp_storage[smem_idx];
    }
}
+
436 
+
437 
+
/// Time-sliced exchange.  Slices are processed one at a time, each preceded
/// by a block-wide barrier; only the warp whose warp_id equals the slice index
/// does work in that slice: it stores items[i] at
/// (i * WARP_TIME_SLICED_THREADS + warp_lane) and loads items[i] from
/// (warp_lane * ITEMS_PER_THREAD + i).
///
/// NOTE(review): no barrier separates the store and load phases inside a
/// slice; presumably this relies on warp-synchronous execution — confirm.
__device__ __forceinline__ void WarpStripedToBlocked(
    T               items[ITEMS_PER_THREAD],
    Int2Type<true>  time_slicing)
{
    #pragma unroll
    for (int slice = 0; slice < TIME_SLICES; ++slice)
    {
        // All threads reach this barrier, whether or not they own the slice
        __syncthreads();

        if (warp_id == slice)
        {
            // Store phase
            #pragma unroll
            for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            {
                int smem_idx = (i * WARP_TIME_SLICED_THREADS) + warp_lane;
                if (INSERT_PADDING)
                {
                    smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
                }
                temp_storage[smem_idx] = items[i];
            }

            // Load phase
            #pragma unroll
            for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            {
                int smem_idx = i + (warp_lane * ITEMS_PER_THREAD);
                if (INSERT_PADDING)
                {
                    smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
                }
                items[i] = temp_storage[smem_idx];
            }
        }
    }
}
+
470 
+
471 
+
/// Non-time-sliced scatter.  Each thread stores items[i] at shared offset
/// ranks[i], the block synchronizes, and each thread then loads items[i] from
/// offset (linear_tid * ITEMS_PER_THREAD + i).  Offsets are padded through
/// SHR_ADD when INSERT_PADDING is set.
__device__ __forceinline__ void ScatterToBlocked(
    T               items[ITEMS_PER_THREAD],
    int             ranks[ITEMS_PER_THREAD],
    Int2Type<false> time_slicing)
{
    // Store phase: destination is given by the caller's rank
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int smem_idx = ranks[i];
        if (INSERT_PADDING)
        {
            smem_idx = SHR_ADD(smem_idx, LOG_SMEM_BANKS, smem_idx);
        }
        temp_storage[smem_idx] = items[i];
    }

    __syncthreads();

    // Load phase
    const int blocked_base = linear_tid * ITEMS_PER_THREAD;

    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int smem_idx = blocked_base + i;
        if (INSERT_PADDING)
        {
            smem_idx = SHR_ADD(smem_idx, LOG_SMEM_BANKS, smem_idx);
        }
        items[i] = temp_storage[smem_idx];
    }
}
+
498 
+
/// Time-sliced scatter.  For each slice, every thread stores those of its
/// items whose rank, taken relative to the slice start, lies in
/// [0, WARP_TIME_SLICED_ITEMS); after a barrier the warp whose warp_id equals
/// the slice index loads its items from (warp_lane * ITEMS_PER_THREAD + i).
/// Results are staged in a local array and copied back to items at the end.
__device__ __forceinline__ void ScatterToBlocked(
    T               items[ITEMS_PER_THREAD],
    int             ranks[ITEMS_PER_THREAD],
    Int2Type<true>  time_slicing)
{
    T staged[ITEMS_PER_THREAD];

    #pragma unroll
    for (int slice = 0; slice < TIME_SLICES; ++slice)
    {
        // First tile offset belonging to this slice
        const int slice_begin = TIME_SLICED_ITEMS * slice;

        // Previous slice's readers must finish before storage is overwritten
        __syncthreads();

        #pragma unroll
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        {
            int smem_idx = ranks[i] - slice_begin;
            const bool in_slice = (smem_idx >= 0) && (smem_idx < WARP_TIME_SLICED_ITEMS);
            if (in_slice)
            {
                if (INSERT_PADDING)
                {
                    smem_idx = SHR_ADD(smem_idx, LOG_SMEM_BANKS, smem_idx);
                }
                temp_storage[smem_idx] = items[i];
            }
        }

        __syncthreads();

        if (warp_id == slice)
        {
            #pragma unroll
            for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            {
                int smem_idx = (warp_lane * ITEMS_PER_THREAD) + i;
                if (INSERT_PADDING)
                {
                    smem_idx = SHR_ADD(smem_idx, LOG_SMEM_BANKS, smem_idx);
                }
                staged[i] = temp_storage[smem_idx];
            }
        }
    }

    // Write the staged results back to the caller's array
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        items[i] = staged[i];
    }
}
+
548 
+
549 
+
/// Non-time-sliced scatter.  Each thread stores items[i] at shared offset
/// ranks[i], the block synchronizes, and each thread then loads items[i] from
/// offset (i * BLOCK_THREADS + linear_tid).  Offsets are padded through
/// SHR_ADD when INSERT_PADDING is set.
__device__ __forceinline__ void ScatterToStriped(
    T               items[ITEMS_PER_THREAD],
    int             ranks[ITEMS_PER_THREAD],
    Int2Type<false> time_slicing)
{
    // Store phase: destination is given by the caller's rank
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int smem_idx = ranks[i];
        if (INSERT_PADDING)
        {
            smem_idx = SHR_ADD(smem_idx, LOG_SMEM_BANKS, smem_idx);
        }
        temp_storage[smem_idx] = items[i];
    }

    __syncthreads();

    // Load phase
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int smem_idx = int(i * BLOCK_THREADS) + linear_tid;
        if (INSERT_PADDING)
        {
            smem_idx = SHR_ADD(smem_idx, LOG_SMEM_BANKS, smem_idx);
        }
        items[i] = temp_storage[smem_idx];
    }
}
+
576 
+
577 
+
/// Time-sliced scatter.  For each slice, every thread stores those of its
/// items whose rank, taken relative to the slice start, lies in
/// [0, WARP_TIME_SLICED_ITEMS); after a barrier every thread picks up those
/// of its outputs whose offset (i * BLOCK_THREADS + linear_tid) lies inside
/// the current slice.  Results are staged in a local array and copied back to
/// items at the end.
__device__ __forceinline__ void ScatterToStriped(
    T               items[ITEMS_PER_THREAD],
    int             ranks[ITEMS_PER_THREAD],
    Int2Type<true>  time_slicing)
{
    T staged[ITEMS_PER_THREAD];

    #pragma unroll
    for (int slice = 0; slice < TIME_SLICES; ++slice)
    {
        // Half-open range [slice_begin, slice_end) of tile offsets in this slice
        const int slice_begin = slice * TIME_SLICED_ITEMS;
        const int slice_end   = slice_begin + TIME_SLICED_ITEMS;

        // Previous slice's readers must finish before storage is overwritten
        __syncthreads();

        #pragma unroll
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        {
            int smem_idx = ranks[i] - slice_begin;
            const bool in_slice = (smem_idx >= 0) && (smem_idx < WARP_TIME_SLICED_ITEMS);
            if (in_slice)
            {
                if (INSERT_PADDING)
                {
                    smem_idx = SHR_ADD(smem_idx, LOG_SMEM_BANKS, smem_idx);
                }
                temp_storage[smem_idx] = items[i];
            }
        }

        __syncthreads();

        #pragma unroll
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        {
            // Half-open range of tile offsets covered by strip i
            const int strip_begin = i * BLOCK_THREADS;
            const int strip_end   = strip_begin + BLOCK_THREADS;

            const bool strip_overlaps_slice = (slice_begin < strip_end) && (slice_end > strip_begin);

            // Offset of this thread's element relative to the slice start
            int smem_idx = strip_begin + linear_tid - slice_begin;

            if (strip_overlaps_slice && (smem_idx >= 0) && (smem_idx < TIME_SLICED_ITEMS))
            {
                if (INSERT_PADDING)
                {
                    smem_idx = smem_idx + (smem_idx >> LOG_SMEM_BANKS);
                }
                staged[i] = temp_storage[smem_idx];
            }
        }
    }

    // Write the staged results back to the caller's array
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        items[i] = staged[i];
    }
}
+
635 
+
636 
+
637 public:
+
638 
+
639  /******************************************************************/
+
643 
+
/// Default constructor: binds temp_storage to the allocation returned by
/// PrivateStorage(), takes linear_tid from threadIdx.x, and derives
/// warp_lane, warp_id and warp_offset from it.
__device__ __forceinline__ BlockExchange()
    : temp_storage(PrivateStorage()),
      linear_tid(threadIdx.x),
      warp_lane(linear_tid & (WARP_THREADS - 1)),       // rank within the warp
      warp_id(linear_tid >> LOG_WARP_THREADS),          // index of the warp
      warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)     // first item of the warp's region
{
}
+
655 
+
656 
+
/// Constructor taking caller-provided storage: binds temp_storage to
/// temp_storage.Alias(), takes linear_tid from threadIdx.x, and derives
/// warp_lane, warp_id and warp_offset from it.
__device__ __forceinline__ BlockExchange(TempStorage &temp_storage)
    : temp_storage(temp_storage.Alias()),
      linear_tid(threadIdx.x),
      warp_lane(linear_tid & (WARP_THREADS - 1)),       // rank within the warp
      warp_id(linear_tid >> LOG_WARP_THREADS),          // index of the warp
      warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)     // first item of the warp's region
{
}
+
669 
+
670 
+
/// Constructor taking an explicit thread rank: binds temp_storage to the
/// allocation returned by PrivateStorage(), stores the given linear_tid, and
/// derives warp_lane, warp_id and warp_offset from it.
__device__ __forceinline__ BlockExchange(int linear_tid)
    : temp_storage(PrivateStorage()),
      linear_tid(linear_tid),
      warp_lane(linear_tid & (WARP_THREADS - 1)),       // rank within the warp
      warp_id(linear_tid >> LOG_WARP_THREADS),          // index of the warp
      warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)     // first item of the warp's region
{
}
+
683 
+
684 
+
/// Constructor taking both caller-provided storage and an explicit thread
/// rank: binds temp_storage to temp_storage.Alias(), stores linear_tid, and
/// derives warp_lane, warp_id and warp_offset from it.
__device__ __forceinline__ BlockExchange(TempStorage &temp_storage, int linear_tid)
    : temp_storage(temp_storage.Alias()),
      linear_tid(linear_tid),
      warp_lane(linear_tid & (WARP_THREADS - 1)),       // rank within the warp
      warp_id(linear_tid >> LOG_WARP_THREADS),          // index of the warp
      warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)     // first item of the warp's region
{
}
+
698 
+
699 
+
701  /******************************************************************/
+
705 
+
/// Public entry point: forwards to the private StripedToBlocked overload
/// selected at compile time by WARP_TIME_SLICING.
__device__ __forceinline__ void StripedToBlocked(
    T items[ITEMS_PER_THREAD])
{
    Int2Type<WARP_TIME_SLICING> slicing_tag;
    StripedToBlocked(items, slicing_tag);
}
+
745 
+
/// Public entry point: forwards to the private BlockedToStriped overload
/// selected at compile time by WARP_TIME_SLICING.
__device__ __forceinline__ void BlockedToStriped(
    T items[ITEMS_PER_THREAD])
{
    Int2Type<WARP_TIME_SLICING> slicing_tag;
    BlockedToStriped(items, slicing_tag);
}
+
789 
+
790 
+
/// Public entry point: forwards to the private WarpStripedToBlocked overload
/// selected at compile time by WARP_TIME_SLICING.
__device__ __forceinline__ void WarpStripedToBlocked(
    T items[ITEMS_PER_THREAD])
{
    Int2Type<WARP_TIME_SLICING> slicing_tag;
    WarpStripedToBlocked(items, slicing_tag);
}
+
832 
+
/// Public entry point: forwards to the private BlockedToWarpStriped overload
/// selected at compile time by WARP_TIME_SLICING.
__device__ __forceinline__ void BlockedToWarpStriped(
    T items[ITEMS_PER_THREAD])
{
    Int2Type<WARP_TIME_SLICING> slicing_tag;
    BlockedToWarpStriped(items, slicing_tag);
}
+
877 
+
878 
+
880  /******************************************************************/
+
884 
+
885 
+
/// Public entry point: forwards items and ranks to the private
/// ScatterToBlocked overload selected at compile time by WARP_TIME_SLICING.
__device__ __forceinline__ void ScatterToBlocked(
    T   items[ITEMS_PER_THREAD],
    int ranks[ITEMS_PER_THREAD])
{
    Int2Type<WARP_TIME_SLICING> slicing_tag;
    ScatterToBlocked(items, ranks, slicing_tag);
}
+
897 
+
898 
+
/// Public entry point: forwards items and ranks to the private
/// ScatterToStriped overload selected at compile time by WARP_TIME_SLICING.
__device__ __forceinline__ void ScatterToStriped(
    T   items[ITEMS_PER_THREAD],
    int ranks[ITEMS_PER_THREAD])
{
    Int2Type<WARP_TIME_SLICING> slicing_tag;
    ScatterToStriped(items, ranks, slicing_tag);
}
+
910 
+
912 
+
913 
+
914 };
+
915 
+
916 } // CUB namespace
+
917 CUB_NS_POSTFIX // Optional outer namespace(s)
+
918 
+
+ + + + + diff --git a/docs/html/block__histogram_8cuh_source.html b/docs/html/block__histogram_8cuh_source.html new file mode 100644 index 0000000000..c5a79d1882 --- /dev/null +++ b/docs/html/block__histogram_8cuh_source.html @@ -0,0 +1,310 @@ + + + + + + + +CUB: block_histogram.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
block_histogram.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include "specializations/block_histogram_sort.cuh"
+
37 #include "specializations/block_histogram_atomic.cuh"
+
38 #include "../util_arch.cuh"
+
39 #include "../util_namespace.cuh"
+
40 
+
42 CUB_NS_PREFIX
+
43 
+
45 namespace cub {
+
46 
+
47 
+
48 /******************************************************************************
+
49  * Algorithmic variants
+
50  ******************************************************************************/
+
51 
+ +
56 {
+
57 
+ +
68 
+
69 
+ +
81 };
+
82 
+
83 
+
84 
+
85 /******************************************************************************
+
86  * Block histogram
+
87  ******************************************************************************/
+
88 
+
89 
+
143 template <
+
144  typename T,
+
145  int BLOCK_THREADS,
+
146  int ITEMS_PER_THREAD,
+
147  int BINS,
+ + +
150 {
+
151 private:
+
152 
+
153  /******************************************************************************
+
154  * Constants and type definitions
+
155  ******************************************************************************/
+
156 
+
163  static const BlockHistogramAlgorithm SAFE_ALGORITHM =
+
164  ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (CUB_PTX_ARCH < 120)) ?
+ +
166  ALGORITHM;
+
167 
+
169  typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT),
+
170  BlockHistogramSort<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS>,
+
171  BlockHistogramAtomic<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS> >::Type InternalBlockHistogram;
+
172 
+
174  typedef typename InternalBlockHistogram::TempStorage _TempStorage;
+
175 
+
176 
+
177  /******************************************************************************
+
178  * Thread fields
+
179  ******************************************************************************/
+
180 
+
182  _TempStorage &temp_storage;
+
183 
+
185  int linear_tid;
+
186 
+
187 
+
188  /******************************************************************************
+
189  * Utility methods
+
190  ******************************************************************************/
+
191 
+
193  __device__ __forceinline__ _TempStorage& PrivateStorage()
+
194  {
+
195  __shared__ _TempStorage private_storage;
+
196  return private_storage;
+
197  }
+
198 
+
199 
+
200 public:
+
201 
+
203  struct TempStorage : Uninitialized<_TempStorage> {};
+
204 
+
205 
+
206  /******************************************************************/
+
210 
+
214  __device__ __forceinline__ BlockHistogram()
+
215  :
+
216  temp_storage(PrivateStorage()),
+
217  linear_tid(threadIdx.x)
+
218  {}
+
219 
+
220 
+
224  __device__ __forceinline__ BlockHistogram(
+
225  TempStorage &temp_storage)
+
226  :
+
227  temp_storage(temp_storage.Alias()),
+
228  linear_tid(threadIdx.x)
+
229  {}
+
230 
+
231 
+
235  __device__ __forceinline__ BlockHistogram(
+
236  int linear_tid)
+
237  :
+
238  temp_storage(PrivateStorage()),
+
239  linear_tid(linear_tid)
+
240  {}
+
241 
+
242 
+
246  __device__ __forceinline__ BlockHistogram(
+
247  TempStorage &temp_storage,
+
248  int linear_tid)
+
249  :
+
250  temp_storage(temp_storage.Alias()),
+
251  linear_tid(linear_tid)
+
252  {}
+
253 
+
254 
+
256  /******************************************************************/
+
260 
+
261 
+
297  template <typename HistoCounter>
+
298  __device__ __forceinline__ void InitHistogram(HistoCounter histogram[BINS])
+
299  {
+
300  // Initialize histogram bin counts to zeros
+
301  int histo_offset = 0;
+
302 
+
303  #pragma unroll
+
304  for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+
305  {
+
306  histogram[histo_offset + linear_tid] = 0;
+
307  }
+
308  // Finish up with guarded initialization if necessary
+
309  if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+
310  {
+
311  histogram[histo_offset + linear_tid] = 0;
+
312  }
+
313  }
+
314 
+
315 
+
349  template <
+
350  typename HistoCounter>
+
351  __device__ __forceinline__ void Histogram(
+
352  T (&items)[ITEMS_PER_THREAD],
+
353  HistoCounter histogram[BINS])
+
354  {
+
355  // Initialize histogram bin counts to zeros
+
356  InitHistogram(histogram);
+
357 
+
358  // Composite the histogram
+
359  InternalBlockHistogram(temp_storage, linear_tid).Composite(items, histogram);
+
360  }
+
361 
+
362 
+
363 
+
401  template <
+
402  typename HistoCounter>
+
403  __device__ __forceinline__ void Composite(
+
404  T (&items)[ITEMS_PER_THREAD],
+
405  HistoCounter histogram[BINS])
+
406  {
+
407  InternalBlockHistogram(temp_storage, linear_tid).Composite(items, histogram);
+
408  }
+
409 
+
410 };
+
411 
+
412 } // CUB namespace
+
413 CUB_NS_POSTFIX // Optional outer namespace(s)
+
414 
+
+ + + + + diff --git a/docs/html/block__load_8cuh_source.html b/docs/html/block__load_8cuh_source.html new file mode 100644 index 0000000000..a9f4a12498 --- /dev/null +++ b/docs/html/block__load_8cuh_source.html @@ -0,0 +1,787 @@ + + + + + + + +CUB: block_load.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
block_load.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include <iterator>
+
37 
+
38 #include "../util_namespace.cuh"
+
39 #include "../util_macro.cuh"
+
40 #include "../util_type.cuh"
+
41 #include "../util_vector.cuh"
+
42 #include "../thread/thread_load.cuh"
+
43 #include "block_exchange.cuh"
+
44 
+
46 CUB_NS_PREFIX
+
47 
+
49 namespace cub {
+
50 
+
57 /******************************************************************/
+
61 
+
62 
+
73 template <
+
74  PtxLoadModifier MODIFIER,
+
75  typename T,
+
76  int ITEMS_PER_THREAD,
+
77  typename InputIteratorRA>
+
78 __device__ __forceinline__ void LoadBlocked(
+
79  int linear_tid,
+
80  InputIteratorRA block_itr,
+
81  T (&items)[ITEMS_PER_THREAD])
+
82 {
+
83  // Load directly in thread-blocked order
+
84  #pragma unroll
+
85  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
86  {
+
87  items[ITEM] = ThreadLoad<MODIFIER>(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM);
+
88  }
+
89 }
+
90 
+
91 
+
102 template <
+
103  PtxLoadModifier MODIFIER,
+
104  typename T,
+
105  int ITEMS_PER_THREAD,
+
106  typename InputIteratorRA>
+
107 __device__ __forceinline__ void LoadBlocked(
+
108  int linear_tid,
+
109  InputIteratorRA block_itr,
+
110  T (&items)[ITEMS_PER_THREAD],
+
111  int valid_items)
+
112 {
+
113  int bounds = valid_items - (linear_tid * ITEMS_PER_THREAD);
+
114 
+
115  #pragma unroll
+
116  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
117  {
+
118  if (ITEM < bounds)
+
119  {
+
120  items[ITEM] = ThreadLoad<MODIFIER>(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM);
+
121  }
+
122  }
+
123 }
+
124 
+
125 
+
136 template <
+
137  PtxLoadModifier MODIFIER,
+
138  typename T,
+
139  int ITEMS_PER_THREAD,
+
140  typename InputIteratorRA>
+
141 __device__ __forceinline__ void LoadBlocked(
+
142  int linear_tid,
+
143  InputIteratorRA block_itr,
+
144  T (&items)[ITEMS_PER_THREAD],
+
145  int valid_items,
+
146  T oob_default)
+
147 {
+
148  int bounds = valid_items - (linear_tid * ITEMS_PER_THREAD);
+
149 
+
150  #pragma unroll
+
151  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
152  {
+
153  items[ITEM] = (ITEM < bounds) ?
+
154  ThreadLoad<MODIFIER>(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM) :
+
155  oob_default;
+
156  }
+
157 }
+
158 
+
159 
+
160 
+
162 /******************************************************************/
+
166 
+
167 
+
179 template <
+
180  PtxLoadModifier MODIFIER,
+
181  int BLOCK_THREADS,
+
182  typename T,
+
183  int ITEMS_PER_THREAD,
+
184  typename InputIteratorRA>
+
185 __device__ __forceinline__ void LoadStriped(
+
186  int linear_tid,
+
187  InputIteratorRA block_itr,
+
188  T (&items)[ITEMS_PER_THREAD])
+
189 {
+
190  #pragma unroll
+
191  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
192  {
+
193  items[ITEM] = ThreadLoad<MODIFIER>(block_itr + (ITEM * BLOCK_THREADS) + linear_tid);
+
194  }
+
195 }
+
196 
+
197 
+
209 template <
+
210  PtxLoadModifier MODIFIER,
+
211  int BLOCK_THREADS,
+
212  typename T,
+
213  int ITEMS_PER_THREAD,
+
214  typename InputIteratorRA>
+
215 __device__ __forceinline__ void LoadStriped(
+
216  int linear_tid,
+
217  InputIteratorRA block_itr,
+
218  T (&items)[ITEMS_PER_THREAD],
+
219  int valid_items)
+
220 {
+
221  int bounds = valid_items - linear_tid;
+
222 
+
223  #pragma unroll
+
224  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
225  {
+
226  if (ITEM * BLOCK_THREADS < bounds)
+
227  {
+
228  items[ITEM] = ThreadLoad<MODIFIER>(block_itr + linear_tid + (ITEM * BLOCK_THREADS));
+
229  }
+
230  }
+
231 }
+
232 
+
233 
+
245 template <
+
246  PtxLoadModifier MODIFIER,
+
247  int BLOCK_THREADS,
+
248  typename T,
+
249  int ITEMS_PER_THREAD,
+
250  typename InputIteratorRA>
+
251 __device__ __forceinline__ void LoadStriped(
+
252  int linear_tid,
+
253  InputIteratorRA block_itr,
+
254  T (&items)[ITEMS_PER_THREAD],
+
255  int valid_items,
+
256  T oob_default)
+
257 {
+
258  int bounds = valid_items - linear_tid;
+
259 
+
260  #pragma unroll
+
261  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
262  {
+
263  items[ITEM] = (ITEM * BLOCK_THREADS < bounds) ?
+
264  ThreadLoad<MODIFIER>(block_itr + linear_tid + (ITEM * BLOCK_THREADS)) :
+
265  oob_default;
+
266  }
+
267 }
+
268 
+
269 
+
270 
+
272 /******************************************************************/
+
276 
+
277 
+
291 template <
+
292  PtxLoadModifier MODIFIER,
+
293  typename T,
+
294  int ITEMS_PER_THREAD,
+
295  typename InputIteratorRA>
+
296 __device__ __forceinline__ void LoadWarpStriped(
+
297  int linear_tid,
+
298  InputIteratorRA block_itr,
+
299  T (&items)[ITEMS_PER_THREAD])
+
300 {
+
301  int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1);
+
302  int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS;
+
303  int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD;
+
304 
+
305  // Load directly in warp-striped order
+
306  #pragma unroll
+
307  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
308  {
+
309  items[ITEM] = ThreadLoad<MODIFIER>(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS));
+
310  }
+
311 }
+
312 
+
313 
+
327 template <
+
328  PtxLoadModifier MODIFIER,
+
329  typename T,
+
330  int ITEMS_PER_THREAD,
+
331  typename InputIteratorRA>
+
332 __device__ __forceinline__ void LoadWarpStriped(
+
333  int linear_tid,
+
334  InputIteratorRA block_itr,
+
335  T (&items)[ITEMS_PER_THREAD],
+
336  int valid_items)
+
337 {
+
338  int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1);
+
339  int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS;
+
340  int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD;
+
341  int bounds = valid_items - warp_offset - tid;
+
342 
+
343  // Load directly in warp-striped order
+
344  #pragma unroll
+
345  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
346  {
+
347  if ((ITEM * PtxArchProps::WARP_THREADS) < bounds)
+
348  {
+
349  items[ITEM] = ThreadLoad<MODIFIER>(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS));
+
350  }
+
351  }
+
352 }
+
353 
+
354 
+
368 template <
+
369  PtxLoadModifier MODIFIER,
+
370  typename T,
+
371  int ITEMS_PER_THREAD,
+
372  typename InputIteratorRA>
+
373 __device__ __forceinline__ void LoadWarpStriped(
+
374  int linear_tid,
+
375  InputIteratorRA block_itr,
+
376  T (&items)[ITEMS_PER_THREAD],
+
377  int valid_items,
+
378  T oob_default)
+
379 {
+
380  int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1);
+
381  int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS;
+
382  int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD;
+
383  int bounds = valid_items - warp_offset - tid;
+
384 
+
385  // Load directly in warp-striped order
+
386  #pragma unroll
+
387  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
388  {
+
389  items[ITEM] = ((ITEM * PtxArchProps::WARP_THREADS) < bounds) ?
+
390  ThreadLoad<MODIFIER>(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS)) :
+
391  oob_default;
+
392  }
+
393 }
+
394 
+
395 
+
396 
+
398 /******************************************************************/
+
402 
+
418 template <
+
419  PtxLoadModifier MODIFIER,
+
420  typename T,
+
421  int ITEMS_PER_THREAD>
+
422 __device__ __forceinline__ void LoadBlockedVectorized(
+
423  int linear_tid,
+
424  T *block_ptr,
+
425  T (&items)[ITEMS_PER_THREAD])
+
426 {
+
427  enum
+
428  {
+
429  // Maximum CUDA vector size is 4 elements
+
430  MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
+
431 
+
432  // Vector size must be a power of two and an even divisor of the items per thread
+
433  VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
+
434  MAX_VEC_SIZE :
+
435  1,
+
436 
+
437  VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
+
438  };
+
439 
+
440  // Vector type
+
441  typedef typename VectorHelper<T, VEC_SIZE>::Type Vector;
+
442 
+
443  // Alias local data (use raw_items array here which should get optimized away to prevent conservative PTXAS lmem spilling)
+
444  T raw_items[ITEMS_PER_THREAD];
+
445 
+
446  // Direct-load using vector types
+
447  LoadBlocked<MODIFIER>(
+
448  linear_tid,
+
449  reinterpret_cast<Vector *>(block_ptr),
+
450  reinterpret_cast<Vector (&)[VECTORS_PER_THREAD]>(raw_items));
+
451 
+
452  // Copy
+
453  #pragma unroll
+
454  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
455  {
+
456  items[ITEM] = raw_items[ITEM];
+
457  }
+
458 }
+
459 
+
460 
+
462  // end group IoModule
+
464 
+
465 
+
466 
+
467 //-----------------------------------------------------------------------------
+
468 // Generic BlockLoad abstraction
+
469 //-----------------------------------------------------------------------------
+
470 
+ +
475 {
+ +
488 
+ +
510 
+ +
529 
+
530 
+ +
552 };
+
553 
+
554 
+
616 template <
+
617  typename InputIteratorRA,
+
618  int BLOCK_THREADS,
+
619  int ITEMS_PER_THREAD,
+ +
621  PtxLoadModifier MODIFIER = LOAD_DEFAULT,
+
622  bool WARP_TIME_SLICING = false>
+ +
624 {
+
625 private:
+
626 
+
627  /******************************************************************************
+
628  * Constants and typed definitions
+
629  ******************************************************************************/
+
630 
+
631  // Data type of input iterator
+
632  typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
+
633 
+
634 
+
635  /******************************************************************************
+
636  * Algorithmic variants
+
637  ******************************************************************************/
+
638 
+
640  template <BlockLoadAlgorithm _POLICY, int DUMMY = 0>
+
641  struct LoadInternal;
+
642 
+
643 
+
647  template <int DUMMY>
+
648  struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
+
649  {
+
651  typedef NullType TempStorage;
+
652 
+
654  int linear_tid;
+
655 
+
657  __device__ __forceinline__ LoadInternal(
+
658  TempStorage &temp_storage,
+
659  int linear_tid)
+
660  :
+
661  linear_tid(linear_tid)
+
662  {}
+
663 
+
665  __device__ __forceinline__ void Load(
+
666  InputIteratorRA block_itr,
+
667  T (&items)[ITEMS_PER_THREAD])
+
668  {
+
669  LoadBlocked<MODIFIER>(linear_tid, block_itr, items);
+
670  }
+
671 
+
673  __device__ __forceinline__ void Load(
+
674  InputIteratorRA block_itr,
+
675  T (&items)[ITEMS_PER_THREAD],
+
676  int valid_items)
+
677  {
+
678  LoadBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items);
+
679  }
+
680 
+
682  __device__ __forceinline__ void Load(
+
683  InputIteratorRA block_itr,
+
684  T (&items)[ITEMS_PER_THREAD],
+
685  int valid_items,
+
686  T oob_default)
+
687  {
+
688  LoadBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items, oob_default);
+
689  }
+
690 
+
691  };
+
692 
+
693 
+
697  template <int DUMMY>
+
698  struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
+
699  {
+
701  typedef NullType TempStorage;
+
702 
+
704  int linear_tid;
+
705 
+
707  __device__ __forceinline__ LoadInternal(
+
708  TempStorage &temp_storage,
+
709  int linear_tid)
+
710  :
+
711  linear_tid(linear_tid)
+
712  {}
+
713 
+
715  __device__ __forceinline__ void Load(
+
716  T *block_ptr,
+
717  T (&items)[ITEMS_PER_THREAD])
+
718  {
+
719  LoadBlockedVectorized<MODIFIER>(linear_tid, block_ptr, items);
+
720  }
+
721 
+
723  template <
+
724  typename T,
+
725  typename _InputIteratorRA>
+
726  __device__ __forceinline__ void Load(
+
727  _InputIteratorRA block_itr,
+
728  T (&items)[ITEMS_PER_THREAD])
+
729  {
+
730  LoadBlocked<MODIFIER>(linear_tid, block_itr, items);
+
731  }
+
732 
+
734  __device__ __forceinline__ void Load(
+
735  InputIteratorRA block_itr,
+
736  T (&items)[ITEMS_PER_THREAD],
+
737  int valid_items)
+
738  {
+
739  LoadBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items);
+
740  }
+
741 
+
743  __device__ __forceinline__ void Load(
+
744  InputIteratorRA block_itr,
+
745  T (&items)[ITEMS_PER_THREAD],
+
746  int valid_items,
+
747  T oob_default)
+
748  {
+
749  LoadBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items, oob_default);
+
750  }
+
751 
+
752  };
+
753 
+
754 
+
758  template <int DUMMY>
+
759  struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
+
760  {
+
761  // BlockExchange utility type for keys
+ +
763 
+
765  typedef typename BlockExchange::TempStorage _TempStorage;
+
766 
+
768  struct TempStorage : Uninitialized<_TempStorage> {};
+
769 
+
771  _TempStorage &temp_storage;
+
772 
+
774  int linear_tid;
+
775 
+
777  __device__ __forceinline__ LoadInternal(
+
778  TempStorage &temp_storage,
+
779  int linear_tid)
+
780  :
+
781  temp_storage(temp_storage.Alias()),
+
782  linear_tid(linear_tid)
+
783  {}
+
784 
+
786  __device__ __forceinline__ void Load(
+
787  InputIteratorRA block_itr,
+
788  T (&items)[ITEMS_PER_THREAD])
+
789  {
+
790  LoadStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items);
+
791  BlockExchange(temp_storage, linear_tid).StripedToBlocked(items);
+
792  }
+
793 
+
795  __device__ __forceinline__ void Load(
+
796  InputIteratorRA block_itr,
+
797  T (&items)[ITEMS_PER_THREAD],
+
798  int valid_items)
+
799  {
+
800  LoadStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+
801  BlockExchange(temp_storage, linear_tid).StripedToBlocked(items);
+
802  }
+
803 
+
805  __device__ __forceinline__ void Load(
+
806  InputIteratorRA block_itr,
+
807  T (&items)[ITEMS_PER_THREAD],
+
808  int valid_items,
+
809  T oob_default)
+
810  {
+
811  LoadStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
+
812  BlockExchange(temp_storage, linear_tid).StripedToBlocked(items);
+
813  }
+
814 
+
815  };
+
816 
+
817 
+
821  template <int DUMMY>
+
822  struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
+
823  {
+
824  enum
+
825  {
+
826  WARP_THREADS = PtxArchProps::WARP_THREADS
+
827  };
+
828 
+
829  // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+
830  CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
831 
+
832  // BlockExchange utility type for keys
+
833  typedef BlockExchange<T, BLOCK_THREADS, ITEMS_PER_THREAD, WARP_TIME_SLICING> BlockExchange;
+
834 
+
836  typedef typename BlockExchange::TempStorage _TempStorage;
+
837 
+
839  struct TempStorage : Uninitialized<_TempStorage> {};
+
840 
+
842  _TempStorage &temp_storage;
+
843 
+
845  int linear_tid;
+
846 
+
848  __device__ __forceinline__ LoadInternal(
+
849  TempStorage &temp_storage,
+
850  int linear_tid)
+
851  :
+
852  temp_storage(temp_storage.Alias()),
+
853  linear_tid(linear_tid)
+
854  {}
+
855 
+
857  __device__ __forceinline__ void Load(
+
858  InputIteratorRA block_itr,
+
859  T (&items)[ITEMS_PER_THREAD])
+
860  {
+
861  LoadWarpStriped<MODIFIER>(linear_tid, block_itr, items);
+
862  BlockExchange(temp_storage, linear_tid).WarpStripedToBlocked(items);
+
863  }
+
864 
+
866  __device__ __forceinline__ void Load(
+
867  InputIteratorRA block_itr,
+
868  T (&items)[ITEMS_PER_THREAD],
+
869  int valid_items)
+
870  {
+
871  LoadWarpStriped<MODIFIER>(linear_tid, block_itr, items, valid_items);
+
872  BlockExchange(temp_storage, linear_tid).WarpStripedToBlocked(items);
+
873  }
+
874 
+
875 
+
877  __device__ __forceinline__ void Load(
+
878  InputIteratorRA block_itr,
+
879  T (&items)[ITEMS_PER_THREAD],
+
880  int valid_items,
+
881  T oob_default)
+
882  {
+
883  LoadWarpStriped<MODIFIER>(linear_tid, block_itr, items, valid_items, oob_default);
+
884  BlockExchange(temp_storage, linear_tid).WarpStripedToBlocked(items);
+
885  }
+
886  };
+
887 
+
888 
+
889  /******************************************************************************
+
890  * Type definitions
+
891  ******************************************************************************/
+
892 
+
894  typedef LoadInternal<ALGORITHM> InternalLoad;
+
895 
+
896 
+
898  typedef typename InternalLoad::TempStorage _TempStorage;
+
899 
+
900 
+
901  /******************************************************************************
+
902  * Utility methods
+
903  ******************************************************************************/
+
904 
+
906  __device__ __forceinline__ _TempStorage& PrivateStorage()
+
907  {
+
908  __shared__ _TempStorage private_storage;
+
909  return private_storage;
+
910  }
+
911 
+
912 
+
913  /******************************************************************************
+
914  * Thread fields
+
915  ******************************************************************************/
+
916 
+
918  _TempStorage &temp_storage;
+
919 
+
921  int linear_tid;
+
922 
+
923 public:
+
924 
+
926  struct TempStorage : Uninitialized<_TempStorage> {};
+
927 
+
928 
+
929  /******************************************************************/
+
933 
+
937  __device__ __forceinline__ BlockLoad()
+
938  :
+
939  temp_storage(PrivateStorage()),
+
940  linear_tid(threadIdx.x)
+
941  {}
+
942 
+
943 
+
947  __device__ __forceinline__ BlockLoad(
+
948  TempStorage &temp_storage)
+
949  :
+
950  temp_storage(temp_storage.Alias()),
+
951  linear_tid(threadIdx.x)
+
952  {}
+
953 
+
954 
+
958  __device__ __forceinline__ BlockLoad(
+
959  int linear_tid)
+
960  :
+
961  temp_storage(PrivateStorage()),
+
962  linear_tid(linear_tid)
+
963  {}
+
964 
+
965 
+
969  __device__ __forceinline__ BlockLoad(
+
970  TempStorage &temp_storage,
+
971  int linear_tid)
+
972  :
+
973  temp_storage(temp_storage.Alias()),
+
974  linear_tid(linear_tid)
+
975  {}
+
976 
+
977 
+
978 
+
980  /******************************************************************/
+
984 
+
985 
+
1019  __device__ __forceinline__ void Load(
+
1020  InputIteratorRA block_itr,
+
1021  T (&items)[ITEMS_PER_THREAD])
+
1022  {
+
1023  InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
+
1024  }
+
1025 
+
1026 
+
1061  __device__ __forceinline__ void Load(
+
1062  InputIteratorRA block_itr,
+
1063  T (&items)[ITEMS_PER_THREAD],
+
1064  int valid_items)
+
1065  {
+
1066  InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
+
1067  }
+
1068 
+
1069 
+
1105  __device__ __forceinline__ void Load(
+
1106  InputIteratorRA block_itr,
+
1107  T (&items)[ITEMS_PER_THREAD],
+
1108  int valid_items,
+
1109  T oob_default)
+
1110  {
+
1111  InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);
+
1112  }
+
1113 
+
1114 
+
1116 
+
1117 };
+
1118 
+
1119 
+
1120 } // CUB namespace
+
1121 CUB_NS_POSTFIX // Optional outer namespace(s)
+
1122 
+
+ + + + + diff --git a/docs/html/block__radix__sort_8cuh_source.html b/docs/html/block__radix__sort_8cuh_source.html new file mode 100644 index 0000000000..c168b03cd5 --- /dev/null +++ b/docs/html/block__radix__sort_8cuh_source.html @@ -0,0 +1,465 @@ + + + + + + + +CUB: block_radix_sort.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
block_radix_sort.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
35 #pragma once
+
36 
+
37 #include "../util_namespace.cuh"
+
38 #include "../util_arch.cuh"
+
39 #include "../util_type.cuh"
+
40 #include "block_exchange.cuh"
+
41 #include "block_radix_rank.cuh"
+
42 
+
44 CUB_NS_PREFIX
+
45 
+
47 namespace cub {
+
48 
+
113 template <
+
114  typename Key,
+
115  int BLOCK_THREADS,
+
116  int ITEMS_PER_THREAD,
+
117  typename Value = NullType,
+
118  int RADIX_BITS = 4,
+
119  bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false,
+
120  BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
+
121  cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte>
+ +
123 {
+
124 private:
+
125 
+
126  /******************************************************************************
+
127  * Constants and type definitions
+
128  ******************************************************************************/
+
129 
+
130  // Key traits and unsigned bits type
+ +
132  typedef typename KeyTraits::UnsignedBits UnsignedBits;
+
133 
+
135  typedef BlockRadixRank<BLOCK_THREADS, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG> BlockRadixRank;
+
136 
+ +
139 
+ +
142 
+
144  struct _TempStorage
+
145  {
+
146  union
+
147  {
+
148  typename BlockRadixRank::TempStorage ranking_storage;
+
149  typename BlockExchangeKeys::TempStorage exchange_keys;
+
150  typename BlockExchangeValues::TempStorage exchange_values;
+
151  };
+
152  };
+
153 
+
154  /******************************************************************************
+
155  * Utility methods
+
156  ******************************************************************************/
+
157 
+
159  __device__ __forceinline__ _TempStorage& PrivateStorage()
+
160  {
+
161  __shared__ _TempStorage private_storage;
+
162  return private_storage;
+
163  }
+
164 
+
165 
+
166  /******************************************************************************
+
167  * Thread fields
+
168  ******************************************************************************/
+
169 
+
171  _TempStorage &temp_storage;
+
172 
+
174  int linear_tid;
+
175 
+
176 
+
177 public:
+
178 
+
180  struct TempStorage : Uninitialized<_TempStorage> {};
+
181 
+
182 
+
183  /******************************************************************/
+
187 
+
191  __device__ __forceinline__ BlockRadixSort()
+
192  :
+
193  temp_storage(PrivateStorage()),
+
194  linear_tid(threadIdx.x)
+
195  {}
+
196 
+
197 
+
201  __device__ __forceinline__ BlockRadixSort(
+
202  TempStorage &temp_storage)
+
203  :
+
204  temp_storage(temp_storage.Alias()),
+
205  linear_tid(threadIdx.x)
+
206  {}
+
207 
+
208 
+
212  __device__ __forceinline__ BlockRadixSort(
+
213  int linear_tid)
+
214  :
+
215  temp_storage(PrivateStorage()),
+
216  linear_tid(linear_tid)
+
217  {}
+
218 
+
219 
+
223  __device__ __forceinline__ BlockRadixSort(
+
224  TempStorage &temp_storage,
+
225  int linear_tid)
+
226  :
+
227  temp_storage(temp_storage.Alias()),
+
228  linear_tid(linear_tid)
+
229  {}
+
230 
+
231 
+
232 
+
234  /******************************************************************/
+
238 
+
273  __device__ __forceinline__ void Sort(
+
274  Key (&keys)[ITEMS_PER_THREAD],
+
275  int begin_bit = 0,
+
276  int end_bit = sizeof(Key) * 8)
+
277  {
+
278  UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
+
279  reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
+
280 
+
281  // Twiddle bits if necessary
+
282  #pragma unroll
+
283  for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+
284  {
+
285  unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
+
286  }
+
287 
+
288  // Radix sorting passes
+
289  while (true)
+
290  {
+
291  // Rank the blocked keys
+
292  int ranks[ITEMS_PER_THREAD];
+
293  BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit);
+
294  begin_bit += RADIX_BITS;
+
295 
+
296  __syncthreads();
+
297 
+
298  // Exchange keys through shared memory in blocked arrangement
+
299  BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks);
+
300 
+
301  // Quit if done
+
302  if (begin_bit >= end_bit) break;
+
303 
+
304  __syncthreads();
+
305  }
+
306 
+
307  // Untwiddle bits if necessary
+
308  #pragma unroll
+
309  for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+
310  {
+
311  unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
+
312  }
+
313  }
+
314 
+
315 
+
358  __device__ __forceinline__ void Sort(
+
359  Key (&keys)[ITEMS_PER_THREAD],
+
360  Value (&values)[ITEMS_PER_THREAD],
+
361  int begin_bit = 0,
+
362  int end_bit = sizeof(Key) * 8)
+
363  {
+
364  UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
+
365  reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
+
366 
+
367  // Twiddle bits if necessary
+
368  #pragma unroll
+
369  for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+
370  {
+
371  unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
+
372  }
+
373 
+
374  // Radix sorting passes
+
375  while (true)
+
376  {
+
377  // Rank the blocked keys
+
378  int ranks[ITEMS_PER_THREAD];
+
379  BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit);
+
380  begin_bit += RADIX_BITS;
+
381 
+
382  __syncthreads();
+
383 
+
384  // Exchange keys through shared memory in blocked arrangement
+
385  BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks);
+
386 
+
387  __syncthreads();
+
388 
+
389  // Exchange values through shared memory in blocked arrangement
+
390  BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToBlocked(values, ranks);
+
391 
+
392  // Quit if done
+
393  if (begin_bit >= end_bit) break;
+
394 
+
395  __syncthreads();
+
396  }
+
397 
+
398  // Untwiddle bits if necessary
+
399  #pragma unroll
+
400  for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+
401  {
+
402  unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
+
403  }
+
404  }
+
405 
+
406 
+
408  /******************************************************************/
+
412 
+
413 
+
449  __device__ __forceinline__ void SortBlockedToStriped(
+
450  Key (&keys)[ITEMS_PER_THREAD],
+
451  int begin_bit = 0,
+
452  int end_bit = sizeof(Key) * 8)
+
453  {
+
454  UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
+
455  reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
+
456 
+
457  // Twiddle bits if necessary
+
458  #pragma unroll
+
459  for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+
460  {
+
461  unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
+
462  }
+
463 
+
464  // Radix sorting passes
+
465  while (true)
+
466  {
+
467  // Rank the blocked keys
+
468  int ranks[ITEMS_PER_THREAD];
+
469  BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit);
+
470  begin_bit += RADIX_BITS;
+
471 
+
472  __syncthreads();
+
473 
+
474  // Check if this is the last pass
+
475  if (begin_bit >= end_bit)
+
476  {
+
477  // Last pass exchanges keys through shared memory in striped arrangement
+
478  BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToStriped(keys, ranks);
+
479 
+
480  // Quit
+
481  break;
+
482  }
+
483 
+
484  // Exchange keys through shared memory in blocked arrangement
+
485  BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks);
+
486 
+
487  __syncthreads();
+
488  }
+
489 
+
490  // Untwiddle bits if necessary
+
491  #pragma unroll
+
492  for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+
493  {
+
494  unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
+
495  }
+
496  }
+
497 
+
498 
+
541  __device__ __forceinline__ void SortBlockedToStriped(
+
542  Key (&keys)[ITEMS_PER_THREAD],
+
543  Value (&values)[ITEMS_PER_THREAD],
+
544  int begin_bit = 0,
+
545  int end_bit = sizeof(Key) * 8)
+
546  {
+
547  UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
+
548  reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
+
549 
+
550  // Twiddle bits if necessary
+
551  #pragma unroll
+
552  for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+
553  {
+
554  unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
+
555  }
+
556 
+
557  // Radix sorting passes
+
558  while (true)
+
559  {
+
560  // Rank the blocked keys
+
561  int ranks[ITEMS_PER_THREAD];
+
562  BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit);
+
563  begin_bit += RADIX_BITS;
+
564 
+
565  __syncthreads();
+
566 
+
567  // Check if this is the last pass
+
568  if (begin_bit >= end_bit)
+
569  {
+
570  // Last pass exchanges keys through shared memory in striped arrangement
+
571  BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToStriped(keys, ranks);
+
572 
+
573  __syncthreads();
+
574 
+
575  // Last pass exchanges through shared memory in striped arrangement
+
576  BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToStriped(values, ranks);
+
577 
+
578  // Quit
+
579  break;
+
580  }
+
581 
+
582  // Exchange keys through shared memory in blocked arrangement
+
583  BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks);
+
584 
+
585  __syncthreads();
+
586 
+
587  // Exchange values through shared memory in blocked arrangement
+
588  BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToBlocked(values, ranks);
+
589 
+
590  __syncthreads();
+
591  }
+
592 
+
593  // Untwiddle bits if necessary
+
594  #pragma unroll
+
595  for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+
596  {
+
597  unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
+
598  }
+
599  }
+
600 
+
601 
+
603 
+
604 };
+
605 
+
606 } // CUB namespace
+
607 CUB_NS_POSTFIX // Optional outer namespace(s)
+
608 
+
+ + + + + diff --git a/docs/html/block__reduce_8cuh_source.html b/docs/html/block__reduce_8cuh_source.html new file mode 100644 index 0000000000..ce3b1be64f --- /dev/null +++ b/docs/html/block__reduce_8cuh_source.html @@ -0,0 +1,336 @@ + + + + + + + +CUB: block_reduce.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
block_reduce.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include "specializations/block_reduce_raking.cuh"
+
37 #include "specializations/block_reduce_warp_reductions.cuh"
+
38 #include "../util_type.cuh"
+
39 #include "../thread/thread_operators.cuh"
+
40 #include "../util_namespace.cuh"
+
41 
+
43 CUB_NS_PREFIX
+
44 
+
46 namespace cub {
+
47 
+
48 
+
49 
+
50 /******************************************************************************
+
51  * Algorithmic variants
+
52  ******************************************************************************/
+
53 
+ +
59 {
+
60 
+ +
82 
+
83 
+ +
107 };
+
108 
+
109 
+
110 /******************************************************************************
+
111  * Block reduce
+
112  ******************************************************************************/
+
113 
+
168 template <
+
169  typename T,
+
170  int BLOCK_THREADS,
+ + +
173 {
+
174 private:
+
175 
+
176  /******************************************************************************
+
177  * Constants and typedefs
+
178  ******************************************************************************/
+
179 
+
181  typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
+
182  BlockReduceWarpReductions<T, BLOCK_THREADS>,
+
183  BlockReduceRaking<T, BLOCK_THREADS> >::Type InternalBlockReduce;
+
184 
+
186  typedef typename InternalBlockReduce::TempStorage _TempStorage;
+
187 
+
188 
+
189  /******************************************************************************
+
190  * Utility methods
+
191  ******************************************************************************/
+
192 
+
194  __device__ __forceinline__ _TempStorage& PrivateStorage()
+
195  {
+
196  __shared__ _TempStorage private_storage;
+
197  return private_storage;
+
198  }
+
199 
+
200 
+
201  /******************************************************************************
+
202  * Thread fields
+
203  ******************************************************************************/
+
204 
+
206  _TempStorage &temp_storage;
+
207 
+
209  int linear_tid;
+
210 
+
211 
+
212 public:
+
213 
+
215  struct TempStorage : Uninitialized<_TempStorage> {};
+
216 
+
217 
+
218  /******************************************************************/
+
222 
+
226  __device__ __forceinline__ BlockReduce()
+
227  :
+
228  temp_storage(PrivateStorage()),
+
229  linear_tid(threadIdx.x)
+
230  {}
+
231 
+
232 
+
236  __device__ __forceinline__ BlockReduce(
+
237  TempStorage &temp_storage)
+
238  :
+
239  temp_storage(temp_storage.Alias()),
+
240  linear_tid(threadIdx.x)
+
241  {}
+
242 
+
243 
+
247  __device__ __forceinline__ BlockReduce(
+
248  int linear_tid)
+
249  :
+
250  temp_storage(PrivateStorage()),
+
251  linear_tid(linear_tid)
+
252  {}
+
253 
+
254 
+
258  __device__ __forceinline__ BlockReduce(
+
259  TempStorage &temp_storage,
+
260  int linear_tid)
+
261  :
+
262  temp_storage(temp_storage.Alias()),
+
263  linear_tid(linear_tid)
+
264  {}
+
265 
+
266 
+
267 
+
269  /******************************************************************/
+
273 
+
274 
+
309  template <typename ReductionOp>
+
310  __device__ __forceinline__ T Reduce(
+
311  T input,
+
312  ReductionOp reduction_op)
+
313  {
+
314  return InternalBlockReduce(temp_storage, linear_tid).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
+
315  }
+
316 
+
317 
+
356  template <
+
357  int ITEMS_PER_THREAD,
+
358  typename ReductionOp>
+
359  __device__ __forceinline__ T Reduce(
+
360  T (&inputs)[ITEMS_PER_THREAD],
+
361  ReductionOp reduction_op)
+
362  {
+
363  // Reduce partials
+
364  T partial = ThreadReduce(inputs, reduction_op);
+
365  return Reduce(partial, reduction_op);
+
366  }
+
367 
+
368 
+
405  template <typename ReductionOp>
+
406  __device__ __forceinline__ T Reduce(
+
407  T input,
+
408  ReductionOp reduction_op,
+
409  int num_valid)
+
410  {
+
411  // Determine if we scan skip bounds checking
+
412  if (num_valid >= BLOCK_THREADS)
+
413  {
+
414  return InternalBlockReduce(temp_storage, linear_tid).template Reduce<true>(input, num_valid, reduction_op);
+
415  }
+
416  else
+
417  {
+
418  return InternalBlockReduce(temp_storage, linear_tid).template Reduce<false>(input, num_valid, reduction_op);
+
419  }
+
420  }
+
421 
+
422 
+
424  /******************************************************************/
+
428 
+
429 
+
461  __device__ __forceinline__ T Sum(
+
462  T input)
+
463  {
+
464  return InternalBlockReduce(temp_storage, linear_tid).template Sum<true>(input, BLOCK_THREADS);
+
465  }
+
466 
+
500  template <int ITEMS_PER_THREAD>
+
501  __device__ __forceinline__ T Sum(
+
502  T (&inputs)[ITEMS_PER_THREAD])
+
503  {
+
504  // Reduce partials
+
505  T partial = ThreadReduce(inputs, cub::Sum());
+
506  return Sum(partial);
+
507  }
+
508 
+
509 
+
542  __device__ __forceinline__ T Sum(
+
543  T input,
+
544  int num_valid)
+
545  {
+
546  // Determine if we scan skip bounds checking
+
547  if (num_valid >= BLOCK_THREADS)
+
548  {
+
549  return InternalBlockReduce(temp_storage, linear_tid).template Sum<true>(input, num_valid);
+
550  }
+
551  else
+
552  {
+
553  return InternalBlockReduce(temp_storage, linear_tid).template Sum<false>(input, num_valid);
+
554  }
+
555  }
+
556 
+
557 
+
559 };
+
560 
+
561 } // CUB namespace
+
562 CUB_NS_POSTFIX // Optional outer namespace(s)
+
563 
+
+ + + + + diff --git a/docs/html/block__scan_8cuh_source.html b/docs/html/block__scan_8cuh_source.html new file mode 100644 index 0000000000..01cc2bd765 --- /dev/null +++ b/docs/html/block__scan_8cuh_source.html @@ -0,0 +1,814 @@ + + + + + + + +CUB: block_scan.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
block_scan.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include "specializations/block_scan_raking.cuh"
+
37 #include "specializations/block_scan_warp_scans.cuh"
+
38 #include "../util_arch.cuh"
+
39 #include "../util_type.cuh"
+
40 #include "../util_namespace.cuh"
+
41 
+
43 CUB_NS_PREFIX
+
44 
+
46 namespace cub {
+
47 
+
48 
+
49 /******************************************************************************
+
50  * Algorithmic variants
+
51  ******************************************************************************/
+
52 
+ +
57 {
+
58 
+ +
78 
+
79 
+ +
88 
+
89 
+ +
108 };
+
109 
+
110 
+
111 /******************************************************************************
+
112  * Block scan
+
113  ******************************************************************************/
+
114 
+
182 template <
+
183  typename T,
+
184  int BLOCK_THREADS,
+ + +
187 {
+
188 private:
+
189 
+
190  /******************************************************************************
+
191  * Constants and typedefs
+
192  ******************************************************************************/
+
193 
+
200  static const BlockScanAlgorithm SAFE_ALGORITHM =
+
201  ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % PtxArchProps::WARP_THREADS != 0)) ?
+ +
203  ALGORITHM;
+
204 
+
206  typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
+
207  BlockScanWarpScans<T, BLOCK_THREADS>,
+
208  BlockScanRaking<T, BLOCK_THREADS, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)> >::Type InternalBlockScan;
+
209 
+
210 
+
212  typedef typename InternalBlockScan::TempStorage _TempStorage;
+
213 
+
214 
+
215  /******************************************************************************
+
216  * Thread fields
+
217  ******************************************************************************/
+
218 
+
220  _TempStorage &temp_storage;
+
221 
+
223  int linear_tid;
+
224 
+
225 
+
226  /******************************************************************************
+
227  * Utility methods
+
228  ******************************************************************************/
+
229 
+
231  __device__ __forceinline__ _TempStorage& PrivateStorage()
+
232  {
+
233  __shared__ _TempStorage private_storage;
+
234  return private_storage;
+
235  }
+
236 
+
237 
+
238 public:
+
239 
+
241  struct TempStorage : Uninitialized<_TempStorage> {};
+
242 
+
243 
+
244  /******************************************************************/
+
248 
+
252  __device__ __forceinline__ BlockScan()
+
253  :
+
254  temp_storage(PrivateStorage()),
+
255  linear_tid(threadIdx.x)
+
256  {}
+
257 
+
258 
+
262  __device__ __forceinline__ BlockScan(
+
263  TempStorage &temp_storage)
+
264  :
+
265  temp_storage(temp_storage.Alias()),
+
266  linear_tid(threadIdx.x)
+
267  {}
+
268 
+
269 
+
273  __device__ __forceinline__ BlockScan(
+
274  int linear_tid)
+
275  :
+
276  temp_storage(PrivateStorage()),
+
277  linear_tid(linear_tid)
+
278  {}
+
279 
+
280 
+
284  __device__ __forceinline__ BlockScan(
+
285  TempStorage &temp_storage,
+
286  int linear_tid)
+
287  :
+
288  temp_storage(temp_storage.Alias()),
+
289  linear_tid(linear_tid)
+
290  {}
+
291 
+
292 
+
293 
+
295  /******************************************************************/
+
299 
+
300 
+
335  __device__ __forceinline__ void ExclusiveSum(
+
336  T input,
+
337  T &output)
+
338  {
+
339  T block_aggregate;
+
340  InternalBlockScan(temp_storage, linear_tid).ExclusiveSum(input, output, block_aggregate);
+
341  }
+
342 
+
343 
+
380  __device__ __forceinline__ void ExclusiveSum(
+
381  T input,
+
382  T &output,
+
383  T &block_aggregate)
+
384  {
+
385  InternalBlockScan(temp_storage, linear_tid).ExclusiveSum(input, output, block_aggregate);
+
386  }
+
387 
+
388 
+
464  template <typename BlockPrefixOp>
+
465  __device__ __forceinline__ void ExclusiveSum(
+
466  T input,
+
467  T &output,
+
468  T &block_aggregate,
+
469  BlockPrefixOp &block_prefix_op)
+
470  {
+
471  InternalBlockScan(temp_storage, linear_tid).ExclusiveSum(input, output, block_aggregate, block_prefix_op);
+
472  }
+
473 
+
474 
+
476  /******************************************************************/
+
480 
+
481 
+
518  template <int ITEMS_PER_THREAD>
+
519  __device__ __forceinline__ void ExclusiveSum(
+
520  T (&input)[ITEMS_PER_THREAD],
+
521  T (&output)[ITEMS_PER_THREAD])
+
522  {
+
523  // Reduce consecutive thread items in registers
+
524  Sum scan_op;
+
525  T thread_partial = ThreadReduce(input, scan_op);
+
526 
+
527  // Exclusive threadblock-scan
+
528  ExclusiveSum(thread_partial, thread_partial);
+
529 
+
530  // Exclusive scan in registers with prefix
+
531  ThreadScanExclusive(input, output, scan_op, thread_partial);
+
532  }
+
533 
+
534 
+
573  template <int ITEMS_PER_THREAD>
+
574  __device__ __forceinline__ void ExclusiveSum(
+
575  T (&input)[ITEMS_PER_THREAD],
+
576  T (&output)[ITEMS_PER_THREAD],
+
577  T &block_aggregate)
+
578  {
+
579  // Reduce consecutive thread items in registers
+
580  Sum scan_op;
+
581  T thread_partial = ThreadReduce(input, scan_op);
+
582 
+
583  // Exclusive threadblock-scan
+
584  ExclusiveSum(thread_partial, thread_partial, block_aggregate);
+
585 
+
586  // Exclusive scan in registers with prefix
+
587  ThreadScanExclusive(input, output, scan_op, thread_partial);
+
588  }
+
589 
+
590 
+
677  template <
+
678  int ITEMS_PER_THREAD,
+
679  typename BlockPrefixOp>
+
680  __device__ __forceinline__ void ExclusiveSum(
+
681  T (&input)[ITEMS_PER_THREAD],
+
682  T (&output)[ITEMS_PER_THREAD],
+
683  T &block_aggregate,
+
684  BlockPrefixOp &block_prefix_op)
+
685  {
+
686  // Reduce consecutive thread items in registers
+
687  Sum scan_op;
+
688  T thread_partial = ThreadReduce(input, scan_op);
+
689 
+
690  // Exclusive threadblock-scan
+
691  ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_op);
+
692 
+
693  // Exclusive scan in registers with prefix
+
694  ThreadScanExclusive(input, output, scan_op, thread_partial);
+
695  }
+
696 
+
697 
+
698 
+
700  /******************************************************************/
+
704 
+
705 
+
743  template <typename ScanOp>
+
744  __device__ __forceinline__ void ExclusiveScan(
+
745  T input,
+
746  T &output,
+
747  T identity,
+
748  ScanOp scan_op)
+
749  {
+
750  T block_aggregate;
+
751  InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, identity, scan_op, block_aggregate);
+
752  }
+
753 
+
754 
+
794  template <typename ScanOp>
+
795  __device__ __forceinline__ void ExclusiveScan(
+
796  T input,
+
797  T &output,
+
798  const T &identity,
+
799  ScanOp scan_op,
+
800  T &block_aggregate)
+
801  {
+
802  InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, identity, scan_op, block_aggregate);
+
803  }
+
804 
+
805 
+
885  template <
+
886  typename ScanOp,
+
887  typename BlockPrefixOp>
+
888  __device__ __forceinline__ void ExclusiveScan(
+
889  T input,
+
890  T &output,
+
891  T identity,
+
892  ScanOp scan_op,
+
893  T &block_aggregate,
+
894  BlockPrefixOp &block_prefix_op)
+
895  {
+
896  InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, identity, scan_op, block_aggregate, block_prefix_op);
+
897  }
+
898 
+
899 
+
901  /******************************************************************/
+
905 
+
906 
+
948  template <
+
949  int ITEMS_PER_THREAD,
+
950  typename ScanOp>
+
951  __device__ __forceinline__ void ExclusiveScan(
+
952  T (&input)[ITEMS_PER_THREAD],
+
953  T (&output)[ITEMS_PER_THREAD],
+
954  const T &identity,
+
955  ScanOp scan_op)
+
956  {
+
957  // Reduce consecutive thread items in registers
+
958  T thread_partial = ThreadReduce(input, scan_op);
+
959 
+
960  // Exclusive threadblock-scan
+
961  ExclusiveScan(thread_partial, thread_partial, identity, scan_op);
+
962 
+
963  // Exclusive scan in registers with prefix
+
964  ThreadScanExclusive(input, output, scan_op, thread_partial);
+
965  }
+
966 
+
967 
+
1009  template <
+
1010  int ITEMS_PER_THREAD,
+
1011  typename ScanOp>
+
1012  __device__ __forceinline__ void ExclusiveScan(
+
1013  T (&input)[ITEMS_PER_THREAD],
+
1014  T (&output)[ITEMS_PER_THREAD],
+
1015  const T &identity,
+
1016  ScanOp scan_op,
+
1017  T &block_aggregate)
+
1018  {
+
1019  // Reduce consecutive thread items in registers
+
1020  T thread_partial = ThreadReduce(input, scan_op);
+
1021 
+
1022  // Exclusive threadblock-scan
+
1023  ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate);
+
1024 
+
1025  // Exclusive scan in registers with prefix
+
1026  ThreadScanExclusive(input, output, scan_op, thread_partial);
+
1027  }
+
1028 
+
1029 
+
1119  template <
+
1120  int ITEMS_PER_THREAD,
+
1121  typename ScanOp,
+
1122  typename BlockPrefixOp>
+
1123  __device__ __forceinline__ void ExclusiveScan(
+
1124  T (&input)[ITEMS_PER_THREAD],
+
1125  T (&output)[ITEMS_PER_THREAD],
+
1126  T identity,
+
1127  ScanOp scan_op,
+
1128  T &block_aggregate,
+
1129  BlockPrefixOp &block_prefix_op)
+
1130  {
+
1131  // Reduce consecutive thread items in registers
+
1132  T thread_partial = ThreadReduce(input, scan_op);
+
1133 
+
1134  // Exclusive threadblock-scan
+
1135  ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate, block_prefix_op);
+
1136 
+
1137  // Exclusive scan in registers with prefix
+
1138  ThreadScanExclusive(input, output, scan_op, thread_partial);
+
1139  }
+
1140 
+
1141 
+
1143 
+
1144 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
1145 
+
1146  /******************************************************************/
+
1150 
+
1151 
+
1163  template <typename ScanOp>
+
1164  __device__ __forceinline__ void ExclusiveScan(
+
1165  T input,
+
1166  T &output,
+
1167  ScanOp scan_op)
+
1168  {
+
1169  T block_aggregate;
+
1170  InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, scan_op, block_aggregate);
+
1171  }
+
1172 
+
1173 
+
1185  template <typename ScanOp>
+
1186  __device__ __forceinline__ void ExclusiveScan(
+
1187  T input,
+
1188  T &output,
+
1189  ScanOp scan_op,
+
1190  T &block_aggregate)
+
1191  {
+
1192  InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, scan_op, block_aggregate);
+
1193  }
+
1194 
+
1195 
+
1213  template <
+
1214  typename ScanOp,
+
1215  typename BlockPrefixOp>
+
1216  __device__ __forceinline__ void ExclusiveScan(
+
1217  T input,
+
1218  T &output,
+
1219  ScanOp scan_op,
+
1220  T &block_aggregate,
+
1221  BlockPrefixOp &block_prefix_op)
+
1222  {
+
1223  InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, scan_op, block_aggregate, block_prefix_op);
+
1224  }
+
1225 
+
1226 
+
1228  /******************************************************************/
+
1232 
+
1233 
+
1246  template <
+
1247  int ITEMS_PER_THREAD,
+
1248  typename ScanOp>
+
1249  __device__ __forceinline__ void ExclusiveScan(
+
1250  T (&input)[ITEMS_PER_THREAD],
+
1251  T (&output)[ITEMS_PER_THREAD],
+
1252  ScanOp scan_op)
+
1253  {
+
1254  // Reduce consecutive thread items in registers
+
1255  T thread_partial = ThreadReduce(input, scan_op);
+
1256 
+
1257  // Exclusive threadblock-scan
+
1258  ExclusiveScan(thread_partial, thread_partial, scan_op);
+
1259 
+
1260  // Exclusive scan in registers with prefix
+
1261  ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+
1262  }
+
1263 
+
1264 
+
1277  template <
+
1278  int ITEMS_PER_THREAD,
+
1279  typename ScanOp>
+
1280  __device__ __forceinline__ void ExclusiveScan(
+
1281  T (&input)[ITEMS_PER_THREAD],
+
1282  T (&output)[ITEMS_PER_THREAD],
+
1283  ScanOp scan_op,
+
1284  T &block_aggregate)
+
1285  {
+
1286  // Reduce consecutive thread items in registers
+
1287  T thread_partial = ThreadReduce(input, scan_op);
+
1288 
+
1289  // Exclusive threadblock-scan
+
1290  ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
+
1291 
+
1292  // Exclusive scan in registers with prefix
+
1293  ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+
1294  }
+
1295 
+
1296 
+
1315  template <
+
1316  int ITEMS_PER_THREAD,
+
1317  typename ScanOp,
+
1318  typename BlockPrefixOp>
+
1319  __device__ __forceinline__ void ExclusiveScan(
+
1320  T (&input)[ITEMS_PER_THREAD],
+
1321  T (&output)[ITEMS_PER_THREAD],
+
1322  ScanOp scan_op,
+
1323  T &block_aggregate,
+
1324  BlockPrefixOp &block_prefix_op)
+
1325  {
+
1326  // Reduce consecutive thread items in registers
+
1327  T thread_partial = ThreadReduce(input, scan_op);
+
1328 
+
1329  // Exclusive threadblock-scan
+
1330  ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_op);
+
1331 
+
1332  // Exclusive scan in registers with prefix
+
1333  ThreadScanExclusive(input, output, scan_op, thread_partial);
+
1334  }
+
1335 
+
1336 
+
1338 
+
1339 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
1340 
+
1341  /******************************************************************/
+
1345 
+
1346 
+
1381  __device__ __forceinline__ void InclusiveSum(
+
1382  T input,
+
1383  T &output)
+
1384  {
+
1385  T block_aggregate;
+
1386  InternalBlockScan(temp_storage, linear_tid).InclusiveSum(input, output, block_aggregate);
+
1387  }
+
1388 
+
1389 
+
1426  __device__ __forceinline__ void InclusiveSum(
+
1427  T input,
+
1428  T &output,
+
1429  T &block_aggregate)
+
1430  {
+
1431  InternalBlockScan(temp_storage, linear_tid).InclusiveSum(input, output, block_aggregate);
+
1432  }
+
1433 
+
1434 
+
1435 
+
1511  template <typename BlockPrefixOp>
+
1512  __device__ __forceinline__ void InclusiveSum(
+
1513  T input,
+
1514  T &output,
+
1515  T &block_aggregate,
+
1516  BlockPrefixOp &block_prefix_op)
+
1517  {
+
1518  InternalBlockScan(temp_storage, linear_tid).InclusiveSum(input, output, block_aggregate, block_prefix_op);
+
1519  }
+
1520 
+
1521 
+
1523  /******************************************************************/
+
1527 
+
1528 
+
1565  template <int ITEMS_PER_THREAD>
+
1566  __device__ __forceinline__ void InclusiveSum(
+
1567  T (&input)[ITEMS_PER_THREAD],
+
1568  T (&output)[ITEMS_PER_THREAD])
+
1569  {
+
1570  if (ITEMS_PER_THREAD == 1)
+
1571  {
+
1572  InclusiveSum(input[0], output[0]);
+
1573  }
+
1574  else
+
1575  {
+
1576  // Reduce consecutive thread items in registers
+
1577  Sum scan_op;
+
1578  T thread_partial = ThreadReduce(input, scan_op);
+
1579 
+
1580  // Exclusive threadblock-scan
+
1581  ExclusiveSum(thread_partial, thread_partial);
+
1582 
+
1583  // Inclusive scan in registers with prefix
+
1584  ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+
1585  }
+
1586  }
+
1587 
+
1588 
+
1630  template <int ITEMS_PER_THREAD>
+
1631  __device__ __forceinline__ void InclusiveSum(
+
1632  T (&input)[ITEMS_PER_THREAD],
+
1633  T (&output)[ITEMS_PER_THREAD],
+
1634  T &block_aggregate)
+
1635  {
+
1636  if (ITEMS_PER_THREAD == 1)
+
1637  {
+
1638  InclusiveSum(input[0], output[0], block_aggregate);
+
1639  }
+
1640  else
+
1641  {
+
1642  // Reduce consecutive thread items in registers
+
1643  Sum scan_op;
+
1644  T thread_partial = ThreadReduce(input, scan_op);
+
1645 
+
1646  // Exclusive threadblock-scan
+
1647  ExclusiveSum(thread_partial, thread_partial, block_aggregate);
+
1648 
+
1649  // Inclusive scan in registers with prefix
+
1650  ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+
1651  }
+
1652  }
+
1653 
+
1654 
+
1741  template <
+
1742  int ITEMS_PER_THREAD,
+
1743  typename BlockPrefixOp>
+
1744  __device__ __forceinline__ void InclusiveSum(
+
1745  T (&input)[ITEMS_PER_THREAD],
+
1746  T (&output)[ITEMS_PER_THREAD],
+
1747  T &block_aggregate,
+
1748  BlockPrefixOp &block_prefix_op)
+
1749  {
+
1750  if (ITEMS_PER_THREAD == 1)
+
1751  {
+
1752  InclusiveSum(input[0], output[0], block_aggregate, block_prefix_op);
+
1753  }
+
1754  else
+
1755  {
+
1756  // Reduce consecutive thread items in registers
+
1757  Sum scan_op;
+
1758  T thread_partial = ThreadReduce(input, scan_op);
+
1759 
+
1760  // Exclusive threadblock-scan
+
1761  ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_op);
+
1762 
+
1763  // Inclusive scan in registers with prefix
+
1764  ThreadScanInclusive(input, output, scan_op, thread_partial);
+
1765  }
+
1766  }
+
1767 
+
1768 
+
1770  /******************************************************************/
+
1774 
+
1775 
+
1813  template <typename ScanOp>
+
1814  __device__ __forceinline__ void InclusiveScan(
+
1815  T input,
+
1816  T &output,
+
1817  ScanOp scan_op)
+
1818  {
+
1819  T block_aggregate;
+
1820  InclusiveScan(input, output, scan_op, block_aggregate);
+
1821  }
+
1822 
+
1823 
+
1863  template <typename ScanOp>
+
1864  __device__ __forceinline__ void InclusiveScan(
+
1865  T input,
+
1866  T &output,
+
1867  ScanOp scan_op,
+
1868  T &block_aggregate)
+
1869  {
+
1870  InternalBlockScan(temp_storage, linear_tid).InclusiveScan(input, output, scan_op, block_aggregate);
+
1871  }
+
1872 
+
1873 
+
1953  template <
+
1954  typename ScanOp,
+
1955  typename BlockPrefixOp>
+
1956  __device__ __forceinline__ void InclusiveScan(
+
1957  T input,
+
1958  T &output,
+
1959  ScanOp scan_op,
+
1960  T &block_aggregate,
+
1961  BlockPrefixOp &block_prefix_op)
+
1962  {
+
1963  InternalBlockScan(temp_storage, linear_tid).InclusiveScan(input, output, scan_op, block_aggregate, block_prefix_op);
+
1964  }
+
1965 
+
1966 
+
1968  /******************************************************************/
+
1972 
+
1973 
+
2013  template <
+
2014  int ITEMS_PER_THREAD,
+
2015  typename ScanOp>
+
2016  __device__ __forceinline__ void InclusiveScan(
+
2017  T (&input)[ITEMS_PER_THREAD],
+
2018  T (&output)[ITEMS_PER_THREAD],
+
2019  ScanOp scan_op)
+
2020  {
+
2021  if (ITEMS_PER_THREAD == 1)
+
2022  {
+
2023  InclusiveScan(input[0], output[0], scan_op);
+
2024  }
+
2025  else
+
2026  {
+
2027  // Reduce consecutive thread items in registers
+
2028  T thread_partial = ThreadReduce(input, scan_op);
+
2029 
+
2030  // Exclusive threadblock-scan
+
2031  ExclusiveScan(thread_partial, thread_partial, scan_op);
+
2032 
+
2033  // Inclusive scan in registers with prefix
+
2034  ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+
2035  }
+
2036  }
+
2037 
+
2038 
+
2082  template <
+
2083  int ITEMS_PER_THREAD,
+
2084  typename ScanOp>
+
2085  __device__ __forceinline__ void InclusiveScan(
+
2086  T (&input)[ITEMS_PER_THREAD],
+
2087  T (&output)[ITEMS_PER_THREAD],
+
2088  ScanOp scan_op,
+
2089  T &block_aggregate)
+
2090  {
+
2091  if (ITEMS_PER_THREAD == 1)
+
2092  {
+
2093  InclusiveScan(input[0], output[0], scan_op, block_aggregate);
+
2094  }
+
2095  else
+
2096  {
+
2097  // Reduce consecutive thread items in registers
+
2098  T thread_partial = ThreadReduce(input, scan_op);
+
2099 
+
2100  // Exclusive threadblock-scan
+
2101  ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
+
2102 
+
2103  // Inclusive scan in registers with prefix
+
2104  ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+
2105  }
+
2106  }
+
2107 
+
2108 
+
2198  template <
+
2199  int ITEMS_PER_THREAD,
+
2200  typename ScanOp,
+
2201  typename BlockPrefixOp>
+
2202  __device__ __forceinline__ void InclusiveScan(
+
2203  T (&input)[ITEMS_PER_THREAD],
+
2204  T (&output)[ITEMS_PER_THREAD],
+
2205  ScanOp scan_op,
+
2206  T &block_aggregate,
+
2207  BlockPrefixOp &block_prefix_op)
+
2208  {
+
2209  if (ITEMS_PER_THREAD == 1)
+
2210  {
+
2211  InclusiveScan(input[0], output[0], scan_op, block_aggregate, block_prefix_op);
+
2212  }
+
2213  else
+
2214  {
+
2215  // Reduce consecutive thread items in registers
+
2216  T thread_partial = ThreadReduce(input, scan_op);
+
2217 
+
2218  // Exclusive threadblock-scan
+
2219  ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_op);
+
2220 
+
2221  // Inclusive scan in registers with prefix
+
2222  ThreadScanInclusive(input, output, scan_op, thread_partial);
+
2223  }
+
2224  }
+
2225 
+
2227 
+
2228 
+
2229 };
+
2230 
+
2231 } // CUB namespace
+
2232 CUB_NS_POSTFIX // Optional outer namespace(s)
+
2233 
+
+ + + + + diff --git a/docs/html/block__store_8cuh_source.html b/docs/html/block__store_8cuh_source.html new file mode 100644 index 0000000000..c2fd267f11 --- /dev/null +++ b/docs/html/block__store_8cuh_source.html @@ -0,0 +1,648 @@ + + + + + + + +CUB: block_store.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
block_store.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include <iterator>
+
37 
+
38 #include "../util_namespace.cuh"
+
39 #include "../util_macro.cuh"
+
40 #include "../util_type.cuh"
+
41 #include "../util_vector.cuh"
+
42 #include "../thread/thread_store.cuh"
+
43 #include "block_exchange.cuh"
+
44 
+
46 CUB_NS_PREFIX
+
47 
+
49 namespace cub {
+
50 
+
57 /******************************************************************/
+
61 
+
72 template <
+
73  PtxStoreModifier MODIFIER,
+
74  typename T,
+
75  int ITEMS_PER_THREAD,
+
76  typename OutputIteratorRA>
+
77 __device__ __forceinline__ void StoreBlocked(
+
78  int linear_tid,
+
79  OutputIteratorRA block_itr,
+
80  T (&items)[ITEMS_PER_THREAD])
+
81 {
+
82  // Store directly in thread-blocked order
+
83  #pragma unroll
+
84  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
85  {
+
86  ThreadStore<MODIFIER>(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM, items[ITEM]);
+
87  }
+
88 }
+
89 
+
90 
+
101 template <
+
102  PtxStoreModifier MODIFIER,
+
103  typename T,
+
104  int ITEMS_PER_THREAD,
+
105  typename OutputIteratorRA>
+
106 __device__ __forceinline__ void StoreBlocked(
+
107  int linear_tid,
+
108  OutputIteratorRA block_itr,
+
109  T (&items)[ITEMS_PER_THREAD],
+
110  int valid_items)
+
111 {
+
112  // Store directly in thread-blocked order
+
113  #pragma unroll
+
114  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
115  {
+
116  if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
+
117  {
+
118  ThreadStore<MODIFIER>(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM, items[ITEM]);
+
119  }
+
120  }
+
121 }
+
122 
+
123 
+
124 
+
126 /******************************************************************/
+
130 
+
131 
+
143 template <
+
144  PtxStoreModifier MODIFIER,
+
145  int BLOCK_THREADS,
+
146  typename T,
+
147  int ITEMS_PER_THREAD,
+
148  typename OutputIteratorRA>
+
149 __device__ __forceinline__ void StoreStriped(
+
150  int linear_tid,
+
151  OutputIteratorRA block_itr,
+
152  T (&items)[ITEMS_PER_THREAD])
+
153 {
+
154  // Store directly in striped order
+
155  #pragma unroll
+
156  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
157  {
+
158  ThreadStore<MODIFIER>(block_itr + (ITEM * BLOCK_THREADS) + linear_tid, items[ITEM]);
+
159  }
+
160 }
+
161 
+
162 
+
174 template <
+
175  PtxStoreModifier MODIFIER,
+
176  int BLOCK_THREADS,
+
177  typename T,
+
178  int ITEMS_PER_THREAD,
+
179  typename OutputIteratorRA>
+
180 __device__ __forceinline__ void StoreStriped(
+
181  int linear_tid,
+
182  OutputIteratorRA block_itr,
+
183  T (&items)[ITEMS_PER_THREAD],
+
184  int valid_items)
+
185 {
+
186  // Store directly in striped order
+
187  #pragma unroll
+
188  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
189  {
+
190  if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
+
191  {
+
192  ThreadStore<MODIFIER>(block_itr + (ITEM * BLOCK_THREADS) + linear_tid, items[ITEM]);
+
193  }
+
194  }
+
195 }
+
196 
+
197 
+
198 
+
200 /******************************************************************/
+
204 
+
205 
+
219 template <
+
220  PtxStoreModifier MODIFIER,
+
221  typename T,
+
222  int ITEMS_PER_THREAD,
+
223  typename OutputIteratorRA>
+
224 __device__ __forceinline__ void StoreWarpStriped(
+
225  int linear_tid,
+
226  OutputIteratorRA block_itr,
+
227  T (&items)[ITEMS_PER_THREAD])
+
228 {
+
229  int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1);
+
230  int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS;
+
231  int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD;
+
232 
+
233  // Store directly in warp-striped order
+
234  #pragma unroll
+
235  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
236  {
+
237  ThreadStore<MODIFIER>(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS), items[ITEM]);
+
238  }
+
239 }
+
240 
+
241 
+
255 template <
+
256  PtxStoreModifier MODIFIER,
+
257  typename T,
+
258  int ITEMS_PER_THREAD,
+
259  typename OutputIteratorRA>
+
260 __device__ __forceinline__ void StoreWarpStriped(
+
261  int linear_tid,
+
262  OutputIteratorRA block_itr,
+
263  T (&items)[ITEMS_PER_THREAD],
+
264  int valid_items)
+
265 {
+
266  int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1);
+
267  int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS;
+
268  int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD;
+
269 
+
270  // Store directly in warp-striped order
+
271  #pragma unroll
+
272  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
273  {
+
274  if (warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS) < valid_items)
+
275  {
+
276  ThreadStore<MODIFIER>(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS), items[ITEM]);
+
277  }
+
278  }
+
279 }
+
280 
+
281 
+
282 
+
284 /******************************************************************/
+
288 
+
307 template <
+
308  PtxStoreModifier MODIFIER,
+
309  typename T,
+
310  int ITEMS_PER_THREAD>
+
311 __device__ __forceinline__ void StoreBlockedVectorized(
+
312  int linear_tid,
+
313  T *block_ptr,
+
314  T (&items)[ITEMS_PER_THREAD])
+
315 {
+
316  enum
+
317  {
+
318  // Maximum CUDA vector size is 4 elements
+
319  MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
+
320 
+
321  // Vector size must be a power of two and an even divisor of the items per thread
+
322  VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
+
323  MAX_VEC_SIZE :
+
324  1,
+
325 
+
326  VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
+
327  };
+
328 
+
329  // Vector type
+
330  typedef typename VectorHelper<T, VEC_SIZE>::Type Vector;
+
331 
+
332  // Alias global pointer
+
333  Vector *block_ptr_vectors = reinterpret_cast<Vector *>(block_ptr);
+
334 
+
335  // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
+
336  Vector raw_vector[VECTORS_PER_THREAD];
+
337  T *raw_items = reinterpret_cast<T*>(raw_vector);
+
338 
+
339  // Copy
+
340  #pragma unroll
+
341  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+
342  {
+
343  raw_items[ITEM] = items[ITEM];
+
344  }
+
345 
+
346  // Direct-store using vector types
+
347  StoreBlocked<MODIFIER>(linear_tid, block_ptr_vectors, raw_vector);
+
348 }
+
349 
+
350 
+
352 
+
353  // end group IoModule
+
355 
+
356 
+
357 //-----------------------------------------------------------------------------
+
358 // Generic BlockStore abstraction
+
359 //-----------------------------------------------------------------------------
+
360 
+ +
365 {
+ +
378 
+ +
400 
+ +
419 
+ +
438 };
+
439 
+
440 
+
441 
+
511 template <
+
512  typename OutputIteratorRA,
+
513  int BLOCK_THREADS,
+
514  int ITEMS_PER_THREAD,
+ +
516  PtxStoreModifier MODIFIER = STORE_DEFAULT,
+
517  bool WARP_TIME_SLICING = false>
+ +
519 {
+
520 private:
+
521  /******************************************************************************
+
522  * Constants and typed definitions
+
523  ******************************************************************************/
+
524 
+
525  // Data type of input iterator
+
526  typedef typename std::iterator_traits<OutputIteratorRA>::value_type T;
+
527 
+
528 
+
529  /******************************************************************************
+
530  * Algorithmic variants
+
531  ******************************************************************************/
+
532 
+
534  template <BlockStoreAlgorithm _POLICY, int DUMMY = 0>
+
535  struct StoreInternal;
+
536 
+
537 
+
541  template <int DUMMY>
+
542  struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
+
543  {
+
545  typedef NullType TempStorage;
+
546 
+
548  int linear_tid;
+
549 
+
551  __device__ __forceinline__ StoreInternal(
+
552  TempStorage &temp_storage,
+
553  int linear_tid)
+
554  :
+
555  linear_tid(linear_tid)
+
556  {}
+
557 
+
559  __device__ __forceinline__ void Store(
+
560  OutputIteratorRA block_itr,
+
561  T (&items)[ITEMS_PER_THREAD])
+
562  {
+
563  StoreBlocked<MODIFIER>(linear_tid, block_itr, items);
+
564  }
+
565 
+
567  __device__ __forceinline__ void Store(
+
568  OutputIteratorRA block_itr,
+
569  T (&items)[ITEMS_PER_THREAD],
+
570  int valid_items)
+
571  {
+
572  StoreBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items);
+
573  }
+
574  };
+
575 
+
576 
+
580  template <int DUMMY>
+
581  struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
+
582  {
+
584  typedef NullType TempStorage;
+
585 
+
587  int linear_tid;
+
588 
+
590  __device__ __forceinline__ StoreInternal(
+
591  TempStorage &temp_storage,
+
592  int linear_tid)
+
593  :
+
594  linear_tid(linear_tid)
+
595  {}
+
596 
+
598  __device__ __forceinline__ void Store(
+
599  T *block_ptr,
+
600  T (&items)[ITEMS_PER_THREAD])
+
601  {
+
602  StoreBlockedVectorized<MODIFIER>(linear_tid, block_ptr, items);
+
603  }
+
604 
+
606  template <typename _OutputIteratorRA>
+
607  __device__ __forceinline__ void Store(
+
608  _OutputIteratorRA block_itr,
+
609  T (&items)[ITEMS_PER_THREAD])
+
610  {
+
611  StoreBlocked<MODIFIER>(linear_tid, block_itr, items);
+
612  }
+
613 
+
615  __device__ __forceinline__ void Store(
+
616  OutputIteratorRA block_itr,
+
617  T (&items)[ITEMS_PER_THREAD],
+
618  int valid_items)
+
619  {
+
620  StoreBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items);
+
621  }
+
622  };
+
623 
+
624 
+
628  template <int DUMMY>
+
629  struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
+
630  {
+
631  // BlockExchange utility type for keys
+ +
633 
+
635  typedef typename BlockExchange::TempStorage _TempStorage;
+
636 
+
638  struct TempStorage : Uninitialized<_TempStorage> {};
+
639 
+
641  _TempStorage &temp_storage;
+
642 
+
644  int linear_tid;
+
645 
+
647  __device__ __forceinline__ StoreInternal(
+
648  TempStorage &temp_storage,
+
649  int linear_tid)
+
650  :
+
651  temp_storage(temp_storage.Alias()),
+
652  linear_tid(linear_tid)
+
653  {}
+
654 
+
656  __device__ __forceinline__ void Store(
+
657  OutputIteratorRA block_itr,
+
658  T (&items)[ITEMS_PER_THREAD])
+
659  {
+
660  BlockExchange(temp_storage).BlockedToStriped(items);
+
661  StoreStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items);
+
662  }
+
663 
+
665  __device__ __forceinline__ void Store(
+
666  OutputIteratorRA block_itr,
+
667  T (&items)[ITEMS_PER_THREAD],
+
668  int valid_items)
+
669  {
+
670  BlockExchange(temp_storage).BlockedToStriped(items);
+
671  StoreStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+
672  }
+
673  };
+
674 
+
675 
+
679  template <int DUMMY>
+
680  struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
+
681  {
+
682  enum
+
683  {
+
684  WARP_THREADS = PtxArchProps::WARP_THREADS
+
685  };
+
686 
+
687  // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+
688  CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
689 
+
690  // BlockExchange utility type for keys
+
691  typedef BlockExchange<T, BLOCK_THREADS, ITEMS_PER_THREAD, WARP_TIME_SLICING> BlockExchange;
+
692 
+
694  typedef typename BlockExchange::TempStorage _TempStorage;
+
695 
+
697  struct TempStorage : Uninitialized<_TempStorage> {};
+
698 
+
700  _TempStorage &temp_storage;
+
701 
+
703  int linear_tid;
+
704 
+
706  __device__ __forceinline__ StoreInternal(
+
707  TempStorage &temp_storage,
+
708  int linear_tid)
+
709  :
+
710  temp_storage(temp_storage.Alias()),
+
711  linear_tid(linear_tid)
+
712  {}
+
713 
+
715  __device__ __forceinline__ void Store(
+
716  OutputIteratorRA block_itr,
+
717  T (&items)[ITEMS_PER_THREAD])
+
718  {
+
719  BlockExchange(temp_storage).BlockedToWarpStriped(items);
+
720  StoreWarpStriped<MODIFIER>(linear_tid, block_itr, items);
+
721  }
+
722 
+
724  __device__ __forceinline__ void Store(
+
725  OutputIteratorRA block_itr,
+
726  T (&items)[ITEMS_PER_THREAD],
+
727  int valid_items)
+
728  {
+
729  BlockExchange(temp_storage).BlockedToWarpStriped(items);
+
730  StoreWarpStriped<MODIFIER>(linear_tid, block_itr, items, valid_items);
+
731  }
+
732  };
+
733 
+
734  /******************************************************************************
+
735  * Type definitions
+
736  ******************************************************************************/
+
737 
+
739  typedef StoreInternal<ALGORITHM> InternalStore;
+
740 
+
741 
+
743  typedef typename InternalStore::TempStorage _TempStorage;
+
744 
+
745 
+
746  /******************************************************************************
+
747  * Utility methods
+
748  ******************************************************************************/
+
749 
+
751  __device__ __forceinline__ _TempStorage& PrivateStorage()
+
752  {
+
753  __shared__ _TempStorage private_storage;
+
754  return private_storage;
+
755  }
+
756 
+
757 
+
758  /******************************************************************************
+
759  * Thread fields
+
760  ******************************************************************************/
+
761 
+
763  _TempStorage &temp_storage;
+
764 
+
766  int linear_tid;
+
767 
+
768 public:
+
769 
+
770 
+
772  struct TempStorage : Uninitialized<_TempStorage> {};
+
773 
+
774 
+
775  /******************************************************************/
+
779 
+
783  __device__ __forceinline__ BlockStore()
+
784  :
+
785  temp_storage(PrivateStorage()),
+
786  linear_tid(threadIdx.x)
+
787  {}
+
788 
+
789 
+
793  __device__ __forceinline__ BlockStore(
+
794  TempStorage &temp_storage)
+
795  :
+
796  temp_storage(temp_storage.Alias()),
+
797  linear_tid(threadIdx.x)
+
798  {}
+
799 
+
800 
+
804  __device__ __forceinline__ BlockStore(
+
805  int linear_tid)
+
806  :
+
807  temp_storage(PrivateStorage()),
+
808  linear_tid(linear_tid)
+
809  {}
+
810 
+
811 
+
815  __device__ __forceinline__ BlockStore(
+
816  TempStorage &temp_storage,
+
817  int linear_tid)
+
818  :
+
819  temp_storage(temp_storage.Alias()),
+
820  linear_tid(linear_tid)
+
821  {}
+
822 
+
823 
+
825  /******************************************************************/
+
829 
+
830 
+
868  __device__ __forceinline__ void Store(
+
869  OutputIteratorRA block_itr,
+
870  T (&items)[ITEMS_PER_THREAD])
+
871  {
+
872  InternalStore(temp_storage, linear_tid).Store(block_itr, items);
+
873  }
+
874 
+
913  __device__ __forceinline__ void Store(
+
914  OutputIteratorRA block_itr,
+
915  T (&items)[ITEMS_PER_THREAD],
+
916  int valid_items)
+
917  {
+
918  InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
+
919  }
+
920 };
+
921  // end group BlockModule
+
923 
+
924 } // CUB namespace
+
925 CUB_NS_POSTFIX // Optional outer namespace(s)
+
926 
+
+ + + + + diff --git a/docs/html/cub_8cuh_source.html b/docs/html/cub_8cuh_source.html new file mode 100644 index 0000000000..71994a953c --- /dev/null +++ b/docs/html/cub_8cuh_source.html @@ -0,0 +1,193 @@ + + + + + + + +CUB: cub.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
cub.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 
+
37 // Block
+ + +
40 #include "block/block_exchange.cuh"
+
41 #include "block/block_load.cuh"
+
42 #include "block/block_radix_rank.cuh"
+ +
44 #include "block/block_reduce.cuh"
+
45 #include "block/block_scan.cuh"
+
46 #include "block/block_store.cuh"
+
47 
+
48 // Device
+ + +
51 #include "device/device_reduce.cuh"
+
52 #include "device/device_scan.cuh"
+
53 
+
54 // Grid
+
55 //#include "grid/grid_barrier.cuh"
+
56 #include "grid/grid_even_share.cuh"
+
57 #include "grid/grid_mapping.cuh"
+
58 #include "grid/grid_queue.cuh"
+
59 
+
60 // Host
+
61 #include "host/spinlock.cuh"
+
62 
+
63 // Thread
+
64 #include "thread/thread_load.cuh"
+ +
66 #include "thread/thread_reduce.cuh"
+
67 #include "thread/thread_scan.cuh"
+
68 #include "thread/thread_store.cuh"
+
69 
+
70 // Warp
+
71 #include "warp/warp_reduce.cuh"
+
72 #include "warp/warp_scan.cuh"
+
73 
+
74 // Util
+
75 #include "util_allocator.cuh"
+
76 #include "util_arch.cuh"
+
77 #include "util_debug.cuh"
+
78 #include "util_device.cuh"
+
79 #include "util_macro.cuh"
+
80 #include "util_ptx.cuh"
+
81 #include "util_type.cuh"
+
82 #include "util_iterator.cuh"
+
83 #include "util_vector.cuh"
+
84 
+
+ + + + + diff --git a/docs/html/device__histogram_8cuh_source.html b/docs/html/device__histogram_8cuh_source.html new file mode 100644 index 0000000000..1bfa4810d7 --- /dev/null +++ b/docs/html/device__histogram_8cuh_source.html @@ -0,0 +1,793 @@ + + + + + + + +CUB: device_histogram.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
device_histogram.cuh
+
+
+Go to the documentation of this file.
1 
+
2 /******************************************************************************
+
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
4  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
5  *
+
6  * Redistribution and use in source and binary forms, with or without
+
7  * modification, are permitted provided that the following conditions are met:
+
8  * * Redistributions of source code must retain the above copyright
+
9  * notice, this list of conditions and the following disclaimer.
+
10  * * Redistributions in binary form must reproduce the above copyright
+
11  * notice, this list of conditions and the following disclaimer in the
+
12  * documentation and/or other materials provided with the distribution.
+
13  * * Neither the name of the NVIDIA CORPORATION nor the
+
14  * names of its contributors may be used to endorse or promote products
+
15  * derived from this software without specific prior written permission.
+
16  *
+
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
27  *
+
28  ******************************************************************************/
+
29 
+
35 #pragma once
+
36 
+
37 #include <stdio.h>
+
38 #include <iterator>
+
39 
+
40 #include "block/block_histo_tiles.cuh"
+
41 #include "../grid/grid_even_share.cuh"
+
42 #include "../grid/grid_queue.cuh"
+
43 #include "../util_debug.cuh"
+
44 #include "../util_device.cuh"
+
45 #include "../util_namespace.cuh"
+
46 
+
48 CUB_NS_PREFIX
+
49 
+
51 namespace cub {
+
52 
+
53 
+
54 /******************************************************************************
+
55  * Kernel entry points
+
56  *****************************************************************************/
+
57 
+
58 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
59 
+
60 
+
64 template <
+
65  int BINS,
+
66  int ACTIVE_CHANNELS,
+
67  typename SizeT,
+
68  typename HistoCounter>
+
69 __launch_bounds__ (BINS, 1)
+
70 __global__ void InitHistoKernel(
+
71  GridQueue<SizeT> grid_queue,
+
72  ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms,
+
73  SizeT num_samples)
+
74 {
+
75  d_out_histograms.array[blockIdx.x][threadIdx.x] = 0;
+
76  if (threadIdx.x == 0) grid_queue.ResetDrain(num_samples);
+
77 }
+
78 
+
79 
+
83 template <
+
84  typename BlockHistogramTilesPolicy,
+
85  int BINS,
+
86  int CHANNELS,
+
87  int ACTIVE_CHANNELS,
+
88  typename InputIteratorRA,
+
89  typename HistoCounter,
+
90  typename SizeT>
+
91 __launch_bounds__ (int(BlockHistogramTilesPolicy::BLOCK_THREADS), BlockHistogramTilesPolicy::SM_OCCUPANCY)
+
92 __global__ void MultiBlockHistogramKernel(
+
93  InputIteratorRA d_samples,
+
94  ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms,
+
95  SizeT num_samples,
+
96  GridEvenShare<SizeT> even_share,
+
97  GridQueue<SizeT> queue)
+
98 {
+
99  // Constants
+
100  enum
+
101  {
+
102  BLOCK_THREADS = BlockHistogramTilesPolicy::BLOCK_THREADS,
+
103  ITEMS_PER_THREAD = BlockHistogramTilesPolicy::ITEMS_PER_THREAD,
+
104  TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD,
+
105  };
+
106 
+
107  // Thread block type for compositing input tiles
+
108  typedef BlockHistogramTiles<BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesT;
+
109 
+
110  // Shared memory for BlockHistogramTiles
+
111  __shared__ typename BlockHistogramTilesT::TempStorage temp_storage;
+
112 
+
113  // Consume input tiles
+
114  BlockHistogramTilesT(temp_storage, d_samples, d_out_histograms.array).ConsumeTiles(
+
115  num_samples,
+
116  even_share,
+
117  queue,
+
118  Int2Type<BlockHistogramTilesPolicy::GRID_MAPPING>());
+
119 }
+
120 
+
121 
+
125 template <
+
126  int BINS,
+
127  int ACTIVE_CHANNELS,
+
128  typename HistoCounter>
+
129 __launch_bounds__ (BINS, 1)
+
130 __global__ void AggregateHistoKernel(
+
131  HistoCounter* d_block_histograms,
+
132  ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms,
+
133  int num_threadblocks)
+
134 {
+
135  // Accumulate threadblock-histograms from the channel
+
136  HistoCounter bin_aggregate = 0;
+
137 
+
138  int block_offset = blockIdx.x * (num_threadblocks * BINS);
+
139  int block_oob = block_offset + (num_threadblocks * BINS);
+
140 
+
141 #if CUB_PTX_ARCH >= 200
+
142  #pragma unroll 32
+
143 #endif
+
144  while (block_offset < block_oob)
+
145  {
+
146  bin_aggregate += d_block_histograms[block_offset + threadIdx.x];
+
147  block_offset += BINS;
+
148  }
+
149 
+
150  // Output
+
151  d_out_histograms.array[blockIdx.x][threadIdx.x] = bin_aggregate;
+
152 }
+
153 
+
154 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
155 
+
156 
+
157 
+
158 /******************************************************************************
+
159  * DeviceHistogram
+
160  *****************************************************************************/
+
161 
+ +
179 {
+
180 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
181 
+
182  /******************************************************************************
+
183  * Constants and typedefs
+
184  ******************************************************************************/
+
185 
+
187  struct KernelDispachParams
+
188  {
+
189  // Policy fields
+
190  int block_threads;
+
191  int items_per_thread;
+
192  BlockHistogramTilesAlgorithm block_algorithm;
+
193  GridMappingStrategy grid_mapping;
+
194  int subscription_factor;
+
195 
+
196  // Derived fields
+
197  int channel_tile_size;
+
198 
+
199  template <typename BlockHistogramTilesPolicy>
+
200  __host__ __device__ __forceinline__
+
201  void Init(int subscription_factor = 1)
+
202  {
+
203  block_threads = BlockHistogramTilesPolicy::BLOCK_THREADS;
+
204  items_per_thread = BlockHistogramTilesPolicy::ITEMS_PER_THREAD;
+
205  block_algorithm = BlockHistogramTilesPolicy::GRID_ALGORITHM;
+
206  grid_mapping = BlockHistogramTilesPolicy::GRID_MAPPING;
+
207  this->subscription_factor = subscription_factor;
+
208 
+
209  channel_tile_size = block_threads * items_per_thread;
+
210  }
+
211 
+
212  __host__ __device__ __forceinline__
+
213  void Print()
+
214  {
+
215  printf("%d, %d, %d, %d, %d",
+
216  block_threads,
+
217  items_per_thread,
+
218  block_algorithm,
+
219  grid_mapping,
+
220  subscription_factor);
+
221  }
+
222 
+
223  };
+
224 
+
225 
+
226  /******************************************************************************
+
227  * Tuning policies
+
228  ******************************************************************************/
+
229 
+
231  template <
+
232  int CHANNELS,
+
233  int ACTIVE_CHANNELS,
+
234  BlockHistogramTilesAlgorithm GRID_ALGORITHM,
+
235  int ARCH>
+
236  struct TunedPolicies;
+
237 
+
239  template <int CHANNELS, int ACTIVE_CHANNELS, BlockHistogramTilesAlgorithm GRID_ALGORITHM>
+
240  struct TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 350>
+
241  {
+
242  typedef BlockHistogramTilesPolicy<
+
243  (GRID_ALGORITHM == GRID_HISTO_SORT) ? 128 : 256,
+
244  (GRID_ALGORITHM == GRID_HISTO_SORT) ? 12 : (30 / ACTIVE_CHANNELS),
+
245  GRID_ALGORITHM,
+
246  (GRID_ALGORITHM == GRID_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE,
+
247  (GRID_ALGORITHM == GRID_HISTO_SORT) ? 8 : 1> MultiBlockPolicy;
+
248  enum { SUBSCRIPTION_FACTOR = 7 };
+
249  };
+
250 
+
252  template <int CHANNELS, int ACTIVE_CHANNELS, BlockHistogramTilesAlgorithm GRID_ALGORITHM>
+
253  struct TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 300>
+
254  {
+
255  typedef BlockHistogramTilesPolicy<
+
256  128,
+
257  (GRID_ALGORITHM == GRID_HISTO_SORT) ? 20 : (22 / ACTIVE_CHANNELS),
+
258  GRID_ALGORITHM,
+
259  (GRID_ALGORITHM == GRID_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE,
+
260  1> MultiBlockPolicy;
+
261  enum { SUBSCRIPTION_FACTOR = 1 };
+
262  };
+
263 
+
265  template <int CHANNELS, int ACTIVE_CHANNELS, BlockHistogramTilesAlgorithm GRID_ALGORITHM>
+
266  struct TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 200>
+
267  {
+
268  typedef BlockHistogramTilesPolicy<
+
269  128,
+
270  (GRID_ALGORITHM == GRID_HISTO_SORT) ? 21 : (23 / ACTIVE_CHANNELS),
+
271  GRID_ALGORITHM,
+
272  GRID_MAPPING_DYNAMIC,
+
273  1> MultiBlockPolicy;
+
274  enum { SUBSCRIPTION_FACTOR = 1 };
+
275  };
+
276 
+
278  template <int CHANNELS, int ACTIVE_CHANNELS, BlockHistogramTilesAlgorithm GRID_ALGORITHM>
+
279  struct TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 100>
+
280  {
+
281  typedef BlockHistogramTilesPolicy<
+
282  128,
+
283  7,
+
284  GRID_HISTO_SORT, // (use sort regardless because atomics are perf-useless)
+
285  GRID_MAPPING_EVEN_SHARE,
+
286  1> MultiBlockPolicy;
+
287  enum { SUBSCRIPTION_FACTOR = 1 };
+
288  };
+
289 
+
290 
+
292  template <
+
293  int CHANNELS,
+
294  int ACTIVE_CHANNELS,
+
295  BlockHistogramTilesAlgorithm GRID_ALGORITHM>
+
296  struct PtxDefaultPolicies
+
297  {
+
298  static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ?
+
299  350 :
+
300  (CUB_PTX_ARCH >= 300) ?
+
301  300 :
+
302  (CUB_PTX_ARCH >= 200) ?
+
303  200 :
+
304  100;
+
305 
+
306  // Tuned policy set for the current PTX compiler pass
+
307  typedef TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, PTX_TUNE_ARCH> PtxTunedPolicies;
+
308 
+
309  // Subscription factor for the current PTX compiler pass
+
310  static const int SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR;
+
311 
+
312  // MultiBlockPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
+
313  struct MultiBlockPolicy : PtxTunedPolicies::MultiBlockPolicy {};
+
314 
+
318  static void InitDispatchParams(int ptx_version, KernelDispachParams &multi_block_dispatch_params)
+
319  {
+
320  if (ptx_version >= 350)
+
321  {
+
322  typedef TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 350> TunedPolicies;
+
323  multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
324  }
+
325  else if (ptx_version >= 300)
+
326  {
+
327  typedef TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 300> TunedPolicies;
+
328  multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
329  }
+
330  else if (ptx_version >= 200)
+
331  {
+
332  typedef TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 200> TunedPolicies;
+
333  multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
334  }
+
335  else
+
336  {
+
337  typedef TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 100> TunedPolicies;
+
338  multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
339  }
+
340  }
+
341  };
+
342 
+
343 
+
344  /******************************************************************************
+
345  * Utility methods
+
346  ******************************************************************************/
+
347 
+
351  template <
+
352  int BINS,
+
353  int CHANNELS,
+
354  int ACTIVE_CHANNELS,
+
355  typename InitHistoKernelPtr,
+
356  typename MultiBlockHistogramKernelPtr,
+
357  typename AggregateHistoKernelPtr,
+
358  typename InputIteratorRA,
+
359  typename HistoCounter,
+
360  typename SizeT>
+
361  __host__ __device__ __forceinline__
+
362  static cudaError_t Dispatch(
+
363  void *d_temp_storage,
+
364  size_t &temp_storage_bytes,
+
365  InitHistoKernelPtr init_kernel,
+
366  MultiBlockHistogramKernelPtr multi_block_kernel,
+
367  AggregateHistoKernelPtr aggregate_kernel,
+
368  KernelDispachParams &multi_block_dispatch_params,
+
369  InputIteratorRA d_samples,
+
370  HistoCounter *d_histograms[ACTIVE_CHANNELS],
+
371  SizeT num_samples,
+
372  cudaStream_t stream = 0,
+
373  bool stream_synchronous = false)
+
374  {
+
375 #ifndef CUB_RUNTIME_ENABLED
+
376 
+
377  // Kernel launch not supported from this device
+
378  return CubDebug(cudaErrorNotSupported);
+
379 
+
380 #else
+
381 
+
382  cudaError error = cudaSuccess;
+
383  do
+
384  {
+
385  // Get device ordinal
+
386  int device_ordinal;
+
387  if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
388 
+
389  // Get SM count
+
390  int sm_count;
+
391  if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
392 
+
393  // Get a rough estimate of multi_block_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
+
394  int multi_block_sm_occupancy = CUB_MIN(
+ +
396  ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / multi_block_dispatch_params.block_threads);
+
397 
+
398 #ifndef __CUDA_ARCH__
+
399  // We're on the host, so come up with a more accurate estimate of multi_block_kernel SM occupancy from actual device properties
+
400  Device device_props;
+
401  if (CubDebug(error = device_props.Init(device_ordinal))) break;
+
402 
+
403  if (CubDebug(error = device_props.MaxSmOccupancy(
+
404  multi_block_sm_occupancy,
+
405  multi_block_kernel,
+
406  multi_block_dispatch_params.block_threads))) break;
+
407 #endif
+
408 
+
409  // Get device occupancy for multi_block_kernel
+
410  int multi_block_occupancy = multi_block_sm_occupancy * sm_count;
+
411 
+
412  // Even-share work distribution
+
413  GridEvenShare<SizeT> even_share;
+
414 
+
415  // Get tile size for multi_block_kernel
+
416  int multi_block_tile_size = multi_block_dispatch_params.channel_tile_size * CHANNELS;
+
417 
+
418  // Get grid size for multi_block_kernel
+
419  int multi_block_grid_size;
+
420  switch (multi_block_dispatch_params.grid_mapping)
+
421  {
+
422  case GRID_MAPPING_EVEN_SHARE:
+
423 
+
424  // Work is distributed evenly
+
425  even_share.GridInit(
+
426  num_samples,
+
427  multi_block_occupancy * multi_block_dispatch_params.subscription_factor,
+
428  multi_block_tile_size);
+
429  multi_block_grid_size = even_share.grid_size;
+
430  break;
+
431 
+
432  case GRID_MAPPING_DYNAMIC:
+
433 
+
434  // Work is distributed dynamically
+
435  int num_tiles = (num_samples + multi_block_tile_size - 1) / multi_block_tile_size;
+
436  multi_block_grid_size = (num_tiles < multi_block_occupancy) ?
+
437  num_tiles : // Not enough to fill the device with threadblocks
+
438  multi_block_occupancy; // Fill the device with threadblocks
+
439  break;
+
440  };
+
441 
+
442  // Temporary storage allocation requirements
+
443  void* allocations[2];
+
444  size_t allocation_sizes[2] =
+
445  {
+
446  ACTIVE_CHANNELS * multi_block_grid_size * sizeof(HistoCounter) * BINS, // bytes needed for privatized histograms
+
447  GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor
+
448  };
+
449 
+
450  if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
451 
+
452  // Return if the caller is simply requesting the size of the storage allocation
+
453  if (d_temp_storage == NULL)
+
454  return cudaSuccess;
+
455 
+
456  // Privatized per-block reductions
+
457  HistoCounter *d_block_histograms = (HistoCounter*) allocations[0];
+
458 
+
459  // Grid queue descriptor
+
460  GridQueue<SizeT> queue(allocations[1]);
+
461 
+
462  // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
+ +
464  for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+
465  d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
+
466 
+
467  // Setup array wrapper for temporary histogram channel output (because we can't pass static arrays as kernel parameters)
+ +
469  for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+
470  d_temp_histo_wrapper.array[CHANNEL] = d_block_histograms + (CHANNEL * multi_block_grid_size * BINS);
+
471 
+
472  // Log init_kernel configuration
+
473  if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", ACTIVE_CHANNELS, BINS, (long long) stream);
+
474 
+
475  // Invoke init_kernel to initialize counters and queue descriptor
+
476  init_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>(queue, d_histo_wrapper, num_samples);
+
477 
+
478  // Sync the stream if specified
+
479  if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
480 
+
481  // Whether we need privatized histograms (i.e., non-global atomics and multi-block)
+
482  bool privatized_temporaries = (multi_block_grid_size > 1) && (multi_block_dispatch_params.block_algorithm != GRID_HISTO_GLOBAL_ATOMIC);
+
483 
+
484  // Log multi_block_kernel configuration
+
485  if (stream_synchronous) CubLog("Invoking multi_block_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+
486  multi_block_grid_size, multi_block_dispatch_params.block_threads, (long long) stream, multi_block_dispatch_params.items_per_thread, multi_block_sm_occupancy);
+
487 
+
488  // Invoke multi_block_kernel
+
489  multi_block_kernel<<<multi_block_grid_size, multi_block_dispatch_params.block_threads, 0, stream>>>(
+
490  d_samples,
+
491  (privatized_temporaries) ?
+
492  d_temp_histo_wrapper :
+
493  d_histo_wrapper,
+
494  num_samples,
+
495  even_share,
+
496  queue);
+
497 
+
498  // Sync the stream if specified
+
499  if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
500 
+
501  // Aggregate privatized block histograms if necessary
+
502  if (privatized_temporaries)
+
503  {
+
504  // Log aggregate_kernel configuration
+
505  if (stream_synchronous) CubLog("Invoking aggregate_kernel<<<%d, %d, 0, %lld>>>()\n",
+
506  ACTIVE_CHANNELS, BINS, (long long) stream);
+
507 
+
508  // Invoke aggregate_kernel
+
509  aggregate_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>(
+
510  d_block_histograms,
+
511  d_histo_wrapper,
+
512  multi_block_grid_size);
+
513 
+
514  // Sync the stream if specified
+
515  if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
516  }
+
517  }
+
518  while (0);
+
519 
+
520  return error;
+
521 #endif // CUB_RUNTIME_ENABLED
+
522  }
+
523 
+
524 
+
534  template <
+
535  BlockHistogramTilesAlgorithm GRID_ALGORITHM,
+
536  int BINS,
+
537  int CHANNELS,
+
538  int ACTIVE_CHANNELS,
+
539  typename InputIteratorRA,
+
540  typename HistoCounter>
+
541  __host__ __device__ __forceinline__
+
542  static cudaError_t Dispatch(
+
543  void *d_temp_storage,
+
544  size_t &temp_storage_bytes,
+
545  InputIteratorRA d_samples,
+
546  HistoCounter *d_histograms[ACTIVE_CHANNELS],
+
547  int num_samples,
+
548  cudaStream_t stream = 0,
+
549  bool stream_synchronous = false)
+
550  {
+
551  // Type used for array indexing
+
552  typedef int SizeT;
+
553 
+
554  // Tuning polices for the PTX architecture that will get dispatched to
+
555  typedef PtxDefaultPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM> PtxDefaultPolicies;
+
556  typedef typename PtxDefaultPolicies::MultiBlockPolicy MultiBlockPolicy;
+
557 
+
558  cudaError error = cudaSuccess;
+
559  do
+
560  {
+
561  // Declare dispatch parameters
+
562  KernelDispachParams multi_block_dispatch_params;
+
563 
+
564  #ifdef __CUDA_ARCH__
+
565 
+
566  // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly
+
567  multi_block_dispatch_params.Init<MultiBlockPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR);
+
568 
+
569  #else
+
570 
+
571  // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version
+
572  int ptx_version;
+
573  if (CubDebug(error = PtxVersion(ptx_version))) break;
+
574  PtxDefaultPolicies::InitDispatchParams(ptx_version, multi_block_dispatch_params);
+
575 
+
576  #endif
+
577 
+
578  Dispatch<BINS, CHANNELS, ACTIVE_CHANNELS>(
+
579  d_temp_storage,
+
580  temp_storage_bytes,
+
581  InitHistoKernel<BINS, ACTIVE_CHANNELS, SizeT, HistoCounter>,
+
582  MultiBlockHistogramKernel<MultiBlockPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT>,
+
583  AggregateHistoKernel<BINS, ACTIVE_CHANNELS, HistoCounter>,
+
584  multi_block_dispatch_params,
+
585  d_samples,
+
586  d_histograms,
+
587  num_samples,
+
588  stream,
+
589  stream_synchronous);
+
590 
+
591  if (CubDebug(error)) break;
+
592  }
+
593  while (0);
+
594 
+
595  return error;
+
596  }
+
597 
+
598  #endif // DOXYGEN_SHOULD_SKIP_THIS
+
599 
+
600 
+
601  /******************************************************************/
+
605 
+
606 
+
658  template <
+
659  int BINS,
+
660  typename InputIteratorRA,
+
661  typename HistoCounter>
+
662  __host__ __device__ __forceinline__
+
663  static cudaError_t SingleChannelSorting(
+
664  void *d_temp_storage,
+
665  size_t &temp_storage_bytes,
+
666  InputIteratorRA d_samples,
+
667  HistoCounter* d_histogram,
+
668  int num_samples,
+
669  cudaStream_t stream = 0,
+
670  bool stream_synchronous = false)
+
671  {
+
672  return Dispatch<GRID_HISTO_SORT, BINS, 1, 1>(
+
673  d_temp_storage, temp_storage_bytes, d_samples, &d_histogram, num_samples, stream, stream_synchronous);
+
674  }
+
675 
+
676 
+
728  template <
+
729  int BINS,
+
730  typename InputIteratorRA,
+
731  typename HistoCounter>
+
732  __host__ __device__ __forceinline__
+
733  static cudaError_t SingleChannelSharedAtomic(
+
734  void *d_temp_storage,
+
735  size_t &temp_storage_bytes,
+
736  InputIteratorRA d_samples,
+
737  HistoCounter* d_histogram,
+
738  int num_samples,
+
739  cudaStream_t stream = 0,
+
740  bool stream_synchronous = false)
+
741  {
+
742  return Dispatch<GRID_HISTO_SHARED_ATOMIC, BINS, 1, 1>(
+
743  d_temp_storage, temp_storage_bytes, d_samples, &d_histogram, num_samples, stream, stream_synchronous);
+
744  }
+
745 
+
746 
+
797  template <
+
798  int BINS,
+
799  typename InputIteratorRA,
+
800  typename HistoCounter>
+
801  __host__ __device__ __forceinline__
+
802  static cudaError_t SingleChannelGlobalAtomic(
+
803  void *d_temp_storage,
+
804  size_t &temp_storage_bytes,
+
805  InputIteratorRA d_samples,
+
806  HistoCounter* d_histogram,
+
807  int num_samples,
+
808  cudaStream_t stream = 0,
+
809  bool stream_synchronous = false)
+
810  {
+
811  return Dispatch<GRID_HISTO_GLOBAL_ATOMIC, BINS, 1, 1>(
+
812  d_temp_storage, temp_storage_bytes, d_samples, &d_histogram, num_samples, stream, stream_synchronous);
+
813  }
+
814 
+
815 
+
817  /******************************************************************/
+
821 
+
822 
+
879  template <
+
880  int BINS,
+
881  int CHANNELS,
+
882  int ACTIVE_CHANNELS,
+
883  typename InputIteratorRA,
+
884  typename HistoCounter>
+
885  __host__ __device__ __forceinline__
+
886  static cudaError_t MultiChannelSorting(
+
887  void *d_temp_storage,
+
888  size_t &temp_storage_bytes,
+
889  InputIteratorRA d_samples,
+
890  HistoCounter *d_histograms[ACTIVE_CHANNELS],
+
891  int num_samples,
+
892  cudaStream_t stream = 0,
+
893  bool stream_synchronous = false)
+
894  {
+
895  return Dispatch<GRID_HISTO_SORT, BINS, CHANNELS, ACTIVE_CHANNELS>(
+
896  d_temp_storage, temp_storage_bytes, d_samples, d_histograms, num_samples, stream, stream_synchronous);
+
897  }
+
898 
+
899 
+
956  template <
+
957  int BINS,
+
958  int CHANNELS,
+
959  int ACTIVE_CHANNELS,
+
960  typename InputIteratorRA,
+
961  typename HistoCounter>
+
962  __host__ __device__ __forceinline__
+
963  static cudaError_t MultiChannelSharedAtomic(
+
964  void *d_temp_storage,
+
965  size_t &temp_storage_bytes,
+
966  InputIteratorRA d_samples,
+
967  HistoCounter *d_histograms[ACTIVE_CHANNELS],
+
968  int num_samples,
+
969  cudaStream_t stream = 0,
+
970  bool stream_synchronous = false)
+
971  {
+
972  return Dispatch<GRID_HISTO_SHARED_ATOMIC, BINS, CHANNELS, ACTIVE_CHANNELS>(
+
973  d_temp_storage, temp_storage_bytes, d_samples, d_histograms, num_samples, stream, stream_synchronous);
+
974  }
+
975 
+
976 
+
1034  template <
+
1035  int BINS,
+
1036  int CHANNELS,
+
1037  int ACTIVE_CHANNELS,
+
1038  typename InputIteratorRA,
+
1039  typename HistoCounter>
+
1040  __host__ __device__ __forceinline__
+
1041  static cudaError_t MultiChannelGlobalAtomic(
+
1042  void *d_temp_storage,
+
1043  size_t &temp_storage_bytes,
+
1044  InputIteratorRA d_samples,
+
1045  HistoCounter *d_histograms[ACTIVE_CHANNELS],
+
1046  int num_samples,
+
1047  cudaStream_t stream = 0,
+
1048  bool stream_synchronous = false)
+
1049  {
+
1050  return Dispatch<GRID_HISTO_GLOBAL_ATOMIC, BINS, CHANNELS, ACTIVE_CHANNELS>(
+
1051  d_temp_storage, temp_storage_bytes, d_samples, d_histograms, num_samples, stream, stream_synchronous);
+
1052  }
+
1053 
+
1055 
+
1056 };
+
1057 
+
1058 
+
1059 } // CUB namespace
+
1060 CUB_NS_POSTFIX // Optional outer namespace(s)
+
1061 
+
1062 
+
+ + + + + diff --git a/docs/html/device__radix__sort_8cuh_source.html b/docs/html/device__radix__sort_8cuh_source.html new file mode 100644 index 0000000000..f02810ac75 --- /dev/null +++ b/docs/html/device__radix__sort_8cuh_source.html @@ -0,0 +1,861 @@ + + + + + + + +CUB: device_radix_sort.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
device_radix_sort.cuh
+
+
+Go to the documentation of this file.
1 
+
2 /******************************************************************************
+
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
4  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
5  *
+
6  * Redistribution and use in source and binary forms, with or without
+
7  * modification, are permitted provided that the following conditions are met:
+
8  * * Redistributions of source code must retain the above copyright
+
9  * notice, this list of conditions and the following disclaimer.
+
10  * * Redistributions in binary form must reproduce the above copyright
+
11  * notice, this list of conditions and the following disclaimer in the
+
12  * documentation and/or other materials provided with the distribution.
+
13  * * Neither the name of the NVIDIA CORPORATION nor the
+
14  * names of its contributors may be used to endorse or promote products
+
15  * derived from this software without specific prior written permission.
+
16  *
+
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
27  *
+
28  ******************************************************************************/
+
29 
+
35 #pragma once
+
36 
+
37 #include <stdio.h>
+
38 #include <iterator>
+
39 
+
40 #include "block/block_radix_sort_upsweep_tiles.cuh"
+
41 #include "block/block_radix_sort_downsweep_tiles.cuh"
+
42 #include "block/block_scan_tiles.cuh"
+
43 #include "../grid/grid_even_share.cuh"
+
44 #include "../util_debug.cuh"
+
45 #include "../util_device.cuh"
+
46 #include "../util_namespace.cuh"
+
47 
+
49 CUB_NS_PREFIX
+
50 
+
52 namespace cub {
+
53 
+
54 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
55 
+
56 
+
57 
+
58 
+
59 /******************************************************************************
+
60  * Kernel entry points
+
61  *****************************************************************************/
+
62 
+
/**
 * Upsweep kernel entry point: each thread block counts digits over its
 * even-share range of keys and writes one count per digit to d_spine.
 *
 * The spine is written striped: the count held by thread t of block b goes to
 * d_spine[gridDim.x * t + b], for t below the RADIX_DIGITS of whichever
 * granularity is in use.
 *
 * num_items and first_pass are accepted but not read by this body.
 */
template <
    typename BlockRadixSortUpsweepTilesPolicy,
    typename Key,
    typename SizeT>
__launch_bounds__ (int(BlockRadixSortUpsweepTilesPolicy::BLOCK_THREADS), 1)
__global__ void RadixSortUpsweepKernel(
    Key                     *d_keys,
    SizeT                   *d_spine,
    SizeT                   num_items,
    int                     current_bit,
    bool                    use_primary_bit_granularity,
    bool                    first_pass,
    GridEvenShare<SizeT>    even_share)
{
    // Tile processors for the primary policy and for its alternate (fewer bits remaining) policy
    typedef BlockRadixSortUpsweepTiles<BlockRadixSortUpsweepTilesPolicy, Key, SizeT>                        PrimaryUpsweepT;
    typedef BlockRadixSortUpsweepTiles<typename BlockRadixSortUpsweepTilesPolicy::AltPolicy, Key, SizeT>    AlternateUpsweepT;

    // Only one of the two is live per launch, so their shared storage is overlaid
    __shared__ union
    {
        typename PrimaryUpsweepT::TempStorage       pass_storage;
        typename AlternateUpsweepT::TempStorage     alt_pass_storage;
    } temp_storage;

    // Initialize even-share descriptor for this thread block
    even_share.BlockInit();

    if (!use_primary_bit_granularity)
    {
        // Alternate granularity
        SizeT digit_count;
        AlternateUpsweepT(temp_storage.alt_pass_storage, d_keys, current_bit).ProcessTiles(
            even_share.block_offset,
            even_share.block_oob,
            digit_count);

        // Write out digit counts (striped)
        if (threadIdx.x < AlternateUpsweepT::RADIX_DIGITS)
        {
            d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = digit_count;
        }
    }
    else
    {
        // Primary granularity
        SizeT digit_count;
        PrimaryUpsweepT(temp_storage.pass_storage, d_keys, current_bit).ProcessTiles(
            even_share.block_offset,
            even_share.block_oob,
            digit_count);

        // Write out digit counts (striped)
        if (threadIdx.x < PrimaryUpsweepT::RADIX_DIGITS)
        {
            d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = digit_count;
        }
    }
}
+
131 
+
132 
+
/**
 * Spine scan kernel entry point: scans d_spine in place with cub::Sum, one
 * tile at a time, carrying a running total between tiles.
 *
 * d_spine      Scanned in place (passed as both input and output)
 * num_counts   Number of spine items; tiles are consumed at offsets
 *              0, TILE_ITEMS, 2*TILE_ITEMS, ... while the offset is below it.
 *              NOTE(review): every tile is consumed via ConsumeTile<true, ...>,
 *              which looks like it assumes num_counts is a whole number of
 *              tiles - confirm against the caller.
 */
template <
    typename BlockScanTilesPolicy,
    typename SizeT>
__launch_bounds__ (int(BlockScanTilesPolicy::BLOCK_THREADS), 1)
__global__ void RadixSortScanKernel(
    SizeT   *d_spine,
    int     num_counts)
{
    // Parameterize the BlockScanTiles type for the current configuration
    typedef BlockScanTiles<BlockScanTilesPolicy, SizeT*, SizeT*, cub::Sum, SizeT, SizeT> BlockScanTilesT;

    // Shared memory storage
    __shared__ typename BlockScanTilesT::TempStorage temp_storage;

    // Block scan instance
    BlockScanTilesT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), SizeT(0));

    // Running prefix carried from tile to tile
    RunningBlockPrefixOp<SizeT> prefix_op;
    prefix_op.running_total = 0;

    // Process input tiles
    for (int block_offset = 0; block_offset < num_counts; block_offset += BlockScanTilesT::TILE_ITEMS)
    {
        // BlockScanTilesT is a dependent type, so the member template needs the
        // `template` disambiguator to be well-formed C++
        block_scan.template ConsumeTile<true, false>(block_offset, prefix_op);
    }
}
+
163 
+
164 
+
/**
 * Downsweep kernel entry point: each thread block processes its even-share
 * range of tiles with either the primary or the alternate downsweep tile
 * processor, chosen by use_primary_bit_granularity.
 *
 * first_pass and last_pass are accepted but not read by this body.
 */
template <
    typename BlockRadixSortDownsweepTilesPolicy,
    typename Key,
    typename Value,
    typename SizeT>
__launch_bounds__ (int(BlockRadixSortDownsweepTilesPolicy::BLOCK_THREADS))
__global__ void RadixSortDownsweepKernel(
    Key                     *d_keys_in,
    Key                     *d_keys_out,
    Value                   *d_values_in,
    Value                   *d_values_out,
    SizeT                   *d_spine,
    SizeT                   num_items,
    int                     current_bit,
    bool                    use_primary_bit_granularity,
    bool                    first_pass,
    bool                    last_pass,
    GridEvenShare<SizeT>    even_share)
{
    // Tile processors for the primary policy and for its alternate (fewer bits remaining) policy
    typedef BlockRadixSortDownsweepTiles<BlockRadixSortDownsweepTilesPolicy, Key, Value, SizeT>                     PrimaryDownsweepT;
    typedef BlockRadixSortDownsweepTiles<typename BlockRadixSortDownsweepTilesPolicy::AltPolicy, Key, Value, SizeT> AlternateDownsweepT;

    // Only one of the two is live per launch, so their shared storage is overlaid
    __shared__ union
    {
        typename PrimaryDownsweepT::TempStorage     pass_storage;
        typename AlternateDownsweepT::TempStorage   alt_pass_storage;

    } temp_storage;

    // Initialize even-share descriptor for this thread block
    even_share.BlockInit();

    if (!use_primary_bit_granularity)
    {
        // Alternate granularity
        AlternateDownsweepT(
            temp_storage.alt_pass_storage,
            num_items,
            d_spine,
            d_keys_in,
            d_keys_out,
            d_values_in,
            d_values_out,
            current_bit).ProcessTiles(even_share.block_offset, even_share.block_oob);
    }
    else
    {
        // Primary granularity
        PrimaryDownsweepT(
            temp_storage.pass_storage,
            num_items,
            d_spine,
            d_keys_in,
            d_keys_out,
            d_values_in,
            d_values_out,
            current_bit).ProcessTiles(even_share.block_offset, even_share.block_oob);
    }
}
+
221 
+
222 
+
223 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
224 
+
225 
+
226 
+
227 
+
228 
+
229 /******************************************************************************
+
230  * DeviceRadixSort
+
231  *****************************************************************************/
+
232 
+ +
262 {
+
263  #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
264 
+
265 
+
266  /******************************************************************************
+
267  * Constants and typedefs
+
268  ******************************************************************************/
+
269 
+
271  struct KernelDispachParams
+
272  {
+
273  int block_threads;
+
274  int items_per_thread;
+
275  cudaSharedMemConfig smem_config;
+
276  int radix_bits;
+
277  int alt_radix_bits;
+
278  int subscription_factor;
+
279  int tile_size;
+
280 
+
281  template <typename SortBlockPolicy>
+
282  __host__ __device__ __forceinline__
+
283  void InitUpsweepPolicy(int subscription_factor = 1)
+
284  {
+
285  block_threads = SortBlockPolicy::BLOCK_THREADS;
+
286  items_per_thread = SortBlockPolicy::ITEMS_PER_THREAD;
+
287  radix_bits = SortBlockPolicy::RADIX_BITS;
+
288  alt_radix_bits = SortBlockPolicy::AltPolicy::RADIX_BITS;
+
289  smem_config = cudaSharedMemBankSizeFourByte;
+
290  this->subscription_factor = subscription_factor;
+
291  tile_size = block_threads * items_per_thread;
+
292  }
+
293 
+
294  template <typename ScanBlockPolicy>
+
295  __host__ __device__ __forceinline__
+
296  void InitScanPolicy()
+
297  {
+
298  block_threads = ScanBlockPolicy::BLOCK_THREADS;
+
299  items_per_thread = ScanBlockPolicy::ITEMS_PER_THREAD;
+
300  radix_bits = 0;
+
301  alt_radix_bits = 0;
+
302  smem_config = cudaSharedMemBankSizeFourByte;
+
303  subscription_factor = 0;
+
304  tile_size = block_threads * items_per_thread;
+
305  }
+
306 
+
307  template <typename SortBlockPolicy>
+
308  __host__ __device__ __forceinline__
+
309  void InitDownsweepPolicy(int subscription_factor = 1)
+
310  {
+
311  block_threads = SortBlockPolicy::BLOCK_THREADS;
+
312  items_per_thread = SortBlockPolicy::ITEMS_PER_THREAD;
+
313  radix_bits = SortBlockPolicy::RADIX_BITS;
+
314  alt_radix_bits = SortBlockPolicy::AltPolicy::RADIX_BITS;
+
315  smem_config = SortBlockPolicy::SMEM_CONFIG;
+
316  this->subscription_factor = subscription_factor;
+
317  tile_size = block_threads * items_per_thread;
+
318  }
+
319  };
+
320 
+
321 
+
322 
+
323  /******************************************************************************
+
324  * Tuning policies
+
325  ******************************************************************************/
+
326 
+
328  template <typename Key, typename Value, typename SizeT, int ARCH>
+
329  struct TunedPolicies;
+
330 
+
332  template <typename Key, typename Value, typename SizeT>
+
333  struct TunedPolicies<Key, Value, SizeT, 350>
+
334  {
+
335  enum {
+
336  KEYS_ONLY = (Equals<Value, NullType>::VALUE),
+
337  SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
+
338  RADIX_BITS = 5,
+
339  };
+
340 
+
341  // UpsweepPolicy
+
342  typedef BlockRadixSortUpsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys;
+
343  typedef BlockRadixSortUpsweepTilesPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs;
+
344  typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
+
345 /*
+
346  // 4bit
+
347  typedef BlockRadixSortUpsweepTilesPolicy <128, 15, LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys;
+
348  typedef BlockRadixSortUpsweepTilesPolicy <256, 13, LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs;
+
349 */
+
350  // ScanPolicy
+
351  typedef BlockScanTilesPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
352 
+
353  // DownsweepPolicy
+
354  typedef BlockRadixSortDownsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
+
355  typedef BlockRadixSortDownsweepTilesPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
+ +
357 
+
358 /*
+
359  // 4bit
+
360  typedef BlockRadixSortDownsweepTilesPolicy <128, 15, BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
+
361  typedef BlockRadixSortDownsweepTilesPolicy <256, 13, BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
+
362 */
+
363  enum { SUBSCRIPTION_FACTOR = 7 };
+
364  };
+
365 
+
366 
+
368  template <typename Key, typename Value, typename SizeT>
+
369  struct TunedPolicies<Key, Value, SizeT, 200>
+
370  {
+
371  enum {
+
372  KEYS_ONLY = (Equals<Value, NullType>::VALUE),
+
373  SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
+
374  RADIX_BITS = 5,
+
375  };
+
376 
+
377  // UpsweepPolicy
+
378  typedef BlockRadixSortUpsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys;
+
379  typedef BlockRadixSortUpsweepTilesPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs;
+
380  typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
+
381 
+
382  // ScanPolicy
+
383  typedef BlockScanTilesPolicy <512, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
384 
+
385  // DownsweepPolicy
+
386  typedef BlockRadixSortDownsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys;
+
387  typedef BlockRadixSortDownsweepTilesPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs;
+ +
389 
+
390  enum { SUBSCRIPTION_FACTOR = 3 };
+
391  };
+
392 
+
393 
+
395  template <typename Key, typename Value, typename SizeT>
+
396  struct TunedPolicies<Key, Value, SizeT, 100>
+
397  {
+
398  enum {
+
399  RADIX_BITS = 4,
+
400  };
+
401 
+
402  // UpsweepPolicy
+
403  typedef BlockRadixSortUpsweepTilesPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS> UpsweepPolicy;
+
404 
+
405  // ScanPolicy
+
406  typedef BlockScanTilesPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
407 
+
408  // DownsweepPolicy
+
409  typedef BlockRadixSortDownsweepTilesPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicy;
+
410 
+
411  enum { SUBSCRIPTION_FACTOR = 3 };
+
412  };
+
413 
+
414 
+
415 
+
416  /******************************************************************************
+
417  * Default policy initializer
+
418  ******************************************************************************/
+
419 
+
421  template <typename Key, typename Value, typename SizeT>
+
422  struct PtxDefaultPolicies
+
423  {
+
424 
+
425  static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ?
+
426  350 :
+
427  (CUB_PTX_ARCH >= 200) ?
+
428  200 :
+
429  100;
+
430 
+
431  // Tuned policy set for the current PTX compiler pass
+
432  typedef TunedPolicies<Key, Value, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;
+
433 
+
434  // UpsweepPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
+
435  struct UpsweepPolicy : PtxTunedPolicies::UpsweepPolicy {};
+
436 
+
437  // ScanPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
+
438  struct ScanPolicy : PtxTunedPolicies::ScanPolicy {};
+
439 
+
440  // DownsweepPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
+
441  struct DownsweepPolicy : PtxTunedPolicies::DownsweepPolicy {};
+
442 
+
443  // Subscription factor for the current PTX compiler pass
+
444  enum { SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR };
+
445 
+
446 
+
450  static void InitDispatchParams(
+
451  int ptx_version,
+
452  KernelDispachParams &upsweep_dispatch_params,
+
453  KernelDispachParams &scan_dispatch_params,
+
454  KernelDispachParams &downsweep_dispatch_params)
+
455  {
+
456  if (ptx_version >= 350)
+
457  {
+
458  typedef TunedPolicies<Key, Value, SizeT, 350> TunedPolicies;
+
459  upsweep_dispatch_params.InitUpsweepPolicy<typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
460  scan_dispatch_params.InitScanPolicy<typename TunedPolicies::ScanPolicy>();
+
461  downsweep_dispatch_params.InitDownsweepPolicy<typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
462  }
+
463  else if (ptx_version >= 200)
+
464  {
+
465  typedef TunedPolicies<Key, Value, SizeT, 200> TunedPolicies;
+
466  upsweep_dispatch_params.InitUpsweepPolicy<typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
467  scan_dispatch_params.InitScanPolicy<typename TunedPolicies::ScanPolicy>();
+
468  downsweep_dispatch_params.InitDownsweepPolicy<typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
469  }
+
470  else
+
471  {
+
472  typedef TunedPolicies<Key, Value, SizeT, 100> TunedPolicies;
+
473  upsweep_dispatch_params.InitUpsweepPolicy<typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
474  scan_dispatch_params.InitScanPolicy<typename TunedPolicies::ScanPolicy>();
+
475  downsweep_dispatch_params.InitDownsweepPolicy<typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
476  }
+
477  }
+
478  };
+
479 
+
480 
+
481 
+
482  /******************************************************************************
+
483  * Utility methods
+
484  ******************************************************************************/
+
485 
+
489  template <
+
490  typename UpsweepKernelPtr,
+
491  typename SpineKernelPtr,
+
492  typename DownsweepKernelPtr,
+
493  typename Key,
+
494  typename Value,
+
495  typename SizeT>
+
496  __host__ __device__ __forceinline__
+
497  static cudaError_t Dispatch(
+
498  void *d_temp_storage,
+
499  size_t &temp_storage_bytes,
+
500  UpsweepKernelPtr upsweep_kernel,
+
501  SpineKernelPtr scan_kernel,
+
502  DownsweepKernelPtr downsweep_kernel,
+
503  KernelDispachParams &upsweep_dispatch_params,
+
504  KernelDispachParams &scan_dispatch_params,
+
505  KernelDispachParams &downsweep_dispatch_params,
+
506  DoubleBuffer<Key> &d_keys,
+
507  DoubleBuffer<Value> &d_values,
+
508  SizeT num_items,
+
509  int begin_bit = 0,
+
510  int end_bit = sizeof(Key) * 8,
+
511  cudaStream_t stream = 0,
+
512  bool stream_synchronous = false)
+
513  {
+
514 #ifndef CUB_RUNTIME_ENABLED
+
515 
+
516  // Kernel launch not supported from this device
+
517  return CubDebug(cudaErrorNotSupported );
+
518 
+
519 #else
+
520 
+
521  cudaError error = cudaSuccess;
+
522  do
+
523  {
+
524  // Get device ordinal
+
525  int device_ordinal;
+
526  if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
527 
+
528  // Get SM count
+
529  int sm_count;
+
530  if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
531 
+
532  // Get a rough estimate of downsweep_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
+
533  int downsweep_sm_occupancy = CUB_MIN(
+ +
535  ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / downsweep_dispatch_params.block_threads);
+
536  int upsweep_sm_occupancy = downsweep_sm_occupancy;
+
537 
+
538 #ifndef __CUDA_ARCH__
+
539  // We're on the host, so come up with more accurate estimates of SM occupancy from actual device properties
+
540  Device device_props;
+
541  if (CubDebug(error = device_props.Init(device_ordinal))) break;
+
542 
+
543  if (CubDebug(error = device_props.MaxSmOccupancy(
+
544  downsweep_sm_occupancy,
+
545  downsweep_kernel,
+
546  downsweep_dispatch_params.block_threads))) break;
+
547 
+
548  if (CubDebug(error = device_props.MaxSmOccupancy(
+
549  upsweep_sm_occupancy,
+
550  upsweep_kernel,
+
551  upsweep_dispatch_params.block_threads))) break;
+
552 #endif
+
553  // Get device occupancies
+
554  int downsweep_occupancy = downsweep_sm_occupancy * sm_count;
+
555 
+
556  // Get even-share work distribution descriptor
+
557  GridEvenShare<SizeT> even_share;
+
558  int max_downsweep_grid_size = downsweep_occupancy * downsweep_dispatch_params.subscription_factor;
+
559  int downsweep_grid_size;
+
560  even_share.GridInit(num_items, max_downsweep_grid_size, downsweep_dispatch_params.tile_size);
+
561  downsweep_grid_size = even_share.grid_size;
+
562 
+
563  // Get number of spine elements (round up to nearest spine scan kernel tile size)
+
564  int bins = 1 << downsweep_dispatch_params.radix_bits;
+
565  int spine_size = downsweep_grid_size * bins;
+
566  int spine_tiles = (spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
+
567  spine_size = spine_tiles * scan_dispatch_params.tile_size;
+
568 
+
569  int alt_bins = 1 << downsweep_dispatch_params.alt_radix_bits;
+
570  int alt_spine_size = downsweep_grid_size * alt_bins;
+
571  int alt_spine_tiles = (alt_spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
+
572  alt_spine_size = alt_spine_tiles * scan_dispatch_params.tile_size;
+
573 
+
574  // Temporary storage allocation requirements
+
575  void* allocations[1];
+
576  size_t allocation_sizes[1] =
+
577  {
+
578  spine_size * sizeof(SizeT), // bytes needed for privatized block digit histograms
+
579  };
+
580 
+
581  // Alias temporaries (or set the necessary size of the storage allocation)
+
582  if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
583 
+
584  // Return if the caller is simply requesting the size of the storage allocation
+
585  if (d_temp_storage == NULL)
+
586  return cudaSuccess;
+
587 
+
588  // Privatized per-block digit histograms
+
589  SizeT *d_spine = (SizeT*) allocations[0];
+
590 
+
591 #ifndef __CUDA_ARCH__
+
592  // Get current smem bank configuration
+
593  cudaSharedMemConfig original_smem_config;
+
594  if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break;
+
595  cudaSharedMemConfig current_smem_config = original_smem_config;
+
596 #endif
+
597  // Iterate over digit places
+
598  int current_bit = begin_bit;
+
599  while (current_bit < end_bit)
+
600  {
+
601  // Use primary bit granularity if bits remaining is a whole multiple of bit primary granularity
+
602  int bits_remaining = end_bit - current_bit;
+
603  bool use_primary_bit_granularity = (bits_remaining % downsweep_dispatch_params.radix_bits == 0);
+
604  int radix_bits = (use_primary_bit_granularity) ?
+
605  downsweep_dispatch_params.radix_bits :
+
606  downsweep_dispatch_params.alt_radix_bits;
+
607 
+
608 #ifndef __CUDA_ARCH__
+
609  // Update smem config if necessary
+
610  if (current_smem_config != upsweep_dispatch_params.smem_config)
+
611  {
+
612  if (CubDebug(error = cudaDeviceSetSharedMemConfig(upsweep_dispatch_params.smem_config))) break;
+
613  current_smem_config = upsweep_dispatch_params.smem_config;
+
614  }
+
615 #endif
+
616 
+
617  // Log upsweep_kernel configuration
+
618  if (stream_synchronous)
+
619  CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, selector %d, current bit %d, bit_grain %d\n",
+
620  downsweep_grid_size, upsweep_dispatch_params.block_threads, (long long) stream, upsweep_dispatch_params.smem_config, upsweep_dispatch_params.items_per_thread, upsweep_sm_occupancy, d_keys.selector, current_bit, radix_bits);
+
621 
+
622  // Invoke upsweep_kernel with same grid size as downsweep_kernel
+
623  upsweep_kernel<<<downsweep_grid_size, upsweep_dispatch_params.block_threads, 0, stream>>>(
+
624  d_keys.d_buffers[d_keys.selector],
+
625  d_spine,
+
626  num_items,
+
627  current_bit,
+
628  use_primary_bit_granularity,
+
629  (current_bit == begin_bit),
+
630  even_share);
+
631 
+
632  // Sync the stream if specified
+
633  if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
634 
+
635  // Log scan_kernel configuration
+
636  if (stream_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+
637  1, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread);
+
638 
+
639  // Invoke scan_kernel
+
640  scan_kernel<<<1, scan_dispatch_params.block_threads, 0, stream>>>(
+
641  d_spine,
+
642  (use_primary_bit_granularity) ? spine_size : alt_spine_size);
+
643 
+
644  // Sync the stream if specified
+
645  if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
646 
+
647 #ifndef __CUDA_ARCH__
+
648  // Update smem config if necessary
+
649  if (current_smem_config != downsweep_dispatch_params.smem_config)
+
650  {
+
651  if (CubDebug(error = cudaDeviceSetSharedMemConfig(downsweep_dispatch_params.smem_config))) break;
+
652  current_smem_config = downsweep_dispatch_params.smem_config;
+
653  }
+
654 #endif
+
655 
+
656  // Log downsweep_kernel configuration
+
657  if (stream_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n",
+
658  downsweep_grid_size, downsweep_dispatch_params.block_threads, (long long) stream, downsweep_dispatch_params.smem_config, downsweep_dispatch_params.items_per_thread, downsweep_sm_occupancy);
+
659 
+
660  // Invoke downsweep_kernel
+
661  downsweep_kernel<<<downsweep_grid_size, downsweep_dispatch_params.block_threads, 0, stream>>>(
+
662  d_keys.d_buffers[d_keys.selector],
+
663  d_keys.d_buffers[d_keys.selector ^ 1],
+
664  d_values.d_buffers[d_values.selector],
+
665  d_values.d_buffers[d_values.selector ^ 1],
+
666  d_spine,
+
667  num_items,
+
668  current_bit,
+
669  use_primary_bit_granularity,
+
670  (current_bit == begin_bit),
+
671  (current_bit + downsweep_dispatch_params.radix_bits >= end_bit),
+
672  even_share);
+
673 
+
674  // Sync the stream if specified
+
675  if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
676 
+
677  // Invert selectors
+
678  d_keys.selector ^= 1;
+
679  d_values.selector ^= 1;
+
680 
+
681  // Update current bit position
+
682  current_bit += radix_bits;
+
683  }
+
684 
+
685 #ifndef __CUDA_ARCH__
+
686  // Reset smem config if necessary
+
687  if (current_smem_config != original_smem_config)
+
688  {
+
689  if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break;
+
690  }
+
691 #endif
+
692 
+
693  }
+
694  while (0);
+
695 
+
696  return error;
+
697 
+
698 #endif // CUB_RUNTIME_ENABLED
+
699  }
+
700 
+
701 
+
702 
+
703  #endif // DOXYGEN_SHOULD_SKIP_THIS
+
704 
+
705  /******************************************************************************
+
706  * Interface
+
707  ******************************************************************************/
+
708 
+
709 
+
755  template <
+
756  typename Key,
+
757  typename Value>
+
758  __host__ __device__ __forceinline__
+
759  static cudaError_t SortPairs(
+
760  void *d_temp_storage,
+
761  size_t &temp_storage_bytes,
+
762  DoubleBuffer<Key> &d_keys,
+
763  DoubleBuffer<Value> &d_values,
+
764  int num_items,
+
765  int begin_bit = 0,
+
766  int end_bit = sizeof(Key) * 8,
+
767  cudaStream_t stream = 0,
+
768  bool stream_synchronous = false)
+
769  {
+
770  // Type used for array indexing
+
771  typedef int SizeT;
+
772 
+
773  // Tuning polices
+
774  typedef PtxDefaultPolicies<Key, Value, SizeT> PtxDefaultPolicies; // Wrapper of default kernel policies
+
775  typedef typename PtxDefaultPolicies::UpsweepPolicy UpsweepPolicy; // Upsweep kernel policy
+
776  typedef typename PtxDefaultPolicies::ScanPolicy ScanPolicy; // Scan kernel policy
+
777  typedef typename PtxDefaultPolicies::DownsweepPolicy DownsweepPolicy; // Downsweep kernel policy
+
778 
+
779  cudaError error = cudaSuccess;
+
780  do
+
781  {
+
782  // Declare dispatch parameters
+
783  KernelDispachParams upsweep_dispatch_params;
+
784  KernelDispachParams scan_dispatch_params;
+
785  KernelDispachParams downsweep_dispatch_params;
+
786 
+
787 #ifdef __CUDA_ARCH__
+
788  // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly
+
789  upsweep_dispatch_params.InitUpsweepPolicy<UpsweepPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR);
+
790  scan_dispatch_params.InitScanPolicy<ScanPolicy>();
+
791  downsweep_dispatch_params.InitDownsweepPolicy<DownsweepPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR);
+
792 #else
+
793  // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version
+
794  int ptx_version;
+
795  if (CubDebug(error = PtxVersion(ptx_version))) break;
+
796  PtxDefaultPolicies::InitDispatchParams(
+
797  ptx_version,
+
798  upsweep_dispatch_params,
+
799  scan_dispatch_params,
+
800  downsweep_dispatch_params);
+
801 #endif
+
802  // Dispatch
+
803  if (CubDebug(error = Dispatch(
+
804  d_temp_storage,
+
805  temp_storage_bytes,
+
806  RadixSortUpsweepKernel<UpsweepPolicy, Key, SizeT>,
+
807  RadixSortScanKernel<ScanPolicy, SizeT>,
+
808  RadixSortDownsweepKernel<DownsweepPolicy, Key, Value, SizeT>,
+
809  upsweep_dispatch_params,
+
810  scan_dispatch_params,
+
811  downsweep_dispatch_params,
+
812  d_keys,
+
813  d_values,
+
814  num_items,
+
815  begin_bit,
+
816  end_bit,
+
817  stream,
+
818  stream_synchronous))) break;
+
819  }
+
820  while (0);
+
821 
+
822  return error;
+
823  }
+
824 
+
825 
+
868  template <typename Key>
+
869  __host__ __device__ __forceinline__
+
870  static cudaError_t SortKeys(
+
871  void *d_temp_storage,
+
872  size_t &temp_storage_bytes,
+
873  DoubleBuffer<Key> &d_keys,
+
874  int num_items,
+
875  int begin_bit = 0,
+
876  int end_bit = sizeof(Key) * 8,
+
877  cudaStream_t stream = 0,
+
878  bool stream_synchronous = false)
+
879  {
+
880  DoubleBuffer<NullType> d_values;
+
881  return SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream, stream_synchronous);
+
882  }
+
883 
+
884 };
+
885 
+
886 
+
887 } // CUB namespace
+
888 CUB_NS_POSTFIX // Optional outer namespace(s)
+
889 
+
890 
+
+ + + + + diff --git a/docs/html/device__reduce_8cuh_source.html b/docs/html/device__reduce_8cuh_source.html new file mode 100644 index 0000000000..872040822c --- /dev/null +++ b/docs/html/device__reduce_8cuh_source.html @@ -0,0 +1,765 @@ + + + + + + + +CUB: device_reduce.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
device_reduce.cuh
+
+
+Go to the documentation of this file.
1 
+
2 /******************************************************************************
+
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
4  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
5  *
+
6  * Redistribution and use in source and binary forms, with or without
+
7  * modification, are permitted provided that the following conditions are met:
+
8  * * Redistributions of source code must retain the above copyright
+
9  * notice, this list of conditions and the following disclaimer.
+
10  * * Redistributions in binary form must reproduce the above copyright
+
11  * notice, this list of conditions and the following disclaimer in the
+
12  * documentation and/or other materials provided with the distribution.
+
13  * * Neither the name of the NVIDIA CORPORATION nor the
+
14  * names of its contributors may be used to endorse or promote products
+
15  * derived from this software without specific prior written permission.
+
16  *
+
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
27  *
+
28  ******************************************************************************/
+
29 
+
35 #pragma once
+
36 
+
37 #include <stdio.h>
+
38 #include <iterator>
+
39 
+
40 #include "block/block_reduce_tiles.cuh"
+
41 #include "../thread/thread_operators.cuh"
+
42 #include "../grid/grid_even_share.cuh"
+
43 #include "../grid/grid_queue.cuh"
+
44 #include "../util_debug.cuh"
+
45 #include "../util_device.cuh"
+
46 #include "../util_namespace.cuh"
+
47 
+
49 CUB_NS_PREFIX
+
50 
+
52 namespace cub {
+
53 
+
54 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
55 
+
56 
+
57 
+
58 
+
59 
+
60 
+
61 /******************************************************************************
+
62  * Kernel entry points
+
63  *****************************************************************************/
+
64 
+
68 template <
+
69  typename BlockReduceTilesPolicy,
+
70  typename InputIteratorRA,
+
71  typename OutputIteratorRA,
+
72  typename SizeT,
+
73  typename ReductionOp>
+
74 __launch_bounds__ (int(BlockReduceTilesPolicy::BLOCK_THREADS), 1)
+
75 __global__ void ReducePrivatizedKernel(
+
76  InputIteratorRA d_in,
+
77  OutputIteratorRA d_out,
+
78  SizeT num_items,
+
79  GridEvenShare<SizeT> even_share,
+
80  GridQueue<SizeT> queue,
+
81  ReductionOp reduction_op)
+
82 {
+
83  // Data type
+
84  typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
+
85 
+
86  // Thread block type for reducing input tiles
+
87  typedef BlockReduceTiles<BlockReduceTilesPolicy, InputIteratorRA, SizeT, ReductionOp> BlockReduceTilesT;
+
88 
+
89  // Block-wide aggregate
+
90  T block_aggregate;
+
91 
+
92  // Shared memory storage
+
93  __shared__ typename BlockReduceTilesT::TempStorage temp_storage;
+
94 
+
95  // Consume input tiles
+
96  BlockReduceTilesT(temp_storage, d_in, reduction_op).ConsumeTiles(
+
97  num_items,
+
98  even_share,
+
99  queue,
+
100  block_aggregate,
+
101  Int2Type<BlockReduceTilesPolicy::GRID_MAPPING>());
+
102 
+
103  // Output result
+
104  if (threadIdx.x == 0)
+
105  {
+
106  d_out[blockIdx.x] = block_aggregate;
+
107  }
+
108 }
+
109 
+
110 
+
114 template <
+
115  typename BlockReduceTilesPolicy,
+
116  typename InputIteratorRA,
+
117  typename OutputIteratorRA,
+
118  typename SizeT,
+
119  typename ReductionOp>
+
120 __launch_bounds__ (int(BlockReduceTilesPolicy::BLOCK_THREADS), 1)
+
121 __global__ void ReduceSingleKernel(
+
122  InputIteratorRA d_in,
+
123  OutputIteratorRA d_out,
+
124  SizeT num_items,
+
125  ReductionOp reduction_op)
+
126 {
+
127  // Data type
+
128  typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
+
129 
+
130  // Thread block type for reducing input tiles
+
131  typedef BlockReduceTiles<BlockReduceTilesPolicy, InputIteratorRA, SizeT, ReductionOp> BlockReduceTilesT;
+
132 
+
133  // Block-wide aggregate
+
134  T block_aggregate;
+
135 
+
136  // Shared memory storage
+
137  __shared__ typename BlockReduceTilesT::TempStorage temp_storage;
+
138 
+
139  // Consume input tiles
+
140  BlockReduceTilesT(temp_storage, d_in, reduction_op).ConsumeTiles(
+
141  SizeT(0),
+
142  SizeT(num_items),
+
143  block_aggregate);
+
144 
+
145  // Output result
+
146  if (threadIdx.x == 0)
+
147  {
+
148  d_out[blockIdx.x] = block_aggregate;
+
149  }
+
150 }
+
151 
+
152 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
153 
+
154 
+
155 /******************************************************************************
+
156  * DeviceReduce
+
157  *****************************************************************************/
+
158 
+ +
176 {
+
177 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
178 
+
179 
+
180  /******************************************************************************
+
181  * Constants and typedefs
+
182  ******************************************************************************/
+
183 
+
185  struct KernelDispachParams
+
186  {
+
187  int block_threads;
+
188  int items_per_thread;
+
189  int vector_load_length;
+
190  BlockReduceAlgorithm block_algorithm;
+
191  PtxLoadModifier load_modifier;
+
192  GridMappingStrategy grid_mapping;
+
193  int subscription_factor;
+
194  int tile_size;
+
195 
+
196  template <typename BlockPolicy>
+
197  __host__ __device__ __forceinline__
+
198  void Init(int subscription_factor = 1)
+
199  {
+
200  block_threads = BlockPolicy::BLOCK_THREADS;
+
201  items_per_thread = BlockPolicy::ITEMS_PER_THREAD;
+
202  vector_load_length = BlockPolicy::VECTOR_LOAD_LENGTH;
+
203  block_algorithm = BlockPolicy::BLOCK_ALGORITHM;
+
204  load_modifier = BlockPolicy::LOAD_MODIFIER;
+
205  grid_mapping = BlockPolicy::GRID_MAPPING;
+
206  this->subscription_factor = subscription_factor;
+
207  tile_size = block_threads * items_per_thread;
+
208  }
+
209 
+
210  __host__ __device__ __forceinline__
+
211  void Print()
+
212  {
+
213  printf("%d threads, %d per thread, %d veclen, %d algo, %d loadmod, %d mapping, %d subscription",
+
214  block_threads,
+
215  items_per_thread,
+
216  vector_load_length,
+
217  block_algorithm,
+
218  load_modifier,
+
219  grid_mapping,
+
220  subscription_factor);
+
221  }
+
222 
+
223  };
+
224 
+
225 
+
226  /******************************************************************************
+
227  * Tuning policies
+
228  ******************************************************************************/
+
229 
+
231  template <
+
232  typename T,
+
233  typename SizeT,
+
234  int ARCH>
+
235  struct TunedPolicies;
+
236 
+
238  template <typename T, typename SizeT>
+
239  struct TunedPolicies<T, SizeT, 350>
+
240  {
+
241  // PrivatizedPolicy (1B): GTX Titan: 206.0 GB/s @ 192M 1B items
+
242  typedef BlockReduceTilesPolicy<128, 12, 1, BLOCK_REDUCE_RAKING, LOAD_LDG, GRID_MAPPING_DYNAMIC> PrivatizedPolicy1B;
+
243 
+
244  // PrivatizedPolicy (4B): GTX Titan: 254.2 GB/s @ 48M 4B items
+
245  typedef BlockReduceTilesPolicy<512, 20, 1, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy4B;
+
246 
+
247  // PrivatizedPolicy
+
248  typedef typename If<(sizeof(T) < 4),
+
249  PrivatizedPolicy1B,
+
250  PrivatizedPolicy4B>::Type PrivatizedPolicy;
+
251 
+
252  // SinglePolicy
+
253  typedef BlockReduceTilesPolicy<256, 8, 1, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy;
+
254 
+
255  enum { SUBSCRIPTION_FACTOR = 7 };
+
256 
+
257  };
+
258 
+
260  template <typename T, typename SizeT>
+
261  struct TunedPolicies<T, SizeT, 300>
+
262  {
+
263  // PrivatizedPolicy: GTX670: 154.0 @ 48M 32-bit T
+
264  typedef BlockReduceTilesPolicy<256, 2, 1, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy;
+
265 
+
266  // SinglePolicy
+
267  typedef BlockReduceTilesPolicy<256, 24, 4, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy;
+
268 
+
269  enum { SUBSCRIPTION_FACTOR = 1 };
+
270  };
+
271 
+
273  template <typename T, typename SizeT>
+
274  struct TunedPolicies<T, SizeT, 200>
+
275  {
+
276  // PrivatizedPolicy (1B): GTX 580: 158.1 GB/s @ 192M 1B items
+
277  typedef BlockReduceTilesPolicy<192, 24, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy1B;
+
278 
+
279  // PrivatizedPolicy (4B): GTX 580: 178.9 GB/s @ 48M 4B items
+
280  typedef BlockReduceTilesPolicy<128, 8, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_DYNAMIC> PrivatizedPolicy4B;
+
281 
+
282  // PrivatizedPolicy
+
283  typedef typename If<(sizeof(T) < 4),
+
284  PrivatizedPolicy1B,
+
285  PrivatizedPolicy4B>::Type PrivatizedPolicy;
+
286 
+
287  // SinglePolicy
+
288  typedef BlockReduceTilesPolicy<192, 7, 1, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy;
+
289 
+
290  enum { SUBSCRIPTION_FACTOR = 2 };
+
291  };
+
292 
+
294  template <typename T, typename SizeT>
+
295  struct TunedPolicies<T, SizeT, 130>
+
296  {
+
297  // PrivatizedPolicy
+
298  typedef BlockReduceTilesPolicy<128, 8, 2, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy;
+
299 
+
300  // SinglePolicy
+
301  typedef BlockReduceTilesPolicy<32, 4, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy;
+
302 
+
303  enum { SUBSCRIPTION_FACTOR = 1 };
+
304  };
+
305 
+
307  template <typename T, typename SizeT>
+
308  struct TunedPolicies<T, SizeT, 100>
+
309  {
+
310  // PrivatizedPolicy
+
311  typedef BlockReduceTilesPolicy<128, 8, 2, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy;
+
312 
+
313  // SinglePolicy
+
314  typedef BlockReduceTilesPolicy<32, 4, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy;
+
315 
+
316  enum { SUBSCRIPTION_FACTOR = 1 };
+
317  };
+
318 
+
319 
+
320 
+
321  /******************************************************************************
+
322  * Default policy initializer
+
323  ******************************************************************************/
+
324 
+
326  template <typename T, typename SizeT>
+
327  struct PtxDefaultPolicies
+
328  {
+
329  static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ?
+
330  350 :
+
331  (CUB_PTX_ARCH >= 300) ?
+
332  300 :
+
333  (CUB_PTX_ARCH >= 200) ?
+
334  200 :
+
335  (CUB_PTX_ARCH >= 130) ?
+
336  130 :
+
337  100;
+
338 
+
339  // Tuned policy set for the current PTX compiler pass
+
340  typedef TunedPolicies<T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;
+
341 
+
342  // Subscription factor for the current PTX compiler pass
+
343  static const int SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR;
+
344 
+
345  // PrivatizedPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
+
346  struct PrivatizedPolicy : PtxTunedPolicies::PrivatizedPolicy {};
+
347 
+
348  // SinglePolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
+
349  struct SinglePolicy : PtxTunedPolicies::SinglePolicy {};
+
350 
+
351 
+
/**
 * Populate both dispatch-parameter records from the tuned policy set that
 * matches ptx_version.
 *
 * Thresholds are tested from the highest (350) down to 130; any version
 * below 130 falls back to the 100 policy set.
 *
 * ptx_version                 PTX version used to pick the policy set
 * privatized_dispatch_params  [out] initialized from the set's PrivatizedPolicy and SUBSCRIPTION_FACTOR
 * single_dispatch_params      [out] initialized from the set's SinglePolicy
 */
static void InitDispatchParams(
    int                 ptx_version,
    KernelDispachParams &privatized_dispatch_params,
    KernelDispachParams &single_dispatch_params)
{
    if (ptx_version >= 350)
    {
        typedef TunedPolicies<T, SizeT, 350> Policies350;
        privatized_dispatch_params.Init<typename Policies350::PrivatizedPolicy>(Policies350::SUBSCRIPTION_FACTOR);
        single_dispatch_params.Init<typename Policies350::SinglePolicy>();
        return;
    }

    if (ptx_version >= 300)
    {
        typedef TunedPolicies<T, SizeT, 300> Policies300;
        privatized_dispatch_params.Init<typename Policies300::PrivatizedPolicy>(Policies300::SUBSCRIPTION_FACTOR);
        single_dispatch_params.Init<typename Policies300::SinglePolicy>();
        return;
    }

    if (ptx_version >= 200)
    {
        typedef TunedPolicies<T, SizeT, 200> Policies200;
        privatized_dispatch_params.Init<typename Policies200::PrivatizedPolicy>(Policies200::SUBSCRIPTION_FACTOR);
        single_dispatch_params.Init<typename Policies200::SinglePolicy>();
        return;
    }

    if (ptx_version >= 130)
    {
        typedef TunedPolicies<T, SizeT, 130> Policies130;
        privatized_dispatch_params.Init<typename Policies130::PrivatizedPolicy>(Policies130::SUBSCRIPTION_FACTOR);
        single_dispatch_params.Init<typename Policies130::SinglePolicy>();
        return;
    }

    // Fallback: lowest tuned policy set
    typedef TunedPolicies<T, SizeT, 100> Policies100;
    privatized_dispatch_params.Init<typename Policies100::PrivatizedPolicy>(Policies100::SUBSCRIPTION_FACTOR);
    single_dispatch_params.Init<typename Policies100::SinglePolicy>();
}
+
391  };
+
392 
+
393 
+
394 
+
395  /******************************************************************************
+
396  * Utility methods
+
397  ******************************************************************************/
+
398 
+
/**
 * Internal dispatch routine for the reduction: launches either a single
 * single-block kernel, or a multi-block "privatized" kernel followed by a
 * single-block kernel that reduces the per-block partials.
 *
 * When d_temp_storage is NULL no work is done; the required allocation size
 * is written to temp_storage_bytes and cudaSuccess is returned.
 *
 * d_temp_storage              device scratch allocation, or NULL to query its size
 * temp_storage_bytes          [in,out] size in bytes of the scratch allocation
 * privatized_kernel           multi-block reduction kernel (NULL forces the single-block path)
 * single_kernel               single-block reduction kernel
 * prepare_drain_kernel        kernel that resets the grid queue (dynamic mapping only)
 * privatized_dispatch_params  launch parameters for privatized_kernel
 * single_dispatch_params      launch parameters for single_kernel
 * d_in / d_out                input and output iterators
 * num_items                   number of input items
 * reduction_op                binary reduction functor
 * stream                      stream every kernel is launched into
 * stream_synchronous          when true, log each launch and sync the stream after it
 *
 * Returns the first CUDA error encountered, or cudaSuccess.
 */
template <
    typename ReducePrivatizedKernelPtr,
    typename ReduceSingleKernelPtr,
    typename ResetDrainKernelPtr,
    typename InputIteratorRA,
    typename OutputIteratorRA,
    typename SizeT,
    typename ReductionOp>
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
    void                        *d_temp_storage,
    size_t                      &temp_storage_bytes,
    ReducePrivatizedKernelPtr   privatized_kernel,
    ReduceSingleKernelPtr       single_kernel,
    ResetDrainKernelPtr         prepare_drain_kernel,
    KernelDispachParams         &privatized_dispatch_params,
    KernelDispachParams         &single_dispatch_params,
    InputIteratorRA             d_in,
    OutputIteratorRA            d_out,
    SizeT                       num_items,
    ReductionOp                 reduction_op,
    cudaStream_t                stream = 0,
    bool                        stream_synchronous = false)
{
#ifndef CUB_RUNTIME_ENABLED

    // Kernel launch not supported from this device
    return CubDebug(cudaErrorNotSupported );

#else

    // Data type of input iterator
    typedef typename std::iterator_traits<InputIteratorRA>::value_type T;

    cudaError error = cudaSuccess;
    do
    {
        if ((privatized_kernel == NULL) || (num_items <= (single_dispatch_params.tile_size)))
        {
            // Dispatch a single-block reduction kernel

            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                temp_storage_bytes = 1;
                return cudaSuccess;
            }

            // Log single_kernel configuration
            if (stream_synchronous) CubLog("Invoking ReduceSingle<<<1, %d, 0, %lld>>>(), %d items per thread\n",
                single_dispatch_params.block_threads, (long long) stream, single_dispatch_params.items_per_thread);

            // Invoke single_kernel in the caller's stream (previously the stream
            // argument was omitted, so the kernel ran in the default stream)
            single_kernel<<<1, single_dispatch_params.block_threads, 0, stream>>>(
                d_in,
                d_out,
                num_items,
                reduction_op);

            // Surface launch-configuration errors
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified
            if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
        else
        {
            // Dispatch two kernels: a multi-block kernel to compute
            // privatized per-block reductions, and then a single-block
            // to reduce those

            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Get a rough estimate of privatized_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
            // NOTE(review): the first CUB_MIN operand is missing from this listing (the line was
            // dropped during extraction); MAX_SM_THREADBLOCKS is assumed -- confirm against device_reduce.cuh
            int privatized_sm_occupancy = CUB_MIN(
                ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS,
                ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / privatized_dispatch_params.block_threads);

#ifndef __CUDA_ARCH__
            // We're on the host, so come up with a more accurate estimate of privatized_kernel SM occupancy from actual device properties
            Device device_props;
            if (CubDebug(error = device_props.Init(device_ordinal))) break;

            if (CubDebug(error = device_props.MaxSmOccupancy(
                privatized_sm_occupancy,
                privatized_kernel,
                privatized_dispatch_params.block_threads))) break;
#endif

            // Get device occupancy for privatized_kernel
            int privatized_occupancy = privatized_sm_occupancy * sm_count;

            // Even-share work distribution
            GridEvenShare<SizeT> even_share;

            // Get grid size for privatized_kernel
            int privatized_grid_size;
            switch (privatized_dispatch_params.grid_mapping)
            {
            case GRID_MAPPING_EVEN_SHARE:

                // Work is distributed evenly
                even_share.GridInit(
                    num_items,
                    privatized_occupancy * privatized_dispatch_params.subscription_factor,
                    privatized_dispatch_params.tile_size);
                privatized_grid_size = even_share.grid_size;
                break;

            case GRID_MAPPING_DYNAMIC:
                {
                    // Work is distributed dynamically (braces scope the local so no
                    // case label can jump past its initialization)
                    int num_tiles = (num_items + privatized_dispatch_params.tile_size - 1) / privatized_dispatch_params.tile_size;
                    privatized_grid_size = (num_tiles < privatized_occupancy) ?
                        num_tiles :                 // Not enough to fill the device with threadblocks
                        privatized_occupancy;       // Fill the device with threadblocks
                }
                break;
            };

            // Temporary storage allocation requirements
            void* allocations[2];
            size_t allocation_sizes[2] =
            {
                privatized_grid_size * sizeof(T),       // bytes needed for privatized block reductions
                GridQueue<SizeT>::AllocationSize()      // bytes needed for grid queue descriptor (same SizeT as the queue built below)
            };

            // Alias temporaries (or set the necessary size of the storage allocation)
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;

            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
                return cudaSuccess;

            // Privatized per-block reductions
            T *d_block_reductions = (T*) allocations[0];

            // Grid queue descriptor
            GridQueue<SizeT> queue(allocations[1]);

            // Prepare the dynamic queue descriptor if necessary
            if (privatized_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC)
            {
                // Prepare queue using a kernel so we know it gets prepared once per operation
                if (stream_synchronous) CubLog("Invoking prepare_drain_kernel<<<1, 1, 0, %lld>>>()\n", (long long) stream);

                // Invoke prepare_drain_kernel
                prepare_drain_kernel<<<1, 1, 0, stream>>>(queue, num_items);

                // Surface launch-configuration errors
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified
                if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }

            // Log privatized_kernel configuration
            if (stream_synchronous) CubLog("Invoking privatized_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                privatized_grid_size, privatized_dispatch_params.block_threads, (long long) stream, privatized_dispatch_params.items_per_thread, privatized_sm_occupancy);

            // Invoke privatized_kernel
            privatized_kernel<<<privatized_grid_size, privatized_dispatch_params.block_threads, 0, stream>>>(
                d_in,
                d_block_reductions,
                num_items,
                even_share,
                queue,
                reduction_op);

            // Surface launch-configuration errors
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified
            if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Log single_kernel configuration
            if (stream_synchronous) CubLog("Invoking single_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
                1, single_dispatch_params.block_threads, (long long) stream, single_dispatch_params.items_per_thread);

            // Invoke single_kernel over the per-block partials
            single_kernel<<<1, single_dispatch_params.block_threads, 0, stream>>>(
                d_block_reductions,
                d_out,
                privatized_grid_size,
                reduction_op);

            // Surface launch-configuration errors
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified
            if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
        }
    }
    while (0);

    return error;

#endif // CUB_RUNTIME_ENABLED
}
+
596 
+
597 
+
598 
+
599 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
600 
+
601  /******************************************************************************
+
602  * Interface
+
603  ******************************************************************************/
+
604 
+
/**
 * Reduce num_items items read from d_in with reduction_op and write the
 * result through d_out.
 *
 * On the host the kernel launch parameters are chosen from the PTX version
 * reported by PtxVersion(); in device code they come directly from the
 * policies of the current compiler pass. The work itself is delegated to
 * Dispatch(), including the d_temp_storage == NULL size query.
 *
 * Returns the first CUDA error encountered, or cudaSuccess.
 */
template <
    typename InputIteratorRA,
    typename OutputIteratorRA,
    typename ReductionOp>
__host__ __device__ __forceinline__
static cudaError_t Reduce(
    void            *d_temp_storage,
    size_t          &temp_storage_bytes,
    InputIteratorRA d_in,
    OutputIteratorRA d_out,
    int             num_items,
    ReductionOp     reduction_op,
    cudaStream_t    stream = 0,
    bool            stream_synchronous = false)
{
    // Index type and input value type
    typedef int SizeT;
    typedef typename std::iterator_traits<InputIteratorRA>::value_type T;

    // Tuning policies
    typedef PtxDefaultPolicies<T, SizeT>                DefaultPolicies;    // Wrapper of default kernel policies
    typedef typename DefaultPolicies::PrivatizedPolicy  PrivatizedPolicy;   // Multi-block kernel policy
    typedef typename DefaultPolicies::SinglePolicy      SinglePolicy;       // Single-block kernel policy

    cudaError error = cudaSuccess;

    // Launch parameters for the two kernels
    KernelDispachParams privatized_dispatch_params;
    KernelDispachParams single_dispatch_params;

#ifdef __CUDA_ARCH__
    // Device side: the policies of this compiler pass are the right ones
    privatized_dispatch_params.Init<PrivatizedPolicy>(DefaultPolicies::SUBSCRIPTION_FACTOR);
    single_dispatch_params.Init<SinglePolicy>();
#else
    // Host side: look up the device's PTX version and select the matching policies
    int ptx_version;
    if (CubDebug(error = PtxVersion(ptx_version))) return error;
    DefaultPolicies::InitDispatchParams(ptx_version, privatized_dispatch_params, single_dispatch_params);
#endif

    // Hand off to the generic dispatch routine
    error = Dispatch(
        d_temp_storage,
        temp_storage_bytes,
        ReducePrivatizedKernel<PrivatizedPolicy, InputIteratorRA, T*, SizeT, ReductionOp>,
        ReduceSingleKernel<SinglePolicy, T*, OutputIteratorRA, SizeT, ReductionOp>,
        ResetDrainKernel<SizeT>,
        privatized_dispatch_params,
        single_dispatch_params,
        d_in,
        d_out,
        num_items,
        reduction_op,
        stream,
        stream_synchronous);

    // Report (log) any failure before returning it
    CubDebug(error);
    return error;
}
+
710 
+
711 
+
/**
 * Sum-reduction convenience wrapper: forwards every argument to Reduce()
 * with a cub::Sum functor as the reduction operator.
 */
template <
    typename InputIteratorRA,
    typename OutputIteratorRA>
__host__ __device__ __forceinline__
static cudaError_t Sum(
    void            *d_temp_storage,
    size_t          &temp_storage_bytes,
    InputIteratorRA d_in,
    OutputIteratorRA d_out,
    int             num_items,
    cudaStream_t    stream = 0,
    bool            stream_synchronous = false)
{
    // Addition functor handed to the generic reduction
    cub::Sum sum_op;

    return Reduce(
        d_temp_storage,
        temp_storage_bytes,
        d_in,
        d_out,
        num_items,
        sum_op,
        stream,
        stream_synchronous);
}
+
767 
+
768 
+
769 };
+
770 
+
771 
+
772 } // CUB namespace
+
773 CUB_NS_POSTFIX // Optional outer namespace(s)
+
774 
+
775 
+
+ + + + + diff --git a/docs/html/device__scan_8cuh_source.html b/docs/html/device__scan_8cuh_source.html new file mode 100644 index 0000000000..7ed16146e7 --- /dev/null +++ b/docs/html/device__scan_8cuh_source.html @@ -0,0 +1,710 @@ + + + + + + + +CUB: device_scan.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
device_scan.cuh
+
+
+Go to the documentation of this file.
1 
+
2 /******************************************************************************
+
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
4  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
5  *
+
6  * Redistribution and use in source and binary forms, with or without
+
7  * modification, are permitted provided that the following conditions are met:
+
8  * * Redistributions of source code must retain the above copyright
+
9  * notice, this list of conditions and the following disclaimer.
+
10  * * Redistributions in binary form must reproduce the above copyright
+
11  * notice, this list of conditions and the following disclaimer in the
+
12  * documentation and/or other materials provided with the distribution.
+
13  * * Neither the name of the NVIDIA CORPORATION nor the
+
14  * names of its contributors may be used to endorse or promote products
+
15  * derived from this software without specific prior written permission.
+
16  *
+
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
27  *
+
28  ******************************************************************************/
+
29 
+
35 #pragma once
+
36 
+
37 #include <stdio.h>
+
38 #include <iterator>
+
39 
+
40 #include "block/block_scan_tiles.cuh"
+
41 #include "../thread/thread_operators.cuh"
+
42 #include "../grid/grid_queue.cuh"
+
43 #include "../util_debug.cuh"
+
44 #include "../util_device.cuh"
+
45 #include "../util_namespace.cuh"
+
46 
+
48 CUB_NS_PREFIX
+
49 
+
51 namespace cub {
+
52 
+
53 
+
54 /******************************************************************************
+
55  * Kernel entry points
+
56  *****************************************************************************/
+
57 
+
58 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
59 
+
60 
+
/**
 * Initialization kernel for the scan: resets the grid queue and marks the
 * tile-status descriptors.
 *
 * Thread 0 of block 0 resets the queue for num_tiles tiles. Each thread whose
 * global index is below num_tiles marks descriptor [PADDING + index] as
 * SCAN_TILE_INVALID, and the first PADDING threads of block 0 mark the
 * leading descriptors as SCAN_TILE_OOB (PADDING = PtxArchProps::WARP_THREADS).
 */
template <
    typename T,
    typename SizeT>
__global__ void ScanInitKernel(
    GridQueue<SizeT>        grid_queue,
    ScanTileDescriptor<T>   *d_tile_status,
    int                     num_tiles)
{
    enum
    {
        TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS,
    };

    const bool in_first_block = (blockIdx.x == 0);

    // Reset queue descriptor
    if (in_first_block && (threadIdx.x == 0))
    {
        grid_queue.ResetDrain(num_tiles);
    }

    // Initialize tile status: not-yet-set
    const int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tile_idx < num_tiles)
    {
        d_tile_status[TILE_STATUS_PADDING + tile_idx].status = SCAN_TILE_INVALID;
    }

    // Leading padding descriptors
    if (in_first_block && (threadIdx.x < TILE_STATUS_PADDING))
    {
        d_tile_status[threadIdx.x].status = SCAN_TILE_OOB;
    }
}
+
96 
+
97 
+
/**
 * Scan kernel entry point: builds a BlockScanTiles instance over shared
 * temporary storage and lets it consume tiles.
 *
 * The tile-status pointer handed to ConsumeTiles is advanced by
 * PtxArchProps::WARP_THREADS descriptors. The launch bound is
 * BlockScanTilesPolicy::BLOCK_THREADS.
 */
template <
    typename BlockScanTilesPolicy,
    typename InputIteratorRA,
    typename OutputIteratorRA,
    typename T,
    typename ScanOp,
    typename Identity,
    typename SizeT>
__launch_bounds__ (int(BlockScanTilesPolicy::BLOCK_THREADS))
__global__ void ScanKernel(
    InputIteratorRA         d_in,
    OutputIteratorRA        d_out,
    ScanTileDescriptor<T>   *d_tile_status,
    ScanOp                  scan_op,
    Identity                identity,
    SizeT                   num_items,
    GridQueue<int>          queue)
{
    enum
    {
        TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS,
    };

    // Thread block abstraction that scans input tiles
    typedef BlockScanTiles<
        BlockScanTilesPolicy,
        InputIteratorRA,
        OutputIteratorRA,
        ScanOp,
        Identity,
        SizeT> TileScanner;

    // Shared memory backing the tile scanner
    __shared__ typename TileScanner::TempStorage temp_storage;

    // First descriptor past the padding region
    ScanTileDescriptor<T> *d_unpadded_status = d_tile_status + TILE_STATUS_PADDING;

    // Process tiles
    TileScanner scanner(temp_storage, d_in, d_out, scan_op, identity);
    scanner.ConsumeTiles(
        num_items,
        queue,
        d_unpadded_status);
}
+
142 
+
143 
+
144 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
145 
+
146 
+
147 
+
148 /******************************************************************************
+
149  * DeviceScan
+
150  *****************************************************************************/
+
151 
+ +
174 {
+
175 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
176 
+
177  /******************************************************************************
+
178  * Constants and typedefs
+
179  ******************************************************************************/
+
180 
+
/**
 * Plain record of scan-kernel launch parameters, filled in from a
 * BlockScanTilesPolicy type.
 */
struct KernelDispachParams
{
    // Policy fields
    int                     block_threads;
    int                     items_per_thread;
    BlockLoadAlgorithm      load_policy;
    BlockStoreAlgorithm     store_policy;
    BlockScanAlgorithm      scan_algorithm;

    // Derived: block_threads * items_per_thread
    int                     tile_size;

    /**
     * Copy the policy's compile-time constants into the fields and derive tile_size.
     */
    template <typename BlockScanTilesPolicy>
    __host__ __device__ __forceinline__
    void Init()
    {
        this->block_threads     = BlockScanTilesPolicy::BLOCK_THREADS;
        this->items_per_thread  = BlockScanTilesPolicy::ITEMS_PER_THREAD;
        this->load_policy       = BlockScanTilesPolicy::LOAD_ALGORITHM;
        this->store_policy      = BlockScanTilesPolicy::STORE_ALGORITHM;
        this->scan_algorithm    = BlockScanTilesPolicy::SCAN_ALGORITHM;

        this->tile_size         = this->block_threads * this->items_per_thread;
    }

    /**
     * Print the five policy fields as a comma-separated list (no trailing newline).
     */
    __host__ __device__ __forceinline__
    void Print()
    {
        printf("%d, %d, %d, %d, %d",
            this->block_threads,
            this->items_per_thread,
            this->load_policy,
            this->store_policy,
            this->scan_algorithm);
    }

};
+
219 
+
220 
+
221  /******************************************************************************
+
222  * Tuning policies
+
223  ******************************************************************************/
+
224 
+
225 
+
/**
 * Tuned scan policies, specialized per architecture tag (350, 300, 200, 100).
 *
 * Each specialization starts from a nominal items-per-thread count for 4-byte
 * items and scales it by 4 / sizeof(T), clamped to [1, nominal].
 */
template <
    typename    T,
    typename    SizeT,
    int         ARCH>
struct TunedPolicies;

/// Architecture tag 350
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 350>
{
    enum
    {
        NOMINAL_4B_ITEMS_PER_THREAD = 16,
        ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
    };

    // ScanPolicy: GTX Titan: 29.1B items/s (232.4 GB/s) @ 48M 32-bit T
    typedef BlockScanTilesPolicy<
        128,
        ITEMS_PER_THREAD,
        BLOCK_LOAD_DIRECT,
        false,
        LOAD_LDG,
        BLOCK_STORE_WARP_TRANSPOSE,
        true,
        BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
};

/// Architecture tag 300
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 300>
{
    enum
    {
        NOMINAL_4B_ITEMS_PER_THREAD = 9,
        ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
    };

    typedef BlockScanTilesPolicy<
        256,
        ITEMS_PER_THREAD,
        BLOCK_LOAD_WARP_TRANSPOSE,
        false,
        LOAD_DEFAULT,
        BLOCK_STORE_WARP_TRANSPOSE,
        false,
        BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
};

/// Architecture tag 200
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 200>
{
    enum
    {
        NOMINAL_4B_ITEMS_PER_THREAD = 15,
        ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
    };

    // ScanPolicy: GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
    typedef BlockScanTilesPolicy<
        128,
        ITEMS_PER_THREAD,
        BLOCK_LOAD_WARP_TRANSPOSE,
        false,
        LOAD_DEFAULT,
        BLOCK_STORE_WARP_TRANSPOSE,
        false,
        BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
};

/// Architecture tag 100
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 100>
{
    enum
    {
        NOMINAL_4B_ITEMS_PER_THREAD = 7,
        ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
    };

    typedef BlockScanTilesPolicy<
        128,
        ITEMS_PER_THREAD,
        BLOCK_LOAD_TRANSPOSE,
        false,
        LOAD_DEFAULT,
        BLOCK_STORE_TRANSPOSE,
        false,
        BLOCK_SCAN_RAKING> ScanPolicy;
};
+
281 
+
282 
+
/**
 * Selects the tuned scan policy set for the current PTX compiler pass, and
 * offers a run-time lookup keyed on a PTX version number.
 */
template <typename T, typename SizeT>
struct PtxDefaultPolicies
{
    // Architecture tag for this compiler pass: highest of 350/300/200 not above CUB_PTX_ARCH, else 100
    static const int PTX_TUNE_ARCH =
        (CUB_PTX_ARCH >= 350) ? 350 :
        (CUB_PTX_ARCH >= 300) ? 300 :
        (CUB_PTX_ARCH >= 200) ? 200 :
                                100;

    // Tuned policy set for the current PTX compiler pass
    typedef TunedPolicies<T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;

    // ScanPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
    struct ScanPolicy : PtxTunedPolicies::ScanPolicy {};

    /**
     * Initialize scan_dispatch_params from the tuned policy set matching
     * ptx_version (thresholds 350, 300, 200; anything lower uses 100).
     */
    static void InitDispatchParams(int ptx_version, KernelDispachParams &scan_dispatch_params)
    {
        if (ptx_version >= 350)
        {
            typedef TunedPolicies<T, SizeT, 350> Policies350;
            scan_dispatch_params.Init<typename Policies350::ScanPolicy>();
            return;
        }

        if (ptx_version >= 300)
        {
            typedef TunedPolicies<T, SizeT, 300> Policies300;
            scan_dispatch_params.Init<typename Policies300::ScanPolicy>();
            return;
        }

        if (ptx_version >= 200)
        {
            typedef TunedPolicies<T, SizeT, 200> Policies200;
            scan_dispatch_params.Init<typename Policies200::ScanPolicy>();
            return;
        }

        // Fallback: lowest tuned policy set
        typedef TunedPolicies<T, SizeT, 100> Policies100;
        scan_dispatch_params.Init<typename Policies100::ScanPolicy>();
    }
};
+
328 
+
329 
+
330  /******************************************************************************
+
331  * Utility methods
+
332  ******************************************************************************/
+
333 
+
/**
 * Internal dispatch routine for the scan: launches init_kernel to reset the
 * tile descriptors and queue, then scan_kernel to perform the scan.
 *
 * When d_temp_storage is NULL no work is done; the required allocation size
 * is written to temp_storage_bytes and cudaSuccess is returned. An empty
 * problem (num_items == 0) launches nothing and returns cudaSuccess.
 *
 * ptx_version            PTX version; below 200 one block is assigned per tile
 * d_temp_storage         device scratch allocation, or NULL to query its size
 * temp_storage_bytes     [in,out] size in bytes of the scratch allocation
 * init_kernel            kernel that initializes tile status and the queue
 * scan_kernel            kernel that performs the scan
 * scan_dispatch_params   launch parameters for scan_kernel
 * d_in / d_out           input and output iterators
 * scan_op                binary scan functor
 * identity               identity value passed through to scan_kernel
 * num_items              number of input items
 * stream                 stream both kernels are launched into
 * stream_synchronous     when true, log each launch and sync the stream after it
 *
 * Returns the first CUDA error encountered, or cudaSuccess.
 */
template <
    typename ScanInitKernelPtr,
    typename ScanKernelPtr,
    typename InputIteratorRA,
    typename OutputIteratorRA,
    typename ScanOp,
    typename Identity,
    typename SizeT>
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
    int                     ptx_version,
    void                    *d_temp_storage,
    size_t                  &temp_storage_bytes,
    ScanInitKernelPtr       init_kernel,
    ScanKernelPtr           scan_kernel,
    KernelDispachParams     &scan_dispatch_params,
    InputIteratorRA         d_in,
    OutputIteratorRA        d_out,
    ScanOp                  scan_op,
    Identity                identity,
    SizeT                   num_items,
    cudaStream_t            stream = 0,
    bool                    stream_synchronous = false)
{

#ifndef CUB_RUNTIME_ENABLED

    // Kernel launch not supported from this device
    return CubDebug(cudaErrorNotSupported);

#else

    enum
    {
        TILE_STATUS_PADDING     = 32,
        INIT_KERNEL_THREADS     = 128
    };

    // Data type
    typedef typename std::iterator_traits<InputIteratorRA>::value_type T;

    // Tile status descriptor type
    typedef ScanTileDescriptor<T> ScanTileDescriptorT;

    cudaError error = cudaSuccess;
    do
    {
        // Number of input tiles
        int num_tiles = (num_items + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;

        // Temporary storage allocation requirements
        void* allocations[2];
        size_t allocation_sizes[2] =
        {
            (num_tiles + TILE_STATUS_PADDING) * sizeof(ScanTileDescriptorT),    // bytes needed for tile status descriptors
            GridQueue<int>::AllocationSize()                                    // bytes needed for grid queue descriptor
        };

        // Alias temporaries (or set the necessary size of the storage allocation)
        if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;

        // Return if the caller is simply requesting the size of the storage allocation
        if (d_temp_storage == NULL)
            return cudaSuccess;

        // Nothing to scan: a zero-tile problem would otherwise request a
        // zero-block grid, which is an invalid launch configuration
        if (num_items == 0)
            break;

        // Global list of tile status
        ScanTileDescriptorT *d_tile_status = (ScanTileDescriptorT*) allocations[0];

        // Grid queue descriptor
        GridQueue<int> queue(allocations[1]);

        // Log init_kernel configuration
        int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
        if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

        // Invoke init_kernel to initialize tile descriptors and queue descriptors
        init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
            queue,
            d_tile_status,
            num_tiles);

        // Surface launch-configuration errors
        if (CubDebug(error = cudaPeekAtLastError())) break;

        // Sync the stream if specified
        if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;

        // Get grid size for multi-block kernel
        int scan_grid_size;
        int multi_sm_occupancy = -1;
        if (ptx_version < 200)
        {
            // We don't have atomics (or don't have fast ones), so just assign one
            // block per tile (limited to 65K tiles)
            scan_grid_size = num_tiles;
        }
        else
        {
            // We have atomics and can thus reuse blocks across multiple tiles using a queue descriptor.
            // Get GPU id
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Get a rough estimate of scan_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
            // NOTE(review): the first CUB_MIN operand is missing from this listing (the line was
            // dropped during extraction); MAX_SM_THREADBLOCKS is assumed -- confirm against device_scan.cuh
            multi_sm_occupancy = CUB_MIN(
                ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS,
                ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / scan_dispatch_params.block_threads);

#ifndef __CUDA_ARCH__
            // We're on the host, so come up with a more accurate estimate of scan_kernel SM occupancy from actual device properties
            Device device_props;
            if (CubDebug(error = device_props.Init(device_ordinal))) break;

            if (CubDebug(error = device_props.MaxSmOccupancy(
                multi_sm_occupancy,
                scan_kernel,
                scan_dispatch_params.block_threads))) break;
#endif
            // Get device occupancy for scan_kernel
            int scan_occupancy = multi_sm_occupancy * sm_count;

            // Get grid size for scan_kernel
            scan_grid_size = (num_tiles < scan_occupancy) ?
                num_tiles :                 // Not enough to fill the device with threadblocks
                scan_occupancy;             // Fill the device with threadblocks
        }

        // Log scan_kernel configuration
        if (stream_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
            scan_grid_size, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread, multi_sm_occupancy);

        // Invoke scan_kernel
        scan_kernel<<<scan_grid_size, scan_dispatch_params.block_threads, 0, stream>>>(
            d_in,
            d_out,
            d_tile_status,
            scan_op,
            identity,
            num_items,
            queue);

        // Surface launch-configuration errors
        if (CubDebug(error = cudaPeekAtLastError())) break;

        // Sync the stream if specified
        if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
    }
    while (0);

    return error;

#endif  // CUB_RUNTIME_ENABLED
}
+
488 
+
489 
+
490 
+
/**
 * Internal scan dispatch routine using the default tuning policies.
 *
 * On the host the launch parameters are chosen from the PTX version reported
 * by PtxVersion(); in device code they come from the policies of the current
 * compiler pass and CUB_PTX_ARCH is used as the PTX version. The work is then
 * delegated to the kernel-pointer overload of Dispatch().
 *
 * Returns the first CUDA error encountered (including any error reported by
 * the delegated Dispatch call), or cudaSuccess.
 */
template <
    typename InputIteratorRA,
    typename OutputIteratorRA,
    typename ScanOp,
    typename Identity,
    typename SizeT>
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
    void            *d_temp_storage,
    size_t          &temp_storage_bytes,
    InputIteratorRA d_in,
    OutputIteratorRA d_out,
    ScanOp          scan_op,
    Identity        identity,
    SizeT           num_items,
    cudaStream_t    stream = 0,
    bool            stream_synchronous = false)
{
    // Data type
    typedef typename std::iterator_traits<InputIteratorRA>::value_type T;

    // Tuning policies
    typedef PtxDefaultPolicies<T, SizeT>                    PtxDefaultPolicies; // Wrapper of default kernel policies
    typedef typename PtxDefaultPolicies::ScanPolicy         ScanPolicy;         // Scan kernel policy

    cudaError error = cudaSuccess;
    do
    {
        // Declare dispatch parameters
        KernelDispachParams scan_dispatch_params;

        int ptx_version;
#ifdef __CUDA_ARCH__
        // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly
        scan_dispatch_params.Init<ScanPolicy>();
        ptx_version = CUB_PTX_ARCH;
#else
        // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version
        if (CubDebug(error = PtxVersion(ptx_version))) break;
        PtxDefaultPolicies::InitDispatchParams(ptx_version, scan_dispatch_params);
#endif

        // Capture the delegated call's status; previously the return value was
        // discarded, so this function reported cudaSuccess even on failure
        if (CubDebug(error = Dispatch(
            ptx_version,
            d_temp_storage,
            temp_storage_bytes,
            ScanInitKernel<T, SizeT>,
            ScanKernel<ScanPolicy, InputIteratorRA, OutputIteratorRA, T, ScanOp, Identity, SizeT>,
            scan_dispatch_params,
            d_in,
            d_out,
            scan_op,
            identity,
            num_items,
            stream,
            stream_synchronous))) break;
    }
    while (0);

    return error;
}
+
557 
+
558  #endif // DOXYGEN_SHOULD_SKIP_THIS
+
559 
+
560 
+
561  /******************************************************************/
+
565 
+
/**
 * Exclusive prefix sum: forwards to Dispatch() with a Sum functor as the scan
 * operator and a value-initialized element of the input's value type as the
 * identity.
 */
template <
    typename InputIteratorRA,
    typename OutputIteratorRA>
__host__ __device__ __forceinline__
static cudaError_t ExclusiveSum(
    void            *d_temp_storage,
    size_t          &temp_storage_bytes,
    InputIteratorRA d_in,
    OutputIteratorRA d_out,
    int             num_items,
    cudaStream_t    stream = 0,
    bool            stream_synchronous = false)
{
    // Element type produced by the input iterator
    typedef typename std::iterator_traits<InputIteratorRA>::value_type ValueT;

    Sum     sum_op;
    ValueT  zero = ValueT();

    return Dispatch(
        d_temp_storage,
        temp_storage_bytes,
        d_in,
        d_out,
        sum_op,
        zero,
        num_items,
        stream,
        stream_synchronous);
}
+
620 
+
621 
+
/**
 * Exclusive scan with a caller-supplied operator and identity value;
 * a straight pass-through to Dispatch().
 */
template <
    typename InputIteratorRA,
    typename OutputIteratorRA,
    typename ScanOp,
    typename Identity>
__host__ __device__ __forceinline__
static cudaError_t ExclusiveScan(
    void            *d_temp_storage,
    size_t          &temp_storage_bytes,
    InputIteratorRA d_in,
    OutputIteratorRA d_out,
    ScanOp          scan_op,
    Identity        identity,
    int             num_items,
    cudaStream_t    stream = 0,
    bool            stream_synchronous = false)
{
    return Dispatch(
        d_temp_storage,
        temp_storage_bytes,
        d_in,
        d_out,
        scan_op,
        identity,
        num_items,
        stream,
        stream_synchronous);
}
+
684 
+
685 
+
687  /******************************************************************/
+
691 
+
692 
+
/**
 * Inclusive prefix sum: forwards to Dispatch() with a Sum functor as the scan
 * operator and a NullType instance in the identity slot.
 */
template <
    typename InputIteratorRA,
    typename OutputIteratorRA>
__host__ __device__ __forceinline__
static cudaError_t InclusiveSum(
    void            *d_temp_storage,
    size_t          &temp_storage_bytes,
    InputIteratorRA d_in,
    OutputIteratorRA d_out,
    int             num_items,
    cudaStream_t    stream = 0,
    bool            stream_synchronous = false)
{
    Sum         sum_op;
    NullType    no_identity = NullType();

    return Dispatch(
        d_temp_storage,
        temp_storage_bytes,
        d_in,
        d_out,
        sum_op,
        no_identity,
        num_items,
        stream,
        stream_synchronous);
}
+
745 
+
746 
+
/**
 * Inclusive scan with a caller-supplied operator: forwards to Dispatch()
 * with a NullType instance in the identity slot.
 */
template <
    typename InputIteratorRA,
    typename OutputIteratorRA,
    typename ScanOp>
__host__ __device__ __forceinline__
static cudaError_t InclusiveScan(
    void            *d_temp_storage,
    size_t          &temp_storage_bytes,
    InputIteratorRA d_in,
    OutputIteratorRA d_out,
    ScanOp          scan_op,
    int             num_items,
    cudaStream_t    stream = 0,
    bool            stream_synchronous = false)
{
    NullType no_identity = NullType();

    return Dispatch(
        d_temp_storage,
        temp_storage_bytes,
        d_in,
        d_out,
        scan_op,
        no_identity,
        num_items,
        stream,
        stream_synchronous);
}
+
805 
+
806 };
+
807 
+
808 
+
809 } // CUB namespace
+
810 CUB_NS_POSTFIX // Optional outer namespace(s)
+
811 
+
812 
+
+ + + + + diff --git a/docs/html/functions_0x62.html b/docs/html/functions_0x62.html new file mode 100644 index 0000000000..8d0f8f4431 --- /dev/null +++ b/docs/html/functions_0x62.html @@ -0,0 +1,175 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- b -

+
+ + + + + diff --git a/docs/html/functions_0x63.html b/docs/html/functions_0x63.html new file mode 100644 index 0000000000..8af143b62e --- /dev/null +++ b/docs/html/functions_0x63.html @@ -0,0 +1,159 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- c -

+
+ + + + + diff --git a/docs/html/functions_0x64.html b/docs/html/functions_0x64.html new file mode 100644 index 0000000000..31c4371932 --- /dev/null +++ b/docs/html/functions_0x64.html @@ -0,0 +1,156 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- d -

+
+ + + + + diff --git a/docs/html/functions_0x65.html b/docs/html/functions_0x65.html new file mode 100644 index 0000000000..e95a76fa06 --- /dev/null +++ b/docs/html/functions_0x65.html @@ -0,0 +1,158 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- e -

+
+ + + + + diff --git a/docs/html/functions_0x66.html b/docs/html/functions_0x66.html new file mode 100644 index 0000000000..bb95653b51 --- /dev/null +++ b/docs/html/functions_0x66.html @@ -0,0 +1,150 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- f -

+
+ + + + + diff --git a/docs/html/functions_0x68.html b/docs/html/functions_0x68.html new file mode 100644 index 0000000000..9b9bd565da --- /dev/null +++ b/docs/html/functions_0x68.html @@ -0,0 +1,153 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- h -

+
+ + + + + diff --git a/docs/html/functions_0x69.html b/docs/html/functions_0x69.html new file mode 100644 index 0000000000..6f53445b1d --- /dev/null +++ b/docs/html/functions_0x69.html @@ -0,0 +1,162 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- i -

+
+ + + + + diff --git a/docs/html/functions_0x6c.html b/docs/html/functions_0x6c.html new file mode 100644 index 0000000000..b2100fb3ac --- /dev/null +++ b/docs/html/functions_0x6c.html @@ -0,0 +1,147 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- l -

+
+ + + + + diff --git a/docs/html/functions_0x6d.html b/docs/html/functions_0x6d.html new file mode 100644 index 0000000000..3b20f3b6c9 --- /dev/null +++ b/docs/html/functions_0x6d.html @@ -0,0 +1,180 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- m -

+
+ + + + + diff --git a/docs/html/functions_0x6f.html b/docs/html/functions_0x6f.html new file mode 100644 index 0000000000..e76fd4018c --- /dev/null +++ b/docs/html/functions_0x6f.html @@ -0,0 +1,149 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- o -

+
+ + + + + diff --git a/docs/html/functions_0x70.html b/docs/html/functions_0x70.html new file mode 100644 index 0000000000..0d161ccb01 --- /dev/null +++ b/docs/html/functions_0x70.html @@ -0,0 +1,144 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- p -

+
+ + + + + diff --git a/docs/html/functions_0x72.html b/docs/html/functions_0x72.html new file mode 100644 index 0000000000..8e073ec5a8 --- /dev/null +++ b/docs/html/functions_0x72.html @@ -0,0 +1,160 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- r -

+
+ + + + + diff --git a/docs/html/functions_0x73.html b/docs/html/functions_0x73.html new file mode 100644 index 0000000000..bfaab62689 --- /dev/null +++ b/docs/html/functions_0x73.html @@ -0,0 +1,221 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- s -

+
+ + + + + diff --git a/docs/html/functions_0x74.html b/docs/html/functions_0x74.html new file mode 100644 index 0000000000..1eb890dbe1 --- /dev/null +++ b/docs/html/functions_0x74.html @@ -0,0 +1,165 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- t -

+
+ + + + + diff --git a/docs/html/functions_0x75.html b/docs/html/functions_0x75.html new file mode 100644 index 0000000000..da529e9e23 --- /dev/null +++ b/docs/html/functions_0x75.html @@ -0,0 +1,145 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- u -

+
+ + + + + diff --git a/docs/html/functions_0x77.html b/docs/html/functions_0x77.html new file mode 100644 index 0000000000..0fa68a4662 --- /dev/null +++ b/docs/html/functions_0x77.html @@ -0,0 +1,162 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- w -

+
+ + + + + diff --git a/docs/html/functions_0x7e.html b/docs/html/functions_0x7e.html new file mode 100644 index 0000000000..91f0ed6c0e --- /dev/null +++ b/docs/html/functions_0x7e.html @@ -0,0 +1,144 @@ + + + + + + + +CUB: Class Members + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + + +
+ +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ +
+
Here is a list of all documented class members with links to the class documentation for each member:
+ +

- ~ -

+
+ + + + + diff --git a/docs/html/structcub_1_1_inequality-members.html b/docs/html/structcub_1_1_inequality-members.html new file mode 100644 index 0000000000..04e607d77b --- /dev/null +++ b/docs/html/structcub_1_1_inequality-members.html @@ -0,0 +1,118 @@ + + + + + + + +CUB: Member List + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
cub::Inequality Member List
+
+
+ +

This is the complete list of members for cub::Inequality, including all inherited members.

+ + +
operator()(const T &a, const T &b)cub::Inequalityinline
+ + + + + diff --git a/docs/html/structcub_1_1_inequality.html b/docs/html/structcub_1_1_inequality.html new file mode 100644 index 0000000000..2b65c4c0c4 --- /dev/null +++ b/docs/html/structcub_1_1_inequality.html @@ -0,0 +1,134 @@ + + + + + + + +CUB: cub::Inequality Struct Reference + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+Public Methods | +List of all members
+
+
cub::Inequality Struct Reference
+
+
+

Detailed description

+

Default inequality functor.

+ +

Definition at line 72 of file thread_operators.cuh.

+
+ + + + + +

+Public Methods

+template<typename T >
__host__ __device__
+__forceinline__ bool 
operator() (const T &a, const T &b)
 Boolean inequality operator, returns (a != b)
 
+
The documentation for this struct was generated from the following file: +
+ + + + + diff --git a/docs/html/structcub_1_1_min-members.html b/docs/html/structcub_1_1_min-members.html new file mode 100644 index 0000000000..e4c6bf3838 --- /dev/null +++ b/docs/html/structcub_1_1_min-members.html @@ -0,0 +1,118 @@ + + + + + + + +CUB: Member List + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
cub::Min Member List
+
+
+ +

This is the complete list of members for cub::Min, including all inherited members.

+ + +
operator()(const T &a, const T &b)cub::Mininline
+ + + + + diff --git a/docs/html/structcub_1_1_min.html b/docs/html/structcub_1_1_min.html new file mode 100644 index 0000000000..9dc36d9423 --- /dev/null +++ b/docs/html/structcub_1_1_min.html @@ -0,0 +1,134 @@ + + + + + + + +CUB: cub::Min Struct Reference + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+Public Methods | +List of all members
+
+
cub::Min Struct Reference
+
+
+

Detailed description

+

Default min functor.

+ +

Definition at line 114 of file thread_operators.cuh.

+
+ + + + + +

+Public Methods

+template<typename T >
__host__ __device__
+__forceinline__ T 
operator() (const T &a, const T &b)
 Boolean min operator, returns (a < b) ? a : b
 
+
The documentation for this struct was generated from the following file: +
+ + + + + diff --git a/docs/html/thread__load_8cuh_source.html b/docs/html/thread__load_8cuh_source.html new file mode 100644 index 0000000000..686d16eb79 --- /dev/null +++ b/docs/html/thread__load_8cuh_source.html @@ -0,0 +1,444 @@ + + + + + + + +CUB: thread_load.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
thread_load.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include <cuda.h>
+
37 
+
38 #include <iterator>
+
39 
+
40 #include "../util_ptx.cuh"
+
41 #include "../util_type.cuh"
+
42 #include "../util_namespace.cuh"
+
43 
+
45 CUB_NS_PREFIX
+
46 
+
48 namespace cub {
+
49 
+
55 //-----------------------------------------------------------------------------
+
56 // Tags and constants
+
57 //-----------------------------------------------------------------------------
+
58 
+ +
63 {
+ + + + + + + +
71 };
+
72 
+
73 
+
109 template <
+
110  PtxLoadModifier MODIFIER,
+
111  typename InputIteratorRA>
+
112 __device__ __forceinline__ typename std::iterator_traits<InputIteratorRA>::value_type ThreadLoad(InputIteratorRA itr);
+
113 
+
114 
+
116 
+
117 
+
118 
+
119 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
120 
+
121 
+
125 #define CUB_LOAD_16(cub_modifier, ptx_modifier) \
+
126  template<> \
+
127  __device__ __forceinline__ int4 ThreadLoad<cub_modifier, int4*>(int4* ptr) \
+
128  { \
+
129  int4 retval; \
+
130  asm volatile ("ld."#ptx_modifier".v4.s32 {%0, %1, %2, %3}, [%4];" : \
+
131  "=r"(retval.x), \
+
132  "=r"(retval.y), \
+
133  "=r"(retval.z), \
+
134  "=r"(retval.w) : \
+
135  _CUB_ASM_PTR_(ptr)); \
+
136  return retval; \
+
137  } \
+
138  template<> \
+
139  __device__ __forceinline__ longlong2 ThreadLoad<cub_modifier, longlong2*>(longlong2* ptr) \
+
140  { \
+
141  longlong2 retval; \
+
142  asm volatile ("ld."#ptx_modifier".v2.s64 {%0, %1}, [%2];" : \
+
143  "=l"(retval.x), \
+
144  "=l"(retval.y) : \
+
145  _CUB_ASM_PTR_(ptr)); \
+
146  return retval; \
+
147  }
+
148 
+
152 #define CUB_LOAD_8(cub_modifier, ptx_modifier) \
+
153  template<> \
+
154  __device__ __forceinline__ short4 ThreadLoad<cub_modifier, short4*>(short4* ptr) \
+
155  { \
+
156  short4 retval; \
+
157  asm volatile ("ld."#ptx_modifier".v4.s16 {%0, %1, %2, %3}, [%4];" : \
+
158  "=h"(retval.x), \
+
159  "=h"(retval.y), \
+
160  "=h"(retval.z), \
+
161  "=h"(retval.w) : \
+
162  _CUB_ASM_PTR_(ptr)); \
+
163  return retval; \
+
164  } \
+
165  template<> \
+
166  __device__ __forceinline__ int2 ThreadLoad<cub_modifier, int2*>(int2* ptr) \
+
167  { \
+
168  int2 retval; \
+
169  asm volatile ("ld."#ptx_modifier".v2.s32 {%0, %1}, [%2];" : \
+
170  "=r"(retval.x), \
+
171  "=r"(retval.y) : \
+
172  _CUB_ASM_PTR_(ptr)); \
+
173  return retval; \
+
174  } \
+
175  template<> \
+
176  __device__ __forceinline__ long long ThreadLoad<cub_modifier, long long*>(long long* ptr) \
+
177  { \
+
178  long long retval; \
+
179  asm volatile ("ld."#ptx_modifier".s64 %0, [%1];" : \
+
180  "=l"(retval) : \
+
181  _CUB_ASM_PTR_(ptr)); \
+
182  return retval; \
+
183  }
+
184 
+
188 #define CUB_LOAD_4(cub_modifier, ptx_modifier) \
+
189  template<> \
+
190  __device__ __forceinline__ int ThreadLoad<cub_modifier, int*>(int* ptr) \
+
191  { \
+
192  int retval; \
+
193  asm volatile ("ld."#ptx_modifier".s32 %0, [%1];" : \
+
194  "=r"(retval) : \
+
195  _CUB_ASM_PTR_(ptr)); \
+
196  return retval; \
+
197  }
+
198 
+
199 
+
203 #define CUB_LOAD_2(cub_modifier, ptx_modifier) \
+
204  template<> \
+
205  __device__ __forceinline__ short ThreadLoad<cub_modifier, short*>(short* ptr) \
+
206  { \
+
207  short retval; \
+
208  asm volatile ("ld."#ptx_modifier".s16 %0, [%1];" : \
+
209  "=h"(retval) : \
+
210  _CUB_ASM_PTR_(ptr)); \
+
211  return retval; \
+
212  }
+
213 
+
214 
+
218 #define CUB_LOAD_1(cub_modifier, ptx_modifier) \
+
219  template<> \
+
220  __device__ __forceinline__ char ThreadLoad<cub_modifier, char*>(char* ptr) \
+
221  { \
+
222  short retval; \
+
223  asm volatile ( \
+
224  "{" \
+
225  " .reg .s8 datum;" \
+
226  " ld."#ptx_modifier".s8 datum, [%1];" \
+
227  " cvt.s16.s8 %0, datum;" \
+
228  "}" : \
+
229  "=h"(retval) : \
+
230  _CUB_ASM_PTR_(ptr)); \
+
231  return (char) retval; \
+
232  }
+
233 
+
234 
+
238 #define CUB_LOAD_ALL(cub_modifier, ptx_modifier) \
+
239  CUB_LOAD_16(cub_modifier, ptx_modifier) \
+
240  CUB_LOAD_8(cub_modifier, ptx_modifier) \
+
241  CUB_LOAD_4(cub_modifier, ptx_modifier) \
+
242  CUB_LOAD_2(cub_modifier, ptx_modifier) \
+
243  CUB_LOAD_1(cub_modifier, ptx_modifier) \
+
244 
+
245 
+
249 #if CUB_PTX_ARCH >= 200
+
250  CUB_LOAD_ALL(LOAD_CA, ca)
+
251  CUB_LOAD_ALL(LOAD_CG, cg)
+
252  CUB_LOAD_ALL(LOAD_CS, cs)
+
253  CUB_LOAD_ALL(LOAD_CV, cv)
+
254 #else
+
255  // LOAD_CV on SM10-13 uses "volatile.global" to ensure reads from last level
+
256  CUB_LOAD_ALL(LOAD_CV, volatile.global)
+
257 #endif
+
258 #if CUB_PTX_ARCH >= 350
+
259  CUB_LOAD_ALL(LOAD_LDG, global.nc)
+
260 #endif
+
261 
+
262 
+
264 template <PtxLoadModifier MODIFIER, int COUNT, int MAX>
+
265 struct IterateThreadLoad
+
266 {
+
267  template <typename T>
+
268  static __device__ __forceinline__ void Load(T *ptr, T *vals)
+
269  {
+
270  vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
+
271  IterateThreadLoad<MODIFIER, COUNT + 1, MAX>::Load(ptr, vals);
+
272  }
+
273 };
+
274 
+
276 template <PtxLoadModifier MODIFIER, int MAX>
+
277 struct IterateThreadLoad<MODIFIER, MAX, MAX>
+
278 {
+
279  template <typename T>
+
280  static __device__ __forceinline__ void Load(T *ptr, T *vals) {}
+
281 };
+
282 
+
283 
+
284 
+
288 template <typename InputIteratorRA>
+
289 __device__ __forceinline__ typename std::iterator_traits<InputIteratorRA>::value_type ThreadLoad(
+
290  InputIteratorRA itr,
+
291  Int2Type<LOAD_DEFAULT> modifier,
+
292  Int2Type<false> is_pointer)
+
293 {
+
294  return *itr;
+
295 }
+
296 
+
297 
+
301 template <typename T>
+
302 __device__ __forceinline__ T ThreadLoad(
+
303  T *ptr,
+
304  Int2Type<LOAD_DEFAULT> modifier,
+
305  Int2Type<true> is_pointer)
+
306 {
+
307  return *ptr;
+
308 }
+
309 
+
310 
+
314 template <typename T>
+
315 __device__ __forceinline__ T ThreadLoadVolatile(
+
316  T *ptr,
+
317  Int2Type<true> is_primitive)
+
318 {
+
319  T retval = *reinterpret_cast<volatile T*>(ptr);
+
320 
+
321 #if (CUB_PTX_ARCH <= 130)
+
322  if (sizeof(T) == 1) __threadfence_block();
+
323 #endif
+
324 
+
325  return retval;
+
326 }
+
327 
+
328 
+
332 template <typename T>
+
333 __device__ __forceinline__ T ThreadLoadVolatile(
+
334  T *ptr,
+
335  Int2Type<false> is_primitive)
+
336 {
+
337  typedef typename WordAlignment<T>::VolatileWord VolatileWord; // Word type for memcopying
+
338  enum { NUM_WORDS = sizeof(T) / sizeof(VolatileWord) };
+
339 
+
340  // Memcopy from aliased source into array of uninitialized words
+
341  typename WordAlignment<T>::UninitializedVolatileWords words;
+
342 
+
343  #pragma unroll
+
344  for (int i = 0; i < NUM_WORDS; ++i)
+
345  words.buf[i] = reinterpret_cast<volatile VolatileWord*>(ptr)[i];
+
346 
+
347  // Load from words
+
348  return *reinterpret_cast<T*>(words.buf);
+
349 }
+
350 
+
351 
+
355 template <typename T>
+
356 __device__ __forceinline__ T ThreadLoad(
+
357  T *ptr,
+
358  Int2Type<LOAD_VOLATILE> modifier,
+
359  Int2Type<true> is_pointer)
+
360 {
+
361  return ThreadLoadVolatile(ptr, Int2Type<Traits<T>::PRIMITIVE>());
+
362 }
+
363 
+
364 
+
365 #if (CUB_PTX_ARCH <= 130)
+
366 
+
370 template <typename T>
+
371 __device__ __forceinline__ T ThreadLoad(
+
372  T *ptr,
+
373  Int2Type<LOAD_CG> modifier,
+
374  Int2Type<true> is_pointer)
+
375 {
+
376  return ThreadLoad<LOAD_CV>(ptr);
+
377 }
+
378 
+
379 #endif // (CUB_PTX_ARCH <= 130)
+
380 
+
381 
+
385 template <typename T, int MODIFIER>
+
386 __device__ __forceinline__ T ThreadLoad(
+
387  T *ptr,
+
388  Int2Type<MODIFIER> modifier,
+
389  Int2Type<true> is_pointer)
+
390 {
+
391  typedef typename WordAlignment<T>::DeviceWord DeviceWord;
+
392  enum { NUM_WORDS = sizeof(T) / sizeof(DeviceWord) };
+
393 
+
394  // Memcopy from aliased source into array of uninitialized words
+
395  typename WordAlignment<T>::UninitializedDeviceWords words;
+
396 
+
397  IterateThreadLoad<PtxLoadModifier(MODIFIER), 0, NUM_WORDS>::Load(
+
398  reinterpret_cast<DeviceWord*>(ptr),
+
399  words.buf);
+
400 
+
401  // Load from words
+
402  return *reinterpret_cast<T*>(words.buf);
+
403 }
+
404 
+
405 
+
409 template <
+
410  PtxLoadModifier MODIFIER,
+
411  typename InputIteratorRA>
+
412 __device__ __forceinline__ typename std::iterator_traits<InputIteratorRA>::value_type ThreadLoad(InputIteratorRA itr)
+
413 {
+
414  return ThreadLoad(
+
415  itr,
+
416  Int2Type<MODIFIER>(),
+
417  Int2Type<IsPointer<InputIteratorRA>::VALUE>());
+
418 }
+
419 
+
420 
+
421 
+
422 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
423 
+
424  // end group IoModule
+
426 
+
427 
+
428 } // CUB namespace
+
429 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + + diff --git a/docs/html/thread__operators_8cuh_source.html b/docs/html/thread__operators_8cuh_source.html new file mode 100644 index 0000000000..e8bf57f92c --- /dev/null +++ b/docs/html/thread__operators_8cuh_source.html @@ -0,0 +1,222 @@ + + + + + + + +CUB: thread_operators.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
thread_operators.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 /******************************************************************************
+
35  * Simple functor operators
+
36  ******************************************************************************/
+
37 
+
38 #pragma once
+
39 
+
40 #include "../util_macro.cuh"
+
41 #include "../util_namespace.cuh"
+
42 
+
44 CUB_NS_PREFIX
+
45 
+
47 namespace cub {
+
48 
+
49 
+
58 struct Equality
+
59 {
+
61  template <typename T>
+
62  __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b)
+
63  {
+
64  return a == b;
+
65  }
+
66 };
+
67 
+
68 
+
72 struct Inequality
+
73 {
+
75  template <typename T>
+
76  __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b)
+
77  {
+
78  return a != b;
+
79  }
+
80 };
+
81 
+
82 
+
86 struct Sum
+
87 {
+
89  template <typename T>
+
90  __host__ __device__ __forceinline__ T operator()(const T &a, const T &b)
+
91  {
+
92  return a + b;
+
93  }
+
94 };
+
95 
+
96 
+
100 struct Max
+
101 {
+
103  template <typename T>
+
104  __host__ __device__ __forceinline__ T operator()(const T &a, const T &b)
+
105  {
+
106  return CUB_MAX(a, b);
+
107  }
+
108 };
+
109 
+
110 
+
114 struct Min
+
115 {
+
117  template <typename T>
+
118  __host__ __device__ __forceinline__ T operator()(const T &a, const T &b)
+
119  {
+
120  return CUB_MIN(a, b);
+
121  }
+
122 };
+
123 
+
124 
+
128 template <typename B>
+
129 struct Cast
+
130 {
+
132  template <typename A>
+
133  __host__ __device__ __forceinline__ B operator()(const A &a)
+
134  {
+
135  return (B) a;
+
136  }
+
137 };
+
138 
+
139 
+
140  // end group ThreadModule
+
142 
+
143 
+
144 } // CUB namespace
+
145 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + + diff --git a/docs/html/thread__reduce_8cuh_source.html b/docs/html/thread__reduce_8cuh_source.html new file mode 100644 index 0000000000..880c37d5b5 --- /dev/null +++ b/docs/html/thread__reduce_8cuh_source.html @@ -0,0 +1,212 @@ + + + + + + + +CUB: thread_reduce.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
thread_reduce.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include "../thread/thread_operators.cuh"
+
37 #include "../util_namespace.cuh"
+
38 
+
40 CUB_NS_PREFIX
+
41 
+
43 namespace cub {
+
44 
+
62 template <
+
63  int LENGTH,
+
64  typename T,
+
65  typename ReductionOp>
+
66 __device__ __forceinline__ T ThreadReduce(
+
67  T* input,
+
68  ReductionOp reduction_op,
+
69  T prefix)
+
70 {
+
71  #pragma unroll
+
72  for (int i = 0; i < LENGTH; ++i)
+
73  {
+
74  prefix = reduction_op(prefix, input[i]);
+
75  }
+
76 
+
77  return prefix;
+
78 }
+
79 
+
80 
+
88 template <
+
89  int LENGTH,
+
90  typename T,
+
91  typename ReductionOp>
+
92 __device__ __forceinline__ T ThreadReduce(
+
93  T* input,
+
94  ReductionOp reduction_op)
+
95 {
+
96  T prefix = input[0];
+
97  return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix);
+
98 }
+
99 
+
100 
+
108 template <
+
109  int LENGTH,
+
110  typename T,
+
111  typename ReductionOp>
+
112 __device__ __forceinline__ T ThreadReduce(
+
113  T (&input)[LENGTH],
+
114  ReductionOp reduction_op,
+
115  T prefix)
+
116 {
+
117  return ThreadReduce<LENGTH>(input, reduction_op, prefix);
+
118 }
+
119 
+
120 
+
128 template <
+
129  int LENGTH,
+
130  typename T,
+
131  typename ReductionOp>
+
132 __device__ __forceinline__ T ThreadReduce(
+
133  T (&input)[LENGTH],
+
134  ReductionOp reduction_op)
+
135 {
+
136  return ThreadReduce<LENGTH>((T*) input, reduction_op);
+
137 }
+
138 
+
139 
+
141  // end group ThreadModule
+
143 
+
144 } // CUB namespace
+
145 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + + diff --git a/docs/html/thread__scan_8cuh_source.html b/docs/html/thread__scan_8cuh_source.html new file mode 100644 index 0000000000..11977b659c --- /dev/null +++ b/docs/html/thread__scan_8cuh_source.html @@ -0,0 +1,284 @@ + + + + + + + +CUB: thread_scan.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
thread_scan.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include "../thread/thread_operators.cuh"
+
37 #include "../util_namespace.cuh"
+
38 
+
40 CUB_NS_PREFIX
+
41 
+
43 namespace cub {
+
44 
+
62 template <
+
63  int LENGTH,
+
64  typename T,
+
65  typename ScanOp>
+
66 __device__ __forceinline__ T ThreadScanExclusive(
+
67  T *input,
+
68  T *output,
+
69  ScanOp scan_op,
+
70  T prefix,
+
71  bool apply_prefix = true)
+
72 {
+
73  T inclusive = input[0];
+
74  if (apply_prefix)
+
75  {
+
76  inclusive = scan_op(prefix, inclusive);
+
77  }
+
78  output[0] = prefix;
+
79  T exclusive = inclusive;
+
80 
+
81  #pragma unroll
+
82  for (int i = 1; i < LENGTH; ++i)
+
83  {
+
84  inclusive = scan_op(exclusive, input[i]);
+
85  output[i] = exclusive;
+
86  exclusive = inclusive;
+
87  }
+
88 
+
89  return inclusive;
+
90 }
+
91 
+
92 
+
100 template <
+
101  int LENGTH,
+
102  typename T,
+
103  typename ScanOp>
+
104 __device__ __forceinline__ T ThreadScanExclusive(
+
105  T (&input)[LENGTH],
+
106  T (&output)[LENGTH],
+
107  ScanOp scan_op,
+
108  T prefix,
+
109  bool apply_prefix = true)
+
110 {
+
111  return ThreadScanExclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix);
+
112 }
+
113 
+
114 
+
122 template <
+
123  int LENGTH,
+
124  typename T,
+
125  typename ScanOp>
+
126 __device__ __forceinline__ T ThreadScanInclusive(
+
127  T *input,
+
128  T *output,
+
129  ScanOp scan_op)
+
130 {
+
131  T inclusive = input[0];
+
132  output[0] = inclusive;
+
133 
+
134  // Continue scan
+
135  #pragma unroll
+
136  for (int i = 0; i < LENGTH; ++i)
+
137  {
+
138  inclusive = scan_op(inclusive, input[i]);
+
139  output[i] = inclusive;
+
140  }
+
141 
+
142  return inclusive;
+
143 }
+
144 
+
145 
+
153 template <
+
154  int LENGTH,
+
155  typename T,
+
156  typename ScanOp>
+
157 __device__ __forceinline__ T ThreadScanInclusive(
+
158  T (&input)[LENGTH],
+
159  T (&output)[LENGTH],
+
160  ScanOp scan_op)
+
161 {
+
162  return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op);
+
163 }
+
164 
+
165 
+
173 template <
+
174  int LENGTH,
+
175  typename T,
+
176  typename ScanOp>
+
177 __device__ __forceinline__ T ThreadScanInclusive(
+
178  T *input,
+
179  T *output,
+
180  ScanOp scan_op,
+
181  T prefix,
+
182  bool apply_prefix = true)
+
183 {
+
184  T inclusive = input[0];
+
185  if (apply_prefix)
+
186  {
+
187  inclusive = scan_op(prefix, inclusive);
+
188  }
+
189  output[0] = inclusive;
+
190 
+
191  // Continue scan
+
192  #pragma unroll
+
193  for (int i = 1; i < LENGTH; ++i)
+
194  {
+
195  inclusive = scan_op(inclusive, input[i]);
+
196  output[i] = inclusive;
+
197  }
+
198 
+
199  return inclusive;
+
200 }
+
201 
+
202 
+
210 template <
+
211  int LENGTH,
+
212  typename T,
+
213  typename ScanOp>
+
214 __device__ __forceinline__ T ThreadScanInclusive(
+
215  T (&input)[LENGTH],
+
216  T (&output)[LENGTH],
+
217  ScanOp scan_op,
+
218  T prefix,
+
219  bool apply_prefix = true)
+
220 {
+
221  return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
+
222 }
+
223 
+
224 
+
226  // end group ThreadModule
+
228 
+
229 
+
230 } // CUB namespace
+
231 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + + diff --git a/docs/html/thread__store_8cuh_source.html b/docs/html/thread__store_8cuh_source.html new file mode 100644 index 0000000000..5e911c54b2 --- /dev/null +++ b/docs/html/thread__store_8cuh_source.html @@ -0,0 +1,422 @@ + + + + + + + +CUB: thread_store.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
thread_store.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include <cuda.h>
+
37 
+
38 #include "../util_ptx.cuh"
+
39 #include "../util_type.cuh"
+
40 #include "../util_namespace.cuh"
+
41 
+
43 CUB_NS_PREFIX
+
44 
+
46 namespace cub {
+
47 
+
54 //-----------------------------------------------------------------------------
+
55 // Tags and constants
+
56 //-----------------------------------------------------------------------------
+
57 
+ +
62 {
+ + + + + + +
69 };
+
70 
+
71 
+
111 template <
+
112  PtxStoreModifier MODIFIER,
+
113  typename OutputIteratorRA,
+
114  typename T>
+
115 __device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, T val);
+
116 
+
117 
+
119 
+
120 
+
121 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
122 
+
123 
+
127 #define CUB_STORE_16(cub_modifier, ptx_modifier) \
+
128  template<> \
+
129  __device__ __forceinline__ void ThreadStore<cub_modifier, int4*, int4>(int4* ptr, int4 val) \
+
130  { \
+
131  asm volatile ("st."#ptx_modifier".v4.s32 [%0], {%1, %2, %3, %4};" : : \
+
132  _CUB_ASM_PTR_(ptr), \
+
133  "r"(val.x), \
+
134  "r"(val.y), \
+
135  "r"(val.z), \
+
136  "r"(val.w)); \
+
137  } \
+
138  template<> \
+
139  __device__ __forceinline__ void ThreadStore<cub_modifier, longlong2*, longlong2>(longlong2* ptr, longlong2 val) \
+
140  { \
+
141  asm volatile ("st."#ptx_modifier".v2.s64 [%0], {%1, %2};" : : \
+
142  _CUB_ASM_PTR_(ptr), \
+
143  "l"(val.x), \
+
144  "l"(val.y)); \
+
145  }
+
146 
+
147 
+
151 #define CUB_STORE_8(cub_modifier, ptx_modifier) \
+
152  template<> \
+
153  __device__ __forceinline__ void ThreadStore<cub_modifier, short4*, short4>(short4* ptr, short4 val) \
+
154  { \
+
155  asm volatile ("st."#ptx_modifier".v4.s16 [%0], {%1, %2, %3, %4};" : : \
+
156  _CUB_ASM_PTR_(ptr), \
+
157  "h"(val.x), \
+
158  "h"(val.y), \
+
159  "h"(val.z), \
+
160  "h"(val.w)); \
+
161  } \
+
162  template<> \
+
163  __device__ __forceinline__ void ThreadStore<cub_modifier, int2*, int2>(int2* ptr, int2 val) \
+
164  { \
+
165  asm volatile ("st."#ptx_modifier".v2.s32 [%0], {%1, %2};" : : \
+
166  _CUB_ASM_PTR_(ptr), \
+
167  "r"(val.x), \
+
168  "r"(val.y)); \
+
169  } \
+
170  template<> \
+
171  __device__ __forceinline__ void ThreadStore<cub_modifier, long long*, long long>(long long* ptr, long long val) \
+
172  { \
+
173  asm volatile ("st."#ptx_modifier".s64 [%0], %1;" : : \
+
174  _CUB_ASM_PTR_(ptr), \
+
175  "l"(val)); \
+
176  }
+
177 
+
181 #define CUB_STORE_4(cub_modifier, ptx_modifier) \
+
182  template<> \
+
183  __device__ __forceinline__ void ThreadStore<cub_modifier, int*, int>(int* ptr, int val) \
+
184  { \
+
185  asm volatile ("st."#ptx_modifier".s32 [%0], %1;" : : \
+
186  _CUB_ASM_PTR_(ptr), \
+
187  "r"(val)); \
+
188  }
+
189 
+
190 
+
194 #define CUB_STORE_2(cub_modifier, ptx_modifier) \
+
195  template<> \
+
196  __device__ __forceinline__ void ThreadStore<cub_modifier, short*, short>(short* ptr, short val) \
+
197  { \
+
198  asm volatile ("st."#ptx_modifier".s16 [%0], %1;" : : \
+
199  _CUB_ASM_PTR_(ptr), \
+
200  "h"(val)); \
+
201  }
+
202 
+
203 
+
207 #define CUB_STORE_1(cub_modifier, ptx_modifier) \
+
208  template<> \
+
209  __device__ __forceinline__ void ThreadStore<cub_modifier, char*, char>(char* ptr, char val) \
+
210  { \
+
211  asm volatile ( \
+
212  "{" \
+
213  " .reg .s8 datum;" \
+
214  " cvt.s8.s16 datum, %1;" \
+
215  " st."#ptx_modifier".s8 [%0], datum;" \
+
216  "}" : : \
+
217  _CUB_ASM_PTR_(ptr), \
+
218  "h"(short(val))); \
+
219  }
+
220 
+
224 #define CUB_STORE_ALL(cub_modifier, ptx_modifier) \
+
225  CUB_STORE_16(cub_modifier, ptx_modifier) \
+
226  CUB_STORE_8(cub_modifier, ptx_modifier) \
+
227  CUB_STORE_4(cub_modifier, ptx_modifier) \
+
228  CUB_STORE_2(cub_modifier, ptx_modifier) \
+
229  CUB_STORE_1(cub_modifier, ptx_modifier) \
+
230 
+
231 
+
235 #if CUB_PTX_ARCH >= 200
+
236  CUB_STORE_ALL(STORE_WB, ca)
+
237  CUB_STORE_ALL(STORE_CG, cg)
+
238  CUB_STORE_ALL(STORE_CS, cs)
+
239  CUB_STORE_ALL(STORE_WT, cv)
+
240 #else
+
241  // STORE_WT on SM10-13 uses "volatile.global" to ensure writes to last level
+
242  CUB_STORE_ALL(STORE_WT, volatile.global)
+
243 #endif
+
244 
+
245 
+
246 
+
248 template <PtxStoreModifier MODIFIER, int COUNT, int MAX>
+
249 struct IterateThreadStore
+
250 {
+
251  template <typename T>
+
252  static __device__ __forceinline__ void Store(T *ptr, T *vals)
+
253  {
+
254  ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
+
255  IterateThreadStore<MODIFIER, COUNT + 1, MAX>::Store(ptr, vals);
+
256  }
+
257 };
+
258 
+
260 template <PtxStoreModifier MODIFIER, int MAX>
+
261 struct IterateThreadStore<MODIFIER, MAX, MAX>
+
262 {
+
263  template <typename T>
+
264  static __device__ __forceinline__ void Store(T *ptr, T *vals) {}
+
265 };
+
266 
+
267 
+
268 
+
269 
+
273 template <typename OutputIteratorRA, typename T>
+
274 __device__ __forceinline__ void ThreadStore(
+
275  OutputIteratorRA itr,
+
276  T val,
+
277  Int2Type<STORE_DEFAULT> modifier,
+
278  Int2Type<false> is_pointer)
+
279 {
+
280  *itr = val;
+
281 }
+
282 
+
283 
+
287 template <typename T>
+
288 __device__ __forceinline__ void ThreadStore(
+
289  T *ptr,
+
290  T val,
+
291  Int2Type<STORE_DEFAULT> modifier,
+
292  Int2Type<true> is_pointer)
+
293 {
+
294  *ptr = val;
+
295 }
+
296 
+
297 
+
301 template <typename T>
+
302 __device__ __forceinline__ void ThreadStoreVolatile(
+
303  T *ptr,
+
304  T val,
+
305  Int2Type<true> is_primitive)
+
306 {
+
307  *reinterpret_cast<volatile T*>(ptr) = val;
+
308 }
+
309 
+
310 
+
314 template <typename T>
+
315 __device__ __forceinline__ void ThreadStoreVolatile(
+
316  T *ptr,
+
317  T val,
+
318  Int2Type<false> is_primitive)
+
319 {
+
320  typedef typename WordAlignment<T>::VolatileWord VolatileWord; // Word type for memcopying
+
321  enum { NUM_WORDS = sizeof(T) / sizeof(VolatileWord) };
+
322 
+
323  // Store into array of uninitialized words
+
324  typename WordAlignment<T>::UninitializedVolatileWords words;
+
325  *reinterpret_cast<T*>(words.buf) = val;
+
326 
+
327  // Memcopy words to aliased destination
+
328  #pragma unroll
+
329  for (int i = 0; i < NUM_WORDS; ++i)
+
330  reinterpret_cast<volatile VolatileWord*>(ptr)[i] = words.buf[i];
+
331 }
+
332 
+
333 
+
337 template <typename T>
+
338 __device__ __forceinline__ void ThreadStore(
+
339  T *ptr,
+
340  T val,
+
341  Int2Type<STORE_VOLATILE> modifier,
+
342  Int2Type<true> is_pointer)
+
343 {
+
344  ThreadStoreVolatile(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
+
345 }
+
346 
+
347 
+
348 #if (CUB_PTX_ARCH <= 350)
+
349 
+
353 template <typename T>
+
354 __device__ __forceinline__ void ThreadStore(
+
355  T *ptr,
+
356  T val,
+
357  Int2Type<STORE_CG> modifier,
+
358  Int2Type<true> is_pointer)
+
359 {
+
360  ThreadStore<STORE_DEFAULT>(ptr, val);
+
361 }
+
362 
+
363 #endif // (CUB_PTX_ARCH <= 350)
+
364 
+
365 
+
369 template <typename T, int MODIFIER>
+
370 __device__ __forceinline__ void ThreadStore(
+
371  T *ptr,
+
372  T val,
+
373  Int2Type<MODIFIER> modifier,
+
374  Int2Type<true> is_pointer)
+
375 {
+
376  typedef typename WordAlignment<T>::DeviceWord DeviceWord; // Word type for memcopying
+
377  enum { NUM_WORDS = sizeof(T) / sizeof(DeviceWord) };
+
378 
+
379  // Store into array of uninitialized words
+
380  typename WordAlignment<T>::UninitializedDeviceWords words;
+
381  *reinterpret_cast<T*>(words.buf) = val;
+
382 
+
383  // Memcopy words to aliased destination
+
384  IterateThreadStore<PtxStoreModifier(MODIFIER), 0, NUM_WORDS>::Store(
+
385  reinterpret_cast<DeviceWord*>(ptr),
+
386  words.buf);
+
387 }
+
388 
+
389 
+
393 template <PtxStoreModifier MODIFIER, typename OutputIteratorRA, typename T>
+
394 __device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, T val)
+
395 {
+
396  ThreadStore(
+
397  itr,
+
398  val,
+
399  Int2Type<MODIFIER>(),
+
400  Int2Type<IsPointer<OutputIteratorRA>::VALUE>());
+
401 }
+
402 
+
403 
+
404 
+
405 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
406 
+
407  // end group IoModule
+
409 
+
410 
+
411 } // CUB namespace
+
412 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + + diff --git a/docs/html/util__allocator_8cuh_source.html b/docs/html/util__allocator_8cuh_source.html new file mode 100644 index 0000000000..bbabf0da8e --- /dev/null +++ b/docs/html/util__allocator_8cuh_source.html @@ -0,0 +1,681 @@ + + + + + + + +CUB: util_allocator.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
util_allocator.cuh
+
+
+
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
29 /******************************************************************************
+
30  * Simple caching allocator for device memory allocations. The allocator is
+
31  * thread-safe and capable of managing device allocations on multiple devices.
+
32  ******************************************************************************/
+
33 
+
34 #pragma once
+
35 
+
36 #ifndef __CUDA_ARCH__
+
37  #include <set> // NVCC (EDG, really) takes FOREVER to compile std::map
+
38  #include <map>
+
39 #endif
+
40 
+
41 #include <math.h>
+
42 
+
43 #include "util_namespace.cuh"
+
44 #include "util_debug.cuh"
+
45 
+
46 #include "host/spinlock.cuh"
+
47 
+
49 CUB_NS_PREFIX
+
50 
+
52 namespace cub {
+
53 
+
54 
+
61 /******************************************************************************
+
62  * CachingDeviceAllocator (host use)
+
63  ******************************************************************************/
+
64 
+ +
100 {
+
101 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
102 
+
103 
+
104  //---------------------------------------------------------------------
+
105  // Type definitions and constants
+
106  //---------------------------------------------------------------------
+
107 
+
108  enum
+
109  {
+
111  INVALID_DEVICE_ORDINAL = -1,
+
112  };
+
113 
+
117  static unsigned int IntPow(
+
118  unsigned int base,
+
119  unsigned int exp)
+
120  {
+
121  unsigned int retval = 1;
+
122  while (exp > 0)
+
123  {
+
124  if (exp & 1) {
+
125  retval = retval * base; // multiply the result by the current base
+
126  }
+
127  base = base * base; // square the base
+
128  exp = exp >> 1; // divide the exponent in half
+
129  }
+
130  return retval;
+
131  }
+
132 
+
133 
+
137  static void NearestPowerOf(
+
138  unsigned int &power,
+
139  size_t &rounded_bytes,
+
140  unsigned int base,
+
141  size_t value)
+
142  {
+
143  power = 0;
+
144  rounded_bytes = 1;
+
145 
+
146  while (rounded_bytes < value)
+
147  {
+
148  rounded_bytes *= base;
+
149  power++;
+
150  }
+
151  }
+
152 
+
156  struct BlockDescriptor
+
157  {
+
158  int device; // device ordinal
+
159  void* d_ptr; // Device pointer
+
160  size_t bytes; // Size of allocation in bytes
+
161  unsigned int bin; // Bin enumeration
+
162 
+
163  // Constructor
+
164  BlockDescriptor(void *d_ptr, int device) :
+
165  d_ptr(d_ptr),
+
166  bytes(0),
+
167  bin(0),
+
168  device(device) {}
+
169 
+
170  // Constructor
+
171  BlockDescriptor(size_t bytes, unsigned int bin, int device) :
+
172  d_ptr(NULL),
+
173  bytes(bytes),
+
174  bin(bin),
+
175  device(device) {}
+
176 
+
177  // Comparison functor for comparing device pointers
+
178  static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+
179  {
+
180  if (a.device < b.device) {
+
181  return true;
+
182  } else if (a.device > b.device) {
+
183  return false;
+
184  } else {
+
185  return (a.d_ptr < b.d_ptr);
+
186  }
+
187  }
+
188 
+
189  // Comparison functor for comparing allocation sizes
+
190  static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+
191  {
+
192  if (a.device < b.device) {
+
193  return true;
+
194  } else if (a.device > b.device) {
+
195  return false;
+
196  } else {
+
197  return (a.bytes < b.bytes);
+
198  }
+
199  }
+
200  };
+
201 
+
203  typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
+
204 
+
205 #ifndef __CUDA_ARCH__ // Only define STL container members in host code
+
206 
+
208  typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
+
209 
+
211  typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
+
212 
+
214  typedef std::map<int, size_t> GpuCachedBytes;
+
215 
+
216 #endif // __CUDA_ARCH__
+
217 
+
218  //---------------------------------------------------------------------
+
219  // Fields
+
220  //---------------------------------------------------------------------
+
221 
+
222  Spinlock spin_lock;
+
223 
+
224  unsigned int bin_growth;
+
225  unsigned int min_bin;
+
226  unsigned int max_bin;
+
227 
+
228  size_t min_bin_bytes;
+
229  size_t max_bin_bytes;
+
230  size_t max_cached_bytes;
+
231 
+
232  bool debug;
+
233  bool skip_cleanup;
+
234 
+
235 #ifndef __CUDA_ARCH__ // Only define STL container members in host code
+
236 
+
237  GpuCachedBytes cached_bytes;
+
238  CachedBlocks cached_blocks;
+
239  BusyBlocks live_blocks;
+
240 
+
241 #endif // __CUDA_ARCH__
+
242 
+
243 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
244 
+
245  //---------------------------------------------------------------------
+
246  // Methods
+
247  //---------------------------------------------------------------------
+
248 
+ +
253  unsigned int bin_growth,
+
254  unsigned int min_bin,
+
255  unsigned int max_bin,
+
256  size_t max_cached_bytes)
+
257  :
+
258  #ifndef __CUDA_ARCH__ // Only define STL container members in host code
+
259  cached_blocks(BlockDescriptor::SizeCompare),
+
260  live_blocks(BlockDescriptor::PtrCompare),
+
261  #endif
+
262  debug(false),
+
263  spin_lock(0),
+
264  bin_growth(bin_growth),
+
265  min_bin(min_bin),
+
266  max_bin(max_bin),
+
267  min_bin_bytes(IntPow(bin_growth, min_bin)),
+
268  max_bin_bytes(IntPow(bin_growth, max_bin)),
+
269  max_cached_bytes(max_cached_bytes)
+
270  {}
+
271 
+
272 
+
286  CachingDeviceAllocator(bool skip_cleanup = false) :
+
287  #ifndef __CUDA_ARCH__ // Only define STL container members in host code
+
288  cached_blocks(BlockDescriptor::SizeCompare),
+
289  live_blocks(BlockDescriptor::PtrCompare),
+
290  #endif
+
291  skip_cleanup(skip_cleanup),
+
292  debug(false),
+
293  spin_lock(0),
+
294  bin_growth(8),
+
295  min_bin(3),
+
296  max_bin(7),
+
297  min_bin_bytes(IntPow(bin_growth, min_bin)),
+
298  max_bin_bytes(IntPow(bin_growth, max_bin)),
+
299  max_cached_bytes((max_bin_bytes * 3) - 1)
+
300  {}
+
301 
+
302 
+
306  cudaError_t SetMaxCachedBytes(
+
307  size_t max_cached_bytes)
+
308  {
+
309  #ifdef __CUDA_ARCH__
+
310  // Caching functionality only defined on host
+
311  return CubDebug(cudaErrorInvalidConfiguration);
+
312  #else
+
313 
+
314  // Lock
+
315  Lock(&spin_lock);
+
316 
+
317  this->max_cached_bytes = max_cached_bytes;
+
318 
+
319  if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes);
+
320 
+
321  // Unlock
+
322  Unlock(&spin_lock);
+
323 
+
324  return cudaSuccess;
+
325 
+
326  #endif // __CUDA_ARCH__
+
327  }
+
328 
+
329 
+
333  cudaError_t DeviceAllocate(
+
334  void** d_ptr,
+
335  size_t bytes,
+
336  int device)
+
337  {
+
338  #ifdef __CUDA_ARCH__
+
339  // Caching functionality only defined on host
+
340  return CubDebug(cudaErrorInvalidConfiguration);
+
341  #else
+
342 
+
343  bool locked = false;
+
344  int entrypoint_device = INVALID_DEVICE_ORDINAL;
+
345  cudaError_t error = cudaSuccess;
+
346 
+
347  // Round up to nearest bin size
+
348  unsigned int bin;
+
349  size_t bin_bytes;
+
350  NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
+
351  if (bin < min_bin) {
+
352  bin = min_bin;
+
353  bin_bytes = min_bin_bytes;
+
354  }
+
355 
+
356  // Check if bin is greater than our maximum bin
+
357  if (bin > max_bin)
+
358  {
+
359  // Allocate the request exactly and give out-of-range bin
+
360  bin = (unsigned int) -1;
+
361  bin_bytes = bytes;
+
362  }
+
363 
+
364  BlockDescriptor search_key(bin_bytes, bin, device);
+
365 
+
366  // Lock
+
367  if (!locked) {
+
368  Lock(&spin_lock);
+
369  locked = true;
+
370  }
+
371 
+
372  do {
+
373  // Find a free block big enough within the same bin on the same device
+
374  CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
+
375  if ((block_itr != cached_blocks.end()) &&
+
376  (block_itr->device == device) &&
+
377  (block_itr->bin == search_key.bin))
+
378  {
+
379  // Reuse existing cache block. Insert into live blocks.
+
380  search_key = *block_itr;
+
381  live_blocks.insert(search_key);
+
382 
+
383  // Remove from free blocks
+
384  cached_blocks.erase(block_itr);
+
385  cached_bytes[device] -= search_key.bytes;
+
386 
+
387  if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+
388  device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+
389  }
+
390  else
+
391  {
+
392  // Need to allocate a new cache block. Unlock.
+
393  if (locked) {
+
394  Unlock(&spin_lock);
+
395  locked = false;
+
396  }
+
397 
+
398  // Set to specified device
+
399  if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+
400  if (CubDebug(error = cudaSetDevice(device))) break;
+
401 
+
402  // Allocate
+
403  if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;
+
404 
+
405  // Lock
+
406  if (!locked) {
+
407  Lock(&spin_lock);
+
408  locked = true;
+
409  }
+
410 
+
411  // Insert into live blocks
+
412  live_blocks.insert(search_key);
+
413 
+
414  if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+
415  device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+
416  }
+
417  } while(0);
+
418 
+
419  // Unlock
+
420  if (locked) {
+
421  Unlock(&spin_lock);
+
422  locked = false;
+
423  }
+
424 
+
425  // Copy device pointer to output parameter (NULL on error)
+
426  *d_ptr = search_key.d_ptr;
+
427 
+
428  // Attempt to revert back to previous device if necessary
+
429  if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+
430  {
+
431  if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+
432  }
+
433 
+
434  return error;
+
435 
+
436  #endif // __CUDA_ARCH__
+
437  }
+
438 
+
439 
+
443  cudaError_t DeviceAllocate(
+
444  void** d_ptr,
+
445  size_t bytes)
+
446  {
+
447  #ifdef __CUDA_ARCH__
+
448  // Caching functionality only defined on host
+
449  return CubDebug(cudaErrorInvalidConfiguration);
+
450  #else
+
451  cudaError_t error = cudaSuccess;
+
452  do {
+
453  int current_device;
+
454  if (CubDebug(error = cudaGetDevice(&current_device))) break;
+
455  if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break;
+
456  } while(0);
+
457 
+
458  return error;
+
459 
+
460  #endif // __CUDA_ARCH__
+
461  }
+
462 
+
463 
+
467  cudaError_t DeviceFree(
+
468  void* d_ptr,
+
469  int device)
+
470  {
+
471  #ifdef __CUDA_ARCH__
+
472  // Caching functionality only defined on host
+
473  return CubDebug(cudaErrorInvalidConfiguration);
+
474  #else
+
475 
+
476  bool locked = false;
+
477  int entrypoint_device = INVALID_DEVICE_ORDINAL;
+
478  cudaError_t error = cudaSuccess;
+
479 
+
480  BlockDescriptor search_key(d_ptr, device);
+
481 
+
482  // Lock
+
483  if (!locked) {
+
484  Lock(&spin_lock);
+
485  locked = true;
+
486  }
+
487 
+
488  do {
+
489  // Find corresponding block descriptor
+
490  BusyBlocks::iterator block_itr = live_blocks.find(search_key);
+
491  if (block_itr == live_blocks.end())
+
492  {
+
493  // Cannot find pointer
+
494  if (CubDebug(error = cudaErrorUnknown)) break;
+
495  }
+
496  else
+
497  {
+
498  // Remove from live blocks
+
499  search_key = *block_itr;
+
500  live_blocks.erase(block_itr);
+
501 
+
502  // Check if we should keep the returned allocation
+
503  if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
+
504  {
+
505  // Insert returned allocation into free blocks
+
506  cached_blocks.insert(search_key);
+
507  cached_bytes[device] += search_key.bytes;
+
508 
+
509  if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+
510  device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+
511  }
+
512  else
+
513  {
+
514  // Free the returned allocation. Unlock.
+
515  if (locked) {
+
516  Unlock(&spin_lock);
+
517  locked = false;
+
518  }
+
519 
+
520  // Set to specified device
+
521  if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+
522  if (CubDebug(error = cudaSetDevice(device))) break;
+
523 
+
524  // Free device memory
+
525  if (CubDebug(error = cudaFree(d_ptr))) break;
+
526 
+
527  if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+
528  device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+
529  }
+
530  }
+
531  } while (0);
+
532 
+
533  // Unlock
+
534  if (locked) {
+
535  Unlock(&spin_lock);
+
536  locked = false;
+
537  }
+
538 
+
539  // Attempt to revert back to entry-point device if necessary
+
540  if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+
541  {
+
542  if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+
543  }
+
544 
+
545  return error;
+
546 
+
547  #endif // __CUDA_ARCH__
+
548  }
+
549 
+
550 
+
554  cudaError_t DeviceFree(
+
555  void* d_ptr)
+
556  {
+
557  #ifdef __CUDA_ARCH__
+
558  // Caching functionality only defined on host
+
559  return CubDebug(cudaErrorInvalidConfiguration);
+
560  #else
+
561 
+
562  int current_device;
+
563  cudaError_t error = cudaSuccess;
+
564 
+
565  do {
+
566  if (CubDebug(error = cudaGetDevice(&current_device))) break;
+
567  if (CubDebug(error = DeviceFree(d_ptr, current_device))) break;
+
568  } while(0);
+
569 
+
570  return error;
+
571 
+
572  #endif // __CUDA_ARCH__
+
573  }
+
574 
+
575 
+
579  cudaError_t FreeAllCached()
+
580  {
+
581  #ifdef __CUDA_ARCH__
+
582  // Caching functionality only defined on host
+
583  return CubDebug(cudaErrorInvalidConfiguration);
+
584  #else
+
585 
+
586  cudaError_t error = cudaSuccess;
+
587  bool locked = false;
+
588  int entrypoint_device = INVALID_DEVICE_ORDINAL;
+
589  int current_device = INVALID_DEVICE_ORDINAL;
+
590 
+
591  // Lock
+
592  if (!locked) {
+
593  Lock(&spin_lock);
+
594  locked = true;
+
595  }
+
596 
+
597  while (!cached_blocks.empty())
+
598  {
+
599  // Get first block
+
600  CachedBlocks::iterator begin = cached_blocks.begin();
+
601 
+
602  // Get entry-point device ordinal if necessary
+
603  if (entrypoint_device == INVALID_DEVICE_ORDINAL)
+
604  {
+
605  if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+
606  }
+
607 
+
608  // Set current device ordinal if necessary
+
609  if (begin->device != current_device)
+
610  {
+
611  if (CubDebug(error = cudaSetDevice(begin->device))) break;
+
612  current_device = begin->device;
+
613  }
+
614 
+
615  // Free device memory
+
616  if (CubDebug(error = cudaFree(begin->d_ptr))) break;
+
617 
+
618  // Reduce balance and erase entry
+
619  cached_bytes[current_device] -= begin->bytes;
+
620  cached_blocks.erase(begin);
+
621 
+
622  if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+
623  current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size());
+
624  }
+
625 
+
626  // Unlock
+
627  if (locked) {
+
628  Unlock(&spin_lock);
+
629  locked = false;
+
630  }
+
631 
+
632  // Attempt to revert back to entry-point device if necessary
+
633  if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+
634  {
+
635  if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+
636  }
+
637 
+
638  return error;
+
639 
+
640  #endif // __CUDA_ARCH__
+
641  }
+
642 
+
643 
+ +
648  {
+
649  if (!skip_cleanup)
+
650  FreeAllCached();
+
651  }
+
652 
+
653 };
+
654 
+
655 
+
656 
+
657  // end group UtilModule
+
659 
+
660 } // CUB namespace
+
661 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + + diff --git a/docs/html/util__arch_8cuh_source.html b/docs/html/util__arch_8cuh_source.html new file mode 100644 index 0000000000..9b85590502 --- /dev/null +++ b/docs/html/util__arch_8cuh_source.html @@ -0,0 +1,359 @@ + + + + + + + +CUB: util_arch.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
util_arch.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
35 /******************************************************************************
+
36  * Static architectural properties by SM version.
+
37  *
+
38  * "Device" reflects the PTX architecture targeted by the active compiler
+
39  * pass. It provides useful compile-time statics within device code. E.g.,:
+
40  *
+
41  * __shared__ int[Device::WARP_THREADS];
+
42  *
+
43  * int padded_offset = threadIdx.x + (threadIdx.x >> Device::LOG_SMEM_BANKS);
+
44  *
+
45  ******************************************************************************/
+
46 
+
47 #pragma once
+
48 
+
49 #include "util_namespace.cuh"
+
50 
+
52 CUB_NS_PREFIX
+
53 
+
55 namespace cub {
+
56 
+
57 
+
/// CUB_PTX_ARCH is the PTX architecture of the current compiler pass
/// (__CUDA_ARCH__), or 0 during the host pass.
#ifdef __CUDA_ARCH__
    #define CUB_PTX_ARCH __CUDA_ARCH__
#else
    #define CUB_PTX_ARCH 0
#endif


/// CUB_RUNTIME_ENABLED is defined for the host pass, and for device passes
/// when CUB_CDP is defined.
#if defined(CUB_CDP) || !defined(__CUDA_ARCH__)
#define CUB_RUNTIME_ENABLED
#endif


/// CUB_DESTRUCTOR: execution-space qualifiers for destructors.  Device
/// passes targeting PTX architectures below 200 get host-only destructors.
#if ((CUB_PTX_ARCH == 0) || (CUB_PTX_ARCH >= 200))
    #define CUB_DESTRUCTOR __host__ __device__
#else
    #define CUB_DESTRUCTOR __host__
#endif
+
84 
+
85 
+
91 template <int SM_ARCH>
+
92 struct ArchProps
+
93 {
+
94  enum
+
95  {
+
96  LOG_WARP_THREADS =
+
97  5,
+ +
99  1 << LOG_WARP_THREADS,
+ +
101  4,
+ +
103  1 << LOG_SMEM_BANKS,
+ +
105  4,
+ +
107  16 * 1024,
+ +
109  512,
+ +
111  true,
+ +
113  256,
+ +
115  2,
+ +
117  768,
+ +
119  8,
+ +
121  512,
+ +
123  8 * 1024,
+
124  };
+
125 };
+
126 
+
127 
+
128 
+
129 
+
130 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
131 
+
/**
 * Architecture properties for SM30
 */
template <>
struct ArchProps<300>
{
    enum
    {
        // Warp width: 32 threads
        LOG_WARP_THREADS    = 5,
        WARP_THREADS        = 1 << LOG_WARP_THREADS,

        // Shared memory: 32 banks of 4-byte words, 48KB total,
        // allocated in 256B segments
        LOG_SMEM_BANKS      = 5,
        SMEM_BANKS          = 1 << LOG_SMEM_BANKS,
        SMEM_BANK_BYTES     = 4,
        SMEM_BYTES          = 48 * 1024,
        SMEM_ALLOC_UNIT     = 256,

        // Registers: allocated per warp, 256 at a time, at a granularity
        // of every 4 warps per threadblock
        REGS_BY_BLOCK       = false,
        REG_ALLOC_UNIT      = 256,
        WARP_ALLOC_UNIT     = 4,

        // Per-SM and per-block limits
        MAX_SM_THREADS      = 2048,
        MAX_SM_THREADBLOCKS = 16,
        MAX_BLOCK_THREADS   = 1024,
        MAX_SM_REGISTERS    = 64 * 1024,
    };

    /// Callback utility.  This is the last link of the dispatch chain:
    /// \p sm_version is not examined and \p target always receives these
    /// SM30 properties.
    template <typename T>
    static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
    {
        target.template Callback<ArchProps>();
    }
};
+
163 
+
164 
+
/**
 * Architecture properties for SM20
 */
template <>
struct ArchProps<200>
{
    enum
    {
        LOG_WARP_THREADS    = 5,                        // 32 threads per warp
        WARP_THREADS        = 1 << LOG_WARP_THREADS,
        LOG_SMEM_BANKS      = 5,                        // 32 banks
        SMEM_BANKS          = 1 << LOG_SMEM_BANKS,
        SMEM_BANK_BYTES     = 4,                        // 4 byte bank words
        SMEM_BYTES          = 48 * 1024,                // 48KB shared memory
        SMEM_ALLOC_UNIT     = 128,                      // 128B smem allocation segment size
        REGS_BY_BLOCK       = false,                    // Allocates registers by warp
        REG_ALLOC_UNIT      = 64,                       // 64 registers allocated at a time per warp
        WARP_ALLOC_UNIT     = 2,                        // Registers are allocated at a granularity of every 2 warps per threadblock
        MAX_SM_THREADS      = 1536,                     // 1536 max threads per SM
        MAX_SM_THREADBLOCKS = 8,                        // 8 max threadblocks per SM
        MAX_BLOCK_THREADS   = 1024,                     // 1024 max threads per threadblock
        MAX_SM_REGISTERS    = 32 * 1024,                // 32K max registers per SM
    };

    /// Callback utility: invokes target.Callback<Props>() with the
    /// properties matching the runtime \p sm_version.
    ///
    /// Versions in [200, 300) (e.g. 210) keep the SM20 properties, which
    /// agrees with the static ArchProps<210> specialization deriving from
    /// SM20.  Only 300 and above are forwarded to the SM30 properties.
    template <typename T>
    static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
    {
        if (sm_version >= 300) {
            ArchProps<300>::Callback(target, sm_version);
        } else {
            target.template Callback<ArchProps>();
        }
    }
};
+
200 
+
201 
+
/**
 * Architecture properties for SM12
 */
template <>
struct ArchProps<120>
{
    enum
    {
        LOG_WARP_THREADS    = 5,                        // 32 threads per warp
        WARP_THREADS        = 1 << LOG_WARP_THREADS,
        LOG_SMEM_BANKS      = 4,                        // 16 banks
        SMEM_BANKS          = 1 << LOG_SMEM_BANKS,
        SMEM_BANK_BYTES     = 4,                        // 4 byte bank words
        SMEM_BYTES          = 16 * 1024,                // 16KB shared memory
        SMEM_ALLOC_UNIT     = 512,                      // 512B smem allocation segment size
        REGS_BY_BLOCK       = true,                     // Allocates registers by threadblock
        REG_ALLOC_UNIT      = 512,                      // 512 registers allocated at time per threadblock
        WARP_ALLOC_UNIT     = 2,                        // Registers are allocated at a granularity of every 2 warps per threadblock
        MAX_SM_THREADS      = 1024,                     // 1024 max threads per SM
        MAX_SM_THREADBLOCKS = 8,                        // 8 max threadblocks per SM
        MAX_BLOCK_THREADS   = 512,                      // 512 max threads per threadblock
        MAX_SM_REGISTERS    = 16 * 1024,                // 16K max registers per SM
    };

    /// Callback utility: invokes target.Callback<Props>() with the
    /// properties matching the runtime \p sm_version.
    ///
    /// Versions in [120, 200) (e.g. 130) keep the SM12 properties, which
    /// agrees with the static ArchProps<130> specialization deriving from
    /// SM12.  Only 200 and above are forwarded to the SM20 dispatcher.
    template <typename T>
    static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
    {
        if (sm_version >= 200) {
            ArchProps<200>::Callback(target, sm_version);
        } else {
            target.template Callback<ArchProps>();
        }
    }
};
+
237 
+
238 
+
/**
 * Architecture properties for SM10.  The property values are inherited
 * from the primary ArchProps template.
 */
template <>
struct ArchProps<100> : ArchProps<0>
{
    /// Callback utility: invokes target.Callback<Props>() with the
    /// properties matching the runtime \p sm_version.
    ///
    /// Versions below 120 (e.g. 110) keep the SM10 properties, which
    /// agrees with the static ArchProps<110> specialization deriving from
    /// SM10.  Only 120 and above are forwarded to the SM12 dispatcher.
    template <typename T>
    static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
    {
        if (sm_version >= 120) {
            ArchProps<120>::Callback(target, sm_version);
        } else {
            target.template Callback<ArchProps>();
        }
    }
};
+
256 
+
257 
+
261 template <>
+
262 struct ArchProps<350> : ArchProps<300> {}; // Derives from SM30
+
263 
+
267 template <>
+
268 struct ArchProps<210> : ArchProps<200> {}; // Derives from SM20
+
269 
+
273 template <>
+
274 struct ArchProps<130> : ArchProps<120> {}; // Derives from SM12
+
275 
+
279 template <>
+
280 struct ArchProps<110> : ArchProps<100> {}; // Derives from SM10
+
281 
+
282 
+
283 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
284 
+
285 
+
289 struct PtxArchProps : ArchProps<CUB_PTX_ARCH> {};
+
290 
+
291  // end group UtilModule
+
293 
+
294 } // CUB namespace
+
295 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + + diff --git a/docs/html/util__debug_8cuh_source.html b/docs/html/util__debug_8cuh_source.html new file mode 100644 index 0000000000..b474e6e045 --- /dev/null +++ b/docs/html/util__debug_8cuh_source.html @@ -0,0 +1,197 @@ + + + + + + + +CUB: util_debug.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
util_debug.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
37 #pragma once
+
38 
+
39 #include <stdio.h>
+
40 #include "util_namespace.cuh"
+
41 #include "util_arch.cuh"
+
42 
+
44 CUB_NS_PREFIX
+
45 
+
47 namespace cub {
+
48 
+
49 
+
56 #if (defined(DEBUG) || defined(_DEBUG))
+
58  #define CUB_STDERR
+
59 #endif
+
60 
+
61 
+
62 
+
/**
 * Error reporting hook behind the CubDebug / CubDebugExit macros.
 *
 * When CUB_STDERR is defined and \p error is non-zero, a diagnostic that
 * includes \p filename and \p line is emitted: to stderr from host code,
 * or via device printf (tagged with block and thread index) from device
 * code compiled for PTX architecture 200 or newer.  Nothing is printed
 * from device code for older architectures.
 *
 * \return \p error, unchanged, so the call can be used inline in a condition.
 */
__host__ __device__ __forceinline__ cudaError_t Debug(
    cudaError_t     error,
    const char*     filename,
    int             line)
{
#ifdef CUB_STDERR
    if (error != cudaSuccess)
    {
    #if (CUB_PTX_ARCH == 0)
        // Host: include the runtime's description of the error
        fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
        fflush(stderr);
    #elif (CUB_PTX_ARCH >= 200)
        // Device: identify the reporting thread instead
        printf("CUDA error %d [block %d, thread %d, %s, %d]\n", error, blockIdx.x, threadIdx.x, filename, line);
    #endif
    }
#endif
    return error;
}
+
86 
+
87 
+
/**
 * Debug macro: passes the error expression \p e to cub::Debug() together
 * with the current file and line, and evaluates to the error code.
 */
#define CubDebug(e) cub::Debug((e), __FILE__, __LINE__)


/**
 * Debug macro with exit: like CubDebug, but calls exit(1) when the error
 * is non-zero.  Expands to a bare `if` statement, so do not place it
 * directly before an `else`.
 */
#define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); }


/**
 * Log macro for printf-style statements.  At least one argument must
 * follow the format string.  The expansion already ends in a semicolon.
 *
 * Host code prints the message as-is; device code for PTX architecture
 * 200 or newer prefixes it with the block and thread index.  For older
 * device architectures the macro expands to nothing (mirroring
 * cub::Debug(), which is silent there) so that code containing log
 * statements still compiles.
 */
#if (CUB_PTX_ARCH == 0)
    #define CubLog(format, ...) printf(format,__VA_ARGS__);
#elif (CUB_PTX_ARCH >= 200)
    #define CubLog(format, ...) printf("[block %d, thread %d]: " format, blockIdx.x, threadIdx.x, __VA_ARGS__);
#else
    #define CubLog(format, ...)
#endif
+
108 
+
109 
+
110 
+
111  // end group UtilModule
+
113 
+
114 } // CUB namespace
+
115 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + + diff --git a/docs/html/util__device_8cuh_source.html b/docs/html/util__device_8cuh_source.html new file mode 100644 index 0000000000..6f790399c7 --- /dev/null +++ b/docs/html/util__device_8cuh_source.html @@ -0,0 +1,450 @@ + + + + + + + +CUB: util_device.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
util_device.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include "util_arch.cuh"
+
37 #include "util_debug.cuh"
+
38 #include "util_namespace.cuh"
+
39 #include "util_macro.cuh"
+
40 
+
42 CUB_NS_PREFIX
+
43 
+
45 namespace cub {
+
46 
+
47 
+
53 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
54 
+
55 
+
59 template <typename T>
+
60 __global__ void EmptyKernel(void) { }
+
61 
+
62 
+
/**
 * Carves ALLOCATIONS sub-buffers out of one temporary-storage blob.
 *
 * Each requested size is rounded up to a multiple of 256 bytes and the
 * sub-buffers are laid out back to back.  On return, allocation_sizes[i]
 * no longer holds the requested size: it is overwritten with the byte
 * offset of sub-buffer i.
 *
 * \param d_temp_storage      Blob to partition, or NULL to only query the size
 * \param temp_storage_bytes  Size of the blob; written with the required
 *                            size when \p d_temp_storage is NULL
 * \param allocations         Receives the sub-buffer pointers
 * \param allocation_sizes    In: requested sizes.  Out: byte offsets
 *
 * \return cudaErrorMemoryAllocation if the blob is too small, otherwise cudaSuccess
 */
template <int ALLOCATIONS>
__host__ __device__ __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,
    size_t  &temp_storage_bytes,
    void*   (&allocations)[ALLOCATIONS],
    size_t  (&allocation_sizes)[ALLOCATIONS])
{
    const int    kAlignment = 256;
    const size_t kAlignMask = ~static_cast<size_t>(kAlignment - 1);

    // Replace each request with its (exclusive prefix-sum) offset
    size_t running_offset = 0;
    for (int slot = 0; slot < ALLOCATIONS; ++slot)
    {
        const size_t padded_bytes = (allocation_sizes[slot] + (kAlignment - 1)) & kAlignMask;
        allocation_sizes[slot] = running_offset;
        running_offset += padded_bytes;
    }

    // Size query only
    if (!d_temp_storage)
    {
        temp_storage_bytes = running_offset;
        return cudaSuccess;
    }

    // Caller-provided blob is too small
    if (temp_storage_bytes < running_offset)
    {
        return CubDebug(cudaErrorMemoryAllocation);
    }

    // Hand out pointers into the blob
    char *blob = static_cast<char*>(d_temp_storage);
    for (int slot = 0; slot < ALLOCATIONS; ++slot)
    {
        allocations[slot] = blob + allocation_sizes[slot];
    }

    return cudaSuccess;
}
+
107 
+
108 
+
109 
+
110 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
111 
+
112 
+
113 
+
/**
 * Retrieves the PTX version (major * 100 + minor * 10) that EmptyKernel
 * was compiled to, as reported by cudaFuncGetAttributes.
 *
 * \param ptx_version  Written only on success
 * \return The CUDA error from the attribute query, or
 *         cudaErrorInvalidConfiguration when CUB_RUNTIME_ENABLED is not defined
 */
__host__ __device__ __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
#ifdef CUB_RUNTIME_ENABLED

    cudaFuncAttributes kernel_attrs;
    cudaError_t error = cudaFuncGetAttributes(&kernel_attrs, EmptyKernel<void>);
    if (CubDebug(error)) return error;

    // ptxVersion is reported as major * 10 + minor
    ptx_version = kernel_attrs.ptxVersion * 10;
    return error;

#else

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#endif
}
+
139 
+
140 
+
/**
 * Synchronizes \p stream when called from host code.  From device code
 * the whole device is synchronized instead and \p stream is ignored.
 */
__host__ __device__ __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#ifdef __CUDA_ARCH__
    // Device can't yet sync on a specific stream
    return cudaDeviceSynchronize();
#else
    return cudaStreamSynchronize(stream);
#endif
}
+
154 
+
155 
+
156 
+
160 class Device
+
161 {
+
162 private:
+
163 
+
165  typedef void (*EmptyKernelPtr)();
+
166 
+
168  __host__ __device__ __forceinline__
+
169  EmptyKernelPtr Empty()
+
170  {
+
171  return EmptyKernel<void>;
+
172  }
+
173 
+
174 public:
+
175 
+
176  // Version information
+ + +
179 
+
180  // Target device properties
+
181  int sm_count;
+ + + + + + + + + + + + + +
195 
+
199  template <typename ArchProps>
+
200  __host__ __device__ __forceinline__ void Callback()
+
201  {
+ + + + + + + + + + + + + +
215  }
+
216 
+
217 
+
218 public:
+
219 
+
223  __host__ __device__ __forceinline__
+
224  cudaError_t Init(int device_ordinal)
+
225  {
+
226  #ifndef CUB_RUNTIME_ENABLED
+
227 
+
228  // CUDA API calls not supported from this device
+
229  return CubDebug(cudaErrorInvalidConfiguration);
+
230 
+
231  #else
+
232 
+
233  cudaError_t error = cudaSuccess;
+
234  do
+
235  {
+
236  // Fill in SM version
+
237  int major, minor;
+
238  if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
+
239  if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
+
240  sm_version = major * 100 + minor * 10;
+
241 
+
242  // Fill in static SM properties
+
243  // Initialize our device properties via callback from static device properties
+ +
245 
+
246  // Fill in SM count
+
247  if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
248 
+
249  // Fill in PTX version
+
250  #if CUB_PTX_ARCH > 0
+ +
252  #else
+
253  if (CubDebug(error = PtxVersion(ptx_version))) break;
+
254  #endif
+
255 
+
256  }
+
257  while (0);
+
258 
+
259  return error;
+
260 
+
261  #endif
+
262  }
+
263 
+
264 
+
268  __host__ __device__ __forceinline__
+
269  cudaError_t Init()
+
270  {
+
271  #ifndef CUB_RUNTIME_ENABLED
+
272 
+
273  // CUDA API calls not supported from this device
+
274  return CubDebug(cudaErrorInvalidConfiguration);
+
275 
+
276  #else
+
277 
+
278  cudaError_t error = cudaSuccess;
+
279  do
+
280  {
+
281  int device_ordinal;
+
282  if ((error = CubDebug(cudaGetDevice(&device_ordinal)))) break;
+
283  if ((error = Init(device_ordinal))) break;
+
284  }
+
285  while (0);
+
286  return error;
+
287 
+
288  #endif
+
289  }
+
290 
+
291 
+
295  template <typename KernelPtr>
+
296  __host__ __device__ __forceinline__
+
297  cudaError_t MaxSmOccupancy(
+
298  int &max_sm_occupancy,
+
299  KernelPtr kernel_ptr,
+
300  int block_threads)
+
301  {
+
302  #ifndef CUB_RUNTIME_ENABLED
+
303 
+
304  // CUDA API calls not supported from this device
+
305  return CubDebug(cudaErrorInvalidConfiguration);
+
306 
+
307  #else
+
308 
+
309  cudaError_t error = cudaSuccess;
+
310  do
+
311  {
+
312  // Get kernel attributes
+
313  cudaFuncAttributes kernel_attrs;
+
314  if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break;
+
315 
+
316  // Number of warps per threadblock
+
317  int block_warps = (block_threads + warp_threads - 1) / warp_threads;
+
318 
+
319  // Max warp occupancy
+
320  int max_warp_occupancy = (block_warps > 0) ?
+
321  max_sm_warps / block_warps :
+ +
323 
+
324  // Maximum register occupancy
+
325  int max_reg_occupancy;
+
326  if ((block_threads == 0) || (kernel_attrs.numRegs == 0))
+
327  {
+
328  // Prevent divide-by-zero
+
329  max_reg_occupancy = max_sm_blocks;
+
330  }
+
331  else if (regs_by_block)
+
332  {
+
333  // Allocates registers by threadblock
+
334  int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit);
+
335  max_reg_occupancy = max_sm_registers / block_regs;
+
336  }
+
337  else
+
338  {
+
339  // Allocates registers by warp
+
340  int sm_sides = warp_alloc_unit;
+
341  int sm_registers_per_side = max_sm_registers / sm_sides;
+
342  int regs_per_warp = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit);
+
343  int warps_per_side = sm_registers_per_side / regs_per_warp;
+
344  int warps = warps_per_side * sm_sides;
+
345  max_reg_occupancy = warps / block_warps;
+
346  }
+
347 
+
348  // Shared memory per threadblock
+
349  int block_allocated_smem = CUB_ROUND_UP_NEAREST(
+
350  kernel_attrs.sharedSizeBytes,
+ +
352 
+
353  // Max shared memory occupancy
+
354  int max_smem_occupancy = (block_allocated_smem > 0) ?
+
355  (smem_bytes / block_allocated_smem) :
+ +
357 
+
358  // Max occupancy
+
359  max_sm_occupancy = CUB_MIN(
+
360  CUB_MIN(max_sm_blocks, max_warp_occupancy),
+
361  CUB_MIN(max_smem_occupancy, max_reg_occupancy));
+
362 
+
363 // printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d)", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy);
+
364 
+
365  } while (0);
+
366 
+
367  return error;
+
368 
+
369  #endif
+
370  }
+
371 
+
372 };
+
373 
+
374  // end group UtilModule
+
376 
+
377 } // CUB namespace
+
378 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + + diff --git a/docs/html/util__iterator_8cuh_source.html b/docs/html/util__iterator_8cuh_source.html new file mode 100644 index 0000000000..c36540c380 --- /dev/null +++ b/docs/html/util__iterator_8cuh_source.html @@ -0,0 +1,737 @@ + + + + + + + +CUB: util_iterator.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
util_iterator.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include "thread/thread_load.cuh"
+
37 #include "util_device.cuh"
+
38 #include "util_debug.cuh"
+
39 #include "util_namespace.cuh"
+
40 
+
42 CUB_NS_PREFIX
+
43 
+
45 namespace cub {
+
46 
+
47 
+
48 /******************************************************************************
+
49  * Texture references
+
50  *****************************************************************************/
+
51 
+
52 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
53 
+
54 // Anonymous namespace
+
55 namespace {
+
56 
+
/// Holder for a per-type 1D texture reference
template <typename T>
struct TexIteratorRef
{
    /// Texture reference type: 1D, element-type read mode
    typedef texture<T, cudaTextureType1D, cudaReadModeElementType> TexRef;

    /// The texture reference shared by all users of this instantiation
    static TexRef ref;

    /// Binds \p d_in to the texture reference.  A NULL pointer is a no-op
    /// that reports success.
    static cudaError_t BindTexture(void *d_in)
    {
        if (!d_in)
            return cudaSuccess;

        cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<T>();
        return (CubDebug(cudaBindTexture(NULL, ref, d_in, channel_desc)));
    }

    /// Unbinds the texture reference
    static cudaError_t UnbindTexture()
    {
        return CubDebug(cudaUnbindTexture(ref));
    }
};
+
86 
+
87 // Texture reference definitions
+
88 template <typename Value>
+
89 typename TexIteratorRef<Value>::TexRef TexIteratorRef<Value>::ref = 0;
+
90 
+
91 } // Anonymous namespace
+
92 
+
93 
+
94 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
95 
+
96 
+
97 
+
98 
+
99 
+
100 
+
101 
+
108 /******************************************************************************
+
109  * Iterators
+
110  *****************************************************************************/
+
111 
+
/**
 * Random-access iterator over a sequence in which every element is the
 * same value.  The iterator carries no position: incrementing, offsetting
 * or indexing it always yields the constant it was constructed with.
 *
 * \tparam OutputType  Type of the constant value
 */
template <typename OutputType>
class ConstantIteratorRA
{
public:

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    typedef ConstantIteratorRA                  self_type;
    typedef OutputType                          value_type;
    typedef OutputType                          reference;
    typedef OutputType*                         pointer;
    typedef std::random_access_iterator_tag     iterator_category;
    typedef int                                 difference_type;

#endif  // DOXYGEN_SHOULD_SKIP_THIS

private:

    OutputType val;     // The constant every dereference yields

public:

    /// Constructor
    __host__ __device__ __forceinline__ ConstantIteratorRA(
        const OutputType &val)          ///< Constant value for the iterator instance to report
    :
        val(val)
    {}

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    // Prefix increment (no position to advance)
    __host__ __device__ __forceinline__ self_type operator++()
    {
        return *this;
    }

    // Postfix increment (no position to advance)
    __host__ __device__ __forceinline__ self_type operator++(int junk)
    {
        return *this;
    }

    __host__ __device__ __forceinline__ reference operator*()
    {
        return val;
    }

    template <typename SizeT>
    __host__ __device__ __forceinline__ self_type operator+(SizeT n)
    {
        return ConstantIteratorRA(val);
    }

    template <typename SizeT>
    __host__ __device__ __forceinline__ self_type operator-(SizeT n)
    {
        return ConstantIteratorRA(val);
    }

    // Indexing yields the constant itself.  (Returning an iterator here
    // would not convert to the declared return type, OutputType.)
    template <typename SizeT>
    __host__ __device__ __forceinline__ reference operator[](SizeT n)
    {
        return val;
    }

    __host__ __device__ __forceinline__ pointer operator->()
    {
        return &val;
    }

    // Two constant iterators are equal when their constants are equal
    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
    {
        return (val == rhs.val);
    }

    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
    {
        return (val != rhs.val);
    }

#endif // DOXYGEN_SHOULD_SKIP_THIS

};
+
204 
+
205 
+
206 
+
/**
 * Random-access iterator that wraps a native pointer and applies a
 * conversion functor to every element read through it.
 *
 * \tparam OutputType    Type produced by dereferencing the iterator
 * \tparam ConversionOp  Unary functor applied to each InputType element
 * \tparam InputType     Element type of the wrapped pointer
 */
template <typename OutputType, typename ConversionOp, typename InputType>
class TransformIteratorRA
{
public:

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    typedef TransformIteratorRA                 self_type;
    typedef OutputType                          value_type;
    typedef OutputType                          reference;
    typedef OutputType*                         pointer;
    typedef std::random_access_iterator_tag     iterator_category;
    typedef int                                 difference_type;

#endif  // DOXYGEN_SHOULD_SKIP_THIS

private:

    ConversionOp    conversion_op;  // Functor applied on every read
    InputType*      ptr;            // Current position in the input

public:

    /// Constructor
    __host__ __device__ __forceinline__ TransformIteratorRA(InputType* ptr, ConversionOp conversion_op) :
        conversion_op(conversion_op),
        ptr(ptr) {}

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    // Prefix increment: advance, then yield the advanced iterator
    __host__ __device__ __forceinline__ self_type operator++()
    {
        ptr++;
        return *this;
    }

    // Postfix increment: advance, but yield the iterator as it was before
    __host__ __device__ __forceinline__ self_type operator++(int junk)
    {
        self_type previous = *this;
        ptr++;
        return previous;
    }

    __host__ __device__ __forceinline__ reference operator*()
    {
        return conversion_op(*ptr);
    }

    template <typename SizeT>
    __host__ __device__ __forceinline__ self_type operator+(SizeT n)
    {
        TransformIteratorRA retval(ptr + n, conversion_op);
        return retval;
    }

    template <typename SizeT>
    __host__ __device__ __forceinline__ self_type operator-(SizeT n)
    {
        TransformIteratorRA retval(ptr - n, conversion_op);
        return retval;
    }

    template <typename SizeT>
    __host__ __device__ __forceinline__ reference operator[](SizeT n)
    {
        return conversion_op(ptr[n]);
    }

    // NOTE(review): takes the address of the functor's result, so this is
    // only usable when conversion_op returns an lvalue reference; with a
    // by-value functor it does not compile once instantiated.
    __host__ __device__ __forceinline__ pointer operator->()
    {
        return &conversion_op(*ptr);
    }

    // Comparison considers the position only, not the functor
    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
    {
        return (ptr == rhs.ptr);
    }

    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
    {
        return (ptr != rhs.ptr);
    }

#endif // DOXYGEN_SHOULD_SKIP_THIS

};
+
310 
+
311 
+
312 
+
/**
 * \brief Random-access iterator that reads elements of type \p T through
 * texture on the device and through the raw pointer on the host.
 *
 * The host-side BindTexture() must be called before device-side reads, and
 * UnbindTexture() releases the binding.
 *
 * \tparam T  Element type
 */
template <typename T>
class TexIteratorRA
{
public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    typedef TexIteratorRA                       self_type;
    typedef T                                   value_type;
    typedef T                                   reference;
    typedef T*                                  pointer;
    typedef std::random_access_iterator_tag     iterator_category;
    typedef int                                 difference_type;

#endif // DOXYGEN_SHOULD_SKIP_THIS

    /// Nested tag typedef.
    /// NOTE(review): presumably probed by a nested-type detector to recognise
    /// texture-bindable iterators -- confirm against the users of this tag.
    typedef void TexBindingTag;

private:

    T*                  ptr;                ///< Raw pointer to the current element
    size_t              tex_align_offset;   ///< Element index used for texture fetches
    cudaTextureObject_t tex_obj;            ///< Texture object handle (0 until bound)

public:

    /// Constructor: unbound iterator (NULL pointer, zero offset, no texture object)
    __host__ __device__ __forceinline__ TexIteratorRA()
    :
        ptr(NULL),
        tex_align_offset(0),
        tex_obj(0)
    {}

    /**
     * \brief Binds the iterator to the memory at \p ptr.
     *
     * Creates a texture object over the linear range when the reported PTX
     * version is >= 300, otherwise binds through TexIteratorRef<T>.
     *
     * \param ptr               Pointer to the data to bind
     * \param bytes             Size of the bound range in bytes
     * \param tex_align_offset  Initial element index for texture fetches
     * \return The CUDA error from the version query or the binding call
     */
    cudaError_t BindTexture(
        T               *ptr,
        size_t          bytes,
        size_t          tex_align_offset = 0)
    {
        this->ptr = ptr;
        this->tex_align_offset = tex_align_offset;

        int ptx_version;
        cudaError_t error = cudaSuccess;
        if (CubDebug(error = PtxVersion(ptx_version))) return error;
        if (ptx_version >= 300)
        {
            // Use texture object
            cudaChannelFormatDesc   channel_desc = cudaCreateChannelDesc<T>();
            cudaResourceDesc        res_desc;
            cudaTextureDesc         tex_desc;
            memset(&res_desc, 0, sizeof(cudaResourceDesc));
            memset(&tex_desc, 0, sizeof(cudaTextureDesc));
            res_desc.resType                = cudaResourceTypeLinear;
            res_desc.res.linear.devPtr      = ptr;
            res_desc.res.linear.desc        = channel_desc;
            res_desc.res.linear.sizeInBytes = bytes;
            tex_desc.readMode               = cudaReadModeElementType;
            return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
        }
        else
        {
            // Use texture reference
            return TexIteratorRef<T>::BindTexture(ptr);
        }
    }

    /// Releases the binding made by BindTexture() (texture object or texture reference)
    cudaError_t UnbindTexture()
    {
        int ptx_version;
        cudaError_t error = cudaSuccess;
        if (CubDebug(error = PtxVersion(ptx_version))) return error;
        if (ptx_version < 300)
        {
            // Use texture reference
            return TexIteratorRef<T>::UnbindTexture();
        }
        else
        {
            // Use texture object
            return cudaDestroyTextureObject(tex_obj);
        }
    }

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    /// Prefix increment: advances, then returns the advanced iterator
    __host__ __device__ __forceinline__ self_type operator++()
    {
        ptr++;
        tex_align_offset++;
        return *this;
    }

    /// Postfix increment: advances, but returns the iterator as it was before
    __host__ __device__ __forceinline__ self_type operator++(int junk)
    {
        self_type previous = *this;
        ptr++;
        tex_align_offset++;
        return previous;
    }

    /// Indirection: reads the current element
    __host__ __device__ __forceinline__ reference operator*()
    {
#if (CUB_PTX_ARCH == 0)
        // Simply dereference the pointer on the host
        return *ptr;
#elif (CUB_PTX_ARCH < 300)
        // Use the texture reference
        return tex1Dfetch(TexIteratorRef<T>::ref, tex_align_offset);
#else
        // Use the texture object (this class has no conversion functor)
        return tex1Dfetch<T>(tex_obj, tex_align_offset);
#endif
    }

    /// Returns an iterator positioned \p n elements after this one
    template <typename SizeT>
    __host__ __device__ __forceinline__ self_type operator+(SizeT n)
    {
        // Start from a copy so the texture object handle travels with the result
        TexIteratorRA retval(*this);
        retval.ptr = ptr + n;
        retval.tex_align_offset = tex_align_offset + n;
        return retval;
    }

    /// Returns an iterator positioned \p n elements before this one
    template <typename SizeT>
    __host__ __device__ __forceinline__ self_type operator-(SizeT n)
    {
        // Start from a copy so the texture object handle travels with the result
        TexIteratorRA retval(*this);
        retval.ptr = ptr - n;
        retval.tex_align_offset = tex_align_offset - n;
        return retval;
    }

    /// Subscript: reads the element \p n positions ahead
    template <typename SizeT>
    __host__ __device__ __forceinline__ reference operator[](SizeT n)
    {
#if (CUB_PTX_ARCH == 0)
        // Simply dereference the pointer on the host
        return ptr[n];
#elif (CUB_PTX_ARCH < 300)
        // Use the texture reference
        return tex1Dfetch(TexIteratorRef<T>::ref, tex_align_offset + n);
#else
        // Use the texture object (this class has no conversion functor)
        return tex1Dfetch<T>(tex_obj, tex_align_offset + n);
#endif
    }

    /// Member access: the raw pointer to the current element.
    /// A texture fetch yields a temporary whose address cannot be returned, so
    /// every path hands back the tracked pointer.
    /// NOTE(review): assumes \p ptr is dereferenceable wherever this is called -- confirm.
    __host__ __device__ __forceinline__ pointer operator->()
    {
        return ptr;
    }

    /// Equality: both iterators track the same pointer
    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
    {
        return (ptr == rhs.ptr);
    }

    /// Inequality: the iterators track different pointers
    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
    {
        return (ptr != rhs.ptr);
    }

#endif // DOXYGEN_SHOULD_SKIP_THIS

};
+
508 
+
509 
+
/**
 * \brief Random-access iterator that reads \p InputType elements through
 * texture on the device (raw pointer on the host) and passes each one through
 * a conversion functor.
 *
 * The host-side BindTexture() must be called before device-side reads, and
 * UnbindTexture() releases the binding.
 *
 * \tparam OutputType    Type yielded by dereferencing the iterator
 * \tparam ConversionOp  Functor type, invoked as <tt>conversion_op(element)</tt>
 * \tparam InputType     Element type of the bound data
 */
template <typename OutputType, typename ConversionOp, typename InputType>
class TexTransformIteratorRA
{
public:

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    typedef TexTransformIteratorRA              self_type;
    typedef OutputType                          value_type;
    typedef OutputType                          reference;
    typedef OutputType*                         pointer;
    typedef std::random_access_iterator_tag     iterator_category;
    typedef int                                 difference_type;

#endif // DOXYGEN_SHOULD_SKIP_THIS

    /// Nested tag typedef.
    /// NOTE(review): presumably probed by a nested-type detector to recognise
    /// texture-bindable iterators -- confirm against the users of this tag.
    typedef void TexBindingTag;

private:

    ConversionOp        conversion_op;      ///< Functor applied on every read
    InputType*          ptr;                ///< Raw pointer to the current element
    size_t              tex_align_offset;   ///< Element index used for texture fetches
    cudaTextureObject_t tex_obj;            ///< Texture object handle (0 until bound)

public:

    /**
     * \brief Constructor: unbound iterator carrying \p conversion_op
     * \param conversion_op  Functor applied to each element read
     */
    __host__ __device__ __forceinline__ TexTransformIteratorRA(
        ConversionOp    conversion_op)
    :
        conversion_op(conversion_op),
        ptr(NULL),
        tex_align_offset(0),
        tex_obj(0)
    {}

    /**
     * \brief Binds the iterator to the memory at \p ptr.
     *
     * Creates a texture object over the linear range when the reported PTX
     * version is >= 300, otherwise binds through TexIteratorRef<InputType>.
     *
     * \param ptr               Pointer to the data to bind
     * \param bytes             Size of the bound range in bytes
     * \param tex_align_offset  Initial element index for texture fetches
     * \return The CUDA error from the version query or the binding call
     */
    cudaError_t BindTexture(
        InputType*      ptr,
        size_t          bytes,
        size_t          tex_align_offset = 0)
    {
        this->ptr = ptr;
        this->tex_align_offset = tex_align_offset;

        int ptx_version;
        cudaError_t error = cudaSuccess;
        if (CubDebug(error = PtxVersion(ptx_version))) return error;
        if (ptx_version >= 300)
        {
            // Use texture object
            cudaChannelFormatDesc   channel_desc = cudaCreateChannelDesc<InputType>();
            cudaResourceDesc        res_desc;
            cudaTextureDesc         tex_desc;
            memset(&res_desc, 0, sizeof(cudaResourceDesc));
            memset(&tex_desc, 0, sizeof(cudaTextureDesc));
            res_desc.resType                = cudaResourceTypeLinear;
            res_desc.res.linear.devPtr      = ptr;
            res_desc.res.linear.desc        = channel_desc;
            res_desc.res.linear.sizeInBytes = bytes;
            tex_desc.readMode               = cudaReadModeElementType;
            return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
        }
        else
        {
            // Use texture reference
            return TexIteratorRef<InputType>::BindTexture(ptr);
        }
    }

    /// Releases the binding made by BindTexture() (texture object or texture reference)
    cudaError_t UnbindTexture()
    {
        int ptx_version;
        cudaError_t error = cudaSuccess;
        if (CubDebug(error = PtxVersion(ptx_version))) return error;
        if (ptx_version >= 300)
        {
            // Use texture object
            return cudaDestroyTextureObject(tex_obj);
        }
        else
        {
            // Use texture reference
            return TexIteratorRef<InputType>::UnbindTexture();
        }
    }

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    /// Prefix increment: advances, then returns the advanced iterator
    __host__ __device__ __forceinline__ self_type operator++()
    {
        ptr++;
        tex_align_offset++;
        return *this;
    }

    /// Postfix increment: advances, but returns the iterator as it was before
    __host__ __device__ __forceinline__ self_type operator++(int junk)
    {
        self_type previous = *this;
        ptr++;
        tex_align_offset++;
        return previous;
    }

    /// Indirection: the converted value of the current element
    __host__ __device__ __forceinline__ reference operator*()
    {
#if (CUB_PTX_ARCH == 0)
        // Simply dereference the pointer on the host
        return conversion_op(*ptr);
#elif (CUB_PTX_ARCH < 300)
        // Use the texture reference
        return conversion_op(tex1Dfetch(TexIteratorRef<InputType>::ref, tex_align_offset));
#else
        // Use the texture object
        return conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset));
#endif
    }

    /// Returns an iterator positioned \p n elements after this one
    template <typename SizeT>
    __host__ __device__ __forceinline__ self_type operator+(SizeT n)
    {
        // Start from a copy so the texture object handle travels with the result
        TexTransformIteratorRA retval(*this);
        retval.ptr = ptr + n;
        retval.tex_align_offset = tex_align_offset + n;
        return retval;
    }

    /// Returns an iterator positioned \p n elements before this one
    template <typename SizeT>
    __host__ __device__ __forceinline__ self_type operator-(SizeT n)
    {
        // Start from a copy so the texture object handle travels with the result
        TexTransformIteratorRA retval(*this);
        retval.ptr = ptr - n;
        retval.tex_align_offset = tex_align_offset - n;
        return retval;
    }

    /// Subscript: the converted value of the element \p n positions ahead
    template <typename SizeT>
    __host__ __device__ __forceinline__ reference operator[](SizeT n)
    {
#if (CUB_PTX_ARCH == 0)
        // Simply dereference the pointer on the host
        return conversion_op(ptr[n]);
#elif (CUB_PTX_ARCH < 300)
        // Use the texture reference
        return conversion_op(tex1Dfetch(TexIteratorRef<InputType>::ref, tex_align_offset + n));
#else
        // Use the texture object
        return conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset + n));
#endif
    }

    /// Member access.
    /// NOTE(review): every path takes the address of the functor's result,
    /// which is only well-formed when ConversionOp returns an lvalue
    /// reference -- confirm whether any caller relies on it.
    __host__ __device__ __forceinline__ pointer operator->()
    {
#if (CUB_PTX_ARCH == 0)
        // Simply dereference the pointer on the host
        return &conversion_op(*ptr);
#elif (CUB_PTX_ARCH < 300)
        // Use the texture reference
        return &conversion_op(tex1Dfetch(TexIteratorRef<InputType>::ref, tex_align_offset));
#else
        // Use the texture object
        return &conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset));
#endif
    }

    /// Equality: both iterators track the same pointer
    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
    {
        return (ptr == rhs.ptr);
    }

    /// Inequality: the iterators track different pointers
    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
    {
        return (ptr != rhs.ptr);
    }

#endif // DOXYGEN_SHOULD_SKIP_THIS

};
+
711 
+
712 
+
713 
+
714  // end group UtilModule
+
716 
+
717 } // CUB namespace
+
718 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + + diff --git a/docs/html/util__type_8cuh_source.html b/docs/html/util__type_8cuh_source.html new file mode 100644 index 0000000000..0a661ad33a --- /dev/null +++ b/docs/html/util__type_8cuh_source.html @@ -0,0 +1,687 @@ + + + + + + + +CUB: util_type.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
util_type.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include <iostream>
+
37 #include <limits>
+
38 
+
39 #include "util_namespace.cuh"
+
40 
+
42 CUB_NS_PREFIX
+
43 
+
45 namespace cub {
+
46 
+
47 
+
55 /******************************************************************************
+
56  * Conditional types (compile-time type selection)
+
57  ******************************************************************************/
+
58 
+
/**
 * \brief Compile-time type selection: \p Type is \p ThenType when \p IF is
 * true, otherwise \p ElseType.
 */
template <bool IF, typename ThenType, typename ElseType>
struct If
{
    /// Selected type (primary template: the condition holds)
    typedef ThenType Type;
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/// Partial specialization for a false condition: selects \p ElseType
template <typename ThenType, typename ElseType>
struct If<false, ThenType, ElseType>
{
    typedef ElseType Type;
};

#endif // DOXYGEN_SHOULD_SKIP_THIS
+
78 
+
79 
+
80 /******************************************************************************
+
81  * Type equality
+
82  ******************************************************************************/
+
83 
+
84 
+
/**
 * \brief Compile-time type equality test: \p VALUE is 1 when \p A and \p B
 * are the same type, and \p NEGATE is its complement.
 */
template <typename A, typename B>
struct Equals
{
    // Primary template: the two types differ
    enum
    {
        VALUE   = 0,
        NEGATE  = 1
    };
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/// Partial specialization for identical types
template <typename A>
struct Equals <A, A>
{
    enum
    {
        VALUE   = 1,
        NEGATE  = 0
    };
};

#endif // DOXYGEN_SHOULD_SKIP_THIS
+
109 
+
110 
+
111 /******************************************************************************
+
112  * Marker types
+
113  ******************************************************************************/
+
114 
+
/**
 * \brief Empty marker type. Assigning any value to it is accepted and has no effect.
 */
struct NullType
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    /// Assignment from any type: discards the argument and returns this object
    template <typename T>
    __host__ __device__ __forceinline__ NullType& operator =(const T& b)
    {
        return *this;
    }

#endif // DOXYGEN_SHOULD_SKIP_THIS
};
+
125 
+
126 
+
/**
 * \brief Maps the integer constant \p A to a distinct type whose \p VALUE
 * enumerator equals \p A.
 */
template <int A>
struct Int2Type
{
    enum
    {
        VALUE = A
    };
};
+
135 
+
136 
+
137 /******************************************************************************
+
138  * Size and alignment
+
139  ******************************************************************************/
+
140 
+
141 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
142 
+
/**
 * \brief Derives word types from the padding \p T induces in a struct, and
 * declares raw-storage wrappers sized to hold a \p T.
 */
template <typename T>
struct WordAlignment
{
    /// Probe layout: a \p T followed by a single byte
    struct Pad
    {
        T       val;
        char    byte;
    };

    enum
    {
        /// Bytes by which Pad is larger than T
        ALIGN_BYTES = sizeof(Pad) - sizeof(T)
    };

    /// int when ALIGN_BYTES is a multiple of 4, else short when a multiple of 2, else char
    typedef typename If<
        (ALIGN_BYTES % 4 == 0),
        int,
        typename If<
            (ALIGN_BYTES % 2 == 0),
            short,
            char>::Type>::Type ShuffleWord;

    /// long long when ALIGN_BYTES is a multiple of 8, else ShuffleWord
    typedef typename If<
        (ALIGN_BYTES % 8 == 0),
        long long,
        ShuffleWord>::Type VolatileWord;

    /// longlong2 when ALIGN_BYTES is a multiple of 16, else long long when a multiple of 8, else ShuffleWord
    typedef typename If<
        (ALIGN_BYTES % 16 == 0),
        longlong2,
        typename If<
            (ALIGN_BYTES % 8 == 0),
            long long,                                 // needed to get heterogenous PODs to work on all platforms
            ShuffleWord>::Type>::Type DeviceWord;

    enum
    {
        /// Integer quotient of the DeviceWord size by the size of T
        DEVICE_MULTIPLE = sizeof(DeviceWord) / sizeof(T)
    };

    /// Byte array the size of T
    struct UninitializedBytes
    {
        char buf[sizeof(T)];
    };

    /// ShuffleWord array the size of T
    struct UninitializedShuffleWords
    {
        ShuffleWord buf[sizeof(T) / sizeof(ShuffleWord)];
    };

    /// VolatileWord array the size of T
    struct UninitializedVolatileWords
    {
        VolatileWord buf[sizeof(T) / sizeof(VolatileWord)];
    };

    /// DeviceWord array the size of T
    struct UninitializedDeviceWords
    {
        DeviceWord buf[sizeof(T) / sizeof(DeviceWord)];
    };

};
+
204 
+
205 
+
206 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
207 
+
208 
+
209 /******************************************************************************
+
210  * Wrapper types
+
211  ******************************************************************************/
+
212 
+
/**
 * \brief Raw storage that can be aliased as a \p T without running T's constructor.
 */
template <typename T>
struct Uninitialized
{
    /// Word type used for the backing storage
    typedef typename WordAlignment<T>::DeviceWord DeviceWord;

    enum
    {
        /// Number of DeviceWord elements that make up one T
        WORDS = sizeof(T) / sizeof(DeviceWord)
    };

    /// Backing storage.
    /// NOTE(review): this member line was missing from the listing and is
    /// reconstructed from WORDS and Alias() -- confirm against the header.
    DeviceWord storage[WORDS];

    /// Reinterprets this object as a reference to \p T
    __host__ __device__ __forceinline__ T& Alias()
    {
        return reinterpret_cast<T&>(*this);
    }
};
+
236 
+
237 
+
/**
 * \brief Struct wrapper around a fixed-size array of \p COUNT elements of
 * type \p T, so the array can be copied and passed by value.
 */
template <typename T, int COUNT>
struct ArrayWrapper
{
    /// The wrapped array
    T array[COUNT];
};
+
247 
+
248 
+
/**
 * \brief Pairs two buffer pointers with a selector naming the one currently in use.
 */
template <typename T>
struct DoubleBuffer
{
    /// The pair of buffer pointers
    T *d_buffers[2];

    /// Index into \p d_buffers of the buffer returned by Current()
    int selector;

    /// Constructor: both pointers NULL, selector 0
    __host__ __device__ __forceinline__ DoubleBuffer()
    {
        selector = 0;
        d_buffers[0] = NULL;
        d_buffers[1] = NULL;
    }

    /**
     * \brief Constructor
     * \param d_current    Stored at index 0, which is the initial selection
     * \param d_alternate  Stored at index 1
     */
    __host__ __device__ __forceinline__ DoubleBuffer(
        T *d_current,
        T *d_alternate)
    {
        selector = 0;
        d_buffers[0] = d_current;
        d_buffers[1] = d_alternate;
    }

    /// Returns the pointer at index \p selector
    __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; }
};
+
287 
+
288 
+
289 
+
290 /******************************************************************************
+
291  * Static math
+
292  ******************************************************************************/
+
293 
+
/**
 * \brief Compile-time base-2 logarithm of \p N, rounded up
 * (e.g. Log2<8>::VALUE == 3 and Log2<9>::VALUE == 4).
 *
 * Only \p N should be supplied; \p CURRENT_VAL and \p COUNT carry the
 * recursion state.
 */
template <int N, int CURRENT_VAL = N, int COUNT = 0>
struct Log2
{
    // Inductive case: halve the running value and count the step
    enum
    {
        VALUE = Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE
    };
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/// Base case: the running value has reached zero after \p COUNT halvings
template <int N, int COUNT>
struct Log2<N, 0, COUNT>
{
    // COUNT - 1 is the floor of log2(N); round up when N exceeds that power of two
    enum
    {
        VALUE = (1 << (COUNT - 1) < N) ?
            COUNT :
            COUNT - 1
    };
};

#endif // DOXYGEN_SHOULD_SKIP_THIS
+
317 
+
318 
+
/**
 * \brief Compile-time test of whether \p N has at most one bit set:
 * \p VALUE is 1 when <tt>(N & (N - 1)) == 0</tt>, otherwise 0.
 */
template <int N>
struct PowerOfTwo
{
    enum { VALUE = ((N & (N - 1)) == 0) };
};
+
327 
+
328 
+
329 
+
330 /******************************************************************************
+
331  * Pointer vs. iterator detection
+
332  ******************************************************************************/
+
333 
+
334 
+
/**
 * \brief Compile-time pointer test: \p VALUE is 1 when \p Tp is a pointer type.
 */
template <typename Tp>
struct IsPointer
{
    // Primary template: not a pointer
    enum
    {
        VALUE = 0
    };
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/// Partial specialization matching pointer types
template <typename Tp>
struct IsPointer<Tp*>
{
    enum
    {
        VALUE = 1
    };
};

#endif // DOXYGEN_SHOULD_SKIP_THIS
+
353 
+
354 
+
355 
+
356 /******************************************************************************
+
357  * Qualifier detection
+
358  ******************************************************************************/
+
359 
+
/**
 * \brief Compile-time volatile test: \p VALUE is 1 when \p Tp is volatile-qualified.
 */
template <typename Tp>
struct IsVolatile
{
    // Primary template: not volatile-qualified
    enum { VALUE = 0 };
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/// Partial specialization matching volatile-qualified types
template <typename Tp>
struct IsVolatile<Tp volatile>
{
    enum { VALUE = 1 };
};

#endif // DOXYGEN_SHOULD_SKIP_THIS
+
378 
+
379 
+
380 /******************************************************************************
+
381  * Qualifier removal
+
382  ******************************************************************************/
+
383 
+
/**
 * \brief Strips top-level \p const and/or \p volatile from \p Tp; the result
 * is the nested \p Type.
 *
 * Only \p Tp should be supplied; \p Up defaults to \p Tp and is the parameter
 * the partial specializations match against.
 */
template <typename Tp, typename Up = Tp>
struct RemoveQualifiers
{
    /// Unqualified type (primary template: nothing to strip)
    typedef Up Type;
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/// Strips \p volatile
template <typename Tp, typename Up>
struct RemoveQualifiers<Tp, volatile Up>
{
    typedef Up Type;
};

/// Strips \p const
template <typename Tp, typename Up>
struct RemoveQualifiers<Tp, const Up>
{
    typedef Up Type;
};

/// Strips <tt>const volatile</tt>
template <typename Tp, typename Up>
struct RemoveQualifiers<Tp, const volatile Up>
{
    typedef Up Type;
};

#endif // DOXYGEN_SHOULD_SKIP_THIS
+
418 
+
419 
+
420 
+
421 /******************************************************************************
+
422  * Typedef-detection
+
423  ******************************************************************************/
+
424 
+
425 
+
/**
 * \brief Defines a struct template <tt>detector_name<T></tt> whose \p VALUE is
 * non-zero when \p T declares a nested type named \p nested_type_name.
 */
#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name)      \
    template <typename T>                                                   \
    struct detector_name                                                    \
    {                                                                       \
        /* Viable only when C::nested_type_name names a type */             \
        template <typename C>                                               \
        static char& test(typename C::nested_type_name*);                   \
        /* Fallback overload */                                             \
        template <typename>                                                 \
        static int& test(...);                                              \
        /* The char& overload was chosen iff the result is smaller than int */ \
        enum                                                                \
        {                                                                   \
            VALUE = sizeof(test<T>(0)) < sizeof(int)                        \
        };                                                                  \
    };
+
442 
+
443 
+
444 
+
445 /******************************************************************************
+
446  * Simple enable-if (similar to Boost)
+
447  ******************************************************************************/
+
448 
+
/**
 * \brief Simple enable-if: exposes the nested \p Type (equal to \p T) only
 * when \p Condition is true.
 */
template <bool Condition, class T = void>
struct EnableIf
{
    /// Present only when the condition holds
    typedef T Type;
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/// Partial specialization for a false condition: deliberately has no \p Type
template <class T>
struct EnableIf<false, T>
{
};

#endif // DOXYGEN_SHOULD_SKIP_THIS
+
465 
+
466 
+
467 /******************************************************************************
+
468  * Binary-operator signature detection (index parameter)
+
469  ******************************************************************************/
+
470 
+
/**
 * \brief Detects whether \p BinaryOp declares a <tt>bool operator()</tt> that
 * takes two \p T arguments followed by an integer index.
 *
 * \p HAS_PARAM is true when the operator matches one of eight signatures:
 * arguments by const reference or by value, index as <tt>unsigned int</tt> or
 * <tt>int</tt>, const or non-const member.
 */
template <typename T, typename BinaryOp>
struct BinaryOpHasIdxParam
{
private:
    // Each SFINAEn can only be instantiated with a member pointer of its exact signature
    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx) const>  struct SFINAE1 {};
    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx)>        struct SFINAE2 {};
    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx) const>                struct SFINAE3 {};
    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx)>                      struct SFINAE4 {};

    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx) const>           struct SFINAE5 {};
    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx)>                 struct SFINAE6 {};
    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx) const>                         struct SFINAE7 {};
    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx)>                               struct SFINAE8 {};

    // One char-returning overload per accepted signature
    template <typename BinaryOpT> static char Test(SFINAE1<BinaryOpT, &BinaryOpT::operator()> *);
    template <typename BinaryOpT> static char Test(SFINAE2<BinaryOpT, &BinaryOpT::operator()> *);
    template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *);
    template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *);

    template <typename BinaryOpT> static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
    template <typename BinaryOpT> static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
    template <typename BinaryOpT> static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
    template <typename BinaryOpT> static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);

    // Fallback chosen when no signature matches
    template <typename BinaryOpT> static int Test(...);

public:

    /// True when one of the char-returning overloads was selected
    static const bool HAS_PARAM = sizeof(Test<BinaryOp>(NULL)) == sizeof(char);
};
+
505 
+
506 
+
507 
+
508 /******************************************************************************
+
509  * Simple type traits utilities.
+
510  *
+
511  * For example:
+
512  * Traits<int>::CATEGORY // SIGNED_INTEGER
+
513  * Traits<NullType>::NULL_TYPE // true
+
514  * Traits<uint4>::CATEGORY // NOT_A_NUMBER
+
515  * Traits<uint4>::PRIMITIVE; // false
+
516  *
+
517  ******************************************************************************/
+
518 
/**
 * \brief Basic type-traits categories
 */
enum Category
{
    NOT_A_NUMBER,
    SIGNED_INTEGER,
    UNSIGNED_INTEGER,
    FLOATING_POINT
};
+
529 
+
530 
+
/**
 * \brief Basic type traits: records the category and the primitive/null flags
 * passed as template arguments.
 */
template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits>
struct BaseTraits
{
    /// Category
    static const Category CATEGORY = _CATEGORY;
    enum
    {
        PRIMITIVE   = _PRIMITIVE,
        NULL_TYPE   = _NULL_TYPE
    };
};
+
545 
+
546 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
547 
+
/**
 * Basic type traits (unsigned primitive specialization)
 */
template <typename _UnsignedBits>
struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits>
{
    typedef _UnsignedBits       UnsignedBits;

    static const Category       CATEGORY    = UNSIGNED_INTEGER;
    static const UnsignedBits   MIN_KEY     = UnsignedBits(0);
    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1);

    enum
    {
        PRIMITIVE   = true,
        NULL_TYPE   = false,
    };

    /// Identity: the key is returned unchanged
    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
    {
        UnsignedBits unchanged = key;
        return unchanged;
    }

    /// Identity: the key is returned unchanged
    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
    {
        UnsignedBits unchanged = key;
        return unchanged;
    }
};
+
577 
+
578 
+
/**
 * Basic type traits (signed primitive specialization)
 */
template <typename _UnsignedBits>
struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits>
{
    typedef _UnsignedBits       UnsignedBits;

    static const Category       CATEGORY    = SIGNED_INTEGER;
    /// Mask with only the most significant bit of UnsignedBits set
    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
    static const UnsignedBits   MIN_KEY     = HIGH_BIT;
    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;

    enum
    {
        PRIMITIVE   = true,
        NULL_TYPE   = false,
    };

    /// Toggles the most significant bit of the key
    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
    {
        return key ^ HIGH_BIT;
    }

    /// Toggles the most significant bit of the key (undoes TwiddleIn)
    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
    {
        return key ^ HIGH_BIT;
    }

};
+
609 
+
610 
+
/**
 * Basic type traits (floating-point primitive specialization)
 */
template <typename _UnsignedBits>
struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits>
{
    typedef _UnsignedBits       UnsignedBits;

    static const Category       CATEGORY    = FLOATING_POINT;
    /// Mask with only the most significant bit of UnsignedBits set
    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
    static const UnsignedBits   MIN_KEY     = UnsignedBits(-1);
    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;

    /// Flips every bit when the high bit is set, otherwise flips only the high bit
    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
    {
        UnsignedBits mask;
        if (key & HIGH_BIT)
        {
            mask = UnsignedBits(-1);
        }
        else
        {
            mask = HIGH_BIT;
        }
        return key ^ mask;
    }

    /// Flips only the high bit when it is set, otherwise flips every bit (undoes TwiddleIn)
    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
    {
        UnsignedBits mask;
        if (key & HIGH_BIT)
        {
            mask = HIGH_BIT;
        }
        else
        {
            mask = UnsignedBits(-1);
        }
        return key ^ mask;
    }

    enum
    {
        PRIMITIVE   = true,
        NULL_TYPE   = false,
    };
};
+
642 
+
643 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
644 
+
645 
+
/**
 * \brief Numeric type traits. The primary template classifies \p T as
 * NOT_A_NUMBER and non-primitive; the specializations cover NullType and the
 * built-in arithmetic types.
 */
template <typename T>
struct NumericTraits :
    BaseTraits<NOT_A_NUMBER, false, false, T> {};

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

// Null marker type
template <> struct NumericTraits<NullType> :            BaseTraits<NOT_A_NUMBER, false, true, NullType> {};

// Plain char takes its category from the platform's signedness
template <> struct NumericTraits<char> :                BaseTraits<(std::numeric_limits<char>::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char> {};

// Signed integers
template <> struct NumericTraits<signed char> :         BaseTraits<SIGNED_INTEGER, true, false, unsigned char> {};
template <> struct NumericTraits<short> :               BaseTraits<SIGNED_INTEGER, true, false, unsigned short> {};
template <> struct NumericTraits<int> :                 BaseTraits<SIGNED_INTEGER, true, false, unsigned int> {};
template <> struct NumericTraits<long> :                BaseTraits<SIGNED_INTEGER, true, false, unsigned long> {};
template <> struct NumericTraits<long long> :           BaseTraits<SIGNED_INTEGER, true, false, unsigned long long> {};

// Unsigned integers
template <> struct NumericTraits<unsigned char> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned char> {};
template <> struct NumericTraits<unsigned short> :      BaseTraits<UNSIGNED_INTEGER, true, false, unsigned short> {};
template <> struct NumericTraits<unsigned int> :        BaseTraits<UNSIGNED_INTEGER, true, false, unsigned int> {};
template <> struct NumericTraits<unsigned long> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long> {};
template <> struct NumericTraits<unsigned long long> :  BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long> {};

// Floating point, paired with an unsigned integer type for the bit pattern
template <> struct NumericTraits<float> :               BaseTraits<FLOATING_POINT, true, false, unsigned int> {};
template <> struct NumericTraits<double> :              BaseTraits<FLOATING_POINT, true, false, unsigned long long> {};

#endif // DOXYGEN_SHOULD_SKIP_THIS
+
672 
+
673 
+
/**
 * \brief Type traits: the NumericTraits of \p T after its qualifiers are
 * removed by RemoveQualifiers.
 */
template <typename T>
struct Traits :
    NumericTraits<typename RemoveQualifiers<T>::Type>
{
};
+
679 
+
680 
+
681  // end group UtilModule
+
683 
+
684 } // CUB namespace
+
685 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + + diff --git a/docs/html/warp__reduce_8cuh_source.html b/docs/html/warp__reduce_8cuh_source.html new file mode 100644 index 0000000000..8505e532a5 --- /dev/null +++ b/docs/html/warp__reduce_8cuh_source.html @@ -0,0 +1,370 @@ + + + + + + + +CUB: warp_reduce.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
warp_reduce.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include "specializations/warp_reduce_shfl.cuh"
+
37 #include "specializations/warp_reduce_smem.cuh"
+
38 #include "../thread/thread_operators.cuh"
+
39 #include "../util_arch.cuh"
+
40 #include "../util_type.cuh"
+
41 #include "../util_namespace.cuh"
+
42 
+
44 CUB_NS_PREFIX
+
45 
+
47 namespace cub {
+
48 
+
49 
+
137 template <
+
138  typename T,
+
139  int LOGICAL_WARPS = 1,
+
140  int LOGICAL_WARP_THREADS = PtxArchProps::WARP_THREADS>
+ +
142 {
+
143 private:
+
144 
+
145  /******************************************************************************
+
146  * Constants and typedefs
+
147  ******************************************************************************/
+
148 
+
149  enum
+
150  {
+
151  POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
+
152  };
+
153 
+
154 public:
+
155 
+
156  #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
157 
+
159  typedef typename If<(CUB_PTX_ARCH >= 300) && ((LOGICAL_WARPS == 1) || POW_OF_TWO),
+
160  WarpReduceShfl<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS>,
+
161  WarpReduceSmem<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS> >::Type InternalWarpReduce;
+
162 
+
163  #endif // DOXYGEN_SHOULD_SKIP_THIS
+
164 
+
165 
+
166 private:
+
167 
+
169  typedef typename InternalWarpReduce::TempStorage _TempStorage;
+
170 
+
171 
+
172  /******************************************************************************
+
173  * Thread fields
+
174  ******************************************************************************/
+
175 
+
177  _TempStorage &temp_storage;
+
178 
+
180  int warp_id;
+
181 
+
183  int lane_id;
+
184 
+
185 
+
186  /******************************************************************************
+
187  * Utility methods
+
188  ******************************************************************************/
+
189 
+
191  __device__ __forceinline__ _TempStorage& PrivateStorage()
+
192  {
+
193  __shared__ TempStorage private_storage;
+
194  return private_storage;
+
195  }
+
196 
+
197 
+
198 public:
+
199 
+
201  struct TempStorage : Uninitialized<_TempStorage> {};
+
202 
+
203 
+
204  /******************************************************************/
+
208 
+
209 
+
214  __device__ __forceinline__ WarpReduce()
+
215  :
+
216  temp_storage(PrivateStorage()),
+
217  warp_id((LOGICAL_WARPS == 1) ?
+
218  0 :
+
219  threadIdx.x / LOGICAL_WARP_THREADS),
+
220  lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ?
+
221  LaneId() :
+
222  threadIdx.x % LOGICAL_WARP_THREADS)
+
223  {}
+
224 
+
225 
+
229  __device__ __forceinline__ WarpReduce(
+
230  TempStorage &temp_storage)
+
231  :
+
232  temp_storage(temp_storage.Alias()),
+
233  warp_id((LOGICAL_WARPS == 1) ?
+
234  0 :
+
235  threadIdx.x / LOGICAL_WARP_THREADS),
+
236  lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ?
+
237  LaneId() :
+
238  threadIdx.x % LOGICAL_WARP_THREADS)
+
239  {}
+
240 
+
241 
+
245  __device__ __forceinline__ WarpReduce(
+
246  int warp_id,
+
247  int lane_id)
+
248  :
+
249  temp_storage(PrivateStorage()),
+
250  warp_id(warp_id),
+
251  lane_id(lane_id)
+
252  {}
+
253 
+
254 
+
258  __device__ __forceinline__ WarpReduce(
+
259  TempStorage &temp_storage,
+
260  int warp_id,
+
261  int lane_id)
+
262  :
+
263  temp_storage(temp_storage.Alias()),
+
264  warp_id(warp_id),
+
265  lane_id(lane_id)
+
266  {}
+
267 
+
268 
+
269 
+
271  /******************************************************************/
+
275 
+
276 
+
309  __device__ __forceinline__ T Sum(
+
310  T input)
+
311  {
+
312  return InternalWarpReduce(temp_storage, warp_id, lane_id).Sum<true, 1>(input, LOGICAL_WARP_THREADS);
+
313  }
+
314 
+
352  __device__ __forceinline__ T Sum(
+
353  T input,
+
354  int valid_items)
+
355  {
+
356  // Determine if we don't need bounds checking
+
357  if (valid_items >= LOGICAL_WARP_THREADS)
+
358  {
+
359  return InternalWarpReduce(temp_storage, warp_id, lane_id).Sum<true, 1>(input, valid_items);
+
360  }
+
361  else
+
362  {
+
363  return InternalWarpReduce(temp_storage, warp_id, lane_id).Sum<false, 1>(input, valid_items);
+
364  }
+
365  }
+
366 
+
367 
+
405  template <
+
406  typename Flag>
+
407  __device__ __forceinline__ T HeadSegmentedSum(
+
408  T input,
+
409  Flag head_flag)
+
410  {
+
411  return HeadSegmentedReduce(input, head_flag, cub::Sum());
+
412  }
+
413 
+
414 
+
451  template <
+
452  typename Flag>
+
453  __device__ __forceinline__ T TailSegmentedSum(
+
454  T input,
+
455  Flag tail_flag)
+
456  {
+
457  return TailSegmentedReduce(input, tail_flag, cub::Sum());
+
458  }
+
459 
+
460 
+
461 
+
463  /******************************************************************/
+
467 
+
504  template <typename ReductionOp>
+
505  __device__ __forceinline__ T Reduce(
+
506  T input,
+
507  ReductionOp reduction_op)
+
508  {
+
509  return InternalWarpReduce(temp_storage, warp_id, lane_id).Reduce<true, 1>(input, LOGICAL_WARP_THREADS, reduction_op);
+
510  }
+
511 
+
552  template <typename ReductionOp>
+
553  __device__ __forceinline__ T Reduce(
+
554  T input,
+
555  ReductionOp reduction_op,
+
556  int valid_items)
+
557  {
+
558  // Determine if we don't need bounds checking
+
559  if (valid_items >= LOGICAL_WARP_THREADS)
+
560  {
+
561  return InternalWarpReduce(temp_storage, warp_id, lane_id).Reduce<true, 1>(input, valid_items, reduction_op);
+
562  }
+
563  else
+
564  {
+
565  return InternalWarpReduce(temp_storage, warp_id, lane_id).Reduce<false, 1>(input, valid_items, reduction_op);
+
566  }
+
567  }
+
568 
+
569 
+
608  template <
+
609  typename ReductionOp,
+
610  typename Flag>
+
611  __device__ __forceinline__ T HeadSegmentedReduce(
+
612  T input,
+
613  Flag head_flag,
+
614  ReductionOp reduction_op)
+
615  {
+
616  return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce<true>(input, head_flag, reduction_op);
+
617  }
+
618 
+
619 
+
658  template <
+
659  typename ReductionOp,
+
660  typename Flag>
+
661  __device__ __forceinline__ T TailSegmentedReduce(
+
662  T input,
+
663  Flag tail_flag,
+
664  ReductionOp reduction_op)
+
665  {
+
666  return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce<false>(input, tail_flag, reduction_op);
+
667  }
+
668 
+
669 
+
670 
+
672 };
+
673  // end group WarpModule
+
675 
+
676 } // CUB namespace
+
677 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + + diff --git a/docs/html/warp__scan_8cuh_source.html b/docs/html/warp__scan_8cuh_source.html new file mode 100644 index 0000000000..d8ae6defa0 --- /dev/null +++ b/docs/html/warp__scan_8cuh_source.html @@ -0,0 +1,525 @@ + + + + + + + +CUB: warp_scan.cuh Source File + + + + + + + + + + + + +
+
+ + + + + + +
+
CUB +
+
+
+ + + + + + +
+ All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups
+ + +
+ +
+ + +
+
+
+
warp_scan.cuh
+
+
+Go to the documentation of this file.
1 /******************************************************************************
+
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
+
3  * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+
4  *
+
5  * Redistribution and use in source and binary forms, with or without
+
6  * modification, are permitted provided that the following conditions are met:
+
7  * * Redistributions of source code must retain the above copyright
+
8  * notice, this list of conditions and the following disclaimer.
+
9  * * Redistributions in binary form must reproduce the above copyright
+
10  * notice, this list of conditions and the following disclaimer in the
+
11  * documentation and/or other materials provided with the distribution.
+
12  * * Neither the name of the NVIDIA CORPORATION nor the
+
13  * names of its contributors may be used to endorse or promote products
+
14  * derived from this software without specific prior written permission.
+
15  *
+
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
26  *
+
27  ******************************************************************************/
+
28 
+
34 #pragma once
+
35 
+
36 #include "specializations/warp_scan_shfl.cuh"
+
37 #include "specializations/warp_scan_smem.cuh"
+
38 #include "../thread/thread_operators.cuh"
+
39 #include "../util_arch.cuh"
+
40 #include "../util_type.cuh"
+
41 #include "../util_namespace.cuh"
+
42 
+
44 CUB_NS_PREFIX
+
45 
+
47 namespace cub {
+
48 
+
141 template <
+
142  typename T,
+
143  int LOGICAL_WARPS = 1,
+
144  int LOGICAL_WARP_THREADS = PtxArchProps::WARP_THREADS>
+
145 class WarpScan
+
146 {
+
147 private:
+
148 
+
149  /******************************************************************************
+
150  * Constants and typedefs
+
151  ******************************************************************************/
+
152 
+
153  enum
+
154  {
+
155  POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
+
156  };
+
157 
+
159  typedef typename If<(CUB_PTX_ARCH >= 300) && ((LOGICAL_WARPS == 1) || POW_OF_TWO),
+
160  WarpScanShfl<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS>,
+
161  WarpScanSmem<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS> >::Type InternalWarpScan;
+
162 
+
164  typedef typename InternalWarpScan::TempStorage _TempStorage;
+
165 
+
166 
+
167  /******************************************************************************
+
168  * Thread fields
+
169  ******************************************************************************/
+
170 
+
172  _TempStorage &temp_storage;
+
173 
+
175  int warp_id;
+
176 
+
178  int lane_id;
+
179 
+
180 
+
181  /******************************************************************************
+
182  * Utility methods
+
183  ******************************************************************************/
+
184 
+
186  __device__ __forceinline__ _TempStorage& PrivateStorage()
+
187  {
+
188  __shared__ TempStorage private_storage;
+
189  return private_storage;
+
190  }
+
191 
+
192 
+
193 public:
+
194 
+
196  struct TempStorage : Uninitialized<_TempStorage> {};
+
197 
+
198 
+
199  /******************************************************************/
+
203 
+
207  __device__ __forceinline__ WarpScan()
+
208  :
+
209  temp_storage(PrivateStorage()),
+
210  warp_id((LOGICAL_WARPS == 1) ?
+
211  0 :
+
212  threadIdx.x / LOGICAL_WARP_THREADS),
+
213  lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ?
+
214  LaneId() :
+
215  threadIdx.x % LOGICAL_WARP_THREADS)
+
216  {}
+
217 
+
218 
+
222  __device__ __forceinline__ WarpScan(
+
223  TempStorage &temp_storage)
+
224  :
+
225  temp_storage(temp_storage.Alias()),
+
226  warp_id((LOGICAL_WARPS == 1) ?
+
227  0 :
+
228  threadIdx.x / LOGICAL_WARP_THREADS),
+
229  lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ?
+
230  LaneId() :
+
231  threadIdx.x % LOGICAL_WARP_THREADS)
+
232  {}
+
233 
+
234 
+
238  __device__ __forceinline__ WarpScan(
+
239  int warp_id,
+
240  int lane_id)
+
241  :
+
242  temp_storage(PrivateStorage()),
+
243  warp_id(warp_id),
+
244  lane_id(lane_id)
+
245  {}
+
246 
+
247 
+
251  __device__ __forceinline__ WarpScan(
+
252  TempStorage &temp_storage,
+
253  int warp_id,
+
254  int lane_id)
+
255  :
+
256  temp_storage(temp_storage.Alias()),
+
257  warp_id(warp_id),
+
258  lane_id(lane_id)
+
259  {}
+
260 
+
261 
+
263  /******************************************************************/
+
267 
+
268 
+
300  __device__ __forceinline__ void InclusiveSum(
+
301  T input,
+
302  T &output)
+
303  {
+
304  InternalWarpScan(temp_storage, warp_id, lane_id).InclusiveSum(input, output);
+
305  }
+
306 
+
307 
+
342  __device__ __forceinline__ void InclusiveSum(
+
343  T input,
+
344  T &output,
+
345  T &warp_aggregate)
+
346  {
+
347  InternalWarpScan(temp_storage, warp_id, lane_id).InclusiveSum(input, output, warp_aggregate);
+
348  }
+
349 
+
350 
+
425  template <typename WarpPrefixOp>
+
426  __device__ __forceinline__ void InclusiveSum(
+
427  T input,
+
428  T &output,
+
429  T &warp_aggregate,
+
430  WarpPrefixOp &warp_prefix_op)
+
431  {
+
432  // Compute inclusive warp scan
+
433  InclusiveSum(input, output, warp_aggregate);
+
434 
+
435  // Compute warp-wide prefix from aggregate, then broadcast to other lanes
+
436  T prefix;
+
437  prefix = warp_prefix_op(warp_aggregate);
+
438  prefix = InternalWarpScan(temp_storage, warp_id, lane_id).Broadcast(prefix, 0);
+
439 
+
440  // Update output
+
441  output = prefix + output;
+
442  }
+
443 
+
445 
+
446 private:
+
447 
+
449  __device__ __forceinline__ void ExclusiveSum(T input, T &output, Int2Type<true> is_primitive)
+
450  {
+
451  // Compute exclusive warp scan from inclusive warp scan
+
452  T inclusive;
+
453  InclusiveSum(input, inclusive);
+
454  output = inclusive - input;
+
455  }
+
456 
+
458  __device__ __forceinline__ void ExclusiveSum(T input, T &output, Int2Type<false> is_primitive)
+
459  {
+
460  // Delegate to regular scan for non-primitive types (because we won't be able to use subtraction)
+
461  T identity = T();
+
462  ExclusiveScan(input, output, identity, Sum());
+
463  }
+
464 
+
466  __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, Int2Type<true> is_primitive)
+
467  {
+
468  // Compute exclusive warp scan from inclusive warp scan
+
469  T inclusive;
+
470  InclusiveSum(input, inclusive, warp_aggregate);
+
471  output = inclusive - input;
+
472  }
+
473 
+
475  __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, Int2Type<false> is_primitive)
+
476  {
+
477  // Delegate to regular scan for non-primitive types (because we won't be able to use subtraction)
+
478  T identity = T();
+
479  ExclusiveScan(input, output, identity, Sum(), warp_aggregate);
+
480  }
+
481 
+
483  template <typename WarpPrefixOp>
+
484  __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, WarpPrefixOp &warp_prefix_op, Int2Type<true> is_primitive)
+
485  {
+
486  // Compute exclusive warp scan from inclusive warp scan
+
487  T inclusive;
+
488  InclusiveSum(input, inclusive, warp_aggregate, warp_prefix_op);
+
489  output = inclusive - input;
+
490  }
+
491 
+
493  template <typename WarpPrefixOp>
+
494  __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, WarpPrefixOp &warp_prefix_op, Int2Type<false> is_primitive)
+
495  {
+
496  // Delegate to regular scan for non-primitive types (because we won't be able to use subtraction)
+
497  T identity = T();
+
498  ExclusiveScan(input, output, identity, Sum(), warp_aggregate, warp_prefix_op);
+
499  }
+
500 
+
501 public:
+
502 
+
503 
+
504  /******************************************************************/
+
508 
+
509 
+
547  __device__ __forceinline__ void ExclusiveSum(
+
548  T input,
+
549  T &output)
+
550  {
+
551  ExclusiveSum(input, output, Int2Type<Traits<T>::PRIMITIVE>());
+
552  }
+
553 
+
554 
+
592  __device__ __forceinline__ void ExclusiveSum(
+
593  T input,
+
594  T &output,
+
595  T &warp_aggregate)
+
596  {
+
597  ExclusiveSum(input, output, warp_aggregate, Int2Type<Traits<T>::PRIMITIVE>());
+
598  }
+
599 
+
600 
+
678  template <typename WarpPrefixOp>
+
679  __device__ __forceinline__ void ExclusiveSum(
+
680  T input,
+
681  T &output,
+
682  T &warp_aggregate,
+
683  WarpPrefixOp &warp_prefix_op)
+
684  {
+
685  ExclusiveSum(input, output, warp_aggregate, warp_prefix_op, Int2Type<Traits<T>::PRIMITIVE>());
+
686  }
+
687 
+
688 
+
690  /******************************************************************/
+
694 
+
730  template <typename ScanOp>
+
731  __device__ __forceinline__ void InclusiveScan(
+
732  T input,
+
733  T &output,
+
734  ScanOp scan_op)
+
735  {
+
736  InternalWarpScan(temp_storage, warp_id, lane_id).InclusiveScan(input, output, scan_op);
+
737  }
+
738 
+
739 
+
779  template <typename ScanOp>
+
780  __device__ __forceinline__ void InclusiveScan(
+
781  T input,
+
782  T &output,
+
783  ScanOp scan_op,
+
784  T &warp_aggregate)
+
785  {
+
786  InternalWarpScan(temp_storage, warp_id, lane_id).InclusiveScan(input, output, scan_op, warp_aggregate);
+
787  }
+
788 
+
789 
+
866  template <
+
867  typename ScanOp,
+
868  typename WarpPrefixOp>
+
869  __device__ __forceinline__ void InclusiveScan(
+
870  T input,
+
871  T &output,
+
872  ScanOp scan_op,
+
873  T &warp_aggregate,
+
874  WarpPrefixOp &warp_prefix_op)
+
875  {
+
876  // Compute inclusive warp scan
+
877  InclusiveScan(input, output, scan_op, warp_aggregate);
+
878 
+
879  // Compute warp-wide prefix from aggregate, then broadcast to other lanes
+
880  T prefix;
+
881  prefix = warp_prefix_op(warp_aggregate);
+
882  prefix = InternalWarpScan(temp_storage, warp_id, lane_id).Broadcast(prefix, 0);
+
883 
+
884  // Update output
+
885  output = scan_op(prefix, output);
+
886  }
+
887 
+
888 
+
890  /******************************************************************/
+
894 
+
930  template <typename ScanOp>
+
931  __device__ __forceinline__ void ExclusiveScan(
+
932  T input,
+
933  T &output,
+
934  T identity,
+
935  ScanOp scan_op)
+
936  {
+
937  InternalWarpScan(temp_storage, warp_id, lane_id).ExclusiveScan(input, output, identity, scan_op);
+
938  }
+
939 
+
940 
+
978  template <typename ScanOp>
+
979  __device__ __forceinline__ void ExclusiveScan(
+
980  T input,
+
981  T &output,
+
982  T identity,
+
983  ScanOp scan_op,
+
984  T &warp_aggregate)
+
985  {
+
986  InternalWarpScan(temp_storage, warp_id, lane_id).ExclusiveScan(input, output, identity, scan_op, warp_aggregate);
+
987  }
+
988 
+
989 
+
1066  template <
+
1067  typename ScanOp,
+
1068  typename WarpPrefixOp>
+
1069  __device__ __forceinline__ void ExclusiveScan(
+
1070  T input,
+
1071  T &output,
+
1072  T identity,
+
1073  ScanOp scan_op,
+
1074  T &warp_aggregate,
+
1075  WarpPrefixOp &warp_prefix_op)
+
1076  {
+
1077  // Exclusive warp scan
+
1078  ExclusiveScan(input, output, identity, scan_op, warp_aggregate);
+
1079 
+
1080  // Compute warp-wide prefix from aggregate, then broadcast to other lanes
+
1081  T prefix = warp_prefix_op(warp_aggregate);
+
1082  prefix = InternalWarpScan(temp_storage, warp_id, lane_id).Broadcast(prefix, 0);
+
1083 
+
1084  // Update output
+
1085  output = (lane_id == 0) ?
+
1086  prefix :
+
1087  scan_op(prefix, output);
+
1088  }
+
1089 
+
1090 
+
1092  /******************************************************************/
+
1096 
+
1097 
+
1134  template <typename ScanOp>
+
1135  __device__ __forceinline__ void ExclusiveScan(
+
1136  T input,
+
1137  T &output,
+
1138  ScanOp scan_op)
+
1139  {
+
1140  InternalWarpScan(temp_storage, warp_id, lane_id).ExclusiveScan(input, output, scan_op);
+
1141  }
+
1142 
+
1143 
+
1181  template <typename ScanOp>
+
1182  __device__ __forceinline__ void ExclusiveScan(
+
1183  T input,
+
1184  T &output,
+
1185  ScanOp scan_op,
+
1186  T &warp_aggregate)
+
1187  {
+
1188  InternalWarpScan(temp_storage, warp_id, lane_id).ExclusiveScan(input, output, scan_op, warp_aggregate);
+
1189  }
+
1190 
+
1191 
+
1268  template <
+
1269  typename ScanOp,
+
1270  typename WarpPrefixOp>
+
1271  __device__ __forceinline__ void ExclusiveScan(
+
1272  T input,
+
1273  T &output,
+
1274  ScanOp scan_op,
+
1275  T &warp_aggregate,
+
1276  WarpPrefixOp &warp_prefix_op)
+
1277  {
+
1278  // Exclusive warp scan
+
1279  ExclusiveScan(input, output, scan_op, warp_aggregate);
+
1280 
+
1281  // Compute warp-wide prefix from aggregate, then broadcast to other lanes
+
1282  T prefix = warp_prefix_op(warp_aggregate);
+
1283  prefix = InternalWarpScan(temp_storage, warp_id, lane_id).Broadcast(prefix, 0);
+
1284 
+
1285  // Update output with prefix
+
1286  output = (lane_id == 0) ?
+
1287  prefix :
+
1288  scan_op(prefix, output);
+
1289  }
+
1290 
+
1292 };
+
1293  // end group WarpModule
+
1295 
+
1296 } // CUB namespace
+
1297 CUB_NS_POSTFIX // Optional outer namespace(s)
+
+ + + + +