+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
40 #include "block/block_radix_sort_upsweep_tiles.cuh"
+
41 #include "block/block_radix_sort_downsweep_tiles.cuh"
+
42 #include "block/block_scan_tiles.cuh"
+
43 #include "../grid/grid_even_share.cuh"
+
44 #include "../util_debug.cuh"
+
45 #include "../util_device.cuh"
+
46 #include "../util_namespace.cuh"
+
+
+
+
+
+
54 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+
+
+
+
+
+
+
+
+
67 typename BlockRadixSortUpsweepTilesPolicy,
+
+
+
70 __launch_bounds__ (
int(BlockRadixSortUpsweepTilesPolicy::BLOCK_THREADS), 1)
+
71 __global__
void RadixSortUpsweepKernel(
+
+
+
+
+
76 bool use_primary_bit_granularity,
+
+
78 GridEvenShare<SizeT> even_share)
+
+
+
+
82 typedef typename BlockRadixSortUpsweepTilesPolicy::AltPolicy AltPolicy;
+
+
+
85 typedef BlockRadixSortUpsweepTiles<BlockRadixSortUpsweepTilesPolicy, Key, SizeT> BlockRadixSortUpsweepTilesT;
+
86 typedef BlockRadixSortUpsweepTiles<AltPolicy, Key, SizeT> AltBlockRadixSortUpsweepTilesT;
+
+
+
+
+
91 typename BlockRadixSortUpsweepTilesT::TempStorage pass_storage;
+
92 typename AltBlockRadixSortUpsweepTilesT::TempStorage alt_pass_storage;
+
+
+
+
96 even_share.BlockInit();
+
+
+
99 if (use_primary_bit_granularity)
+
+
+
+
103 BlockRadixSortUpsweepTilesT(temp_storage.pass_storage, d_keys, current_bit).ProcessTiles(
+
104 even_share.block_offset,
+
105 even_share.block_oob,
+
+
+
+
109 if (threadIdx.x < BlockRadixSortUpsweepTilesT::RADIX_DIGITS)
+
+
111 d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = bin_count;
+
+
+
+
+
+
+
+
119 AltBlockRadixSortUpsweepTilesT(temp_storage.alt_pass_storage, d_keys, current_bit).ProcessTiles(
+
120 even_share.block_offset,
+
121 even_share.block_oob,
+
+
+
+
125 if (threadIdx.x < AltBlockRadixSortUpsweepTilesT::RADIX_DIGITS)
+
+
127 d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = bin_count;
+
+
+
+
+
+
+
137 typename BlockScanTilesPolicy,
+
+
139 __launch_bounds__ (
int(BlockScanTilesPolicy::BLOCK_THREADS), 1)
+
140 __global__
void RadixSortScanKernel(
+
+
+
+
+
145 typedef BlockScanTiles<BlockScanTilesPolicy, SizeT*, SizeT*, cub::Sum, SizeT, SizeT> BlockScanTilesT;
+
+
+
148 __shared__
typename BlockScanTilesT::TempStorage temp_storage;
+
+
+
151 BlockScanTilesT block_scan(temp_storage, d_spine, d_spine,
cub::Sum(), SizeT(0)) ;
+
+
+
154 int block_offset = 0;
+
155 RunningBlockPrefixOp<SizeT> prefix_op;
+
156 prefix_op.running_total = 0;
+
157 while (block_offset < num_counts)
+
+
159 block_scan.ConsumeTile<
true,
false>(block_offset, prefix_op);
+
160 block_offset += BlockScanTilesT::TILE_ITEMS;
+
+
+
+
+
+
169 typename BlockRadixSortDownsweepTilesPolicy,
+
+
+
+
173 __launch_bounds__ (
int(BlockRadixSortDownsweepTilesPolicy::BLOCK_THREADS))
+
174 __global__
void RadixSortDownsweepKernel(
+
+
+
+
+
+
+
+
182 bool use_primary_bit_granularity,
+
+
+
185 GridEvenShare<SizeT> even_share)
+
+
+
+
189 typedef typename BlockRadixSortDownsweepTilesPolicy::AltPolicy AltPolicy;
+
+
+
192 typedef BlockRadixSortDownsweepTiles<BlockRadixSortDownsweepTilesPolicy, Key, Value, SizeT> BlockRadixSortDownsweepTilesT;
+
193 typedef BlockRadixSortDownsweepTiles<AltPolicy, Key, Value, SizeT> AltBlockRadixSortDownsweepTilesT;
+
+
+
+
+
198 typename BlockRadixSortDownsweepTilesT::TempStorage pass_storage;
+
199 typename AltBlockRadixSortDownsweepTilesT::TempStorage alt_pass_storage;
+
+
+
+
+
204 even_share.BlockInit();
+
+
206 if (use_primary_bit_granularity)
+
+
+
209 BlockRadixSortDownsweepTilesT(temp_storage.pass_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit).ProcessTiles(
+
210 even_share.block_offset,
+
211 even_share.block_oob);
+
+
+
+
+
216 AltBlockRadixSortDownsweepTilesT(temp_storage.alt_pass_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit).ProcessTiles(
+
217 even_share.block_offset,
+
218 even_share.block_oob);
+
+
+
+
+
223 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+
+
+
+
+
+
+
+
+
263 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+
+
+
+
+
+
271 struct KernelDispachParams
+
+
+
274 int items_per_thread;
+
275 cudaSharedMemConfig smem_config;
+
+
+
278 int subscription_factor;
+
+
+
281 template <
typename SortBlockPolicy>
+
282 __host__ __device__ __forceinline__
+
283 void InitUpsweepPolicy(
int subscription_factor = 1)
+
+
285 block_threads = SortBlockPolicy::BLOCK_THREADS;
+
286 items_per_thread = SortBlockPolicy::ITEMS_PER_THREAD;
+
287 radix_bits = SortBlockPolicy::RADIX_BITS;
+
288 alt_radix_bits = SortBlockPolicy::AltPolicy::RADIX_BITS;
+
289 smem_config = cudaSharedMemBankSizeFourByte;
+
290 this->subscription_factor = subscription_factor;
+
291 tile_size = block_threads * items_per_thread;
+
+
+
294 template <
typename ScanBlockPolicy>
+
295 __host__ __device__ __forceinline__
+
296 void InitScanPolicy()
+
+
298 block_threads = ScanBlockPolicy::BLOCK_THREADS;
+
299 items_per_thread = ScanBlockPolicy::ITEMS_PER_THREAD;
+
+
+
302 smem_config = cudaSharedMemBankSizeFourByte;
+
303 subscription_factor = 0;
+
304 tile_size = block_threads * items_per_thread;
+
+
+
307 template <
typename SortBlockPolicy>
+
308 __host__ __device__ __forceinline__
+
309 void InitDownsweepPolicy(
int subscription_factor = 1)
+
+
311 block_threads = SortBlockPolicy::BLOCK_THREADS;
+
312 items_per_thread = SortBlockPolicy::ITEMS_PER_THREAD;
+
313 radix_bits = SortBlockPolicy::RADIX_BITS;
+
314 alt_radix_bits = SortBlockPolicy::AltPolicy::RADIX_BITS;
+
315 smem_config = SortBlockPolicy::SMEM_CONFIG;
+
316 this->subscription_factor = subscription_factor;
+
317 tile_size = block_threads * items_per_thread;
+
+
+
+
+
+
+
+
+
+
328 template <
typename Key,
typename Value,
typename SizeT,
int ARCH>
+
329 struct TunedPolicies;
+
+
332 template <
typename Key,
typename Value,
typename SizeT>
+
333 struct TunedPolicies<Key, Value, SizeT, 350>
+
+
+
+
337 SCALE_FACTOR = (CUB_MAX(
sizeof(Key),
sizeof(Value)) + 3) / 4,
+
+
+
+
+
342 typedef BlockRadixSortUpsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR),
LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys;
+
343 typedef BlockRadixSortUpsweepTilesPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR),
LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs;
+
+
+
+
+
+
+
+
351 typedef BlockScanTilesPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+
+
354 typedef BlockRadixSortDownsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR),
BLOCK_LOAD_DIRECT,
LOAD_LDG,
false,
true,
BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
+
355 typedef BlockRadixSortDownsweepTilesPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR),
BLOCK_LOAD_DIRECT,
LOAD_LDG,
false,
true,
BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
+
+
+
+
+
+
+
+
363 enum { SUBSCRIPTION_FACTOR = 7 };
+
+
+
+
368 template <
typename Key,
typename Value,
typename SizeT>
+
369 struct TunedPolicies<Key, Value, SizeT, 200>
+
+
+
+
373 SCALE_FACTOR = (CUB_MAX(
sizeof(Key),
sizeof(Value)) + 3) / 4,
+
+
+
+
+
378 typedef BlockRadixSortUpsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR),
LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys;
+
379 typedef BlockRadixSortUpsweepTilesPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR),
LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs;
+
+
+
+
383 typedef BlockScanTilesPolicy <512, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+
+
386 typedef BlockRadixSortDownsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR),
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_DEFAULT,
false,
false,
BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys;
+
387 typedef BlockRadixSortDownsweepTilesPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR),
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_DEFAULT,
false,
false,
BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs;
+
+
+
390 enum { SUBSCRIPTION_FACTOR = 3 };
+
+
+
+
395 template <
typename Key,
typename Value,
typename SizeT>
+
396 struct TunedPolicies<Key, Value, SizeT, 100>
+
+
+
+
+
+
+
403 typedef BlockRadixSortUpsweepTilesPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS> UpsweepPolicy;
+
+
+
406 typedef BlockScanTilesPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+
+
409 typedef BlockRadixSortDownsweepTilesPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicy;
+
+
411 enum { SUBSCRIPTION_FACTOR = 3 };
+
+
+
+
+
+
+
+
+
421 template <
typename Key,
typename Value,
typename SizeT>
+
422 struct PtxDefaultPolicies
+
+
+
425 static const int PTX_TUNE_ARCH = (
CUB_PTX_ARCH >= 350) ?
+
+
+
+
+
+
+
432 typedef TunedPolicies<Key, Value, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;
+
+
+
435 struct UpsweepPolicy : PtxTunedPolicies::UpsweepPolicy {};
+
+
+
438 struct ScanPolicy : PtxTunedPolicies::ScanPolicy {};
+
+
+
441 struct DownsweepPolicy : PtxTunedPolicies::DownsweepPolicy {};
+
+
+
444 enum { SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR };
+
+
+
450 static void InitDispatchParams(
+
+
452 KernelDispachParams &upsweep_dispatch_params,
+
453 KernelDispachParams &scan_dispatch_params,
+
454 KernelDispachParams &downsweep_dispatch_params)
+
+
456 if (ptx_version >= 350)
+
+
458 typedef TunedPolicies<Key, Value, SizeT, 350> TunedPolicies;
+
459 upsweep_dispatch_params.InitUpsweepPolicy<
typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
460 scan_dispatch_params.InitScanPolicy<
typename TunedPolicies::ScanPolicy>();
+
461 downsweep_dispatch_params.InitDownsweepPolicy<
typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
+
463 else if (ptx_version >= 200)
+
+
465 typedef TunedPolicies<Key, Value, SizeT, 200> TunedPolicies;
+
466 upsweep_dispatch_params.InitUpsweepPolicy<
typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
467 scan_dispatch_params.InitScanPolicy<
typename TunedPolicies::ScanPolicy>();
+
468 downsweep_dispatch_params.InitDownsweepPolicy<
typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
+
+
+
472 typedef TunedPolicies<Key, Value, SizeT, 100> TunedPolicies;
+
473 upsweep_dispatch_params.InitUpsweepPolicy<
typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
474 scan_dispatch_params.InitScanPolicy<
typename TunedPolicies::ScanPolicy>();
+
475 downsweep_dispatch_params.InitDownsweepPolicy<
typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
+
+
+
+
+
+
+
+
+
+
+
+
490 typename UpsweepKernelPtr,
+
491 typename SpineKernelPtr,
+
492 typename DownsweepKernelPtr,
+
+
+
+
496 __host__ __device__ __forceinline__
+
497 static cudaError_t Dispatch(
+
498 void *d_temp_storage,
+
499 size_t &temp_storage_bytes,
+
500 UpsweepKernelPtr upsweep_kernel,
+
501 SpineKernelPtr scan_kernel,
+
502 DownsweepKernelPtr downsweep_kernel,
+
503 KernelDispachParams &upsweep_dispatch_params,
+
504 KernelDispachParams &scan_dispatch_params,
+
505 KernelDispachParams &downsweep_dispatch_params,
+
+
+
+
+
510 int end_bit =
sizeof(Key) * 8,
+
511 cudaStream_t stream = 0,
+
512 bool stream_synchronous =
false)
+
+
514 #ifndef CUB_RUNTIME_ENABLED
+
+
+
517 return CubDebug(cudaErrorNotSupported );
+
+
+
+
521 cudaError error = cudaSuccess;
+
+
+
+
+
526 if (
CubDebug(error = cudaGetDevice(&device_ordinal)))
break;
+
+
+
+
530 if (
CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)))
break;
+
+
+
533 int downsweep_sm_occupancy = CUB_MIN(
+
+
+
536 int upsweep_sm_occupancy = downsweep_sm_occupancy;
+
+
538 #ifndef __CUDA_ARCH__
+
+
+
541 if (
CubDebug(error = device_props.
Init(device_ordinal)))
break;
+
+
+
544 downsweep_sm_occupancy,
+
+
546 downsweep_dispatch_params.block_threads)))
break;
+
+
+
549 upsweep_sm_occupancy,
+
+
551 upsweep_dispatch_params.block_threads)))
break;
+
+
+
554 int downsweep_occupancy = downsweep_sm_occupancy * sm_count;
+
+
+
557 GridEvenShare<SizeT> even_share;
+
558 int max_downsweep_grid_size = downsweep_occupancy * downsweep_dispatch_params.subscription_factor;
+
559 int downsweep_grid_size;
+
560 even_share.GridInit(num_items, max_downsweep_grid_size, downsweep_dispatch_params.tile_size);
+
561 downsweep_grid_size = even_share.grid_size;
+
+
+
564 int bins = 1 << downsweep_dispatch_params.radix_bits;
+
565 int spine_size = downsweep_grid_size * bins;
+
566 int spine_tiles = (spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
+
567 spine_size = spine_tiles * scan_dispatch_params.tile_size;
+
+
569 int alt_bins = 1 << downsweep_dispatch_params.alt_radix_bits;
+
570 int alt_spine_size = downsweep_grid_size * alt_bins;
+
571 int alt_spine_tiles = (alt_spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
+
572 alt_spine_size = alt_spine_tiles * scan_dispatch_params.tile_size;
+
+
+
575 void* allocations[1];
+
576 size_t allocation_sizes[1] =
+
+
578 spine_size *
sizeof(SizeT),
+
+
+
+
582 if (
CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)))
break;
+
+
+
585 if (d_temp_storage == NULL)
+
+
+
+
589 SizeT *d_spine = (SizeT*) allocations[0];
+
+
591 #ifndef __CUDA_ARCH__
+
+
593 cudaSharedMemConfig original_smem_config;
+
594 if (
CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config)))
break;
+
595 cudaSharedMemConfig current_smem_config = original_smem_config;
+
+
+
598 int current_bit = begin_bit;
+
599 while (current_bit < end_bit)
+
+
+
602 int bits_remaining = end_bit - current_bit;
+
603 bool use_primary_bit_granularity = (bits_remaining % downsweep_dispatch_params.radix_bits == 0);
+
604 int radix_bits = (use_primary_bit_granularity) ?
+
605 downsweep_dispatch_params.radix_bits :
+
606 downsweep_dispatch_params.alt_radix_bits;
+
+
608 #ifndef __CUDA_ARCH__
+
+
610 if (current_smem_config != upsweep_dispatch_params.smem_config)
+
+
612 if (
CubDebug(error = cudaDeviceSetSharedMemConfig(upsweep_dispatch_params.smem_config)))
break;
+
613 current_smem_config = upsweep_dispatch_params.smem_config;
+
+
+
+
+
618 if (stream_synchronous)
+
619 CubLog(
"Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, selector %d, current bit %d, bit_grain %d\n",
+
620 downsweep_grid_size, upsweep_dispatch_params.block_threads, (
long long) stream, upsweep_dispatch_params.smem_config, upsweep_dispatch_params.items_per_thread, upsweep_sm_occupancy, d_keys.
selector, current_bit, radix_bits);
+
+
+
623 upsweep_kernel<<<downsweep_grid_size, upsweep_dispatch_params.block_threads, 0, stream>>>(
+
+
+
+
+
628 use_primary_bit_granularity,
+
629 (current_bit == begin_bit),
+
+
+
+
633 if (stream_synchronous && (
CubDebug(error = SyncStream(stream))))
break;
+
+
+
636 if (stream_synchronous)
CubLog(
"Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+
637 1, scan_dispatch_params.block_threads, (
long long) stream, scan_dispatch_params.items_per_thread);
+
+
+
640 scan_kernel<<<1, scan_dispatch_params.block_threads, 0, stream>>>(
+
+
642 (use_primary_bit_granularity) ? spine_size : alt_spine_size);
+
+
+
645 if (stream_synchronous && (
CubDebug(error = SyncStream(stream))))
break;
+
+
647 #ifndef __CUDA_ARCH__
+
+
649 if (current_smem_config != downsweep_dispatch_params.smem_config)
+
+
651 if (
CubDebug(error = cudaDeviceSetSharedMemConfig(downsweep_dispatch_params.smem_config)))
break;
+
652 current_smem_config = downsweep_dispatch_params.smem_config;
+
+
+
+
+
657 if (stream_synchronous)
CubLog(
"Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n",
+
658 downsweep_grid_size, downsweep_dispatch_params.block_threads, (
long long) stream, downsweep_dispatch_params.smem_config, downsweep_dispatch_params.items_per_thread, downsweep_sm_occupancy);
+
+
+
661 downsweep_kernel<<<downsweep_grid_size, downsweep_dispatch_params.block_threads, 0, stream>>>(
+
+
+
+
+
+
+
+
669 use_primary_bit_granularity,
+
670 (current_bit == begin_bit),
+
671 (current_bit + downsweep_dispatch_params.radix_bits >= end_bit),
+
+
+
+
675 if (stream_synchronous && (
CubDebug(error = SyncStream(stream))))
break;
+
+
+
+
+
+
+
682 current_bit += radix_bits;
+
+
+
685 #ifndef __CUDA_ARCH__
+
+
687 if (current_smem_config != original_smem_config)
+
+
689 if (
CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config)))
break;
+
+
+
+
+
+
+
+
+
698 #endif // CUB_RUNTIME_ENABLED
+
+
+
+
+
703 #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+
+
+
+
+
+
+
758 __host__ __device__ __forceinline__
+
+
760 void *d_temp_storage,
+
761 size_t &temp_storage_bytes,
+
+
+
+
+
766 int end_bit =
sizeof(Key) * 8,
+
767 cudaStream_t stream = 0,
+
768 bool stream_synchronous =
false)
+
+
+
+
+
+
774 typedef PtxDefaultPolicies<Key, Value, SizeT> PtxDefaultPolicies;
+
775 typedef typename PtxDefaultPolicies::UpsweepPolicy UpsweepPolicy;
+
776 typedef typename PtxDefaultPolicies::ScanPolicy ScanPolicy;
+
777 typedef typename PtxDefaultPolicies::DownsweepPolicy DownsweepPolicy;
+
+
779 cudaError error = cudaSuccess;
+
+
+
+
783 KernelDispachParams upsweep_dispatch_params;
+
784 KernelDispachParams scan_dispatch_params;
+
785 KernelDispachParams downsweep_dispatch_params;
+
+
+
+
789 upsweep_dispatch_params.InitUpsweepPolicy<UpsweepPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR);
+
790 scan_dispatch_params.InitScanPolicy<ScanPolicy>();
+
791 downsweep_dispatch_params.InitDownsweepPolicy<DownsweepPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR);
+
+
+
+
+
796 PtxDefaultPolicies::InitDispatchParams(
+
+
798 upsweep_dispatch_params,
+
799 scan_dispatch_params,
+
800 downsweep_dispatch_params);
+
+
+
+
+
+
806 RadixSortUpsweepKernel<UpsweepPolicy, Key, SizeT>,
+
807 RadixSortScanKernel<ScanPolicy, SizeT>,
+
808 RadixSortDownsweepKernel<DownsweepPolicy, Key, Value, SizeT>,
+
809 upsweep_dispatch_params,
+
810 scan_dispatch_params,
+
811 downsweep_dispatch_params,
+
+
+
+
+
+
+
818 stream_synchronous)))
break;
+
+
+
+
+
+
+
+
868 template <
typename Key>
+
869 __host__ __device__ __forceinline__
+
+
871 void *d_temp_storage,
+
872 size_t &temp_storage_bytes,
+
+
+
+
876 int end_bit =
sizeof(Key) * 8,
+
877 cudaStream_t stream = 0,
+
878 bool stream_synchronous =
false)
+
+
+
881 return SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream, stream_synchronous);
+
+
+
+
+
+
+
+
+
+