Conversation
| #if __cccl_ptx_isa >= 860 | ||
| struct WarpspeedPolicy | ||
| { |
There was a problem hiding this comment.
Note: I am entirely removing warpspeed scan from the old policy hub, since we have the new tuning API now and we did not ship warpspeed scan to any release yet. So this is not a breaking change.
| struct policy_selector_from_types | ||
| { | ||
| static constexpr int input_value_size = int{sizeof(InputValueT)}; | ||
| static constexpr int input_value_alignment = int{alignof(InputValueT)}; | ||
| static constexpr int output_value_size = int{sizeof(OutputValueT)}; | ||
| static constexpr int output_value_alignment = int{alignof(OutputValueT)}; | ||
| static constexpr int accum_size = int{sizeof(AccumT)}; | ||
| static constexpr int accum_alignment = int{alignof(AccumT)}; | ||
| static constexpr type_t input_type = classify_type<InputValueT>; |
There was a problem hiding this comment.
Note: It's not the policy selector's job to handle the type erasure required for CCCL.C, that's what we have the kernel source for.
There was a problem hiding this comment.
Doh! This was the missing piece to being able to straightforwardly make things easily constexpr, since values coming in as arguments can't do that. Wish I noticed it earlier 😅
| using policy_selector_t = detail::scan::policy_selector_from_types< | ||
| detail::it_value_t<InputIteratorT>, | ||
| detail::it_value_t<OutputIteratorT>, | ||
| AccumT, | ||
| OffsetT, | ||
| ScanOpT>; |
There was a problem hiding this comment.
Note: This was incorrect, since it ignored the user provided policy hub.
| REQUIRE(cudaSuccess == cudaGetDeviceProperties(&device_props, current_device)); | ||
|
|
||
| const auto target_block_size = | ||
| selector_t{}(cuda::to_arch_id(cuda::compute_capability{device_props.major, device_props.minor})).block_threads; |
There was a problem hiding this comment.
Note: It could be argued that we should not use a detail function in the unit tests, but we will probably expose ptx_arch_id, or the compute capability version, in the public API when we go public with the tuning API. So this objection would be temporary.
This comment has been minimized.
This comment has been minimized.
This comment has been minimized.
This comment has been minimized.
This comment has been minimized.
This comment has been minimized.
| }; | ||
|
|
||
| using MaxPolicy = Policy1200; | ||
| using MaxPolicy = Policy1000; |
| // 1); | ||
|
|
||
| warpspeed_policy.tile_size = warpspeed_policy.items_per_thread * squad_reduce_thread_count; | ||
| if (arch >= ::cuda::arch_id::sm_120 && operation_t == op_kind_t::other && is_arithmetic_type(input_type)) |
There was a problem hiding this comment.
Shouldn't is_arithmetic_type be fully qualified?
There was a problem hiding this comment.
we don't fully qualify in CUB yet.
| static_cast<int>(kernel_src.InputSize()), | ||
| static_cast<int>(kernel_src.InputAlign()), | ||
| static_cast<int>(kernel_src.OutputSize()), | ||
| static_cast<int>(kernel_src.OutputAlign()), | ||
| static_cast<int>(kernel_src.AccumSize()), | ||
| static_cast<int>(kernel_src.AccumAlign())); |
There was a problem hiding this comment.
my understanding is that everything here is at compile-time
| static_cast<int>(kernel_src.InputSize()), | |
| static_cast<int>(kernel_src.InputAlign()), | |
| static_cast<int>(kernel_src.OutputSize()), | |
| static_cast<int>(kernel_src.OutputAlign()), | |
| static_cast<int>(kernel_src.AccumSize()), | |
| static_cast<int>(kernel_src.AccumAlign())); | |
| int{kernel_src.InputSize()}, | |
| int{kernel_src.InputAlign()}, | |
| int{kernel_src.OutputSize()}, | |
| int{kernel_src.OutputAlign()}, | |
| int{kernel_src.AccumSize()}, | |
| int{kernel_src.AccumAlign()}); |
There was a problem hiding this comment.
It's only constexpr when called through the CUB API. It's just const when called through CCCL.C.
| // TODO(bgruber): put this somewhere else | ||
| constexpr _CCCL_HOST_DEVICE bool is_arithmetic_type(type_t type) | ||
| { | ||
| switch (type) |
There was a problem hiding this comment.
question. Do we really need this kind of dispatch instead of using a template type + cuda::std utilities?
There was a problem hiding this comment.
Unfortunately, yes. We need to be able to compile the entire dispatch and tuning without any types when coming from Python via CCCL.C.
029feca to
b46a654
Compare
This comment has been minimized.
This comment has been minimized.
b46a654 to
db404fb
Compare
🥳 CI Workflow Results 🟩 Finished in 1h 32m: Pass: 100%/255 | Total: 8d 11h | Max: 1h 25m | Hits: 71%/161009. See results here. |
griwes
left a comment
There was a problem hiding this comment.
I love the unification of the divergent constexpr/nonconstexpr paths.
| struct policy_selector_from_types | ||
| { | ||
| static constexpr int input_value_size = int{sizeof(InputValueT)}; | ||
| static constexpr int input_value_alignment = int{alignof(InputValueT)}; | ||
| static constexpr int output_value_size = int{sizeof(OutputValueT)}; | ||
| static constexpr int output_value_alignment = int{alignof(OutputValueT)}; | ||
| static constexpr int accum_size = int{sizeof(AccumT)}; | ||
| static constexpr int accum_alignment = int{alignof(AccumT)}; | ||
| static constexpr type_t input_type = classify_type<InputValueT>; |
There was a problem hiding this comment.
Doh! This was the missing piece to being able to straightforwardly make things easily constexpr, since values coming in as arguments can't do that. Wish I noticed it earlier 😅
These are some refactorings following the restructuring to support CCCL.C and the new tuning API: #7565
cub.bench.scan.exclusive.sum.baseon SM75;80;86;90;100;120