NVIDIA · PointKernel · May 7, 2026 · May 1, 2026 · May 5, 2026
@@ -830,60 +830,48 @@ class open_addressing_impl : private open_addressing_compatible<Key, Value, Prob
     using temp_allocator_type =
       typename std::allocator_traits<allocator_type>::template rebind_alloc<char>;
 
-    cuco::detail::index_type constexpr stride = std::numeric_limits<std::int32_t>::max();
-
-    cuco::detail::index_type h_num_out{0};
     auto temp_allocator = temp_allocator_type{this->allocator()};
     auto d_num_out =
       reinterpret_cast<size_type*>(temp_allocator.allocate(sizeof(size_type), stream));
 
-    // TODO: PR #580 to be reverted once https://github.com/NVIDIA/cccl/issues/1422 is resolved
-    for (cuco::detail::index_type offset = 0;
-         offset < static_cast<cuco::detail::index_type>(this->capacity());
-         offset += stride) {
-      auto const num_items =
-        std::min(static_cast<cuco::detail::index_type>(this->capacity()) - offset, stride);
-      auto const begin = cuda::make_transform_iterator(
-        cuda::counting_iterator{static_cast<size_type>(offset)},
-        detail::open_addressing_ns::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
-      auto const is_filled = detail::open_addressing_ns::slot_is_filled<has_payload, key_type>{
-        this->empty_key_sentinel(), this->erased_key_sentinel()};
-
-      std::size_t temp_storage_bytes = 0;
-
-      CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
-                                          temp_storage_bytes,
-                                          begin,
-                                          output_begin + h_num_out,
-                                          d_num_out,
-                                          static_cast<std::int32_t>(num_items),
-                                          is_filled,
-                                          stream.get()));
-
-      // Allocate temporary storage
-      auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes, stream);
-
-      CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
-                                          temp_storage_bytes,
-                                          begin,
-                                          output_begin + h_num_out,
-                                          d_num_out,
-                                          static_cast<std::int32_t>(num_items),
-                                          is_filled,
-                                          stream.get()));
-
-      size_type temp_count;
-      CUCO_CUDA_TRY(cuco::detail::memcpy_async(
-        &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
+    auto const begin = cuda::make_transform_iterator(
+      cuda::counting_iterator{size_type{0}},
+      detail::open_addressing_ns::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
+    auto const is_filled = detail::open_addressing_ns::slot_is_filled<has_payload, key_type>{
+      this->empty_key_sentinel(), this->erased_key_sentinel()};
+
+    std::size_t temp_storage_bytes = 0;
+
+    CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
+                                        temp_storage_bytes,
+                                        begin,
+                                        output_begin,
+                                        d_num_out,
+                                        this->capacity(),
+                                        is_filled,
+                                        stream.get()));
+
+    auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes, stream);
+
+    CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
+                                        temp_storage_bytes,
+                                        begin,
+                                        output_begin,
+                                        d_num_out,
+                                        this->capacity(),
+                                        is_filled,
+                                        stream.get()));
+
+    size_type h_num_out;
+    CUCO_CUDA_TRY(cuco::detail::memcpy_async(
+      &h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
-      stream.sync();
+    stream.sync();
 #else
-      stream.wait();
+    stream.wait();
 #endif
-      h_num_out += temp_count;
-      temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, stream);
-    }
 
+    temp_allocator.deallocate(d_temp_storage, temp_storage_bytes, stream);
     temp_allocator.deallocate(reinterpret_cast<char*>(d_num_out), sizeof(size_type), stream);
 
     return output_begin + h_num_out;