From 8a89b7899a7b9d5dc9ea7d764c07d607251b5dda Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Wed, 3 Jun 2026 10:55:20 +0200 Subject: [PATCH 01/10] [STF] Add C bindings for the places layer Extends the experimental STF C API to mirror the C++ places layer: - green_context_helper (create/destroy/count/device id) and green-context exec_place / data_place factories (CUDA 12.4+). - exec_place scope enter/exit (RAII context activation), affine data_place accessor, and grid sub-place accessor (get_place). - data_place stream-ordered allocate/deallocate and an allocation_is_stream_ordered query, plus machine_init. - task grid accessors: get_grid_dims and get_custream_at_index. Adds coverage in test_places.cpp. Extracted from the python-bindings PR to keep that change reviewable. --- .../stf/include/cccl/c/experimental/stf/stf.h | 140 +++++++- c/experimental/stf/src/stf.cu | 210 +++++++++++- c/experimental/stf/test/test_places.cpp | 322 ++++++++++++++++-- 3 files changed, 646 insertions(+), 26 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 247b35d9bbb..d5a80eb7953 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -106,6 +106,12 @@ typedef struct stf_exec_place_opaque_t* stf_exec_place_handle; //! \brief Opaque handle to a \c data_place. typedef struct stf_data_place_opaque_t* stf_data_place_handle; +//! \brief Opaque handle to a \c green_context_helper. +typedef struct stf_green_context_helper_opaque_t* stf_green_context_helper_handle; + +//! \brief Opaque handle to an active exec_place_scope (RAII context activation). +typedef struct stf_exec_place_scope_opaque_t* stf_exec_place_scope_handle; + //! \brief Opaque handle to an \c exec_place_resources registry. //! //! Handles returned by stf_exec_place_resources_create() are owned by the @@ -150,6 +156,19 @@ stf_exec_place_handle stf_exec_place_device(int dev_id); //! \brief Create execution place for the current CUDA device. stf_exec_place_handle stf_exec_place_current_device(void); +//! \brief Create a green-context helper for \p dev_id with \p sm_count SMs per green context. +//! Requires CUDA 12.4+. Returns NULL on failure. +stf_green_context_helper_handle stf_green_context_helper_create(int sm_count, int dev_id); + +//! \brief Destroy a green-context helper handle. +void stf_green_context_helper_destroy(stf_green_context_helper_handle h); + +//! \brief Number of green contexts created by \p h. +size_t stf_green_context_helper_get_count(stf_green_context_helper_handle h); + +//! \brief Device ordinal used by this green-context helper. +int stf_green_context_helper_get_device_id(stf_green_context_helper_handle h); + //! \brief Deep copy of an execution place handle (caller must stf_exec_place_destroy the result). stf_exec_place_handle stf_exec_place_clone(stf_exec_place_handle h); @@ -181,7 +200,20 @@ stf_exec_place_grid_create(const stf_exec_place_handle* places, size_t count, co //! \brief Same as stf_exec_place_destroy (grids are exec_place handles). void stf_exec_place_grid_destroy(stf_exec_place_handle grid); -//! \brief Create a fresh exec_place_resources registry for standalone place-layer use. +//! \brief Activate the sub-place at linear index \p idx (0 for scalar places). +//! Saves the current CUDA context; call stf_exec_place_scope_exit to restore. +//! \return Opaque scope handle, or NULL on failure. +stf_exec_place_scope_handle stf_exec_place_scope_enter(stf_exec_place_handle place, size_t idx); + +//! \brief Restore the CUDA context saved by stf_exec_place_scope_enter and destroy the scope. +//! \p scope may be NULL (no-op). +void stf_exec_place_scope_exit(stf_exec_place_scope_handle scope); + +//! \brief Get the affine data_place associated with this exec_place. +//! Caller must stf_data_place_destroy the result. +stf_data_place_handle stf_exec_place_get_affine_data_place(stf_exec_place_handle h); + +//! \brief Create a fresh, empty exec_place_resources registry. //! //! The registry lazily creates and owns stream pools for places used with //! stf_exec_place_pick_stream(). Destroying it releases every stream it owns. @@ -202,6 +234,21 @@ void stf_exec_place_resources_destroy(stf_exec_place_resources_handle h); //! finalized for a borrowed registry. CUstream stf_exec_place_pick_stream(stf_exec_place_resources_handle res, stf_exec_place_handle h, int for_computation); +//! \brief Get the sub-place at linear index \p idx. +//! For scalar places, \p idx must be 0. Returns NULL if \p idx is out of bounds. +//! Caller must stf_exec_place_destroy the result. +stf_exec_place_handle stf_exec_place_get_place(stf_exec_place_handle h, size_t idx); + +//! \brief Create an exec_place from green-context helper \p helper and view index \p idx. +//! If \p use_green_ctx_data_place is non-zero, set the affine data_place to a green-context data place. +//! Returns NULL on failure or if \p idx is out of range. +stf_exec_place_handle +stf_exec_place_green_ctx(stf_green_context_helper_handle helper, size_t idx, int use_green_ctx_data_place); + +//! \brief Initialize the machine singleton (P2P access, memory pool setup, topology). +//! Safe to call multiple times; only the first call has effect. +void stf_machine_init(void); + //! \brief Host (CPU/pinned) data placement. stf_data_place_handle stf_data_place_host(void); @@ -220,6 +267,10 @@ stf_data_place_handle stf_data_place_current_device(void); //! \brief Composite partitioned placement over a grid of execution places. stf_data_place_handle stf_data_place_composite(stf_exec_place_handle grid, stf_get_executor_fn mapper); +//! \brief Create a data_place from green-context helper \p helper and view index \p idx. +//! Returns NULL on failure or if \p idx is out of range. +stf_data_place_handle stf_data_place_green_ctx(stf_green_context_helper_handle helper, size_t idx); + //! \brief Deep copy (caller must stf_data_place_destroy). stf_data_place_handle stf_data_place_clone(stf_data_place_handle h); @@ -232,6 +283,35 @@ int stf_data_place_get_device_ordinal(stf_data_place_handle h); //! \brief Human-readable description; pointer valid until the next call on this thread. const char* stf_data_place_to_string(stf_data_place_handle h); +//! \brief Allocate \p size bytes at this data place. +//! +//! For device places the allocation is stream-ordered (cudaMallocAsync). +//! For host/managed places \p stream is ignored. +//! Returns NULL on failure (e.g. unsupported place type or out of memory). +//! +//! \param h Data place handle (must not be NULL) +//! \param size Allocation size in bytes +//! \param stream CUDA stream for stream-ordered allocation (may be NULL) +//! \return Pointer to allocated memory, or NULL on failure +void* stf_data_place_allocate(stf_data_place_handle h, ptrdiff_t size, cudaStream_t stream); + +//! \brief Deallocate memory previously obtained from stf_data_place_allocate(). +//! +//! For device places the deallocation is stream-ordered (cudaFreeAsync). +//! For host/managed places \p stream is ignored. +//! +//! \param h Data place handle (must not be NULL) +//! \param ptr Pointer returned by stf_data_place_allocate() +//! \param size Size of the original allocation in bytes +//! \param stream CUDA stream for stream-ordered deallocation (may be NULL) +void stf_data_place_deallocate(stf_data_place_handle h, void* ptr, size_t size, cudaStream_t stream); + +//! \brief Query whether allocations on this place are stream-ordered. +//! +//! \param h Data place handle (must not be NULL) +//! \return 1 if stream-ordered, 0 otherwise +int stf_data_place_allocation_is_stream_ordered(stf_data_place_handle h); + //! \} //! \defgroup Handles Opaque Handles @@ -1062,6 +1142,64 @@ void stf_task_destroy(stf_task_handle t); void stf_task_enable_capture(stf_task_handle t); +//! +//! \brief Get grid dimensions of a task's exec place +//! +//! When the task's execution place is a grid (size > 1), writes its +//! shape to \p out_dims. Returns 0 on success, non-zero if the task's +//! exec place is not a grid or \p out_dims is NULL. +//! +//! \param t Task handle +//! \param[out] out_dims On success, the grid shape (x, y, z, t) is written here. Must not be NULL. +//! \return 0 on success; non-zero if task exec place is not a grid or \p out_dims is NULL +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! +//! \note Total number of grid entries is out_dims->x * out_dims->y * out_dims->z * out_dims->t. +//! +//! \par Example: +//! \code +//! stf_task_start(task); +//! stf_dim4 dims; +//! if (stf_task_get_grid_dims(task, &dims) == 0) { +//! printf("Grid: %lu x %lu\n", dims.x, dims.y); +//! } +//! \endcode +//! +//! \see stf_task_get_custream_at_index() + +int stf_task_get_grid_dims(stf_task_handle t, stf_dim4* out_dims); + +//! +//! \brief Get the CUDA stream for a specific grid index +//! +//! When the task's exec place is a grid, returns the CUstream for the +//! given linear index (0 to product of grid dims - 1). +//! +//! \param t Task handle (must have been started; exec place must be a grid) +//! \param place_index Linear index in the grid (0-based; use stf_task_get_grid_dims to get shape) +//! \param[out] out_stream On success, the stream for that index is written here. Must not be NULL. +//! \return 0 on success; non-zero if task is not a grid, index out of range, or no per-index streams +//! +//! \pre t must be valid task handle +//! \pre stf_task_start() must have been called +//! +//! \par Example: +//! \code +//! stf_dim4 dims; +//! stf_task_get_grid_dims(task, &dims); +//! for (size_t i = 0; i < dims.x; ++i) { +//! CUstream s; +//! stf_task_get_custream_at_index(task, i, &s); +//! // launch work on stream s +//! } +//! \endcode +//! +//! \see stf_task_get_grid_dims() + +int stf_task_get_custream_at_index(stf_task_handle t, size_t place_index, CUstream* out_stream); + //! \} //! \defgroup CUDAKernel CUDA Kernel Interface diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 2068903963f..f982eced864 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -120,6 +120,16 @@ template { return static_cast(opaque_bits); } + else if constexpr (::std::is_same_v) + { + return static_cast(opaque_bits); + } +#if _CCCL_CTK_AT_LEAST(12, 4) + else if constexpr (::std::is_same_v) + { + return static_cast(opaque_bits); + } +#endif else { static_assert(stf_dependent_false_v

, "to_opaque: missing pointee -> handle pairing"); @@ -161,6 +171,16 @@ template { return static_cast(opaque_bits); } + else if constexpr (::std::is_same_v) + { + return static_cast(opaque_bits); + } +#if _CCCL_CTK_AT_LEAST(12, 4) + else if constexpr (::std::is_same_v) + { + return static_cast(opaque_bits); + } +#endif else { static_assert(stf_dependent_false_v, "from_opaque_const: missing handle -> pointee pairing"); @@ -198,6 +218,50 @@ stf_exec_place_handle stf_exec_place_current_device(void) })); } +stf_green_context_helper_handle stf_green_context_helper_create(int sm_count, int dev_id) +{ +#if _CCCL_CTK_AT_LEAST(12, 4) + return to_opaque(stf_try_allocate([sm_count, dev_id] { + return new green_context_helper(sm_count, dev_id); + })); +#else + (void) sm_count; + (void) dev_id; + return nullptr; +#endif +} + +void stf_green_context_helper_destroy(stf_green_context_helper_handle h) +{ +#if _CCCL_CTK_AT_LEAST(12, 4) + delete from_opaque(h); +#else + (void) h; +#endif +} + +size_t stf_green_context_helper_get_count(stf_green_context_helper_handle h) +{ +#if _CCCL_CTK_AT_LEAST(12, 4) + _CCCL_ASSERT(h != nullptr, "green_context_helper handle must not be null"); + return from_opaque(h)->get_count(); +#else + (void) h; + return 0; +#endif +} + +int stf_green_context_helper_get_device_id(stf_green_context_helper_handle h) +{ +#if _CCCL_CTK_AT_LEAST(12, 4) + _CCCL_ASSERT(h != nullptr, "green_context_helper handle must not be null"); + return static_cast(from_opaque(h)->get_device_id()); +#else + (void) h; + return -1; +#endif +} + stf_exec_place_handle stf_exec_place_clone(stf_exec_place_handle h) { _CCCL_ASSERT(h != nullptr, "exec_place handle must not be null"); @@ -281,6 +345,27 @@ void stf_exec_place_grid_destroy(stf_exec_place_handle grid) stf_exec_place_destroy(grid); } +stf_exec_place_scope_handle stf_exec_place_scope_enter(stf_exec_place_handle place, size_t idx) +{ + _CCCL_ASSERT(place != nullptr, "exec_place handle must not be null"); + return to_opaque(stf_try_allocate([&] { + return new exec_place_scope(*from_opaque(place), idx); + })); +} + +void stf_exec_place_scope_exit(stf_exec_place_scope_handle scope) +{ + delete from_opaque(scope); +} + +stf_data_place_handle stf_exec_place_get_affine_data_place(stf_exec_place_handle h) +{ + _CCCL_ASSERT(h != nullptr, "exec_place handle must not be null"); + return to_opaque(stf_try_allocate([h] { + return new data_place(from_opaque(h)->affine_data_place()); + })); +} + stf_exec_place_resources_handle stf_exec_place_resources_create(void) { return stf_try_allocate([] { @@ -317,7 +402,45 @@ CUstream stf_exec_place_pick_stream(stf_exec_place_resources_handle res, stf_exe { _CCCL_ASSERT(res != nullptr, "exec_place_resources handle must not be null"); _CCCL_ASSERT(h != nullptr, "exec_place handle must not be null"); - return reinterpret_cast(from_opaque(h)->pick_stream(*res->resources, for_computation != 0)); + return reinterpret_cast(from_opaque(h)->pick_stream(*from_opaque(res), for_computation != 0)); +} + +stf_exec_place_handle stf_exec_place_get_place(stf_exec_place_handle h, size_t idx) +{ + _CCCL_ASSERT(h != nullptr, "exec_place handle must not be null"); + if (idx >= from_opaque(h)->size()) + { + return nullptr; + } + return to_opaque(stf_try_allocate([h, idx] { + return new exec_place(from_opaque(h)->get_place(idx)); + })); +} + +stf_exec_place_handle +stf_exec_place_green_ctx(stf_green_context_helper_handle helper, size_t idx, int use_green_ctx_data_place) +{ +#if _CCCL_CTK_AT_LEAST(12, 4) + _CCCL_ASSERT(helper != nullptr, "green_context_helper handle must not be null"); + auto* gc_helper = from_opaque(helper); + if (idx >= gc_helper->get_count()) + { + return nullptr; + } + return to_opaque(stf_try_allocate([gc_helper, idx, use_green_ctx_data_place] { + return new exec_place(exec_place::green_ctx(gc_helper->get_view(idx), use_green_ctx_data_place != 0)); + })); +#else + (void) helper; + (void) idx; + (void) use_green_ctx_data_place; + return nullptr; +#endif +} + +void stf_machine_init(void) +{ + cuda::experimental::places::reserved::machine::instance(); } stf_data_place_handle stf_data_place_host(void) @@ -369,6 +492,25 @@ stf_data_place_handle stf_data_place_composite(stf_exec_place_handle grid, stf_g return to_opaque(dp); } +stf_data_place_handle stf_data_place_green_ctx(stf_green_context_helper_handle helper, size_t idx) +{ +#if _CCCL_CTK_AT_LEAST(12, 4) + _CCCL_ASSERT(helper != nullptr, "green_context_helper handle must not be null"); + auto* gc_helper = from_opaque(helper); + if (idx >= gc_helper->get_count()) + { + return nullptr; + } + return to_opaque(stf_try_allocate([gc_helper, idx] { + return new data_place(data_place::green_ctx(gc_helper->get_view(idx))); + })); +#else + (void) helper; + (void) idx; + return nullptr; +#endif +} + stf_data_place_handle stf_data_place_clone(stf_data_place_handle h) { _CCCL_ASSERT(h != nullptr, "data_place handle must not be null"); @@ -397,6 +539,37 @@ const char* stf_data_place_to_string(stf_data_place_handle h) return s.c_str(); } +void* stf_data_place_allocate(stf_data_place_handle h, ptrdiff_t size, cudaStream_t stream) +{ + _CCCL_ASSERT(h != nullptr, "data_place handle must not be null"); + try + { + return from_opaque(h)->allocate(static_cast<::std::ptrdiff_t>(size), stream); + } + catch (const ::std::exception& e) + { + fprintf(stderr, "stf_data_place_allocate failed: %s\n", e.what()); + return nullptr; + } + catch (...) + { + fprintf(stderr, "stf_data_place_allocate failed: unknown exception\n"); + return nullptr; + } +} + +void stf_data_place_deallocate(stf_data_place_handle h, void* ptr, size_t size, cudaStream_t stream) +{ + _CCCL_ASSERT(h != nullptr, "data_place handle must not be null"); + from_opaque(h)->deallocate(ptr, size, stream); +} + +int stf_data_place_allocation_is_stream_ordered(stf_data_place_handle h) +{ + _CCCL_ASSERT(h != nullptr, "data_place handle must not be null"); + return from_opaque(h)->allocation_is_stream_ordered() ? 1 : 0; +} + stf_ctx_handle stf_ctx_create(void) { return to_opaque(stf_try_allocate([] { @@ -702,6 +875,41 @@ CUstream stf_task_get_custream(stf_task_handle t) return static_cast(task_ptr->get_stream()); } +int stf_task_get_grid_dims(stf_task_handle t, stf_dim4* out_dims) +{ + if (t == nullptr || out_dims == nullptr) + { + return -1; + } + auto* task_ptr = from_opaque(t); + dim4 d; + if (!task_ptr->get_grid_dims(&d)) + { + return -1; + } + out_dims->x = static_cast(d.x); + out_dims->y = static_cast(d.y); + out_dims->z = static_cast(d.z); + out_dims->t = static_cast(d.t); + return 0; +} + +int stf_task_get_custream_at_index(stf_task_handle t, size_t place_index, CUstream* out_stream) +{ + if (t == nullptr || out_stream == nullptr) + { + return -1; + } + auto* task_ptr = from_opaque(t); + cudaStream_t s = task_ptr->get_stream(place_index); + if (s == nullptr) + { + return -1; + } + *out_stream = static_cast(s); + return 0; +} + void stf_task_destroy(stf_task_handle t) { _CCCL_ASSERT(t != nullptr, "task handle must not be null"); diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp index 9b690813862..6d97f683afe 100644 --- a/c/experimental/stf/test/test_places.cpp +++ b/c/experimental/stf/test/test_places.cpp @@ -231,60 +231,334 @@ C2H_TEST("composite data place with stf_exec_place_grid_create (vector of places } } -C2H_TEST("exec_place_pick_stream standalone resources", "[places][stream]") +C2H_TEST("task on exec_place_grid: get_grid_dims and get_custream_at_index", "[task][places][grid]") { + const size_t nplaces = 2; + stf_exec_place_handle places[2]; + for (size_t i = 0; i < nplaces; i++) + { + places[i] = stf_exec_place_device(0); + } + stf_exec_place_handle grid = stf_exec_place_grid_create(places, nplaces, nullptr); + REQUIRE(grid != nullptr); + for (size_t i = 0; i < nplaces; i++) + { + stf_exec_place_destroy(places[i]); + } + + stf_data_place_handle composite_dplace = stf_data_place_composite(grid, blocked_mapper_1d); + REQUIRE(composite_dplace != nullptr); + stf_exec_place_set_affine_data_place(grid, composite_dplace); + + stf_ctx_handle ctx = stf_ctx_create(); + REQUIRE(ctx != nullptr); + + std::vector X(4, 0.0f); + + stf_logical_data_handle lX = stf_logical_data(ctx, X.data(), X.size() * sizeof(float)); + REQUIRE(lX != nullptr); + + stf_task_handle t = stf_task_create(ctx); + REQUIRE(t != nullptr); + stf_task_set_exec_place(t, grid); + stf_task_add_dep(t, lX, STF_RW); + stf_task_start(t); + + stf_dim4 dims; + int got_dims = stf_task_get_grid_dims(t, &dims); + REQUIRE(got_dims == 0); + REQUIRE(dims.x == 2); + REQUIRE(dims.y == 1); + REQUIRE(dims.z == 1); + REQUIRE(dims.t == 1); + + CUstream s0, s1; + REQUIRE(stf_task_get_custream_at_index(t, 0, &s0) == 0); + REQUIRE(stf_task_get_custream_at_index(t, 1, &s1) == 0); + REQUIRE(s0 != nullptr); + REQUIRE(s1 != nullptr); + + stf_task_end(t); + stf_task_destroy(t); + + stf_data_place_destroy(composite_dplace); + stf_exec_place_grid_destroy(grid); + stf_logical_data_destroy(lX); + stf_ctx_finalize(ctx); +} + +C2H_TEST("task get_grid_dims returns error for non-grid exec_place", "[task][places][grid]") +{ + stf_ctx_handle ctx = stf_ctx_create(); + REQUIRE(ctx != nullptr); + + float val = 0.0f; + auto lX = stf_logical_data(ctx, &val, sizeof(float)); + auto e_dev0 = stf_exec_place_device(0); + + stf_task_handle t = stf_task_create(ctx); + REQUIRE(t != nullptr); + stf_task_set_exec_place(t, e_dev0); + stf_task_add_dep(t, lX, STF_RW); + stf_task_start(t); + + stf_dim4 dims; + REQUIRE(stf_task_get_grid_dims(t, &dims) != 0); + + stf_task_end(t); + stf_task_destroy(t); + + stf_exec_place_destroy(e_dev0); + stf_logical_data_destroy(lX); + stf_ctx_finalize(ctx); +} + +// ===== Place scope and accessor tests (task-free usage) ===== + +C2H_TEST("exec_place_scope enter/exit", "[places][scope]") +{ + stf_machine_init(); + stf_exec_place_handle dev0 = stf_exec_place_device(0); + REQUIRE(dev0 != nullptr); + + stf_exec_place_scope_handle scope = stf_exec_place_scope_enter(dev0, 0); + REQUIRE(scope != nullptr); + + stf_exec_place_scope_exit(scope); + stf_exec_place_scope_exit(nullptr); + + stf_exec_place_destroy(dev0); +} + +C2H_TEST("exec_place_scope nested", "[places][scope]") +{ + stf_machine_init(); + stf_exec_place_handle dev0 = stf_exec_place_device(0); + REQUIRE(dev0 != nullptr); + + stf_exec_place_scope_handle outer = stf_exec_place_scope_enter(dev0, 0); + REQUIRE(outer != nullptr); + + stf_exec_place_scope_handle inner = stf_exec_place_scope_enter(dev0, 0); + REQUIRE(inner != nullptr); + + stf_exec_place_scope_exit(inner); + stf_exec_place_scope_exit(outer); + + stf_exec_place_destroy(dev0); +} + +C2H_TEST("exec_place_get_affine_data_place", "[places][accessor]") +{ + stf_exec_place_handle dev0 = stf_exec_place_device(0); + REQUIRE(dev0 != nullptr); + + stf_data_place_handle dp = stf_exec_place_get_affine_data_place(dev0); + REQUIRE(dp != nullptr); + REQUIRE(stf_data_place_get_device_ordinal(dp) == 0); + + stf_data_place_destroy(dp); + stf_exec_place_destroy(dev0); +} + +C2H_TEST("exec_place_pick_stream standalone", "[places][scope][stream]") +{ + stf_machine_init(); + // Standalone use: no STF context required, just a registry the caller owns. stf_exec_place_resources_handle res = stf_exec_place_resources_create(); REQUIRE(res != nullptr); - stf_exec_place_handle place = stf_exec_place_current_device(); - REQUIRE(place != nullptr); + stf_exec_place_handle dev0 = stf_exec_place_device(0); + REQUIRE(dev0 != nullptr); - CUstream stream = stf_exec_place_pick_stream(res, place, /*for_computation=*/1); - REQUIRE(stream != nullptr); - REQUIRE(cudaStreamSynchronize(reinterpret_cast(stream)) == cudaSuccess); + stf_exec_place_scope_handle scope = stf_exec_place_scope_enter(dev0, 0); + REQUIRE(scope != nullptr); - stf_exec_place_destroy(place); + CUstream s = stf_exec_place_pick_stream(res, dev0, /*for_computation=*/1); + REQUIRE(s != nullptr); + + stf_exec_place_scope_exit(scope); + stf_exec_place_destroy(dev0); stf_exec_place_resources_destroy(res); } -C2H_TEST("exec_place resources are independent", "[places][stream]") +C2H_TEST("exec_place resources are independent", "[places][scope][stream]") { + stf_machine_init(); stf_exec_place_resources_handle res1 = stf_exec_place_resources_create(); stf_exec_place_resources_handle res2 = stf_exec_place_resources_create(); REQUIRE(res1 != nullptr); REQUIRE(res2 != nullptr); - stf_exec_place_handle place = stf_exec_place_current_device(); - REQUIRE(place != nullptr); + stf_exec_place_handle dev0 = stf_exec_place_device(0); + REQUIRE(dev0 != nullptr); + + stf_exec_place_scope_handle scope = stf_exec_place_scope_enter(dev0, 0); + REQUIRE(scope != nullptr); - CUstream stream1 = stf_exec_place_pick_stream(res1, place, /*for_computation=*/1); - CUstream stream2 = stf_exec_place_pick_stream(res2, place, /*for_computation=*/1); + CUstream stream1 = stf_exec_place_pick_stream(res1, dev0, /*for_computation=*/1); + CUstream stream2 = stf_exec_place_pick_stream(res2, dev0, /*for_computation=*/1); REQUIRE(stream1 != nullptr); REQUIRE(stream2 != nullptr); REQUIRE(stream1 != stream2); - stf_exec_place_destroy(place); + stf_exec_place_scope_exit(scope); + stf_exec_place_destroy(dev0); stf_exec_place_resources_destroy(res2); stf_exec_place_resources_destroy(res1); } -C2H_TEST("exec_place_pick_stream borrowed context resources", "[places][stream][ctx]") +C2H_TEST("exec_place_pick_stream borrowed from context", "[places][scope][stream][ctx]") { - stf_ctx_handle ctx = stf_ctx_create(); - REQUIRE(ctx != nullptr); - + stf_machine_init(); + stf_ctx_handle ctx = stf_ctx_create(); stf_exec_place_resources_handle res = stf_ctx_get_place_resources(ctx); REQUIRE(res != nullptr); - stf_exec_place_handle place = stf_exec_place_current_device(); - REQUIRE(place != nullptr); + stf_exec_place_handle dev0 = stf_exec_place_device(0); + stf_exec_place_scope_handle scope = stf_exec_place_scope_enter(dev0, 0); - CUstream stream = stf_exec_place_pick_stream(res, place, /*for_computation=*/1); - REQUIRE(stream != nullptr); - REQUIRE(cudaStreamSynchronize(reinterpret_cast(stream)) == cudaSuccess); + CUstream s = stf_exec_place_pick_stream(res, dev0, /*for_computation=*/1); + REQUIRE(s != nullptr); + stf_exec_place_scope_exit(scope); + stf_exec_place_destroy(dev0); + // `res` is a non-owning wrapper around context resources; destroy only the wrapper. stf_exec_place_resources_destroy(res); - - stf_exec_place_destroy(place); stf_ctx_finalize(ctx); } + +C2H_TEST("exec_place_get_place on grid", "[places][accessor][grid]") +{ + const size_t nplaces = 2; + int device_ids[2] = {0, 0}; + stf_exec_place_handle grid = stf_exec_place_grid_from_devices(device_ids, nplaces); + REQUIRE(grid != nullptr); + + stf_exec_place_handle sub0 = stf_exec_place_get_place(grid, 0); + stf_exec_place_handle sub1 = stf_exec_place_get_place(grid, 1); + REQUIRE(sub0 != nullptr); + REQUIRE(sub1 != nullptr); + REQUIRE(stf_exec_place_is_device(sub0) != 0); + REQUIRE(stf_exec_place_is_device(sub1) != 0); + + stf_exec_place_destroy(sub0); + stf_exec_place_destroy(sub1); + stf_exec_place_grid_destroy(grid); +} + +C2H_TEST("exec_place_get_place on scalar", "[places][accessor]") +{ + stf_exec_place_handle dev0 = stf_exec_place_device(0); + REQUIRE(dev0 != nullptr); + + stf_exec_place_handle sub = stf_exec_place_get_place(dev0, 0); + REQUIRE(sub != nullptr); + REQUIRE(stf_exec_place_is_device(sub) != 0); + + stf_exec_place_destroy(sub); + stf_exec_place_destroy(dev0); +} + +C2H_TEST("exec_place_get_place out of bounds", "[places][accessor]") +{ + stf_exec_place_handle dev0 = stf_exec_place_device(0); + REQUIRE(dev0 != nullptr); + REQUIRE(stf_exec_place_get_place(dev0, 1) == nullptr); + stf_exec_place_destroy(dev0); + + int device_ids[2] = {0, 0}; + stf_exec_place_handle grid = stf_exec_place_grid_from_devices(device_ids, 2); + REQUIRE(grid != nullptr); + REQUIRE(stf_exec_place_get_place(grid, 2) == nullptr); + stf_exec_place_grid_destroy(grid); +} + +C2H_TEST("machine_init idempotent", "[places][machine]") +{ + stf_machine_init(); + stf_machine_init(); +} + +C2H_TEST("data_place_allocate_device", "[places][allocate]") +{ + stf_exec_place_resources_handle res = stf_exec_place_resources_create(); + stf_exec_place_handle ep = stf_exec_place_device(0); + REQUIRE(ep != nullptr); + + stf_exec_place_scope_handle scope = stf_exec_place_scope_enter(ep, 0); + REQUIRE(scope != nullptr); + + CUstream stream = stf_exec_place_pick_stream(res, ep, /*for_computation=*/0); + stf_data_place_handle dplace = stf_exec_place_get_affine_data_place(ep); + REQUIRE(dplace != nullptr); + + void* ptr = stf_data_place_allocate(dplace, 1024, reinterpret_cast(stream)); + REQUIRE(ptr != nullptr); + + stf_data_place_deallocate(dplace, ptr, 1024, reinterpret_cast(stream)); + + stf_data_place_destroy(dplace); + stf_exec_place_scope_exit(scope); + stf_exec_place_destroy(ep); + stf_exec_place_resources_destroy(res); +} + +C2H_TEST("data_place_allocate_host", "[places][allocate]") +{ + stf_data_place_handle dplace = stf_data_place_host(); + REQUIRE(dplace != nullptr); + + void* ptr = stf_data_place_allocate(dplace, 256, nullptr); + REQUIRE(ptr != nullptr); + + int* buf = static_cast(ptr); + buf[0] = 42; + REQUIRE(buf[0] == 42); + + stf_data_place_deallocate(dplace, ptr, 256, nullptr); + stf_data_place_destroy(dplace); +} + +C2H_TEST("data_place_allocate_managed", "[places][allocate]") +{ + stf_data_place_handle dplace = stf_data_place_managed(); + REQUIRE(dplace != nullptr); + + void* ptr = stf_data_place_allocate(dplace, 512, nullptr); + REQUIRE(ptr != nullptr); + + int* buf = static_cast(ptr); + buf[0] = 99; + REQUIRE(buf[0] == 99); + + stf_data_place_deallocate(dplace, ptr, 512, nullptr); + stf_data_place_destroy(dplace); +} + +C2H_TEST("data_place_allocation_is_stream_ordered", "[places][allocate]") +{ + stf_data_place_handle dev = stf_data_place_device(0); + REQUIRE(dev != nullptr); + REQUIRE(stf_data_place_allocation_is_stream_ordered(dev) == 1); + stf_data_place_destroy(dev); + + stf_data_place_handle host = stf_data_place_host(); + REQUIRE(host != nullptr); + REQUIRE(stf_data_place_allocation_is_stream_ordered(host) == 0); + stf_data_place_destroy(host); + + stf_data_place_handle mgd = stf_data_place_managed(); + REQUIRE(mgd != nullptr); + REQUIRE(stf_data_place_allocation_is_stream_ordered(mgd) == 0); + stf_data_place_destroy(mgd); +} + +C2H_TEST("data_place_allocate_invalid_returns_null", "[places][allocate]") +{ + stf_data_place_handle inv = stf_data_place_affine(); + REQUIRE(inv != nullptr); + void* ptr = stf_data_place_allocate(inv, 64, nullptr); + REQUIRE(ptr == nullptr); + stf_data_place_destroy(inv); +} From 9e2a4b4592a279a033f24e008bd903b2117a2b55 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Wed, 3 Jun 2026 14:07:12 +0200 Subject: [PATCH 02/10] [STF] Harden places C API at the C/FFI boundary Address CodeRabbit review feedback: - stf_exec_place_scope_enter now rejects out-of-range indices with NULL, matching the contract of the neighboring index-based accessors. - stf_data_place_deallocate catches and maps C++ exceptions instead of letting them escape the extern "C" entry point. --- .../stf/include/cccl/c/experimental/stf/stf.h | 2 +- c/experimental/stf/src/stf.cu | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index d5a80eb7953..c0fb4f16662 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -202,7 +202,7 @@ void stf_exec_place_grid_destroy(stf_exec_place_handle grid); //! \brief Activate the sub-place at linear index \p idx (0 for scalar places). //! Saves the current CUDA context; call stf_exec_place_scope_exit to restore. -//! \return Opaque scope handle, or NULL on failure. +//! \return Opaque scope handle, or NULL on failure (including when \p idx is out of bounds). stf_exec_place_scope_handle stf_exec_place_scope_enter(stf_exec_place_handle place, size_t idx); //! \brief Restore the CUDA context saved by stf_exec_place_scope_enter and destroy the scope. diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index f982eced864..5ab0347a2bc 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -348,6 +348,10 @@ void stf_exec_place_grid_destroy(stf_exec_place_handle grid) stf_exec_place_scope_handle stf_exec_place_scope_enter(stf_exec_place_handle place, size_t idx) { _CCCL_ASSERT(place != nullptr, "exec_place handle must not be null"); + if (idx >= from_opaque(place)->size()) + { + return nullptr; + } return to_opaque(stf_try_allocate([&] { return new exec_place_scope(*from_opaque(place), idx); })); @@ -561,7 +565,18 @@ void* stf_data_place_allocate(stf_data_place_handle h, ptrdiff_t size, cudaStrea void stf_data_place_deallocate(stf_data_place_handle h, void* ptr, size_t size, cudaStream_t stream) { _CCCL_ASSERT(h != nullptr, "data_place handle must not be null"); - from_opaque(h)->deallocate(ptr, size, stream); + try + { + from_opaque(h)->deallocate(ptr, size, stream); + } + catch (const ::std::exception& e) + { + fprintf(stderr, "stf_data_place_deallocate failed: %s\n", e.what()); + } + catch (...) + { + fprintf(stderr, "stf_data_place_deallocate failed: unknown exception\n"); + } } int stf_data_place_allocation_is_stream_ordered(stf_data_place_handle h) From 3d7dd90e52575df64fc4b7dcbe637449a2580bb7 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Wed, 3 Jun 2026 14:14:59 +0200 Subject: [PATCH 03/10] [STF] Use range-based for loops in test_places (clang-tidy) Fix modernize-loop-convert clang-tidy errors by iterating the places array with range-based for loops instead of index-based loops. --- c/experimental/stf/test/test_places.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp index 6d97f683afe..9a7a8c68cf9 100644 --- a/c/experimental/stf/test/test_places.cpp +++ b/c/experimental/stf/test/test_places.cpp @@ -235,15 +235,15 @@ C2H_TEST("task on exec_place_grid: get_grid_dims and get_custream_at_index", "[t { const size_t nplaces = 2; stf_exec_place_handle places[2]; - for (size_t i = 0; i < nplaces; i++) + for (auto& place : places) { - places[i] = stf_exec_place_device(0); + place = stf_exec_place_device(0); } stf_exec_place_handle grid = stf_exec_place_grid_create(places, nplaces, nullptr); REQUIRE(grid != nullptr); - for (size_t i = 0; i < nplaces; i++) + for (auto& place : places) { - stf_exec_place_destroy(places[i]); + stf_exec_place_destroy(place); } stf_data_place_handle composite_dplace = stf_data_place_composite(grid, blocked_mapper_1d); From 929a1c45689ec72de9d9620ac5a9119eba515d50 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Fri, 5 Jun 2026 07:29:15 +0200 Subject: [PATCH 04/10] [STF] Add unified_task grid introspection used by places C API The places C bindings (stf_task_get_grid_dims / stf_task_get_custream_at_index) call get_grid_dims(dim4*) and get_stream(size_t) on context::unified_task<>, but those overloads were never declared on unified_task in this branch, so stf.cu failed to compile. Add both methods, dispatching the per-place stream to stream_task and returning nullptr/false for graph tasks or non-grid exec places. --- .../experimental/__stf/internal/context.cuh | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index f99c0115c33..6ca85908d30 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -321,6 +321,42 @@ public: }; } + /** When the task's exec place is a grid (size > 1), get the stream for the place at \p place_index + * (linear index). Returns nullptr for graph_task (no per-place streams). */ + cudaStream_t get_stream(size_t place_index) const + { + return payload->*[&](auto& self) -> cudaStream_t { + if constexpr (::std::is_same_v, ::std::decay_t>) + { + return self.get_stream(place_index); + } + else + { + (void) place_index; + return nullptr; + } + }; + } + + /** When the task's exec place is a grid (size > 1), write its shape to \p out_dims and return true; else return + * false. */ + bool get_grid_dims(dim4* out_dims) const + { + if (out_dims == nullptr) + { + return false; + } + return payload->*[&](auto& self) -> bool { + const exec_place& e = self.get_exec_place(); + if (e.size() <= 1) + { + return false; + } + *out_dims = e.get_dims(); + return true; + }; + } + // Get the underlying task base class - both stream_task and graph_task inherit from task. This is convenient when // we do not need the "typed" task, for example when using the "low-level" add_deps method. ::cuda::experimental::stf::task& get_base_task() From de602936a1def9df909e7003755af97bb79a3de6 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Fri, 5 Jun 2026 09:13:22 +0200 Subject: [PATCH 05/10] [STF] Bounds-check unified_task::get_stream(place_index) stream_task::get_stream(size_t) indexes the stream grid without any bounds check, so stf_task_get_custream_at_index could read past the grid for an out-of-range index (UB) and returned success for non-grid exec places, contradicting the documented contract (non-zero on "not a grid" / index out of range). Guard the linear index in the unified_task<> wrapper: return nullptr for graph tasks, non-grid exec places, and out-of-range indices. Add a regression check to the grid test for the out-of-range index case. --- c/experimental/stf/test/test_places.cpp | 4 ++++ .../cuda/experimental/__stf/internal/context.cuh | 10 +++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp index 9a7a8c68cf9..ddcfc5932e9 100644 --- a/c/experimental/stf/test/test_places.cpp +++ b/c/experimental/stf/test/test_places.cpp @@ -278,6 +278,10 @@ C2H_TEST("task on exec_place_grid: get_grid_dims and get_custream_at_index", "[t REQUIRE(s0 != nullptr); REQUIRE(s1 != nullptr); + // Out-of-range linear index must report an error rather than reading past the stream grid. + CUstream s_oob; + REQUIRE(stf_task_get_custream_at_index(t, 2, &s_oob) != 0); + stf_task_end(t); stf_task_destroy(t); diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index 6ca85908d30..9e8ab470fd6 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -322,12 +322,20 @@ public: } /** When the task's exec place is a grid (size > 1), get the stream for the place at \p place_index - * (linear index). Returns nullptr for graph_task (no per-place streams). */ + * (linear index). Returns nullptr for graph_task (no per-place streams), for non-grid exec places, or + * when \p place_index is out of range. */ cudaStream_t get_stream(size_t place_index) const { return payload->*[&](auto& self) -> cudaStream_t { if constexpr (::std::is_same_v, ::std::decay_t>) { + // Per-place streams only exist for grid exec places. stream_task::get_stream(size_t) indexes the + // stream grid without bounds checking, so guard the linear index here before forwarding. + const exec_place& e = self.get_exec_place(); + if (e.size() <= 1 || place_index >= e.size()) + { + return nullptr; + } return self.get_stream(place_index); } else From ca8081707e61f2e7209bffe664de4c6a872569b5 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Mon, 8 Jun 2026 07:26:13 +0200 Subject: [PATCH 06/10] [STF] Test green-context places C API Add direct C API coverage for green-context helper and green-context exec/data place factories so the extracted places bindings are self-contained. --- c/experimental/stf/test/test_places.cpp | 54 +++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp index ddcfc5932e9..06bc074bb49 100644 --- a/c/experimental/stf/test/test_places.cpp +++ b/c/experimental/stf/test/test_places.cpp @@ -10,6 +10,7 @@ #include +#include #include #include @@ -484,6 +485,59 @@ C2H_TEST("machine_init idempotent", "[places][machine]") stf_machine_init(); } +C2H_TEST("green_context_helper and green-context places", "[places][green_ctx]") +{ +#if !defined(CUDART_VERSION) || CUDART_VERSION < 12040 + REQUIRE(stf_green_context_helper_create(1, 0) == nullptr); +#else + stf_machine_init(); + stf_green_context_helper_handle helper = stf_green_context_helper_create(1, 0); + if (helper == nullptr) + { + SKIP("green context support is not available"); + } + + REQUIRE(stf_green_context_helper_get_device_id(helper) == 0); + const size_t count = stf_green_context_helper_get_count(helper); + REQUIRE(count >= 1); + + stf_exec_place_handle default_affine_ep = stf_exec_place_green_ctx(helper, 0, /*use_green_ctx_data_place=*/0); + REQUIRE(default_affine_ep != nullptr); + REQUIRE(stf_exec_place_is_device(default_affine_ep) != 0); + + stf_data_place_handle default_affine_dp = stf_exec_place_get_affine_data_place(default_affine_ep); + REQUIRE(default_affine_dp != nullptr); + REQUIRE(stf_data_place_get_device_ordinal(default_affine_dp) == 0); + + stf_exec_place_handle green_affine_ep = stf_exec_place_green_ctx(helper, 0, /*use_green_ctx_data_place=*/1); + REQUIRE(green_affine_ep != nullptr); + REQUIRE(stf_exec_place_is_device(green_affine_ep) != 0); + + stf_data_place_handle green_affine_dp = stf_exec_place_get_affine_data_place(green_affine_ep); + REQUIRE(green_affine_dp != nullptr); + REQUIRE(stf_data_place_get_device_ordinal(green_affine_dp) == 0); + const std::string green_affine_desc = stf_data_place_to_string(green_affine_dp); + REQUIRE(green_affine_desc.find("green_ctx") != std::string::npos); + + stf_data_place_handle green_dp = stf_data_place_green_ctx(helper, 0); + REQUIRE(green_dp != nullptr); + REQUIRE(stf_data_place_get_device_ordinal(green_dp) == 0); + REQUIRE(stf_data_place_allocation_is_stream_ordered(green_dp) == 1); + const std::string green_dp_desc = stf_data_place_to_string(green_dp); + REQUIRE(green_dp_desc.find("green_ctx") != std::string::npos); + + REQUIRE(stf_exec_place_green_ctx(helper, count, /*use_green_ctx_data_place=*/0) == nullptr); + REQUIRE(stf_data_place_green_ctx(helper, count) == nullptr); + + stf_data_place_destroy(green_dp); + stf_data_place_destroy(green_affine_dp); + stf_exec_place_destroy(green_affine_ep); + stf_data_place_destroy(default_affine_dp); + stf_exec_place_destroy(default_affine_ep); + stf_green_context_helper_destroy(helper); +#endif +} + C2H_TEST("data_place_allocate_device", "[places][allocate]") { stf_exec_place_resources_handle res = stf_exec_place_resources_create(); From 2aca459d30dac5a400a9b15ddb5a1adab8ed4f92 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Mon, 8 Jun 2026 18:07:03 +0200 Subject: [PATCH 07/10] [STF] Guard stf_machine_init at the C boundary machine::instance() does real work on first call (P2P/mempool/topology setup) and can throw. Wrap it in try/catch so a C++ exception never unwinds across the extern "C" boundary into a C caller (UB / terminate), matching the error-reporting convention used by stf_try_allocate. --- .../stf/include/cccl/c/experimental/stf/stf.h | 4 +++- c/experimental/stf/src/stf.cu | 20 ++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index c0fb4f16662..4389ff3fad2 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -246,7 +246,9 @@ stf_exec_place_handle stf_exec_place_green_ctx(stf_green_context_helper_handle helper, size_t idx, int use_green_ctx_data_place); //! \brief Initialize the machine singleton (P2P access, memory pool setup, topology). -//! Safe to call multiple times; only the first call has effect. +//! Safe to call multiple times; only the first call has effect. Any C++ exception +//! raised during initialization is caught and reported to stderr (never propagated +//! across the C boundary). void stf_machine_init(void); //! \brief Host (CPU/pinned) data placement. diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 5ab0347a2bc..fd5c1a25a76 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -444,7 +444,25 @@ stf_exec_place_green_ctx(stf_green_context_helper_handle helper, size_t idx, int void stf_machine_init(void) { - cuda::experimental::places::reserved::machine::instance(); + // machine::instance() does real work on first call (P2P/mempool/topology + // setup) and can throw. Guard the extern "C" boundary so a C++ exception + // never unwinds into a C caller (which would be UB / std::terminate). + try + { + cuda::experimental::places::reserved::machine::instance(); + } + catch (const ::std::exception& exc) + { + ::fflush(stdout); + ::std::fprintf(stderr, "\nEXCEPTION in STF C API (machine init): %s\n", exc.what()); + ::fflush(stderr); + } + catch (...) + { + ::fflush(stdout); + ::std::fprintf(stderr, "\nEXCEPTION in STF C API (machine init): non-standard exception\n"); + ::fflush(stderr); + } } stf_data_place_handle stf_data_place_host(void) From 55e0e84aa3dcbecc32231d215655776e1cc6b8eb Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Mon, 8 Jun 2026 18:13:01 +0200 Subject: [PATCH 08/10] [STF] Document allocate/deallocate size signedness rationale stf_data_place_allocate takes a signed ptrdiff_t while stf_data_place_deallocate takes an unsigned size_t. This mirrors the C++ allocator interface, where the requested size is passed by reference and negated to signal allocation failure; deallocation has no such error to signal. Document the asymmetry on both entry points so the C surface explains why the types differ. --- .../stf/include/cccl/c/experimental/stf/stf.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 4389ff3fad2..ebb13c11ca7 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -291,8 +291,14 @@ const char* stf_data_place_to_string(stf_data_place_handle h); //! For host/managed places \p stream is ignored. //! Returns NULL on failure (e.g. unsupported place type or out of memory). //! +//! \note \p size is signed (ptrdiff_t) to mirror the underlying C++ allocator +//! interface, where the requested size is passed by reference and negated to +//! signal allocation failure while preserving the requested amount. The matching +//! stf_data_place_deallocate() takes an unsigned size_t because at deallocation +//! the size is a known-good quantity with no error to signal. +//! //! \param h Data place handle (must not be NULL) -//! \param size Allocation size in bytes +//! \param size Allocation size in bytes (must be non-negative) //! \param stream CUDA stream for stream-ordered allocation (may be NULL) //! \return Pointer to allocated memory, or NULL on failure void* stf_data_place_allocate(stf_data_place_handle h, ptrdiff_t size, cudaStream_t stream); @@ -302,6 +308,10 @@ void* stf_data_place_allocate(stf_data_place_handle h, ptrdiff_t size, cudaStrea //! For device places the deallocation is stream-ordered (cudaFreeAsync). //! For host/managed places \p stream is ignored. //! +//! \note \p size is unsigned (size_t) on purpose: unlike stf_data_place_allocate(), +//! deallocation never signals failure through the size argument (see that +//! function's note), so it mirrors the unsigned C++ deallocate() signature. +//! //! \param h Data place handle (must not be NULL) //! \param ptr Pointer returned by stf_data_place_allocate() //! \param size Size of the original allocation in bytes From 9e9e70c6548229bbc2995a011b25132434b69ef9 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Mon, 8 Jun 2026 18:20:29 +0200 Subject: [PATCH 09/10] [STF] Fix Doxygen attachment on stf_task_get_* blocks The stf_task_get_grid_dims / stf_task_get_custream_at_index doc blocks opened with a lone //! line and had a blank line between the comment and the declaration, which can detach the comment from the symbol in doxygen. Drop the leading empty //! and the trailing blank line so each block binds to its function. --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index ebb13c11ca7..a7134cf3409 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1154,7 +1154,6 @@ void stf_task_destroy(stf_task_handle t); void stf_task_enable_capture(stf_task_handle t); -//! //! \brief Get grid dimensions of a task's exec place //! //! When the task's execution place is a grid (size > 1), writes its @@ -1180,10 +1179,8 @@ void stf_task_enable_capture(stf_task_handle t); //! \endcode //! //! \see stf_task_get_custream_at_index() - int stf_task_get_grid_dims(stf_task_handle t, stf_dim4* out_dims); -//! //! \brief Get the CUDA stream for a specific grid index //! //! When the task's exec place is a grid, returns the CUstream for the @@ -1209,7 +1206,6 @@ int stf_task_get_grid_dims(stf_task_handle t, stf_dim4* out_dims); //! \endcode //! //! \see stf_task_get_grid_dims() - int stf_task_get_custream_at_index(stf_task_handle t, size_t place_index, CUstream* out_stream); //! \} From ab35eae5daecf710feea97264f7bc7e383ddf89e Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Mon, 8 Jun 2026 18:23:15 +0200 Subject: [PATCH 10/10] [STF] Document grid sentinel/stream-0 behavior for task accessors Note that stf_task_get_grid_dims treats a single-element exec place as "not a grid" (returns non-zero), and that stf_task_get_custream_at_index leaves out_stream untouched on failure and never yields the legacy default stream (CUstream 0) on success, since STF grids use non-default streams. --- c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index a7134cf3409..c0c9d8f0f60 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -1168,6 +1168,8 @@ void stf_task_enable_capture(stf_task_handle t); //! \pre stf_task_start() must have been called //! //! \note Total number of grid entries is out_dims->x * out_dims->y * out_dims->z * out_dims->t. +//! \note A single-element exec place (size 1) is intentionally not treated as a grid: this +//! returns non-zero for it, consistent with stf_task_get_custream_at_index(). //! //! \par Example: //! \code @@ -1194,6 +1196,10 @@ int stf_task_get_grid_dims(stf_task_handle t, stf_dim4* out_dims); //! \pre t must be valid task handle //! \pre stf_task_start() must have been called //! +//! \note On success \p out_stream is set to the grid index's stream; on failure it is left +//! untouched and a non-zero code is returned. STF grids always use non-default streams, so a +//! valid result is never the legacy default stream (CUstream 0). +//! //! \par Example: //! \code //! stf_dim4 dims;