From 8a89b7899a7b9d5dc9ea7d764c07d607251b5dda Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Wed, 3 Jun 2026 10:55:20 +0200
Subject: [PATCH 01/10] [STF] Add C bindings for the places layer

Extends the experimental STF C API to mirror the C++ places layer:

- green_context_helper (create/destroy/count/device id) and green-context
  exec_place / data_place factories (CUDA 12.4+).
- exec_place scope enter/exit (RAII context activation), affine data_place
  accessor, and grid sub-place accessor (get_place).
- data_place stream-ordered allocate/deallocate and an
  allocation_is_stream_ordered query, plus machine_init.
- task grid accessors: get_grid_dims and get_custream_at_index.

Adds coverage in test_places.cpp. Extracted from the python-bindings PR
to keep that change reviewable.
---
 .../stf/include/cccl/c/experimental/stf/stf.h | 140 +++++++-
 c/experimental/stf/src/stf.cu                 | 210 +++++++++++-
 c/experimental/stf/test/test_places.cpp       | 322 ++++++++++++++++--
 3 files changed, 646 insertions(+), 26 deletions(-)

diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
index 247b35d9bbb..d5a80eb7953 100644
--- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
+++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
@@ -106,6 +106,12 @@ typedef struct stf_exec_place_opaque_t* stf_exec_place_handle;
 //! \brief Opaque handle to a \c data_place.
 typedef struct stf_data_place_opaque_t* stf_data_place_handle;
 
+//! \brief Opaque handle to a \c green_context_helper.
+typedef struct stf_green_context_helper_opaque_t* stf_green_context_helper_handle;
+
+//! \brief Opaque handle to an active exec_place_scope (RAII context activation).
+typedef struct stf_exec_place_scope_opaque_t* stf_exec_place_scope_handle;
+
 //! \brief Opaque handle to an \c exec_place_resources registry.
 //!
 //! Handles returned by stf_exec_place_resources_create() are owned by the
@@ -150,6 +156,19 @@ stf_exec_place_handle stf_exec_place_device(int dev_id);
 //! \brief Create execution place for the current CUDA device.
 stf_exec_place_handle stf_exec_place_current_device(void);
 
+//! \brief Create a green-context helper for \p dev_id with \p sm_count SMs per green context.
+//! Requires CUDA 12.4+. Returns NULL on failure.
+stf_green_context_helper_handle stf_green_context_helper_create(int sm_count, int dev_id);
+
+//! \brief Destroy a green-context helper handle.
+void stf_green_context_helper_destroy(stf_green_context_helper_handle h);
+
+//! \brief Number of green contexts created by \p h.
+size_t stf_green_context_helper_get_count(stf_green_context_helper_handle h);
+
+//! \brief Device ordinal used by this green-context helper.
+int stf_green_context_helper_get_device_id(stf_green_context_helper_handle h);
+
 //! \brief Deep copy of an execution place handle (caller must stf_exec_place_destroy the result).
 stf_exec_place_handle stf_exec_place_clone(stf_exec_place_handle h);
 
@@ -181,7 +200,20 @@ stf_exec_place_grid_create(const stf_exec_place_handle* places, size_t count, co
 //! \brief Same as stf_exec_place_destroy (grids are exec_place handles).
 void stf_exec_place_grid_destroy(stf_exec_place_handle grid);
 
-//! \brief Create a fresh exec_place_resources registry for standalone place-layer use.
+//! \brief Activate the sub-place at linear index \p idx (0 for scalar places).
+//! Saves the current CUDA context; call stf_exec_place_scope_exit to restore.
+//! \return Opaque scope handle, or NULL on failure.
+stf_exec_place_scope_handle stf_exec_place_scope_enter(stf_exec_place_handle place, size_t idx);
+
+//! \brief Restore the CUDA context saved by stf_exec_place_scope_enter and destroy the scope.
+//! \p scope may be NULL (no-op).
+void stf_exec_place_scope_exit(stf_exec_place_scope_handle scope);
+
+//! \brief Get the affine data_place associated with this exec_place.
+//! Caller must stf_data_place_destroy the result.
+stf_data_place_handle stf_exec_place_get_affine_data_place(stf_exec_place_handle h);
+
+//! \brief Create a fresh, empty exec_place_resources registry.
 //!
 //! The registry lazily creates and owns stream pools for places used with
 //! stf_exec_place_pick_stream(). Destroying it releases every stream it owns.
@@ -202,6 +234,21 @@ void stf_exec_place_resources_destroy(stf_exec_place_resources_handle h);
 //! finalized for a borrowed registry.
 CUstream stf_exec_place_pick_stream(stf_exec_place_resources_handle res, stf_exec_place_handle h, int for_computation);
 
+//! \brief Get the sub-place at linear index \p idx.
+//! For scalar places, \p idx must be 0. Returns NULL if \p idx is out of bounds.
+//! Caller must stf_exec_place_destroy the result.
+stf_exec_place_handle stf_exec_place_get_place(stf_exec_place_handle h, size_t idx);
+
+//! \brief Create an exec_place from green-context helper \p helper and view index \p idx.
+//! If \p use_green_ctx_data_place is non-zero, set the affine data_place to a green-context data place.
+//! Returns NULL on failure or if \p idx is out of range.
+stf_exec_place_handle
+stf_exec_place_green_ctx(stf_green_context_helper_handle helper, size_t idx, int use_green_ctx_data_place);
+
+//! \brief Initialize the machine singleton (P2P access, memory pool setup, topology).
+//! Safe to call multiple times; only the first call has effect.
+void stf_machine_init(void);
+
 //! \brief Host (CPU/pinned) data placement.
 stf_data_place_handle stf_data_place_host(void);
 
@@ -220,6 +267,10 @@ stf_data_place_handle stf_data_place_current_device(void);
 //! \brief Composite partitioned placement over a grid of execution places.
 stf_data_place_handle stf_data_place_composite(stf_exec_place_handle grid, stf_get_executor_fn mapper);
 
+//! \brief Create a data_place from green-context helper \p helper and view index \p idx.
+//! Returns NULL on failure or if \p idx is out of range.
+stf_data_place_handle stf_data_place_green_ctx(stf_green_context_helper_handle helper, size_t idx);
+
 //! \brief Deep copy (caller must stf_data_place_destroy).
 stf_data_place_handle stf_data_place_clone(stf_data_place_handle h);
 
@@ -232,6 +283,35 @@ int stf_data_place_get_device_ordinal(stf_data_place_handle h);
 //! \brief Human-readable description; pointer valid until the next call on this thread.
 const char* stf_data_place_to_string(stf_data_place_handle h);
 
+//! \brief Allocate \p size bytes at this data place.
+//!
+//! For device places the allocation is stream-ordered (cudaMallocAsync).
+//! For host/managed places \p stream is ignored.
+//! Returns NULL on failure (e.g. unsupported place type or out of memory).
+//!
+//! \param h     Data place handle (must not be NULL)
+//! \param size  Allocation size in bytes
+//! \param stream CUDA stream for stream-ordered allocation (may be NULL)
+//! \return Pointer to allocated memory, or NULL on failure
+void* stf_data_place_allocate(stf_data_place_handle h, ptrdiff_t size, cudaStream_t stream);
+
+//! \brief Deallocate memory previously obtained from stf_data_place_allocate().
+//!
+//! For device places the deallocation is stream-ordered (cudaFreeAsync).
+//! For host/managed places \p stream is ignored.
+//!
+//! \param h      Data place handle (must not be NULL)
+//! \param ptr    Pointer returned by stf_data_place_allocate()
+//! \param size   Size of the original allocation in bytes
+//! \param stream CUDA stream for stream-ordered deallocation (may be NULL)
+void stf_data_place_deallocate(stf_data_place_handle h, void* ptr, size_t size, cudaStream_t stream);
+
+//! \brief Query whether allocations on this place are stream-ordered.
+//!
+//! \param h Data place handle (must not be NULL)
+//! \return 1 if stream-ordered, 0 otherwise
+int stf_data_place_allocation_is_stream_ordered(stf_data_place_handle h);
+
 //! \}
 
 //! \defgroup Handles Opaque Handles
@@ -1062,6 +1142,64 @@ void stf_task_destroy(stf_task_handle t);
 
 void stf_task_enable_capture(stf_task_handle t);
 
+//!
+//! \brief Get grid dimensions of a task's exec place
+//!
+//! When the task's execution place is a grid (size > 1), writes its
+//! shape to \p out_dims. Returns 0 on success, non-zero if the task's
+//! exec place is not a grid or \p out_dims is NULL.
+//!
+//! \param t Task handle
+//! \param[out] out_dims On success, the grid shape (x, y, z, t) is written here. Must not be NULL.
+//! \return 0 on success; non-zero if task exec place is not a grid or \p out_dims is NULL
+//!
+//! \pre t must be valid task handle
+//! \pre stf_task_start() must have been called
+//!
+//! \note Total number of grid entries is out_dims->x * out_dims->y * out_dims->z * out_dims->t.
+//!
+//! \par Example:
+//! \code
+//! stf_task_start(task);
+//! stf_dim4 dims;
+//! if (stf_task_get_grid_dims(task, &dims) == 0) {
+//!     printf("Grid: %lu x %lu\n", dims.x, dims.y);
+//! }
+//! \endcode
+//!
+//! \see stf_task_get_custream_at_index()
+
+int stf_task_get_grid_dims(stf_task_handle t, stf_dim4* out_dims);
+
+//!
+//! \brief Get the CUDA stream for a specific grid index
+//!
+//! When the task's exec place is a grid, returns the CUstream for the
+//! given linear index (0 to product of grid dims - 1).
+//!
+//! \param t Task handle (must have been started; exec place must be a grid)
+//! \param place_index Linear index in the grid (0-based; use stf_task_get_grid_dims to get shape)
+//! \param[out] out_stream On success, the stream for that index is written here. Must not be NULL.
+//! \return 0 on success; non-zero if task is not a grid, index out of range, or no per-index streams
+//!
+//! \pre t must be valid task handle
+//! \pre stf_task_start() must have been called
+//!
+//! \par Example:
+//! \code
+//! stf_dim4 dims;
+//! stf_task_get_grid_dims(task, &dims);
+//! for (size_t i = 0; i < dims.x; ++i) {
+//!     CUstream s;
+//!     stf_task_get_custream_at_index(task, i, &s);
+//!     // launch work on stream s
+//! }
+//! \endcode
+//!
+//! \see stf_task_get_grid_dims()
+
+int stf_task_get_custream_at_index(stf_task_handle t, size_t place_index, CUstream* out_stream);
+
 //! \}
 
 //! \defgroup CUDAKernel CUDA Kernel Interface
diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu
index 2068903963f..f982eced864 100644
--- a/c/experimental/stf/src/stf.cu
+++ b/c/experimental/stf/src/stf.cu
@@ -120,6 +120,16 @@ template <class P>
   {
     return static_cast<stf_host_launch_deps_handle>(opaque_bits);
   }
+  else if constexpr (::std::is_same_v<P, exec_place_scope>)
+  {
+    return static_cast<stf_exec_place_scope_handle>(opaque_bits);
+  }
+#if _CCCL_CTK_AT_LEAST(12, 4)
+  else if constexpr (::std::is_same_v<P, green_context_helper>)
+  {
+    return static_cast<stf_green_context_helper_handle>(opaque_bits);
+  }
+#endif
   else
   {
     static_assert(stf_dependent_false_v<P>, "to_opaque: missing pointee -> handle pairing");
@@ -161,6 +171,16 @@ template <class Opaque>
   {
     return static_cast<const reserved::host_launch_deps*>(opaque_bits);
   }
+  else if constexpr (::std::is_same_v<Opaque*, stf_exec_place_scope_handle>)
+  {
+    return static_cast<const exec_place_scope*>(opaque_bits);
+  }
+#if _CCCL_CTK_AT_LEAST(12, 4)
+  else if constexpr (::std::is_same_v<Opaque*, stf_green_context_helper_handle>)
+  {
+    return static_cast<const green_context_helper*>(opaque_bits);
+  }
+#endif
   else
   {
     static_assert(stf_dependent_false_v<Opaque>, "from_opaque_const: missing handle -> pointee pairing");
@@ -198,6 +218,50 @@ stf_exec_place_handle stf_exec_place_current_device(void)
   }));
 }
 
+stf_green_context_helper_handle stf_green_context_helper_create(int sm_count, int dev_id)
+{
+#if _CCCL_CTK_AT_LEAST(12, 4)
+  return to_opaque(stf_try_allocate([sm_count, dev_id] {
+    return new green_context_helper(sm_count, dev_id);
+  }));
+#else
+  (void) sm_count;
+  (void) dev_id;
+  return nullptr;
+#endif
+}
+
+void stf_green_context_helper_destroy(stf_green_context_helper_handle h)
+{
+#if _CCCL_CTK_AT_LEAST(12, 4)
+  delete from_opaque(h);
+#else
+  (void) h;
+#endif
+}
+
+size_t stf_green_context_helper_get_count(stf_green_context_helper_handle h)
+{
+#if _CCCL_CTK_AT_LEAST(12, 4)
+  _CCCL_ASSERT(h != nullptr, "green_context_helper handle must not be null");
+  return from_opaque(h)->get_count();
+#else
+  (void) h;
+  return 0;
+#endif
+}
+
+int stf_green_context_helper_get_device_id(stf_green_context_helper_handle h)
+{
+#if _CCCL_CTK_AT_LEAST(12, 4)
+  _CCCL_ASSERT(h != nullptr, "green_context_helper handle must not be null");
+  return static_cast<int>(from_opaque(h)->get_device_id());
+#else
+  (void) h;
+  return -1;
+#endif
+}
+
 stf_exec_place_handle stf_exec_place_clone(stf_exec_place_handle h)
 {
   _CCCL_ASSERT(h != nullptr, "exec_place handle must not be null");
@@ -281,6 +345,27 @@ void stf_exec_place_grid_destroy(stf_exec_place_handle grid)
   stf_exec_place_destroy(grid);
 }
 
+stf_exec_place_scope_handle stf_exec_place_scope_enter(stf_exec_place_handle place, size_t idx)
+{
+  _CCCL_ASSERT(place != nullptr, "exec_place handle must not be null");
+  return to_opaque(stf_try_allocate([&] {
+    return new exec_place_scope(*from_opaque(place), idx);
+  }));
+}
+
+void stf_exec_place_scope_exit(stf_exec_place_scope_handle scope)
+{
+  delete from_opaque(scope);
+}
+
+stf_data_place_handle stf_exec_place_get_affine_data_place(stf_exec_place_handle h)
+{
+  _CCCL_ASSERT(h != nullptr, "exec_place handle must not be null");
+  return to_opaque(stf_try_allocate([h] {
+    return new data_place(from_opaque(h)->affine_data_place());
+  }));
+}
+
 stf_exec_place_resources_handle stf_exec_place_resources_create(void)
 {
   return stf_try_allocate([] {
@@ -317,7 +402,45 @@ CUstream stf_exec_place_pick_stream(stf_exec_place_resources_handle res, stf_exe
 {
   _CCCL_ASSERT(res != nullptr, "exec_place_resources handle must not be null");
   _CCCL_ASSERT(h != nullptr, "exec_place handle must not be null");
-  return reinterpret_cast<CUstream>(from_opaque(h)->pick_stream(*res->resources, for_computation != 0));
+  return reinterpret_cast<CUstream>(from_opaque(h)->pick_stream(*from_opaque(res), for_computation != 0));
+}
+
+stf_exec_place_handle stf_exec_place_get_place(stf_exec_place_handle h, size_t idx)
+{
+  _CCCL_ASSERT(h != nullptr, "exec_place handle must not be null");
+  if (idx >= from_opaque(h)->size())
+  {
+    return nullptr;
+  }
+  return to_opaque(stf_try_allocate([h, idx] {
+    return new exec_place(from_opaque(h)->get_place(idx));
+  }));
+}
+
+stf_exec_place_handle
+stf_exec_place_green_ctx(stf_green_context_helper_handle helper, size_t idx, int use_green_ctx_data_place)
+{
+#if _CCCL_CTK_AT_LEAST(12, 4)
+  _CCCL_ASSERT(helper != nullptr, "green_context_helper handle must not be null");
+  auto* gc_helper = from_opaque(helper);
+  if (idx >= gc_helper->get_count())
+  {
+    return nullptr;
+  }
+  return to_opaque(stf_try_allocate([gc_helper, idx, use_green_ctx_data_place] {
+    return new exec_place(exec_place::green_ctx(gc_helper->get_view(idx), use_green_ctx_data_place != 0));
+  }));
+#else
+  (void) helper;
+  (void) idx;
+  (void) use_green_ctx_data_place;
+  return nullptr;
+#endif
+}
+
+void stf_machine_init(void)
+{
+  cuda::experimental::places::reserved::machine::instance();
 }
 
 stf_data_place_handle stf_data_place_host(void)
@@ -369,6 +492,25 @@ stf_data_place_handle stf_data_place_composite(stf_exec_place_handle grid, stf_g
   return to_opaque(dp);
 }
 
+stf_data_place_handle stf_data_place_green_ctx(stf_green_context_helper_handle helper, size_t idx)
+{
+#if _CCCL_CTK_AT_LEAST(12, 4)
+  _CCCL_ASSERT(helper != nullptr, "green_context_helper handle must not be null");
+  auto* gc_helper = from_opaque(helper);
+  if (idx >= gc_helper->get_count())
+  {
+    return nullptr;
+  }
+  return to_opaque(stf_try_allocate([gc_helper, idx] {
+    return new data_place(data_place::green_ctx(gc_helper->get_view(idx)));
+  }));
+#else
+  (void) helper;
+  (void) idx;
+  return nullptr;
+#endif
+}
+
 stf_data_place_handle stf_data_place_clone(stf_data_place_handle h)
 {
   _CCCL_ASSERT(h != nullptr, "data_place handle must not be null");
@@ -397,6 +539,37 @@ const char* stf_data_place_to_string(stf_data_place_handle h)
   return s.c_str();
 }
 
+void* stf_data_place_allocate(stf_data_place_handle h, ptrdiff_t size, cudaStream_t stream)
+{
+  _CCCL_ASSERT(h != nullptr, "data_place handle must not be null");
+  try
+  {
+    return from_opaque(h)->allocate(static_cast<::std::ptrdiff_t>(size), stream);
+  }
+  catch (const ::std::exception& e)
+  {
+    fprintf(stderr, "stf_data_place_allocate failed: %s\n", e.what());
+    return nullptr;
+  }
+  catch (...)
+  {
+    fprintf(stderr, "stf_data_place_allocate failed: unknown exception\n");
+    return nullptr;
+  }
+}
+
+void stf_data_place_deallocate(stf_data_place_handle h, void* ptr, size_t size, cudaStream_t stream)
+{
+  _CCCL_ASSERT(h != nullptr, "data_place handle must not be null");
+  from_opaque(h)->deallocate(ptr, size, stream);
+}
+
+int stf_data_place_allocation_is_stream_ordered(stf_data_place_handle h)
+{
+  _CCCL_ASSERT(h != nullptr, "data_place handle must not be null");
+  return from_opaque(h)->allocation_is_stream_ordered() ? 1 : 0;
+}
+
 stf_ctx_handle stf_ctx_create(void)
 {
   return to_opaque(stf_try_allocate([] {
@@ -702,6 +875,41 @@ CUstream stf_task_get_custream(stf_task_handle t)
   return static_cast<CUstream>(task_ptr->get_stream());
 }
 
+int stf_task_get_grid_dims(stf_task_handle t, stf_dim4* out_dims)
+{
+  if (t == nullptr || out_dims == nullptr)
+  {
+    return -1;
+  }
+  auto* task_ptr = from_opaque(t);
+  dim4 d;
+  if (!task_ptr->get_grid_dims(&d))
+  {
+    return -1;
+  }
+  out_dims->x = static_cast<uint64_t>(d.x);
+  out_dims->y = static_cast<uint64_t>(d.y);
+  out_dims->z = static_cast<uint64_t>(d.z);
+  out_dims->t = static_cast<uint64_t>(d.t);
+  return 0;
+}
+
+int stf_task_get_custream_at_index(stf_task_handle t, size_t place_index, CUstream* out_stream)
+{
+  if (t == nullptr || out_stream == nullptr)
+  {
+    return -1;
+  }
+  auto* task_ptr = from_opaque(t);
+  cudaStream_t s = task_ptr->get_stream(place_index);
+  if (s == nullptr)
+  {
+    return -1;
+  }
+  *out_stream = static_cast<CUstream>(s);
+  return 0;
+}
+
 void stf_task_destroy(stf_task_handle t)
 {
   _CCCL_ASSERT(t != nullptr, "task handle must not be null");
diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp
index 9b690813862..6d97f683afe 100644
--- a/c/experimental/stf/test/test_places.cpp
+++ b/c/experimental/stf/test/test_places.cpp
@@ -231,60 +231,334 @@ C2H_TEST("composite data place with stf_exec_place_grid_create (vector of places
   }
 }
 
-C2H_TEST("exec_place_pick_stream standalone resources", "[places][stream]")
+C2H_TEST("task on exec_place_grid: get_grid_dims and get_custream_at_index", "[task][places][grid]")
 {
+  const size_t nplaces = 2;
+  stf_exec_place_handle places[2];
+  for (size_t i = 0; i < nplaces; i++)
+  {
+    places[i] = stf_exec_place_device(0);
+  }
+  stf_exec_place_handle grid = stf_exec_place_grid_create(places, nplaces, nullptr);
+  REQUIRE(grid != nullptr);
+  for (size_t i = 0; i < nplaces; i++)
+  {
+    stf_exec_place_destroy(places[i]);
+  }
+
+  stf_data_place_handle composite_dplace = stf_data_place_composite(grid, blocked_mapper_1d);
+  REQUIRE(composite_dplace != nullptr);
+  stf_exec_place_set_affine_data_place(grid, composite_dplace);
+
+  stf_ctx_handle ctx = stf_ctx_create();
+  REQUIRE(ctx != nullptr);
+
+  std::vector<float> X(4, 0.0f);
+
+  stf_logical_data_handle lX = stf_logical_data(ctx, X.data(), X.size() * sizeof(float));
+  REQUIRE(lX != nullptr);
+
+  stf_task_handle t = stf_task_create(ctx);
+  REQUIRE(t != nullptr);
+  stf_task_set_exec_place(t, grid);
+  stf_task_add_dep(t, lX, STF_RW);
+  stf_task_start(t);
+
+  stf_dim4 dims;
+  int got_dims = stf_task_get_grid_dims(t, &dims);
+  REQUIRE(got_dims == 0);
+  REQUIRE(dims.x == 2);
+  REQUIRE(dims.y == 1);
+  REQUIRE(dims.z == 1);
+  REQUIRE(dims.t == 1);
+
+  CUstream s0, s1;
+  REQUIRE(stf_task_get_custream_at_index(t, 0, &s0) == 0);
+  REQUIRE(stf_task_get_custream_at_index(t, 1, &s1) == 0);
+  REQUIRE(s0 != nullptr);
+  REQUIRE(s1 != nullptr);
+
+  stf_task_end(t);
+  stf_task_destroy(t);
+
+  stf_data_place_destroy(composite_dplace);
+  stf_exec_place_grid_destroy(grid);
+  stf_logical_data_destroy(lX);
+  stf_ctx_finalize(ctx);
+}
+
+C2H_TEST("task get_grid_dims returns error for non-grid exec_place", "[task][places][grid]")
+{
+  stf_ctx_handle ctx = stf_ctx_create();
+  REQUIRE(ctx != nullptr);
+
+  float val   = 0.0f;
+  auto lX     = stf_logical_data(ctx, &val, sizeof(float));
+  auto e_dev0 = stf_exec_place_device(0);
+
+  stf_task_handle t = stf_task_create(ctx);
+  REQUIRE(t != nullptr);
+  stf_task_set_exec_place(t, e_dev0);
+  stf_task_add_dep(t, lX, STF_RW);
+  stf_task_start(t);
+
+  stf_dim4 dims;
+  REQUIRE(stf_task_get_grid_dims(t, &dims) != 0);
+
+  stf_task_end(t);
+  stf_task_destroy(t);
+
+  stf_exec_place_destroy(e_dev0);
+  stf_logical_data_destroy(lX);
+  stf_ctx_finalize(ctx);
+}
+
+// ===== Place scope and accessor tests (task-free usage) =====
+
+C2H_TEST("exec_place_scope enter/exit", "[places][scope]")
+{
+  stf_machine_init();
+  stf_exec_place_handle dev0 = stf_exec_place_device(0);
+  REQUIRE(dev0 != nullptr);
+
+  stf_exec_place_scope_handle scope = stf_exec_place_scope_enter(dev0, 0);
+  REQUIRE(scope != nullptr);
+
+  stf_exec_place_scope_exit(scope);
+  stf_exec_place_scope_exit(nullptr);
+
+  stf_exec_place_destroy(dev0);
+}
+
+C2H_TEST("exec_place_scope nested", "[places][scope]")
+{
+  stf_machine_init();
+  stf_exec_place_handle dev0 = stf_exec_place_device(0);
+  REQUIRE(dev0 != nullptr);
+
+  stf_exec_place_scope_handle outer = stf_exec_place_scope_enter(dev0, 0);
+  REQUIRE(outer != nullptr);
+
+  stf_exec_place_scope_handle inner = stf_exec_place_scope_enter(dev0, 0);
+  REQUIRE(inner != nullptr);
+
+  stf_exec_place_scope_exit(inner);
+  stf_exec_place_scope_exit(outer);
+
+  stf_exec_place_destroy(dev0);
+}
+
+C2H_TEST("exec_place_get_affine_data_place", "[places][accessor]")
+{
+  stf_exec_place_handle dev0 = stf_exec_place_device(0);
+  REQUIRE(dev0 != nullptr);
+
+  stf_data_place_handle dp = stf_exec_place_get_affine_data_place(dev0);
+  REQUIRE(dp != nullptr);
+  REQUIRE(stf_data_place_get_device_ordinal(dp) == 0);
+
+  stf_data_place_destroy(dp);
+  stf_exec_place_destroy(dev0);
+}
+
+C2H_TEST("exec_place_pick_stream standalone", "[places][scope][stream]")
+{
+  stf_machine_init();
+  // Standalone use: no STF context required, just a registry the caller owns.
   stf_exec_place_resources_handle res = stf_exec_place_resources_create();
   REQUIRE(res != nullptr);
 
-  stf_exec_place_handle place = stf_exec_place_current_device();
-  REQUIRE(place != nullptr);
+  stf_exec_place_handle dev0 = stf_exec_place_device(0);
+  REQUIRE(dev0 != nullptr);
 
-  CUstream stream = stf_exec_place_pick_stream(res, place, /*for_computation=*/1);
-  REQUIRE(stream != nullptr);
-  REQUIRE(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)) == cudaSuccess);
+  stf_exec_place_scope_handle scope = stf_exec_place_scope_enter(dev0, 0);
+  REQUIRE(scope != nullptr);
 
-  stf_exec_place_destroy(place);
+  CUstream s = stf_exec_place_pick_stream(res, dev0, /*for_computation=*/1);
+  REQUIRE(s != nullptr);
+
+  stf_exec_place_scope_exit(scope);
+  stf_exec_place_destroy(dev0);
   stf_exec_place_resources_destroy(res);
 }
 
-C2H_TEST("exec_place resources are independent", "[places][stream]")
+C2H_TEST("exec_place resources are independent", "[places][scope][stream]")
 {
+  stf_machine_init();
   stf_exec_place_resources_handle res1 = stf_exec_place_resources_create();
   stf_exec_place_resources_handle res2 = stf_exec_place_resources_create();
   REQUIRE(res1 != nullptr);
   REQUIRE(res2 != nullptr);
 
-  stf_exec_place_handle place = stf_exec_place_current_device();
-  REQUIRE(place != nullptr);
+  stf_exec_place_handle dev0 = stf_exec_place_device(0);
+  REQUIRE(dev0 != nullptr);
+
+  stf_exec_place_scope_handle scope = stf_exec_place_scope_enter(dev0, 0);
+  REQUIRE(scope != nullptr);
 
-  CUstream stream1 = stf_exec_place_pick_stream(res1, place, /*for_computation=*/1);
-  CUstream stream2 = stf_exec_place_pick_stream(res2, place, /*for_computation=*/1);
+  CUstream stream1 = stf_exec_place_pick_stream(res1, dev0, /*for_computation=*/1);
+  CUstream stream2 = stf_exec_place_pick_stream(res2, dev0, /*for_computation=*/1);
   REQUIRE(stream1 != nullptr);
   REQUIRE(stream2 != nullptr);
   REQUIRE(stream1 != stream2);
 
-  stf_exec_place_destroy(place);
+  stf_exec_place_scope_exit(scope);
+  stf_exec_place_destroy(dev0);
   stf_exec_place_resources_destroy(res2);
   stf_exec_place_resources_destroy(res1);
 }
 
-C2H_TEST("exec_place_pick_stream borrowed context resources", "[places][stream][ctx]")
+C2H_TEST("exec_place_pick_stream borrowed from context", "[places][scope][stream][ctx]")
 {
-  stf_ctx_handle ctx = stf_ctx_create();
-  REQUIRE(ctx != nullptr);
-
+  stf_machine_init();
+  stf_ctx_handle ctx                  = stf_ctx_create();
   stf_exec_place_resources_handle res = stf_ctx_get_place_resources(ctx);
   REQUIRE(res != nullptr);
 
-  stf_exec_place_handle place = stf_exec_place_current_device();
-  REQUIRE(place != nullptr);
+  stf_exec_place_handle dev0        = stf_exec_place_device(0);
+  stf_exec_place_scope_handle scope = stf_exec_place_scope_enter(dev0, 0);
 
-  CUstream stream = stf_exec_place_pick_stream(res, place, /*for_computation=*/1);
-  REQUIRE(stream != nullptr);
-  REQUIRE(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)) == cudaSuccess);
+  CUstream s = stf_exec_place_pick_stream(res, dev0, /*for_computation=*/1);
+  REQUIRE(s != nullptr);
 
+  stf_exec_place_scope_exit(scope);
+  stf_exec_place_destroy(dev0);
+  // `res` is a non-owning wrapper around context resources; destroy only the wrapper.
   stf_exec_place_resources_destroy(res);
-
-  stf_exec_place_destroy(place);
   stf_ctx_finalize(ctx);
 }
+
+C2H_TEST("exec_place_get_place on grid", "[places][accessor][grid]")
+{
+  const size_t nplaces       = 2;
+  int device_ids[2]          = {0, 0};
+  stf_exec_place_handle grid = stf_exec_place_grid_from_devices(device_ids, nplaces);
+  REQUIRE(grid != nullptr);
+
+  stf_exec_place_handle sub0 = stf_exec_place_get_place(grid, 0);
+  stf_exec_place_handle sub1 = stf_exec_place_get_place(grid, 1);
+  REQUIRE(sub0 != nullptr);
+  REQUIRE(sub1 != nullptr);
+  REQUIRE(stf_exec_place_is_device(sub0) != 0);
+  REQUIRE(stf_exec_place_is_device(sub1) != 0);
+
+  stf_exec_place_destroy(sub0);
+  stf_exec_place_destroy(sub1);
+  stf_exec_place_grid_destroy(grid);
+}
+
+C2H_TEST("exec_place_get_place on scalar", "[places][accessor]")
+{
+  stf_exec_place_handle dev0 = stf_exec_place_device(0);
+  REQUIRE(dev0 != nullptr);
+
+  stf_exec_place_handle sub = stf_exec_place_get_place(dev0, 0);
+  REQUIRE(sub != nullptr);
+  REQUIRE(stf_exec_place_is_device(sub) != 0);
+
+  stf_exec_place_destroy(sub);
+  stf_exec_place_destroy(dev0);
+}
+
+C2H_TEST("exec_place_get_place out of bounds", "[places][accessor]")
+{
+  stf_exec_place_handle dev0 = stf_exec_place_device(0);
+  REQUIRE(dev0 != nullptr);
+  REQUIRE(stf_exec_place_get_place(dev0, 1) == nullptr);
+  stf_exec_place_destroy(dev0);
+
+  int device_ids[2]          = {0, 0};
+  stf_exec_place_handle grid = stf_exec_place_grid_from_devices(device_ids, 2);
+  REQUIRE(grid != nullptr);
+  REQUIRE(stf_exec_place_get_place(grid, 2) == nullptr);
+  stf_exec_place_grid_destroy(grid);
+}
+
+C2H_TEST("machine_init idempotent", "[places][machine]")
+{
+  stf_machine_init();
+  stf_machine_init();
+}
+
+C2H_TEST("data_place_allocate_device", "[places][allocate]")
+{
+  stf_exec_place_resources_handle res = stf_exec_place_resources_create();
+  stf_exec_place_handle ep            = stf_exec_place_device(0);
+  REQUIRE(ep != nullptr);
+
+  stf_exec_place_scope_handle scope = stf_exec_place_scope_enter(ep, 0);
+  REQUIRE(scope != nullptr);
+
+  CUstream stream              = stf_exec_place_pick_stream(res, ep, /*for_computation=*/0);
+  stf_data_place_handle dplace = stf_exec_place_get_affine_data_place(ep);
+  REQUIRE(dplace != nullptr);
+
+  void* ptr = stf_data_place_allocate(dplace, 1024, reinterpret_cast<cudaStream_t>(stream));
+  REQUIRE(ptr != nullptr);
+
+  stf_data_place_deallocate(dplace, ptr, 1024, reinterpret_cast<cudaStream_t>(stream));
+
+  stf_data_place_destroy(dplace);
+  stf_exec_place_scope_exit(scope);
+  stf_exec_place_destroy(ep);
+  stf_exec_place_resources_destroy(res);
+}
+
+C2H_TEST("data_place_allocate_host", "[places][allocate]")
+{
+  stf_data_place_handle dplace = stf_data_place_host();
+  REQUIRE(dplace != nullptr);
+
+  void* ptr = stf_data_place_allocate(dplace, 256, nullptr);
+  REQUIRE(ptr != nullptr);
+
+  int* buf = static_cast<int*>(ptr);
+  buf[0]   = 42;
+  REQUIRE(buf[0] == 42);
+
+  stf_data_place_deallocate(dplace, ptr, 256, nullptr);
+  stf_data_place_destroy(dplace);
+}
+
+C2H_TEST("data_place_allocate_managed", "[places][allocate]")
+{
+  stf_data_place_handle dplace = stf_data_place_managed();
+  REQUIRE(dplace != nullptr);
+
+  void* ptr = stf_data_place_allocate(dplace, 512, nullptr);
+  REQUIRE(ptr != nullptr);
+
+  int* buf = static_cast<int*>(ptr);
+  buf[0]   = 99;
+  REQUIRE(buf[0] == 99);
+
+  stf_data_place_deallocate(dplace, ptr, 512, nullptr);
+  stf_data_place_destroy(dplace);
+}
+
+C2H_TEST("data_place_allocation_is_stream_ordered", "[places][allocate]")
+{
+  stf_data_place_handle dev = stf_data_place_device(0);
+  REQUIRE(dev != nullptr);
+  REQUIRE(stf_data_place_allocation_is_stream_ordered(dev) == 1);
+  stf_data_place_destroy(dev);
+
+  stf_data_place_handle host = stf_data_place_host();
+  REQUIRE(host != nullptr);
+  REQUIRE(stf_data_place_allocation_is_stream_ordered(host) == 0);
+  stf_data_place_destroy(host);
+
+  stf_data_place_handle mgd = stf_data_place_managed();
+  REQUIRE(mgd != nullptr);
+  REQUIRE(stf_data_place_allocation_is_stream_ordered(mgd) == 0);
+  stf_data_place_destroy(mgd);
+}
+
+C2H_TEST("data_place_allocate_invalid_returns_null", "[places][allocate]")
+{
+  stf_data_place_handle inv = stf_data_place_affine();
+  REQUIRE(inv != nullptr);
+  void* ptr = stf_data_place_allocate(inv, 64, nullptr);
+  REQUIRE(ptr == nullptr);
+  stf_data_place_destroy(inv);
+}

From 9e2a4b4592a279a033f24e008bd903b2117a2b55 Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Wed, 3 Jun 2026 14:07:12 +0200
Subject: [PATCH 02/10] [STF] Harden places C API at the C/FFI boundary

Address CodeRabbit review feedback:
- stf_exec_place_scope_enter now rejects out-of-range indices with NULL,
  matching the contract of the neighboring index-based accessors.
- stf_data_place_deallocate catches and maps C++ exceptions instead of
  letting them escape the extern "C" entry point.
---
 .../stf/include/cccl/c/experimental/stf/stf.h   |  2 +-
 c/experimental/stf/src/stf.cu                   | 17 ++++++++++++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
index d5a80eb7953..c0fb4f16662 100644
--- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
+++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
@@ -202,7 +202,7 @@ void stf_exec_place_grid_destroy(stf_exec_place_handle grid);
 
 //! \brief Activate the sub-place at linear index \p idx (0 for scalar places).
 //! Saves the current CUDA context; call stf_exec_place_scope_exit to restore.
-//! \return Opaque scope handle, or NULL on failure.
+//! \return Opaque scope handle, or NULL on failure (including when \p idx is out of bounds).
 stf_exec_place_scope_handle stf_exec_place_scope_enter(stf_exec_place_handle place, size_t idx);
 
 //! \brief Restore the CUDA context saved by stf_exec_place_scope_enter and destroy the scope.
diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu
index f982eced864..5ab0347a2bc 100644
--- a/c/experimental/stf/src/stf.cu
+++ b/c/experimental/stf/src/stf.cu
@@ -348,6 +348,10 @@ void stf_exec_place_grid_destroy(stf_exec_place_handle grid)
 stf_exec_place_scope_handle stf_exec_place_scope_enter(stf_exec_place_handle place, size_t idx)
 {
   _CCCL_ASSERT(place != nullptr, "exec_place handle must not be null");
+  if (idx >= from_opaque(place)->size())
+  {
+    return nullptr;
+  }
   return to_opaque(stf_try_allocate([&] {
     return new exec_place_scope(*from_opaque(place), idx);
   }));
@@ -561,7 +565,18 @@ void* stf_data_place_allocate(stf_data_place_handle h, ptrdiff_t size, cudaStrea
 void stf_data_place_deallocate(stf_data_place_handle h, void* ptr, size_t size, cudaStream_t stream)
 {
   _CCCL_ASSERT(h != nullptr, "data_place handle must not be null");
-  from_opaque(h)->deallocate(ptr, size, stream);
+  try
+  {
+    from_opaque(h)->deallocate(ptr, size, stream);
+  }
+  catch (const ::std::exception& e)
+  {
+    fprintf(stderr, "stf_data_place_deallocate failed: %s\n", e.what());
+  }
+  catch (...)
+  {
+    fprintf(stderr, "stf_data_place_deallocate failed: unknown exception\n");
+  }
 }
 
 int stf_data_place_allocation_is_stream_ordered(stf_data_place_handle h)

From 3d7dd90e52575df64fc4b7dcbe637449a2580bb7 Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Wed, 3 Jun 2026 14:14:59 +0200
Subject: [PATCH 03/10] [STF] Use range-based for loops in test_places
 (clang-tidy)

Fix modernize-loop-convert clang-tidy errors by iterating the places
array with range-based for loops instead of index-based loops.
---
 c/experimental/stf/test/test_places.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp
index 6d97f683afe..9a7a8c68cf9 100644
--- a/c/experimental/stf/test/test_places.cpp
+++ b/c/experimental/stf/test/test_places.cpp
@@ -235,15 +235,15 @@ C2H_TEST("task on exec_place_grid: get_grid_dims and get_custream_at_index", "[t
 {
   const size_t nplaces = 2;
   stf_exec_place_handle places[2];
-  for (size_t i = 0; i < nplaces; i++)
+  for (auto& place : places)
   {
-    places[i] = stf_exec_place_device(0);
+    place = stf_exec_place_device(0);
   }
   stf_exec_place_handle grid = stf_exec_place_grid_create(places, nplaces, nullptr);
   REQUIRE(grid != nullptr);
-  for (size_t i = 0; i < nplaces; i++)
+  for (auto& place : places)
   {
-    stf_exec_place_destroy(places[i]);
+    stf_exec_place_destroy(place);
   }
 
   stf_data_place_handle composite_dplace = stf_data_place_composite(grid, blocked_mapper_1d);

From 929a1c45689ec72de9d9620ac5a9119eba515d50 Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Fri, 5 Jun 2026 07:29:15 +0200
Subject: [PATCH 04/10] [STF] Add unified_task grid introspection used by
 places C API

The places C bindings (stf_task_get_grid_dims / stf_task_get_custream_at_index)
call get_grid_dims(dim4*) and get_stream(size_t) on context::unified_task<>,
but those overloads were never declared on unified_task in this branch, so
stf.cu failed to compile. Add both methods, dispatching the per-place stream
to stream_task<Deps...> and returning nullptr/false for graph tasks or
non-grid exec places.
---
 .../experimental/__stf/internal/context.cuh   | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh
index f99c0115c33..6ca85908d30 100644
--- a/cudax/include/cuda/experimental/__stf/internal/context.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh
@@ -321,6 +321,42 @@ public:
       };
     }
 
+    /** When the task's exec place is a grid (size > 1), get the stream for the place at \p place_index
+     * (linear index). Returns nullptr for graph_task (no per-place streams). */
+    cudaStream_t get_stream(size_t place_index) const
+    {
+      return payload->*[&](auto& self) -> cudaStream_t {
+        if constexpr (::std::is_same_v<stream_task<Deps...>, ::std::decay_t<decltype(self)>>)
+        {
+          return self.get_stream(place_index);
+        }
+        else
+        {
+          (void) place_index;
+          return nullptr;
+        }
+      };
+    }
+
+    /** When the task's exec place is a grid (size > 1), write its shape to \p out_dims and return true; else return
+     * false. */
+    bool get_grid_dims(dim4* out_dims) const
+    {
+      if (out_dims == nullptr)
+      {
+        return false;
+      }
+      return payload->*[&](auto& self) -> bool {
+        const exec_place& e = self.get_exec_place();
+        if (e.size() <= 1)
+        {
+          return false;
+        }
+        *out_dims = e.get_dims();
+        return true;
+      };
+    }
+
     // Get the underlying task base class - both stream_task and graph_task inherit from task. This is convenient when
     // we do not need the "typed" task, for example when using the "low-level" add_deps method.
     ::cuda::experimental::stf::task& get_base_task()

From de602936a1def9df909e7003755af97bb79a3de6 Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Fri, 5 Jun 2026 09:13:22 +0200
Subject: [PATCH 05/10] [STF] Bounds-check
 unified_task::get_stream(place_index)

stream_task::get_stream(size_t) indexes the stream grid without any bounds
check, so stf_task_get_custream_at_index could read past the grid for an
out-of-range index (UB) and returned success for non-grid exec places,
contradicting the documented contract (non-zero on "not a grid" / index out
of range). Guard the linear index in the unified_task<> wrapper: return
nullptr for graph tasks, non-grid exec places, and out-of-range indices.

Add a regression check to the grid test for the out-of-range index case.
---
 c/experimental/stf/test/test_places.cpp                |  4 ++++
 .../cuda/experimental/__stf/internal/context.cuh       | 10 +++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp
index 9a7a8c68cf9..ddcfc5932e9 100644
--- a/c/experimental/stf/test/test_places.cpp
+++ b/c/experimental/stf/test/test_places.cpp
@@ -278,6 +278,10 @@ C2H_TEST("task on exec_place_grid: get_grid_dims and get_custream_at_index", "[t
   REQUIRE(s0 != nullptr);
   REQUIRE(s1 != nullptr);
 
+  // Out-of-range linear index must report an error rather than reading past the stream grid.
+  CUstream s_oob;
+  REQUIRE(stf_task_get_custream_at_index(t, 2, &s_oob) != 0);
+
   stf_task_end(t);
   stf_task_destroy(t);
 
diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh
index 6ca85908d30..9e8ab470fd6 100644
--- a/cudax/include/cuda/experimental/__stf/internal/context.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh
@@ -322,12 +322,20 @@ public:
     }
 
     /** When the task's exec place is a grid (size > 1), get the stream for the place at \p place_index
-     * (linear index). Returns nullptr for graph_task (no per-place streams). */
+     * (linear index). Returns nullptr for graph_task (no per-place streams), for non-grid exec places, or
+     * when \p place_index is out of range. */
     cudaStream_t get_stream(size_t place_index) const
     {
       return payload->*[&](auto& self) -> cudaStream_t {
         if constexpr (::std::is_same_v<stream_task<Deps...>, ::std::decay_t<decltype(self)>>)
         {
+          // Per-place streams only exist for grid exec places. stream_task::get_stream(size_t) indexes the
+          // stream grid without bounds checking, so guard the linear index here before forwarding.
+          const exec_place& e = self.get_exec_place();
+          if (e.size() <= 1 || place_index >= e.size())
+          {
+            return nullptr;
+          }
           return self.get_stream(place_index);
         }
         else

From ca8081707e61f2e7209bffe664de4c6a872569b5 Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Mon, 8 Jun 2026 07:26:13 +0200
Subject: [PATCH 06/10] [STF] Test green-context places C API

Add direct C API coverage for green-context helper and green-context exec/data place factories so the extracted places bindings are self-contained.
---
 c/experimental/stf/test/test_places.cpp | 54 +++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp
index ddcfc5932e9..06bc074bb49 100644
--- a/c/experimental/stf/test/test_places.cpp
+++ b/c/experimental/stf/test/test_places.cpp
@@ -10,6 +10,7 @@
 
 #include <cuda/__cmath/ceil_div.h>
 
+#include <string>
 #include <vector>
 
 #include <cuda_runtime.h>
@@ -484,6 +485,59 @@ C2H_TEST("machine_init idempotent", "[places][machine]")
   stf_machine_init();
 }
 
+C2H_TEST("green_context_helper and green-context places", "[places][green_ctx]")
+{
+#if !defined(CUDART_VERSION) || CUDART_VERSION < 12040
+  REQUIRE(stf_green_context_helper_create(1, 0) == nullptr);
+#else
+  stf_machine_init();
+  stf_green_context_helper_handle helper = stf_green_context_helper_create(1, 0);
+  if (helper == nullptr)
+  {
+    SKIP("green context support is not available");
+  }
+
+  REQUIRE(stf_green_context_helper_get_device_id(helper) == 0);
+  const size_t count = stf_green_context_helper_get_count(helper);
+  REQUIRE(count >= 1);
+
+  stf_exec_place_handle default_affine_ep = stf_exec_place_green_ctx(helper, 0, /*use_green_ctx_data_place=*/0);
+  REQUIRE(default_affine_ep != nullptr);
+  REQUIRE(stf_exec_place_is_device(default_affine_ep) != 0);
+
+  stf_data_place_handle default_affine_dp = stf_exec_place_get_affine_data_place(default_affine_ep);
+  REQUIRE(default_affine_dp != nullptr);
+  REQUIRE(stf_data_place_get_device_ordinal(default_affine_dp) == 0);
+
+  stf_exec_place_handle green_affine_ep = stf_exec_place_green_ctx(helper, 0, /*use_green_ctx_data_place=*/1);
+  REQUIRE(green_affine_ep != nullptr);
+  REQUIRE(stf_exec_place_is_device(green_affine_ep) != 0);
+
+  stf_data_place_handle green_affine_dp = stf_exec_place_get_affine_data_place(green_affine_ep);
+  REQUIRE(green_affine_dp != nullptr);
+  REQUIRE(stf_data_place_get_device_ordinal(green_affine_dp) == 0);
+  const std::string green_affine_desc = stf_data_place_to_string(green_affine_dp);
+  REQUIRE(green_affine_desc.find("green_ctx") != std::string::npos);
+
+  stf_data_place_handle green_dp = stf_data_place_green_ctx(helper, 0);
+  REQUIRE(green_dp != nullptr);
+  REQUIRE(stf_data_place_get_device_ordinal(green_dp) == 0);
+  REQUIRE(stf_data_place_allocation_is_stream_ordered(green_dp) == 1);
+  const std::string green_dp_desc = stf_data_place_to_string(green_dp);
+  REQUIRE(green_dp_desc.find("green_ctx") != std::string::npos);
+
+  REQUIRE(stf_exec_place_green_ctx(helper, count, /*use_green_ctx_data_place=*/0) == nullptr);
+  REQUIRE(stf_data_place_green_ctx(helper, count) == nullptr);
+
+  stf_data_place_destroy(green_dp);
+  stf_data_place_destroy(green_affine_dp);
+  stf_exec_place_destroy(green_affine_ep);
+  stf_data_place_destroy(default_affine_dp);
+  stf_exec_place_destroy(default_affine_ep);
+  stf_green_context_helper_destroy(helper);
+#endif
+}
+
 C2H_TEST("data_place_allocate_device", "[places][allocate]")
 {
   stf_exec_place_resources_handle res = stf_exec_place_resources_create();

From 2aca459d30dac5a400a9b15ddb5a1adab8ed4f92 Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Mon, 8 Jun 2026 18:07:03 +0200
Subject: [PATCH 07/10] [STF] Guard stf_machine_init at the C boundary

machine::instance() does real work on first call (P2P/mempool/topology
setup) and can throw. Wrap it in try/catch so a C++ exception never
unwinds across the extern "C" boundary into a C caller (UB / terminate),
matching the error-reporting convention used by stf_try_allocate.
---
 .../stf/include/cccl/c/experimental/stf/stf.h |  4 +++-
 c/experimental/stf/src/stf.cu                 | 20 ++++++++++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
index c0fb4f16662..4389ff3fad2 100644
--- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
+++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
@@ -246,7 +246,9 @@ stf_exec_place_handle
 stf_exec_place_green_ctx(stf_green_context_helper_handle helper, size_t idx, int use_green_ctx_data_place);
 
 //! \brief Initialize the machine singleton (P2P access, memory pool setup, topology).
-//! Safe to call multiple times; only the first call has effect.
+//! Safe to call multiple times; only the first call has effect. Any C++ exception
+//! raised during initialization is caught and reported to stderr (never propagated
+//! across the C boundary).
 void stf_machine_init(void);
 
 //! \brief Host (CPU/pinned) data placement.
diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu
index 5ab0347a2bc..fd5c1a25a76 100644
--- a/c/experimental/stf/src/stf.cu
+++ b/c/experimental/stf/src/stf.cu
@@ -444,7 +444,25 @@ stf_exec_place_green_ctx(stf_green_context_helper_handle helper, size_t idx, int
 
 void stf_machine_init(void)
 {
-  cuda::experimental::places::reserved::machine::instance();
+  // machine::instance() does real work on first call (P2P/mempool/topology
+  // setup) and can throw. Guard the extern "C" boundary so a C++ exception
+  // never unwinds into a C caller (which would be UB / std::terminate).
+  try
+  {
+    cuda::experimental::places::reserved::machine::instance();
+  }
+  catch (const ::std::exception& exc)
+  {
+    ::fflush(stdout);
+    ::std::fprintf(stderr, "\nEXCEPTION in STF C API (machine init): %s\n", exc.what());
+    ::fflush(stderr);
+  }
+  catch (...)
+  {
+    ::fflush(stdout);
+    ::std::fprintf(stderr, "\nEXCEPTION in STF C API (machine init): non-standard exception\n");
+    ::fflush(stderr);
+  }
 }
 
 stf_data_place_handle stf_data_place_host(void)

From 55e0e84aa3dcbecc32231d215655776e1cc6b8eb Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Mon, 8 Jun 2026 18:13:01 +0200
Subject: [PATCH 08/10] [STF] Document allocate/deallocate size signedness
 rationale

stf_data_place_allocate takes a signed ptrdiff_t while stf_data_place_deallocate
takes an unsigned size_t. This mirrors the C++ allocator interface, where the
requested size is passed by reference and negated to signal allocation failure;
deallocation has no such error to signal. Document the asymmetry on both entry
points so the C surface explains why the types differ.
---
 .../stf/include/cccl/c/experimental/stf/stf.h        | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
index 4389ff3fad2..ebb13c11ca7 100644
--- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
+++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
@@ -291,8 +291,14 @@ const char* stf_data_place_to_string(stf_data_place_handle h);
 //! For host/managed places \p stream is ignored.
 //! Returns NULL on failure (e.g. unsupported place type or out of memory).
 //!
+//! \note \p size is signed (ptrdiff_t) to mirror the underlying C++ allocator
+//! interface, where the requested size is passed by reference and negated to
+//! signal allocation failure while preserving the requested amount. The matching
+//! stf_data_place_deallocate() takes an unsigned size_t because at deallocation
+//! the size is a known-good quantity with no error to signal.
+//!
 //! \param h     Data place handle (must not be NULL)
-//! \param size  Allocation size in bytes
+//! \param size  Allocation size in bytes (must be non-negative)
 //! \param stream CUDA stream for stream-ordered allocation (may be NULL)
 //! \return Pointer to allocated memory, or NULL on failure
 void* stf_data_place_allocate(stf_data_place_handle h, ptrdiff_t size, cudaStream_t stream);
@@ -302,6 +308,10 @@ void* stf_data_place_allocate(stf_data_place_handle h, ptrdiff_t size, cudaStrea
 //! For device places the deallocation is stream-ordered (cudaFreeAsync).
 //! For host/managed places \p stream is ignored.
 //!
+//! \note \p size is unsigned (size_t) on purpose: unlike stf_data_place_allocate(),
+//! deallocation never signals failure through the size argument (see that
+//! function's note), so it mirrors the unsigned C++ deallocate() signature.
+//!
 //! \param h      Data place handle (must not be NULL)
 //! \param ptr    Pointer returned by stf_data_place_allocate()
 //! \param size   Size of the original allocation in bytes

From 9e9e70c6548229bbc2995a011b25132434b69ef9 Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Mon, 8 Jun 2026 18:20:29 +0200
Subject: [PATCH 09/10] [STF] Fix Doxygen attachment on stf_task_get_* blocks

The stf_task_get_grid_dims / stf_task_get_custream_at_index doc blocks
opened with a lone //! line and had a blank line between the comment and
the declaration, which can detach the comment from the symbol in doxygen.
Drop the leading empty //! and the trailing blank line so each block binds
to its function.
---
 c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
index ebb13c11ca7..a7134cf3409 100644
--- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
+++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
@@ -1154,7 +1154,6 @@ void stf_task_destroy(stf_task_handle t);
 
 void stf_task_enable_capture(stf_task_handle t);
 
-//!
 //! \brief Get grid dimensions of a task's exec place
 //!
 //! When the task's execution place is a grid (size > 1), writes its
@@ -1180,10 +1179,8 @@ void stf_task_enable_capture(stf_task_handle t);
 //! \endcode
 //!
 //! \see stf_task_get_custream_at_index()
-
 int stf_task_get_grid_dims(stf_task_handle t, stf_dim4* out_dims);
 
-//!
 //! \brief Get the CUDA stream for a specific grid index
 //!
 //! When the task's exec place is a grid, returns the CUstream for the
@@ -1209,7 +1206,6 @@ int stf_task_get_grid_dims(stf_task_handle t, stf_dim4* out_dims);
 //! \endcode
 //!
 //! \see stf_task_get_grid_dims()
-
 int stf_task_get_custream_at_index(stf_task_handle t, size_t place_index, CUstream* out_stream);
 
 //! \}

From ab35eae5daecf710feea97264f7bc7e383ddf89e Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Mon, 8 Jun 2026 18:23:15 +0200
Subject: [PATCH 10/10] [STF] Document grid sentinel/stream-0 behavior for task
 accessors

Note that stf_task_get_grid_dims treats a single-element exec place as
"not a grid" (returns non-zero), and that stf_task_get_custream_at_index
leaves out_stream untouched on failure and never yields the legacy
default stream (CUstream 0) on success, since STF grids use non-default
streams.
---
 c/experimental/stf/include/cccl/c/experimental/stf/stf.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
index a7134cf3409..c0c9d8f0f60 100644
--- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
+++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
@@ -1168,6 +1168,8 @@ void stf_task_enable_capture(stf_task_handle t);
 //! \pre stf_task_start() must have been called
 //!
 //! \note Total number of grid entries is out_dims->x * out_dims->y * out_dims->z * out_dims->t.
+//! \note A single-element exec place (size 1) is intentionally not treated as a grid: this
+//! returns non-zero for it, consistent with stf_task_get_custream_at_index().
 //!
 //! \par Example:
 //! \code
@@ -1194,6 +1196,10 @@ int stf_task_get_grid_dims(stf_task_handle t, stf_dim4* out_dims);
 //! \pre t must be valid task handle
 //! \pre stf_task_start() must have been called
 //!
+//! \note On success \p out_stream is set to the grid index's stream; on failure it is left
+//! untouched and a non-zero code is returned. STF grids always use non-default streams, so a
+//! valid result is never the legacy default stream (CUstream 0).
+//!
 //! \par Example:
 //! \code
 //! stf_dim4 dims;