From befa7682d6a4c3cb32bdabfa3739d1cbfd7a2d83 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 3 Nov 2025 13:47:24 -0800
Subject: [PATCH 01/15] Implement non-pooling memory allocation.

---
 .../_memory/_device_memory_resource.pyx       | 47 +++++++++++--
 cuda_core/tests/test_graph.py                 | 68 +++++++++++++++++++
 cuda_core/tests/test_memory.py                | 63 ++++++++++++++---
 3 files changed, 162 insertions(+), 16 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
index 47b6fd114e..47cef28a55 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
@@ -48,9 +48,15 @@ cdef class DeviceMemoryResourceOptions:
     max_size : int, optional
         Maximum pool size. When set to 0, defaults to a system-dependent value.
         (Default to 0)
+
+    mempool_enabled : bool, optional
+        Whether to use a memory pool. Pool-based allocations cannot be captured
+        in a graph but are the only ones that support sharing via IPC.
+        (Default to True)
     """
-    ipc_enabled : cython.bint = False
-    max_size : cython.size_t = 0
+    ipc_enabled     : cython.bint   = False
+    max_size        : cython.size_t = 0
+    mempool_enabled : cython.bint   = True
 
 
 cdef class DeviceMemoryResourceAttributes:
@@ -220,8 +226,10 @@ cdef class DeviceMemoryResource(MemoryResource):
 
         if opts is None:
             DMR_init_current(self, dev_id)
+        elif opts.mempool_enabled:
+            DMR_init_create_mempool(self, dev_id, opts)
         else:
-            DMR_init_create(self, dev_id, opts)
+            DMR_init_no_mempool(self, dev_id, opts)
 
     def __dealloc__(self):
         DMR_close(self)
@@ -343,7 +351,8 @@ cdef class DeviceMemoryResource(MemoryResource):
 
     @property
     def attributes(self) -> DeviceMemoryResourceAttributes:
-        if self._attributes is None:
+        """Memory pool attributes or None if memory pooling is not enabled."""
+        if self._attributes is None and self.is_mempool_enabled:
             ref = weakref.ref(self)
             self._attributes = DeviceMemoryResourceAttributes._init(ref)
         return self._attributes
@@ -356,6 +365,8 @@ cdef class DeviceMemoryResource(MemoryResource):
     @property
     def handle(self) -> driver.CUmemoryPool:
         """Handle to the underlying memory pool."""
+        if self._handle == NULL:
+            raise RuntimeError("Memory resource has no memory pool.")
         return driver.CUmemoryPool(<uintptr_t>(self._handle))
 
     @property
@@ -386,6 +397,11 @@ cdef class DeviceMemoryResource(MemoryResource):
         """
         return self._ipc_data is not None and self._ipc_data._is_mapped
 
+    @property
+    def is_mempool_enabled(self) -> bool:
+        """Whether this memory resource uses a memory pool."""
+        return self._handle != NULL
+
     @property
     def uuid(self) -> Optional[uuid.UUID]:
         """
@@ -430,7 +446,7 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id):
             ))
 
 
-cdef void DMR_init_create(
+cdef void DMR_init_create_mempool(
     DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts
 ):
     # Create a new memory pool.
@@ -460,11 +476,28 @@ cdef void DMR_init_create(
         self._ipc_data = IPCData(alloc_handle, mapped=False)
 
 
+cdef void DMR_init_no_mempool(
+    DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts
+):
+    # Create mr without a memory pool.
+    if opts.ipc_enabled:
+        raise TypeError("Cannot create an IPC-enabled memory resource without "
+                           "memory pooling enabled.")
+    if opts.max_size != 0:
+        raise TypeError("Cannot supply max_size without memory pooling enabled.")
+
+    self._dev_id = dev_id
+
+
 cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
     cdef cydriver.CUstream s = stream._handle
     cdef cydriver.CUdeviceptr devptr
-    with nogil:
-        HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s))
+    if self.is_mempool_enabled:
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s))
+    else:
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s))
     cdef Buffer buf = Buffer.__new__(Buffer)
     buf._ptr = <uintptr_t>(devptr)
     buf._ptr_obj = None
diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py
index cc558b6d22..d4312363cb 100644
--- a/cuda_core/tests/test_graph.py
+++ b/cuda_core/tests/test_graph.py
@@ -11,6 +11,8 @@
     from cuda import nvrtc
 from cuda.core.experimental import (
     Device,
+    DeviceMemoryResource,
+    DeviceMemoryResourceOptions,
     GraphBuilder,
     GraphCompleteOptions,
     GraphDebugPrintOptions,
@@ -21,6 +23,7 @@
     launch,
 )
 from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return
+from helpers.buffers import compare_equal_buffers, make_scratch_buffer
 
 
 def _common_kernels():
@@ -61,6 +64,30 @@ def _common_kernels_conditional():
     return mod
 
 
+def _common_kernels_alloc():
+    code = """
+    __global__ void set_zero(char *a, size_t nbytes) {
+        size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+        size_t stride = blockDim.x * gridDim.x;
+        for (size_t i = idx; i < nbytes; i += stride) {
+            a[i] = 0;
+        }
+    }
+    __global__ void add_one(char *a, size_t nbytes) {
+        size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+        size_t stride = blockDim.x * gridDim.x;
+        for (size_t i = idx; i < nbytes; i += stride) {
+            a[i] += 1;
+        }
+    }
+    """
+    arch = "".join(f"{i}" for i in Device().compute_capability)
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile("cubin", name_expressions=("set_zero", "add_one"))
+    return mod
+
+
 def test_graph_is_building(init_cuda):
     gb = Device().create_graph_builder()
     assert gb.is_building is False
@@ -747,3 +774,44 @@ def test_graph_build_mode(init_cuda):
 
     with pytest.raises(ValueError, match="^Unsupported build mode:"):
         gb = Device().create_graph_builder().begin_building(mode=None)
+
+
+def test_graph_alloc(init_cuda):
+    device = Device()
+    stream = device.create_stream()
+    options = DeviceMemoryResourceOptions(mempool_enabled=False)
+    mr = DeviceMemoryResource(device, options=options)
+
+    # Get kernels.
+    mod = _common_kernels_alloc()
+    set_zero = mod.get_kernel("set_zero")
+    add_one = mod.get_kernel("add_one")
+
+    NBYTES = 64
+    target = mr.allocate(NBYTES, stream=stream)
+
+    # Begin graph capture.
+    gb = Device().create_graph_builder().begin_building(mode="thread_local")
+
+    work_buffer = mr.allocate(NBYTES, stream=gb.stream)
+    launch(gb, LaunchConfig(grid=1, block=1), set_zero, int(work_buffer.handle), NBYTES)
+    launch(gb, LaunchConfig(grid=1, block=1), add_one, int(work_buffer.handle), NBYTES)
+    launch(gb, LaunchConfig(grid=1, block=1), add_one, int(work_buffer.handle), NBYTES)
+    target.copy_from(work_buffer, stream=gb.stream)
+
+    # Finalize the graph.
+    graph = gb.end_building().complete()
+
+    # Upload and launch
+    graph.upload(stream)
+    graph.launch(stream)
+    stream.sync()
+
+    # Check the result.
+    expected_buffer = make_scratch_buffer(device, 2, NBYTES)
+    compare_buffer = make_scratch_buffer(device, 0, NBYTES)
+    compare_buffer.copy_from(target, stream=stream)
+    stream.sync()
+    assert compare_equal_buffers(expected_buffer, compare_buffer)
+
+# TODO: check that mr.attributes is None with mempool_enabled=False
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index a261ec7a3d..5a8f798687 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -285,19 +285,28 @@ def test_buffer_dunder_dlpack_device_failure():
         buffer.__dlpack_device__()
 
 
+@pytest.mark.parametrize("use_mempool", [True, False])
 @pytest.mark.parametrize("use_device_object", [True, False])
-def test_device_memory_resource_initialization(mempool_device, use_device_object):
+def test_device_memory_resource_initialization(use_mempool, use_device_object):
     """Test that DeviceMemoryResource can be initialized successfully.
 
     This test verifies that the DeviceMemoryResource initializes properly,
     including the release threshold configuration for performance optimization.
     """
-    device = mempool_device
+    device = Device()
+
+    if use_mempool and not device.properties.memory_pools_supported:
+        pytest.skip("Device does not support mempool operations")
+
+    device.set_current()
 
     # This should succeed and configure the memory pool release threshold.
     # The resource can be constructed from either a device or device ordinal.
     device_arg = device if use_device_object else device.device_id
-    mr = DeviceMemoryResource(device_arg)
+    if use_mempool:
+        mr = DeviceMemoryResource(device_arg)
+    else:
+        mr = DeviceMemoryResource(device_arg, options={'mempool_enabled': False})
 
     # Verify basic properties
     assert mr.device_id == device.device_id
@@ -312,6 +321,18 @@ def test_device_memory_resource_initialization(mempool_device, use_device_object
     buffer.close()
 
 
+def test_device_memory_resource_initialization_errors():
+    """Test illegal construct option combinations."""
+    device = Device()
+    device.set_current()
+
+    with pytest.raises(TypeError, match="Cannot create an IPC-enabled memory resource without memory pooling enabled"):
+        DeviceMemoryResource(device, {"mempool_enabled": False, "ipc_enabled": True})
+
+    with pytest.raises(TypeError, match="Cannot supply max_size without memory pooling enabled"):
+        DeviceMemoryResource(device, {"mempool_enabled": False, "max_size": 1024})
+
+
 def test_vmm_allocator_basic_allocation():
     """Test basic VMM allocation functionality.
 
@@ -481,12 +502,23 @@ def test_vmm_allocator_rdma_unsupported_exception():
         VirtualMemoryResource(device, config=options)
 
 
-def test_mempool(mempool_device):
-    device = mempool_device
+@pytest.mark.parametrize("use_mempool", [True, False])
+def test_device_memory_resource(use_mempool):
+
+    device = Device()
+
+    if use_mempool and not device.properties.memory_pools_supported:
+        pytest.skip("Device does not support mempool operations")
+
+    device.set_current()
 
     # Test basic pool creation
-    options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False)
-    mr = DeviceMemoryResource(device, options=options)
+    if use_mempool:
+        options = DeviceMemoryResourceOptions(max_size=POOL_SIZE)
+        mr = DeviceMemoryResource(device, options=options)
+    else:
+        options = DeviceMemoryResourceOptions(mempool_enabled=False)
+        mr = DeviceMemoryResource(device, options=options)
     assert mr.device_id == device.device_id
     assert mr.is_device_accessible
     assert not mr.is_host_accessible
@@ -523,8 +555,12 @@ def test_mempool(mempool_device):
     dst_buffer.close()
     src_buffer.close()
 
-    # Test error cases
-    # Test IPC operations are disabled
+
+def test_mempool_ipc_errors(mempool_device):
+    """Test error cases when IPC operations are disabled."""
+    device = mempool_device
+    options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False)
+    mr = DeviceMemoryResource(device, options=options)
     buffer = mr.allocate(64)
     ipc_error_msg = "Memory resource is not IPC-enabled"
 
@@ -599,6 +635,14 @@ def test_mempool_attributes(ipc_enabled, mempool_device, property_name, expected
         assert value >= current_value, f"{property_name} should be >= {current_prop}"
 
 
+def test_mempool_no_attributes():
+    """Ensure mempool attributes cannot be accessed when memory pooling is disabled."""
+    device = Device()
+    device.set_current()
+    mr = DeviceMemoryResource(device, options={"mempool_enabled": False})
+    assert mr.attributes is None
+
+
 def test_mempool_attributes_ownership(mempool_device):
     """Ensure the attributes bundle handles references correctly."""
     device = mempool_device
@@ -644,3 +688,4 @@ def test_strided_memory_view_refcnt():
     assert av.strides[0] == 1
     assert av.strides[1] == 64
     assert sys.getrefcount(av.strides) >= 2
+

From 88834f7361d419a6ab0c4b66c9aa91d89325cba8 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Tue, 4 Nov 2025 16:09:59 -0800
Subject: [PATCH 02/15] Add GraphMemoryResource.

---
 cuda_core/cuda/core/experimental/__init__.py  |   1 +
 .../core/experimental/_memory/__init__.py     |   1 +
 .../_memory/_device_memory_resource.pyx       |   6 +
 .../cuda/core/experimental/_memory/_gmr.pxd   |  13 ++
 .../cuda/core/experimental/_memory/_gmr.pyx   | 160 ++++++++++++++++++
 cuda_core/tests/test_graph.py                 |  89 ++++++++--
 cuda_core/tests/test_memory.py                |  30 +++-
 7 files changed, 281 insertions(+), 19 deletions(-)
 create mode 100644 cuda_core/cuda/core/experimental/_memory/_gmr.pxd
 create mode 100644 cuda_core/cuda/core/experimental/_memory/_gmr.pyx

diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py
index 8a60c031c5..67402e10a6 100644
--- a/cuda_core/cuda/core/experimental/__init__.py
+++ b/cuda_core/cuda/core/experimental/__init__.py
@@ -42,6 +42,7 @@
     Buffer,
     DeviceMemoryResource,
     DeviceMemoryResourceOptions,
+    GraphMemoryResource,
     LegacyPinnedMemoryResource,
     MemoryResource,
     VirtualMemoryResource,
diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py
index 3c07fbdde6..8dd4385982 100644
--- a/cuda_core/cuda/core/experimental/_memory/__init__.py
+++ b/cuda_core/cuda/core/experimental/_memory/__init__.py
@@ -4,6 +4,7 @@
 
 from ._buffer import *  # noqa: F403
 from ._device_memory_resource import *  # noqa: F403
+from ._gmr import *  # noqa: F403
 from ._ipc import *  # noqa: F403
 from ._legacy import *  # noqa: F403
 from ._virtual_memory_resource import *  # noqa: F403
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
index 47cef28a55..067cb5ff16 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
@@ -72,6 +72,12 @@ cdef class DeviceMemoryResourceAttributes:
         self._mr_weakref = mr
         return self
 
+    def __repr__(self):
+        return f"{self.__class__.__name__}(%s)" % ", ".join(
+            f"{attr}={getattr(self, attr)}" for attr in dir(self)
+                                            if not attr.startswith("_")
+        )
+
     @DMRA_mempool_attribute(bool)
     def reuse_follow_event_dependencies(self):
         """Allow memory to be reused when there are event dependencies between streams."""
diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pxd b/cuda_core/cuda/core/experimental/_memory/_gmr.pxd
new file mode 100644
index 0000000000..0feb4a398b
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_memory/_gmr.pxd
@@ -0,0 +1,13 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.bindings cimport cydriver
+from cuda.core.experimental._memory._buffer cimport MemoryResource
+from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource
+from cuda.core.experimental._memory._ipc cimport IPCData
+
+
+cdef class cyGraphMemoryResource(MemoryResource):
+    cdef:
+        int _dev_id
diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
new file mode 100644
index 0000000000..34999f5a54
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
@@ -0,0 +1,160 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.stdint cimport uintptr_t, intptr_t
+
+from cuda.bindings cimport cydriver
+from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource
+from cuda.core.experimental._stream cimport Stream
+from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
+
+from functools import cache
+from typing import TYPE_CHECKING
+
+from cuda.core.experimental._utils.cuda_utils import driver
+
+if TYPE_CHECKING:
+    from cuda.core.experimental._memory.buffer import DevicePointerT
+
+__all__ = ['GraphMemoryResource']
+
+
+cdef class GraphMemoryResourceAttributes:
+    cdef:
+        int _dev_id
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError("GraphMemoryResourceAttributes cannot be instantiated directly. Please use MemoryResource APIs.")
+
+    @classmethod
+    def _init(cls, device_id: int):
+        cdef GraphMemoryResourceAttributes self = GraphMemoryResourceAttributes.__new__(cls)
+        self._dev_id = device_id
+        return self
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(%s)" % ", ".join(
+            f"{attr}={getattr(self, attr)}" for attr in dir(self)
+                                            if not attr.startswith("_")
+        )
+
+    @GMRA_mem_attribute(int)
+    def reserved_mem_current(self):
+        """Current amount of backing memory allocated."""
+
+    @GMRA_mem_attribute(int, settable=True)
+    def reserved_mem_high(self):
+        """High watermark of backing memory allocated."""
+
+    @GMRA_mem_attribute(int)
+    def used_mem_current(self):
+        """Current amount of memory in use."""
+
+    @GMRA_mem_attribute(int, settable=True)
+    def used_mem_high(self):
+        """High watermark of memory in use."""
+
+
+cdef GMRA_mem_attribute(property_type: type, settable : bool = False):
+    _settable = settable
+
+    def decorator(stub):
+        attr_enum = getattr(
+            driver.CUgraphMem_attribute, f"CU_GRAPH_MEM_ATTR_{stub.__name__.upper()}"
+        )
+
+        def fget(GraphMemoryResourceAttributes self) -> property_type:
+            value = GMRA_getattribute(self._dev_id, <cydriver.CUgraphMem_attribute><uintptr_t> attr_enum)
+            return property_type(value)
+
+        if _settable:
+            def fset(GraphMemoryResourceAttributes self, value: int):
+                GMRA_setattribute(self._dev_id, <cydriver.CUgraphMem_attribute><uintptr_t> attr_enum, value)
+        else:
+            fset = None
+
+        return property(fget=fget, fset=fset, doc=stub.__doc__)
+    return decorator
+
+
+cdef int GMRA_getattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum):
+    cdef int value
+    with nogil:
+        HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(device_id, attr_enum, <void *> &value))
+    return value
+
+
+cdef int GMRA_setattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum, int value):
+    with nogil:
+        HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(device_id, attr_enum, <void *> &value))
+
+
+cdef class cyGraphMemoryResource(MemoryResource):
+    def __cinit__(self, int device_id):
+        self._dev_id = device_id
+
+    def allocate(self, size_t size, Stream stream = None) -> Buffer:
+        return GMR_allocate(self, size, stream)
+
+    def deallocate(self, ptr: DevicePointerT, size_t size, Stream stream = None):
+        return GMR_deallocate(ptr, size, stream)
+
+    def close(self):
+        pass
+
+    def trim(self):
+        """Free unused memory that was cached on the specified device for use with graphs back to the OS."""
+        with nogil:
+             HANDLE_RETURN(cydriver.cuDeviceGraphMemTrim(self._dev_id))
+
+    @property
+    def attributes(self) -> GraphMemoryResourceAttributes:
+        """Asynchronous allocation attributes related to graphs."""
+        return GraphMemoryResourceAttributes._init(self._dev_id)
+
+    @property
+    def device_id(self) -> int:
+        """The associated device ordinal."""
+        return self._dev_id
+
+    @property
+    def is_device_accessible(self) -> bool:
+        """Return True. This memory resource provides device-accessible buffers."""
+        return True
+
+    @property
+    def is_host_accessible(self) -> bool:
+        """Return False. This memory resource does not provide host-accessible buffers."""
+        return False
+
+
+class GraphMemoryResource(cyGraphMemoryResource):
+    @cache
+    def __new__(cls, device_id: int | Device):
+        cdef int c_device_id = getattr(device_id, 'device_id', device_id)
+        return cyGraphMemoryResource.__new__(cls, c_device_id)
+
+
+cdef Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream):
+    cdef cydriver.CUstream s = stream._handle
+    cdef cydriver.CUdeviceptr devptr
+    with nogil:
+        HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s))
+    cdef Buffer buf = Buffer.__new__(Buffer)
+    buf._ptr = <intptr_t>(devptr)
+    buf._ptr_obj = None
+    buf._size = size
+    buf._memory_resource = self
+    buf._alloc_stream = stream
+    return buf
+
+
+cdef void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept:
+    cdef cydriver.CUstream s = stream._handle
+    cdef cydriver.CUdeviceptr devptr = <cydriver.CUdeviceptr>ptr
+    with nogil:
+        HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s))
+
diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py
index d4312363cb..8539fa9a24 100644
--- a/cuda_core/tests/test_graph.py
+++ b/cuda_core/tests/test_graph.py
@@ -16,6 +16,7 @@
     GraphBuilder,
     GraphCompleteOptions,
     GraphDebugPrintOptions,
+    GraphMemoryResource,
     LaunchConfig,
     LegacyPinnedMemoryResource,
     Program,
@@ -776,42 +777,96 @@ def test_graph_build_mode(init_cuda):
         gb = Device().create_graph_builder().begin_building(mode=None)
 
 
-def test_graph_alloc(init_cuda):
+@pytest.mark.parametrize("use_graph", [False, True])
+def test_graph_alloc(init_cuda, use_graph):
+    """Test graph capture with memory allocated by GraphMemoryResource."""
     device = Device()
     stream = device.create_stream()
-    options = DeviceMemoryResourceOptions(mempool_enabled=False)
-    mr = DeviceMemoryResource(device, options=options)
 
     # Get kernels.
     mod = _common_kernels_alloc()
     set_zero = mod.get_kernel("set_zero")
     add_one = mod.get_kernel("add_one")
 
+    # Allocate a non-graph target buffer in device memory. The work sequence
+    # will fill it.
     NBYTES = 64
-    target = mr.allocate(NBYTES, stream=stream)
+    mr = DeviceMemoryResource(device)
+    tgt_buffer = mr.allocate(NBYTES, stream=stream)
+
+    # Get the memory resource for graphs.
+    gmr = GraphMemoryResource(device)
+    if use_graph:
+        # Trim memory to zero and reset high watermarks.
+        gmr.trim()
+        # gmr.attributes.reserved_mem_high = 0  ## not working
+        # gmr.attributes.used_mem_high = 0
+
+        assert gmr.attributes.reserved_mem_current == 0
+        assert gmr.attributes.reserved_mem_high == 0
+        assert gmr.attributes.used_mem_current == 0
+        assert gmr.attributes.used_mem_high == 0
+
+    # ====== Begin work sequence ======
 
     # Begin graph capture.
-    gb = Device().create_graph_builder().begin_building(mode="thread_local")
+    if use_graph:
+        gb = Device().create_graph_builder().begin_building(mode="thread_local")
 
-    work_buffer = mr.allocate(NBYTES, stream=gb.stream)
-    launch(gb, LaunchConfig(grid=1, block=1), set_zero, int(work_buffer.handle), NBYTES)
-    launch(gb, LaunchConfig(grid=1, block=1), add_one, int(work_buffer.handle), NBYTES)
-    launch(gb, LaunchConfig(grid=1, block=1), add_one, int(work_buffer.handle), NBYTES)
-    target.copy_from(work_buffer, stream=gb.stream)
+    # Set up resources for use in the captured sequence.
+    cap_mr = gmr if use_graph else mr
+    cap_stream = gb.stream if use_graph else stream  # FIXME: should use gb here, not gb.stream
 
-    # Finalize the graph.
-    graph = gb.end_building().complete()
+    # Perform or capture the work.
+    work_buffer = cap_mr.allocate(NBYTES, stream=cap_stream)
+    for kernel in [set_zero, add_one, add_one]:
+        launch(cap_stream, LaunchConfig(grid=1, block=1), kernel, int(work_buffer.handle), NBYTES)
+    tgt_buffer.copy_from(work_buffer, stream=cap_stream)
+
+    if use_graph:
+        # Finalize the graph.
+        graph = gb.end_building().complete()
+
+        # Upload and launch
+        graph.upload(stream)
+        graph.launch(stream)
 
-    # Upload and launch
-    graph.upload(stream)
-    graph.launch(stream)
     stream.sync()
 
+    # ====== End work sequence ======
+
     # Check the result.
     expected_buffer = make_scratch_buffer(device, 2, NBYTES)
     compare_buffer = make_scratch_buffer(device, 0, NBYTES)
-    compare_buffer.copy_from(target, stream=stream)
+    compare_buffer.copy_from(tgt_buffer, stream=stream)
     stream.sync()
     assert compare_equal_buffers(expected_buffer, compare_buffer)
 
-# TODO: check that mr.attributes is None with mempool_enabled=False
+    # Check memory usage.
+    if use_graph:
+        # See that the graph-specific pool has been used.
+        assert gmr.attributes.reserved_mem_current > 0
+        assert gmr.attributes.reserved_mem_high > 0
+        assert gmr.attributes.used_mem_current > 0
+        assert gmr.attributes.used_mem_high > 0
+
+        work_buffer.close()
+
+        # See that the non-graph-specific pool has one allocation.
+        assert mr.attributes.used_mem_current == NBYTES
+        tgt_buffer.close()
+        assert mr.attributes.used_mem_current == 0
+
+    else:
+        # See that the graph-specific pool has not been used.
+        assert gmr.attributes.reserved_mem_current == 0
+        assert gmr.attributes.reserved_mem_high == 0
+        assert gmr.attributes.used_mem_current == 0
+        assert gmr.attributes.used_mem_high == 0
+
+        # See that the non-graph-specific pool has two allocations.
+        assert mr.attributes.used_mem_current == 2 * NBYTES
+        tgt_buffer.close()
+        work_buffer.close()
+        assert mr.attributes.used_mem_current == 0
+
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 5a8f798687..e50e46dc90 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -13,6 +13,7 @@
     np = None
 import ctypes
 import platform
+import re
 
 import pytest
 from cuda.core.experimental import (
@@ -20,6 +21,7 @@
     Device,
     DeviceMemoryResource,
     DeviceMemoryResourceOptions,
+    GraphMemoryResource,
     MemoryResource,
     VirtualMemoryResource,
     VirtualMemoryResourceOptions,
@@ -133,6 +135,7 @@ def test_package_contents():
         "MemoryResource",
         "DeviceMemoryResource",
         "DeviceMemoryResourceOptions",
+        "GraphMemoryResource",
         "IPCBufferDescriptor",
         "IPCAllocationHandle",
         "LegacyPinnedMemoryResource",
@@ -306,7 +309,7 @@ def test_device_memory_resource_initialization(use_mempool, use_device_object):
     if use_mempool:
         mr = DeviceMemoryResource(device_arg)
     else:
-        mr = DeviceMemoryResource(device_arg, options={'mempool_enabled': False})
+        mr = DeviceMemoryResource(device_arg, options={"mempool_enabled": False})
 
     # Verify basic properties
     assert mr.device_id == device.device_id
@@ -504,7 +507,6 @@ def test_vmm_allocator_rdma_unsupported_exception():
 
 @pytest.mark.parametrize("use_mempool", [True, False])
 def test_device_memory_resource(use_mempool):
-
     device = Device()
 
     if use_mempool and not device.properties.memory_pools_supported:
@@ -635,6 +637,22 @@ def test_mempool_attributes(ipc_enabled, mempool_device, property_name, expected
         assert value >= current_value, f"{property_name} should be >= {current_prop}"
 
 
+def test_mempool_attributes_repr(mempool_device):
+    device = Device()
+    device.set_current()
+    mr = DeviceMemoryResource(device, options={"max_size": 2048})
+    buffer1 = mr.allocate(64)
+    buffer2 = mr.allocate(64)
+    buffer1.close()
+    assert re.match(
+        r"DeviceMemoryResourceAttributes\(release_threshold=\d+, reserved_mem_current=\d+, reserved_mem_high=\d+, "
+        r"reuse_allow_internal_dependencies=(True|False), reuse_allow_opportunistic=(True|False), "
+        r"reuse_follow_event_dependencies=(True|False), used_mem_current=64, used_mem_high=128\)",
+        str(mr.attributes),
+    )
+    buffer2.close()
+
+
 def test_mempool_no_attributes():
     """Ensure mempool attributes cannot be accessed when memory pooling is disabled."""
     device = Device()
@@ -689,3 +707,11 @@ def test_strided_memory_view_refcnt():
     assert av.strides[1] == 64
     assert sys.getrefcount(av.strides) >= 2
 
+def test_graph_memory_resource_object(init_cuda):
+    device = Device()
+    gmr1 = GraphMemoryResource(device)
+    gmr2 = GraphMemoryResource(device)
+
+    # These objects are interned.
+    assert gmr1 is gmr2
+    assert gmr1 == gmr2

From 57855a1735b8e756ce82a050b75a502d8f99bada Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Wed, 5 Nov 2025 12:47:01 -0800
Subject: [PATCH 03/15] Remove mempool_enabled option now that
 GraphMemoryResource is ready.

---
 .../_memory/_device_memory_resource.pyx       | 48 ++++---------------
 cuda_core/tests/test_memory.py                | 43 +++--------------
 2 files changed, 15 insertions(+), 76 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
index 067cb5ff16..752c5f7950 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
@@ -48,15 +48,9 @@ cdef class DeviceMemoryResourceOptions:
     max_size : int, optional
         Maximum pool size. When set to 0, defaults to a system-dependent value.
         (Default to 0)
-
-    mempool_enabled : bool, optional
-        Whether to use a memory pool. Pool-based allocations cannot be captured
-        in a graph but are the only ones that support sharing via IPC.
-        (Default to True)
     """
-    ipc_enabled     : cython.bint   = False
-    max_size        : cython.size_t = 0
-    mempool_enabled : cython.bint   = True
+    ipc_enabled: cython.bint   = False
+    max_size   : cython.size_t = 0
 
 
 cdef class DeviceMemoryResourceAttributes:
@@ -232,10 +226,8 @@ cdef class DeviceMemoryResource(MemoryResource):
 
         if opts is None:
             DMR_init_current(self, dev_id)
-        elif opts.mempool_enabled:
-            DMR_init_create_mempool(self, dev_id, opts)
         else:
-            DMR_init_no_mempool(self, dev_id, opts)
+            DMR_init_create(self, dev_id, opts)
 
     def __dealloc__(self):
         DMR_close(self)
@@ -357,8 +349,8 @@ cdef class DeviceMemoryResource(MemoryResource):
 
     @property
     def attributes(self) -> DeviceMemoryResourceAttributes:
-        """Memory pool attributes or None if memory pooling is not enabled."""
-        if self._attributes is None and self.is_mempool_enabled:
+        """Memory pool attributes."""
+        if self._attributes is None:
             ref = weakref.ref(self)
             self._attributes = DeviceMemoryResourceAttributes._init(ref)
         return self._attributes
@@ -371,8 +363,6 @@ cdef class DeviceMemoryResource(MemoryResource):
     @property
     def handle(self) -> driver.CUmemoryPool:
         """Handle to the underlying memory pool."""
-        if self._handle == NULL:
-            raise RuntimeError("Memory resource has no memory pool.")
         return driver.CUmemoryPool(<uintptr_t>(self._handle))
 
     @property
@@ -403,11 +393,6 @@ cdef class DeviceMemoryResource(MemoryResource):
         """
         return self._ipc_data is not None and self._ipc_data._is_mapped
 
-    @property
-    def is_mempool_enabled(self) -> bool:
-        """Whether this memory resource uses a memory pool."""
-        return self._handle != NULL
-
     @property
     def uuid(self) -> Optional[uuid.UUID]:
         """
@@ -452,7 +437,7 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id):
             ))
 
 
-cdef void DMR_init_create_mempool(
+cdef void DMR_init_create(
     DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts
 ):
     # Create a new memory pool.
@@ -482,28 +467,11 @@ cdef void DMR_init_create_mempool(
         self._ipc_data = IPCData(alloc_handle, mapped=False)
 
 
-cdef void DMR_init_no_mempool(
-    DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts
-):
-    # Create mr without a memory pool.
-    if opts.ipc_enabled:
-        raise TypeError("Cannot create an IPC-enabled memory resource without "
-                           "memory pooling enabled.")
-    if opts.max_size != 0:
-        raise TypeError("Cannot supply max_size without memory pooling enabled.")
-
-    self._dev_id = dev_id
-
-
 cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
     cdef cydriver.CUstream s = stream._handle
     cdef cydriver.CUdeviceptr devptr
-    if self.is_mempool_enabled:
-        with nogil:
-            HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s))
-    else:
-        with nogil:
-            HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s))
+    with nogil:
+        HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s))
     cdef Buffer buf = Buffer.__new__(Buffer)
     buf._ptr = <uintptr_t>(devptr)
     buf._ptr_obj = None
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index e50e46dc90..fe47ab3b90 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -288,9 +288,8 @@ def test_buffer_dunder_dlpack_device_failure():
         buffer.__dlpack_device__()
 
 
-@pytest.mark.parametrize("use_mempool", [True, False])
 @pytest.mark.parametrize("use_device_object", [True, False])
-def test_device_memory_resource_initialization(use_mempool, use_device_object):
+def test_device_memory_resource_initialization(use_device_object):
     """Test that DeviceMemoryResource can be initialized successfully.
 
     This test verifies that the DeviceMemoryResource initializes properly,
@@ -298,7 +297,7 @@ def test_device_memory_resource_initialization(use_mempool, use_device_object):
     """
     device = Device()
 
-    if use_mempool and not device.properties.memory_pools_supported:
+    if not device.properties.memory_pools_supported:
         pytest.skip("Device does not support mempool operations")
 
     device.set_current()
@@ -306,10 +305,7 @@ def test_device_memory_resource_initialization(use_mempool, use_device_object):
     # This should succeed and configure the memory pool release threshold.
     # The resource can be constructed from either a device or device ordinal.
     device_arg = device if use_device_object else device.device_id
-    if use_mempool:
-        mr = DeviceMemoryResource(device_arg)
-    else:
-        mr = DeviceMemoryResource(device_arg, options={"mempool_enabled": False})
+    mr = DeviceMemoryResource(device_arg)
 
     # Verify basic properties
     assert mr.device_id == device.device_id
@@ -324,18 +320,6 @@ def test_device_memory_resource_initialization(use_mempool, use_device_object):
     buffer.close()
 
 
-def test_device_memory_resource_initialization_errors():
-    """Test illegal construct option combinations."""
-    device = Device()
-    device.set_current()
-
-    with pytest.raises(TypeError, match="Cannot create an IPC-enabled memory resource without memory pooling enabled"):
-        DeviceMemoryResource(device, {"mempool_enabled": False, "ipc_enabled": True})
-
-    with pytest.raises(TypeError, match="Cannot supply max_size without memory pooling enabled"):
-        DeviceMemoryResource(device, {"mempool_enabled": False, "max_size": 1024})
-
-
 def test_vmm_allocator_basic_allocation():
     """Test basic VMM allocation functionality.
 
@@ -505,22 +489,17 @@ def test_vmm_allocator_rdma_unsupported_exception():
         VirtualMemoryResource(device, config=options)
 
 
-@pytest.mark.parametrize("use_mempool", [True, False])
-def test_device_memory_resource(use_mempool):
+def test_device_memory_resource():
     device = Device()
 
-    if use_mempool and not device.properties.memory_pools_supported:
+    if not device.properties.memory_pools_supported:
         pytest.skip("Device does not support mempool operations")
 
     device.set_current()
 
     # Test basic pool creation
-    if use_mempool:
-        options = DeviceMemoryResourceOptions(max_size=POOL_SIZE)
-        mr = DeviceMemoryResource(device, options=options)
-    else:
-        options = DeviceMemoryResourceOptions(mempool_enabled=False)
-        mr = DeviceMemoryResource(device, options=options)
+    options = DeviceMemoryResourceOptions(max_size=POOL_SIZE)
+    mr = DeviceMemoryResource(device, options=options)
     assert mr.device_id == device.device_id
     assert mr.is_device_accessible
     assert not mr.is_host_accessible
@@ -653,14 +632,6 @@ def test_mempool_attributes_repr(mempool_device):
     buffer2.close()
 
 
-def test_mempool_no_attributes():
-    """Ensure mempool attributes cannot be accessed when memory pooling is disabled."""
-    device = Device()
-    device.set_current()
-    mr = DeviceMemoryResource(device, options={"mempool_enabled": False})
-    assert mr.attributes is None
-
-
 def test_mempool_attributes_ownership(mempool_device):
     """Ensure the attributes bundle handles references correctly."""
     device = mempool_device

From 0375941fcd7849b7988a4da723af6c0c84da5237 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Wed, 5 Nov 2025 13:05:09 -0800
Subject: [PATCH 04/15] Add docstring and make GraphMemoryResource a singleton.

---
 .../_memory/_device_memory_resource.pyx       |  2 +-
 .../cuda/core/experimental/_memory/_gmr.pyx   | 29 +++++++++++++++++--
 cuda_core/tests/test_memory.py                |  5 ++--
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
index 752c5f7950..d7c120cecf 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
@@ -133,7 +133,7 @@ cdef int DMRA_getattribute(
 
 cdef class DeviceMemoryResource(MemoryResource):
     """
-    Create a device memory resource managing a stream-ordered memory pool.
+    A device memory resource managing a stream-ordered memory pool.
 
     Parameters
     ----------
diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
index 34999f5a54..23f09cc4c8 100644
--- a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
@@ -132,10 +132,35 @@ cdef class cyGraphMemoryResource(MemoryResource):
 
 
 class GraphMemoryResource(cyGraphMemoryResource):
-    @cache
+    """
+    A memory resource managing the graph-specific memory pool.
+
+    Graph-captured memory operations use a special internal memory pool, which
+    is a per-device singleton. This class serves as the interface to that pool.
+    The only supported operations are allocation, deallocation, and a limited
+    set of status queries.
+
+    This memory resource should be used to allocate memory when graph capturing
+    is enabled. Using this when graphs are not being captured will result in a
+    runtime error.
+
+    Conversely, allocating memory from a `DeviceMemoryResource` when graph
+    capturing is enabled results in a runtime error.
+
+    Parameters
+    ----------
+    device_id : int | Device
+        Device or Device ordinal for which a graph memory resource is obtained.
+    """
+
     def __new__(cls, device_id: int | Device):
         cdef int c_device_id = getattr(device_id, 'device_id', device_id)
-        return cyGraphMemoryResource.__new__(cls, c_device_id)
+        return cls._create(c_device_id)
+
+    @classmethod
+    @cache
+    def _create(cls, int device_id):
+        return cyGraphMemoryResource.__new__(cls, device_id)
 
 
 cdef Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream):
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index fe47ab3b90..373cecb9b6 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -682,7 +682,8 @@ def test_graph_memory_resource_object(init_cuda):
     device = Device()
     gmr1 = GraphMemoryResource(device)
     gmr2 = GraphMemoryResource(device)
+    gmr3 = GraphMemoryResource(device.device_id)
 
     # These objects are interned.
-    assert gmr1 is gmr2
-    assert gmr1 == gmr2
+    assert gmr1 is gmr2 is gmr3
+    assert gmr1 == gmr2 == gmr3

From 0b82b1fa6f6f8da0e002b101b7d6b64bf4d22264 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Wed, 5 Nov 2025 15:38:52 -0800
Subject: [PATCH 05/15] Move tests to a separate file.

---
 cuda_core/tests/test_graph.py     | 118 -----------------------------
 cuda_core/tests/test_graph_mem.py | 120 ++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+), 118 deletions(-)
 create mode 100644 cuda_core/tests/test_graph_mem.py

diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py
index 8539fa9a24..39810930fe 100644
--- a/cuda_core/tests/test_graph.py
+++ b/cuda_core/tests/test_graph.py
@@ -16,7 +16,6 @@
     GraphBuilder,
     GraphCompleteOptions,
     GraphDebugPrintOptions,
-    GraphMemoryResource,
     LaunchConfig,
     LegacyPinnedMemoryResource,
     Program,
@@ -65,30 +64,6 @@ def _common_kernels_conditional():
     return mod
 
 
-def _common_kernels_alloc():
-    code = """
-    __global__ void set_zero(char *a, size_t nbytes) {
-        size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-        size_t stride = blockDim.x * gridDim.x;
-        for (size_t i = idx; i < nbytes; i += stride) {
-            a[i] = 0;
-        }
-    }
-    __global__ void add_one(char *a, size_t nbytes) {
-        size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-        size_t stride = blockDim.x * gridDim.x;
-        for (size_t i = idx; i < nbytes; i += stride) {
-            a[i] += 1;
-        }
-    }
-    """
-    arch = "".join(f"{i}" for i in Device().compute_capability)
-    program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
-    prog = Program(code, code_type="c++", options=program_options)
-    mod = prog.compile("cubin", name_expressions=("set_zero", "add_one"))
-    return mod
-
-
 def test_graph_is_building(init_cuda):
     gb = Device().create_graph_builder()
     assert gb.is_building is False
@@ -777,96 +752,3 @@ def test_graph_build_mode(init_cuda):
         gb = Device().create_graph_builder().begin_building(mode=None)
 
 
-@pytest.mark.parametrize("use_graph", [False, True])
-def test_graph_alloc(init_cuda, use_graph):
-    """Test graph capture with memory allocated by GraphMemoryResource."""
-    device = Device()
-    stream = device.create_stream()
-
-    # Get kernels.
-    mod = _common_kernels_alloc()
-    set_zero = mod.get_kernel("set_zero")
-    add_one = mod.get_kernel("add_one")
-
-    # Allocate a non-graph target buffer in device memory. The work sequence
-    # will fill it.
-    NBYTES = 64
-    mr = DeviceMemoryResource(device)
-    tgt_buffer = mr.allocate(NBYTES, stream=stream)
-
-    # Get the memory resource for graphs.
-    gmr = GraphMemoryResource(device)
-    if use_graph:
-        # Trim memory to zero and reset high watermarks.
-        gmr.trim()
-        # gmr.attributes.reserved_mem_high = 0  ## not working
-        # gmr.attributes.used_mem_high = 0
-
-        assert gmr.attributes.reserved_mem_current == 0
-        assert gmr.attributes.reserved_mem_high == 0
-        assert gmr.attributes.used_mem_current == 0
-        assert gmr.attributes.used_mem_high == 0
-
-    # ====== Begin work sequence ======
-
-    # Begin graph capture.
-    if use_graph:
-        gb = Device().create_graph_builder().begin_building(mode="thread_local")
-
-    # Set up resources for use in the captured sequence.
-    cap_mr = gmr if use_graph else mr
-    cap_stream = gb.stream if use_graph else stream  # FIXME: should use gb here, not gb.stream
-
-    # Perform or capture the work.
-    work_buffer = cap_mr.allocate(NBYTES, stream=cap_stream)
-    for kernel in [set_zero, add_one, add_one]:
-        launch(cap_stream, LaunchConfig(grid=1, block=1), kernel, int(work_buffer.handle), NBYTES)
-    tgt_buffer.copy_from(work_buffer, stream=cap_stream)
-
-    if use_graph:
-        # Finalize the graph.
-        graph = gb.end_building().complete()
-
-        # Upload and launch
-        graph.upload(stream)
-        graph.launch(stream)
-
-    stream.sync()
-
-    # ====== End work sequence ======
-
-    # Check the result.
-    expected_buffer = make_scratch_buffer(device, 2, NBYTES)
-    compare_buffer = make_scratch_buffer(device, 0, NBYTES)
-    compare_buffer.copy_from(tgt_buffer, stream=stream)
-    stream.sync()
-    assert compare_equal_buffers(expected_buffer, compare_buffer)
-
-    # Check memory usage.
-    if use_graph:
-        # See that the graph-specific pool has been used.
-        assert gmr.attributes.reserved_mem_current > 0
-        assert gmr.attributes.reserved_mem_high > 0
-        assert gmr.attributes.used_mem_current > 0
-        assert gmr.attributes.used_mem_high > 0
-
-        work_buffer.close()
-
-        # See that the non-graph-specific pool has one allocation.
-        assert mr.attributes.used_mem_current == NBYTES
-        tgt_buffer.close()
-        assert mr.attributes.used_mem_current == 0
-
-    else:
-        # See that the graph-specific pool has not been used.
-        assert gmr.attributes.reserved_mem_current == 0
-        assert gmr.attributes.reserved_mem_high == 0
-        assert gmr.attributes.used_mem_current == 0
-        assert gmr.attributes.used_mem_high == 0
-
-        # See that the non-graph-specific pool has two allocations.
-        assert mr.attributes.used_mem_current == 2 * NBYTES
-        tgt_buffer.close()
-        work_buffer.close()
-        assert mr.attributes.used_mem_current == 0
-
diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py
new file mode 100644
index 0000000000..780fb23dd7
--- /dev/null
+++ b/cuda_core/tests/test_graph_mem.py
@@ -0,0 +1,120 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import pytest
+
+from cuda.core.experimental import (
+    Device,
+    DeviceMemoryResource,
+    GraphMemoryResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    launch,
+)
+from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return
+from helpers.buffers import compare_equal_buffers, make_scratch_buffer
+
+def _common_kernels_alloc():
+    code = """
+    __global__ void set_zero(char *a, size_t nbytes) {
+        size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+        size_t stride = blockDim.x * gridDim.x;
+        for (size_t i = idx; i < nbytes; i += stride) {
+            a[i] = 0;
+        }
+    }
+    __global__ void add_one(char *a, size_t nbytes) {
+        size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+        size_t stride = blockDim.x * gridDim.x;
+        for (size_t i = idx; i < nbytes; i += stride) {
+            a[i] += 1;
+        }
+    }
+    """
+    arch = "".join(f"{i}" for i in Device().compute_capability)
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile("cubin", name_expressions=("set_zero", "add_one"))
+    return mod
+
+
+
+@pytest.mark.parametrize("repeat", [0,1,2])
+@pytest.mark.parametrize("use_graph", [False, True])
+def test_graph_alloc(init_cuda, use_graph, repeat):
+    """Test graph capture with memory allocated by GraphMemoryResource."""
+    NBYTES = 64
+    device = Device()
+    stream = device.create_stream()
+    dmr = DeviceMemoryResource(device)
+    gmr = GraphMemoryResource(device)
+
+    out = dmr.allocate(NBYTES, stream=stream)
+
+    # Get kernels and define the calling sequence.
+    mod = _common_kernels_alloc()
+    set_zero = mod.get_kernel("set_zero")
+    add_one = mod.get_kernel("add_one")
+
+    def apply_kernels(mr, stream, out):
+        buffer = mr.allocate(NBYTES, stream=stream)
+        config = LaunchConfig(grid=1, block=1)
+        for kernel in [set_zero, add_one, add_one]:
+            launch(stream, config, kernel, buffer, NBYTES)
+        out.copy_from(buffer, stream=stream)
+        buffer.close()
+
+    # ====== Begin work sequence ======
+    if use_graph:
+        # Trim memory to zero and reset high watermarks.
+        gmr.trim()
+        # gmr.attributes.reserved_mem_high = 0  ## not working
+        # gmr.attributes.used_mem_high = 0
+
+        assert gmr.attributes.reserved_mem_current == 0
+        # assert gmr.attributes.reserved_mem_high == 0
+        assert gmr.attributes.used_mem_current == 0
+        # assert gmr.attributes.used_mem_high == 0
+
+        # Begin graph capture.
+        gb = Device().create_graph_builder().begin_building(mode="thread_local")
+
+        # Capture work.
+        apply_kernels(mr=gmr, stream=gb.stream, out=out)
+
+        # Finalize the graph.
+        graph = gb.end_building().complete()
+
+        # Upload and launch
+        graph.upload(stream)
+        graph.launch(stream)
+    else:
+        # Do work without graph capture.
+        apply_kernels(mr=dmr, stream=stream, out=out)
+
+    stream.sync()
+    # ====== End work sequence ======
+
+    # Check the result on the host.
+    host_ans = make_scratch_buffer(device, 2, NBYTES)
+    host_tmp = make_scratch_buffer(device, 0, NBYTES)
+    host_tmp.copy_from(out, stream=stream)
+    stream.sync()
+    assert compare_equal_buffers(host_ans, host_tmp)
+    host_ans.close()
+    host_tmp.close()
+
+    # # Check memory usage.
+    # if use_graph:
+    #     assert dmr.attributes.used_mem_current == NBYTES
+    #     assert gmr.attributes.used_mem_current > 0
+    #     out.close()
+    #     assert gmr.attributes.used_mem_current == 0
+    # else:
+    #     assert dmr.attributes.used_mem_current == NBYTES
+    #     assert gmr.attributes.used_mem_current == 0
+    #     out.close()
+    #     assert dmr.attributes.used_mem_current == 0
+

From 53b1c58d2a3dd3ae7de50d359586d0128e84b32e Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Wed, 5 Nov 2025 17:15:22 -0800
Subject: [PATCH 06/15] Add errors for DeviceMemoryResource and
 GraphMemoryResource when graph capture state is not as expected.

---
 .../_memory/_device_memory_resource.pyx       | 13 +++
 .../cuda/core/experimental/_memory/_gmr.pyx   | 12 +++
 cuda_core/tests/helpers/buffers.py            | 25 +++++-
 cuda_core/tests/test_graph_mem.py             | 82 ++++++++++++++++---
 4 files changed, 120 insertions(+), 12 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
index d7c120cecf..4e66f083fd 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
@@ -467,10 +467,21 @@ cdef void DMR_init_create(
         self._ipc_data = IPCData(alloc_handle, mapped=False)
 
 
+# Raise an exception if the given stream is capturing.
+# A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error.
+cdef void check_not_capturing(cydriver.CUstream s) nogil:
+    cdef cydriver.CUstreamCaptureStatus capturing
+    HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing))
+    if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_NONE:
+        raise RuntimeError("DeviceMemoryResource cannot perform memory operations on "
+                           "a capturing stream (consider using GraphMemoryResource).")
+
+
 cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
     cdef cydriver.CUstream s = stream._handle
     cdef cydriver.CUdeviceptr devptr
     with nogil:
+        check_not_capturing(s)
         HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s))
     cdef Buffer buf = Buffer.__new__(Buffer)
     buf._ptr = <uintptr_t>(devptr)
@@ -486,7 +497,9 @@ cdef void DMR_deallocate(
 ) noexcept:
     cdef cydriver.CUstream s = stream._handle
     cdef cydriver.CUdeviceptr devptr = <cydriver.CUdeviceptr>ptr
+    cdef cydriver.CUstreamCaptureStatus capturing
     with nogil:
+        check_not_capturing(s)
         HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s))
 
 
diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
index 23f09cc4c8..0bf9ea45f8 100644
--- a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
@@ -163,10 +163,21 @@ class GraphMemoryResource(cyGraphMemoryResource):
         return cyGraphMemoryResource.__new__(cls, device_id)
 
 
+# Raise an exception if the given stream is capturing.
+# A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error.
+cdef void check_capturing(cydriver.CUstream s) nogil:
+    cdef cydriver.CUstreamCaptureStatus capturing
+    HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing))
+    if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE:
+        raise RuntimeError("GraphMemoryResource cannot perform memory operations on "
+                           "a non-capturing stream.")
+
+
 cdef Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream):
     cdef cydriver.CUstream s = stream._handle
     cdef cydriver.CUdeviceptr devptr
     with nogil:
+        check_capturing(s)
         HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s))
     cdef Buffer buf = Buffer.__new__(Buffer)
     buf._ptr = <intptr_t>(devptr)
@@ -181,5 +192,6 @@ cdef void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept:
     cdef cydriver.CUstream s = stream._handle
     cdef cydriver.CUdeviceptr devptr = <cydriver.CUdeviceptr>ptr
     with nogil:
+        check_capturing(s)
         HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s))
 
diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py
index 972e83e131..e6fc0633ac 100644
--- a/cuda_core/tests/helpers/buffers.py
+++ b/cuda_core/tests/helpers/buffers.py
@@ -3,12 +3,18 @@
 
 import ctypes
 
-from cuda.core.experimental import Buffer, MemoryResource
+from cuda.core.experimental import Buffer, Device, MemoryResource
 from cuda.core.experimental._utils.cuda_utils import driver, handle_return
 
 from . import libc
 
-__all__ = ["DummyUnifiedMemoryResource", "PatternGen", "make_scratch_buffer", "compare_equal_buffers"]
+__all__ = [
+    "compare_buffer_to_constant",
+    "compare_equal_buffers",
+    "DummyUnifiedMemoryResource",
+    "make_scratch_buffer",
+    "PatternGen",
+]
 
 
 class DummyUnifiedMemoryResource(MemoryResource):
@@ -102,6 +108,7 @@ def _get_pattern_buffer(self, seed, value):
 
 def make_scratch_buffer(device, value, nbytes):
     """Create a unified memory buffer with the specified value."""
+    assert 0 <= int(value) < 256
     buffer = DummyUnifiedMemoryResource(device).allocate(nbytes)
     ptr = ctypes.cast(int(buffer.handle), ctypes.POINTER(ctypes.c_byte))
     ctypes.memset(ptr, value & 0xFF, nbytes)
@@ -115,3 +122,17 @@ def compare_equal_buffers(buffer1, buffer2):
     ptr1 = ctypes.cast(int(buffer1.handle), ctypes.POINTER(ctypes.c_byte))
     ptr2 = ctypes.cast(int(buffer2.handle), ctypes.POINTER(ctypes.c_byte))
     return libc.memcmp(ptr1, ptr2, buffer1.size) == 0
+
+
+def compare_buffer_to_constant(buffer, value):
+    device_id = buffer.memory_resource.device_id
+    device = Device(device_id)
+    stream = device.create_stream()
+    expected = make_scratch_buffer(device, value, buffer.size)
+    tmp = make_scratch_buffer(device, 0, buffer.size)
+    tmp.copy_from(buffer, stream=stream)
+    stream.sync()
+    result = compare_equal_buffers(expected, tmp)
+    expected.close()
+    tmp.close()
+    return result
diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py
index 780fb23dd7..735e8490b8 100644
--- a/cuda_core/tests/test_graph_mem.py
+++ b/cuda_core/tests/test_graph_mem.py
@@ -14,7 +14,7 @@
     launch,
 )
 from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return
-from helpers.buffers import compare_equal_buffers, make_scratch_buffer
+from helpers.buffers import compare_buffer_to_constant
 
 def _common_kernels_alloc():
     code = """
@@ -40,8 +40,7 @@ def _common_kernels_alloc():
     return mod
 
 
-
-@pytest.mark.parametrize("repeat", [0,1,2])
+@pytest.mark.parametrize("repeat", range(3))
 @pytest.mark.parametrize("use_graph", [False, True])
 def test_graph_alloc(init_cuda, use_graph, repeat):
     """Test graph capture with memory allocated by GraphMemoryResource."""
@@ -80,6 +79,8 @@ def apply_kernels(mr, stream, out):
 
         # Begin graph capture.
         gb = Device().create_graph_builder().begin_building(mode="thread_local")
+        # import code
+        # code.interact(local=dict(globals(), **locals()))
 
         # Capture work.
         apply_kernels(mr=gmr, stream=gb.stream, out=out)
@@ -98,13 +99,7 @@ def apply_kernels(mr, stream, out):
     # ====== End work sequence ======
 
     # Check the result on the host.
-    host_ans = make_scratch_buffer(device, 2, NBYTES)
-    host_tmp = make_scratch_buffer(device, 0, NBYTES)
-    host_tmp.copy_from(out, stream=stream)
-    stream.sync()
-    assert compare_equal_buffers(host_ans, host_tmp)
-    host_ans.close()
-    host_tmp.close()
+    assert compare_buffer_to_constant(out, 2)
 
     # # Check memory usage.
     # if use_graph:
@@ -118,3 +113,70 @@ def apply_kernels(mr, stream, out):
     #     out.close()
     #     assert dmr.attributes.used_mem_current == 0
 
+
+@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+def test_gmr_check_capture(init_cuda, mode):
+   """
+   Test expected errors (and non-errors) using GraphMemoryResource with graph
+   capture.
+   """
+   device = Device()
+   stream = device.create_stream()
+   gmr = GraphMemoryResource(device)
+
+   # Not capturing
+   with pytest.raises(RuntimeError,
+       match=r"GraphMemoryResource cannot perform memory operations on a "
+             r"non-capturing stream\."
+   ):
+       gmr.allocate(1, stream=stream)
+
+   # Capturing
+   gb = device.create_graph_builder().begin_building(mode=mode)
+   gmr.allocate(1, stream=gb.stream).close()  # no error
+   gb.end_building().complete()
+
+
+@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+def test_mr_check_capture(init_cuda, mode):
+   """
+   Test expected errors (and non-errors) using DeviceMemoryResource with graph
+   capture.
+   """
+   device = Device()
+   stream = device.create_stream()
+   dmr = DeviceMemoryResource(device)
+
+   # Not capturing
+   dmr.allocate(1, stream=stream).close()  # no error
+
+   # Capturing
+   gb = device.create_graph_builder().begin_building(mode=mode)
+   with pytest.raises(RuntimeError,
+       match=r"DeviceMemoryResource cannot perform memory operations on a capturing "
+             r"stream \(consider using GraphMemoryResource\)\."
+   ):
+       dmr.allocate(1, stream=gb.stream)
+   gb.end_building().complete()
+
+
+# This tests causes unraisable errors at shutdown.
+# @pytest.mark.parametrize("mode", ["global", "thread_local"])
+# def test_cross_stream_capture_error(init_cuda, mode):
+#    """
+#    Test errors related to unsafe API calls in global or thread_local capture
+#    mode.
+#    """
+#    # When graph capturing is turned on for an unrelated stream, the driver
+#    # raises an error. Not sure how to detect this.
+#    from cuda.core.experimental._utils.cuda_utils import CUDAError  # FIXME
+#    device = Device()
+#    stream = device.create_stream()
+#    dmr = DeviceMemoryResource(device)
+#
+#    with pytest.raises(RuntimeError, match="Build process encountered an error"):
+#        gb = device.create_graph_builder().begin_building(mode)
+#        with pytest.raises(CUDAError, match="CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED"):
+#            dmr.allocate(1, stream=stream)  # not targeting gb.stream
+#        gb.end_building().complete()
+

From 1b8409b9de8322839fd2f718c455e41a0bf87fd3 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Fri, 7 Nov 2025 12:32:41 -0800
Subject: [PATCH 07/15] Add tests for attributes and memory allocation escaping
 graphs.

---
 .../core/experimental/_memory/_buffer.pyx     |   2 +-
 .../_memory/_device_memory_resource.pyx       |  10 +-
 .../cuda/core/experimental/_memory/_gmr.pyx   |  24 +-
 .../core/experimental/_utils/cuda_utils.pxd   |   2 +-
 cuda_core/tests/helpers/buffers.py            |  10 +-
 cuda_core/tests/test_graph_mem.py             | 216 ++++++++++++------
 6 files changed, 175 insertions(+), 89 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
index 2251272742..ef58998b52 100644
--- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
@@ -274,7 +274,7 @@ cdef class Buffer:
 
 # Buffer Implementation
 # ---------------------
-cdef Buffer_close(Buffer self, stream):
+cdef inline void Buffer_close(Buffer self, stream):
     cdef Stream s
     if self._ptr and self._memory_resource is not None:
         if stream is None:
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
index 4e66f083fd..ab5dc1e600 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
@@ -469,7 +469,7 @@ cdef void DMR_init_create(
 
 # Raise an exception if the given stream is capturing.
 # A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error.
-cdef void check_not_capturing(cydriver.CUstream s) nogil:
+cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil:
     cdef cydriver.CUstreamCaptureStatus capturing
     HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing))
     if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_NONE:
@@ -477,7 +477,7 @@ cdef void check_not_capturing(cydriver.CUstream s) nogil:
                            "a capturing stream (consider using GraphMemoryResource).")
 
 
-cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
+cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
     cdef cydriver.CUstream s = stream._handle
     cdef cydriver.CUdeviceptr devptr
     with nogil:
@@ -492,18 +492,17 @@ cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
     return buf
 
 
-cdef void DMR_deallocate(
+cdef inline void DMR_deallocate(
     DeviceMemoryResource self, uintptr_t ptr, size_t size, Stream stream
 ) noexcept:
     cdef cydriver.CUstream s = stream._handle
     cdef cydriver.CUdeviceptr devptr = <cydriver.CUdeviceptr>ptr
     cdef cydriver.CUstreamCaptureStatus capturing
     with nogil:
-        check_not_capturing(s)
         HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s))
 
 
-cdef DMR_close(DeviceMemoryResource self):
+cdef inline DMR_close(DeviceMemoryResource self):
     if self._handle == NULL:
         return
 
@@ -517,3 +516,4 @@ cdef DMR_close(DeviceMemoryResource self):
         self._attributes = None
         self._mempool_owned = False
         self._ipc_data = None
+
diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
index 0bf9ea45f8..7c717c051e 100644
--- a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
@@ -4,7 +4,7 @@
 
 from __future__ import annotations
 
-from libc.stdint cimport uintptr_t, intptr_t
+from libc.stdint cimport uintptr_t, intptr_t, uint64_t
 
 from cuda.bindings cimport cydriver
 from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource
@@ -71,8 +71,10 @@ cdef GMRA_mem_attribute(property_type: type, settable : bool = False):
             return property_type(value)
 
         if _settable:
-            def fset(GraphMemoryResourceAttributes self, value: int):
-                GMRA_setattribute(self._dev_id, <cydriver.CUgraphMem_attribute><uintptr_t> attr_enum, value)
+            def fset(GraphMemoryResourceAttributes self, uint64_t value):
+                if value != 0:
+                    raise AttributeError(f"Attribute {stub.__name__!r} may only be set to zero (got {value}).")
+                GMRA_setattribute(self._dev_id, <cydriver.CUgraphMem_attribute><uintptr_t> attr_enum)
         else:
             fset = None
 
@@ -80,16 +82,17 @@ cdef GMRA_mem_attribute(property_type: type, settable : bool = False):
     return decorator
 
 
-cdef int GMRA_getattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum):
-    cdef int value
+cdef inline uint64_t GMRA_getattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum):
+    cdef uint64_t value
     with nogil:
         HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(device_id, attr_enum, <void *> &value))
     return value
 
 
-cdef int GMRA_setattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum, int value):
+cdef inline void GMRA_setattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum):
+    cdef uint64_t zero = 0
     with nogil:
-        HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(device_id, attr_enum, <void *> &value))
+        HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(device_id, attr_enum, <void *> &zero))
 
 
 cdef class cyGraphMemoryResource(MemoryResource):
@@ -165,7 +168,7 @@ class GraphMemoryResource(cyGraphMemoryResource):
 
 # Raise an exception if the given stream is capturing.
 # A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error.
-cdef void check_capturing(cydriver.CUstream s) nogil:
+cdef inline int check_capturing(cydriver.CUstream s) except?-1 nogil:
     cdef cydriver.CUstreamCaptureStatus capturing
     HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing))
     if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE:
@@ -173,7 +176,7 @@ cdef void check_capturing(cydriver.CUstream s) nogil:
                            "a non-capturing stream.")
 
 
-cdef Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream):
+cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream):
     cdef cydriver.CUstream s = stream._handle
     cdef cydriver.CUdeviceptr devptr
     with nogil:
@@ -188,10 +191,9 @@ cdef Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream)
     return buf
 
 
-cdef void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept:
+cdef inline void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept:
     cdef cydriver.CUstream s = stream._handle
     cdef cydriver.CUdeviceptr devptr = <cydriver.CUdeviceptr>ptr
     with nogil:
-        check_capturing(s)
         HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s))
 
diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd
index ad6da14dae..0e75202498 100644
--- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd
+++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd
@@ -14,7 +14,7 @@ ctypedef fused supported_error_type:
 
 
 # mimic CU_DEVICE_INVALID
-cdef cydriver.CUcontext CU_CONTEXT_INVALID = <cydriver.CUcontext>(-2)
+cdef const cydriver.CUcontext CU_CONTEXT_INVALID = <cydriver.CUcontext>(-2)
 
 
 cdef cydriver.CUdevice get_device_from_ctx(
diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py
index e6fc0633ac..b4d769eab3 100644
--- a/cuda_core/tests/helpers/buffers.py
+++ b/cuda_core/tests/helpers/buffers.py
@@ -108,13 +108,17 @@ def _get_pattern_buffer(self, seed, value):
 
 def make_scratch_buffer(device, value, nbytes):
     """Create a unified memory buffer with the specified value."""
-    assert 0 <= int(value) < 256
     buffer = DummyUnifiedMemoryResource(device).allocate(nbytes)
-    ptr = ctypes.cast(int(buffer.handle), ctypes.POINTER(ctypes.c_byte))
-    ctypes.memset(ptr, value & 0xFF, nbytes)
+    set_buffer(buffer, value)
     return buffer
 
 
+def set_buffer(buffer, value):
+    assert 0 <= int(value) < 256
+    ptr = ctypes.cast(int(buffer.handle), ctypes.POINTER(ctypes.c_byte))
+    ctypes.memset(ptr, value & 0xFF, buffer.size)
+
+
 def compare_equal_buffers(buffer1, buffer2):
     """Compare the contents of two host-accessible buffers for bitwise equality."""
     if buffer1.size != buffer2.size:
diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py
index 735e8490b8..b5abcd28d8 100644
--- a/cuda_core/tests/test_graph_mem.py
+++ b/cuda_core/tests/test_graph_mem.py
@@ -7,14 +7,15 @@
 from cuda.core.experimental import (
     Device,
     DeviceMemoryResource,
+    GraphCompleteOptions,
     GraphMemoryResource,
+    launch,
     LaunchConfig,
     Program,
     ProgramOptions,
-    launch,
 )
 from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return
-from helpers.buffers import compare_buffer_to_constant
+from helpers.buffers import compare_buffer_to_constant, make_scratch_buffer, set_buffer
 
 def _common_kernels_alloc():
     code = """
@@ -40,16 +41,47 @@ def _common_kernels_alloc():
     return mod
 
 
-@pytest.mark.parametrize("repeat", range(3))
-@pytest.mark.parametrize("use_graph", [False, True])
-def test_graph_alloc(init_cuda, use_graph, repeat):
-    """Test graph capture with memory allocated by GraphMemoryResource."""
+class GraphMemoryTestManager:
+    """
+    Manages changes to the state of the graph memory system, for testing.
+    """
+    def __init__(self, gmr, stream, mode=None):
+        self.device = Device(gmr.device_id)
+        self.gmr = gmr
+        self.stream = stream
+        self.mode = "relaxed" if mode is None else mode
+
+    def reset(self):
+        """Trim unused graph memory and reset usage statistics."""
+        self.gmr.trim()
+        self.gmr.attributes.reserved_mem_high = 0
+        self.gmr.attributes.used_mem_high = 0
+
+    def alloc(self, num, nbytes):
+        """Allocate num buffers of size nbytes from graph memory."""
+        gb = self.device.create_graph_builder().begin_building(self.mode)
+        buffers = [self.gmr.allocate(nbytes, stream=gb.stream) for _ in range(num)]
+        graph = gb.end_building().complete()
+        graph.upload(self.stream)
+        graph.launch(self.stream)
+        self.stream.sync()
+        return buffers
+
+    def free(self, buffers):
+        """Free graph memory buffers."""
+        for buffer in buffers:
+            buffer.close(stream=self.stream)
+        self.stream.sync()
+
+
+@pytest.mark.parametrize("mode", ["no_graph", "global", "thread_local", "relaxed"])
+def test_graph_alloc(init_cuda, mode):
+    """Test basic graph capture with memory allocated and deallocated by GraphMemoryResource."""
     NBYTES = 64
     device = Device()
     stream = device.create_stream()
     dmr = DeviceMemoryResource(device)
     gmr = GraphMemoryResource(device)
-
     out = dmr.allocate(NBYTES, stream=stream)
 
     # Get kernels and define the calling sequence.
@@ -65,57 +97,126 @@ def apply_kernels(mr, stream, out):
         out.copy_from(buffer, stream=stream)
         buffer.close()
 
-    # ====== Begin work sequence ======
-    if use_graph:
-        # Trim memory to zero and reset high watermarks.
-        gmr.trim()
-        # gmr.attributes.reserved_mem_high = 0  ## not working
-        # gmr.attributes.used_mem_high = 0
-
-        assert gmr.attributes.reserved_mem_current == 0
-        # assert gmr.attributes.reserved_mem_high == 0
-        assert gmr.attributes.used_mem_current == 0
-        # assert gmr.attributes.used_mem_high == 0
-
-        # Begin graph capture.
-        gb = Device().create_graph_builder().begin_building(mode="thread_local")
-        # import code
-        # code.interact(local=dict(globals(), **locals()))
-
-        # Capture work.
+    # Apply kernels, with or without graph capture.
+    if mode == "no_graph":
+        # Do work without graph capture.
+        apply_kernels(mr=dmr, stream=stream, out=out)
+    else:
+        # Capture work, then upload and launch.
+        gb = device.create_graph_builder().begin_building(mode)
         apply_kernels(mr=gmr, stream=gb.stream, out=out)
-
-        # Finalize the graph.
         graph = gb.end_building().complete()
-
-        # Upload and launch
         graph.upload(stream)
         graph.launch(stream)
-    else:
-        # Do work without graph capture.
-        apply_kernels(mr=dmr, stream=stream, out=out)
 
     stream.sync()
-    # ====== End work sequence ======
 
     # Check the result on the host.
     assert compare_buffer_to_constant(out, 2)
 
-    # # Check memory usage.
-    # if use_graph:
-    #     assert dmr.attributes.used_mem_current == NBYTES
-    #     assert gmr.attributes.used_mem_current > 0
-    #     out.close()
-    #     assert gmr.attributes.used_mem_current == 0
-    # else:
-    #     assert dmr.attributes.used_mem_current == NBYTES
-    #     assert gmr.attributes.used_mem_current == 0
-    #     out.close()
-    #     assert dmr.attributes.used_mem_current == 0
+
+@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+def test_graph_alloc_with_output(init_cuda, mode):
+    """Test for memory allocated in a graph being used outside the graph."""
+    NBYTES = 64
+    device = Device()
+    stream = device.create_stream()
+    gmr = GraphMemoryResource(device)
+
+    # Get kernels and define the calling sequence.
+    mod = _common_kernels_alloc()
+    set_zero = mod.get_kernel("set_zero")
+    add_one = mod.get_kernel("add_one")
+
+    # Make an input of 0s.
+    in_ = make_scratch_buffer(device, 0, NBYTES)
+
+    # Construct a graph to copy and increment the input. It returns a new
+    # buffer allocated within the graph.  The auto_free_on_launch option
+    # is required to properly use the output buffer.
+    gb = device.create_graph_builder().begin_building(mode)
+    out = gmr.allocate(NBYTES, gb.stream)
+    out.copy_from(in_, stream=gb.stream)
+    launch(gb, LaunchConfig(grid=1, block=1), add_one, out, NBYTES)
+    options = GraphCompleteOptions(auto_free_on_launch=True)
+    graph = gb.end_building().complete(options)
+
+    # Launch the graph. The output buffer is allocated and set to one.
+    graph.upload(stream)
+    graph.launch(stream)
+    stream.sync()
+    assert compare_buffer_to_constant(out, 1)
+
+    # Update the input buffer and rerun the graph.
+    set_buffer(in_, 5)
+    graph.upload(stream)
+    graph.launch(stream)
+    stream.sync()
+    assert compare_buffer_to_constant(out, 6)
 
 
 @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
-def test_gmr_check_capture(init_cuda, mode):
+def test_graph_mem_set_attributes(init_cuda, mode):
+    device = Device()
+    stream = device.create_stream()
+    gmr = GraphMemoryResource(device)
+    mman = GraphMemoryTestManager(gmr, stream, mode)
+
+    # Make an allocation and obvserve usage.
+    buffer = mman.alloc(1, 1024)
+    assert gmr.attributes.reserved_mem_current > 0
+    assert gmr.attributes.reserved_mem_high > 0
+    assert gmr.attributes.used_mem_current > 0
+    assert gmr.attributes.used_mem_high > 0
+
+    # Incorrect attribute usage.
+    with pytest.raises(AttributeError,
+        match=r"property 'reserved_mem_current' .* has no setter"
+    ):
+        gmr.attributes.reserved_mem_current = 0
+
+    with pytest.raises(AttributeError,
+        match=r"Attribute 'reserved_mem_high' may only be set to zero \(got 1\)\."
+    ):
+        gmr.attributes.reserved_mem_high = 1
+
+    with pytest.raises(AttributeError,
+        match=r"property 'used_mem_current' .* has no setter"
+    ):
+        gmr.attributes.used_mem_current = 0
+
+    with pytest.raises(AttributeError,
+        match=r"Attribute 'used_mem_high' may only be set to zero \(got 1\)\."
+    ):
+        gmr.attributes.used_mem_high = 1
+
+    # Free memory, but usage is not reduced yet.
+    mman.free(buffer)
+    assert gmr.attributes.reserved_mem_current > 0
+    assert gmr.attributes.reserved_mem_high > 0
+    assert gmr.attributes.used_mem_current > 0
+    assert gmr.attributes.used_mem_high > 0
+
+    gmr.trim()
+
+    # The high-water marks remain after free and trim.
+    assert gmr.attributes.reserved_mem_current == 0
+    assert gmr.attributes.reserved_mem_high > 0
+    assert gmr.attributes.used_mem_current == 0
+    assert gmr.attributes.used_mem_high > 0
+
+    # Reset the high-water marks.
+    gmr.attributes.reserved_mem_high = 0
+    gmr.attributes.used_mem_high = 0
+
+    assert gmr.attributes.reserved_mem_high == 0
+    assert gmr.attributes.used_mem_high == 0
+
+    mman.reset()
+
+
+@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+def test_gmr_check_capture_state(init_cuda, mode):
    """
    Test expected errors (and non-errors) using GraphMemoryResource with graph
    capture.
@@ -133,12 +234,12 @@ def test_gmr_check_capture(init_cuda, mode):
 
    # Capturing
    gb = device.create_graph_builder().begin_building(mode=mode)
-   gmr.allocate(1, stream=gb.stream).close()  # no error
+   buffer = gmr.allocate(1, stream=gb.stream).close()  # no error
    gb.end_building().complete()
 
 
 @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
-def test_mr_check_capture(init_cuda, mode):
+def test_dmr_check_capture_state(init_cuda, mode):
    """
    Test expected errors (and non-errors) using DeviceMemoryResource with graph
    capture.
@@ -159,24 +260,3 @@ def test_mr_check_capture(init_cuda, mode):
        dmr.allocate(1, stream=gb.stream)
    gb.end_building().complete()
 
-
-# This tests causes unraisable errors at shutdown.
-# @pytest.mark.parametrize("mode", ["global", "thread_local"])
-# def test_cross_stream_capture_error(init_cuda, mode):
-#    """
-#    Test errors related to unsafe API calls in global or thread_local capture
-#    mode.
-#    """
-#    # When graph capturing is turned on for an unrelated stream, the driver
-#    # raises an error. Not sure how to detect this.
-#    from cuda.core.experimental._utils.cuda_utils import CUDAError  # FIXME
-#    device = Device()
-#    stream = device.create_stream()
-#    dmr = DeviceMemoryResource(device)
-#
-#    with pytest.raises(RuntimeError, match="Build process encountered an error"):
-#        gb = device.create_graph_builder().begin_building(mode)
-#        with pytest.raises(CUDAError, match="CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED"):
-#            dmr.allocate(1, stream=stream)  # not targeting gb.stream
-#        gb.end_building().complete()
-

From e9422b23f60a155da5c520303c4fb6b49304d0d9 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 10 Nov 2025 11:32:25 -0800
Subject: [PATCH 08/15] Simplify logic for converting IsStreamT arguments.

---
 .../cuda/core/experimental/_launcher.pyx      | 20 ++----
 cuda_core/cuda/core/experimental/_stream.pxd  |  3 -
 cuda_core/cuda/core/experimental/_stream.pyx  | 70 +++++++++++--------
 3 files changed, 43 insertions(+), 50 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/experimental/_launcher.pyx
index a06d885ff8..516fc87a27 100644
--- a/cuda_core/cuda/core/experimental/_launcher.pyx
+++ b/cuda_core/cuda/core/experimental/_launcher.pyx
@@ -4,8 +4,6 @@
 
 from libc.stdint cimport uintptr_t
 
-from cuda.core.experimental._stream cimport _try_to_get_stream_ptr
-
 from typing import Union
 
 from cuda.core.experimental._kernel_arg_handler import ParamHolder
@@ -58,17 +56,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne
         launching kernel.
 
     """
-    if stream is None:
-        raise ValueError("stream cannot be None, stream must either be a Stream object or support __cuda_stream__")
-    try:
-        stream_handle = stream.handle
-    except AttributeError:
-        try:
-            stream_handle = driver.CUstream(<uintptr_t>(_try_to_get_stream_ptr(stream)))
-        except Exception:
-            raise ValueError(
-                f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})"
-            ) from None
+    stream = Stream._init(stream)
     assert_type(kernel, Kernel)
     _lazy_init()
     config = check_or_create_options(LaunchConfig, config, "launch config")
@@ -85,7 +73,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne
     # rich.
     if _use_ex:
         drv_cfg = _to_native_launch_config(config)
-        drv_cfg.hStream = stream_handle
+        drv_cfg.hStream = stream.handle
         if config.cooperative_launch:
             _check_cooperative_launch(kernel, config, stream)
         handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
@@ -93,12 +81,12 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne
         # TODO: check if config has any unsupported attrs
         handle_return(
             driver.cuLaunchKernel(
-                int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream_handle, args_ptr, 0
+                int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
             )
         )
 
 
-def _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
+cdef _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
     dev = stream.device
     num_sm = dev.properties.multiprocessor_count
     max_grid_size = (
diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd
index 8f382e5d01..68a410d1ed 100644
--- a/cuda_core/cuda/core/experimental/_stream.pxd
+++ b/cuda_core/cuda/core/experimental/_stream.pxd
@@ -5,9 +5,6 @@
 from cuda.bindings cimport cydriver
 
 
-cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except*
-
-
 cdef class Stream:
 
     cdef:
diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx
index cdc4651bda..b177ea217d 100644
--- a/cuda_core/cuda/core/experimental/_stream.pyx
+++ b/cuda_core/cuda/core/experimental/_stream.pyx
@@ -62,36 +62,6 @@ class IsStreamT(Protocol):
         ...
 
 
-cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except*:
-    try:
-        cuda_stream_attr = obj.__cuda_stream__
-    except AttributeError:
-        raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") from None
-
-    if callable(cuda_stream_attr):
-        info = cuda_stream_attr()
-    else:
-        info = cuda_stream_attr
-        warnings.simplefilter("once", DeprecationWarning)
-        warnings.warn(
-            "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method",
-            stacklevel=3,
-            category=DeprecationWarning,
-        )
-
-    try:
-        len_info = len(info)
-    except TypeError as e:
-        raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}") from e
-    if len_info != 2:
-        raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements")
-    if info[0] != 0:
-        raise RuntimeError(
-            f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}"
-        )
-    return <cydriver.CUstream><uintptr_t>(info[1])
-
-
 cdef class Stream:
     """Represent a queue of GPU operations that are executed in a specific order.
 
@@ -139,12 +109,15 @@ cdef class Stream:
 
     @classmethod
     def _init(cls, obj: Optional[IsStreamT] = None, options=None, device_id: int = None):
+        if isinstance(obj, Stream):
+            return obj
+
         cdef Stream self = Stream.__new__(cls)
 
         if obj is not None and options is not None:
             raise ValueError("obj and options cannot be both specified")
         if obj is not None:
-            self._handle = _try_to_get_stream_ptr(obj)
+            self._handle = _handle_from_stream_t(obj)
             # TODO: check if obj is created under the current context/device
             self._owner = obj
             return self
@@ -445,3 +418,38 @@ cpdef Stream default_stream():
         return C_PER_THREAD_DEFAULT_STREAM
     else:
         return C_LEGACY_DEFAULT_STREAM
+
+
+cdef cydriver.CUstream _handle_from_stream_t(obj) except*:
+    if isinstance(obj, Stream):
+        return <cydriver.CUstream><uintptr_t>(obj.handle)
+
+    try:
+        cuda_stream_attr = obj.__cuda_stream__
+    except AttributeError:
+        raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") from None
+
+    if callable(cuda_stream_attr):
+        info = cuda_stream_attr()
+    else:
+        info = cuda_stream_attr
+        warnings.simplefilter("once", DeprecationWarning)
+        warnings.warn(
+            "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method",
+            stacklevel=3,
+            category=DeprecationWarning,
+        )
+
+    try:
+        len_info = len(info)
+    except TypeError as e:
+        raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}") from e
+    if len_info != 2:
+        raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements")
+    if info[0] != 0:
+        raise RuntimeError(
+            f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}"
+        )
+    return <cydriver.CUstream><uintptr_t>(info[1])
+
+

From 3e21b9b4dc9a74a1bc1883665f9bcb80a89aa098 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 10 Nov 2025 12:10:41 -0800
Subject: [PATCH 09/15] Standardize Stream arguments to IsStreamT. Update
 Buffer and MemoryResource methods to take any kind of stream-providing
 object. Update graph allocation tests.

---
 cuda_core/cuda/core/experimental/_device.pyx  |  8 +++---
 .../cuda/core/experimental/_launcher.pyx      |  4 +--
 .../core/experimental/_memory/_buffer.pyx     | 25 +++++++++----------
 .../_memory/_device_memory_resource.pyx       | 16 ++++++------
 .../cuda/core/experimental/_memory/_gmr.pyx   | 21 +++++++++++-----
 cuda_core/tests/test_graph_mem.py             | 12 ++++-----
 6 files changed, 47 insertions(+), 39 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx
index bc6167793a..833c74c0a8 100644
--- a/cuda_core/cuda/core/experimental/_device.pyx
+++ b/cuda_core/cuda/core/experimental/_device.pyx
@@ -9,7 +9,7 @@ from cuda.bindings cimport cydriver
 from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 
 import threading
-from typing import Union, TYPE_CHECKING
+from typing import Optional, TYPE_CHECKING, Union
 
 from cuda.core.experimental._context import Context, ContextOptions
 from cuda.core.experimental._event import Event, EventOptions
@@ -1242,7 +1242,7 @@ class Device:
         """
         raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189")
 
-    def create_stream(self, obj: IsStreamT | None = None, options: StreamOptions | None = None) -> Stream:
+    def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions | None = None) -> Stream:
         """Create a Stream object.
 
         New stream objects can be created in two different ways:
@@ -1295,7 +1295,7 @@ class Device:
         ctx = self._get_current_context()
         return Event._init(self._id, ctx, options, True)
 
-    def allocate(self, size, stream: Stream | None = None) -> Buffer:
+    def allocate(self, size, stream: Optional[IsStreamT] = None) -> Buffer:
         """Allocate device memory from a specified stream.
 
         Allocates device memory of `size` bytes on the specified `stream`
@@ -1311,7 +1311,7 @@ class Device:
         ----------
         size : int
             Number of bytes to allocate.
-        stream : :obj:`~_stream.Stream`, optional
+        stream : :obj:`~_stream.IsStreamT`, optional
             The stream establishing the stream ordering semantic.
             Default value of `None` uses default stream.
 
diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/experimental/_launcher.pyx
index 516fc87a27..939c0fa803 100644
--- a/cuda_core/cuda/core/experimental/_launcher.pyx
+++ b/cuda_core/cuda/core/experimental/_launcher.pyx
@@ -37,13 +37,13 @@ def _lazy_init():
     _inited = True
 
 
-def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kernel, *kernel_args):
+def launch(stream: IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args):
     """Launches a :obj:`~_module.Kernel`
     object with launch-time configuration.
 
     Parameters
     ----------
-    stream : :obj:`~_stream.Stream`
+    stream : :obj:`~_stream.IsStreamT`
         The stream establishing the stream ordering semantic of a
         launch.
     config : :obj:`LaunchConfig`
diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
index ef58998b52..f18d5ec02d 100644
--- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
@@ -15,9 +15,10 @@ from cuda.core.experimental._utils.cuda_utils cimport (
 )
 
 import abc
-from typing import TypeVar, Union
+from typing import Optional, TypeVar, Union
 
 from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
+from cuda.core.experimental._stream import IsStreamT
 from cuda.core.experimental._utils.cuda_utils import driver
 
 __all__ = ['Buffer', 'MemoryResource']
@@ -116,7 +117,7 @@ cdef class Buffer:
         """
         Buffer_close(self, stream)
 
-    def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer:
+    def copy_to(self, dst: Buffer = None, *, stream: IsStreamT) -> Buffer:
         """Copy from this buffer to the dst buffer asynchronously on the given stream.
 
         Copies the data from this buffer to the provided dst buffer.
@@ -127,13 +128,12 @@ cdef class Buffer:
         ----------
         dst : :obj:`~_memory.Buffer`
             Source buffer to copy data from
-        stream : Stream
+        stream : IsStreamT
             Keyword argument specifying the stream for the
             asynchronous copy
 
         """
-        if stream is None:
-            raise ValueError("stream must be provided")
+        stream = Stream._init(stream)
 
         cdef size_t src_size = self._size
 
@@ -152,20 +152,19 @@ cdef class Buffer:
         raise_if_driver_error(err)
         return dst
 
-    def copy_from(self, src: Buffer, *, stream: Stream):
+    def copy_from(self, src: Buffer, *, stream: IsStreamT):
         """Copy from the src buffer to this buffer asynchronously on the given stream.
 
         Parameters
         ----------
         src : :obj:`~_memory.Buffer`
             Source buffer to copy data from
-        stream : Stream
+        stream : IsStreamT
             Keyword argument specifying the stream for the
             asynchronous copy
 
         """
-        if stream is None:
-            raise ValueError("stream must be provided")
+        stream = Stream._init(stream)
 
         cdef size_t dst_size = self._size
         cdef size_t src_size = src._size
@@ -305,14 +304,14 @@ cdef class MemoryResource:
     """
 
     @abc.abstractmethod
-    def allocate(self, size_t size, stream: Stream = None) -> Buffer:
+    def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer:
         """Allocate a buffer of the requested size.
 
         Parameters
         ----------
         size : int
             The size of the buffer to allocate, in bytes.
-        stream : Stream, optional
+        stream : IsStreamT, optional
             The stream on which to perform the allocation asynchronously.
             If None, it is up to each memory resource implementation to decide
             and document the behavior.
@@ -326,7 +325,7 @@ cdef class MemoryResource:
         ...
 
     @abc.abstractmethod
-    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None):
+    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None):
         """Deallocate a buffer previously allocated by this resource.
 
         Parameters
@@ -335,7 +334,7 @@ cdef class MemoryResource:
             The pointer or handle to the buffer to deallocate.
         size : int
             The size of the buffer to deallocate, in bytes.
-        stream : Stream, optional
+        stream : IsStreamT, optional
             The stream on which to perform the deallocation asynchronously.
             If None, it is up to each memory resource implementation to decide
             and document the behavior.
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
index ab5dc1e600..9ce3e5be39 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
@@ -308,14 +308,14 @@ cdef class DeviceMemoryResource(MemoryResource):
             raise RuntimeError("Imported memory resource cannot be exported")
         return self._ipc_data._alloc_handle
 
-    def allocate(self, size_t size, stream: Stream = None) -> Buffer:
+    def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer:
         """Allocate a buffer of the requested size.
 
         Parameters
         ----------
         size : int
             The size of the buffer to allocate, in bytes.
-        stream : Stream, optional
+        stream : IsStreamT, optional
             The stream on which to perform the allocation asynchronously.
             If None, an internal stream is used.
 
@@ -327,11 +327,10 @@ cdef class DeviceMemoryResource(MemoryResource):
         """
         if self.is_mapped:
             raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource")
-        if stream is None:
-            stream = default_stream()
-        return DMR_allocate(self, size, <Stream>stream)
+        stream = Stream._init(stream) if stream is not None else default_stream()
+        return DMR_allocate(self, size, <Stream> stream)
 
-    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None):
+    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None):
         """Deallocate a buffer previously allocated by this resource.
 
         Parameters
@@ -340,12 +339,13 @@ cdef class DeviceMemoryResource(MemoryResource):
             The pointer or handle to the buffer to deallocate.
         size : int
             The size of the buffer to deallocate, in bytes.
-        stream : Stream, optional
+        stream : IsStreamT, optional
             The stream on which to perform the deallocation asynchronously.
             If the buffer is deallocated without an explicit stream, the allocation stream
             is used.
         """
-        DMR_deallocate(self, <uintptr_t>ptr, size, <Stream>stream)
+        stream = Stream._init(stream) if stream is not None else default_stream()
+        DMR_deallocate(self, <uintptr_t>ptr, size, <Stream> stream)
 
     @property
     def attributes(self) -> DeviceMemoryResourceAttributes:
diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
index 7c717c051e..09a561fdde 100644
--- a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx
@@ -8,11 +8,11 @@ from libc.stdint cimport uintptr_t, intptr_t, uint64_t
 
 from cuda.bindings cimport cydriver
 from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource
-from cuda.core.experimental._stream cimport Stream
+from cuda.core.experimental._stream cimport default_stream, Stream
 from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 
 from functools import cache
-from typing import TYPE_CHECKING
+from typing import Optional, TYPE_CHECKING
 
 from cuda.core.experimental._utils.cuda_utils import driver
 
@@ -99,13 +99,22 @@ cdef class cyGraphMemoryResource(MemoryResource):
     def __cinit__(self, int device_id):
         self._dev_id = device_id
 
-    def allocate(self, size_t size, Stream stream = None) -> Buffer:
-        return GMR_allocate(self, size, stream)
+    def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer:
+        """
+        Allocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`.
+        """
+        stream = Stream._init(stream) if stream is not None else default_stream()
+        return GMR_allocate(self, size, <Stream> stream)
 
-    def deallocate(self, ptr: DevicePointerT, size_t size, Stream stream = None):
-        return GMR_deallocate(ptr, size, stream)
+    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None):
+        """
+        Deallocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`.
+        """
+        stream = Stream._init(stream) if stream is not None else default_stream()
+        return GMR_deallocate(ptr, size, <Stream> stream)
 
     def close(self):
+        """No operation (provided for compatibility)."""
         pass
 
     def trim(self):
diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py
index b5abcd28d8..6e4bf907ea 100644
--- a/cuda_core/tests/test_graph_mem.py
+++ b/cuda_core/tests/test_graph_mem.py
@@ -60,7 +60,7 @@ def reset(self):
     def alloc(self, num, nbytes):
         """Allocate num buffers of size nbytes from graph memory."""
         gb = self.device.create_graph_builder().begin_building(self.mode)
-        buffers = [self.gmr.allocate(nbytes, stream=gb.stream) for _ in range(num)]
+        buffers = [self.gmr.allocate(nbytes, stream=gb) for _ in range(num)]
         graph = gb.end_building().complete()
         graph.upload(self.stream)
         graph.launch(self.stream)
@@ -104,7 +104,7 @@ def apply_kernels(mr, stream, out):
     else:
         # Capture work, then upload and launch.
         gb = device.create_graph_builder().begin_building(mode)
-        apply_kernels(mr=gmr, stream=gb.stream, out=out)
+        apply_kernels(mr=gmr, stream=gb, out=out)
         graph = gb.end_building().complete()
         graph.upload(stream)
         graph.launch(stream)
@@ -135,8 +135,8 @@ def test_graph_alloc_with_output(init_cuda, mode):
     # buffer allocated within the graph.  The auto_free_on_launch option
     # is required to properly use the output buffer.
     gb = device.create_graph_builder().begin_building(mode)
-    out = gmr.allocate(NBYTES, gb.stream)
-    out.copy_from(in_, stream=gb.stream)
+    out = gmr.allocate(NBYTES, gb)
+    out.copy_from(in_, stream=gb)
     launch(gb, LaunchConfig(grid=1, block=1), add_one, out, NBYTES)
     options = GraphCompleteOptions(auto_free_on_launch=True)
     graph = gb.end_building().complete(options)
@@ -234,7 +234,7 @@ def test_gmr_check_capture_state(init_cuda, mode):
 
    # Capturing
    gb = device.create_graph_builder().begin_building(mode=mode)
-   buffer = gmr.allocate(1, stream=gb.stream).close()  # no error
+   buffer = gmr.allocate(1, stream=gb).close()  # no error
    gb.end_building().complete()
 
 
@@ -257,6 +257,6 @@ def test_dmr_check_capture_state(init_cuda, mode):
        match=r"DeviceMemoryResource cannot perform memory operations on a capturing "
              r"stream \(consider using GraphMemoryResource\)\."
    ):
-       dmr.allocate(1, stream=gb.stream)
+       dmr.allocate(1, stream=gb)
    gb.end_building().complete()
 

From 98ccfc769d9984e83b9cbef2e7e3edb44445d151 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 10 Nov 2025 13:10:55 -0800
Subject: [PATCH 10/15] Add tests for IsStreamT conversions.

---
 cuda_core/tests/helpers/misc.py  | 26 ++++++++++++++++++++++++++
 cuda_core/tests/test_launcher.py | 21 ++++++++++++++++-----
 cuda_core/tests/test_memory.py   | 31 +++++++++++++++++++++----------
 cuda_core/tests/test_stream.py   |  6 +++++-
 4 files changed, 68 insertions(+), 16 deletions(-)
 create mode 100644 cuda_core/tests/helpers/misc.py

diff --git a/cuda_core/tests/helpers/misc.py b/cuda_core/tests/helpers/misc.py
new file mode 100644
index 0000000000..6389e2a3a5
--- /dev/null
+++ b/cuda_core/tests/helpers/misc.py
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.core.experimental import Stream
+
+
+class StreamWrapper:
+    """
+    A wrapper around Stream for testing IsStreamT conversions.
+    """
+
+    def __init__(self, stream: Stream):
+        self._stream = stream
+
+    def __cuda_stream__(self):
+        return self._stream.__cuda_stream__()
+
+    def close(self):
+        self._stream.close()
+
+    @property
+    def handle(self):
+        return self._stream.handle
+
+    def sync(self):
+        return self._stream.sync()
diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index a951fc418a..138ea24aa8 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -24,6 +24,7 @@
 from cuda.core.experimental._utils.cuda_utils import CUDAError
 
 from conftest import skipif_need_cuda_headers
+from helpers.misc import StreamWrapper
 
 
 def test_launch_config_init(init_cuda):
@@ -179,9 +180,10 @@ def test_launch_invalid_values(init_cuda):
     )
 
 
+@pytest.mark.parametrize("wrap_stream", [True, False])
 @pytest.mark.parametrize("python_type, cpp_type, init_value", PARAMS)
 @pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
-def test_launch_scalar_argument(python_type, cpp_type, init_value):
+def test_launch_scalar_argument(python_type, cpp_type, init_value, wrap_stream):
     dev = Device()
     dev.set_current()
 
@@ -219,19 +221,25 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value):
     ker = mod.get_kernel(ker_name)
 
     # Launch with 1 thread
+    stream = dev.default_stream
+    if wrap_stream:
+        stream = StreamWrapper(stream)
     config = LaunchConfig(grid=1, block=1)
-    launch(dev.default_stream, config, ker, arr.ctypes.data, scalar)
-    dev.default_stream.sync()
+    launch(stream, config, ker, arr.ctypes.data, scalar)
+    stream.sync()
 
     # Check result
     assert arr[0] == init_value, f"Expected {init_value}, got {arr[0]}"
 
 
 @skipif_need_cuda_headers  # cg
-def test_cooperative_launch():
+@pytest.mark.parametrize("wrap_stream", [True, False])
+def test_cooperative_launch(wrap_stream):
     dev = Device()
     dev.set_current()
     s = dev.create_stream(options={"nonblocking": True})
+    if wrap_stream:
+        s = StreamWrapper(s)
 
     # CUDA kernel templated on type T
     code = r"""
@@ -272,6 +280,7 @@ def test_cooperative_launch():
 
 
 @pytest.mark.skipif(cp is None, reason="cupy not installed")
+@pytest.mark.parametrize("wrap_stream", [True, False])
 @pytest.mark.parametrize(
     "memory_resource_class",
     [
@@ -285,11 +294,13 @@ def test_cooperative_launch():
         ),
     ],
 )
-def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_resource_class):
+def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_resource_class, wrap_stream):
     """Test that kernels can access memory allocated by memory resources."""
     dev = Device()
     dev.set_current()
     stream = dev.create_stream()
+    if wrap_stream:
+        stream = StreamWrapper(stream)
     # tell CuPy to use our stream as the current stream:
     cp.cuda.ExternalStream(int(stream.handle)).use()
 
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 373cecb9b6..193dc3bd29 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -31,6 +31,7 @@
 from cuda.core.experimental._utils.cuda_utils import handle_return
 from cuda.core.experimental.utils import StridedMemoryView
 from helpers.buffers import DummyUnifiedMemoryResource
+from helpers.misc import StreamWrapper
 
 from cuda_python_test_helpers import supports_ipc_mempool
 
@@ -167,10 +168,12 @@ def test_buffer_initialization():
     buffer_initialization(DummyPinnedMemoryResource(device))
 
 
-def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False):
+def buffer_copy_to(dummy_mr: MemoryResource, device: Device, wrap_stream, check=False):
     src_buffer = dummy_mr.allocate(size=1024)
     dst_buffer = dummy_mr.allocate(size=1024)
     stream = device.create_stream()
+    if wrap_stream:
+        stream = StreamWrapper(stream)
 
     if check:
         src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte))
@@ -190,18 +193,21 @@ def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False):
     src_buffer.close()
 
 
-def test_buffer_copy_to():
+@pytest.mark.parametrize("wrap_stream", [True, False])
+def test_buffer_copy_to(wrap_stream):
     device = Device()
     device.set_current()
-    buffer_copy_to(DummyDeviceMemoryResource(device), device)
-    buffer_copy_to(DummyUnifiedMemoryResource(device), device)
-    buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True)
+    buffer_copy_to(DummyDeviceMemoryResource(device), device, wrap_stream)
+    buffer_copy_to(DummyUnifiedMemoryResource(device), device, wrap_stream)
+    buffer_copy_to(DummyPinnedMemoryResource(device), device, wrap_stream, check=True)
 
 
-def buffer_copy_from(dummy_mr: MemoryResource, device, check=False):
+def buffer_copy_from(dummy_mr: MemoryResource, device, wrap_stream, check=False):
     src_buffer = dummy_mr.allocate(size=1024)
     dst_buffer = dummy_mr.allocate(size=1024)
     stream = device.create_stream()
+    if wrap_stream:
+        stream = StreamWrapper(stream)
 
     if check:
         src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte))
@@ -221,12 +227,13 @@ def buffer_copy_from(dummy_mr: MemoryResource, device, check=False):
     src_buffer.close()
 
 
-def test_buffer_copy_from():
+@pytest.mark.parametrize("wrap_stream", [True, False])
+def test_buffer_copy_from(wrap_stream):
     device = Device()
     device.set_current()
-    buffer_copy_from(DummyDeviceMemoryResource(device), device)
-    buffer_copy_from(DummyUnifiedMemoryResource(device), device)
-    buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True)
+    buffer_copy_from(DummyDeviceMemoryResource(device), device, wrap_stream)
+    buffer_copy_from(DummyUnifiedMemoryResource(device), device, wrap_stream)
+    buffer_copy_from(DummyPinnedMemoryResource(device), device, wrap_stream, check=True)
 
 
 def buffer_close(dummy_mr: MemoryResource):
@@ -526,12 +533,16 @@ def test_device_memory_resource():
     buffer = mr.allocate(1024, stream=stream)
     assert buffer.handle != 0
     buffer.close()
+    buffer = mr.allocate(1024, stream=StreamWrapper(stream))
+    assert buffer.handle != 0
+    buffer.close()
 
     # Test memory copying between buffers from same pool
     src_buffer = mr.allocate(64)
     dst_buffer = mr.allocate(64)
     stream = device.create_stream()
     src_buffer.copy_to(dst_buffer, stream=stream)
+    src_buffer.copy_to(dst_buffer, stream=StreamWrapper(stream))
     device.sync()
     dst_buffer.close()
     src_buffer.close()
diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py
index c8165548f6..2e10fb1003 100644
--- a/cuda_core/tests/test_stream.py
+++ b/cuda_core/tests/test_stream.py
@@ -6,6 +6,7 @@
 from cuda.core.experimental._event import Event
 from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM
 from cuda.core.experimental._utils.cuda_utils import driver
+from helpers.misc import StreamWrapper
 
 
 def test_stream_init_disabled():
@@ -76,9 +77,12 @@ def test_stream_context(init_cuda):
     assert context._handle is not None
 
 
-def test_stream_from_foreign_stream(init_cuda):
+@pytest.mark.parametrize("wrap_stream", [True, False])
+def test_stream_from_foreign_stream(init_cuda, wrap_stream):
     device = Device()
     other_stream = device.create_stream(options=StreamOptions())
+    if wrap_stream:
+        other_stream = StreamWrapper(other_stream)
     stream = device.create_stream(obj=other_stream)
     # Now that __eq__ is implemented (issue #664), we can compare directly
     assert other_stream == stream

From 183f7af26c7dababc6cb2c62faa91b70e508e2a0 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Wed, 12 Nov 2025 10:57:36 -0800
Subject: [PATCH 11/15] Expand files named _gmr.*. Add __eq__ and __hash__
 support to StreamWrapper (testing only)

---
 cuda_core/cuda/core/experimental/_memory/__init__.py        | 2 +-
 .../_memory/{_gmr.pxd => _graph_memory_resource.pxd}        | 2 +-
 .../_memory/{_gmr.pyx => _graph_memory_resource.pyx}        | 0
 cuda_core/tests/helpers/misc.py                             | 6 ++++++
 4 files changed, 8 insertions(+), 2 deletions(-)
 rename cuda_core/cuda/core/experimental/_memory/{_gmr.pxd => _graph_memory_resource.pxd} (81%)
 rename cuda_core/cuda/core/experimental/_memory/{_gmr.pyx => _graph_memory_resource.pyx} (100%)

diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py
index 8dd4385982..20b90d7fdd 100644
--- a/cuda_core/cuda/core/experimental/_memory/__init__.py
+++ b/cuda_core/cuda/core/experimental/_memory/__init__.py
@@ -4,7 +4,7 @@
 
 from ._buffer import *  # noqa: F403
 from ._device_memory_resource import *  # noqa: F403
-from ._gmr import *  # noqa: F403
+from ._graph_memory_resource import *  # noqa: F403
 from ._ipc import *  # noqa: F403
 from ._legacy import *  # noqa: F403
 from ._virtual_memory_resource import *  # noqa: F403
diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pxd b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd
similarity index 81%
rename from cuda_core/cuda/core/experimental/_memory/_gmr.pxd
rename to cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd
index 0feb4a398b..7d20757587 100644
--- a/cuda_core/cuda/core/experimental/_memory/_gmr.pxd
+++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd
@@ -4,7 +4,7 @@
 
 from cuda.bindings cimport cydriver
 from cuda.core.experimental._memory._buffer cimport MemoryResource
-from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource
+from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource
 from cuda.core.experimental._memory._ipc cimport IPCData
 
 
diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
similarity index 100%
rename from cuda_core/cuda/core/experimental/_memory/_gmr.pyx
rename to cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
diff --git a/cuda_core/tests/helpers/misc.py b/cuda_core/tests/helpers/misc.py
index 6389e2a3a5..33508091a7 100644
--- a/cuda_core/tests/helpers/misc.py
+++ b/cuda_core/tests/helpers/misc.py
@@ -24,3 +24,9 @@ def handle(self):
 
     def sync(self):
         return self._stream.sync()
+
+    def __eq__(self, rhs):
+        return self._stream == Stream._init(rhs)
+
+    def __hash__(self):
+        return hash(self._stream)

From 13e3dfbfdfa50dd574905e3c428bb6753e3e1da3 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Wed, 12 Nov 2025 13:20:00 -0800
Subject: [PATCH 12/15] Fix format/lint issues.

---
 .../cuda/core/experimental/_launcher.pyx      |   4 -
 .../_memory/_device_memory_resource.pyx       |   2 -
 .../_memory/_graph_memory_resource.pyx        |   1 -
 cuda_core/cuda/core/experimental/_stream.pyx  |   2 -
 cuda_core/tests/test_graph.py                 |   5 -
 cuda_core/tests/test_graph_mem.py             | 100 ++++++++----------
 cuda_core/tests/test_launcher.py              |   2 +-
 cuda_core/tests/test_memory.py                |   1 +
 8 files changed, 48 insertions(+), 69 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/experimental/_launcher.pyx
index 939c0fa803..b94c3d2b71 100644
--- a/cuda_core/cuda/core/experimental/_launcher.pyx
+++ b/cuda_core/cuda/core/experimental/_launcher.pyx
@@ -2,10 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from libc.stdint cimport uintptr_t
-
-from typing import Union
-
 from cuda.core.experimental._kernel_arg_handler import ParamHolder
 from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config
 from cuda.core.experimental._module import Kernel
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
index 9ce3e5be39..b354d595c6 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
@@ -497,7 +497,6 @@ cdef inline void DMR_deallocate(
 ) noexcept:
     cdef cydriver.CUstream s = stream._handle
     cdef cydriver.CUdeviceptr devptr = <cydriver.CUdeviceptr>ptr
-    cdef cydriver.CUstreamCaptureStatus capturing
     with nogil:
         HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s))
 
@@ -516,4 +515,3 @@ cdef inline DMR_close(DeviceMemoryResource self):
         self._attributes = None
         self._mempool_owned = False
         self._ipc_data = None
-
diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
index 09a561fdde..6ba09481d6 100644
--- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
@@ -205,4 +205,3 @@ cdef inline void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexce
     cdef cydriver.CUdeviceptr devptr = <cydriver.CUdeviceptr>ptr
     with nogil:
         HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s))
-
diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx
index b177ea217d..c9192c5ba4 100644
--- a/cuda_core/cuda/core/experimental/_stream.pyx
+++ b/cuda_core/cuda/core/experimental/_stream.pyx
@@ -451,5 +451,3 @@ cdef cydriver.CUstream _handle_from_stream_t(obj) except*:
             f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}"
         )
     return <cydriver.CUstream><uintptr_t>(info[1])
-
-
diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py
index 39810930fe..cc558b6d22 100644
--- a/cuda_core/tests/test_graph.py
+++ b/cuda_core/tests/test_graph.py
@@ -11,8 +11,6 @@
     from cuda import nvrtc
 from cuda.core.experimental import (
     Device,
-    DeviceMemoryResource,
-    DeviceMemoryResourceOptions,
     GraphBuilder,
     GraphCompleteOptions,
     GraphDebugPrintOptions,
@@ -23,7 +21,6 @@
     launch,
 )
 from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return
-from helpers.buffers import compare_equal_buffers, make_scratch_buffer
 
 
 def _common_kernels():
@@ -750,5 +747,3 @@ def test_graph_build_mode(init_cuda):
 
     with pytest.raises(ValueError, match="^Unsupported build mode:"):
         gb = Device().create_graph_builder().begin_building(mode=None)
-
-
diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py
index 6e4bf907ea..44a5be2617 100644
--- a/cuda_core/tests/test_graph_mem.py
+++ b/cuda_core/tests/test_graph_mem.py
@@ -3,20 +3,19 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 import pytest
-
 from cuda.core.experimental import (
     Device,
     DeviceMemoryResource,
     GraphCompleteOptions,
     GraphMemoryResource,
-    launch,
     LaunchConfig,
     Program,
     ProgramOptions,
+    launch,
 )
-from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return
 from helpers.buffers import compare_buffer_to_constant, make_scratch_buffer, set_buffer
 
+
 def _common_kernels_alloc():
     code = """
     __global__ void set_zero(char *a, size_t nbytes) {
@@ -45,6 +44,7 @@ class GraphMemoryTestManager:
     """
     Manages changes to the state of the graph memory system, for testing.
     """
+
     def __init__(self, gmr, stream, mode=None):
         self.device = Device(gmr.device_id)
         self.gmr = gmr
@@ -125,7 +125,6 @@ def test_graph_alloc_with_output(init_cuda, mode):
 
     # Get kernels and define the calling sequence.
     mod = _common_kernels_alloc()
-    set_zero = mod.get_kernel("set_zero")
     add_one = mod.get_kernel("add_one")
 
     # Make an input of 0s.
@@ -170,24 +169,16 @@ def test_graph_mem_set_attributes(init_cuda, mode):
     assert gmr.attributes.used_mem_high > 0
 
     # Incorrect attribute usage.
-    with pytest.raises(AttributeError,
-        match=r"property 'reserved_mem_current' .* has no setter"
-    ):
+    with pytest.raises(AttributeError, match=r"property 'reserved_mem_current' .* has no setter"):
         gmr.attributes.reserved_mem_current = 0
 
-    with pytest.raises(AttributeError,
-        match=r"Attribute 'reserved_mem_high' may only be set to zero \(got 1\)\."
-    ):
+    with pytest.raises(AttributeError, match=r"Attribute 'reserved_mem_high' may only be set to zero \(got 1\)\."):
         gmr.attributes.reserved_mem_high = 1
 
-    with pytest.raises(AttributeError,
-        match=r"property 'used_mem_current' .* has no setter"
-    ):
+    with pytest.raises(AttributeError, match=r"property 'used_mem_current' .* has no setter"):
         gmr.attributes.used_mem_current = 0
 
-    with pytest.raises(AttributeError,
-        match=r"Attribute 'used_mem_high' may only be set to zero \(got 1\)\."
-    ):
+    with pytest.raises(AttributeError, match=r"Attribute 'used_mem_high' may only be set to zero \(got 1\)\."):
         gmr.attributes.used_mem_high = 1
 
     # Free memory, but usage is not reduced yet.
@@ -217,46 +208,47 @@ def test_graph_mem_set_attributes(init_cuda, mode):
 
 @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
 def test_gmr_check_capture_state(init_cuda, mode):
-   """
-   Test expected errors (and non-errors) using GraphMemoryResource with graph
-   capture.
-   """
-   device = Device()
-   stream = device.create_stream()
-   gmr = GraphMemoryResource(device)
-
-   # Not capturing
-   with pytest.raises(RuntimeError,
-       match=r"GraphMemoryResource cannot perform memory operations on a "
-             r"non-capturing stream\."
-   ):
-       gmr.allocate(1, stream=stream)
-
-   # Capturing
-   gb = device.create_graph_builder().begin_building(mode=mode)
-   buffer = gmr.allocate(1, stream=gb).close()  # no error
-   gb.end_building().complete()
+    """
+    Test expected errors (and non-errors) using GraphMemoryResource with graph
+    capture.
+    """
+    device = Device()
+    stream = device.create_stream()
+    gmr = GraphMemoryResource(device)
+
+    # Not capturing
+    with pytest.raises(
+        RuntimeError,
+        match=r"GraphMemoryResource cannot perform memory operations on a "
+        r"non-capturing stream\.",
+    ):
+        gmr.allocate(1, stream=stream)
+
+    # Capturing
+    gb = device.create_graph_builder().begin_building(mode=mode)
+    gmr.allocate(1, stream=gb)  # no error
+    gb.end_building().complete()
 
 
 @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
 def test_dmr_check_capture_state(init_cuda, mode):
-   """
-   Test expected errors (and non-errors) using DeviceMemoryResource with graph
-   capture.
-   """
-   device = Device()
-   stream = device.create_stream()
-   dmr = DeviceMemoryResource(device)
-
-   # Not capturing
-   dmr.allocate(1, stream=stream).close()  # no error
-
-   # Capturing
-   gb = device.create_graph_builder().begin_building(mode=mode)
-   with pytest.raises(RuntimeError,
-       match=r"DeviceMemoryResource cannot perform memory operations on a capturing "
-             r"stream \(consider using GraphMemoryResource\)\."
-   ):
-       dmr.allocate(1, stream=gb)
-   gb.end_building().complete()
+    """
+    Test expected errors (and non-errors) using DeviceMemoryResource with graph
+    capture.
+    """
+    device = Device()
+    stream = device.create_stream()
+    dmr = DeviceMemoryResource(device)
+
+    # Not capturing
+    dmr.allocate(1, stream=stream).close()  # no error
 
+    # Capturing
+    gb = device.create_graph_builder().begin_building(mode=mode)
+    with pytest.raises(
+        RuntimeError,
+        match=r"DeviceMemoryResource cannot perform memory operations on a capturing "
+        r"stream \(consider using GraphMemoryResource\)\.",
+    ):
+        dmr.allocate(1, stream=gb)
+    gb.end_building().complete()
diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index 138ea24aa8..f2fd64344c 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -22,9 +22,9 @@
 )
 from cuda.core.experimental._memory import _SynchronousMemoryResource
 from cuda.core.experimental._utils.cuda_utils import CUDAError
+from helpers.misc import StreamWrapper
 
 from conftest import skipif_need_cuda_headers
-from helpers.misc import StreamWrapper
 
 
 def test_launch_config_init(init_cuda):
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 193dc3bd29..5ca4e41211 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -689,6 +689,7 @@ def test_strided_memory_view_refcnt():
     assert av.strides[1] == 64
     assert sys.getrefcount(av.strides) >= 2
 
+
 def test_graph_memory_resource_object(init_cuda):
     device = Device()
     gmr1 = GraphMemoryResource(device)

From 7408fe8a050015b39a9d2b797241c624036d2af1 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Thu, 13 Nov 2025 08:29:14 -0800
Subject: [PATCH 13/15] Minor clean up.

---
 .../_memory/_graph_memory_resource.pxd             |  3 ---
 .../_memory/_graph_memory_resource.pyx             | 14 ++++++++++----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd
index 7d20757587..f9c7798e76 100644
--- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd
+++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd
@@ -2,10 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from cuda.bindings cimport cydriver
 from cuda.core.experimental._memory._buffer cimport MemoryResource
-from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource
-from cuda.core.experimental._memory._ipc cimport IPCData
 
 
 cdef class cyGraphMemoryResource(MemoryResource):
diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
index 6ba09481d6..6fbb6088f0 100644
--- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
@@ -47,7 +47,10 @@ cdef class GraphMemoryResourceAttributes:
 
     @GMRA_mem_attribute(int, settable=True)
     def reserved_mem_high(self):
-        """High watermark of backing memory allocated."""
+        """
+        High watermark of backing memory allocated. It can be set to zero to
+        reset it to the current usage.
+        """
 
     @GMRA_mem_attribute(int)
     def used_mem_current(self):
@@ -55,10 +58,13 @@ cdef class GraphMemoryResourceAttributes:
 
     @GMRA_mem_attribute(int, settable=True)
     def used_mem_high(self):
-        """High watermark of memory in use."""
+        """
+        High watermark of memory in use. It can be set to zero to reset it to
+        the current usage.
+        """
 
 
-cdef GMRA_mem_attribute(property_type: type, settable : bool = False):
+cdef GMRA_mem_attribute(property_type: type, settable: bool = False):
     _settable = settable
 
     def decorator(stub):
@@ -161,7 +167,7 @@ class GraphMemoryResource(cyGraphMemoryResource):
 
     Parameters
     ----------
-    device_id : int | Device
+    device_id: int | Device
         Device or Device ordinal for which a graph memory resource is obtained.
     """
 

From c1dbc6a5f1baa6a57fbb58a37e7603a3097a0f7e Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Wed, 19 Nov 2025 10:03:17 -0800
Subject: [PATCH 14/15] Change public signatures to accept "Stream |
 GraphBuiler" where only streams were accepted. Add a helper Stream_accept to
 accept Stream-like arguments throughout. Revert changes to Stream._init that
 relaxed acceptance criteria for first argument. Revert addition of
 StreamWrapper and associated tests. Suppress invalid context errors in
 deallocate to avoid noise during shutdown.

---
 cuda_core/cuda/core/experimental/_device.pyx  |  8 ++--
 .../cuda/core/experimental/_launcher.pyx      | 12 +++---
 .../core/experimental/_memory/_buffer.pyx     | 40 +++++++------------
 .../_memory/_device_memory_resource.pyx       | 19 +++++----
 .../_memory/_graph_memory_resource.pyx        | 12 +++---
 .../_memory/_virtual_memory_resource.py       | 13 ++++--
 cuda_core/cuda/core/experimental/_stream.pxd  |  2 +
 cuda_core/cuda/core/experimental/_stream.pyx  | 26 +++++++++---
 cuda_core/tests/helpers/misc.py               | 32 ---------------
 cuda_core/tests/test_launcher.py              | 16 ++------
 cuda_core/tests/test_memory.py                | 31 +++++---------
 cuda_core/tests/test_stream.py                |  6 +--
 12 files changed, 87 insertions(+), 130 deletions(-)
 delete mode 100644 cuda_core/tests/helpers/misc.py

diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx
index 833c74c0a8..445f526f8f 100644
--- a/cuda_core/cuda/core/experimental/_device.pyx
+++ b/cuda_core/cuda/core/experimental/_device.pyx
@@ -1242,7 +1242,7 @@ class Device:
         """
         raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189")
 
-    def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions | None = None) -> Stream:
+    def create_stream(self, obj: IsStreamT | None = None, options: StreamOptions | None = None) -> Stream:
         """Create a Stream object.
 
         New stream objects can be created in two different ways:
@@ -1295,7 +1295,7 @@ class Device:
         ctx = self._get_current_context()
         return Event._init(self._id, ctx, options, True)
 
-    def allocate(self, size, stream: Optional[IsStreamT] = None) -> Buffer:
+    def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
         """Allocate device memory from a specified stream.
 
         Allocates device memory of `size` bytes on the specified `stream`
@@ -1311,7 +1311,7 @@ class Device:
         ----------
         size : int
             Number of bytes to allocate.
-        stream : :obj:`~_stream.IsStreamT`, optional
+        stream : :obj:`~_stream.Stream`, optional
             The stream establishing the stream ordering semantic.
             Default value of `None` uses default stream.
 
@@ -1322,8 +1322,6 @@ class Device:
 
         """
         self._check_context_initialized()
-        if stream is None:
-            stream = default_stream()
         return self._mr.allocate(size, stream)
 
     def sync(self):
diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/experimental/_launcher.pyx
index b94c3d2b71..a647c0fb6e 100644
--- a/cuda_core/cuda/core/experimental/_launcher.pyx
+++ b/cuda_core/cuda/core/experimental/_launcher.pyx
@@ -1,11 +1,13 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
+from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config
+from cuda.core.experimental._stream cimport Stream_accept
+
 
 from cuda.core.experimental._kernel_arg_handler import ParamHolder
-from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config
 from cuda.core.experimental._module import Kernel
-from cuda.core.experimental._stream import IsStreamT, Stream
+from cuda.core.experimental._stream import Stream
 from cuda.core.experimental._utils.clear_error_support import assert_type
 from cuda.core.experimental._utils.cuda_utils import (
     _reduce_3_tuple,
@@ -33,13 +35,13 @@ def _lazy_init():
     _inited = True
 
 
-def launch(stream: IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args):
+def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args):
     """Launches a :obj:`~_module.Kernel`
     object with launch-time configuration.
 
     Parameters
     ----------
-    stream : :obj:`~_stream.IsStreamT`
+    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
         The stream establishing the stream ordering semantic of a
         launch.
     config : :obj:`LaunchConfig`
@@ -52,7 +54,7 @@ def launch(stream: IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args
         launching kernel.
 
     """
-    stream = Stream._init(stream)
+    stream = Stream_accept(stream, allow_default=False, default_value=None, allow_stream_protocol=True)
     assert_type(kernel, Kernel)
     _lazy_init()
     config = check_or_create_options(LaunchConfig, config, "launch config")
diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
index f18d5ec02d..8f01b638fb 100644
--- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
@@ -9,16 +9,15 @@ from libc.stdint cimport uintptr_t
 from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource
 from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor
 from cuda.core.experimental._memory cimport _ipc
-from cuda.core.experimental._stream cimport default_stream, Stream
+from cuda.core.experimental._stream cimport Stream_accept, Stream
 from cuda.core.experimental._utils.cuda_utils cimport (
     _check_driver_error as raise_if_driver_error,
 )
 
 import abc
-from typing import Optional, TypeVar, Union
+from typing import TypeVar, Union
 
 from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
-from cuda.core.experimental._stream import IsStreamT
 from cuda.core.experimental._utils.cuda_utils import driver
 
 __all__ = ['Buffer', 'MemoryResource']
@@ -103,7 +102,7 @@ cdef class Buffer:
         """Export a buffer allocated for sharing between processes."""
         return _ipc.Buffer_get_ipc_descriptor(self)
 
-    def close(self, stream: Stream = None):
+    def close(self, stream: Stream | GraphBuilder | None = None):
         """Deallocate this buffer asynchronously on the given stream.
 
         This buffer is released back to their memory resource
@@ -111,13 +110,13 @@ cdef class Buffer:
 
         Parameters
         ----------
-        stream : Stream, optional
+        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
             The stream object to use for asynchronous deallocation. If None,
             the behavior depends on the underlying memory resource.
         """
         Buffer_close(self, stream)
 
-    def copy_to(self, dst: Buffer = None, *, stream: IsStreamT) -> Buffer:
+    def copy_to(self, dst: Buffer = None, *, stream: Stream | GraphBuilder) -> Buffer:
         """Copy from this buffer to the dst buffer asynchronously on the given stream.
 
         Copies the data from this buffer to the provided dst buffer.
@@ -128,13 +127,12 @@ cdef class Buffer:
         ----------
         dst : :obj:`~_memory.Buffer`
             Source buffer to copy data from
-        stream : IsStreamT
+        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
             Keyword argument specifying the stream for the
             asynchronous copy
 
         """
-        stream = Stream._init(stream)
-
+        stream = Stream_accept(stream)
         cdef size_t src_size = self._size
 
         if dst is None:
@@ -152,20 +150,19 @@ cdef class Buffer:
         raise_if_driver_error(err)
         return dst
 
-    def copy_from(self, src: Buffer, *, stream: IsStreamT):
+    def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder):
         """Copy from the src buffer to this buffer asynchronously on the given stream.
 
         Parameters
         ----------
         src : :obj:`~_memory.Buffer`
             Source buffer to copy data from
-        stream : IsStreamT
+        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
             Keyword argument specifying the stream for the
             asynchronous copy
 
         """
-        stream = Stream._init(stream)
-
+        stream = Stream_accept(stream)
         cdef size_t dst_size = self._size
         cdef size_t src_size = src._size
 
@@ -276,14 +273,7 @@ cdef class Buffer:
 cdef inline void Buffer_close(Buffer self, stream):
     cdef Stream s
     if self._ptr and self._memory_resource is not None:
-        if stream is None:
-            if self._alloc_stream is not None:
-                s = self._alloc_stream
-            else:
-                # TODO: remove this branch when from_handle takes a stream
-                s = <Stream>(default_stream())
-        else:
-            s = <Stream>stream
+        s = Stream_accept(stream, allow_default=True, default_value=self._alloc_stream)
         self._memory_resource.deallocate(self._ptr, self._size, s)
         self._ptr = 0
         self._memory_resource = None
@@ -304,14 +294,14 @@ cdef class MemoryResource:
     """
 
     @abc.abstractmethod
-    def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer:
+    def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
         """Allocate a buffer of the requested size.
 
         Parameters
         ----------
         size : int
             The size of the buffer to allocate, in bytes.
-        stream : IsStreamT, optional
+        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
             The stream on which to perform the allocation asynchronously.
             If None, it is up to each memory resource implementation to decide
             and document the behavior.
@@ -325,7 +315,7 @@ cdef class MemoryResource:
         ...
 
     @abc.abstractmethod
-    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None):
+    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None):
         """Deallocate a buffer previously allocated by this resource.
 
         Parameters
@@ -334,7 +324,7 @@ cdef class MemoryResource:
             The pointer or handle to the buffer to deallocate.
         size : int
             The size of the buffer to deallocate, in bytes.
-        stream : IsStreamT, optional
+        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
             The stream on which to perform the deallocation asynchronously.
             If None, it is up to each memory resource implementation to decide
             and document the behavior.
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
index b354d595c6..12f5a666ab 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
@@ -12,7 +12,7 @@ from cuda.bindings cimport cydriver
 from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource
 from cuda.core.experimental._memory cimport _ipc
 from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData
-from cuda.core.experimental._stream cimport default_stream, Stream
+from cuda.core.experimental._stream cimport Stream_accept, Stream
 from cuda.core.experimental._utils.cuda_utils cimport (
     check_or_create_options,
     HANDLE_RETURN,
@@ -308,14 +308,14 @@ cdef class DeviceMemoryResource(MemoryResource):
             raise RuntimeError("Imported memory resource cannot be exported")
         return self._ipc_data._alloc_handle
 
-    def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer:
+    def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
         """Allocate a buffer of the requested size.
 
         Parameters
         ----------
         size : int
             The size of the buffer to allocate, in bytes.
-        stream : IsStreamT, optional
+        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
             The stream on which to perform the allocation asynchronously.
             If None, an internal stream is used.
 
@@ -327,10 +327,10 @@ cdef class DeviceMemoryResource(MemoryResource):
         """
         if self.is_mapped:
             raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource")
-        stream = Stream._init(stream) if stream is not None else default_stream()
+        stream = Stream_accept(stream, allow_default=True)
         return DMR_allocate(self, size, <Stream> stream)
 
-    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None):
+    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None):
         """Deallocate a buffer previously allocated by this resource.
 
         Parameters
@@ -339,12 +339,12 @@ cdef class DeviceMemoryResource(MemoryResource):
             The pointer or handle to the buffer to deallocate.
         size : int
             The size of the buffer to deallocate, in bytes.
-        stream : IsStreamT, optional
+        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
             The stream on which to perform the deallocation asynchronously.
             If the buffer is deallocated without an explicit stream, the allocation stream
             is used.
         """
-        stream = Stream._init(stream) if stream is not None else default_stream()
+        stream = Stream_accept(stream, allow_default=True)
         DMR_deallocate(self, <uintptr_t>ptr, size, <Stream> stream)
 
     @property
@@ -497,8 +497,11 @@ cdef inline void DMR_deallocate(
 ) noexcept:
     cdef cydriver.CUstream s = stream._handle
     cdef cydriver.CUdeviceptr devptr = <cydriver.CUdeviceptr>ptr
+    cdef cydriver.CUresult r
     with nogil:
-        HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s))
+        r = cydriver.cuMemFreeAsync(devptr, s)
+        if r != cydriver.CUDA_ERROR_INVALID_CONTEXT:
+            HANDLE_RETURN(r)
 
 
 cdef inline DMR_close(DeviceMemoryResource self):
diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
index 6fbb6088f0..06234ef4b8 100644
--- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
@@ -8,11 +8,11 @@ from libc.stdint cimport uintptr_t, intptr_t, uint64_t
 
 from cuda.bindings cimport cydriver
 from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource
-from cuda.core.experimental._stream cimport default_stream, Stream
+from cuda.core.experimental._stream cimport Stream_accept, Stream
 from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 
 from functools import cache
-from typing import Optional, TYPE_CHECKING
+from typing import TYPE_CHECKING
 
 from cuda.core.experimental._utils.cuda_utils import driver
 
@@ -105,18 +105,18 @@ cdef class cyGraphMemoryResource(MemoryResource):
     def __cinit__(self, int device_id):
         self._dev_id = device_id
 
-    def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer:
+    def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
         """
         Allocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`.
         """
-        stream = Stream._init(stream) if stream is not None else default_stream()
+        stream = Stream_accept(stream, allow_default=True)
         return GMR_allocate(self, size, <Stream> stream)
 
-    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None):
+    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None):
         """
         Deallocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`.
         """
-        stream = Stream._init(stream) if stream is not None else default_stream()
+        stream = Stream_accept(stream, allow_default=True)
         return GMR_deallocate(ptr, size, <Stream> stream)
 
     def close(self):
diff --git a/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py b/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py
index 5379f0b8f1..40b8e7267d 100644
--- a/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py
+++ b/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py
@@ -2,11 +2,12 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 from dataclasses import dataclass, field
-from typing import Iterable, Literal, Union
+from typing import TYPE_CHECKING, Iterable, Literal, Union
 
 from cuda.core.experimental._memory._buffer import Buffer, MemoryResource
-from cuda.core.experimental._stream import Stream
 from cuda.core.experimental._utils.cuda_utils import (
     Transaction,
     check_or_create_options,
@@ -17,6 +18,10 @@
     _check_driver_error as raise_if_driver_error,
 )
 
+if TYPE_CHECKING:
+    from cuda.core.experimental._graph import GraphBuilder
+    from cuda.core.experimental._stream import Stream
+
 __all__ = ["VirtualMemoryResourceOptions", "VirtualMemoryResource"]
 
 VirtualMemoryHandleTypeT = Union[Literal["posix_fd", "generic", "win32", "win32_kmt", "fabric"], None]
@@ -457,7 +462,7 @@ def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list:
 
         return descs
 
-    def allocate(self, size: int, stream: Stream = None) -> Buffer:
+    def allocate(self, size: int, stream: Stream | GraphBuilder | None = None) -> Buffer:
         """
         Allocate a buffer of the given size using CUDA virtual memory.
 
@@ -541,7 +546,7 @@ def allocate(self, size: int, stream: Stream = None) -> Buffer:
         buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self)
         return buf
 
-    def deallocate(self, ptr: int, size: int, stream: Stream = None) -> None:
+    def deallocate(self, ptr: int, size: int, stream: Stream | GraphBuilder | None = None) -> None:
         """
         Deallocate memory on the device using CUDA VMM APIs.
         """
diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd
index 68a410d1ed..59c27749bc 100644
--- a/cuda_core/cuda/core/experimental/_stream.pxd
+++ b/cuda_core/cuda/core/experimental/_stream.pxd
@@ -22,3 +22,5 @@ cdef class Stream:
 
 
 cpdef Stream default_stream()
+cdef Stream Stream_accept(arg, bint allow_default=*, Stream default_value=*, bint allow_stream_protocol=*)
+# from cuda.core.experimental._stream cimport Stream_accept
diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx
index c9192c5ba4..0f53dbb582 100644
--- a/cuda_core/cuda/core/experimental/_stream.pyx
+++ b/cuda_core/cuda/core/experimental/_stream.pyx
@@ -108,16 +108,13 @@ cdef class Stream:
         return self
 
     @classmethod
-    def _init(cls, obj: Optional[IsStreamT] = None, options=None, device_id: int = None):
-        if isinstance(obj, Stream):
-            return obj
-
+    def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None):
         cdef Stream self = Stream.__new__(cls)
 
         if obj is not None and options is not None:
             raise ValueError("obj and options cannot be both specified")
         if obj is not None:
-            self._handle = _handle_from_stream_t(obj)
+            self._handle = _handle_from_stream_protocol(obj)
             # TODO: check if obj is created under the current context/device
             self._owner = obj
             return self
@@ -420,7 +417,7 @@ cpdef Stream default_stream():
         return C_LEGACY_DEFAULT_STREAM
 
 
-cdef cydriver.CUstream _handle_from_stream_t(obj) except*:
+cdef cydriver.CUstream _handle_from_stream_protocol(obj) except*:
     if isinstance(obj, Stream):
         return <cydriver.CUstream><uintptr_t>(obj.handle)
 
@@ -451,3 +448,20 @@ cdef cydriver.CUstream _handle_from_stream_t(obj) except*:
             f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}"
         )
     return <cydriver.CUstream><uintptr_t>(info[1])
+
+# Helper for API functions that accept either Stream or GraphBuilder. Performs
+# needed checks and returns the relevant stream.
+cdef Stream Stream_accept(arg, bint allow_default=False, Stream default_value=None, bint allow_stream_protocol=False):
+    if arg is None:
+        if allow_default:
+            if default_value is not None:
+                return default_value
+            else:
+                return default_stream()
+    elif isinstance(arg, Stream):
+        return <Stream> arg
+    elif isinstance(arg, GraphBuilder):
+        return <Stream> arg.stream
+    elif allow_stream_protocol and isinstance(arg, IsStreamT):
+        return Stream._init(arg)
+    raise TypeError(f"Stream or GraphBuilder expected, got {type(arg).__name__}")
diff --git a/cuda_core/tests/helpers/misc.py b/cuda_core/tests/helpers/misc.py
deleted file mode 100644
index 33508091a7..0000000000
--- a/cuda_core/tests/helpers/misc.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-from cuda.core.experimental import Stream
-
-
-class StreamWrapper:
-    """
-    A wrapper around Stream for testing IsStreamT conversions.
-    """
-
-    def __init__(self, stream: Stream):
-        self._stream = stream
-
-    def __cuda_stream__(self):
-        return self._stream.__cuda_stream__()
-
-    def close(self):
-        self._stream.close()
-
-    @property
-    def handle(self):
-        return self._stream.handle
-
-    def sync(self):
-        return self._stream.sync()
-
-    def __eq__(self, rhs):
-        return self._stream == Stream._init(rhs)
-
-    def __hash__(self):
-        return hash(self._stream)
diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index f2fd64344c..34a62ba8c2 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -22,7 +22,6 @@
 )
 from cuda.core.experimental._memory import _SynchronousMemoryResource
 from cuda.core.experimental._utils.cuda_utils import CUDAError
-from helpers.misc import StreamWrapper
 
 from conftest import skipif_need_cuda_headers
 
@@ -180,10 +179,9 @@ def test_launch_invalid_values(init_cuda):
     )
 
 
-@pytest.mark.parametrize("wrap_stream", [True, False])
 @pytest.mark.parametrize("python_type, cpp_type, init_value", PARAMS)
 @pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
-def test_launch_scalar_argument(python_type, cpp_type, init_value, wrap_stream):
+def test_launch_scalar_argument(python_type, cpp_type, init_value):
     dev = Device()
     dev.set_current()
 
@@ -222,8 +220,6 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value, wrap_stream):
 
     # Launch with 1 thread
     stream = dev.default_stream
-    if wrap_stream:
-        stream = StreamWrapper(stream)
     config = LaunchConfig(grid=1, block=1)
     launch(stream, config, ker, arr.ctypes.data, scalar)
     stream.sync()
@@ -233,13 +229,10 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value, wrap_stream):
 
 
 @skipif_need_cuda_headers  # cg
-@pytest.mark.parametrize("wrap_stream", [True, False])
-def test_cooperative_launch(wrap_stream):
+def test_cooperative_launch():
     dev = Device()
     dev.set_current()
     s = dev.create_stream(options={"nonblocking": True})
-    if wrap_stream:
-        s = StreamWrapper(s)
 
     # CUDA kernel templated on type T
     code = r"""
@@ -280,7 +273,6 @@ def test_cooperative_launch(wrap_stream):
 
 
 @pytest.mark.skipif(cp is None, reason="cupy not installed")
-@pytest.mark.parametrize("wrap_stream", [True, False])
 @pytest.mark.parametrize(
     "memory_resource_class",
     [
@@ -294,13 +286,11 @@ def test_cooperative_launch(wrap_stream):
         ),
     ],
 )
-def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_resource_class, wrap_stream):
+def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_resource_class):
     """Test that kernels can access memory allocated by memory resources."""
     dev = Device()
     dev.set_current()
     stream = dev.create_stream()
-    if wrap_stream:
-        stream = StreamWrapper(stream)
     # tell CuPy to use our stream as the current stream:
     cp.cuda.ExternalStream(int(stream.handle)).use()
 
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 5ca4e41211..0c33b13b00 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -31,7 +31,6 @@
 from cuda.core.experimental._utils.cuda_utils import handle_return
 from cuda.core.experimental.utils import StridedMemoryView
 from helpers.buffers import DummyUnifiedMemoryResource
-from helpers.misc import StreamWrapper
 
 from cuda_python_test_helpers import supports_ipc_mempool
 
@@ -168,12 +167,10 @@ def test_buffer_initialization():
     buffer_initialization(DummyPinnedMemoryResource(device))
 
 
-def buffer_copy_to(dummy_mr: MemoryResource, device: Device, wrap_stream, check=False):
+def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False):
     src_buffer = dummy_mr.allocate(size=1024)
     dst_buffer = dummy_mr.allocate(size=1024)
     stream = device.create_stream()
-    if wrap_stream:
-        stream = StreamWrapper(stream)
 
     if check:
         src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte))
@@ -193,21 +190,18 @@ def buffer_copy_to(dummy_mr: MemoryResource, device: Device, wrap_stream, check=
     src_buffer.close()
 
 
-@pytest.mark.parametrize("wrap_stream", [True, False])
-def test_buffer_copy_to(wrap_stream):
+def test_buffer_copy_to():
     device = Device()
     device.set_current()
-    buffer_copy_to(DummyDeviceMemoryResource(device), device, wrap_stream)
-    buffer_copy_to(DummyUnifiedMemoryResource(device), device, wrap_stream)
-    buffer_copy_to(DummyPinnedMemoryResource(device), device, wrap_stream, check=True)
+    buffer_copy_to(DummyDeviceMemoryResource(device), device)
+    buffer_copy_to(DummyUnifiedMemoryResource(device), device)
+    buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True)
 
 
-def buffer_copy_from(dummy_mr: MemoryResource, device, wrap_stream, check=False):
+def buffer_copy_from(dummy_mr: MemoryResource, device, check=False):
     src_buffer = dummy_mr.allocate(size=1024)
     dst_buffer = dummy_mr.allocate(size=1024)
     stream = device.create_stream()
-    if wrap_stream:
-        stream = StreamWrapper(stream)
 
     if check:
         src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte))
@@ -227,13 +221,12 @@ def buffer_copy_from(dummy_mr: MemoryResource, device, wrap_stream, check=False)
     src_buffer.close()
 
 
-@pytest.mark.parametrize("wrap_stream", [True, False])
-def test_buffer_copy_from(wrap_stream):
+def test_buffer_copy_from():
     device = Device()
     device.set_current()
-    buffer_copy_from(DummyDeviceMemoryResource(device), device, wrap_stream)
-    buffer_copy_from(DummyUnifiedMemoryResource(device), device, wrap_stream)
-    buffer_copy_from(DummyPinnedMemoryResource(device), device, wrap_stream, check=True)
+    buffer_copy_from(DummyDeviceMemoryResource(device), device)
+    buffer_copy_from(DummyUnifiedMemoryResource(device), device)
+    buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True)
 
 
 def buffer_close(dummy_mr: MemoryResource):
@@ -533,16 +526,12 @@ def test_device_memory_resource():
     buffer = mr.allocate(1024, stream=stream)
     assert buffer.handle != 0
     buffer.close()
-    buffer = mr.allocate(1024, stream=StreamWrapper(stream))
-    assert buffer.handle != 0
-    buffer.close()
 
     # Test memory copying between buffers from same pool
     src_buffer = mr.allocate(64)
     dst_buffer = mr.allocate(64)
     stream = device.create_stream()
     src_buffer.copy_to(dst_buffer, stream=stream)
-    src_buffer.copy_to(dst_buffer, stream=StreamWrapper(stream))
     device.sync()
     dst_buffer.close()
     src_buffer.close()
diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py
index 2e10fb1003..c8165548f6 100644
--- a/cuda_core/tests/test_stream.py
+++ b/cuda_core/tests/test_stream.py
@@ -6,7 +6,6 @@
 from cuda.core.experimental._event import Event
 from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM
 from cuda.core.experimental._utils.cuda_utils import driver
-from helpers.misc import StreamWrapper
 
 
 def test_stream_init_disabled():
@@ -77,12 +76,9 @@ def test_stream_context(init_cuda):
     assert context._handle is not None
 
 
-@pytest.mark.parametrize("wrap_stream", [True, False])
-def test_stream_from_foreign_stream(init_cuda, wrap_stream):
+def test_stream_from_foreign_stream(init_cuda):
     device = Device()
     other_stream = device.create_stream(options=StreamOptions())
-    if wrap_stream:
-        other_stream = StreamWrapper(other_stream)
     stream = device.create_stream(obj=other_stream)
     # Now that __eq__ is implemented (issue #664), we can compare directly
     assert other_stream == stream

From 2f14443f6d7f1f7f1b224c99ac8f280bae86cb3f Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Wed, 19 Nov 2025 17:14:38 -0800
Subject: [PATCH 15/15] Add deprecation warning when stream protocol is used
 with launch.

---
 cuda_core/cuda/core/experimental/_stream.pyx | 16 ++++++++++++++--
 cuda_core/tests/helpers/misc.py              | 14 ++++++++++++++
 cuda_core/tests/test_launcher.py             |  9 +++++++++
 cuda_core/tests/test_stream.py               |  3 ++-
 4 files changed, 39 insertions(+), 3 deletions(-)
 create mode 100644 cuda_core/tests/helpers/misc.py

diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx
index 0f53dbb582..53a6f8dd34 100644
--- a/cuda_core/cuda/core/experimental/_stream.pyx
+++ b/cuda_core/cuda/core/experimental/_stream.pyx
@@ -462,6 +462,18 @@ cdef Stream Stream_accept(arg, bint allow_default=False, Stream default_value=No
         return <Stream> arg
     elif isinstance(arg, GraphBuilder):
         return <Stream> arg.stream
-    elif allow_stream_protocol and isinstance(arg, IsStreamT):
-        return Stream._init(arg)
+    elif allow_stream_protocol:
+        try:
+            stream = Stream._init(arg)
+        except:
+            pass
+        else:
+            warnings.warn(
+                "Passing foreign stream objects to this function via the "
+                "stream protocol is deprecated. Convert the object explicitly "
+                "using Stream(obj) instead.",
+                stacklevel=2,
+                category=DeprecationWarning,
+            )
+            return <Stream> stream
     raise TypeError(f"Stream or GraphBuilder expected, got {type(arg).__name__}")
diff --git a/cuda_core/tests/helpers/misc.py b/cuda_core/tests/helpers/misc.py
new file mode 100644
index 0000000000..d5f9cd7534
--- /dev/null
+++ b/cuda_core/tests/helpers/misc.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+class StreamWrapper:
+    """
+    A wrapper around Stream for testing IsStreamT conversions.
+    """
+
+    def __init__(self, stream):
+        self._stream = stream
+
+    def __cuda_stream__(self):
+        return self._stream.__cuda_stream__()
+
diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index 34a62ba8c2..d2e0a89a28 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -4,6 +4,7 @@
 import ctypes
 
 import helpers
+from helpers.misc import StreamWrapper
 
 try:
     import cupy as cp
@@ -140,6 +141,14 @@ def test_launch_invalid_values(init_cuda):
     with pytest.raises(TypeError):
         launch(stream, ker, None)
 
+    msg = (
+        r"Passing foreign stream objects to this function via the stream "
+        r"protocol is deprecated\. Convert the object explicitly using "
+        r"Stream\(obj\) instead\."
+    )
+    with pytest.warns(DeprecationWarning, match=msg):
+        launch(StreamWrapper(stream), config, ker)
+
     launch(stream, config, ker)
 
 
diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py
index c8165548f6..695a70e931 100644
--- a/cuda_core/tests/test_stream.py
+++ b/cuda_core/tests/test_stream.py
@@ -6,6 +6,7 @@
 from cuda.core.experimental._event import Event
 from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM
 from cuda.core.experimental._utils.cuda_utils import driver
+from helpers.misc import StreamWrapper
 
 
 def test_stream_init_disabled():
@@ -79,7 +80,7 @@ def test_stream_context(init_cuda):
 def test_stream_from_foreign_stream(init_cuda):
     device = Device()
     other_stream = device.create_stream(options=StreamOptions())
-    stream = device.create_stream(obj=other_stream)
+    stream = device.create_stream(obj=StreamWrapper(other_stream))
     # Now that __eq__ is implemented (issue #664), we can compare directly
     assert other_stream == stream
     assert hash(other_stream) == hash(stream)