From befa7682d6a4c3cb32bdabfa3739d1cbfd7a2d83 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 3 Nov 2025 13:47:24 -0800 Subject: [PATCH 01/15] Implement non-pooling memory allocation. --- .../_memory/_device_memory_resource.pyx | 47 +++++++++++-- cuda_core/tests/test_graph.py | 68 +++++++++++++++++++ cuda_core/tests/test_memory.py | 63 ++++++++++++++--- 3 files changed, 162 insertions(+), 16 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index 47b6fd114e..47cef28a55 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -48,9 +48,15 @@ cdef class DeviceMemoryResourceOptions: max_size : int, optional Maximum pool size. When set to 0, defaults to a system-dependent value. (Default to 0) + + mempool_enabled : bool, optional + Whether to use a memory pool. Pool-based allocations cannot be captured + in a graph but are the only ones that support sharing via IPC. + (Default to True) """ - ipc_enabled : cython.bint = False - max_size : cython.size_t = 0 + ipc_enabled : cython.bint = False + max_size : cython.size_t = 0 + mempool_enabled : cython.bint = True cdef class DeviceMemoryResourceAttributes: @@ -220,8 +226,10 @@ cdef class DeviceMemoryResource(MemoryResource): if opts is None: DMR_init_current(self, dev_id) + elif opts.mempool_enabled: + DMR_init_create_mempool(self, dev_id, opts) else: - DMR_init_create(self, dev_id, opts) + DMR_init_no_mempool(self, dev_id, opts) def __dealloc__(self): DMR_close(self) @@ -343,7 +351,8 @@ cdef class DeviceMemoryResource(MemoryResource): @property def attributes(self) -> DeviceMemoryResourceAttributes: - if self._attributes is None: + """Memory pool attributes or None if memory pooling is not enabled.""" + if self._attributes is None and self.is_mempool_enabled: ref = weakref.ref(self) self._attributes = DeviceMemoryResourceAttributes._init(ref) return self._attributes @@ -356,6 +365,8 @@ cdef class DeviceMemoryResource(MemoryResource): @property def handle(self) -> driver.CUmemoryPool: """Handle to the underlying memory pool.""" + if self._handle == NULL: + raise RuntimeError("Memory resource has no memory pool.") return driver.CUmemoryPool((self._handle)) @property @@ -386,6 +397,11 @@ cdef class DeviceMemoryResource(MemoryResource): """ return self._ipc_data is not None and self._ipc_data._is_mapped + @property + def is_mempool_enabled(self) -> bool: + """Whether this memory resource uses a memory pool.""" + return self._handle != NULL + @property def uuid(self) -> Optional[uuid.UUID]: """ @@ -430,7 +446,7 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): )) -cdef void DMR_init_create( +cdef void DMR_init_create_mempool( DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts ): # Create a new memory pool. @@ -460,11 +476,28 @@ cdef void DMR_init_create( self._ipc_data = IPCData(alloc_handle, mapped=False) +cdef void DMR_init_no_mempool( + DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts +): + # Create mr without a memory pool. + if opts.ipc_enabled: + raise TypeError("Cannot create an IPC-enabled memory resource without " + "memory pooling enabled.") + if opts.max_size != 0: + raise TypeError("Cannot supply max_size without memory pooling enabled.") + + self._dev_id = dev_id + + cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr - with nogil: - HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) + if self.is_mempool_enabled: + with nogil: + HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) + else: + with nogil: + HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s)) cdef Buffer buf = Buffer.__new__(Buffer) buf._ptr = (devptr) buf._ptr_obj = None diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py index cc558b6d22..d4312363cb 100644 --- a/cuda_core/tests/test_graph.py +++ b/cuda_core/tests/test_graph.py @@ -11,6 +11,8 @@ from cuda import nvrtc from cuda.core.experimental import ( Device, + DeviceMemoryResource, + DeviceMemoryResourceOptions, GraphBuilder, GraphCompleteOptions, GraphDebugPrintOptions, @@ -21,6 +23,7 @@ launch, ) from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return +from helpers.buffers import compare_equal_buffers, make_scratch_buffer def _common_kernels(): @@ -61,6 +64,30 @@ def _common_kernels_conditional(): return mod +def _common_kernels_alloc(): + code = """ + __global__ void set_zero(char *a, size_t nbytes) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = idx; i < nbytes; i += stride) { + a[i] = 0; + } + } + __global__ void add_one(char *a, size_t nbytes) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = idx; i < nbytes; i += stride) { + a[i] += 1; + } + } + """ + arch = "".join(f"{i}" for i in Device().compute_capability) + program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}") + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile("cubin", name_expressions=("set_zero", "add_one")) + return mod + + def test_graph_is_building(init_cuda): gb = Device().create_graph_builder() assert gb.is_building is False @@ -747,3 +774,44 @@ def test_graph_build_mode(init_cuda): with pytest.raises(ValueError, match="^Unsupported build mode:"): gb = Device().create_graph_builder().begin_building(mode=None) + + +def test_graph_alloc(init_cuda): + device = Device() + stream = device.create_stream() + options = DeviceMemoryResourceOptions(mempool_enabled=False) + mr = DeviceMemoryResource(device, options=options) + + # Get kernels. + mod = _common_kernels_alloc() + set_zero = mod.get_kernel("set_zero") + add_one = mod.get_kernel("add_one") + + NBYTES = 64 + target = mr.allocate(NBYTES, stream=stream) + + # Begin graph capture. + gb = Device().create_graph_builder().begin_building(mode="thread_local") + + work_buffer = mr.allocate(NBYTES, stream=gb.stream) + launch(gb, LaunchConfig(grid=1, block=1), set_zero, int(work_buffer.handle), NBYTES) + launch(gb, LaunchConfig(grid=1, block=1), add_one, int(work_buffer.handle), NBYTES) + launch(gb, LaunchConfig(grid=1, block=1), add_one, int(work_buffer.handle), NBYTES) + target.copy_from(work_buffer, stream=gb.stream) + + # Finalize the graph. + graph = gb.end_building().complete() + + # Upload and launch + graph.upload(stream) + graph.launch(stream) + stream.sync() + + # Check the result. + expected_buffer = make_scratch_buffer(device, 2, NBYTES) + compare_buffer = make_scratch_buffer(device, 0, NBYTES) + compare_buffer.copy_from(target, stream=stream) + stream.sync() + assert compare_equal_buffers(expected_buffer, compare_buffer) + +# TODO: check that mr.attributes is None with mempool_enabled=False diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index a261ec7a3d..5a8f798687 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -285,19 +285,28 @@ def test_buffer_dunder_dlpack_device_failure(): buffer.__dlpack_device__() +@pytest.mark.parametrize("use_mempool", [True, False]) @pytest.mark.parametrize("use_device_object", [True, False]) -def test_device_memory_resource_initialization(mempool_device, use_device_object): +def test_device_memory_resource_initialization(use_mempool, use_device_object): """Test that DeviceMemoryResource can be initialized successfully. This test verifies that the DeviceMemoryResource initializes properly, including the release threshold configuration for performance optimization. """ - device = mempool_device + device = Device() + + if use_mempool and not device.properties.memory_pools_supported: + pytest.skip("Device does not support mempool operations") + + device.set_current() # This should succeed and configure the memory pool release threshold. # The resource can be constructed from either a device or device ordinal. device_arg = device if use_device_object else device.device_id - mr = DeviceMemoryResource(device_arg) + if use_mempool: + mr = DeviceMemoryResource(device_arg) + else: + mr = DeviceMemoryResource(device_arg, options={'mempool_enabled': False}) # Verify basic properties assert mr.device_id == device.device_id @@ -312,6 +321,18 @@ def test_device_memory_resource_initialization(mempool_device, use_device_object buffer.close() +def test_device_memory_resource_initialization_errors(): + """Test illegal construct option combinations.""" + device = Device() + device.set_current() + + with pytest.raises(TypeError, match="Cannot create an IPC-enabled memory resource without memory pooling enabled"): + DeviceMemoryResource(device, {"mempool_enabled": False, "ipc_enabled": True}) + + with pytest.raises(TypeError, match="Cannot supply max_size without memory pooling enabled"): + DeviceMemoryResource(device, {"mempool_enabled": False, "max_size": 1024}) + + def test_vmm_allocator_basic_allocation(): """Test basic VMM allocation functionality. @@ -481,12 +502,23 @@ def test_vmm_allocator_rdma_unsupported_exception(): VirtualMemoryResource(device, config=options) -def test_mempool(mempool_device): - device = mempool_device +@pytest.mark.parametrize("use_mempool", [True, False]) +def test_device_memory_resource(use_mempool): + + device = Device() + + if use_mempool and not device.properties.memory_pools_supported: + pytest.skip("Device does not support mempool operations") + + device.set_current() # Test basic pool creation - options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False) - mr = DeviceMemoryResource(device, options=options) + if use_mempool: + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE) + mr = DeviceMemoryResource(device, options=options) + else: + options = DeviceMemoryResourceOptions(mempool_enabled=False) + mr = DeviceMemoryResource(device, options=options) assert mr.device_id == device.device_id assert mr.is_device_accessible assert not mr.is_host_accessible @@ -523,8 +555,12 @@ def test_mempool(mempool_device): dst_buffer.close() src_buffer.close() - # Test error cases - # Test IPC operations are disabled + +def test_mempool_ipc_errors(mempool_device): + """Test error cases when IPC operations are disabled.""" + device = mempool_device + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False) + mr = DeviceMemoryResource(device, options=options) buffer = mr.allocate(64) ipc_error_msg = "Memory resource is not IPC-enabled" @@ -599,6 +635,14 @@ def test_mempool_attributes(ipc_enabled, mempool_device, property_name, expected assert value >= current_value, f"{property_name} should be >= {current_prop}" +def test_mempool_no_attributes(): + """Ensure mempool attributes cannot be accessed when memory pooling is disabled.""" + device = Device() + device.set_current() + mr = DeviceMemoryResource(device, options={"mempool_enabled": False}) + assert mr.attributes is None + + def test_mempool_attributes_ownership(mempool_device): """Ensure the attributes bundle handles references correctly.""" device = mempool_device @@ -644,3 +688,4 @@ def test_strided_memory_view_refcnt(): assert av.strides[0] == 1 assert av.strides[1] == 64 assert sys.getrefcount(av.strides) >= 2 + From 88834f7361d419a6ab0c4b66c9aa91d89325cba8 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 4 Nov 2025 16:09:59 -0800 Subject: [PATCH 02/15] Add GraphMemoryResource. --- cuda_core/cuda/core/experimental/__init__.py | 1 + .../core/experimental/_memory/__init__.py | 1 + .../_memory/_device_memory_resource.pyx | 6 + .../cuda/core/experimental/_memory/_gmr.pxd | 13 ++ .../cuda/core/experimental/_memory/_gmr.pyx | 160 ++++++++++++++++++ cuda_core/tests/test_graph.py | 89 ++++++++-- cuda_core/tests/test_memory.py | 30 +++- 7 files changed, 281 insertions(+), 19 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_memory/_gmr.pxd create mode 100644 cuda_core/cuda/core/experimental/_memory/_gmr.pyx diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 8a60c031c5..67402e10a6 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -42,6 +42,7 @@ Buffer, DeviceMemoryResource, DeviceMemoryResourceOptions, + GraphMemoryResource, LegacyPinnedMemoryResource, MemoryResource, VirtualMemoryResource, diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 3c07fbdde6..8dd4385982 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -4,6 +4,7 @@ from ._buffer import * # noqa: F403 from ._device_memory_resource import * # noqa: F403 +from ._gmr import * # noqa: F403 from ._ipc import * # noqa: F403 from ._legacy import * # noqa: F403 from ._virtual_memory_resource import * # noqa: F403 diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index 47cef28a55..067cb5ff16 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -72,6 +72,12 @@ cdef class DeviceMemoryResourceAttributes: self._mr_weakref = mr return self + def __repr__(self): + return f"{self.__class__.__name__}(%s)" % ", ".join( + f"{attr}={getattr(self, attr)}" for attr in dir(self) + if not attr.startswith("_") + ) + @DMRA_mempool_attribute(bool) def reuse_follow_event_dependencies(self): """Allow memory to be reused when there are event dependencies between streams.""" diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pxd b/cuda_core/cuda/core/experimental/_memory/_gmr.pxd new file mode 100644 index 0000000000..0feb4a398b --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/_gmr.pxd @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver +from cuda.core.experimental._memory._buffer cimport MemoryResource +from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource +from cuda.core.experimental._memory._ipc cimport IPCData + + +cdef class cyGraphMemoryResource(MemoryResource): + cdef: + int _dev_id diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx new file mode 100644 index 0000000000..34999f5a54 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx @@ -0,0 +1,160 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport uintptr_t, intptr_t + +from cuda.bindings cimport cydriver +from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource +from cuda.core.experimental._stream cimport Stream +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN + +from functools import cache +from typing import TYPE_CHECKING + +from cuda.core.experimental._utils.cuda_utils import driver + +if TYPE_CHECKING: + from cuda.core.experimental._memory.buffer import DevicePointerT + +__all__ = ['GraphMemoryResource'] + + +cdef class GraphMemoryResourceAttributes: + cdef: + int _dev_id + + def __init__(self, *args, **kwargs): + raise RuntimeError("GraphMemoryResourceAttributes cannot be instantiated directly. Please use MemoryResource APIs.") + + @classmethod + def _init(cls, device_id: int): + cdef GraphMemoryResourceAttributes self = GraphMemoryResourceAttributes.__new__(cls) + self._dev_id = device_id + return self + + def __repr__(self): + return f"{self.__class__.__name__}(%s)" % ", ".join( + f"{attr}={getattr(self, attr)}" for attr in dir(self) + if not attr.startswith("_") + ) + + @GMRA_mem_attribute(int) + def reserved_mem_current(self): + """Current amount of backing memory allocated.""" + + @GMRA_mem_attribute(int, settable=True) + def reserved_mem_high(self): + """High watermark of backing memory allocated.""" + + @GMRA_mem_attribute(int) + def used_mem_current(self): + """Current amount of memory in use.""" + + @GMRA_mem_attribute(int, settable=True) + def used_mem_high(self): + """High watermark of memory in use.""" + + +cdef GMRA_mem_attribute(property_type: type, settable : bool = False): + _settable = settable + + def decorator(stub): + attr_enum = getattr( + driver.CUgraphMem_attribute, f"CU_GRAPH_MEM_ATTR_{stub.__name__.upper()}" + ) + + def fget(GraphMemoryResourceAttributes self) -> property_type: + value = GMRA_getattribute(self._dev_id, attr_enum) + return property_type(value) + + if _settable: + def fset(GraphMemoryResourceAttributes self, value: int): + GMRA_setattribute(self._dev_id, attr_enum, value) + else: + fset = None + + return property(fget=fget, fset=fset, doc=stub.__doc__) + return decorator + + +cdef int GMRA_getattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum): + cdef int value + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(device_id, attr_enum, &value)) + return value + + +cdef int GMRA_setattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum, int value): + with nogil: + HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(device_id, attr_enum, &value)) + + +cdef class cyGraphMemoryResource(MemoryResource): + def __cinit__(self, int device_id): + self._dev_id = device_id + + def allocate(self, size_t size, Stream stream = None) -> Buffer: + return GMR_allocate(self, size, stream) + + def deallocate(self, ptr: DevicePointerT, size_t size, Stream stream = None): + return GMR_deallocate(ptr, size, stream) + + def close(self): + pass + + def trim(self): + """Free unused memory that was cached on the specified device for use with graphs back to the OS.""" + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGraphMemTrim(self._dev_id)) + + @property + def attributes(self) -> GraphMemoryResourceAttributes: + """Asynchronous allocation attributes related to graphs.""" + return GraphMemoryResourceAttributes._init(self._dev_id) + + @property + def device_id(self) -> int: + """The associated device ordinal.""" + return self._dev_id + + @property + def is_device_accessible(self) -> bool: + """Return True. This memory resource provides device-accessible buffers.""" + return True + + @property + def is_host_accessible(self) -> bool: + """Return False. This memory resource does not provide host-accessible buffers.""" + return False + + +class GraphMemoryResource(cyGraphMemoryResource): + @cache + def __new__(cls, device_id: int | Device): + cdef int c_device_id = getattr(device_id, 'device_id', device_id) + return cyGraphMemoryResource.__new__(cls, c_device_id) + + +cdef Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream): + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr + with nogil: + HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s)) + cdef Buffer buf = Buffer.__new__(Buffer) + buf._ptr = (devptr) + buf._ptr_obj = None + buf._size = size + buf._memory_resource = self + buf._alloc_stream = stream + return buf + + +cdef void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept: + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr = ptr + with nogil: + HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) + diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py index d4312363cb..8539fa9a24 100644 --- a/cuda_core/tests/test_graph.py +++ b/cuda_core/tests/test_graph.py @@ -16,6 +16,7 @@ GraphBuilder, GraphCompleteOptions, GraphDebugPrintOptions, + GraphMemoryResource, LaunchConfig, LegacyPinnedMemoryResource, Program, @@ -776,42 +777,96 @@ def test_graph_build_mode(init_cuda): gb = Device().create_graph_builder().begin_building(mode=None) -def test_graph_alloc(init_cuda): +@pytest.mark.parametrize("use_graph", [False, True]) +def test_graph_alloc(init_cuda, use_graph): + """Test graph capture with memory allocated by GraphMemoryResource.""" device = Device() stream = device.create_stream() - options = DeviceMemoryResourceOptions(mempool_enabled=False) - mr = DeviceMemoryResource(device, options=options) # Get kernels. mod = _common_kernels_alloc() set_zero = mod.get_kernel("set_zero") add_one = mod.get_kernel("add_one") + # Allocate a non-graph target buffer in device memory. The work sequence + # will fill it. NBYTES = 64 - target = mr.allocate(NBYTES, stream=stream) + mr = DeviceMemoryResource(device) + tgt_buffer = mr.allocate(NBYTES, stream=stream) + + # Get the memory resource for graphs. + gmr = GraphMemoryResource(device) + if use_graph: + # Trim memory to zero and reset high watermarks. + gmr.trim() + # gmr.attributes.reserved_mem_high = 0 ## not working + # gmr.attributes.used_mem_high = 0 + + assert gmr.attributes.reserved_mem_current == 0 + assert gmr.attributes.reserved_mem_high == 0 + assert gmr.attributes.used_mem_current == 0 + assert gmr.attributes.used_mem_high == 0 + + # ====== Begin work sequence ====== # Begin graph capture. - gb = Device().create_graph_builder().begin_building(mode="thread_local") + if use_graph: + gb = Device().create_graph_builder().begin_building(mode="thread_local") - work_buffer = mr.allocate(NBYTES, stream=gb.stream) - launch(gb, LaunchConfig(grid=1, block=1), set_zero, int(work_buffer.handle), NBYTES) - launch(gb, LaunchConfig(grid=1, block=1), add_one, int(work_buffer.handle), NBYTES) - launch(gb, LaunchConfig(grid=1, block=1), add_one, int(work_buffer.handle), NBYTES) - target.copy_from(work_buffer, stream=gb.stream) + # Set up resources for use in the captured sequence. + cap_mr = gmr if use_graph else mr + cap_stream = gb.stream if use_graph else stream # FIXME: should use gb here, not gb.stream - # Finalize the graph. - graph = gb.end_building().complete() + # Perform or capture the work. + work_buffer = cap_mr.allocate(NBYTES, stream=cap_stream) + for kernel in [set_zero, add_one, add_one]: + launch(cap_stream, LaunchConfig(grid=1, block=1), kernel, int(work_buffer.handle), NBYTES) + tgt_buffer.copy_from(work_buffer, stream=cap_stream) + + if use_graph: + # Finalize the graph. + graph = gb.end_building().complete() + + # Upload and launch + graph.upload(stream) + graph.launch(stream) - # Upload and launch - graph.upload(stream) - graph.launch(stream) stream.sync() + # ====== End work sequence ====== + # Check the result. expected_buffer = make_scratch_buffer(device, 2, NBYTES) compare_buffer = make_scratch_buffer(device, 0, NBYTES) - compare_buffer.copy_from(target, stream=stream) + compare_buffer.copy_from(tgt_buffer, stream=stream) stream.sync() assert compare_equal_buffers(expected_buffer, compare_buffer) -# TODO: check that mr.attributes is None with mempool_enabled=False + # Check memory usage. + if use_graph: + # See that the graph-specific pool has been used. + assert gmr.attributes.reserved_mem_current > 0 + assert gmr.attributes.reserved_mem_high > 0 + assert gmr.attributes.used_mem_current > 0 + assert gmr.attributes.used_mem_high > 0 + + work_buffer.close() + + # See that the non-graph-specific pool has one allocation. + assert mr.attributes.used_mem_current == NBYTES + tgt_buffer.close() + assert mr.attributes.used_mem_current == 0 + + else: + # See that the graph-specific pool has not been used. + assert gmr.attributes.reserved_mem_current == 0 + assert gmr.attributes.reserved_mem_high == 0 + assert gmr.attributes.used_mem_current == 0 + assert gmr.attributes.used_mem_high == 0 + + # See that the non-graph-specific pool has two allocations. + assert mr.attributes.used_mem_current == 2 * NBYTES + tgt_buffer.close() + work_buffer.close() + assert mr.attributes.used_mem_current == 0 + diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 5a8f798687..e50e46dc90 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -13,6 +13,7 @@ np = None import ctypes import platform +import re import pytest from cuda.core.experimental import ( @@ -20,6 +21,7 @@ Device, DeviceMemoryResource, DeviceMemoryResourceOptions, + GraphMemoryResource, MemoryResource, VirtualMemoryResource, VirtualMemoryResourceOptions, @@ -133,6 +135,7 @@ def test_package_contents(): "MemoryResource", "DeviceMemoryResource", "DeviceMemoryResourceOptions", + "GraphMemoryResource", "IPCBufferDescriptor", "IPCAllocationHandle", "LegacyPinnedMemoryResource", @@ -306,7 +309,7 @@ def test_device_memory_resource_initialization(use_mempool, use_device_object): if use_mempool: mr = DeviceMemoryResource(device_arg) else: - mr = DeviceMemoryResource(device_arg, options={'mempool_enabled': False}) + mr = DeviceMemoryResource(device_arg, options={"mempool_enabled": False}) # Verify basic properties assert mr.device_id == device.device_id @@ -504,7 +507,6 @@ def test_vmm_allocator_rdma_unsupported_exception(): @pytest.mark.parametrize("use_mempool", [True, False]) def test_device_memory_resource(use_mempool): - device = Device() if use_mempool and not device.properties.memory_pools_supported: @@ -635,6 +637,22 @@ def test_mempool_attributes(ipc_enabled, mempool_device, property_name, expected assert value >= current_value, f"{property_name} should be >= {current_prop}" +def test_mempool_attributes_repr(mempool_device): + device = Device() + device.set_current() + mr = DeviceMemoryResource(device, options={"max_size": 2048}) + buffer1 = mr.allocate(64) + buffer2 = mr.allocate(64) + buffer1.close() + assert re.match( + r"DeviceMemoryResourceAttributes\(release_threshold=\d+, reserved_mem_current=\d+, reserved_mem_high=\d+, " + r"reuse_allow_internal_dependencies=(True|False), reuse_allow_opportunistic=(True|False), " + r"reuse_follow_event_dependencies=(True|False), used_mem_current=64, used_mem_high=128\)", + str(mr.attributes), + ) + buffer2.close() + + def test_mempool_no_attributes(): """Ensure mempool attributes cannot be accessed when memory pooling is disabled.""" device = Device() @@ -689,3 +707,11 @@ def test_strided_memory_view_refcnt(): assert av.strides[1] == 64 assert sys.getrefcount(av.strides) >= 2 +def test_graph_memory_resource_object(init_cuda): + device = Device() + gmr1 = GraphMemoryResource(device) + gmr2 = GraphMemoryResource(device) + + # These objects are interned. + assert gmr1 is gmr2 + assert gmr1 == gmr2 From 57855a1735b8e756ce82a050b75a502d8f99bada Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 5 Nov 2025 12:47:01 -0800 Subject: [PATCH 03/15] Remove mempool_enabled option now that GraphMemoryResource is ready. --- .../_memory/_device_memory_resource.pyx | 48 ++++--------------- cuda_core/tests/test_memory.py | 43 +++-------------- 2 files changed, 15 insertions(+), 76 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index 067cb5ff16..752c5f7950 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -48,15 +48,9 @@ cdef class DeviceMemoryResourceOptions: max_size : int, optional Maximum pool size. When set to 0, defaults to a system-dependent value. (Default to 0) - - mempool_enabled : bool, optional - Whether to use a memory pool. Pool-based allocations cannot be captured - in a graph but are the only ones that support sharing via IPC. - (Default to True) """ - ipc_enabled : cython.bint = False - max_size : cython.size_t = 0 - mempool_enabled : cython.bint = True + ipc_enabled: cython.bint = False + max_size : cython.size_t = 0 cdef class DeviceMemoryResourceAttributes: @@ -232,10 +226,8 @@ cdef class DeviceMemoryResource(MemoryResource): if opts is None: DMR_init_current(self, dev_id) - elif opts.mempool_enabled: - DMR_init_create_mempool(self, dev_id, opts) else: - DMR_init_no_mempool(self, dev_id, opts) + DMR_init_create(self, dev_id, opts) def __dealloc__(self): DMR_close(self) @@ -357,8 +349,8 @@ cdef class DeviceMemoryResource(MemoryResource): @property def attributes(self) -> DeviceMemoryResourceAttributes: - """Memory pool attributes or None if memory pooling is not enabled.""" - if self._attributes is None and self.is_mempool_enabled: + """Memory pool attributes.""" + if self._attributes is None: ref = weakref.ref(self) self._attributes = DeviceMemoryResourceAttributes._init(ref) return self._attributes @@ -371,8 +363,6 @@ cdef class DeviceMemoryResource(MemoryResource): @property def handle(self) -> driver.CUmemoryPool: """Handle to the underlying memory pool.""" - if self._handle == NULL: - raise RuntimeError("Memory resource has no memory pool.") return driver.CUmemoryPool((self._handle)) @property @@ -403,11 +393,6 @@ cdef class DeviceMemoryResource(MemoryResource): """ return self._ipc_data is not None and self._ipc_data._is_mapped - @property - def is_mempool_enabled(self) -> bool: - """Whether this memory resource uses a memory pool.""" - return self._handle != NULL - @property def uuid(self) -> Optional[uuid.UUID]: """ @@ -452,7 +437,7 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): )) -cdef void DMR_init_create_mempool( +cdef void DMR_init_create( DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts ): # Create a new memory pool. @@ -482,28 +467,11 @@ cdef void DMR_init_create_mempool( self._ipc_data = IPCData(alloc_handle, mapped=False) -cdef void DMR_init_no_mempool( - DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts -): - # Create mr without a memory pool. - if opts.ipc_enabled: - raise TypeError("Cannot create an IPC-enabled memory resource without " - "memory pooling enabled.") - if opts.max_size != 0: - raise TypeError("Cannot supply max_size without memory pooling enabled.") - - self._dev_id = dev_id - - cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr - if self.is_mempool_enabled: - with nogil: - HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) - else: - with nogil: - HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s)) + with nogil: + HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) cdef Buffer buf = Buffer.__new__(Buffer) buf._ptr = (devptr) buf._ptr_obj = None diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index e50e46dc90..fe47ab3b90 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -288,9 +288,8 @@ def test_buffer_dunder_dlpack_device_failure(): buffer.__dlpack_device__() -@pytest.mark.parametrize("use_mempool", [True, False]) @pytest.mark.parametrize("use_device_object", [True, False]) -def test_device_memory_resource_initialization(use_mempool, use_device_object): +def test_device_memory_resource_initialization(use_device_object): """Test that DeviceMemoryResource can be initialized successfully. This test verifies that the DeviceMemoryResource initializes properly, @@ -298,7 +297,7 @@ def test_device_memory_resource_initialization(use_mempool, use_device_object): """ device = Device() - if use_mempool and not device.properties.memory_pools_supported: + if not device.properties.memory_pools_supported: pytest.skip("Device does not support mempool operations") device.set_current() @@ -306,10 +305,7 @@ def test_device_memory_resource_initialization(use_mempool, use_device_object): # This should succeed and configure the memory pool release threshold. # The resource can be constructed from either a device or device ordinal. device_arg = device if use_device_object else device.device_id - if use_mempool: - mr = DeviceMemoryResource(device_arg) - else: - mr = DeviceMemoryResource(device_arg, options={"mempool_enabled": False}) + mr = DeviceMemoryResource(device_arg) # Verify basic properties assert mr.device_id == device.device_id @@ -324,18 +320,6 @@ def test_device_memory_resource_initialization(use_mempool, use_device_object): buffer.close() -def test_device_memory_resource_initialization_errors(): - """Test illegal construct option combinations.""" - device = Device() - device.set_current() - - with pytest.raises(TypeError, match="Cannot create an IPC-enabled memory resource without memory pooling enabled"): - DeviceMemoryResource(device, {"mempool_enabled": False, "ipc_enabled": True}) - - with pytest.raises(TypeError, match="Cannot supply max_size without memory pooling enabled"): - DeviceMemoryResource(device, {"mempool_enabled": False, "max_size": 1024}) - - def test_vmm_allocator_basic_allocation(): """Test basic VMM allocation functionality. @@ -505,22 +489,17 @@ def test_vmm_allocator_rdma_unsupported_exception(): VirtualMemoryResource(device, config=options) -@pytest.mark.parametrize("use_mempool", [True, False]) -def test_device_memory_resource(use_mempool): +def test_device_memory_resource(): device = Device() - if use_mempool and not device.properties.memory_pools_supported: + if not device.properties.memory_pools_supported: pytest.skip("Device does not support mempool operations") device.set_current() # Test basic pool creation - if use_mempool: - options = DeviceMemoryResourceOptions(max_size=POOL_SIZE) - mr = DeviceMemoryResource(device, options=options) - else: - options = DeviceMemoryResourceOptions(mempool_enabled=False) - mr = DeviceMemoryResource(device, options=options) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE) + mr = DeviceMemoryResource(device, options=options) assert mr.device_id == device.device_id assert mr.is_device_accessible assert not mr.is_host_accessible @@ -653,14 +632,6 @@ def test_mempool_attributes_repr(mempool_device): buffer2.close() -def test_mempool_no_attributes(): - """Ensure mempool attributes cannot be accessed when memory pooling is disabled.""" - device = Device() - device.set_current() - mr = DeviceMemoryResource(device, options={"mempool_enabled": False}) - assert mr.attributes is None - - def test_mempool_attributes_ownership(mempool_device): """Ensure the attributes bundle handles references correctly.""" device = mempool_device From 0375941fcd7849b7988a4da723af6c0c84da5237 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 5 Nov 2025 13:05:09 -0800 Subject: [PATCH 04/15] Add docstring and make GraphMemoryResource a singleton. --- .../_memory/_device_memory_resource.pyx | 2 +- .../cuda/core/experimental/_memory/_gmr.pyx | 29 +++++++++++++++++-- cuda_core/tests/test_memory.py | 5 ++-- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index 752c5f7950..d7c120cecf 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -133,7 +133,7 @@ cdef int DMRA_getattribute( cdef class DeviceMemoryResource(MemoryResource): """ - Create a device memory resource managing a stream-ordered memory pool. + A device memory resource managing a stream-ordered memory pool. Parameters ---------- diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx index 34999f5a54..23f09cc4c8 100644 --- a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx @@ -132,10 +132,35 @@ cdef class cyGraphMemoryResource(MemoryResource): class GraphMemoryResource(cyGraphMemoryResource): - @cache + """ + A memory resource managing the graph-specific memory pool. + + Graph-captured memory operations use a special internal memory pool, which + is a per-device singleton. This class serves as the interface to that pool. + The only supported operations are allocation, deallocation, and a limited + set of status queries. + + This memory resource should be used to allocate memory when graph capturing + is enabled. Using this when graphs are not being captured will result in a + runtime error. + + Conversely, allocating memory from a `DeviceMemoryResource` when graph + capturing is enabled results in a runtime error. + + Parameters + ---------- + device_id : int | Device + Device or Device ordinal for which a graph memory resource is obtained. + """ + def __new__(cls, device_id: int | Device): cdef int c_device_id = getattr(device_id, 'device_id', device_id) - return cyGraphMemoryResource.__new__(cls, c_device_id) + return cls._create(c_device_id) + + @classmethod + @cache + def _create(cls, int device_id): + return cyGraphMemoryResource.__new__(cls, device_id) cdef Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream): diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index fe47ab3b90..373cecb9b6 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -682,7 +682,8 @@ def test_graph_memory_resource_object(init_cuda): device = Device() gmr1 = GraphMemoryResource(device) gmr2 = GraphMemoryResource(device) + gmr3 = GraphMemoryResource(device.device_id) # These objects are interned. - assert gmr1 is gmr2 - assert gmr1 == gmr2 + assert gmr1 is gmr2 is gmr3 + assert gmr1 == gmr2 == gmr3 From 0b82b1fa6f6f8da0e002b101b7d6b64bf4d22264 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 5 Nov 2025 15:38:52 -0800 Subject: [PATCH 05/15] Move tests to a separate file. --- cuda_core/tests/test_graph.py | 118 ----------------------------- cuda_core/tests/test_graph_mem.py | 120 ++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 118 deletions(-) create mode 100644 cuda_core/tests/test_graph_mem.py diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py index 8539fa9a24..39810930fe 100644 --- a/cuda_core/tests/test_graph.py +++ b/cuda_core/tests/test_graph.py @@ -16,7 +16,6 @@ GraphBuilder, GraphCompleteOptions, GraphDebugPrintOptions, - GraphMemoryResource, LaunchConfig, LegacyPinnedMemoryResource, Program, @@ -65,30 +64,6 @@ def _common_kernels_conditional(): return mod -def _common_kernels_alloc(): - code = """ - __global__ void set_zero(char *a, size_t nbytes) { - size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - size_t stride = blockDim.x * gridDim.x; - for (size_t i = idx; i < nbytes; i += stride) { - a[i] = 0; - } - } - __global__ void add_one(char *a, size_t nbytes) { - size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - size_t stride = blockDim.x * gridDim.x; - for (size_t i = idx; i < nbytes; i += stride) { - a[i] += 1; - } - } - """ - arch = "".join(f"{i}" for i in Device().compute_capability) - program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}") - prog = Program(code, code_type="c++", options=program_options) - mod = prog.compile("cubin", name_expressions=("set_zero", "add_one")) - return mod - - def test_graph_is_building(init_cuda): gb = Device().create_graph_builder() assert gb.is_building is False @@ -777,96 +752,3 @@ def test_graph_build_mode(init_cuda): gb = Device().create_graph_builder().begin_building(mode=None) -@pytest.mark.parametrize("use_graph", [False, True]) -def test_graph_alloc(init_cuda, use_graph): - """Test graph capture with memory allocated by GraphMemoryResource.""" - device = Device() - stream = device.create_stream() - - # Get kernels. - mod = _common_kernels_alloc() - set_zero = mod.get_kernel("set_zero") - add_one = mod.get_kernel("add_one") - - # Allocate a non-graph target buffer in device memory. The work sequence - # will fill it. - NBYTES = 64 - mr = DeviceMemoryResource(device) - tgt_buffer = mr.allocate(NBYTES, stream=stream) - - # Get the memory resource for graphs. - gmr = GraphMemoryResource(device) - if use_graph: - # Trim memory to zero and reset high watermarks. - gmr.trim() - # gmr.attributes.reserved_mem_high = 0 ## not working - # gmr.attributes.used_mem_high = 0 - - assert gmr.attributes.reserved_mem_current == 0 - assert gmr.attributes.reserved_mem_high == 0 - assert gmr.attributes.used_mem_current == 0 - assert gmr.attributes.used_mem_high == 0 - - # ====== Begin work sequence ====== - - # Begin graph capture. - if use_graph: - gb = Device().create_graph_builder().begin_building(mode="thread_local") - - # Set up resources for use in the captured sequence. - cap_mr = gmr if use_graph else mr - cap_stream = gb.stream if use_graph else stream # FIXME: should use gb here, not gb.stream - - # Perform or capture the work. - work_buffer = cap_mr.allocate(NBYTES, stream=cap_stream) - for kernel in [set_zero, add_one, add_one]: - launch(cap_stream, LaunchConfig(grid=1, block=1), kernel, int(work_buffer.handle), NBYTES) - tgt_buffer.copy_from(work_buffer, stream=cap_stream) - - if use_graph: - # Finalize the graph. - graph = gb.end_building().complete() - - # Upload and launch - graph.upload(stream) - graph.launch(stream) - - stream.sync() - - # ====== End work sequence ====== - - # Check the result. - expected_buffer = make_scratch_buffer(device, 2, NBYTES) - compare_buffer = make_scratch_buffer(device, 0, NBYTES) - compare_buffer.copy_from(tgt_buffer, stream=stream) - stream.sync() - assert compare_equal_buffers(expected_buffer, compare_buffer) - - # Check memory usage. - if use_graph: - # See that the graph-specific pool has been used. - assert gmr.attributes.reserved_mem_current > 0 - assert gmr.attributes.reserved_mem_high > 0 - assert gmr.attributes.used_mem_current > 0 - assert gmr.attributes.used_mem_high > 0 - - work_buffer.close() - - # See that the non-graph-specific pool has one allocation. - assert mr.attributes.used_mem_current == NBYTES - tgt_buffer.close() - assert mr.attributes.used_mem_current == 0 - - else: - # See that the graph-specific pool has not been used. - assert gmr.attributes.reserved_mem_current == 0 - assert gmr.attributes.reserved_mem_high == 0 - assert gmr.attributes.used_mem_current == 0 - assert gmr.attributes.used_mem_high == 0 - - # See that the non-graph-specific pool has two allocations. - assert mr.attributes.used_mem_current == 2 * NBYTES - tgt_buffer.close() - work_buffer.close() - assert mr.attributes.used_mem_current == 0 - diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py new file mode 100644 index 0000000000..780fb23dd7 --- /dev/null +++ b/cuda_core/tests/test_graph_mem.py @@ -0,0 +1,120 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import pytest + +from cuda.core.experimental import ( + Device, + DeviceMemoryResource, + GraphMemoryResource, + LaunchConfig, + Program, + ProgramOptions, + launch, +) +from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return +from helpers.buffers import compare_equal_buffers, make_scratch_buffer + +def _common_kernels_alloc(): + code = """ + __global__ void set_zero(char *a, size_t nbytes) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = idx; i < nbytes; i += stride) { + a[i] = 0; + } + } + __global__ void add_one(char *a, size_t nbytes) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = idx; i < nbytes; i += stride) { + a[i] += 1; + } + } + """ + arch = "".join(f"{i}" for i in Device().compute_capability) + program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}") + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile("cubin", name_expressions=("set_zero", "add_one")) + return mod + + + +@pytest.mark.parametrize("repeat", [0,1,2]) +@pytest.mark.parametrize("use_graph", [False, True]) +def test_graph_alloc(init_cuda, use_graph, repeat): + """Test graph capture with memory allocated by GraphMemoryResource.""" + NBYTES = 64 + device = Device() + stream = device.create_stream() + dmr = DeviceMemoryResource(device) + gmr = GraphMemoryResource(device) + + out = dmr.allocate(NBYTES, stream=stream) + + # Get kernels and define the calling sequence. + mod = _common_kernels_alloc() + set_zero = mod.get_kernel("set_zero") + add_one = mod.get_kernel("add_one") + + def apply_kernels(mr, stream, out): + buffer = mr.allocate(NBYTES, stream=stream) + config = LaunchConfig(grid=1, block=1) + for kernel in [set_zero, add_one, add_one]: + launch(stream, config, kernel, buffer, NBYTES) + out.copy_from(buffer, stream=stream) + buffer.close() + + # ====== Begin work sequence ====== + if use_graph: + # Trim memory to zero and reset high watermarks. + gmr.trim() + # gmr.attributes.reserved_mem_high = 0 ## not working + # gmr.attributes.used_mem_high = 0 + + assert gmr.attributes.reserved_mem_current == 0 + # assert gmr.attributes.reserved_mem_high == 0 + assert gmr.attributes.used_mem_current == 0 + # assert gmr.attributes.used_mem_high == 0 + + # Begin graph capture. + gb = Device().create_graph_builder().begin_building(mode="thread_local") + + # Capture work. + apply_kernels(mr=gmr, stream=gb.stream, out=out) + + # Finalize the graph. + graph = gb.end_building().complete() + + # Upload and launch + graph.upload(stream) + graph.launch(stream) + else: + # Do work without graph capture. + apply_kernels(mr=dmr, stream=stream, out=out) + + stream.sync() + # ====== End work sequence ====== + + # Check the result on the host. + host_ans = make_scratch_buffer(device, 2, NBYTES) + host_tmp = make_scratch_buffer(device, 0, NBYTES) + host_tmp.copy_from(out, stream=stream) + stream.sync() + assert compare_equal_buffers(host_ans, host_tmp) + host_ans.close() + host_tmp.close() + + # # Check memory usage. + # if use_graph: + # assert dmr.attributes.used_mem_current == NBYTES + # assert gmr.attributes.used_mem_current > 0 + # out.close() + # assert gmr.attributes.used_mem_current == 0 + # else: + # assert dmr.attributes.used_mem_current == NBYTES + # assert gmr.attributes.used_mem_current == 0 + # out.close() + # assert dmr.attributes.used_mem_current == 0 + From 53b1c58d2a3dd3ae7de50d359586d0128e84b32e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 5 Nov 2025 17:15:22 -0800 Subject: [PATCH 06/15] Add errors for DeviceMemoryResource and GraphMemoryResource when graph capture state is not as expected. --- .../_memory/_device_memory_resource.pyx | 13 +++ .../cuda/core/experimental/_memory/_gmr.pyx | 12 +++ cuda_core/tests/helpers/buffers.py | 25 +++++- cuda_core/tests/test_graph_mem.py | 82 ++++++++++++++++--- 4 files changed, 120 insertions(+), 12 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index d7c120cecf..4e66f083fd 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -467,10 +467,21 @@ cdef void DMR_init_create( self._ipc_data = IPCData(alloc_handle, mapped=False) +# Raise an exception if the given stream is capturing. +# A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error. +cdef void check_not_capturing(cydriver.CUstream s) nogil: + cdef cydriver.CUstreamCaptureStatus capturing + HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing)) + if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_NONE: + raise RuntimeError("DeviceMemoryResource cannot perform memory operations on " + "a capturing stream (consider using GraphMemoryResource).") + + cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr with nogil: + check_not_capturing(s) HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) cdef Buffer buf = Buffer.__new__(Buffer) buf._ptr = (devptr) @@ -486,7 +497,9 @@ cdef void DMR_deallocate( ) noexcept: cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr + cdef cydriver.CUstreamCaptureStatus capturing with nogil: + check_not_capturing(s) HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx index 23f09cc4c8..0bf9ea45f8 100644 --- a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx @@ -163,10 +163,21 @@ class GraphMemoryResource(cyGraphMemoryResource): return cyGraphMemoryResource.__new__(cls, device_id) +# Raise an exception if the given stream is capturing. +# A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error. +cdef void check_capturing(cydriver.CUstream s) nogil: + cdef cydriver.CUstreamCaptureStatus capturing + HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing)) + if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE: + raise RuntimeError("GraphMemoryResource cannot perform memory operations on " + "a non-capturing stream.") + + cdef Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr with nogil: + check_capturing(s) HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s)) cdef Buffer buf = Buffer.__new__(Buffer) buf._ptr = (devptr) @@ -181,5 +192,6 @@ cdef void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept: cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr with nogil: + check_capturing(s) HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py index 972e83e131..e6fc0633ac 100644 --- a/cuda_core/tests/helpers/buffers.py +++ b/cuda_core/tests/helpers/buffers.py @@ -3,12 +3,18 @@ import ctypes -from cuda.core.experimental import Buffer, MemoryResource +from cuda.core.experimental import Buffer, Device, MemoryResource from cuda.core.experimental._utils.cuda_utils import driver, handle_return from . import libc -__all__ = ["DummyUnifiedMemoryResource", "PatternGen", "make_scratch_buffer", "compare_equal_buffers"] +__all__ = [ + "compare_buffer_to_constant", + "compare_equal_buffers", + "DummyUnifiedMemoryResource", + "make_scratch_buffer", + "PatternGen", +] class DummyUnifiedMemoryResource(MemoryResource): @@ -102,6 +108,7 @@ def _get_pattern_buffer(self, seed, value): def make_scratch_buffer(device, value, nbytes): """Create a unified memory buffer with the specified value.""" + assert 0 <= int(value) < 256 buffer = DummyUnifiedMemoryResource(device).allocate(nbytes) ptr = ctypes.cast(int(buffer.handle), ctypes.POINTER(ctypes.c_byte)) ctypes.memset(ptr, value & 0xFF, nbytes) @@ -115,3 +122,17 @@ def compare_equal_buffers(buffer1, buffer2): ptr1 = ctypes.cast(int(buffer1.handle), ctypes.POINTER(ctypes.c_byte)) ptr2 = ctypes.cast(int(buffer2.handle), ctypes.POINTER(ctypes.c_byte)) return libc.memcmp(ptr1, ptr2, buffer1.size) == 0 + + +def compare_buffer_to_constant(buffer, value): + device_id = buffer.memory_resource.device_id + device = Device(device_id) + stream = device.create_stream() + expected = make_scratch_buffer(device, value, buffer.size) + tmp = make_scratch_buffer(device, 0, buffer.size) + tmp.copy_from(buffer, stream=stream) + stream.sync() + result = compare_equal_buffers(expected, tmp) + expected.close() + tmp.close() + return result diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py index 780fb23dd7..735e8490b8 100644 --- a/cuda_core/tests/test_graph_mem.py +++ b/cuda_core/tests/test_graph_mem.py @@ -14,7 +14,7 @@ launch, ) from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return -from helpers.buffers import compare_equal_buffers, make_scratch_buffer +from helpers.buffers import compare_buffer_to_constant def _common_kernels_alloc(): code = """ @@ -40,8 +40,7 @@ def _common_kernels_alloc(): return mod - -@pytest.mark.parametrize("repeat", [0,1,2]) +@pytest.mark.parametrize("repeat", range(3)) @pytest.mark.parametrize("use_graph", [False, True]) def test_graph_alloc(init_cuda, use_graph, repeat): """Test graph capture with memory allocated by GraphMemoryResource.""" @@ -80,6 +79,8 @@ def apply_kernels(mr, stream, out): # Begin graph capture. gb = Device().create_graph_builder().begin_building(mode="thread_local") + # import code + # code.interact(local=dict(globals(), **locals())) # Capture work. apply_kernels(mr=gmr, stream=gb.stream, out=out) @@ -98,13 +99,7 @@ def apply_kernels(mr, stream, out): # ====== End work sequence ====== # Check the result on the host. - host_ans = make_scratch_buffer(device, 2, NBYTES) - host_tmp = make_scratch_buffer(device, 0, NBYTES) - host_tmp.copy_from(out, stream=stream) - stream.sync() - assert compare_equal_buffers(host_ans, host_tmp) - host_ans.close() - host_tmp.close() + assert compare_buffer_to_constant(out, 2) # # Check memory usage. # if use_graph: @@ -118,3 +113,70 @@ def apply_kernels(mr, stream, out): # out.close() # assert dmr.attributes.used_mem_current == 0 + +@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) +def test_gmr_check_capture(init_cuda, mode): + """ + Test expected errors (and non-errors) using GraphMemoryResource with graph + capture. + """ + device = Device() + stream = device.create_stream() + gmr = GraphMemoryResource(device) + + # Not capturing + with pytest.raises(RuntimeError, + match=r"GraphMemoryResource cannot perform memory operations on a " + r"non-capturing stream\." + ): + gmr.allocate(1, stream=stream) + + # Capturing + gb = device.create_graph_builder().begin_building(mode=mode) + gmr.allocate(1, stream=gb.stream).close() # no error + gb.end_building().complete() + + +@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) +def test_mr_check_capture(init_cuda, mode): + """ + Test expected errors (and non-errors) using DeviceMemoryResource with graph + capture. + """ + device = Device() + stream = device.create_stream() + dmr = DeviceMemoryResource(device) + + # Not capturing + dmr.allocate(1, stream=stream).close() # no error + + # Capturing + gb = device.create_graph_builder().begin_building(mode=mode) + with pytest.raises(RuntimeError, + match=r"DeviceMemoryResource cannot perform memory operations on a capturing " + r"stream \(consider using GraphMemoryResource\)\." + ): + dmr.allocate(1, stream=gb.stream) + gb.end_building().complete() + + +# This tests causes unraisable errors at shutdown. +# @pytest.mark.parametrize("mode", ["global", "thread_local"]) +# def test_cross_stream_capture_error(init_cuda, mode): +# """ +# Test errors related to unsafe API calls in global or thread_local capture +# mode. +# """ +# # When graph capturing is turned on for an unrelated stream, the driver +# # raises an error. Not sure how to detect this. +# from cuda.core.experimental._utils.cuda_utils import CUDAError # FIXME +# device = Device() +# stream = device.create_stream() +# dmr = DeviceMemoryResource(device) +# +# with pytest.raises(RuntimeError, match="Build process encountered an error"): +# gb = device.create_graph_builder().begin_building(mode) +# with pytest.raises(CUDAError, match="CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED"): +# dmr.allocate(1, stream=stream) # not targeting gb.stream +# gb.end_building().complete() + From 1b8409b9de8322839fd2f718c455e41a0bf87fd3 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 7 Nov 2025 12:32:41 -0800 Subject: [PATCH 07/15] Add tests for attributes and memory allocation escaping graphs. --- .../core/experimental/_memory/_buffer.pyx | 2 +- .../_memory/_device_memory_resource.pyx | 10 +- .../cuda/core/experimental/_memory/_gmr.pyx | 24 +- .../core/experimental/_utils/cuda_utils.pxd | 2 +- cuda_core/tests/helpers/buffers.py | 10 +- cuda_core/tests/test_graph_mem.py | 216 ++++++++++++------ 6 files changed, 175 insertions(+), 89 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index 2251272742..ef58998b52 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -274,7 +274,7 @@ cdef class Buffer: # Buffer Implementation # --------------------- -cdef Buffer_close(Buffer self, stream): +cdef inline void Buffer_close(Buffer self, stream): cdef Stream s if self._ptr and self._memory_resource is not None: if stream is None: diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index 4e66f083fd..ab5dc1e600 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -469,7 +469,7 @@ cdef void DMR_init_create( # Raise an exception if the given stream is capturing. # A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error. -cdef void check_not_capturing(cydriver.CUstream s) nogil: +cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil: cdef cydriver.CUstreamCaptureStatus capturing HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing)) if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_NONE: @@ -477,7 +477,7 @@ cdef void check_not_capturing(cydriver.CUstream s) nogil: "a capturing stream (consider using GraphMemoryResource).") -cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): +cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr with nogil: @@ -492,18 +492,17 @@ cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): return buf -cdef void DMR_deallocate( +cdef inline void DMR_deallocate( DeviceMemoryResource self, uintptr_t ptr, size_t size, Stream stream ) noexcept: cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr cdef cydriver.CUstreamCaptureStatus capturing with nogil: - check_not_capturing(s) HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) -cdef DMR_close(DeviceMemoryResource self): +cdef inline DMR_close(DeviceMemoryResource self): if self._handle == NULL: return @@ -517,3 +516,4 @@ cdef DMR_close(DeviceMemoryResource self): self._attributes = None self._mempool_owned = False self._ipc_data = None + diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx index 0bf9ea45f8..7c717c051e 100644 --- a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx @@ -4,7 +4,7 @@ from __future__ import annotations -from libc.stdint cimport uintptr_t, intptr_t +from libc.stdint cimport uintptr_t, intptr_t, uint64_t from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource @@ -71,8 +71,10 @@ cdef GMRA_mem_attribute(property_type: type, settable : bool = False): return property_type(value) if _settable: - def fset(GraphMemoryResourceAttributes self, value: int): - GMRA_setattribute(self._dev_id, attr_enum, value) + def fset(GraphMemoryResourceAttributes self, uint64_t value): + if value != 0: + raise AttributeError(f"Attribute {stub.__name__!r} may only be set to zero (got {value}).") + GMRA_setattribute(self._dev_id, attr_enum) else: fset = None @@ -80,16 +82,17 @@ cdef GMRA_mem_attribute(property_type: type, settable : bool = False): return decorator -cdef int GMRA_getattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum): - cdef int value +cdef inline uint64_t GMRA_getattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum): + cdef uint64_t value with nogil: HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(device_id, attr_enum, &value)) return value -cdef int GMRA_setattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum, int value): +cdef inline void GMRA_setattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum): + cdef uint64_t zero = 0 with nogil: - HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(device_id, attr_enum, &value)) + HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(device_id, attr_enum, &zero)) cdef class cyGraphMemoryResource(MemoryResource): @@ -165,7 +168,7 @@ class GraphMemoryResource(cyGraphMemoryResource): # Raise an exception if the given stream is capturing. # A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error. -cdef void check_capturing(cydriver.CUstream s) nogil: +cdef inline int check_capturing(cydriver.CUstream s) except?-1 nogil: cdef cydriver.CUstreamCaptureStatus capturing HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing)) if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE: @@ -173,7 +176,7 @@ cdef void check_capturing(cydriver.CUstream s) nogil: "a non-capturing stream.") -cdef Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream): +cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr with nogil: @@ -188,10 +191,9 @@ cdef Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream) return buf -cdef void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept: +cdef inline void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept: cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr with nogil: - check_capturing(s) HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index ad6da14dae..0e75202498 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -14,7 +14,7 @@ ctypedef fused supported_error_type: # mimic CU_DEVICE_INVALID -cdef cydriver.CUcontext CU_CONTEXT_INVALID = (-2) +cdef const cydriver.CUcontext CU_CONTEXT_INVALID = (-2) cdef cydriver.CUdevice get_device_from_ctx( diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py index e6fc0633ac..b4d769eab3 100644 --- a/cuda_core/tests/helpers/buffers.py +++ b/cuda_core/tests/helpers/buffers.py @@ -108,13 +108,17 @@ def _get_pattern_buffer(self, seed, value): def make_scratch_buffer(device, value, nbytes): """Create a unified memory buffer with the specified value.""" - assert 0 <= int(value) < 256 buffer = DummyUnifiedMemoryResource(device).allocate(nbytes) - ptr = ctypes.cast(int(buffer.handle), ctypes.POINTER(ctypes.c_byte)) - ctypes.memset(ptr, value & 0xFF, nbytes) + set_buffer(buffer, value) return buffer +def set_buffer(buffer, value): + assert 0 <= int(value) < 256 + ptr = ctypes.cast(int(buffer.handle), ctypes.POINTER(ctypes.c_byte)) + ctypes.memset(ptr, value & 0xFF, buffer.size) + + def compare_equal_buffers(buffer1, buffer2): """Compare the contents of two host-accessible buffers for bitwise equality.""" if buffer1.size != buffer2.size: diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py index 735e8490b8..b5abcd28d8 100644 --- a/cuda_core/tests/test_graph_mem.py +++ b/cuda_core/tests/test_graph_mem.py @@ -7,14 +7,15 @@ from cuda.core.experimental import ( Device, DeviceMemoryResource, + GraphCompleteOptions, GraphMemoryResource, + launch, LaunchConfig, Program, ProgramOptions, - launch, ) from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return -from helpers.buffers import compare_buffer_to_constant +from helpers.buffers import compare_buffer_to_constant, make_scratch_buffer, set_buffer def _common_kernels_alloc(): code = """ @@ -40,16 +41,47 @@ def _common_kernels_alloc(): return mod -@pytest.mark.parametrize("repeat", range(3)) -@pytest.mark.parametrize("use_graph", [False, True]) -def test_graph_alloc(init_cuda, use_graph, repeat): - """Test graph capture with memory allocated by GraphMemoryResource.""" +class GraphMemoryTestManager: + """ + Manages changes to the state of the graph memory system, for testing. + """ + def __init__(self, gmr, stream, mode=None): + self.device = Device(gmr.device_id) + self.gmr = gmr + self.stream = stream + self.mode = "relaxed" if mode is None else mode + + def reset(self): + """Trim unused graph memory and reset usage statistics.""" + self.gmr.trim() + self.gmr.attributes.reserved_mem_high = 0 + self.gmr.attributes.used_mem_high = 0 + + def alloc(self, num, nbytes): + """Allocate num buffers of size nbytes from graph memory.""" + gb = self.device.create_graph_builder().begin_building(self.mode) + buffers = [self.gmr.allocate(nbytes, stream=gb.stream) for _ in range(num)] + graph = gb.end_building().complete() + graph.upload(self.stream) + graph.launch(self.stream) + self.stream.sync() + return buffers + + def free(self, buffers): + """Free graph memory buffers.""" + for buffer in buffers: + buffer.close(stream=self.stream) + self.stream.sync() + + +@pytest.mark.parametrize("mode", ["no_graph", "global", "thread_local", "relaxed"]) +def test_graph_alloc(init_cuda, mode): + """Test basic graph capture with memory allocated and deallocated by GraphMemoryResource.""" NBYTES = 64 device = Device() stream = device.create_stream() dmr = DeviceMemoryResource(device) gmr = GraphMemoryResource(device) - out = dmr.allocate(NBYTES, stream=stream) # Get kernels and define the calling sequence. @@ -65,57 +97,126 @@ def apply_kernels(mr, stream, out): out.copy_from(buffer, stream=stream) buffer.close() - # ====== Begin work sequence ====== - if use_graph: - # Trim memory to zero and reset high watermarks. - gmr.trim() - # gmr.attributes.reserved_mem_high = 0 ## not working - # gmr.attributes.used_mem_high = 0 - - assert gmr.attributes.reserved_mem_current == 0 - # assert gmr.attributes.reserved_mem_high == 0 - assert gmr.attributes.used_mem_current == 0 - # assert gmr.attributes.used_mem_high == 0 - - # Begin graph capture. - gb = Device().create_graph_builder().begin_building(mode="thread_local") - # import code - # code.interact(local=dict(globals(), **locals())) - - # Capture work. + # Apply kernels, with or without graph capture. + if mode == "no_graph": + # Do work without graph capture. + apply_kernels(mr=dmr, stream=stream, out=out) + else: + # Capture work, then upload and launch. + gb = device.create_graph_builder().begin_building(mode) apply_kernels(mr=gmr, stream=gb.stream, out=out) - - # Finalize the graph. graph = gb.end_building().complete() - - # Upload and launch graph.upload(stream) graph.launch(stream) - else: - # Do work without graph capture. - apply_kernels(mr=dmr, stream=stream, out=out) stream.sync() - # ====== End work sequence ====== # Check the result on the host. assert compare_buffer_to_constant(out, 2) - # # Check memory usage. - # if use_graph: - # assert dmr.attributes.used_mem_current == NBYTES - # assert gmr.attributes.used_mem_current > 0 - # out.close() - # assert gmr.attributes.used_mem_current == 0 - # else: - # assert dmr.attributes.used_mem_current == NBYTES - # assert gmr.attributes.used_mem_current == 0 - # out.close() - # assert dmr.attributes.used_mem_current == 0 + +@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) +def test_graph_alloc_with_output(init_cuda, mode): + """Test for memory allocated in a graph being used outside the graph.""" + NBYTES = 64 + device = Device() + stream = device.create_stream() + gmr = GraphMemoryResource(device) + + # Get kernels and define the calling sequence. + mod = _common_kernels_alloc() + set_zero = mod.get_kernel("set_zero") + add_one = mod.get_kernel("add_one") + + # Make an input of 0s. + in_ = make_scratch_buffer(device, 0, NBYTES) + + # Construct a graph to copy and increment the input. It returns a new + # buffer allocated within the graph. The auto_free_on_launch option + # is required to properly use the output buffer. + gb = device.create_graph_builder().begin_building(mode) + out = gmr.allocate(NBYTES, gb.stream) + out.copy_from(in_, stream=gb.stream) + launch(gb, LaunchConfig(grid=1, block=1), add_one, out, NBYTES) + options = GraphCompleteOptions(auto_free_on_launch=True) + graph = gb.end_building().complete(options) + + # Launch the graph. The output buffer is allocated and set to one. + graph.upload(stream) + graph.launch(stream) + stream.sync() + assert compare_buffer_to_constant(out, 1) + + # Update the input buffer and rerun the graph. + set_buffer(in_, 5) + graph.upload(stream) + graph.launch(stream) + stream.sync() + assert compare_buffer_to_constant(out, 6) @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) -def test_gmr_check_capture(init_cuda, mode): +def test_graph_mem_set_attributes(init_cuda, mode): + device = Device() + stream = device.create_stream() + gmr = GraphMemoryResource(device) + mman = GraphMemoryTestManager(gmr, stream, mode) + + # Make an allocation and obvserve usage. + buffer = mman.alloc(1, 1024) + assert gmr.attributes.reserved_mem_current > 0 + assert gmr.attributes.reserved_mem_high > 0 + assert gmr.attributes.used_mem_current > 0 + assert gmr.attributes.used_mem_high > 0 + + # Incorrect attribute usage. + with pytest.raises(AttributeError, + match=r"property 'reserved_mem_current' .* has no setter" + ): + gmr.attributes.reserved_mem_current = 0 + + with pytest.raises(AttributeError, + match=r"Attribute 'reserved_mem_high' may only be set to zero \(got 1\)\." + ): + gmr.attributes.reserved_mem_high = 1 + + with pytest.raises(AttributeError, + match=r"property 'used_mem_current' .* has no setter" + ): + gmr.attributes.used_mem_current = 0 + + with pytest.raises(AttributeError, + match=r"Attribute 'used_mem_high' may only be set to zero \(got 1\)\." + ): + gmr.attributes.used_mem_high = 1 + + # Free memory, but usage is not reduced yet. + mman.free(buffer) + assert gmr.attributes.reserved_mem_current > 0 + assert gmr.attributes.reserved_mem_high > 0 + assert gmr.attributes.used_mem_current > 0 + assert gmr.attributes.used_mem_high > 0 + + gmr.trim() + + # The high-water marks remain after free and trim. + assert gmr.attributes.reserved_mem_current == 0 + assert gmr.attributes.reserved_mem_high > 0 + assert gmr.attributes.used_mem_current == 0 + assert gmr.attributes.used_mem_high > 0 + + # Reset the high-water marks. + gmr.attributes.reserved_mem_high = 0 + gmr.attributes.used_mem_high = 0 + + assert gmr.attributes.reserved_mem_high == 0 + assert gmr.attributes.used_mem_high == 0 + + mman.reset() + + +@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) +def test_gmr_check_capture_state(init_cuda, mode): """ Test expected errors (and non-errors) using GraphMemoryResource with graph capture. @@ -133,12 +234,12 @@ def test_gmr_check_capture(init_cuda, mode): # Capturing gb = device.create_graph_builder().begin_building(mode=mode) - gmr.allocate(1, stream=gb.stream).close() # no error + buffer = gmr.allocate(1, stream=gb.stream).close() # no error gb.end_building().complete() @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) -def test_mr_check_capture(init_cuda, mode): +def test_dmr_check_capture_state(init_cuda, mode): """ Test expected errors (and non-errors) using DeviceMemoryResource with graph capture. @@ -159,24 +260,3 @@ def test_mr_check_capture(init_cuda, mode): dmr.allocate(1, stream=gb.stream) gb.end_building().complete() - -# This tests causes unraisable errors at shutdown. -# @pytest.mark.parametrize("mode", ["global", "thread_local"]) -# def test_cross_stream_capture_error(init_cuda, mode): -# """ -# Test errors related to unsafe API calls in global or thread_local capture -# mode. -# """ -# # When graph capturing is turned on for an unrelated stream, the driver -# # raises an error. Not sure how to detect this. -# from cuda.core.experimental._utils.cuda_utils import CUDAError # FIXME -# device = Device() -# stream = device.create_stream() -# dmr = DeviceMemoryResource(device) -# -# with pytest.raises(RuntimeError, match="Build process encountered an error"): -# gb = device.create_graph_builder().begin_building(mode) -# with pytest.raises(CUDAError, match="CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED"): -# dmr.allocate(1, stream=stream) # not targeting gb.stream -# gb.end_building().complete() - From e9422b23f60a155da5c520303c4fb6b49304d0d9 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 10 Nov 2025 11:32:25 -0800 Subject: [PATCH 08/15] Simplify logic for converting IsStreamT arguments. --- .../cuda/core/experimental/_launcher.pyx | 20 ++---- cuda_core/cuda/core/experimental/_stream.pxd | 3 - cuda_core/cuda/core/experimental/_stream.pyx | 70 +++++++++++-------- 3 files changed, 43 insertions(+), 50 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/experimental/_launcher.pyx index a06d885ff8..516fc87a27 100644 --- a/cuda_core/cuda/core/experimental/_launcher.pyx +++ b/cuda_core/cuda/core/experimental/_launcher.pyx @@ -4,8 +4,6 @@ from libc.stdint cimport uintptr_t -from cuda.core.experimental._stream cimport _try_to_get_stream_ptr - from typing import Union from cuda.core.experimental._kernel_arg_handler import ParamHolder @@ -58,17 +56,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne launching kernel. """ - if stream is None: - raise ValueError("stream cannot be None, stream must either be a Stream object or support __cuda_stream__") - try: - stream_handle = stream.handle - except AttributeError: - try: - stream_handle = driver.CUstream((_try_to_get_stream_ptr(stream))) - except Exception: - raise ValueError( - f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})" - ) from None + stream = Stream._init(stream) assert_type(kernel, Kernel) _lazy_init() config = check_or_create_options(LaunchConfig, config, "launch config") @@ -85,7 +73,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne # rich. if _use_ex: drv_cfg = _to_native_launch_config(config) - drv_cfg.hStream = stream_handle + drv_cfg.hStream = stream.handle if config.cooperative_launch: _check_cooperative_launch(kernel, config, stream) handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0)) @@ -93,12 +81,12 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne # TODO: check if config has any unsupported attrs handle_return( driver.cuLaunchKernel( - int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream_handle, args_ptr, 0 + int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0 ) ) -def _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream): +cdef _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream): dev = stream.device num_sm = dev.properties.multiprocessor_count max_grid_size = ( diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index 8f382e5d01..68a410d1ed 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -5,9 +5,6 @@ from cuda.bindings cimport cydriver -cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except* - - cdef class Stream: cdef: diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index cdc4651bda..b177ea217d 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -62,36 +62,6 @@ class IsStreamT(Protocol): ... -cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except*: - try: - cuda_stream_attr = obj.__cuda_stream__ - except AttributeError: - raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") from None - - if callable(cuda_stream_attr): - info = cuda_stream_attr() - else: - info = cuda_stream_attr - warnings.simplefilter("once", DeprecationWarning) - warnings.warn( - "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method", - stacklevel=3, - category=DeprecationWarning, - ) - - try: - len_info = len(info) - except TypeError as e: - raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}") from e - if len_info != 2: - raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements") - if info[0] != 0: - raise RuntimeError( - f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" - ) - return (info[1]) - - cdef class Stream: """Represent a queue of GPU operations that are executed in a specific order. @@ -139,12 +109,15 @@ cdef class Stream: @classmethod def _init(cls, obj: Optional[IsStreamT] = None, options=None, device_id: int = None): + if isinstance(obj, Stream): + return obj + cdef Stream self = Stream.__new__(cls) if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - self._handle = _try_to_get_stream_ptr(obj) + self._handle = _handle_from_stream_t(obj) # TODO: check if obj is created under the current context/device self._owner = obj return self @@ -445,3 +418,38 @@ cpdef Stream default_stream(): return C_PER_THREAD_DEFAULT_STREAM else: return C_LEGACY_DEFAULT_STREAM + + +cdef cydriver.CUstream _handle_from_stream_t(obj) except*: + if isinstance(obj, Stream): + return (obj.handle) + + try: + cuda_stream_attr = obj.__cuda_stream__ + except AttributeError: + raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") from None + + if callable(cuda_stream_attr): + info = cuda_stream_attr() + else: + info = cuda_stream_attr + warnings.simplefilter("once", DeprecationWarning) + warnings.warn( + "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method", + stacklevel=3, + category=DeprecationWarning, + ) + + try: + len_info = len(info) + except TypeError as e: + raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}") from e + if len_info != 2: + raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements") + if info[0] != 0: + raise RuntimeError( + f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" + ) + return (info[1]) + + From 3e21b9b4dc9a74a1bc1883665f9bcb80a89aa098 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 10 Nov 2025 12:10:41 -0800 Subject: [PATCH 09/15] Standardize Stream arguments to IsStreamT. Update Buffer and MemoryResource methods to take any kind of stream-providing object. Update graph allocation tests. --- cuda_core/cuda/core/experimental/_device.pyx | 8 +++--- .../cuda/core/experimental/_launcher.pyx | 4 +-- .../core/experimental/_memory/_buffer.pyx | 25 +++++++++---------- .../_memory/_device_memory_resource.pyx | 16 ++++++------ .../cuda/core/experimental/_memory/_gmr.pyx | 21 +++++++++++----- cuda_core/tests/test_graph_mem.py | 12 ++++----- 6 files changed, 47 insertions(+), 39 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index bc6167793a..833c74c0a8 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -9,7 +9,7 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN import threading -from typing import Union, TYPE_CHECKING +from typing import Optional, TYPE_CHECKING, Union from cuda.core.experimental._context import Context, ContextOptions from cuda.core.experimental._event import Event, EventOptions @@ -1242,7 +1242,7 @@ class Device: """ raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") - def create_stream(self, obj: IsStreamT | None = None, options: StreamOptions | None = None) -> Stream: + def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions | None = None) -> Stream: """Create a Stream object. New stream objects can be created in two different ways: @@ -1295,7 +1295,7 @@ class Device: ctx = self._get_current_context() return Event._init(self._id, ctx, options, True) - def allocate(self, size, stream: Stream | None = None) -> Buffer: + def allocate(self, size, stream: Optional[IsStreamT] = None) -> Buffer: """Allocate device memory from a specified stream. Allocates device memory of `size` bytes on the specified `stream` @@ -1311,7 +1311,7 @@ class Device: ---------- size : int Number of bytes to allocate. - stream : :obj:`~_stream.Stream`, optional + stream : :obj:`~_stream.IsStreamT`, optional The stream establishing the stream ordering semantic. Default value of `None` uses default stream. diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/experimental/_launcher.pyx index 516fc87a27..939c0fa803 100644 --- a/cuda_core/cuda/core/experimental/_launcher.pyx +++ b/cuda_core/cuda/core/experimental/_launcher.pyx @@ -37,13 +37,13 @@ def _lazy_init(): _inited = True -def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kernel, *kernel_args): +def launch(stream: IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args): """Launches a :obj:`~_module.Kernel` object with launch-time configuration. Parameters ---------- - stream : :obj:`~_stream.Stream` + stream : :obj:`~_stream.IsStreamT` The stream establishing the stream ordering semantic of a launch. config : :obj:`LaunchConfig` diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index ef58998b52..f18d5ec02d 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -15,9 +15,10 @@ from cuda.core.experimental._utils.cuda_utils cimport ( ) import abc -from typing import TypeVar, Union +from typing import Optional, TypeVar, Union from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule +from cuda.core.experimental._stream import IsStreamT from cuda.core.experimental._utils.cuda_utils import driver __all__ = ['Buffer', 'MemoryResource'] @@ -116,7 +117,7 @@ cdef class Buffer: """ Buffer_close(self, stream) - def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: + def copy_to(self, dst: Buffer = None, *, stream: IsStreamT) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. Copies the data from this buffer to the provided dst buffer. @@ -127,13 +128,12 @@ cdef class Buffer: ---------- dst : :obj:`~_memory.Buffer` Source buffer to copy data from - stream : Stream + stream : IsStreamT Keyword argument specifying the stream for the asynchronous copy """ - if stream is None: - raise ValueError("stream must be provided") + stream = Stream._init(stream) cdef size_t src_size = self._size @@ -152,20 +152,19 @@ cdef class Buffer: raise_if_driver_error(err) return dst - def copy_from(self, src: Buffer, *, stream: Stream): + def copy_from(self, src: Buffer, *, stream: IsStreamT): """Copy from the src buffer to this buffer asynchronously on the given stream. Parameters ---------- src : :obj:`~_memory.Buffer` Source buffer to copy data from - stream : Stream + stream : IsStreamT Keyword argument specifying the stream for the asynchronous copy """ - if stream is None: - raise ValueError("stream must be provided") + stream = Stream._init(stream) cdef size_t dst_size = self._size cdef size_t src_size = src._size @@ -305,14 +304,14 @@ cdef class MemoryResource: """ @abc.abstractmethod - def allocate(self, size_t size, stream: Stream = None) -> Buffer: + def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer: """Allocate a buffer of the requested size. Parameters ---------- size : int The size of the buffer to allocate, in bytes. - stream : Stream, optional + stream : IsStreamT, optional The stream on which to perform the allocation asynchronously. If None, it is up to each memory resource implementation to decide and document the behavior. @@ -326,7 +325,7 @@ cdef class MemoryResource: ... @abc.abstractmethod - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None): """Deallocate a buffer previously allocated by this resource. Parameters @@ -335,7 +334,7 @@ cdef class MemoryResource: The pointer or handle to the buffer to deallocate. size : int The size of the buffer to deallocate, in bytes. - stream : Stream, optional + stream : IsStreamT, optional The stream on which to perform the deallocation asynchronously. If None, it is up to each memory resource implementation to decide and document the behavior. diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index ab5dc1e600..9ce3e5be39 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -308,14 +308,14 @@ cdef class DeviceMemoryResource(MemoryResource): raise RuntimeError("Imported memory resource cannot be exported") return self._ipc_data._alloc_handle - def allocate(self, size_t size, stream: Stream = None) -> Buffer: + def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer: """Allocate a buffer of the requested size. Parameters ---------- size : int The size of the buffer to allocate, in bytes. - stream : Stream, optional + stream : IsStreamT, optional The stream on which to perform the allocation asynchronously. If None, an internal stream is used. @@ -327,11 +327,10 @@ cdef class DeviceMemoryResource(MemoryResource): """ if self.is_mapped: raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") - if stream is None: - stream = default_stream() - return DMR_allocate(self, size, stream) + stream = Stream._init(stream) if stream is not None else default_stream() + return DMR_allocate(self, size, stream) - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None): """Deallocate a buffer previously allocated by this resource. Parameters @@ -340,12 +339,13 @@ cdef class DeviceMemoryResource(MemoryResource): The pointer or handle to the buffer to deallocate. size : int The size of the buffer to deallocate, in bytes. - stream : Stream, optional + stream : IsStreamT, optional The stream on which to perform the deallocation asynchronously. If the buffer is deallocated without an explicit stream, the allocation stream is used. """ - DMR_deallocate(self, ptr, size, stream) + stream = Stream._init(stream) if stream is not None else default_stream() + DMR_deallocate(self, ptr, size, stream) @property def attributes(self) -> DeviceMemoryResourceAttributes: diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx index 7c717c051e..09a561fdde 100644 --- a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_gmr.pyx @@ -8,11 +8,11 @@ from libc.stdint cimport uintptr_t, intptr_t, uint64_t from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource -from cuda.core.experimental._stream cimport Stream +from cuda.core.experimental._stream cimport default_stream, Stream from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN from functools import cache -from typing import TYPE_CHECKING +from typing import Optional, TYPE_CHECKING from cuda.core.experimental._utils.cuda_utils import driver @@ -99,13 +99,22 @@ cdef class cyGraphMemoryResource(MemoryResource): def __cinit__(self, int device_id): self._dev_id = device_id - def allocate(self, size_t size, Stream stream = None) -> Buffer: - return GMR_allocate(self, size, stream) + def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer: + """ + Allocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`. + """ + stream = Stream._init(stream) if stream is not None else default_stream() + return GMR_allocate(self, size, stream) - def deallocate(self, ptr: DevicePointerT, size_t size, Stream stream = None): - return GMR_deallocate(ptr, size, stream) + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None): + """ + Deallocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`. + """ + stream = Stream._init(stream) if stream is not None else default_stream() + return GMR_deallocate(ptr, size, stream) def close(self): + """No operation (provided for compatibility).""" pass def trim(self): diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py index b5abcd28d8..6e4bf907ea 100644 --- a/cuda_core/tests/test_graph_mem.py +++ b/cuda_core/tests/test_graph_mem.py @@ -60,7 +60,7 @@ def reset(self): def alloc(self, num, nbytes): """Allocate num buffers of size nbytes from graph memory.""" gb = self.device.create_graph_builder().begin_building(self.mode) - buffers = [self.gmr.allocate(nbytes, stream=gb.stream) for _ in range(num)] + buffers = [self.gmr.allocate(nbytes, stream=gb) for _ in range(num)] graph = gb.end_building().complete() graph.upload(self.stream) graph.launch(self.stream) @@ -104,7 +104,7 @@ def apply_kernels(mr, stream, out): else: # Capture work, then upload and launch. gb = device.create_graph_builder().begin_building(mode) - apply_kernels(mr=gmr, stream=gb.stream, out=out) + apply_kernels(mr=gmr, stream=gb, out=out) graph = gb.end_building().complete() graph.upload(stream) graph.launch(stream) @@ -135,8 +135,8 @@ def test_graph_alloc_with_output(init_cuda, mode): # buffer allocated within the graph. The auto_free_on_launch option # is required to properly use the output buffer. gb = device.create_graph_builder().begin_building(mode) - out = gmr.allocate(NBYTES, gb.stream) - out.copy_from(in_, stream=gb.stream) + out = gmr.allocate(NBYTES, gb) + out.copy_from(in_, stream=gb) launch(gb, LaunchConfig(grid=1, block=1), add_one, out, NBYTES) options = GraphCompleteOptions(auto_free_on_launch=True) graph = gb.end_building().complete(options) @@ -234,7 +234,7 @@ def test_gmr_check_capture_state(init_cuda, mode): # Capturing gb = device.create_graph_builder().begin_building(mode=mode) - buffer = gmr.allocate(1, stream=gb.stream).close() # no error + buffer = gmr.allocate(1, stream=gb).close() # no error gb.end_building().complete() @@ -257,6 +257,6 @@ def test_dmr_check_capture_state(init_cuda, mode): match=r"DeviceMemoryResource cannot perform memory operations on a capturing " r"stream \(consider using GraphMemoryResource\)\." ): - dmr.allocate(1, stream=gb.stream) + dmr.allocate(1, stream=gb) gb.end_building().complete() From 98ccfc769d9984e83b9cbef2e7e3edb44445d151 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 10 Nov 2025 13:10:55 -0800 Subject: [PATCH 10/15] Add tests for IsStreamT conversions. --- cuda_core/tests/helpers/misc.py | 26 ++++++++++++++++++++++++++ cuda_core/tests/test_launcher.py | 21 ++++++++++++++++----- cuda_core/tests/test_memory.py | 31 +++++++++++++++++++++---------- cuda_core/tests/test_stream.py | 6 +++++- 4 files changed, 68 insertions(+), 16 deletions(-) create mode 100644 cuda_core/tests/helpers/misc.py diff --git a/cuda_core/tests/helpers/misc.py b/cuda_core/tests/helpers/misc.py new file mode 100644 index 0000000000..6389e2a3a5 --- /dev/null +++ b/cuda_core/tests/helpers/misc.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import Stream + + +class StreamWrapper: + """ + A wrapper around Stream for testing IsStreamT conversions. + """ + + def __init__(self, stream: Stream): + self._stream = stream + + def __cuda_stream__(self): + return self._stream.__cuda_stream__() + + def close(self): + self._stream.close() + + @property + def handle(self): + return self._stream.handle + + def sync(self): + return self._stream.sync() diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index a951fc418a..138ea24aa8 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -24,6 +24,7 @@ from cuda.core.experimental._utils.cuda_utils import CUDAError from conftest import skipif_need_cuda_headers +from helpers.misc import StreamWrapper def test_launch_config_init(init_cuda): @@ -179,9 +180,10 @@ def test_launch_invalid_values(init_cuda): ) +@pytest.mark.parametrize("wrap_stream", [True, False]) @pytest.mark.parametrize("python_type, cpp_type, init_value", PARAMS) @pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+") -def test_launch_scalar_argument(python_type, cpp_type, init_value): +def test_launch_scalar_argument(python_type, cpp_type, init_value, wrap_stream): dev = Device() dev.set_current() @@ -219,19 +221,25 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value): ker = mod.get_kernel(ker_name) # Launch with 1 thread + stream = dev.default_stream + if wrap_stream: + stream = StreamWrapper(stream) config = LaunchConfig(grid=1, block=1) - launch(dev.default_stream, config, ker, arr.ctypes.data, scalar) - dev.default_stream.sync() + launch(stream, config, ker, arr.ctypes.data, scalar) + stream.sync() # Check result assert arr[0] == init_value, f"Expected {init_value}, got {arr[0]}" @skipif_need_cuda_headers # cg -def test_cooperative_launch(): +@pytest.mark.parametrize("wrap_stream", [True, False]) +def test_cooperative_launch(wrap_stream): dev = Device() dev.set_current() s = dev.create_stream(options={"nonblocking": True}) + if wrap_stream: + s = StreamWrapper(s) # CUDA kernel templated on type T code = r""" @@ -272,6 +280,7 @@ def test_cooperative_launch(): @pytest.mark.skipif(cp is None, reason="cupy not installed") +@pytest.mark.parametrize("wrap_stream", [True, False]) @pytest.mark.parametrize( "memory_resource_class", [ @@ -285,11 +294,13 @@ def test_cooperative_launch(): ), ], ) -def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_resource_class): +def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_resource_class, wrap_stream): """Test that kernels can access memory allocated by memory resources.""" dev = Device() dev.set_current() stream = dev.create_stream() + if wrap_stream: + stream = StreamWrapper(stream) # tell CuPy to use our stream as the current stream: cp.cuda.ExternalStream(int(stream.handle)).use() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 373cecb9b6..193dc3bd29 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -31,6 +31,7 @@ from cuda.core.experimental._utils.cuda_utils import handle_return from cuda.core.experimental.utils import StridedMemoryView from helpers.buffers import DummyUnifiedMemoryResource +from helpers.misc import StreamWrapper from cuda_python_test_helpers import supports_ipc_mempool @@ -167,10 +168,12 @@ def test_buffer_initialization(): buffer_initialization(DummyPinnedMemoryResource(device)) -def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): +def buffer_copy_to(dummy_mr: MemoryResource, device: Device, wrap_stream, check=False): src_buffer = dummy_mr.allocate(size=1024) dst_buffer = dummy_mr.allocate(size=1024) stream = device.create_stream() + if wrap_stream: + stream = StreamWrapper(stream) if check: src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) @@ -190,18 +193,21 @@ def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): src_buffer.close() -def test_buffer_copy_to(): +@pytest.mark.parametrize("wrap_stream", [True, False]) +def test_buffer_copy_to(wrap_stream): device = Device() device.set_current() - buffer_copy_to(DummyDeviceMemoryResource(device), device) - buffer_copy_to(DummyUnifiedMemoryResource(device), device) - buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True) + buffer_copy_to(DummyDeviceMemoryResource(device), device, wrap_stream) + buffer_copy_to(DummyUnifiedMemoryResource(device), device, wrap_stream) + buffer_copy_to(DummyPinnedMemoryResource(device), device, wrap_stream, check=True) -def buffer_copy_from(dummy_mr: MemoryResource, device, check=False): +def buffer_copy_from(dummy_mr: MemoryResource, device, wrap_stream, check=False): src_buffer = dummy_mr.allocate(size=1024) dst_buffer = dummy_mr.allocate(size=1024) stream = device.create_stream() + if wrap_stream: + stream = StreamWrapper(stream) if check: src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) @@ -221,12 +227,13 @@ def buffer_copy_from(dummy_mr: MemoryResource, device, check=False): src_buffer.close() -def test_buffer_copy_from(): +@pytest.mark.parametrize("wrap_stream", [True, False]) +def test_buffer_copy_from(wrap_stream): device = Device() device.set_current() - buffer_copy_from(DummyDeviceMemoryResource(device), device) - buffer_copy_from(DummyUnifiedMemoryResource(device), device) - buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True) + buffer_copy_from(DummyDeviceMemoryResource(device), device, wrap_stream) + buffer_copy_from(DummyUnifiedMemoryResource(device), device, wrap_stream) + buffer_copy_from(DummyPinnedMemoryResource(device), device, wrap_stream, check=True) def buffer_close(dummy_mr: MemoryResource): @@ -526,12 +533,16 @@ def test_device_memory_resource(): buffer = mr.allocate(1024, stream=stream) assert buffer.handle != 0 buffer.close() + buffer = mr.allocate(1024, stream=StreamWrapper(stream)) + assert buffer.handle != 0 + buffer.close() # Test memory copying between buffers from same pool src_buffer = mr.allocate(64) dst_buffer = mr.allocate(64) stream = device.create_stream() src_buffer.copy_to(dst_buffer, stream=stream) + src_buffer.copy_to(dst_buffer, stream=StreamWrapper(stream)) device.sync() dst_buffer.close() src_buffer.close() diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index c8165548f6..2e10fb1003 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -6,6 +6,7 @@ from cuda.core.experimental._event import Event from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM from cuda.core.experimental._utils.cuda_utils import driver +from helpers.misc import StreamWrapper def test_stream_init_disabled(): @@ -76,9 +77,12 @@ def test_stream_context(init_cuda): assert context._handle is not None -def test_stream_from_foreign_stream(init_cuda): +@pytest.mark.parametrize("wrap_stream", [True, False]) +def test_stream_from_foreign_stream(init_cuda, wrap_stream): device = Device() other_stream = device.create_stream(options=StreamOptions()) + if wrap_stream: + other_stream = StreamWrapper(other_stream) stream = device.create_stream(obj=other_stream) # Now that __eq__ is implemented (issue #664), we can compare directly assert other_stream == stream From 183f7af26c7dababc6cb2c62faa91b70e508e2a0 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 12 Nov 2025 10:57:36 -0800 Subject: [PATCH 11/15] Expand files named _gmr.*. Add __eq__ and __hash__ support to StreamWrapper (testing only) --- cuda_core/cuda/core/experimental/_memory/__init__.py | 2 +- .../_memory/{_gmr.pxd => _graph_memory_resource.pxd} | 2 +- .../_memory/{_gmr.pyx => _graph_memory_resource.pyx} | 0 cuda_core/tests/helpers/misc.py | 6 ++++++ 4 files changed, 8 insertions(+), 2 deletions(-) rename cuda_core/cuda/core/experimental/_memory/{_gmr.pxd => _graph_memory_resource.pxd} (81%) rename cuda_core/cuda/core/experimental/_memory/{_gmr.pyx => _graph_memory_resource.pyx} (100%) diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 8dd4385982..20b90d7fdd 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -4,7 +4,7 @@ from ._buffer import * # noqa: F403 from ._device_memory_resource import * # noqa: F403 -from ._gmr import * # noqa: F403 +from ._graph_memory_resource import * # noqa: F403 from ._ipc import * # noqa: F403 from ._legacy import * # noqa: F403 from ._virtual_memory_resource import * # noqa: F403 diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pxd b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd similarity index 81% rename from cuda_core/cuda/core/experimental/_memory/_gmr.pxd rename to cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd index 0feb4a398b..7d20757587 100644 --- a/cuda_core/cuda/core/experimental/_memory/_gmr.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd @@ -4,7 +4,7 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport MemoryResource -from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource +from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource from cuda.core.experimental._memory._ipc cimport IPCData diff --git a/cuda_core/cuda/core/experimental/_memory/_gmr.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/_gmr.pyx rename to cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx diff --git a/cuda_core/tests/helpers/misc.py b/cuda_core/tests/helpers/misc.py index 6389e2a3a5..33508091a7 100644 --- a/cuda_core/tests/helpers/misc.py +++ b/cuda_core/tests/helpers/misc.py @@ -24,3 +24,9 @@ def handle(self): def sync(self): return self._stream.sync() + + def __eq__(self, rhs): + return self._stream == Stream._init(rhs) + + def __hash__(self): + return hash(self._stream) From 13e3dfbfdfa50dd574905e3c428bb6753e3e1da3 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 12 Nov 2025 13:20:00 -0800 Subject: [PATCH 12/15] Fix format/lint issues. --- .../cuda/core/experimental/_launcher.pyx | 4 - .../_memory/_device_memory_resource.pyx | 2 - .../_memory/_graph_memory_resource.pyx | 1 - cuda_core/cuda/core/experimental/_stream.pyx | 2 - cuda_core/tests/test_graph.py | 5 - cuda_core/tests/test_graph_mem.py | 100 ++++++++---------- cuda_core/tests/test_launcher.py | 2 +- cuda_core/tests/test_memory.py | 1 + 8 files changed, 48 insertions(+), 69 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/experimental/_launcher.pyx index 939c0fa803..b94c3d2b71 100644 --- a/cuda_core/cuda/core/experimental/_launcher.pyx +++ b/cuda_core/cuda/core/experimental/_launcher.pyx @@ -2,10 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport uintptr_t - -from typing import Union - from cuda.core.experimental._kernel_arg_handler import ParamHolder from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config from cuda.core.experimental._module import Kernel diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index 9ce3e5be39..b354d595c6 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -497,7 +497,6 @@ cdef inline void DMR_deallocate( ) noexcept: cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr - cdef cydriver.CUstreamCaptureStatus capturing with nogil: HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) @@ -516,4 +515,3 @@ cdef inline DMR_close(DeviceMemoryResource self): self._attributes = None self._mempool_owned = False self._ipc_data = None - diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx index 09a561fdde..6ba09481d6 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx @@ -205,4 +205,3 @@ cdef inline void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexce cdef cydriver.CUdeviceptr devptr = ptr with nogil: HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) - diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index b177ea217d..c9192c5ba4 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -451,5 +451,3 @@ cdef cydriver.CUstream _handle_from_stream_t(obj) except*: f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" ) return (info[1]) - - diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py index 39810930fe..cc558b6d22 100644 --- a/cuda_core/tests/test_graph.py +++ b/cuda_core/tests/test_graph.py @@ -11,8 +11,6 @@ from cuda import nvrtc from cuda.core.experimental import ( Device, - DeviceMemoryResource, - DeviceMemoryResourceOptions, GraphBuilder, GraphCompleteOptions, GraphDebugPrintOptions, @@ -23,7 +21,6 @@ launch, ) from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return -from helpers.buffers import compare_equal_buffers, make_scratch_buffer def _common_kernels(): @@ -750,5 +747,3 @@ def test_graph_build_mode(init_cuda): with pytest.raises(ValueError, match="^Unsupported build mode:"): gb = Device().create_graph_builder().begin_building(mode=None) - - diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py index 6e4bf907ea..44a5be2617 100644 --- a/cuda_core/tests/test_graph_mem.py +++ b/cuda_core/tests/test_graph_mem.py @@ -3,20 +3,19 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import pytest - from cuda.core.experimental import ( Device, DeviceMemoryResource, GraphCompleteOptions, GraphMemoryResource, - launch, LaunchConfig, Program, ProgramOptions, + launch, ) -from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return from helpers.buffers import compare_buffer_to_constant, make_scratch_buffer, set_buffer + def _common_kernels_alloc(): code = """ __global__ void set_zero(char *a, size_t nbytes) { @@ -45,6 +44,7 @@ class GraphMemoryTestManager: """ Manages changes to the state of the graph memory system, for testing. """ + def __init__(self, gmr, stream, mode=None): self.device = Device(gmr.device_id) self.gmr = gmr @@ -125,7 +125,6 @@ def test_graph_alloc_with_output(init_cuda, mode): # Get kernels and define the calling sequence. mod = _common_kernels_alloc() - set_zero = mod.get_kernel("set_zero") add_one = mod.get_kernel("add_one") # Make an input of 0s. @@ -170,24 +169,16 @@ def test_graph_mem_set_attributes(init_cuda, mode): assert gmr.attributes.used_mem_high > 0 # Incorrect attribute usage. - with pytest.raises(AttributeError, - match=r"property 'reserved_mem_current' .* has no setter" - ): + with pytest.raises(AttributeError, match=r"property 'reserved_mem_current' .* has no setter"): gmr.attributes.reserved_mem_current = 0 - with pytest.raises(AttributeError, - match=r"Attribute 'reserved_mem_high' may only be set to zero \(got 1\)\." - ): + with pytest.raises(AttributeError, match=r"Attribute 'reserved_mem_high' may only be set to zero \(got 1\)\."): gmr.attributes.reserved_mem_high = 1 - with pytest.raises(AttributeError, - match=r"property 'used_mem_current' .* has no setter" - ): + with pytest.raises(AttributeError, match=r"property 'used_mem_current' .* has no setter"): gmr.attributes.used_mem_current = 0 - with pytest.raises(AttributeError, - match=r"Attribute 'used_mem_high' may only be set to zero \(got 1\)\." - ): + with pytest.raises(AttributeError, match=r"Attribute 'used_mem_high' may only be set to zero \(got 1\)\."): gmr.attributes.used_mem_high = 1 # Free memory, but usage is not reduced yet. @@ -217,46 +208,47 @@ def test_graph_mem_set_attributes(init_cuda, mode): @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) def test_gmr_check_capture_state(init_cuda, mode): - """ - Test expected errors (and non-errors) using GraphMemoryResource with graph - capture. - """ - device = Device() - stream = device.create_stream() - gmr = GraphMemoryResource(device) - - # Not capturing - with pytest.raises(RuntimeError, - match=r"GraphMemoryResource cannot perform memory operations on a " - r"non-capturing stream\." - ): - gmr.allocate(1, stream=stream) - - # Capturing - gb = device.create_graph_builder().begin_building(mode=mode) - buffer = gmr.allocate(1, stream=gb).close() # no error - gb.end_building().complete() + """ + Test expected errors (and non-errors) using GraphMemoryResource with graph + capture. + """ + device = Device() + stream = device.create_stream() + gmr = GraphMemoryResource(device) + + # Not capturing + with pytest.raises( + RuntimeError, + match=r"GraphMemoryResource cannot perform memory operations on a " + r"non-capturing stream\.", + ): + gmr.allocate(1, stream=stream) + + # Capturing + gb = device.create_graph_builder().begin_building(mode=mode) + gmr.allocate(1, stream=gb) # no error + gb.end_building().complete() @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) def test_dmr_check_capture_state(init_cuda, mode): - """ - Test expected errors (and non-errors) using DeviceMemoryResource with graph - capture. - """ - device = Device() - stream = device.create_stream() - dmr = DeviceMemoryResource(device) - - # Not capturing - dmr.allocate(1, stream=stream).close() # no error - - # Capturing - gb = device.create_graph_builder().begin_building(mode=mode) - with pytest.raises(RuntimeError, - match=r"DeviceMemoryResource cannot perform memory operations on a capturing " - r"stream \(consider using GraphMemoryResource\)\." - ): - dmr.allocate(1, stream=gb) - gb.end_building().complete() + """ + Test expected errors (and non-errors) using DeviceMemoryResource with graph + capture. + """ + device = Device() + stream = device.create_stream() + dmr = DeviceMemoryResource(device) + + # Not capturing + dmr.allocate(1, stream=stream).close() # no error + # Capturing + gb = device.create_graph_builder().begin_building(mode=mode) + with pytest.raises( + RuntimeError, + match=r"DeviceMemoryResource cannot perform memory operations on a capturing " + r"stream \(consider using GraphMemoryResource\)\.", + ): + dmr.allocate(1, stream=gb) + gb.end_building().complete() diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index 138ea24aa8..f2fd64344c 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -22,9 +22,9 @@ ) from cuda.core.experimental._memory import _SynchronousMemoryResource from cuda.core.experimental._utils.cuda_utils import CUDAError +from helpers.misc import StreamWrapper from conftest import skipif_need_cuda_headers -from helpers.misc import StreamWrapper def test_launch_config_init(init_cuda): diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 193dc3bd29..5ca4e41211 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -689,6 +689,7 @@ def test_strided_memory_view_refcnt(): assert av.strides[1] == 64 assert sys.getrefcount(av.strides) >= 2 + def test_graph_memory_resource_object(init_cuda): device = Device() gmr1 = GraphMemoryResource(device) From 7408fe8a050015b39a9d2b797241c624036d2af1 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 13 Nov 2025 08:29:14 -0800 Subject: [PATCH 13/15] Minor clean up. --- .../_memory/_graph_memory_resource.pxd | 3 --- .../_memory/_graph_memory_resource.pyx | 14 ++++++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd index 7d20757587..f9c7798e76 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd @@ -2,10 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport MemoryResource -from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource -from cuda.core.experimental._memory._ipc cimport IPCData cdef class cyGraphMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx index 6ba09481d6..6fbb6088f0 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx @@ -47,7 +47,10 @@ cdef class GraphMemoryResourceAttributes: @GMRA_mem_attribute(int, settable=True) def reserved_mem_high(self): - """High watermark of backing memory allocated.""" + """ + High watermark of backing memory allocated. It can be set to zero to + reset it to the current usage. + """ @GMRA_mem_attribute(int) def used_mem_current(self): @@ -55,10 +58,13 @@ cdef class GraphMemoryResourceAttributes: @GMRA_mem_attribute(int, settable=True) def used_mem_high(self): - """High watermark of memory in use.""" + """ + High watermark of memory in use. It can be set to zero to reset it to + the current usage. + """ -cdef GMRA_mem_attribute(property_type: type, settable : bool = False): +cdef GMRA_mem_attribute(property_type: type, settable: bool = False): _settable = settable def decorator(stub): @@ -161,7 +167,7 @@ class GraphMemoryResource(cyGraphMemoryResource): Parameters ---------- - device_id : int | Device + device_id: int | Device Device or Device ordinal for which a graph memory resource is obtained. """ From c1dbc6a5f1baa6a57fbb58a37e7603a3097a0f7e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 19 Nov 2025 10:03:17 -0800 Subject: [PATCH 14/15] Change public signatures to accept "Stream | GraphBuiler" where only streams were accepted. Add a helper Stream_accept to accept Stream-like arguments throughout. Revert changes to Stream._init that relaxed acceptance criteria for first argument. Revert addition of StreamWrapper and associated tests. Suppress invalid context errors in deallocate to avoid noise during shutdown. --- cuda_core/cuda/core/experimental/_device.pyx | 8 ++-- .../cuda/core/experimental/_launcher.pyx | 12 +++--- .../core/experimental/_memory/_buffer.pyx | 40 +++++++------------ .../_memory/_device_memory_resource.pyx | 19 +++++---- .../_memory/_graph_memory_resource.pyx | 12 +++--- .../_memory/_virtual_memory_resource.py | 13 ++++-- cuda_core/cuda/core/experimental/_stream.pxd | 2 + cuda_core/cuda/core/experimental/_stream.pyx | 26 +++++++++--- cuda_core/tests/helpers/misc.py | 32 --------------- cuda_core/tests/test_launcher.py | 16 ++------ cuda_core/tests/test_memory.py | 31 +++++--------- cuda_core/tests/test_stream.py | 6 +-- 12 files changed, 87 insertions(+), 130 deletions(-) delete mode 100644 cuda_core/tests/helpers/misc.py diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index 833c74c0a8..445f526f8f 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -1242,7 +1242,7 @@ class Device: """ raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") - def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions | None = None) -> Stream: + def create_stream(self, obj: IsStreamT | None = None, options: StreamOptions | None = None) -> Stream: """Create a Stream object. New stream objects can be created in two different ways: @@ -1295,7 +1295,7 @@ class Device: ctx = self._get_current_context() return Event._init(self._id, ctx, options, True) - def allocate(self, size, stream: Optional[IsStreamT] = None) -> Buffer: + def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer: """Allocate device memory from a specified stream. Allocates device memory of `size` bytes on the specified `stream` @@ -1311,7 +1311,7 @@ class Device: ---------- size : int Number of bytes to allocate. - stream : :obj:`~_stream.IsStreamT`, optional + stream : :obj:`~_stream.Stream`, optional The stream establishing the stream ordering semantic. Default value of `None` uses default stream. @@ -1322,8 +1322,6 @@ class Device: """ self._check_context_initialized() - if stream is None: - stream = default_stream() return self._mr.allocate(size, stream) def sync(self): diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/experimental/_launcher.pyx index b94c3d2b71..a647c0fb6e 100644 --- a/cuda_core/cuda/core/experimental/_launcher.pyx +++ b/cuda_core/cuda/core/experimental/_launcher.pyx @@ -1,11 +1,13 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 +from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config +from cuda.core.experimental._stream cimport Stream_accept + from cuda.core.experimental._kernel_arg_handler import ParamHolder -from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config from cuda.core.experimental._module import Kernel -from cuda.core.experimental._stream import IsStreamT, Stream +from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( _reduce_3_tuple, @@ -33,13 +35,13 @@ def _lazy_init(): _inited = True -def launch(stream: IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args): +def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args): """Launches a :obj:`~_module.Kernel` object with launch-time configuration. Parameters ---------- - stream : :obj:`~_stream.IsStreamT` + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` The stream establishing the stream ordering semantic of a launch. config : :obj:`LaunchConfig` @@ -52,7 +54,7 @@ def launch(stream: IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args launching kernel. """ - stream = Stream._init(stream) + stream = Stream_accept(stream, allow_default=False, default_value=None, allow_stream_protocol=True) assert_type(kernel, Kernel) _lazy_init() config = check_or_create_options(LaunchConfig, config, "launch config") diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index f18d5ec02d..8f01b638fb 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -9,16 +9,15 @@ from libc.stdint cimport uintptr_t from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._stream cimport default_stream, Stream +from cuda.core.experimental._stream cimport Stream_accept, Stream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, ) import abc -from typing import Optional, TypeVar, Union +from typing import TypeVar, Union from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule -from cuda.core.experimental._stream import IsStreamT from cuda.core.experimental._utils.cuda_utils import driver __all__ = ['Buffer', 'MemoryResource'] @@ -103,7 +102,7 @@ cdef class Buffer: """Export a buffer allocated for sharing between processes.""" return _ipc.Buffer_get_ipc_descriptor(self) - def close(self, stream: Stream = None): + def close(self, stream: Stream | GraphBuilder | None = None): """Deallocate this buffer asynchronously on the given stream. This buffer is released back to their memory resource @@ -111,13 +110,13 @@ cdef class Buffer: Parameters ---------- - stream : Stream, optional + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional The stream object to use for asynchronous deallocation. If None, the behavior depends on the underlying memory resource. """ Buffer_close(self, stream) - def copy_to(self, dst: Buffer = None, *, stream: IsStreamT) -> Buffer: + def copy_to(self, dst: Buffer = None, *, stream: Stream | GraphBuilder) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. Copies the data from this buffer to the provided dst buffer. @@ -128,13 +127,12 @@ cdef class Buffer: ---------- dst : :obj:`~_memory.Buffer` Source buffer to copy data from - stream : IsStreamT + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` Keyword argument specifying the stream for the asynchronous copy """ - stream = Stream._init(stream) - + stream = Stream_accept(stream) cdef size_t src_size = self._size if dst is None: @@ -152,20 +150,19 @@ cdef class Buffer: raise_if_driver_error(err) return dst - def copy_from(self, src: Buffer, *, stream: IsStreamT): + def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder): """Copy from the src buffer to this buffer asynchronously on the given stream. Parameters ---------- src : :obj:`~_memory.Buffer` Source buffer to copy data from - stream : IsStreamT + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` Keyword argument specifying the stream for the asynchronous copy """ - stream = Stream._init(stream) - + stream = Stream_accept(stream) cdef size_t dst_size = self._size cdef size_t src_size = src._size @@ -276,14 +273,7 @@ cdef class Buffer: cdef inline void Buffer_close(Buffer self, stream): cdef Stream s if self._ptr and self._memory_resource is not None: - if stream is None: - if self._alloc_stream is not None: - s = self._alloc_stream - else: - # TODO: remove this branch when from_handle takes a stream - s = (default_stream()) - else: - s = stream + s = Stream_accept(stream, allow_default=True, default_value=self._alloc_stream) self._memory_resource.deallocate(self._ptr, self._size, s) self._ptr = 0 self._memory_resource = None @@ -304,14 +294,14 @@ cdef class MemoryResource: """ @abc.abstractmethod - def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer: + def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer: """Allocate a buffer of the requested size. Parameters ---------- size : int The size of the buffer to allocate, in bytes. - stream : IsStreamT, optional + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional The stream on which to perform the allocation asynchronously. If None, it is up to each memory resource implementation to decide and document the behavior. @@ -325,7 +315,7 @@ cdef class MemoryResource: ... @abc.abstractmethod - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None): + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None): """Deallocate a buffer previously allocated by this resource. Parameters @@ -334,7 +324,7 @@ cdef class MemoryResource: The pointer or handle to the buffer to deallocate. size : int The size of the buffer to deallocate, in bytes. - stream : IsStreamT, optional + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional The stream on which to perform the deallocation asynchronously. If None, it is up to each memory resource implementation to decide and document the behavior. diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index b354d595c6..12f5a666ab 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -12,7 +12,7 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData -from cuda.core.experimental._stream cimport default_stream, Stream +from cuda.core.experimental._stream cimport Stream_accept, Stream from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN, @@ -308,14 +308,14 @@ cdef class DeviceMemoryResource(MemoryResource): raise RuntimeError("Imported memory resource cannot be exported") return self._ipc_data._alloc_handle - def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer: + def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer: """Allocate a buffer of the requested size. Parameters ---------- size : int The size of the buffer to allocate, in bytes. - stream : IsStreamT, optional + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional The stream on which to perform the allocation asynchronously. If None, an internal stream is used. @@ -327,10 +327,10 @@ cdef class DeviceMemoryResource(MemoryResource): """ if self.is_mapped: raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") - stream = Stream._init(stream) if stream is not None else default_stream() + stream = Stream_accept(stream, allow_default=True) return DMR_allocate(self, size, stream) - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None): + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None): """Deallocate a buffer previously allocated by this resource. Parameters @@ -339,12 +339,12 @@ cdef class DeviceMemoryResource(MemoryResource): The pointer or handle to the buffer to deallocate. size : int The size of the buffer to deallocate, in bytes. - stream : IsStreamT, optional + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional The stream on which to perform the deallocation asynchronously. If the buffer is deallocated without an explicit stream, the allocation stream is used. """ - stream = Stream._init(stream) if stream is not None else default_stream() + stream = Stream_accept(stream, allow_default=True) DMR_deallocate(self, ptr, size, stream) @property @@ -497,8 +497,11 @@ cdef inline void DMR_deallocate( ) noexcept: cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr + cdef cydriver.CUresult r with nogil: - HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) + r = cydriver.cuMemFreeAsync(devptr, s) + if r != cydriver.CUDA_ERROR_INVALID_CONTEXT: + HANDLE_RETURN(r) cdef inline DMR_close(DeviceMemoryResource self): diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx index 6fbb6088f0..06234ef4b8 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx @@ -8,11 +8,11 @@ from libc.stdint cimport uintptr_t, intptr_t, uint64_t from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource -from cuda.core.experimental._stream cimport default_stream, Stream +from cuda.core.experimental._stream cimport Stream_accept, Stream from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN from functools import cache -from typing import Optional, TYPE_CHECKING +from typing import TYPE_CHECKING from cuda.core.experimental._utils.cuda_utils import driver @@ -105,18 +105,18 @@ cdef class cyGraphMemoryResource(MemoryResource): def __cinit__(self, int device_id): self._dev_id = device_id - def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer: + def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer: """ Allocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`. """ - stream = Stream._init(stream) if stream is not None else default_stream() + stream = Stream_accept(stream, allow_default=True) return GMR_allocate(self, size, stream) - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None): + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None): """ Deallocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`. """ - stream = Stream._init(stream) if stream is not None else default_stream() + stream = Stream_accept(stream, allow_default=True) return GMR_deallocate(ptr, size, stream) def close(self): diff --git a/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py b/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py index 5379f0b8f1..40b8e7267d 100644 --- a/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py +++ b/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py @@ -2,11 +2,12 @@ # # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from dataclasses import dataclass, field -from typing import Iterable, Literal, Union +from typing import TYPE_CHECKING, Iterable, Literal, Union from cuda.core.experimental._memory._buffer import Buffer, MemoryResource -from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.cuda_utils import ( Transaction, check_or_create_options, @@ -17,6 +18,10 @@ _check_driver_error as raise_if_driver_error, ) +if TYPE_CHECKING: + from cuda.core.experimental._graph import GraphBuilder + from cuda.core.experimental._stream import Stream + __all__ = ["VirtualMemoryResourceOptions", "VirtualMemoryResource"] VirtualMemoryHandleTypeT = Union[Literal["posix_fd", "generic", "win32", "win32_kmt", "fabric"], None] @@ -457,7 +462,7 @@ def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: return descs - def allocate(self, size: int, stream: Stream = None) -> Buffer: + def allocate(self, size: int, stream: Stream | GraphBuilder | None = None) -> Buffer: """ Allocate a buffer of the given size using CUDA virtual memory. @@ -541,7 +546,7 @@ def allocate(self, size: int, stream: Stream = None) -> Buffer: buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self) return buf - def deallocate(self, ptr: int, size: int, stream: Stream = None) -> None: + def deallocate(self, ptr: int, size: int, stream: Stream | GraphBuilder | None = None) -> None: """ Deallocate memory on the device using CUDA VMM APIs. """ diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index 68a410d1ed..59c27749bc 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -22,3 +22,5 @@ cdef class Stream: cpdef Stream default_stream() +cdef Stream Stream_accept(arg, bint allow_default=*, Stream default_value=*, bint allow_stream_protocol=*) +# from cuda.core.experimental._stream cimport Stream_accept diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index c9192c5ba4..0f53dbb582 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -108,16 +108,13 @@ cdef class Stream: return self @classmethod - def _init(cls, obj: Optional[IsStreamT] = None, options=None, device_id: int = None): - if isinstance(obj, Stream): - return obj - + def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None): cdef Stream self = Stream.__new__(cls) if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - self._handle = _handle_from_stream_t(obj) + self._handle = _handle_from_stream_protocol(obj) # TODO: check if obj is created under the current context/device self._owner = obj return self @@ -420,7 +417,7 @@ cpdef Stream default_stream(): return C_LEGACY_DEFAULT_STREAM -cdef cydriver.CUstream _handle_from_stream_t(obj) except*: +cdef cydriver.CUstream _handle_from_stream_protocol(obj) except*: if isinstance(obj, Stream): return (obj.handle) @@ -451,3 +448,20 @@ cdef cydriver.CUstream _handle_from_stream_t(obj) except*: f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" ) return (info[1]) + +# Helper for API functions that accept either Stream or GraphBuilder. Performs +# needed checks and returns the relevant stream. +cdef Stream Stream_accept(arg, bint allow_default=False, Stream default_value=None, bint allow_stream_protocol=False): + if arg is None: + if allow_default: + if default_value is not None: + return default_value + else: + return default_stream() + elif isinstance(arg, Stream): + return arg + elif isinstance(arg, GraphBuilder): + return arg.stream + elif allow_stream_protocol and isinstance(arg, IsStreamT): + return Stream._init(arg) + raise TypeError(f"Stream or GraphBuilder expected, got {type(arg).__name__}") diff --git a/cuda_core/tests/helpers/misc.py b/cuda_core/tests/helpers/misc.py deleted file mode 100644 index 33508091a7..0000000000 --- a/cuda_core/tests/helpers/misc.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from cuda.core.experimental import Stream - - -class StreamWrapper: - """ - A wrapper around Stream for testing IsStreamT conversions. - """ - - def __init__(self, stream: Stream): - self._stream = stream - - def __cuda_stream__(self): - return self._stream.__cuda_stream__() - - def close(self): - self._stream.close() - - @property - def handle(self): - return self._stream.handle - - def sync(self): - return self._stream.sync() - - def __eq__(self, rhs): - return self._stream == Stream._init(rhs) - - def __hash__(self): - return hash(self._stream) diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index f2fd64344c..34a62ba8c2 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -22,7 +22,6 @@ ) from cuda.core.experimental._memory import _SynchronousMemoryResource from cuda.core.experimental._utils.cuda_utils import CUDAError -from helpers.misc import StreamWrapper from conftest import skipif_need_cuda_headers @@ -180,10 +179,9 @@ def test_launch_invalid_values(init_cuda): ) -@pytest.mark.parametrize("wrap_stream", [True, False]) @pytest.mark.parametrize("python_type, cpp_type, init_value", PARAMS) @pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+") -def test_launch_scalar_argument(python_type, cpp_type, init_value, wrap_stream): +def test_launch_scalar_argument(python_type, cpp_type, init_value): dev = Device() dev.set_current() @@ -222,8 +220,6 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value, wrap_stream): # Launch with 1 thread stream = dev.default_stream - if wrap_stream: - stream = StreamWrapper(stream) config = LaunchConfig(grid=1, block=1) launch(stream, config, ker, arr.ctypes.data, scalar) stream.sync() @@ -233,13 +229,10 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value, wrap_stream): @skipif_need_cuda_headers # cg -@pytest.mark.parametrize("wrap_stream", [True, False]) -def test_cooperative_launch(wrap_stream): +def test_cooperative_launch(): dev = Device() dev.set_current() s = dev.create_stream(options={"nonblocking": True}) - if wrap_stream: - s = StreamWrapper(s) # CUDA kernel templated on type T code = r""" @@ -280,7 +273,6 @@ def test_cooperative_launch(wrap_stream): @pytest.mark.skipif(cp is None, reason="cupy not installed") -@pytest.mark.parametrize("wrap_stream", [True, False]) @pytest.mark.parametrize( "memory_resource_class", [ @@ -294,13 +286,11 @@ def test_cooperative_launch(wrap_stream): ), ], ) -def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_resource_class, wrap_stream): +def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_resource_class): """Test that kernels can access memory allocated by memory resources.""" dev = Device() dev.set_current() stream = dev.create_stream() - if wrap_stream: - stream = StreamWrapper(stream) # tell CuPy to use our stream as the current stream: cp.cuda.ExternalStream(int(stream.handle)).use() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 5ca4e41211..0c33b13b00 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -31,7 +31,6 @@ from cuda.core.experimental._utils.cuda_utils import handle_return from cuda.core.experimental.utils import StridedMemoryView from helpers.buffers import DummyUnifiedMemoryResource -from helpers.misc import StreamWrapper from cuda_python_test_helpers import supports_ipc_mempool @@ -168,12 +167,10 @@ def test_buffer_initialization(): buffer_initialization(DummyPinnedMemoryResource(device)) -def buffer_copy_to(dummy_mr: MemoryResource, device: Device, wrap_stream, check=False): +def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): src_buffer = dummy_mr.allocate(size=1024) dst_buffer = dummy_mr.allocate(size=1024) stream = device.create_stream() - if wrap_stream: - stream = StreamWrapper(stream) if check: src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) @@ -193,21 +190,18 @@ def buffer_copy_to(dummy_mr: MemoryResource, device: Device, wrap_stream, check= src_buffer.close() -@pytest.mark.parametrize("wrap_stream", [True, False]) -def test_buffer_copy_to(wrap_stream): +def test_buffer_copy_to(): device = Device() device.set_current() - buffer_copy_to(DummyDeviceMemoryResource(device), device, wrap_stream) - buffer_copy_to(DummyUnifiedMemoryResource(device), device, wrap_stream) - buffer_copy_to(DummyPinnedMemoryResource(device), device, wrap_stream, check=True) + buffer_copy_to(DummyDeviceMemoryResource(device), device) + buffer_copy_to(DummyUnifiedMemoryResource(device), device) + buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True) -def buffer_copy_from(dummy_mr: MemoryResource, device, wrap_stream, check=False): +def buffer_copy_from(dummy_mr: MemoryResource, device, check=False): src_buffer = dummy_mr.allocate(size=1024) dst_buffer = dummy_mr.allocate(size=1024) stream = device.create_stream() - if wrap_stream: - stream = StreamWrapper(stream) if check: src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) @@ -227,13 +221,12 @@ def buffer_copy_from(dummy_mr: MemoryResource, device, wrap_stream, check=False) src_buffer.close() -@pytest.mark.parametrize("wrap_stream", [True, False]) -def test_buffer_copy_from(wrap_stream): +def test_buffer_copy_from(): device = Device() device.set_current() - buffer_copy_from(DummyDeviceMemoryResource(device), device, wrap_stream) - buffer_copy_from(DummyUnifiedMemoryResource(device), device, wrap_stream) - buffer_copy_from(DummyPinnedMemoryResource(device), device, wrap_stream, check=True) + buffer_copy_from(DummyDeviceMemoryResource(device), device) + buffer_copy_from(DummyUnifiedMemoryResource(device), device) + buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True) def buffer_close(dummy_mr: MemoryResource): @@ -533,16 +526,12 @@ def test_device_memory_resource(): buffer = mr.allocate(1024, stream=stream) assert buffer.handle != 0 buffer.close() - buffer = mr.allocate(1024, stream=StreamWrapper(stream)) - assert buffer.handle != 0 - buffer.close() # Test memory copying between buffers from same pool src_buffer = mr.allocate(64) dst_buffer = mr.allocate(64) stream = device.create_stream() src_buffer.copy_to(dst_buffer, stream=stream) - src_buffer.copy_to(dst_buffer, stream=StreamWrapper(stream)) device.sync() dst_buffer.close() src_buffer.close() diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index 2e10fb1003..c8165548f6 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -6,7 +6,6 @@ from cuda.core.experimental._event import Event from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM from cuda.core.experimental._utils.cuda_utils import driver -from helpers.misc import StreamWrapper def test_stream_init_disabled(): @@ -77,12 +76,9 @@ def test_stream_context(init_cuda): assert context._handle is not None -@pytest.mark.parametrize("wrap_stream", [True, False]) -def test_stream_from_foreign_stream(init_cuda, wrap_stream): +def test_stream_from_foreign_stream(init_cuda): device = Device() other_stream = device.create_stream(options=StreamOptions()) - if wrap_stream: - other_stream = StreamWrapper(other_stream) stream = device.create_stream(obj=other_stream) # Now that __eq__ is implemented (issue #664), we can compare directly assert other_stream == stream From 2f14443f6d7f1f7f1b224c99ac8f280bae86cb3f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 19 Nov 2025 17:14:38 -0800 Subject: [PATCH 15/15] Add deprecation warning when stream protocol is used with launch. --- cuda_core/cuda/core/experimental/_stream.pyx | 16 ++++++++++++++-- cuda_core/tests/helpers/misc.py | 14 ++++++++++++++ cuda_core/tests/test_launcher.py | 9 +++++++++ cuda_core/tests/test_stream.py | 3 ++- 4 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 cuda_core/tests/helpers/misc.py diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 0f53dbb582..53a6f8dd34 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -462,6 +462,18 @@ cdef Stream Stream_accept(arg, bint allow_default=False, Stream default_value=No return arg elif isinstance(arg, GraphBuilder): return arg.stream - elif allow_stream_protocol and isinstance(arg, IsStreamT): - return Stream._init(arg) + elif allow_stream_protocol: + try: + stream = Stream._init(arg) + except: + pass + else: + warnings.warn( + "Passing foreign stream objects to this function via the " + "stream protocol is deprecated. Convert the object explicitly " + "using Stream(obj) instead.", + stacklevel=2, + category=DeprecationWarning, + ) + return stream raise TypeError(f"Stream or GraphBuilder expected, got {type(arg).__name__}") diff --git a/cuda_core/tests/helpers/misc.py b/cuda_core/tests/helpers/misc.py new file mode 100644 index 0000000000..d5f9cd7534 --- /dev/null +++ b/cuda_core/tests/helpers/misc.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +class StreamWrapper: + """ + A wrapper around Stream for testing IsStreamT conversions. + """ + + def __init__(self, stream): + self._stream = stream + + def __cuda_stream__(self): + return self._stream.__cuda_stream__() + diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index 34a62ba8c2..d2e0a89a28 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -4,6 +4,7 @@ import ctypes import helpers +from helpers.misc import StreamWrapper try: import cupy as cp @@ -140,6 +141,14 @@ def test_launch_invalid_values(init_cuda): with pytest.raises(TypeError): launch(stream, ker, None) + msg = ( + r"Passing foreign stream objects to this function via the stream " + r"protocol is deprecated\. Convert the object explicitly using " + r"Stream\(obj\) instead\." + ) + with pytest.warns(DeprecationWarning, match=msg): + launch(StreamWrapper(stream), config, ker) + launch(stream, config, ker) diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index c8165548f6..695a70e931 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -6,6 +6,7 @@ from cuda.core.experimental._event import Event from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM from cuda.core.experimental._utils.cuda_utils import driver +from helpers.misc import StreamWrapper def test_stream_init_disabled(): @@ -79,7 +80,7 @@ def test_stream_context(init_cuda): def test_stream_from_foreign_stream(init_cuda): device = Device() other_stream = device.create_stream(options=StreamOptions()) - stream = device.create_stream(obj=other_stream) + stream = device.create_stream(obj=StreamWrapper(other_stream)) # Now that __eq__ is implemented (issue #664), we can compare directly assert other_stream == stream assert hash(other_stream) == hash(stream)