diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 8a60c031c..67402e10a 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -42,6 +42,7 @@ Buffer, DeviceMemoryResource, DeviceMemoryResourceOptions, + GraphMemoryResource, LegacyPinnedMemoryResource, MemoryResource, VirtualMemoryResource, diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index bc6167793..833c74c0a 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -9,7 +9,7 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN import threading -from typing import Union, TYPE_CHECKING +from typing import Optional, TYPE_CHECKING, Union from cuda.core.experimental._context import Context, ContextOptions from cuda.core.experimental._event import Event, EventOptions @@ -1242,7 +1242,7 @@ class Device: """ raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") - def create_stream(self, obj: IsStreamT | None = None, options: StreamOptions | None = None) -> Stream: + def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions | None = None) -> Stream: """Create a Stream object. New stream objects can be created in two different ways: @@ -1295,7 +1295,7 @@ class Device: ctx = self._get_current_context() return Event._init(self._id, ctx, options, True) - def allocate(self, size, stream: Stream | None = None) -> Buffer: + def allocate(self, size, stream: Optional[IsStreamT] = None) -> Buffer: """Allocate device memory from a specified stream. Allocates device memory of `size` bytes on the specified `stream` @@ -1311,7 +1311,7 @@ class Device: ---------- size : int Number of bytes to allocate. 
- stream : :obj:`~_stream.Stream`, optional + stream : :obj:`~_stream.IsStreamT`, optional The stream establishing the stream ordering semantic. Default value of `None` uses default stream. diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/experimental/_launcher.pyx index a06d885ff..b94c3d2b7 100644 --- a/cuda_core/cuda/core/experimental/_launcher.pyx +++ b/cuda_core/cuda/core/experimental/_launcher.pyx @@ -2,12 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport uintptr_t - -from cuda.core.experimental._stream cimport _try_to_get_stream_ptr - -from typing import Union - from cuda.core.experimental._kernel_arg_handler import ParamHolder from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config from cuda.core.experimental._module import Kernel @@ -39,13 +33,13 @@ def _lazy_init(): _inited = True -def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kernel, *kernel_args): +def launch(stream: IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args): """Launches a :obj:`~_module.Kernel` object with launch-time configuration. Parameters ---------- - stream : :obj:`~_stream.Stream` + stream : :obj:`~_stream.IsStreamT` The stream establishing the stream ordering semantic of a launch. config : :obj:`LaunchConfig` @@ -58,17 +52,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne launching kernel. 
""" - if stream is None: - raise ValueError("stream cannot be None, stream must either be a Stream object or support __cuda_stream__") - try: - stream_handle = stream.handle - except AttributeError: - try: - stream_handle = driver.CUstream((_try_to_get_stream_ptr(stream))) - except Exception: - raise ValueError( - f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})" - ) from None + stream = Stream._init(stream) assert_type(kernel, Kernel) _lazy_init() config = check_or_create_options(LaunchConfig, config, "launch config") @@ -85,7 +69,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne # rich. if _use_ex: drv_cfg = _to_native_launch_config(config) - drv_cfg.hStream = stream_handle + drv_cfg.hStream = stream.handle if config.cooperative_launch: _check_cooperative_launch(kernel, config, stream) handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0)) @@ -93,12 +77,12 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne # TODO: check if config has any unsupported attrs handle_return( driver.cuLaunchKernel( - int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream_handle, args_ptr, 0 + int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0 ) ) -def _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream): +cdef _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream): dev = stream.device num_sm = dev.properties.multiprocessor_count max_grid_size = ( diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 3c07fbdde..20b90d7fd 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -4,6 +4,7 @@ from ._buffer import * # noqa: F403 from ._device_memory_resource import * # noqa: F403 +from 
._graph_memory_resource import * # noqa: F403 from ._ipc import * # noqa: F403 from ._legacy import * # noqa: F403 from ._virtual_memory_resource import * # noqa: F403 diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index 225127274..f18d5ec02 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -15,9 +15,10 @@ from cuda.core.experimental._utils.cuda_utils cimport ( ) import abc -from typing import TypeVar, Union +from typing import Optional, TypeVar, Union from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule +from cuda.core.experimental._stream import IsStreamT from cuda.core.experimental._utils.cuda_utils import driver __all__ = ['Buffer', 'MemoryResource'] @@ -116,7 +117,7 @@ cdef class Buffer: """ Buffer_close(self, stream) - def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: + def copy_to(self, dst: Buffer = None, *, stream: IsStreamT) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. Copies the data from this buffer to the provided dst buffer. @@ -127,13 +128,12 @@ cdef class Buffer: ---------- dst : :obj:`~_memory.Buffer` Source buffer to copy data from - stream : Stream + stream : IsStreamT Keyword argument specifying the stream for the asynchronous copy """ - if stream is None: - raise ValueError("stream must be provided") + stream = Stream._init(stream) cdef size_t src_size = self._size @@ -152,20 +152,19 @@ cdef class Buffer: raise_if_driver_error(err) return dst - def copy_from(self, src: Buffer, *, stream: Stream): + def copy_from(self, src: Buffer, *, stream: IsStreamT): """Copy from the src buffer to this buffer asynchronously on the given stream. 
Parameters ---------- src : :obj:`~_memory.Buffer` Source buffer to copy data from - stream : Stream + stream : IsStreamT Keyword argument specifying the stream for the asynchronous copy """ - if stream is None: - raise ValueError("stream must be provided") + stream = Stream._init(stream) cdef size_t dst_size = self._size cdef size_t src_size = src._size @@ -274,7 +273,7 @@ cdef class Buffer: # Buffer Implementation # --------------------- -cdef Buffer_close(Buffer self, stream): +cdef inline void Buffer_close(Buffer self, stream): cdef Stream s if self._ptr and self._memory_resource is not None: if stream is None: @@ -305,14 +304,14 @@ cdef class MemoryResource: """ @abc.abstractmethod - def allocate(self, size_t size, stream: Stream = None) -> Buffer: + def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer: """Allocate a buffer of the requested size. Parameters ---------- size : int The size of the buffer to allocate, in bytes. - stream : Stream, optional + stream : IsStreamT, optional The stream on which to perform the allocation asynchronously. If None, it is up to each memory resource implementation to decide and document the behavior. @@ -326,7 +325,7 @@ cdef class MemoryResource: ... @abc.abstractmethod - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None): """Deallocate a buffer previously allocated by this resource. Parameters @@ -335,7 +334,7 @@ cdef class MemoryResource: The pointer or handle to the buffer to deallocate. size : int The size of the buffer to deallocate, in bytes. - stream : Stream, optional + stream : IsStreamT, optional The stream on which to perform the deallocation asynchronously. If None, it is up to each memory resource implementation to decide and document the behavior. 
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index 47b6fd114..b354d595c 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -49,8 +49,8 @@ cdef class DeviceMemoryResourceOptions: Maximum pool size. When set to 0, defaults to a system-dependent value. (Default to 0) """ - ipc_enabled : cython.bint = False - max_size : cython.size_t = 0 + ipc_enabled: cython.bint = False + max_size : cython.size_t = 0 cdef class DeviceMemoryResourceAttributes: @@ -66,6 +66,12 @@ cdef class DeviceMemoryResourceAttributes: self._mr_weakref = mr return self + def __repr__(self): + return f"{self.__class__.__name__}(%s)" % ", ".join( + f"{attr}={getattr(self, attr)}" for attr in dir(self) + if not attr.startswith("_") + ) + @DMRA_mempool_attribute(bool) def reuse_follow_event_dependencies(self): """Allow memory to be reused when there are event dependencies between streams.""" @@ -127,7 +133,7 @@ cdef int DMRA_getattribute( cdef class DeviceMemoryResource(MemoryResource): """ - Create a device memory resource managing a stream-ordered memory pool. + A device memory resource managing a stream-ordered memory pool. Parameters ---------- @@ -302,14 +308,14 @@ cdef class DeviceMemoryResource(MemoryResource): raise RuntimeError("Imported memory resource cannot be exported") return self._ipc_data._alloc_handle - def allocate(self, size_t size, stream: Stream = None) -> Buffer: + def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer: """Allocate a buffer of the requested size. Parameters ---------- size : int The size of the buffer to allocate, in bytes. - stream : Stream, optional + stream : IsStreamT, optional The stream on which to perform the allocation asynchronously. If None, an internal stream is used. 
@@ -321,11 +327,10 @@ cdef class DeviceMemoryResource(MemoryResource): """ if self.is_mapped: raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") - if stream is None: - stream = default_stream() - return DMR_allocate(self, size, stream) + stream = Stream._init(stream) if stream is not None else default_stream() + return DMR_allocate(self, size, stream) - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None): """Deallocate a buffer previously allocated by this resource. Parameters @@ -334,15 +339,17 @@ cdef class DeviceMemoryResource(MemoryResource): The pointer or handle to the buffer to deallocate. size : int The size of the buffer to deallocate, in bytes. - stream : Stream, optional + stream : IsStreamT, optional The stream on which to perform the deallocation asynchronously. If the buffer is deallocated without an explicit stream, the allocation stream is used. """ - DMR_deallocate(self, ptr, size, stream) + stream = Stream._init(stream) if stream is not None else default_stream() + DMR_deallocate(self, ptr, size, stream) @property def attributes(self) -> DeviceMemoryResourceAttributes: + """Memory pool attributes.""" if self._attributes is None: ref = weakref.ref(self) self._attributes = DeviceMemoryResourceAttributes._init(ref) @@ -460,10 +467,21 @@ cdef void DMR_init_create( self._ipc_data = IPCData(alloc_handle, mapped=False) -cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): +# Raise an exception if the given stream is capturing. +# A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error. 
+cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil: + cdef cydriver.CUstreamCaptureStatus capturing + HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing)) + if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_NONE: + raise RuntimeError("DeviceMemoryResource cannot perform memory operations on " + "a capturing stream (consider using GraphMemoryResource).") + + +cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr with nogil: + check_not_capturing(s) HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) cdef Buffer buf = Buffer.__new__(Buffer) buf._ptr = (devptr) @@ -474,7 +492,7 @@ cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): return buf -cdef void DMR_deallocate( +cdef inline void DMR_deallocate( DeviceMemoryResource self, uintptr_t ptr, size_t size, Stream stream ) noexcept: cdef cydriver.CUstream s = stream._handle @@ -483,7 +501,7 @@ cdef void DMR_deallocate( HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) -cdef DMR_close(DeviceMemoryResource self): +cdef inline DMR_close(DeviceMemoryResource self): if self._handle == NULL: return diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd new file mode 100644 index 000000000..f9c7798e7 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental._memory._buffer cimport MemoryResource + + +cdef class cyGraphMemoryResource(MemoryResource): + cdef: + int _dev_id diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx new file mode 100644 index 000000000..6fbb6088f --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx @@ -0,0 +1,213 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport uintptr_t, intptr_t, uint64_t + +from cuda.bindings cimport cydriver +from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource +from cuda.core.experimental._stream cimport default_stream, Stream +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN + +from functools import cache +from typing import Optional, TYPE_CHECKING + +from cuda.core.experimental._utils.cuda_utils import driver + +if TYPE_CHECKING: + from cuda.core.experimental._memory.buffer import DevicePointerT + +__all__ = ['GraphMemoryResource'] + + +cdef class GraphMemoryResourceAttributes: + cdef: + int _dev_id + + def __init__(self, *args, **kwargs): + raise RuntimeError("GraphMemoryResourceAttributes cannot be instantiated directly. 
Please use MemoryResource APIs.") + + @classmethod + def _init(cls, device_id: int): + cdef GraphMemoryResourceAttributes self = GraphMemoryResourceAttributes.__new__(cls) + self._dev_id = device_id + return self + + def __repr__(self): + return f"{self.__class__.__name__}(%s)" % ", ".join( + f"{attr}={getattr(self, attr)}" for attr in dir(self) + if not attr.startswith("_") + ) + + @GMRA_mem_attribute(int) + def reserved_mem_current(self): + """Current amount of backing memory allocated.""" + + @GMRA_mem_attribute(int, settable=True) + def reserved_mem_high(self): + """ + High watermark of backing memory allocated. It can be set to zero to + reset it to the current usage. + """ + + @GMRA_mem_attribute(int) + def used_mem_current(self): + """Current amount of memory in use.""" + + @GMRA_mem_attribute(int, settable=True) + def used_mem_high(self): + """ + High watermark of memory in use. It can be set to zero to reset it to + the current usage. + """ + + +cdef GMRA_mem_attribute(property_type: type, settable: bool = False): + _settable = settable + + def decorator(stub): + attr_enum = getattr( + driver.CUgraphMem_attribute, f"CU_GRAPH_MEM_ATTR_{stub.__name__.upper()}" + ) + + def fget(GraphMemoryResourceAttributes self) -> property_type: + value = GMRA_getattribute(self._dev_id, attr_enum) + return property_type(value) + + if _settable: + def fset(GraphMemoryResourceAttributes self, uint64_t value): + if value != 0: + raise AttributeError(f"Attribute {stub.__name__!r} may only be set to zero (got {value}).") + GMRA_setattribute(self._dev_id, attr_enum) + else: + fset = None + + return property(fget=fget, fset=fset, doc=stub.__doc__) + return decorator + + +cdef inline uint64_t GMRA_getattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum): + cdef uint64_t value + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(device_id, attr_enum, &value)) + return value + + +cdef inline void GMRA_setattribute(int device_id, 
cydriver.CUgraphMem_attribute attr_enum): + cdef uint64_t zero = 0 + with nogil: + HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(device_id, attr_enum, &zero)) + + +cdef class cyGraphMemoryResource(MemoryResource): + def __cinit__(self, int device_id): + self._dev_id = device_id + + def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer: + """ + Allocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`. + """ + stream = Stream._init(stream) if stream is not None else default_stream() + return GMR_allocate(self, size, stream) + + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None): + """ + Deallocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`. + """ + stream = Stream._init(stream) if stream is not None else default_stream() + return GMR_deallocate(ptr, size, stream) + + def close(self): + """No operation (provided for compatibility).""" + pass + + def trim(self): + """Free unused memory that was cached on the specified device for use with graphs back to the OS.""" + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGraphMemTrim(self._dev_id)) + + @property + def attributes(self) -> GraphMemoryResourceAttributes: + """Asynchronous allocation attributes related to graphs.""" + return GraphMemoryResourceAttributes._init(self._dev_id) + + @property + def device_id(self) -> int: + """The associated device ordinal.""" + return self._dev_id + + @property + def is_device_accessible(self) -> bool: + """Return True. This memory resource provides device-accessible buffers.""" + return True + + @property + def is_host_accessible(self) -> bool: + """Return False. This memory resource does not provide host-accessible buffers.""" + return False + + +class GraphMemoryResource(cyGraphMemoryResource): + """ + A memory resource managing the graph-specific memory pool. 
+ + Graph-captured memory operations use a special internal memory pool, which + is a per-device singleton. This class serves as the interface to that pool. + The only supported operations are allocation, deallocation, and a limited + set of status queries. + + This memory resource should be used to allocate memory when graph capturing + is enabled. Using this when graphs are not being captured will result in a + runtime error. + + Conversely, allocating memory from a `DeviceMemoryResource` when graph + capturing is enabled results in a runtime error. + + Parameters + ---------- + device_id: int | Device + Device or Device ordinal for which a graph memory resource is obtained. + """ + + def __new__(cls, device_id: int | Device): + cdef int c_device_id = getattr(device_id, 'device_id', device_id) + return cls._create(c_device_id) + + @classmethod + @cache + def _create(cls, int device_id): + return cyGraphMemoryResource.__new__(cls, device_id) + + +# Raise an exception if the given stream is not actively capturing. +# A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error. 
+cdef inline int check_capturing(cydriver.CUstream s) except?-1 nogil: + cdef cydriver.CUstreamCaptureStatus capturing + HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing)) + if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE: + raise RuntimeError("GraphMemoryResource cannot perform memory operations on " + "a non-capturing stream.") + + +cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream): + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr + with nogil: + check_capturing(s) + HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s)) + cdef Buffer buf = Buffer.__new__(Buffer) + buf._ptr = (devptr) + buf._ptr_obj = None + buf._size = size + buf._memory_resource = self + buf._alloc_stream = stream + return buf + + +cdef inline void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept: + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr = ptr + with nogil: + HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index 8f382e5d0..68a410d1e 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -5,9 +5,6 @@ from cuda.bindings cimport cydriver -cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except* - - cdef class Stream: cdef: diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index cdc4651bd..c9192c5ba 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -62,36 +62,6 @@ class IsStreamT(Protocol): ... 
-cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except*: - try: - cuda_stream_attr = obj.__cuda_stream__ - except AttributeError: - raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") from None - - if callable(cuda_stream_attr): - info = cuda_stream_attr() - else: - info = cuda_stream_attr - warnings.simplefilter("once", DeprecationWarning) - warnings.warn( - "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method", - stacklevel=3, - category=DeprecationWarning, - ) - - try: - len_info = len(info) - except TypeError as e: - raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}") from e - if len_info != 2: - raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements") - if info[0] != 0: - raise RuntimeError( - f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" - ) - return (info[1]) - - cdef class Stream: """Represent a queue of GPU operations that are executed in a specific order. 
@@ -139,12 +109,15 @@ cdef class Stream: @classmethod def _init(cls, obj: Optional[IsStreamT] = None, options=None, device_id: int = None): + if isinstance(obj, Stream): + return obj + cdef Stream self = Stream.__new__(cls) if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - self._handle = _try_to_get_stream_ptr(obj) + self._handle = _handle_from_stream_t(obj) # TODO: check if obj is created under the current context/device self._owner = obj return self @@ -445,3 +418,36 @@ cpdef Stream default_stream(): return C_PER_THREAD_DEFAULT_STREAM else: return C_LEGACY_DEFAULT_STREAM + + +cdef cydriver.CUstream _handle_from_stream_t(obj) except*: + if isinstance(obj, Stream): + return (obj.handle) + + try: + cuda_stream_attr = obj.__cuda_stream__ + except AttributeError: + raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") from None + + if callable(cuda_stream_attr): + info = cuda_stream_attr() + else: + info = cuda_stream_attr + warnings.simplefilter("once", DeprecationWarning) + warnings.warn( + "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method", + stacklevel=3, + category=DeprecationWarning, + ) + + try: + len_info = len(info) + except TypeError as e: + raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}") from e + if len_info != 2: + raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements") + if info[0] != 0: + raise RuntimeError( + f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" + ) + return (info[1]) diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index ad6da14da..0e7520249 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ 
b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -14,7 +14,7 @@ ctypedef fused supported_error_type: # mimic CU_DEVICE_INVALID -cdef cydriver.CUcontext CU_CONTEXT_INVALID = (-2) +cdef const cydriver.CUcontext CU_CONTEXT_INVALID = (-2) cdef cydriver.CUdevice get_device_from_ctx( diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py index 972e83e13..b4d769eab 100644 --- a/cuda_core/tests/helpers/buffers.py +++ b/cuda_core/tests/helpers/buffers.py @@ -3,12 +3,18 @@ import ctypes -from cuda.core.experimental import Buffer, MemoryResource +from cuda.core.experimental import Buffer, Device, MemoryResource from cuda.core.experimental._utils.cuda_utils import driver, handle_return from . import libc -__all__ = ["DummyUnifiedMemoryResource", "PatternGen", "make_scratch_buffer", "compare_equal_buffers"] +__all__ = [ + "compare_buffer_to_constant", + "compare_equal_buffers", + "DummyUnifiedMemoryResource", + "make_scratch_buffer", + "PatternGen", +] class DummyUnifiedMemoryResource(MemoryResource): @@ -103,11 +109,16 @@ def _get_pattern_buffer(self, seed, value): def make_scratch_buffer(device, value, nbytes): """Create a unified memory buffer with the specified value.""" buffer = DummyUnifiedMemoryResource(device).allocate(nbytes) - ptr = ctypes.cast(int(buffer.handle), ctypes.POINTER(ctypes.c_byte)) - ctypes.memset(ptr, value & 0xFF, nbytes) + set_buffer(buffer, value) return buffer +def set_buffer(buffer, value): + assert 0 <= int(value) < 256 + ptr = ctypes.cast(int(buffer.handle), ctypes.POINTER(ctypes.c_byte)) + ctypes.memset(ptr, value & 0xFF, buffer.size) + + def compare_equal_buffers(buffer1, buffer2): """Compare the contents of two host-accessible buffers for bitwise equality.""" if buffer1.size != buffer2.size: @@ -115,3 +126,17 @@ def compare_equal_buffers(buffer1, buffer2): ptr1 = ctypes.cast(int(buffer1.handle), ctypes.POINTER(ctypes.c_byte)) ptr2 = ctypes.cast(int(buffer2.handle), ctypes.POINTER(ctypes.c_byte)) return 
libc.memcmp(ptr1, ptr2, buffer1.size) == 0 + + +def compare_buffer_to_constant(buffer, value): + device_id = buffer.memory_resource.device_id + device = Device(device_id) + stream = device.create_stream() + expected = make_scratch_buffer(device, value, buffer.size) + tmp = make_scratch_buffer(device, 0, buffer.size) + tmp.copy_from(buffer, stream=stream) + stream.sync() + result = compare_equal_buffers(expected, tmp) + expected.close() + tmp.close() + return result diff --git a/cuda_core/tests/helpers/misc.py b/cuda_core/tests/helpers/misc.py new file mode 100644 index 000000000..33508091a --- /dev/null +++ b/cuda_core/tests/helpers/misc.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import Stream + + +class StreamWrapper: + """ + A wrapper around Stream for testing IsStreamT conversions. + """ + + def __init__(self, stream: Stream): + self._stream = stream + + def __cuda_stream__(self): + return self._stream.__cuda_stream__() + + def close(self): + self._stream.close() + + @property + def handle(self): + return self._stream.handle + + def sync(self): + return self._stream.sync() + + def __eq__(self, rhs): + return self._stream == Stream._init(rhs) + + def __hash__(self): + return hash(self._stream) diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py new file mode 100644 index 000000000..44a5be261 --- /dev/null +++ b/cuda_core/tests/test_graph_mem.py @@ -0,0 +1,254 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import pytest +from cuda.core.experimental import ( + Device, + DeviceMemoryResource, + GraphCompleteOptions, + GraphMemoryResource, + LaunchConfig, + Program, + ProgramOptions, + launch, +) +from helpers.buffers import compare_buffer_to_constant, make_scratch_buffer, set_buffer + + +def _common_kernels_alloc(): + code = """ + __global__ void set_zero(char *a, size_t nbytes) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = idx; i < nbytes; i += stride) { + a[i] = 0; + } + } + __global__ void add_one(char *a, size_t nbytes) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = idx; i < nbytes; i += stride) { + a[i] += 1; + } + } + """ + arch = "".join(f"{i}" for i in Device().compute_capability) + program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}") + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile("cubin", name_expressions=("set_zero", "add_one")) + return mod + + +class GraphMemoryTestManager: + """ + Manages changes to the state of the graph memory system, for testing. 
+ """ + + def __init__(self, gmr, stream, mode=None): + self.device = Device(gmr.device_id) + self.gmr = gmr + self.stream = stream + self.mode = "relaxed" if mode is None else mode + + def reset(self): + """Trim unused graph memory and reset usage statistics.""" + self.gmr.trim() + self.gmr.attributes.reserved_mem_high = 0 + self.gmr.attributes.used_mem_high = 0 + + def alloc(self, num, nbytes): + """Allocate num buffers of size nbytes from graph memory.""" + gb = self.device.create_graph_builder().begin_building(self.mode) + buffers = [self.gmr.allocate(nbytes, stream=gb) for _ in range(num)] + graph = gb.end_building().complete() + graph.upload(self.stream) + graph.launch(self.stream) + self.stream.sync() + return buffers + + def free(self, buffers): + """Free graph memory buffers.""" + for buffer in buffers: + buffer.close(stream=self.stream) + self.stream.sync() + + +@pytest.mark.parametrize("mode", ["no_graph", "global", "thread_local", "relaxed"]) +def test_graph_alloc(init_cuda, mode): + """Test basic graph capture with memory allocated and deallocated by GraphMemoryResource.""" + NBYTES = 64 + device = Device() + stream = device.create_stream() + dmr = DeviceMemoryResource(device) + gmr = GraphMemoryResource(device) + out = dmr.allocate(NBYTES, stream=stream) + + # Get kernels and define the calling sequence. + mod = _common_kernels_alloc() + set_zero = mod.get_kernel("set_zero") + add_one = mod.get_kernel("add_one") + + def apply_kernels(mr, stream, out): + buffer = mr.allocate(NBYTES, stream=stream) + config = LaunchConfig(grid=1, block=1) + for kernel in [set_zero, add_one, add_one]: + launch(stream, config, kernel, buffer, NBYTES) + out.copy_from(buffer, stream=stream) + buffer.close() + + # Apply kernels, with or without graph capture. + if mode == "no_graph": + # Do work without graph capture. + apply_kernels(mr=dmr, stream=stream, out=out) + else: + # Capture work, then upload and launch. 
+        gb = device.create_graph_builder().begin_building(mode)
+        apply_kernels(mr=gmr, stream=gb, out=out)
+        graph = gb.end_building().complete()
+        graph.upload(stream)
+        graph.launch(stream)
+
+    stream.sync()
+
+    # Check the result on the host.
+    assert compare_buffer_to_constant(out, 2)
+
+
+@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+def test_graph_alloc_with_output(init_cuda, mode):
+    """Test for memory allocated in a graph being used outside the graph."""
+    NBYTES = 64
+    device = Device()
+    stream = device.create_stream()
+    gmr = GraphMemoryResource(device)
+
+    # Get kernels and define the calling sequence.
+    mod = _common_kernels_alloc()
+    add_one = mod.get_kernel("add_one")
+
+    # Make an input of 0s.
+    in_ = make_scratch_buffer(device, 0, NBYTES)
+
+    # Construct a graph to copy and increment the input. It returns a new
+    # buffer allocated within the graph. The auto_free_on_launch option
+    # is required to properly use the output buffer.
+    gb = device.create_graph_builder().begin_building(mode)
+    out = gmr.allocate(NBYTES, gb)
+    out.copy_from(in_, stream=gb)
+    launch(gb, LaunchConfig(grid=1, block=1), add_one, out, NBYTES)
+    options = GraphCompleteOptions(auto_free_on_launch=True)
+    graph = gb.end_building().complete(options)
+
+    # Launch the graph. The output buffer is allocated and set to one.
+    graph.upload(stream)
+    graph.launch(stream)
+    stream.sync()
+    assert compare_buffer_to_constant(out, 1)
+
+    # Update the input buffer and rerun the graph.
+    set_buffer(in_, 5)
+    graph.upload(stream)
+    graph.launch(stream)
+    stream.sync()
+    assert compare_buffer_to_constant(out, 6)
+
+
+@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+def test_graph_mem_set_attributes(init_cuda, mode):
+    device = Device()
+    stream = device.create_stream()
+    gmr = GraphMemoryResource(device)
+    mman = GraphMemoryTestManager(gmr, stream, mode)
+
+    # Make an allocation and observe usage.
+ buffer = mman.alloc(1, 1024) + assert gmr.attributes.reserved_mem_current > 0 + assert gmr.attributes.reserved_mem_high > 0 + assert gmr.attributes.used_mem_current > 0 + assert gmr.attributes.used_mem_high > 0 + + # Incorrect attribute usage. + with pytest.raises(AttributeError, match=r"property 'reserved_mem_current' .* has no setter"): + gmr.attributes.reserved_mem_current = 0 + + with pytest.raises(AttributeError, match=r"Attribute 'reserved_mem_high' may only be set to zero \(got 1\)\."): + gmr.attributes.reserved_mem_high = 1 + + with pytest.raises(AttributeError, match=r"property 'used_mem_current' .* has no setter"): + gmr.attributes.used_mem_current = 0 + + with pytest.raises(AttributeError, match=r"Attribute 'used_mem_high' may only be set to zero \(got 1\)\."): + gmr.attributes.used_mem_high = 1 + + # Free memory, but usage is not reduced yet. + mman.free(buffer) + assert gmr.attributes.reserved_mem_current > 0 + assert gmr.attributes.reserved_mem_high > 0 + assert gmr.attributes.used_mem_current > 0 + assert gmr.attributes.used_mem_high > 0 + + gmr.trim() + + # The high-water marks remain after free and trim. + assert gmr.attributes.reserved_mem_current == 0 + assert gmr.attributes.reserved_mem_high > 0 + assert gmr.attributes.used_mem_current == 0 + assert gmr.attributes.used_mem_high > 0 + + # Reset the high-water marks. + gmr.attributes.reserved_mem_high = 0 + gmr.attributes.used_mem_high = 0 + + assert gmr.attributes.reserved_mem_high == 0 + assert gmr.attributes.used_mem_high == 0 + + mman.reset() + + +@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) +def test_gmr_check_capture_state(init_cuda, mode): + """ + Test expected errors (and non-errors) using GraphMemoryResource with graph + capture. 
+ """ + device = Device() + stream = device.create_stream() + gmr = GraphMemoryResource(device) + + # Not capturing + with pytest.raises( + RuntimeError, + match=r"GraphMemoryResource cannot perform memory operations on a " + r"non-capturing stream\.", + ): + gmr.allocate(1, stream=stream) + + # Capturing + gb = device.create_graph_builder().begin_building(mode=mode) + gmr.allocate(1, stream=gb) # no error + gb.end_building().complete() + + +@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) +def test_dmr_check_capture_state(init_cuda, mode): + """ + Test expected errors (and non-errors) using DeviceMemoryResource with graph + capture. + """ + device = Device() + stream = device.create_stream() + dmr = DeviceMemoryResource(device) + + # Not capturing + dmr.allocate(1, stream=stream).close() # no error + + # Capturing + gb = device.create_graph_builder().begin_building(mode=mode) + with pytest.raises( + RuntimeError, + match=r"DeviceMemoryResource cannot perform memory operations on a capturing " + r"stream \(consider using GraphMemoryResource\)\.", + ): + dmr.allocate(1, stream=gb) + gb.end_building().complete() diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index a951fc418..f2fd64344 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -22,6 +22,7 @@ ) from cuda.core.experimental._memory import _SynchronousMemoryResource from cuda.core.experimental._utils.cuda_utils import CUDAError +from helpers.misc import StreamWrapper from conftest import skipif_need_cuda_headers @@ -179,9 +180,10 @@ def test_launch_invalid_values(init_cuda): ) +@pytest.mark.parametrize("wrap_stream", [True, False]) @pytest.mark.parametrize("python_type, cpp_type, init_value", PARAMS) @pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+") -def test_launch_scalar_argument(python_type, cpp_type, init_value): +def 
test_launch_scalar_argument(python_type, cpp_type, init_value, wrap_stream): dev = Device() dev.set_current() @@ -219,19 +221,25 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value): ker = mod.get_kernel(ker_name) # Launch with 1 thread + stream = dev.default_stream + if wrap_stream: + stream = StreamWrapper(stream) config = LaunchConfig(grid=1, block=1) - launch(dev.default_stream, config, ker, arr.ctypes.data, scalar) - dev.default_stream.sync() + launch(stream, config, ker, arr.ctypes.data, scalar) + stream.sync() # Check result assert arr[0] == init_value, f"Expected {init_value}, got {arr[0]}" @skipif_need_cuda_headers # cg -def test_cooperative_launch(): +@pytest.mark.parametrize("wrap_stream", [True, False]) +def test_cooperative_launch(wrap_stream): dev = Device() dev.set_current() s = dev.create_stream(options={"nonblocking": True}) + if wrap_stream: + s = StreamWrapper(s) # CUDA kernel templated on type T code = r""" @@ -272,6 +280,7 @@ def test_cooperative_launch(): @pytest.mark.skipif(cp is None, reason="cupy not installed") +@pytest.mark.parametrize("wrap_stream", [True, False]) @pytest.mark.parametrize( "memory_resource_class", [ @@ -285,11 +294,13 @@ def test_cooperative_launch(): ), ], ) -def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_resource_class): +def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_resource_class, wrap_stream): """Test that kernels can access memory allocated by memory resources.""" dev = Device() dev.set_current() stream = dev.create_stream() + if wrap_stream: + stream = StreamWrapper(stream) # tell CuPy to use our stream as the current stream: cp.cuda.ExternalStream(int(stream.handle)).use() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index a261ec7a3..5ca4e4121 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -13,6 +13,7 @@ np = None import ctypes import platform +import re import pytest 
from cuda.core.experimental import ( @@ -20,6 +21,7 @@ Device, DeviceMemoryResource, DeviceMemoryResourceOptions, + GraphMemoryResource, MemoryResource, VirtualMemoryResource, VirtualMemoryResourceOptions, @@ -29,6 +31,7 @@ from cuda.core.experimental._utils.cuda_utils import handle_return from cuda.core.experimental.utils import StridedMemoryView from helpers.buffers import DummyUnifiedMemoryResource +from helpers.misc import StreamWrapper from cuda_python_test_helpers import supports_ipc_mempool @@ -133,6 +136,7 @@ def test_package_contents(): "MemoryResource", "DeviceMemoryResource", "DeviceMemoryResourceOptions", + "GraphMemoryResource", "IPCBufferDescriptor", "IPCAllocationHandle", "LegacyPinnedMemoryResource", @@ -164,10 +168,12 @@ def test_buffer_initialization(): buffer_initialization(DummyPinnedMemoryResource(device)) -def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): +def buffer_copy_to(dummy_mr: MemoryResource, device: Device, wrap_stream, check=False): src_buffer = dummy_mr.allocate(size=1024) dst_buffer = dummy_mr.allocate(size=1024) stream = device.create_stream() + if wrap_stream: + stream = StreamWrapper(stream) if check: src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) @@ -187,18 +193,21 @@ def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): src_buffer.close() -def test_buffer_copy_to(): +@pytest.mark.parametrize("wrap_stream", [True, False]) +def test_buffer_copy_to(wrap_stream): device = Device() device.set_current() - buffer_copy_to(DummyDeviceMemoryResource(device), device) - buffer_copy_to(DummyUnifiedMemoryResource(device), device) - buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True) + buffer_copy_to(DummyDeviceMemoryResource(device), device, wrap_stream) + buffer_copy_to(DummyUnifiedMemoryResource(device), device, wrap_stream) + buffer_copy_to(DummyPinnedMemoryResource(device), device, wrap_stream, check=True) -def buffer_copy_from(dummy_mr: 
MemoryResource, device, check=False): +def buffer_copy_from(dummy_mr: MemoryResource, device, wrap_stream, check=False): src_buffer = dummy_mr.allocate(size=1024) dst_buffer = dummy_mr.allocate(size=1024) stream = device.create_stream() + if wrap_stream: + stream = StreamWrapper(stream) if check: src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) @@ -218,12 +227,13 @@ def buffer_copy_from(dummy_mr: MemoryResource, device, check=False): src_buffer.close() -def test_buffer_copy_from(): +@pytest.mark.parametrize("wrap_stream", [True, False]) +def test_buffer_copy_from(wrap_stream): device = Device() device.set_current() - buffer_copy_from(DummyDeviceMemoryResource(device), device) - buffer_copy_from(DummyUnifiedMemoryResource(device), device) - buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True) + buffer_copy_from(DummyDeviceMemoryResource(device), device, wrap_stream) + buffer_copy_from(DummyUnifiedMemoryResource(device), device, wrap_stream) + buffer_copy_from(DummyPinnedMemoryResource(device), device, wrap_stream, check=True) def buffer_close(dummy_mr: MemoryResource): @@ -286,13 +296,18 @@ def test_buffer_dunder_dlpack_device_failure(): @pytest.mark.parametrize("use_device_object", [True, False]) -def test_device_memory_resource_initialization(mempool_device, use_device_object): +def test_device_memory_resource_initialization(use_device_object): """Test that DeviceMemoryResource can be initialized successfully. This test verifies that the DeviceMemoryResource initializes properly, including the release threshold configuration for performance optimization. """ - device = mempool_device + device = Device() + + if not device.properties.memory_pools_supported: + pytest.skip("Device does not support mempool operations") + + device.set_current() # This should succeed and configure the memory pool release threshold. # The resource can be constructed from either a device or device ordinal. 
@@ -481,11 +496,16 @@ def test_vmm_allocator_rdma_unsupported_exception(): VirtualMemoryResource(device, config=options) -def test_mempool(mempool_device): - device = mempool_device +def test_device_memory_resource(): + device = Device() + + if not device.properties.memory_pools_supported: + pytest.skip("Device does not support mempool operations") + + device.set_current() # Test basic pool creation - options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE) mr = DeviceMemoryResource(device, options=options) assert mr.device_id == device.device_id assert mr.is_device_accessible @@ -513,18 +533,26 @@ def test_mempool(mempool_device): buffer = mr.allocate(1024, stream=stream) assert buffer.handle != 0 buffer.close() + buffer = mr.allocate(1024, stream=StreamWrapper(stream)) + assert buffer.handle != 0 + buffer.close() # Test memory copying between buffers from same pool src_buffer = mr.allocate(64) dst_buffer = mr.allocate(64) stream = device.create_stream() src_buffer.copy_to(dst_buffer, stream=stream) + src_buffer.copy_to(dst_buffer, stream=StreamWrapper(stream)) device.sync() dst_buffer.close() src_buffer.close() - # Test error cases - # Test IPC operations are disabled + +def test_mempool_ipc_errors(mempool_device): + """Test error cases when IPC operations are disabled.""" + device = mempool_device + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False) + mr = DeviceMemoryResource(device, options=options) buffer = mr.allocate(64) ipc_error_msg = "Memory resource is not IPC-enabled" @@ -599,6 +627,22 @@ def test_mempool_attributes(ipc_enabled, mempool_device, property_name, expected assert value >= current_value, f"{property_name} should be >= {current_prop}" +def test_mempool_attributes_repr(mempool_device): + device = Device() + device.set_current() + mr = DeviceMemoryResource(device, options={"max_size": 2048}) + buffer1 = mr.allocate(64) + buffer2 = 
mr.allocate(64) + buffer1.close() + assert re.match( + r"DeviceMemoryResourceAttributes\(release_threshold=\d+, reserved_mem_current=\d+, reserved_mem_high=\d+, " + r"reuse_allow_internal_dependencies=(True|False), reuse_allow_opportunistic=(True|False), " + r"reuse_follow_event_dependencies=(True|False), used_mem_current=64, used_mem_high=128\)", + str(mr.attributes), + ) + buffer2.close() + + def test_mempool_attributes_ownership(mempool_device): """Ensure the attributes bundle handles references correctly.""" device = mempool_device @@ -644,3 +688,14 @@ def test_strided_memory_view_refcnt(): assert av.strides[0] == 1 assert av.strides[1] == 64 assert sys.getrefcount(av.strides) >= 2 + + +def test_graph_memory_resource_object(init_cuda): + device = Device() + gmr1 = GraphMemoryResource(device) + gmr2 = GraphMemoryResource(device) + gmr3 = GraphMemoryResource(device.device_id) + + # These objects are interned. + assert gmr1 is gmr2 is gmr3 + assert gmr1 == gmr2 == gmr3 diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index c8165548f..2e10fb100 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -6,6 +6,7 @@ from cuda.core.experimental._event import Event from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM from cuda.core.experimental._utils.cuda_utils import driver +from helpers.misc import StreamWrapper def test_stream_init_disabled(): @@ -76,9 +77,12 @@ def test_stream_context(init_cuda): assert context._handle is not None -def test_stream_from_foreign_stream(init_cuda): +@pytest.mark.parametrize("wrap_stream", [True, False]) +def test_stream_from_foreign_stream(init_cuda, wrap_stream): device = Device() other_stream = device.create_stream(options=StreamOptions()) + if wrap_stream: + other_stream = StreamWrapper(other_stream) stream = device.create_stream(obj=other_stream) # Now that __eq__ is implemented (issue #664), we can compare directly assert other_stream == 
stream