diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 8a60c031c..67402e10a 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -42,6 +42,7 @@ Buffer, DeviceMemoryResource, DeviceMemoryResourceOptions, + GraphMemoryResource, LegacyPinnedMemoryResource, MemoryResource, VirtualMemoryResource, diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index bc6167793..833c74c0a 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -9,7 +9,7 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN import threading -from typing import Union, TYPE_CHECKING +from typing import Optional, TYPE_CHECKING, Union from cuda.core.experimental._context import Context, ContextOptions from cuda.core.experimental._event import Event, EventOptions @@ -1242,7 +1242,7 @@ class Device: """ raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") - def create_stream(self, obj: IsStreamT | None = None, options: StreamOptions | None = None) -> Stream: + def create_stream(self, obj: Optional[IsStreamT] = None, options: StreamOptions | None = None) -> Stream: """Create a Stream object. New stream objects can be created in two different ways: @@ -1295,7 +1295,7 @@ class Device: ctx = self._get_current_context() return Event._init(self._id, ctx, options, True) - def allocate(self, size, stream: Stream | None = None) -> Buffer: + def allocate(self, size, stream: Optional[IsStreamT] = None) -> Buffer: """Allocate device memory from a specified stream. Allocates device memory of `size` bytes on the specified `stream` @@ -1311,7 +1311,7 @@ class Device: ---------- size : int Number of bytes to allocate. 
- stream : :obj:`~_stream.Stream`, optional + stream : :obj:`~_stream.IsStreamT`, optional The stream establishing the stream ordering semantic. Default value of `None` uses default stream. diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/experimental/_launcher.pyx index a06d885ff..b94c3d2b7 100644 --- a/cuda_core/cuda/core/experimental/_launcher.pyx +++ b/cuda_core/cuda/core/experimental/_launcher.pyx @@ -2,12 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport uintptr_t - -from cuda.core.experimental._stream cimport _try_to_get_stream_ptr - -from typing import Union - from cuda.core.experimental._kernel_arg_handler import ParamHolder from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config from cuda.core.experimental._module import Kernel @@ -39,13 +33,13 @@ def _lazy_init(): _inited = True -def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kernel, *kernel_args): +def launch(stream: IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args): """Launches a :obj:`~_module.Kernel` object with launch-time configuration. Parameters ---------- - stream : :obj:`~_stream.Stream` + stream : :obj:`~_stream.IsStreamT` The stream establishing the stream ordering semantic of a launch. config : :obj:`LaunchConfig` @@ -58,17 +52,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne launching kernel. 
""" - if stream is None: - raise ValueError("stream cannot be None, stream must either be a Stream object or support __cuda_stream__") - try: - stream_handle = stream.handle - except AttributeError: - try: - stream_handle = driver.CUstream((_try_to_get_stream_ptr(stream))) - except Exception: - raise ValueError( - f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})" - ) from None + stream = Stream._init(stream) assert_type(kernel, Kernel) _lazy_init() config = check_or_create_options(LaunchConfig, config, "launch config") @@ -85,7 +69,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne # rich. if _use_ex: drv_cfg = _to_native_launch_config(config) - drv_cfg.hStream = stream_handle + drv_cfg.hStream = stream.handle if config.cooperative_launch: _check_cooperative_launch(kernel, config, stream) handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0)) @@ -93,12 +77,12 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne # TODO: check if config has any unsupported attrs handle_return( driver.cuLaunchKernel( - int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream_handle, args_ptr, 0 + int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0 ) ) -def _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream): +cdef _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream): dev = stream.device num_sm = dev.properties.multiprocessor_count max_grid_size = ( diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 3c07fbdde..20b90d7fd 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -4,6 +4,7 @@ from ._buffer import * # noqa: F403 from ._device_memory_resource import * # noqa: F403 +from 
._graph_memory_resource import * # noqa: F403 from ._ipc import * # noqa: F403 from ._legacy import * # noqa: F403 from ._virtual_memory_resource import * # noqa: F403 diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index 225127274..f18d5ec02 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -15,9 +15,10 @@ from cuda.core.experimental._utils.cuda_utils cimport ( ) import abc -from typing import TypeVar, Union +from typing import Optional, TypeVar, Union from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule +from cuda.core.experimental._stream import IsStreamT from cuda.core.experimental._utils.cuda_utils import driver __all__ = ['Buffer', 'MemoryResource'] @@ -116,7 +117,7 @@ cdef class Buffer: """ Buffer_close(self, stream) - def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: + def copy_to(self, dst: Buffer = None, *, stream: IsStreamT) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. Copies the data from this buffer to the provided dst buffer. @@ -127,13 +128,12 @@ cdef class Buffer: ---------- dst : :obj:`~_memory.Buffer` Source buffer to copy data from - stream : Stream + stream : IsStreamT Keyword argument specifying the stream for the asynchronous copy """ - if stream is None: - raise ValueError("stream must be provided") + stream = Stream._init(stream) cdef size_t src_size = self._size @@ -152,20 +152,19 @@ cdef class Buffer: raise_if_driver_error(err) return dst - def copy_from(self, src: Buffer, *, stream: Stream): + def copy_from(self, src: Buffer, *, stream: IsStreamT): """Copy from the src buffer to this buffer asynchronously on the given stream. 
Parameters ---------- src : :obj:`~_memory.Buffer` Source buffer to copy data from - stream : Stream + stream : IsStreamT Keyword argument specifying the stream for the asynchronous copy """ - if stream is None: - raise ValueError("stream must be provided") + stream = Stream._init(stream) cdef size_t dst_size = self._size cdef size_t src_size = src._size @@ -274,7 +273,7 @@ cdef class Buffer: # Buffer Implementation # --------------------- -cdef Buffer_close(Buffer self, stream): +cdef inline void Buffer_close(Buffer self, stream): cdef Stream s if self._ptr and self._memory_resource is not None: if stream is None: @@ -305,14 +304,14 @@ cdef class MemoryResource: """ @abc.abstractmethod - def allocate(self, size_t size, stream: Stream = None) -> Buffer: + def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer: """Allocate a buffer of the requested size. Parameters ---------- size : int The size of the buffer to allocate, in bytes. - stream : Stream, optional + stream : IsStreamT, optional The stream on which to perform the allocation asynchronously. If None, it is up to each memory resource implementation to decide and document the behavior. @@ -326,7 +325,7 @@ cdef class MemoryResource: ... @abc.abstractmethod - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None): """Deallocate a buffer previously allocated by this resource. Parameters @@ -335,7 +334,7 @@ cdef class MemoryResource: The pointer or handle to the buffer to deallocate. size : int The size of the buffer to deallocate, in bytes. - stream : Stream, optional + stream : IsStreamT, optional The stream on which to perform the deallocation asynchronously. If None, it is up to each memory resource implementation to decide and document the behavior. 
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index 47b6fd114..b354d595c 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -49,8 +49,8 @@ cdef class DeviceMemoryResourceOptions: Maximum pool size. When set to 0, defaults to a system-dependent value. (Default to 0) """ - ipc_enabled : cython.bint = False - max_size : cython.size_t = 0 + ipc_enabled: cython.bint = False + max_size : cython.size_t = 0 cdef class DeviceMemoryResourceAttributes: @@ -66,6 +66,12 @@ cdef class DeviceMemoryResourceAttributes: self._mr_weakref = mr return self + def __repr__(self): + return f"{self.__class__.__name__}(%s)" % ", ".join( + f"{attr}={getattr(self, attr)}" for attr in dir(self) + if not attr.startswith("_") + ) + @DMRA_mempool_attribute(bool) def reuse_follow_event_dependencies(self): """Allow memory to be reused when there are event dependencies between streams.""" @@ -127,7 +133,7 @@ cdef int DMRA_getattribute( cdef class DeviceMemoryResource(MemoryResource): """ - Create a device memory resource managing a stream-ordered memory pool. + A device memory resource managing a stream-ordered memory pool. Parameters ---------- @@ -302,14 +308,14 @@ cdef class DeviceMemoryResource(MemoryResource): raise RuntimeError("Imported memory resource cannot be exported") return self._ipc_data._alloc_handle - def allocate(self, size_t size, stream: Stream = None) -> Buffer: + def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer: """Allocate a buffer of the requested size. Parameters ---------- size : int The size of the buffer to allocate, in bytes. - stream : Stream, optional + stream : IsStreamT, optional The stream on which to perform the allocation asynchronously. If None, an internal stream is used. 
@@ -321,11 +327,10 @@ cdef class DeviceMemoryResource(MemoryResource): """ if self.is_mapped: raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") - if stream is None: - stream = default_stream() - return DMR_allocate(self, size, stream) + stream = Stream._init(stream) if stream is not None else default_stream() + return DMR_allocate(self, size, stream) - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None): """Deallocate a buffer previously allocated by this resource. Parameters @@ -334,15 +339,17 @@ cdef class DeviceMemoryResource(MemoryResource): The pointer or handle to the buffer to deallocate. size : int The size of the buffer to deallocate, in bytes. - stream : Stream, optional + stream : IsStreamT, optional The stream on which to perform the deallocation asynchronously. If the buffer is deallocated without an explicit stream, the allocation stream is used. """ - DMR_deallocate(self, ptr, size, stream) + stream = Stream._init(stream) if stream is not None else default_stream() + DMR_deallocate(self, ptr, size, stream) @property def attributes(self) -> DeviceMemoryResourceAttributes: + """Memory pool attributes.""" if self._attributes is None: ref = weakref.ref(self) self._attributes = DeviceMemoryResourceAttributes._init(ref) @@ -460,10 +467,21 @@ cdef void DMR_init_create( self._ipc_data = IPCData(alloc_handle, mapped=False) -cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): +# Raise an exception if the given stream is capturing. +# A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error. 
+cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil: + cdef cydriver.CUstreamCaptureStatus capturing + HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing)) + if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_NONE: + raise RuntimeError("DeviceMemoryResource cannot perform memory operations on " + "a capturing stream (consider using GraphMemoryResource).") + + +cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr with nogil: + check_not_capturing(s) HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) cdef Buffer buf = Buffer.__new__(Buffer) buf._ptr = (devptr) @@ -474,7 +492,7 @@ cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): return buf -cdef void DMR_deallocate( +cdef inline void DMR_deallocate( DeviceMemoryResource self, uintptr_t ptr, size_t size, Stream stream ) noexcept: cdef cydriver.CUstream s = stream._handle @@ -483,7 +501,7 @@ cdef void DMR_deallocate( HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) -cdef DMR_close(DeviceMemoryResource self): +cdef inline DMR_close(DeviceMemoryResource self): if self._handle == NULL: return diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd new file mode 100644 index 000000000..f9c7798e7 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental._memory._buffer cimport MemoryResource + + +cdef class cyGraphMemoryResource(MemoryResource): + cdef: + int _dev_id diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx new file mode 100644 index 000000000..6fbb6088f --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx @@ -0,0 +1,213 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport uintptr_t, intptr_t, uint64_t + +from cuda.bindings cimport cydriver +from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource +from cuda.core.experimental._stream cimport default_stream, Stream +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN + +from functools import cache +from typing import Optional, TYPE_CHECKING + +from cuda.core.experimental._utils.cuda_utils import driver + +if TYPE_CHECKING: + from cuda.core.experimental._memory.buffer import DevicePointerT + +__all__ = ['GraphMemoryResource'] + + +cdef class GraphMemoryResourceAttributes: + cdef: + int _dev_id + + def __init__(self, *args, **kwargs): + raise RuntimeError("GraphMemoryResourceAttributes cannot be instantiated directly. 
Please use MemoryResource APIs.") + + @classmethod + def _init(cls, device_id: int): + cdef GraphMemoryResourceAttributes self = GraphMemoryResourceAttributes.__new__(cls) + self._dev_id = device_id + return self + + def __repr__(self): + return f"{self.__class__.__name__}(%s)" % ", ".join( + f"{attr}={getattr(self, attr)}" for attr in dir(self) + if not attr.startswith("_") + ) + + @GMRA_mem_attribute(int) + def reserved_mem_current(self): + """Current amount of backing memory allocated.""" + + @GMRA_mem_attribute(int, settable=True) + def reserved_mem_high(self): + """ + High watermark of backing memory allocated. It can be set to zero to + reset it to the current usage. + """ + + @GMRA_mem_attribute(int) + def used_mem_current(self): + """Current amount of memory in use.""" + + @GMRA_mem_attribute(int, settable=True) + def used_mem_high(self): + """ + High watermark of memory in use. It can be set to zero to reset it to + the current usage. + """ + + +cdef GMRA_mem_attribute(property_type: type, settable: bool = False): + _settable = settable + + def decorator(stub): + attr_enum = getattr( + driver.CUgraphMem_attribute, f"CU_GRAPH_MEM_ATTR_{stub.__name__.upper()}" + ) + + def fget(GraphMemoryResourceAttributes self) -> property_type: + value = GMRA_getattribute(self._dev_id, attr_enum) + return property_type(value) + + if _settable: + def fset(GraphMemoryResourceAttributes self, uint64_t value): + if value != 0: + raise AttributeError(f"Attribute {stub.__name__!r} may only be set to zero (got {value}).") + GMRA_setattribute(self._dev_id, attr_enum) + else: + fset = None + + return property(fget=fget, fset=fset, doc=stub.__doc__) + return decorator + + +cdef inline uint64_t GMRA_getattribute(int device_id, cydriver.CUgraphMem_attribute attr_enum): + cdef uint64_t value + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(device_id, attr_enum, &value)) + return value + + +cdef inline void GMRA_setattribute(int device_id, 
cydriver.CUgraphMem_attribute attr_enum): + cdef uint64_t zero = 0 + with nogil: + HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(device_id, attr_enum, &zero)) + + +cdef class cyGraphMemoryResource(MemoryResource): + def __cinit__(self, int device_id): + self._dev_id = device_id + + def allocate(self, size_t size, stream: Optional[IsStreamT] = None) -> Buffer: + """ + Allocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`. + """ + stream = Stream._init(stream) if stream is not None else default_stream() + return GMR_allocate(self, size, stream) + + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Optional[IsStreamT] = None): + """ + Deallocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`. + """ + stream = Stream._init(stream) if stream is not None else default_stream() + return GMR_deallocate(ptr, size, stream) + + def close(self): + """No operation (provided for compatibility).""" + pass + + def trim(self): + """Free unused memory that was cached on the specified device for use with graphs back to the OS.""" + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGraphMemTrim(self._dev_id)) + + @property + def attributes(self) -> GraphMemoryResourceAttributes: + """Asynchronous allocation attributes related to graphs.""" + return GraphMemoryResourceAttributes._init(self._dev_id) + + @property + def device_id(self) -> int: + """The associated device ordinal.""" + return self._dev_id + + @property + def is_device_accessible(self) -> bool: + """Return True. This memory resource provides device-accessible buffers.""" + return True + + @property + def is_host_accessible(self) -> bool: + """Return False. This memory resource does not provide host-accessible buffers.""" + return False + + +class GraphMemoryResource(cyGraphMemoryResource): + """ + A memory resource managing the graph-specific memory pool. 
+ + Graph-captured memory operations use a special internal memory pool, which + is a per-device singleton. This class serves as the interface to that pool. + The only supported operations are allocation, deallocation, and a limited + set of status queries. + + This memory resource should be used to allocate memory when graph capturing + is enabled. Using this when graphs are not being captured will result in a + runtime error. + + Conversely, allocating memory from a `DeviceMemoryResource` when graph + capturing is enabled results in a runtime error. + + Parameters + ---------- + device_id: int | Device + Device or Device ordinal for which a graph memory resource is obtained. + """ + + def __new__(cls, device_id: int | Device): + cdef int c_device_id = getattr(device_id, 'device_id', device_id) + return cls._create(c_device_id) + + @classmethod + @cache + def _create(cls, int device_id): + return cyGraphMemoryResource.__new__(cls, device_id) + + +# Raise an exception if the given stream is not actively capturing. +# A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error. 
+cdef inline int check_capturing(cydriver.CUstream s) except?-1 nogil: + cdef cydriver.CUstreamCaptureStatus capturing + HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing)) + if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE: + raise RuntimeError("GraphMemoryResource cannot perform memory operations on " + "a non-capturing stream.") + + +cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream): + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr + with nogil: + check_capturing(s) + HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s)) + cdef Buffer buf = Buffer.__new__(Buffer) + buf._ptr = (devptr) + buf._ptr_obj = None + buf._size = size + buf._memory_resource = self + buf._alloc_stream = stream + return buf + + +cdef inline void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept: + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr = ptr + with nogil: + HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index 8f382e5d0..68a410d1e 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -5,9 +5,6 @@ from cuda.bindings cimport cydriver -cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except* - - cdef class Stream: cdef: diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index cdc4651bd..c9192c5ba 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -62,36 +62,6 @@ class IsStreamT(Protocol): ... 
-cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except*: - try: - cuda_stream_attr = obj.__cuda_stream__ - except AttributeError: - raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") from None - - if callable(cuda_stream_attr): - info = cuda_stream_attr() - else: - info = cuda_stream_attr - warnings.simplefilter("once", DeprecationWarning) - warnings.warn( - "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method", - stacklevel=3, - category=DeprecationWarning, - ) - - try: - len_info = len(info) - except TypeError as e: - raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}") from e - if len_info != 2: - raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements") - if info[0] != 0: - raise RuntimeError( - f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" - ) - return (info[1]) - - cdef class Stream: """Represent a queue of GPU operations that are executed in a specific order. 
@@ -139,12 +109,15 @@ cdef class Stream: @classmethod def _init(cls, obj: Optional[IsStreamT] = None, options=None, device_id: int = None): + if isinstance(obj, Stream): + return obj + cdef Stream self = Stream.__new__(cls) if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - self._handle = _try_to_get_stream_ptr(obj) + self._handle = _handle_from_stream_t(obj) # TODO: check if obj is created under the current context/device self._owner = obj return self @@ -445,3 +418,36 @@ cpdef Stream default_stream(): return C_PER_THREAD_DEFAULT_STREAM else: return C_LEGACY_DEFAULT_STREAM + + +cdef cydriver.CUstream _handle_from_stream_t(obj) except*: + if isinstance(obj, Stream): + return (obj.handle) + + try: + cuda_stream_attr = obj.__cuda_stream__ + except AttributeError: + raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") from None + + if callable(cuda_stream_attr): + info = cuda_stream_attr() + else: + info = cuda_stream_attr + warnings.simplefilter("once", DeprecationWarning) + warnings.warn( + "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method", + stacklevel=3, + category=DeprecationWarning, + ) + + try: + len_info = len(info) + except TypeError as e: + raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}") from e + if len_info != 2: + raise RuntimeError(f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements") + if info[0] != 0: + raise RuntimeError( + f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" + ) + return (info[1]) diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index ad6da14da..0e7520249 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ 
b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -14,7 +14,7 @@ ctypedef fused supported_error_type: # mimic CU_DEVICE_INVALID -cdef cydriver.CUcontext CU_CONTEXT_INVALID = (-2) +cdef const cydriver.CUcontext CU_CONTEXT_INVALID = (-2) cdef cydriver.CUdevice get_device_from_ctx( diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py index 972e83e13..b4d769eab 100644 --- a/cuda_core/tests/helpers/buffers.py +++ b/cuda_core/tests/helpers/buffers.py @@ -3,12 +3,18 @@ import ctypes -from cuda.core.experimental import Buffer, MemoryResource +from cuda.core.experimental import Buffer, Device, MemoryResource from cuda.core.experimental._utils.cuda_utils import driver, handle_return from . import libc -__all__ = ["DummyUnifiedMemoryResource", "PatternGen", "make_scratch_buffer", "compare_equal_buffers"] +__all__ = [ + "compare_buffer_to_constant", + "compare_equal_buffers", + "DummyUnifiedMemoryResource", + "make_scratch_buffer", + "PatternGen", +] class DummyUnifiedMemoryResource(MemoryResource): @@ -103,11 +109,16 @@ def _get_pattern_buffer(self, seed, value): def make_scratch_buffer(device, value, nbytes): """Create a unified memory buffer with the specified value.""" buffer = DummyUnifiedMemoryResource(device).allocate(nbytes) - ptr = ctypes.cast(int(buffer.handle), ctypes.POINTER(ctypes.c_byte)) - ctypes.memset(ptr, value & 0xFF, nbytes) + set_buffer(buffer, value) return buffer +def set_buffer(buffer, value): + assert 0 <= int(value) < 256 + ptr = ctypes.cast(int(buffer.handle), ctypes.POINTER(ctypes.c_byte)) + ctypes.memset(ptr, value & 0xFF, buffer.size) + + def compare_equal_buffers(buffer1, buffer2): """Compare the contents of two host-accessible buffers for bitwise equality.""" if buffer1.size != buffer2.size: @@ -115,3 +126,17 @@ def compare_equal_buffers(buffer1, buffer2): ptr1 = ctypes.cast(int(buffer1.handle), ctypes.POINTER(ctypes.c_byte)) ptr2 = ctypes.cast(int(buffer2.handle), ctypes.POINTER(ctypes.c_byte)) return 
libc.memcmp(ptr1, ptr2, buffer1.size) == 0 + + +def compare_buffer_to_constant(buffer, value): + device_id = buffer.memory_resource.device_id + device = Device(device_id) + stream = device.create_stream() + expected = make_scratch_buffer(device, value, buffer.size) + tmp = make_scratch_buffer(device, 0, buffer.size) + tmp.copy_from(buffer, stream=stream) + stream.sync() + result = compare_equal_buffers(expected, tmp) + expected.close() + tmp.close() + return result diff --git a/cuda_core/tests/helpers/misc.py b/cuda_core/tests/helpers/misc.py new file mode 100644 index 000000000..33508091a --- /dev/null +++ b/cuda_core/tests/helpers/misc.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import Stream + + +class StreamWrapper: + """ + A wrapper around Stream for testing IsStreamT conversions. + """ + + def __init__(self, stream: Stream): + self._stream = stream + + def __cuda_stream__(self): + return self._stream.__cuda_stream__() + + def close(self): + self._stream.close() + + @property + def handle(self): + return self._stream.handle + + def sync(self): + return self._stream.sync() + + def __eq__(self, rhs): + return self._stream == Stream._init(rhs) + + def __hash__(self): + return hash(self._stream) diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py new file mode 100644 index 000000000..44a5be261 --- /dev/null +++ b/cuda_core/tests/test_graph_mem.py @@ -0,0 +1,254 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import pytest +from cuda.core.experimental import ( + Device, + DeviceMemoryResource, + GraphCompleteOptions, + GraphMemoryResource, + LaunchConfig, + Program, + ProgramOptions, + launch, +) +from helpers.buffers import compare_buffer_to_constant, make_scratch_buffer, set_buffer + + +def _common_kernels_alloc(): + code = """ + __global__ void set_zero(char *a, size_t nbytes) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = idx; i < nbytes; i += stride) { + a[i] = 0; + } + } + __global__ void add_one(char *a, size_t nbytes) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = idx; i < nbytes; i += stride) { + a[i] += 1; + } + } + """ + arch = "".join(f"{i}" for i in Device().compute_capability) + program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}") + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile("cubin", name_expressions=("set_zero", "add_one")) + return mod + + +class GraphMemoryTestManager: + """ + Manages changes to the state of the graph memory system, for testing. 
+ """ + + def __init__(self, gmr, stream, mode=None): + self.device = Device(gmr.device_id) + self.gmr = gmr + self.stream = stream + self.mode = "relaxed" if mode is None else mode + + def reset(self): + """Trim unused graph memory and reset usage statistics.""" + self.gmr.trim() + self.gmr.attributes.reserved_mem_high = 0 + self.gmr.attributes.used_mem_high = 0 + + def alloc(self, num, nbytes): + """Allocate num buffers of size nbytes from graph memory.""" + gb = self.device.create_graph_builder().begin_building(self.mode) + buffers = [self.gmr.allocate(nbytes, stream=gb) for _ in range(num)] + graph = gb.end_building().complete() + graph.upload(self.stream) + graph.launch(self.stream) + self.stream.sync() + return buffers + + def free(self, buffers): + """Free graph memory buffers.""" + for buffer in buffers: + buffer.close(stream=self.stream) + self.stream.sync() + + +@pytest.mark.parametrize("mode", ["no_graph", "global", "thread_local", "relaxed"]) +def test_graph_alloc(init_cuda, mode): + """Test basic graph capture with memory allocated and deallocated by GraphMemoryResource.""" + NBYTES = 64 + device = Device() + stream = device.create_stream() + dmr = DeviceMemoryResource(device) + gmr = GraphMemoryResource(device) + out = dmr.allocate(NBYTES, stream=stream) + + # Get kernels and define the calling sequence. + mod = _common_kernels_alloc() + set_zero = mod.get_kernel("set_zero") + add_one = mod.get_kernel("add_one") + + def apply_kernels(mr, stream, out): + buffer = mr.allocate(NBYTES, stream=stream) + config = LaunchConfig(grid=1, block=1) + for kernel in [set_zero, add_one, add_one]: + launch(stream, config, kernel, buffer, NBYTES) + out.copy_from(buffer, stream=stream) + buffer.close() + + # Apply kernels, with or without graph capture. + if mode == "no_graph": + # Do work without graph capture. + apply_kernels(mr=dmr, stream=stream, out=out) + else: + # Capture work, then upload and launch. 
+        gb = device.create_graph_builder().begin_building(mode)
+        apply_kernels(mr=gmr, stream=gb, out=out)
+        graph = gb.end_building().complete()
+        graph.upload(stream)
+        graph.launch(stream)
+
+    stream.sync()
+
+    # Check the result on the host.
+    assert compare_buffer_to_constant(out, 2)
+
+
+@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+def test_graph_alloc_with_output(init_cuda, mode):
+    """Test for memory allocated in a graph being used outside the graph."""
+    NBYTES = 64
+    device = Device()
+    stream = device.create_stream()
+    gmr = GraphMemoryResource(device)
+
+    # Get kernels and define the calling sequence.
+    mod = _common_kernels_alloc()
+    add_one = mod.get_kernel("add_one")
+
+    # Make an input of 0s.
+    in_ = make_scratch_buffer(device, 0, NBYTES)
+
+    # Construct a graph to copy and increment the input. It returns a new
+    # buffer allocated within the graph. The auto_free_on_launch option
+    # is required to properly use the output buffer.
+    gb = device.create_graph_builder().begin_building(mode)
+    out = gmr.allocate(NBYTES, gb)
+    out.copy_from(in_, stream=gb)
+    launch(gb, LaunchConfig(grid=1, block=1), add_one, out, NBYTES)
+    options = GraphCompleteOptions(auto_free_on_launch=True)
+    graph = gb.end_building().complete(options)
+
+    # Launch the graph. The output buffer is allocated and set to one.
+    graph.upload(stream)
+    graph.launch(stream)
+    stream.sync()
+    assert compare_buffer_to_constant(out, 1)
+
+    # Update the input buffer and rerun the graph.
+    set_buffer(in_, 5)
+    graph.upload(stream)
+    graph.launch(stream)
+    stream.sync()
+    assert compare_buffer_to_constant(out, 6)
+
+
+@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+def test_graph_mem_set_attributes(init_cuda, mode):
+    device = Device()
+    stream = device.create_stream()
+    gmr = GraphMemoryResource(device)
+    mman = GraphMemoryTestManager(gmr, stream, mode)
+
+    # Make an allocation and observe usage.
+ buffer = mman.alloc(1, 1024) + assert gmr.attributes.reserved_mem_current > 0 + assert gmr.attributes.reserved_mem_high > 0 + assert gmr.attributes.used_mem_current > 0 + assert gmr.attributes.used_mem_high > 0 + + # Incorrect attribute usage. + with pytest.raises(AttributeError, match=r"property 'reserved_mem_current' .* has no setter"): + gmr.attributes.reserved_mem_current = 0 + + with pytest.raises(AttributeError, match=r"Attribute 'reserved_mem_high' may only be set to zero \(got 1\)\."): + gmr.attributes.reserved_mem_high = 1 + + with pytest.raises(AttributeError, match=r"property 'used_mem_current' .* has no setter"): + gmr.attributes.used_mem_current = 0 + + with pytest.raises(AttributeError, match=r"Attribute 'used_mem_high' may only be set to zero \(got 1\)\."): + gmr.attributes.used_mem_high = 1 + + # Free memory, but usage is not reduced yet. + mman.free(buffer) + assert gmr.attributes.reserved_mem_current > 0 + assert gmr.attributes.reserved_mem_high > 0 + assert gmr.attributes.used_mem_current > 0 + assert gmr.attributes.used_mem_high > 0 + + gmr.trim() + + # The high-water marks remain after free and trim. + assert gmr.attributes.reserved_mem_current == 0 + assert gmr.attributes.reserved_mem_high > 0 + assert gmr.attributes.used_mem_current == 0 + assert gmr.attributes.used_mem_high > 0 + + # Reset the high-water marks. + gmr.attributes.reserved_mem_high = 0 + gmr.attributes.used_mem_high = 0 + + assert gmr.attributes.reserved_mem_high == 0 + assert gmr.attributes.used_mem_high == 0 + + mman.reset() + + +@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) +def test_gmr_check_capture_state(init_cuda, mode): + """ + Test expected errors (and non-errors) using GraphMemoryResource with graph + capture. 
+ """ + device = Device() + stream = device.create_stream() + gmr = GraphMemoryResource(device) + + # Not capturing + with pytest.raises( + RuntimeError, + match=r"GraphMemoryResource cannot perform memory operations on a " + r"non-capturing stream\.", + ): + gmr.allocate(1, stream=stream) + + # Capturing + gb = device.create_graph_builder().begin_building(mode=mode) + gmr.allocate(1, stream=gb) # no error + gb.end_building().complete() + + +@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) +def test_dmr_check_capture_state(init_cuda, mode): + """ + Test expected errors (and non-errors) using DeviceMemoryResource with graph + capture. + """ + device = Device() + stream = device.create_stream() + dmr = DeviceMemoryResource(device) + + # Not capturing + dmr.allocate(1, stream=stream).close() # no error + + # Capturing + gb = device.create_graph_builder().begin_building(mode=mode) + with pytest.raises( + RuntimeError, + match=r"DeviceMemoryResource cannot perform memory operations on a capturing " + r"stream \(consider using GraphMemoryResource\)\.", + ): + dmr.allocate(1, stream=gb) + gb.end_building().complete() diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index a951fc418..f2fd64344 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -22,6 +22,7 @@ ) from cuda.core.experimental._memory import _SynchronousMemoryResource from cuda.core.experimental._utils.cuda_utils import CUDAError +from helpers.misc import StreamWrapper from conftest import skipif_need_cuda_headers @@ -179,9 +180,10 @@ def test_launch_invalid_values(init_cuda): ) +@pytest.mark.parametrize("wrap_stream", [True, False]) @pytest.mark.parametrize("python_type, cpp_type, init_value", PARAMS) @pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+") -def test_launch_scalar_argument(python_type, cpp_type, init_value): +def 
test_launch_scalar_argument(python_type, cpp_type, init_value, wrap_stream): dev = Device() dev.set_current() @@ -219,19 +221,25 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value): ker = mod.get_kernel(ker_name) # Launch with 1 thread + stream = dev.default_stream + if wrap_stream: + stream = StreamWrapper(stream) config = LaunchConfig(grid=1, block=1) - launch(dev.default_stream, config, ker, arr.ctypes.data, scalar) - dev.default_stream.sync() + launch(stream, config, ker, arr.ctypes.data, scalar) + stream.sync() # Check result assert arr[0] == init_value, f"Expected {init_value}, got {arr[0]}" @skipif_need_cuda_headers # cg -def test_cooperative_launch(): +@pytest.mark.parametrize("wrap_stream", [True, False]) +def test_cooperative_launch(wrap_stream): dev = Device() dev.set_current() s = dev.create_stream(options={"nonblocking": True}) + if wrap_stream: + s = StreamWrapper(s) # CUDA kernel templated on type T code = r""" @@ -272,6 +280,7 @@ def test_cooperative_launch(): @pytest.mark.skipif(cp is None, reason="cupy not installed") +@pytest.mark.parametrize("wrap_stream", [True, False]) @pytest.mark.parametrize( "memory_resource_class", [ @@ -285,11 +294,13 @@ def test_cooperative_launch(): ), ], ) -def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_resource_class): +def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_resource_class, wrap_stream): """Test that kernels can access memory allocated by memory resources.""" dev = Device() dev.set_current() stream = dev.create_stream() + if wrap_stream: + stream = StreamWrapper(stream) # tell CuPy to use our stream as the current stream: cp.cuda.ExternalStream(int(stream.handle)).use() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index a261ec7a3..5ca4e4121 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -13,6 +13,7 @@ np = None import ctypes import platform +import re import pytest 
from cuda.core.experimental import ( @@ -20,6 +21,7 @@ Device, DeviceMemoryResource, DeviceMemoryResourceOptions, + GraphMemoryResource, MemoryResource, VirtualMemoryResource, VirtualMemoryResourceOptions, @@ -29,6 +31,7 @@ from cuda.core.experimental._utils.cuda_utils import handle_return from cuda.core.experimental.utils import StridedMemoryView from helpers.buffers import DummyUnifiedMemoryResource +from helpers.misc import StreamWrapper from cuda_python_test_helpers import supports_ipc_mempool @@ -133,6 +136,7 @@ def test_package_contents(): "MemoryResource", "DeviceMemoryResource", "DeviceMemoryResourceOptions", + "GraphMemoryResource", "IPCBufferDescriptor", "IPCAllocationHandle", "LegacyPinnedMemoryResource", @@ -164,10 +168,12 @@ def test_buffer_initialization(): buffer_initialization(DummyPinnedMemoryResource(device)) -def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): +def buffer_copy_to(dummy_mr: MemoryResource, device: Device, wrap_stream, check=False): src_buffer = dummy_mr.allocate(size=1024) dst_buffer = dummy_mr.allocate(size=1024) stream = device.create_stream() + if wrap_stream: + stream = StreamWrapper(stream) if check: src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) @@ -187,18 +193,21 @@ def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): src_buffer.close() -def test_buffer_copy_to(): +@pytest.mark.parametrize("wrap_stream", [True, False]) +def test_buffer_copy_to(wrap_stream): device = Device() device.set_current() - buffer_copy_to(DummyDeviceMemoryResource(device), device) - buffer_copy_to(DummyUnifiedMemoryResource(device), device) - buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True) + buffer_copy_to(DummyDeviceMemoryResource(device), device, wrap_stream) + buffer_copy_to(DummyUnifiedMemoryResource(device), device, wrap_stream) + buffer_copy_to(DummyPinnedMemoryResource(device), device, wrap_stream, check=True) -def buffer_copy_from(dummy_mr: 
MemoryResource, device, check=False): +def buffer_copy_from(dummy_mr: MemoryResource, device, wrap_stream, check=False): src_buffer = dummy_mr.allocate(size=1024) dst_buffer = dummy_mr.allocate(size=1024) stream = device.create_stream() + if wrap_stream: + stream = StreamWrapper(stream) if check: src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) @@ -218,12 +227,13 @@ def buffer_copy_from(dummy_mr: MemoryResource, device, check=False): src_buffer.close() -def test_buffer_copy_from(): +@pytest.mark.parametrize("wrap_stream", [True, False]) +def test_buffer_copy_from(wrap_stream): device = Device() device.set_current() - buffer_copy_from(DummyDeviceMemoryResource(device), device) - buffer_copy_from(DummyUnifiedMemoryResource(device), device) - buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True) + buffer_copy_from(DummyDeviceMemoryResource(device), device, wrap_stream) + buffer_copy_from(DummyUnifiedMemoryResource(device), device, wrap_stream) + buffer_copy_from(DummyPinnedMemoryResource(device), device, wrap_stream, check=True) def buffer_close(dummy_mr: MemoryResource): @@ -286,13 +296,18 @@ def test_buffer_dunder_dlpack_device_failure(): @pytest.mark.parametrize("use_device_object", [True, False]) -def test_device_memory_resource_initialization(mempool_device, use_device_object): +def test_device_memory_resource_initialization(use_device_object): """Test that DeviceMemoryResource can be initialized successfully. This test verifies that the DeviceMemoryResource initializes properly, including the release threshold configuration for performance optimization. """ - device = mempool_device + device = Device() + + if not device.properties.memory_pools_supported: + pytest.skip("Device does not support mempool operations") + + device.set_current() # This should succeed and configure the memory pool release threshold. # The resource can be constructed from either a device or device ordinal. 
@@ -481,11 +496,16 @@ def test_vmm_allocator_rdma_unsupported_exception(): VirtualMemoryResource(device, config=options) -def test_mempool(mempool_device): - device = mempool_device +def test_device_memory_resource(): + device = Device() + + if not device.properties.memory_pools_supported: + pytest.skip("Device does not support mempool operations") + + device.set_current() # Test basic pool creation - options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE) mr = DeviceMemoryResource(device, options=options) assert mr.device_id == device.device_id assert mr.is_device_accessible @@ -513,18 +533,26 @@ def test_mempool(mempool_device): buffer = mr.allocate(1024, stream=stream) assert buffer.handle != 0 buffer.close() + buffer = mr.allocate(1024, stream=StreamWrapper(stream)) + assert buffer.handle != 0 + buffer.close() # Test memory copying between buffers from same pool src_buffer = mr.allocate(64) dst_buffer = mr.allocate(64) stream = device.create_stream() src_buffer.copy_to(dst_buffer, stream=stream) + src_buffer.copy_to(dst_buffer, stream=StreamWrapper(stream)) device.sync() dst_buffer.close() src_buffer.close() - # Test error cases - # Test IPC operations are disabled + +def test_mempool_ipc_errors(mempool_device): + """Test error cases when IPC operations are disabled.""" + device = mempool_device + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False) + mr = DeviceMemoryResource(device, options=options) buffer = mr.allocate(64) ipc_error_msg = "Memory resource is not IPC-enabled" @@ -599,6 +627,22 @@ def test_mempool_attributes(ipc_enabled, mempool_device, property_name, expected assert value >= current_value, f"{property_name} should be >= {current_prop}" +def test_mempool_attributes_repr(mempool_device): + device = Device() + device.set_current() + mr = DeviceMemoryResource(device, options={"max_size": 2048}) + buffer1 = mr.allocate(64) + buffer2 = 
mr.allocate(64) + buffer1.close() + assert re.match( + r"DeviceMemoryResourceAttributes\(release_threshold=\d+, reserved_mem_current=\d+, reserved_mem_high=\d+, " + r"reuse_allow_internal_dependencies=(True|False), reuse_allow_opportunistic=(True|False), " + r"reuse_follow_event_dependencies=(True|False), used_mem_current=64, used_mem_high=128\)", + str(mr.attributes), + ) + buffer2.close() + + def test_mempool_attributes_ownership(mempool_device): """Ensure the attributes bundle handles references correctly.""" device = mempool_device @@ -644,3 +688,14 @@ def test_strided_memory_view_refcnt(): assert av.strides[0] == 1 assert av.strides[1] == 64 assert sys.getrefcount(av.strides) >= 2 + + +def test_graph_memory_resource_object(init_cuda): + device = Device() + gmr1 = GraphMemoryResource(device) + gmr2 = GraphMemoryResource(device) + gmr3 = GraphMemoryResource(device.device_id) + + # These objects are interned. + assert gmr1 is gmr2 is gmr3 + assert gmr1 == gmr2 == gmr3 diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index c8165548f..2e10fb100 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -6,6 +6,7 @@ from cuda.core.experimental._event import Event from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM from cuda.core.experimental._utils.cuda_utils import driver +from helpers.misc import StreamWrapper def test_stream_init_disabled(): @@ -76,9 +77,12 @@ def test_stream_context(init_cuda): assert context._handle is not None -def test_stream_from_foreign_stream(init_cuda): +@pytest.mark.parametrize("wrap_stream", [True, False]) +def test_stream_from_foreign_stream(init_cuda, wrap_stream): device = Device() other_stream = device.create_stream(options=StreamOptions()) + if wrap_stream: + other_stream = StreamWrapper(other_stream) stream = device.create_stream(obj=other_stream) # Now that __eq__ is implemented (issue #664), we can compare directly assert other_stream == 
stream