From 35d7dd56c3551965d83115577da8d931daed8868 Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Wed, 10 Sep 2025 13:22:18 -0700
Subject: [PATCH 01/35] commit initial draft

---
 cuda_core/cuda/core/experimental/_memory.pyx | 146 +++++++++++++++++++
 1 file changed, 146 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 44e7a77c7..22e6b87ae 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -508,3 +508,149 @@ class _SynchronousMemoryResource(MemoryResource):
     @property
     def device_id(self) -> int:
         return self._dev_id
+
+@dataclass
+class VMMConfig:
+    """A configuration object for the VMMAllocatedMemoryResource.
+
+    Stores configuration information which tells the resource how to use the
+    CUDA VMM APIs.
+
+    Args:
+        handle_type: Export handle type for the physical allocation. Use
+            CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR on Linux if you plan to
+            import/export the allocation (required for cuMemRetainAllocationHandle).
+            Use CU_MEM_HANDLE_TYPE_NONE if you don't need an exportable handle.
+        gpu_direct_rdma: Hint that the allocation should be GDR-capable (if supported).
+        granularity: 'recommended' or 'minimum'. Controls the granularity query and size rounding.
+        addr_hint: An optional virtual address hint to try to reserve at. 0 -> let CUDA choose.
+        addr_align: Alignment for the VA reservation. If None, use the queried granularity.
+        peers: Extra device IDs that should be granted access in addition to `device`.
+        self_access: Access flags for the owning device ('rw', 'r', or 'none').
+        peer_access: Access flags for peers ('rw' or 'r').
+    """
+    handle_type: int  # driver.CUmemAllocationHandleType
+    gpu_direct_rdma: bool = True
+    granularity: str = "recommended"  # or "minimum"
+    addr_hint: int = 0
+    addr_align: Optional[int] = None
+    peers: Iterable[int] = field(default_factory=tuple)
+    self_access: str = "rw"  # 'rw' | 'r' | 'none'
+    peer_access: str = "rw"  # 'rw' | 'r'
+
+    def _granularity_flag(self, driver) -> int:
+        # Prefer the recommended granularity unless the user asked for the minimum
+        try:
+            flags = driver.CUmemAllocationGranularity_flags
+            return (flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM
+                    if self.granularity == "minimum"
+                    else flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)
+        except AttributeError:
+            # Fallback if enum names differ in your bindings
+            return 0
+
+    @staticmethod
+    def _access_to_flags(driver, spec: str) -> int:
+        f = driver.CUmemAccess_flags
+        if spec == "rw":
+            return f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
+        if spec == "r":
+            return f.CU_MEM_ACCESS_FLAGS_PROT_READ
+        if spec == "none":
+            return 0
+        raise ValueError(f"Unknown access spec: {spec!r}")
+
+
+class VMMAllocatedMemoryResource(MemoryResource):
+    """Create a device memory resource that uses the CUDA VMM APIs to allocate memory.
+
+    Parameters
+    ----------
+    device_id : int
+        Device ordinal for which a memory resource is constructed. The mempool that is
+        set to *current* on ``device_id`` is used. If no mempool is set to current yet,
+        the driver would use the *default* mempool on the device.
+
+    config : VMMConfig
+    """
+
+    __slots__ = ("_dev_id",)
+
+    def __init__(self, device_id: int):
+        err, self._handle = driver.cuDeviceGetMemPool(device_id)
+        raise_if_driver_error(err)
+        self._dev_id = device_id
+
+        # Set a higher release threshold to improve performance when there are no active allocations.
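+        # (The release threshold is the amount of reserved memory, in bytes, that
+        # the pool holds on to before trying to release memory back to the OS.)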
+        # By default, the release threshold is 0, which means memory is immediately released back
+        # to the OS when there are no active suballocations, causing performance issues.
+        # Check the current release threshold
+        err, current_threshold = driver.cuMemPoolGetAttribute(
+            self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD
+        )
+        raise_if_driver_error(err)
+        # If the threshold is 0 (the default), set it to the maximum to retain memory in the pool
+        if int(current_threshold) == 0:
+            err, = driver.cuMemPoolSetAttribute(
+                self._handle,
+                driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+                driver.cuuint64_t(0xFFFFFFFFFFFFFFFF),
+            )
+            raise_if_driver_error(err)
+
+    def allocate(self, size_t size, stream: Stream = None) -> Buffer:
+        """Allocate a buffer of the requested size.
+
+        Parameters
+        ----------
+        size : int
+            The size of the buffer to allocate, in bytes.
+        stream : Stream, optional
+            The stream on which to perform the allocation asynchronously.
+            If None, an internal stream is used.
+
+        Returns
+        -------
+        Buffer
+            The allocated buffer object, which is accessible on the device that this memory
+            resource was created for.
+        """
+        if stream is None:
+            stream = default_stream()
+        err, ptr = driver.cuMemAllocFromPoolAsync(size, self._handle, stream.handle)
+        raise_if_driver_error(err)
+        return Buffer._init(ptr, size, self)
+
+    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None):
+        """Deallocate a buffer previously allocated by this resource.
+
+        Parameters
+        ----------
+        ptr : :obj:`~_memory.DevicePointerT`
+            The pointer or handle to the buffer to deallocate.
+        size : int
+            The size of the buffer to deallocate, in bytes.
+        stream : Stream, optional
+            The stream on which to perform the deallocation asynchronously.
+            If None, an internal stream is used.
+        """
+        if stream is None:
+            stream = default_stream()
+        err, = driver.cuMemFreeAsync(ptr, stream.handle)
+        raise_if_driver_error(err)
+
+    @property
+    def is_device_accessible(self) -> bool:
+        """bool: this memory resource provides device-accessible buffers."""
+        return True
+
+    @property
+    def is_host_accessible(self) -> bool:
+        """bool: this memory resource does not provide host-accessible buffers."""
+        return False
+
+    @property
+    def device_id(self) -> int:
+        """int: the associated device ordinal."""
+        return self._dev_id

From 1de97e2897f1a2f79dd84a87742ac992a6c0021b Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Fri, 12 Sep 2025 10:18:39 -0700
Subject: [PATCH 02/35] add modification/growing option

---
 cuda_core/cuda/core/experimental/_memory.pyx | 420 +++++++++++++++----
 1 file changed, 348 insertions(+), 72 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 22e6b87ae..70880a69b 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -530,26 +530,18 @@ class VMMConfig:
         self_access: Access flags for the owning device ('rw', 'r', or 'none').
         peer_access: Access flags for peers ('rw' or 'r').
     """
-    handle_type: int  # driver.CUmemAllocationHandleType
+    # TODO: for enums, do we re-expose them as cuda-core Enums or leave them as driver enums?
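+    # These fields take the raw enums from cuda.bindings.driver, e.g.
+    # driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED for allocation_type.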
+ allocation_type: driver.CUmemAllocationType + location_type: driver.CUmemLocationType # Only supports CU_MEM_LOCATION_TYPE_DEVICE + handle_type: driver.CUmemAllocationHandleType gpu_direct_rdma: bool = True - granularity: str = "recommended" # or "minimum" - addr_hint: int = 0 + granularity: driver.CUmemAllocationGranularity_flags + addr_hint: Optional[int] = 0 addr_align: Optional[int] = None peers: Iterable[int] = field(default_factory=tuple) self_access: str = "rw" # 'rw' | 'r' | 'none' peer_access: str = "rw" # 'rw' | 'r' - def _granularity_flag(self, driver) -> int: - # Prefer recommended granularity unless user asked for minimum - try: - flags = driver.CUmemAllocationGranularity_flags - return (flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM - if self.granularity == "minimum" - else flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED) - except AttributeError: - # Fallback if enum names differ in your bindings - return 0 - @staticmethod def _access_to_flags(driver, spec: str) -> int: f = driver.CUmemAccess_flags @@ -573,84 +565,368 @@ class VMMAllocatedMemoryResource(MemoryResource): the driver would use the *default* mempool on the device. config : VMMConfig + A configuration object for the VMMAllocatedMemoryResource """ - - __slots__ = ("_dev_id",) - - def __init__(self, device_id: int): - err, self._handle = driver.cuDeviceGetMemPool(device_id) - raise_if_driver_error(err) - self._dev_id = device_id - - # Set a higher release threshold to improve performance when there are no active allocations. - # By default, the release threshold is 0, which means memory is immediately released back - # to the OS when there are no active suballocations, causing performance issues. - # Check current release threshold - err, current_threshold = driver.cuMemPoolGetAttribute( - self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD - ) - raise_if_driver_error(err) - # If threshold is 0 (default), set it to maximum to retain memory in the pool - if int(current_threshold) == 0: - err, = driver.cuMemPoolSetAttribute( - self._handle, - driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, - driver.cuuint64_t(0xFFFFFFFFFFFFFFFF), + def __init__(self, device, config: VMMConfig = None): + self.device = device + if config is None: + config = VMMConfig( + allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, + location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + gpu_direct_rdma=True, + granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, + addr_hint=0, + addr_align=None, + peers=(), + self_access="rw", + peer_access="rw", ) - raise_if_driver_error(err) + self.config = config - def allocate(self, size_t size, stream: Stream = None) -> Buffer: - """Allocate a buffer of the requested size. + def _align_up(self, size: int, gran: int) -> int: + """ + Align a size up to the nearest multiple of a granularity. + """ + return (size + gran - 1) & ~(gran - 1) + def modify_allocation(self, buf: Buffer, new_size: int, config: VMMConfig = None) -> Buffer: + """ + Grow an existing allocation using CUDA VMM, with a configurable policy. + + This implements true growing allocations that preserve the base pointer + by extending the virtual address range and mapping additional physical memory. + Parameters ---------- - size : int - The size of the buffer to allocate, in bytes. 
- stream : Stream, optional - The stream on which to perform the allocation asynchronously. - If None, an internal stream is used. - + buf : Buffer + The existing buffer to grow + new_size : int + The new total size for the allocation + config : VMMConfig, optional + Configuration for the new physical memory chunks. If None, uses current config. + Returns ------- Buffer - The allocated buffer object, which is accessible on the device that this memory - resource was created for. + The same buffer with updated size, preserving the original pointer """ - if stream is None: - stream = default_stream() - err, ptr = driver.cuMemAllocFromPoolAsync(size, self._handle, stream.handle) - raise_if_driver_error(err) - return Buffer._init(ptr, size, self) - - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): - """Deallocate a buffer previously allocated by this resource. + if new_size <= buf.size: + # No growth needed, return original buffer + return buf + + if config is not None: + self.config = config + + # Build allocation properties for new chunks + prop = driver.CUmemAllocationProp() + prop.type = self.config.allocation_type + prop.location.type = self.config.location_type + prop.location.id = self.device.device_id + prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 + prop.requestedHandleTypes = self.config.handle_type + + # Query granularity + gran_flag = self.config.granularity + res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) + if res != driver.CUresult.CUDA_SUCCESS: + raise Exception(f"cuMemGetAllocationGranularity failed: {res}") + + # Calculate sizes + additional_size = new_size - buf.size + aligned_additional_size = self._align_up(additional_size, gran) + total_aligned_size = self._align_up(new_size, gran) + addr_align = self.config.addr_align or gran + + # Try to extend the existing VA range first + res, new_ptr = driver.cuMemAddressReserve( + aligned_additional_size, + addr_align, + buf.ptr + buf.size, # fixedAddr hint - try to extend at end of current range + 0 + ) + + if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (buf.ptr + buf.size): + # Fallback: couldn't extend contiguously, need full remapping + return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) + else: + # Success! We can extend the VA range contiguously + return self._grow_allocation_fast_path(buf, new_size, prop, aligned_additional_size, new_ptr) - Parameters - ---------- - ptr : :obj:`~_memory.DevicePointerT` - The pointer or handle to the buffer to deallocate. - size : int - The size of the buffer to deallocate, in bytes. - stream : Stream, optional - The stream on which to perform the deallocation asynchronously. - If None, an internal stream is used. + def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, + aligned_additional_size: int, new_ptr: int) -> Buffer: """ - if stream is None: - stream = default_stream() - err, = driver.cuMemFreeAsync(ptr, stream.handle) - raise_if_driver_error(err) + Fast path: extend the VA range contiguously. + + This preserves the original pointer by mapping new physical memory + to the extended portion of the virtual address range. 
+ """ + # Create new physical memory for the additional size + res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(new_ptr, aligned_additional_size) + raise Exception(f"cuMemCreate failed: {res}") + + # Map the new physical memory to the extended VA range + res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(new_ptr, aligned_additional_size) + driver.cuMemRelease(new_handle) + raise Exception(f"cuMemMap failed: {res}") + + # Set access permissions for the new portion + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemUnmap(new_ptr, aligned_additional_size) + driver.cuMemAddressFree(new_ptr, aligned_additional_size) + driver.cuMemRelease(new_handle) + raise Exception(f"cuMemSetAccess failed: {res}") + + # Update the buffer size (pointer stays the same!) + buf._size = new_size + + return buf + + def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, + aligned_additional_size: int, total_aligned_size: int, addr_align: int) -> Buffer: + """ + Slow path: full remapping when contiguous extension fails. + + This creates a new VA range and remaps both old and new physical memory. + The buffer's pointer will change. + """ + # Reserve a completely new, larger VA range + res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) + if res != driver.CUresult.CUDA_SUCCESS: + raise Exception(f"cuMemAddressReserve failed: {res}") + + # Get the old allocation handle for remapping + result, old_handle = driver.cuMemRetainAllocationHandle(buf.ptr) + if result != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(new_ptr, total_aligned_size) + raise Exception(f"Failed to retain old allocation handle: {result}") + + # Unmap the old VA range + result, = driver.cuMemUnmap(buf.ptr, buf.size) + if result != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(new_ptr, total_aligned_size) + driver.cuMemRelease(old_handle) + raise Exception(f"Failed to unmap old allocation: {result}") + + # Remap the old physical memory to the new VA range + res, = driver.cuMemMap(new_ptr, buf.size, 0, old_handle, 0) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(new_ptr, total_aligned_size) + driver.cuMemRelease(old_handle) + raise Exception(f"cuMemMap failed for old memory: {res}") + + # Create new physical memory for the additional size + res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemUnmap(new_ptr, total_aligned_size) + driver.cuMemAddressFree(new_ptr, total_aligned_size) + driver.cuMemRelease(old_handle) + raise Exception(f"cuMemCreate failed for new memory: {res}") + + # Map the new physical memory to the extended portion + res, = driver.cuMemMap(new_ptr + buf.size, aligned_additional_size, 0, new_handle, 0) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemUnmap(new_ptr, total_aligned_size) + driver.cuMemAddressFree(new_ptr, total_aligned_size) + driver.cuMemRelease(old_handle) + driver.cuMemRelease(new_handle) + raise Exception(f"cuMemMap failed for new memory: {res}") + + # Set access permissions for the entire new range + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(new_ptr, 
total_aligned_size, descs, len(descs)) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemUnmap(new_ptr, total_aligned_size) + driver.cuMemAddressFree(new_ptr, total_aligned_size) + driver.cuMemRelease(old_handle) + driver.cuMemRelease(new_handle) + raise Exception(f"cuMemSetAccess failed: {res}") + + # Free the old VA range + driver.cuMemAddressFree(buf.ptr, buf.size) + + # Update the buffer with new pointer and size + buf._ptr = new_ptr + buf._size = total_aligned_size + buf._ptr_obj = new_ptr + + return buf + + def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: + """ + Build access descriptors for memory access permissions. + + Returns + ------- + list + List of CUmemAccessDesc objects for setting memory access + """ + descs = [] + + # Owner access + owner_flags = VMMConfig._access_to_flags(driver, self.config.self_access) + if owner_flags: + d = driver.CUmemAccessDesc() + d.location.type = prop.location.type + d.location.id = prop.location.id + d.flags = owner_flags + descs.append(d) + + # Peer device access + peer_flags = VMMConfig._access_to_flags(driver, self.config.peer_access) + for peer_dev in self.config.peers: + if peer_flags: + d = driver.CUmemAccessDesc() + d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + d.location.id = int(peer_dev) + d.flags = peer_flags + descs.append(d) + + return descs + + + def allocate(self, size: int, stream: Stream = None) -> Buffer: + """ + Allocate memory using CUDA VMM with a configurable policy. + """ + config = self.config + # ---- Build allocation properties ---- + prop = driver.CUmemAllocationProp() + prop.type = config.allocation_type + # TODO: Support host alloation if required + if config.location_type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: + raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") + prop.location.type = config.location_type + prop.location.id = self.device.device_id + prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 + prop.requestedHandleTypes = config.handle_type + + # ---- Query and apply granularity ---- + # Choose min vs recommended granularity per config + gran_flag = config.granularity + res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) + if res != driver.CUresult.CUDA_SUCCESS: + raise Exception(f"cuMemGetAllocationGranularity failed: {res}") + + aligned_size = self._align_up(size, gran) + addr_align = config.addr_align or gran + + # ---- Create physical memory ---- + res, handle = driver.cuMemCreate(aligned_size, prop, 0) + if res != driver.CUresult.CUDA_SUCCESS: + raise Exception(f"cuMemCreate failed: {res}") + + # ---- Reserve VA space ---- + # Potentially, use a separate size for the VA reservation from the physical allocation size + res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) + if res != driver.CUresult.CUDA_SUCCESS: + # tidy up physical handle on failure + driver.cuMemRelease(handle) + raise Exception(f"cuMemAddressReserve failed: {res}") + + # ---- Map physical memory into VA ---- + res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(ptr, aligned_size) + driver.cuMemRelease(handle) + raise Exception(f"cuMemMap failed: {res}") + + # ---- Set access for owner + peers ---- + descs = [] + + # Owner access + owner_flags = VMMAllocationConfig._access_to_flags(driver, config.self_access) + if owner_flags: + d = driver.CUmemAccessDesc() + 
d.location.type = prop.location.type
+            d.location.id = prop.location.id
+            d.flags = owner_flags
+            descs.append(d)
+
+        # Peer device access
+        peer_flags = VMMAllocationConfig._access_to_flags(driver, config.peer_access)
+        for peer_dev in config.peers:
+            if peer_flags:
+                d = driver.CUmemAccessDesc()
+                d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+                d.location.id = int(peer_dev)
+                d.flags = peer_flags
+                descs.append(d)
+
+        if descs:
+            res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs))
+            if res != driver.CUresult.CUDA_SUCCESS:
+                # Try to unwind on failure
+                driver.cuMemUnmap(ptr, aligned_size)
+                driver.cuMemAddressFree(ptr, aligned_size)
+                driver.cuMemRelease(handle)
+                raise Exception(f"cuMemSetAccess failed: {res}")
+
+        # Done — return a Buffer that tracks this VA range
+        buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self)
+        return buf
+
+    def deallocate(self, ptr: int, size: int, stream: Stream = None) -> None:
+        """
+        Deallocate memory on the device using CUDA VMM APIs.
+        """
+        result, handle = driver.cuMemRetainAllocationHandle(ptr)
+        if result != driver.CUresult.CUDA_SUCCESS:
+            raise Exception(f"Failed to retain allocation handle: {result}")
+        result, = driver.cuMemUnmap(ptr, size)
+        if result != driver.CUresult.CUDA_SUCCESS:
+            raise Exception(f"Failed to unmap physical allocation: {result}")
+        result, = driver.cuMemAddressFree(ptr, size)
+        if result != driver.CUresult.CUDA_SUCCESS:
+            raise Exception(f"Failed to free address: {result}")
+        result, = driver.cuMemRelease(handle)
+        if result != driver.CUresult.CUDA_SUCCESS:
+            raise Exception(f"Failed to release physical allocation: {result}")
+
     @property
     def is_device_accessible(self) -> bool:
-        """bool: this memory resource provides device-accessible buffers."""
+        """
+        Indicates whether the allocated memory is accessible from the device.
+
+        Returns:
+            bool: Always True for VMM-allocated memory.
+        """
         return True
 
     @property
     def is_host_accessible(self) -> bool:
-        """bool: this memory resource does not provide host-accessible buffers."""
+        """
+        Indicates whether the allocated memory is accessible from the host.
+
+        Returns:
+            bool: Always False for VMM-allocated memory.
+        """
         return False
 
     @property
     def device_id(self) -> int:
-        """int: the associated device ordinal."""
-        return self._dev_id
+        """
+        Get the device ID associated with this memory resource.
+
+        Returns:
+            int: CUDA device ID.
+        """
+        return self.device.device_id
+
+    def __repr__(self) -> str:
+        """
+        Return a string representation of the VMMAllocatedMemoryResource.
+
+        Returns:
+            str: A string describing the object
+        """
+        return f"<VMMAllocatedMemoryResource(device={self.device})>"

From e7fd8d086efbd952a6ede09378f2e628ebfb64f8 Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Fri, 12 Sep 2025 10:22:02 -0700
Subject: [PATCH 03/35] add tests

---
 cuda_core/tests/test_memory.py | 126 +++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 491521ff9..bc9a8386a 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -283,3 +283,129 @@ def test_device_memory_resource_initialization():
     assert buffer.size == 1024
     assert buffer.device_id == device.device_id
     buffer.close()
+
+    def test_vmm_allocator_basic_allocation():
+        """Test basic VMM allocation functionality.
+
+        This test verifies that VMMAllocatedMemoryResource can allocate memory
+        using CUDA VMM APIs with default configuration.
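+
+        A minimal usage sketch (assumes a current CUDA device, names as in
+        this module):
+
+            vmm_mr = VMMAllocatedMemoryResource(device)
+            buf = vmm_mr.allocate(4096)  # may be rounded up to the granularity
+            buf.close()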
+ """ + device = Device() + device.set_current() + + # Create VMM allocator with default config + vmm_mr = VMMAllocatedMemoryResource(device) + + # Test basic allocation + buffer = vmm_mr.allocate(4096) + assert buffer.size >= 4096 # May be aligned up + assert buffer.device_id == device.device_id + assert buffer.memory_resource == vmm_mr + + # Test deallocation + buffer.close() + + # Test multiple allocations + buffers = [] + for i in range(5): + buf = vmm_mr.allocate(1024 * (i + 1)) + buffers.append(buf) + assert buf.size >= 1024 * (i + 1) + + # Clean up + for buf in buffers: + buf.close() + + def test_vmm_allocator_policy_configuration(): + """Test VMM allocator with different policy configurations. + + This test verifies that VMMAllocatedMemoryResource can be configured + with different allocation policies and that the configuration affects + the allocation behavior. + """ + device = Device() + device.set_current() + + # Test with custom VMM config + custom_config = VMMConfig( + allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, + location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + granularity=driver.CUmemAllocationGranularity.CU_MEM_ALLOC_GRANULARITY_MINIMUM, + gpu_direct_rdma=True, + handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC, + peers=(), + self_access="rw", + peer_access="rw", + ) + + vmm_mr = VMMAllocatedMemoryResource(device, config=custom_config) + + # Verify configuration is applied + assert vmm_mr.config == custom_config + assert vmm_mr.config.gpu_direct_rdma is True + assert vmm_mr.config.granularity == driver.CUmemAllocationGranularity.CU_MEM_ALLOC_GRANULARITY_MINIMUM + + # Test allocation with custom config + buffer = vmm_mr.allocate(8192) + assert buffer.size >= 8192 + assert buffer.device_id == device.device_id + + # Test policy modification + new_config = VMMConfig( + allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, + location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + granularity=driver.CUmemAllocationGranularity.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, + gpu_direct_rdma=False, + handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC, + peers=(), + self_access="r", # Read-only access + peer_access="r", + ) + + # Modify allocation policy + modified_buffer = vmm_mr.modify_allocation(buffer, 16384, config=new_config) + assert modified_buffer.size >= 16384 + assert vmm_mr.config == new_config + assert vmm_mr.config.self_access == "r" + + # Clean up + modified_buffer.close() + + def test_vmm_allocator_grow_allocation(): + """Test VMM allocator's ability to grow existing allocations. + + This test verifies that VMMAllocatedMemoryResource can grow existing + allocations while preserving the base pointer when possible. 
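+
+        The growth pattern exercised here, as a sketch:
+
+            buf = vmm_mr.allocate(4096)
+            buf = vmm_mr.modify_allocation(buf, 8192)
+            # the fast path keeps the original pointer; the slow path may remap
+            # the memory to a new virtual address range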
+ """ + device = Device() + device.set_current() + + vmm_mr = VMMAllocatedMemoryResource(device) + + # Create initial allocation + buffer = vmm_mr.allocate(4096) + original_ptr = buffer.handle + original_size = buffer.size + + # Grow the allocation + grown_buffer = vmm_mr.modify_allocation(buffer, 8192) + + # Verify growth + assert grown_buffer.size >= 8192 + assert grown_buffer.size > original_size + + # The pointer should ideally be preserved (fast path) + # but may change if contiguous extension fails (slow path) + assert grown_buffer.handle is not None + + # Test growing to same size (should return original buffer) + same_buffer = vmm_mr.modify_allocation(grown_buffer, 8192) + assert same_buffer is grown_buffer + + # Test growing to smaller size (should return original buffer) + smaller_buffer = vmm_mr.modify_allocation(grown_buffer, 4096) + assert smaller_buffer is grown_buffer + + # Clean up + grown_buffer.close() + From c941700438191acd689305fdf2f8c0d51465ff13 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 12 Sep 2025 15:17:20 -0700 Subject: [PATCH 04/35] Add tests and make them pass --- cuda_core/cuda/core/experimental/__init__.py | 2 +- cuda_core/cuda/core/experimental/_memory.pyx | 85 +++---- cuda_core/tests/test_memory.py | 250 +++++++++---------- 3 files changed, 165 insertions(+), 172 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index fffb80a5c..af06f4393 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -14,7 +14,7 @@ from cuda.core.experimental._launch_config import LaunchConfig from cuda.core.experimental._launcher import launch from cuda.core.experimental._linker import Linker, LinkerOptions -from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource +from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource, VMMAllocatedMemoryResource, VMMConfig from cuda.core.experimental._module import Kernel, ObjectCode from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._stream import Stream, StreamOptions diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 70880a69b..0f4bd0efd 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -11,7 +11,8 @@ from cuda.core.experimental._utils.cuda_utils cimport ( ) import abc -from typing import TypeVar, Union +from typing import TypeVar, Union, Optional, Iterable +from dataclasses import dataclass, field from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream @@ -531,11 +532,11 @@ class VMMConfig: peer_access: Access flags for peers ('rw' or 'r'). """ # TODO: for enums, do we re-expose them as cuda-core Enums or leave them as driver enums? 
- allocation_type: driver.CUmemAllocationType - location_type: driver.CUmemLocationType # Only supports CU_MEM_LOCATION_TYPE_DEVICE - handle_type: driver.CUmemAllocationHandleType + allocation_type: driver.CUmemAllocationType = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED + location_type: driver.CUmemLocationType = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + handle_type: driver.CUmemAllocationHandleType = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR + granularity: driver.CUmemAllocationGranularity_flags = driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED gpu_direct_rdma: bool = True - granularity: driver.CUmemAllocationGranularity_flags addr_hint: Optional[int] = 0 addr_align: Optional[int] = None peers: Iterable[int] = field(default_factory=tuple) @@ -543,7 +544,7 @@ class VMMConfig: peer_access: str = "rw" # 'rw' | 'r' @staticmethod - def _access_to_flags(driver, spec: str) -> int: + def _access_to_flags(driver, spec: str): f = driver.CUmemAccess_flags if spec == "rw": return f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE @@ -569,20 +570,7 @@ class VMMAllocatedMemoryResource(MemoryResource): """ def __init__(self, device, config: VMMConfig = None): self.device = device - if config is None: - config = VMMConfig( - allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, - location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, - handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, - gpu_direct_rdma=True, - granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, - addr_hint=0, - addr_align=None, - peers=(), - self_access="rw", - peer_access="rw", - ) - self.config = config + self.config = config or VMMConfig() def _align_up(self, size: int, gran: int) -> int: """ @@ -610,14 +598,14 @@ class VMMAllocatedMemoryResource(MemoryResource): ------- Buffer The same buffer with updated size, preserving the original pointer - """ + """ + if config is not None: + self.config = config + if new_size <= buf.size: # No growth needed, return original buffer return buf - if config is not None: - self.config = config - # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() prop.type = self.config.allocation_type @@ -636,17 +624,18 @@ class VMMAllocatedMemoryResource(MemoryResource): additional_size = new_size - buf.size aligned_additional_size = self._align_up(additional_size, gran) total_aligned_size = self._align_up(new_size, gran) + aligned_prev_size = total_aligned_size - aligned_additional_size addr_align = self.config.addr_align or gran # Try to extend the existing VA range first res, new_ptr = driver.cuMemAddressReserve( - aligned_additional_size, - addr_align, - buf.ptr + buf.size, # fixedAddr hint - try to extend at end of current range + aligned_additional_size, + addr_align, + int(buf.handle) + aligned_prev_size, # fixedAddr hint - aligned end of current range 0 ) - if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (buf.ptr + buf.size): + if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): # Fallback: couldn't extend contiguously, need full remapping return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) else: @@ -703,20 +692,21 @@ class VMMAllocatedMemoryResource(MemoryResource): raise Exception(f"cuMemAddressReserve failed: {res}") # Get the old allocation handle for remapping - result, old_handle = 
driver.cuMemRetainAllocationHandle(buf.ptr) + result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) if result != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, total_aligned_size) raise Exception(f"Failed to retain old allocation handle: {result}") - # Unmap the old VA range - result, = driver.cuMemUnmap(buf.ptr, buf.size) + # Unmap the old VA range (aligned previous size) + aligned_prev_size = total_aligned_size - aligned_additional_size + result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) if result != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, total_aligned_size) driver.cuMemRelease(old_handle) raise Exception(f"Failed to unmap old allocation: {result}") - # Remap the old physical memory to the new VA range - res, = driver.cuMemMap(new_ptr, buf.size, 0, old_handle, 0) + # Remap the old physical memory to the new VA range (aligned previous size) + res, = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) if res != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, total_aligned_size) driver.cuMemRelease(old_handle) @@ -730,8 +720,8 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemRelease(old_handle) raise Exception(f"cuMemCreate failed for new memory: {res}") - # Map the new physical memory to the extended portion - res, = driver.cuMemMap(new_ptr + buf.size, aligned_additional_size, 0, new_handle, 0) + # Map the new physical memory to the extended portion (aligned offset) + res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) if res != driver.CUresult.CUDA_SUCCESS: driver.cuMemUnmap(new_ptr, total_aligned_size) driver.cuMemAddressFree(new_ptr, total_aligned_size) @@ -750,15 +740,18 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemRelease(new_handle) raise Exception(f"cuMemSetAccess failed: {res}") - # Free the old VA range - driver.cuMemAddressFree(buf.ptr, buf.size) - - # Update the buffer with new pointer and size - buf._ptr = new_ptr - buf._size = total_aligned_size - buf._ptr_obj = new_ptr - - return buf + # Free the old VA range (aligned previous size) + driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + + # Invalidate the old buffer so its destructor won't try to free again + buf._ptr = 0 + buf._ptr_obj = None + buf._size = 0 + buf._mr = None + + # Return a new Buffer for the new mapping + return Buffer.from_handle(ptr=new_ptr, size=new_size, mr=self) + def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: """ @@ -843,7 +836,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs = [] # Owner access - owner_flags = VMMAllocationConfig._access_to_flags(driver, config.self_access) + owner_flags = VMMConfig._access_to_flags(driver, config.self_access) if owner_flags: d = driver.CUmemAccessDesc() d.location.type = prop.location.type @@ -852,7 +845,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs.append(d) # Peer device access - peer_flags = VMMAllocationConfig._access_to_flags(driver, config.peer_access) + peer_flags = VMMConfig._access_to_flags(driver, config.peer_access) for peer_dev in config.peers: if peer_flags: d = driver.CUmemAccessDesc() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index bc9a8386a..0ceef8f27 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -10,7 +10,7 @@ import pytest -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource +from 
cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource, VMMAllocatedMemoryResource, VMMConfig from cuda.core.experimental._memory import DLDeviceType from cuda.core.experimental._utils.cuda_utils import handle_return @@ -284,128 +284,128 @@ def test_device_memory_resource_initialization(): assert buffer.device_id == device.device_id buffer.close() - def test_vmm_allocator_basic_allocation(): - """Test basic VMM allocation functionality. - - This test verifies that VMMAllocatedMemoryResource can allocate memory - using CUDA VMM APIs with default configuration. - """ - device = Device() - device.set_current() - - # Create VMM allocator with default config - vmm_mr = VMMAllocatedMemoryResource(device) - - # Test basic allocation - buffer = vmm_mr.allocate(4096) - assert buffer.size >= 4096 # May be aligned up - assert buffer.device_id == device.device_id - assert buffer.memory_resource == vmm_mr - - # Test deallocation - buffer.close() - - # Test multiple allocations - buffers = [] - for i in range(5): - buf = vmm_mr.allocate(1024 * (i + 1)) - buffers.append(buf) - assert buf.size >= 1024 * (i + 1) - - # Clean up - for buf in buffers: - buf.close() - - def test_vmm_allocator_policy_configuration(): - """Test VMM allocator with different policy configurations. - - This test verifies that VMMAllocatedMemoryResource can be configured - with different allocation policies and that the configuration affects - the allocation behavior. - """ - device = Device() - device.set_current() - - # Test with custom VMM config - custom_config = VMMConfig( - allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, - location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, - granularity=driver.CUmemAllocationGranularity.CU_MEM_ALLOC_GRANULARITY_MINIMUM, - gpu_direct_rdma=True, - handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC, - peers=(), - self_access="rw", - peer_access="rw", - ) - - vmm_mr = VMMAllocatedMemoryResource(device, config=custom_config) - - # Verify configuration is applied - assert vmm_mr.config == custom_config - assert vmm_mr.config.gpu_direct_rdma is True - assert vmm_mr.config.granularity == driver.CUmemAllocationGranularity.CU_MEM_ALLOC_GRANULARITY_MINIMUM - - # Test allocation with custom config - buffer = vmm_mr.allocate(8192) - assert buffer.size >= 8192 - assert buffer.device_id == device.device_id - - # Test policy modification - new_config = VMMConfig( - allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, - location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, - granularity=driver.CUmemAllocationGranularity.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, - gpu_direct_rdma=False, - handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC, - peers=(), - self_access="r", # Read-only access - peer_access="r", - ) - - # Modify allocation policy - modified_buffer = vmm_mr.modify_allocation(buffer, 16384, config=new_config) - assert modified_buffer.size >= 16384 - assert vmm_mr.config == new_config - assert vmm_mr.config.self_access == "r" - - # Clean up - modified_buffer.close() - - def test_vmm_allocator_grow_allocation(): - """Test VMM allocator's ability to grow existing allocations. - - This test verifies that VMMAllocatedMemoryResource can grow existing - allocations while preserving the base pointer when possible. 
- """ - device = Device() - device.set_current() - - vmm_mr = VMMAllocatedMemoryResource(device) - - # Create initial allocation - buffer = vmm_mr.allocate(4096) - original_ptr = buffer.handle - original_size = buffer.size - - # Grow the allocation - grown_buffer = vmm_mr.modify_allocation(buffer, 8192) - - # Verify growth - assert grown_buffer.size >= 8192 - assert grown_buffer.size > original_size - - # The pointer should ideally be preserved (fast path) - # but may change if contiguous extension fails (slow path) - assert grown_buffer.handle is not None - - # Test growing to same size (should return original buffer) - same_buffer = vmm_mr.modify_allocation(grown_buffer, 8192) - assert same_buffer is grown_buffer - - # Test growing to smaller size (should return original buffer) - smaller_buffer = vmm_mr.modify_allocation(grown_buffer, 4096) - assert smaller_buffer is grown_buffer - - # Clean up - grown_buffer.close() +def test_vmm_allocator_basic_allocation(): + """Test basic VMM allocation functionality. + + This test verifies that VMMAllocatedMemoryResource can allocate memory + using CUDA VMM APIs with default configuration. + """ + device = Device() + device.set_current() + + # Create VMM allocator with default config + vmm_mr = VMMAllocatedMemoryResource(device) + + # Test basic allocation + buffer = vmm_mr.allocate(4096) + assert buffer.size >= 4096 # May be aligned up + assert buffer.device_id == device.device_id + assert buffer.memory_resource == vmm_mr + + # Test deallocation + buffer.close() + + # Test multiple allocations + buffers = [] + for i in range(5): + buf = vmm_mr.allocate(1024 * (i + 1)) + buffers.append(buf) + assert buf.size >= 1024 * (i + 1) + + # Clean up + for buf in buffers: + buf.close() + +def test_vmm_allocator_policy_configuration(): + """Test VMM allocator with different policy configurations. + + This test verifies that VMMAllocatedMemoryResource can be configured + with different allocation policies and that the configuration affects + the allocation behavior. 
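+
+    Configuration sketch (field values are illustrative; the enums come from
+    cuda.bindings.driver):
+
+        cfg = VMMConfig(
+            granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM,
+        )
+        vmm_mr = VMMAllocatedMemoryResource(device, config=cfg)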
+ """ + device = Device() + device.set_current() + + # Test with custom VMM config + custom_config = VMMConfig( + allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, + location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM, + gpu_direct_rdma=True, + handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + peers=(), + self_access="rw", + peer_access="rw", + ) + + vmm_mr = VMMAllocatedMemoryResource(device, config=custom_config) + + # Verify configuration is applied + assert vmm_mr.config == custom_config + assert vmm_mr.config.gpu_direct_rdma is True + assert vmm_mr.config.granularity == driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM + + # Test allocation with custom config + buffer = vmm_mr.allocate(8192) + assert buffer.size >= 8192 + assert buffer.device_id == device.device_id + + # Test policy modification + new_config = VMMConfig( + allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, + location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, + gpu_direct_rdma=False, + handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + peers=(), + self_access="r", # Read-only access + peer_access="r", + ) + + # Modify allocation policy + modified_buffer = vmm_mr.modify_allocation(buffer, 16384, config=new_config) + assert modified_buffer.size >= 16384 + assert vmm_mr.config == new_config + assert vmm_mr.config.self_access == "r" + + # Clean up + modified_buffer.close() + +def test_vmm_allocator_grow_allocation(): + """Test VMM allocator's ability to grow existing allocations. + + This test verifies that VMMAllocatedMemoryResource can grow existing + allocations while preserving the base pointer when possible. 
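+
+    Expected behavior, as a sketch:
+
+        grown = vmm_mr.modify_allocation(buf, new_size)  # grows when new_size > buf.size
+        same = vmm_mr.modify_allocation(grown, grown.size)  # shrink/no-op returns the buffer as-is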
+ """ + device = Device() + device.set_current() + + vmm_mr = VMMAllocatedMemoryResource(device) + + # Create initial allocation + buffer = vmm_mr.allocate(4096) + original_ptr = buffer.handle + original_size = buffer.size + + # Grow the allocation + grown_buffer = vmm_mr.modify_allocation(buffer, 8192) + + # Verify growth + assert grown_buffer.size >= 8192 + assert grown_buffer.size > original_size + + # The pointer should ideally be preserved (fast path) + # but may change if contiguous extension fails (slow path) + assert grown_buffer.handle is not None + + # Test growing to same size (should return original buffer) + same_buffer = vmm_mr.modify_allocation(grown_buffer, 8192) + assert same_buffer is grown_buffer + + # Test growing to smaller size (should return original buffer) + smaller_buffer = vmm_mr.modify_allocation(grown_buffer, 4096) + assert smaller_buffer is grown_buffer + + # Clean up + grown_buffer.close() From bb5de7f34b3f5d4083687054de806acf1d160b7d Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 12 Sep 2025 15:32:09 -0700 Subject: [PATCH 05/35] Fix format with pre-commit hooks --- cuda_core/tests/test_memory.py | 74 +++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 0ceef8f27..71f523189 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -10,7 +10,14 @@ import pytest -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource, VMMAllocatedMemoryResource, VMMConfig +from cuda.core.experimental import ( + Buffer, + Device, + DeviceMemoryResource, + MemoryResource, + VMMAllocatedMemoryResource, + VMMConfig, +) from cuda.core.experimental._memory import DLDeviceType from cuda.core.experimental._utils.cuda_utils import handle_return @@ -284,48 +291,50 @@ def test_device_memory_resource_initialization(): assert buffer.device_id == device.device_id buffer.close() + def test_vmm_allocator_basic_allocation(): """Test basic VMM allocation functionality. - + This test verifies that VMMAllocatedMemoryResource can allocate memory using CUDA VMM APIs with default configuration. """ device = Device() device.set_current() - + # Create VMM allocator with default config vmm_mr = VMMAllocatedMemoryResource(device) - + # Test basic allocation buffer = vmm_mr.allocate(4096) assert buffer.size >= 4096 # May be aligned up assert buffer.device_id == device.device_id assert buffer.memory_resource == vmm_mr - + # Test deallocation buffer.close() - + # Test multiple allocations buffers = [] for i in range(5): buf = vmm_mr.allocate(1024 * (i + 1)) buffers.append(buf) assert buf.size >= 1024 * (i + 1) - + # Clean up for buf in buffers: buf.close() + def test_vmm_allocator_policy_configuration(): """Test VMM allocator with different policy configurations. - + This test verifies that VMMAllocatedMemoryResource can be configured with different allocation policies and that the configuration affects the allocation behavior. 
""" device = Device() device.set_current() - + # Test with custom VMM config custom_config = VMMConfig( allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, @@ -337,19 +346,19 @@ def test_vmm_allocator_policy_configuration(): self_access="rw", peer_access="rw", ) - + vmm_mr = VMMAllocatedMemoryResource(device, config=custom_config) - + # Verify configuration is applied assert vmm_mr.config == custom_config assert vmm_mr.config.gpu_direct_rdma is True assert vmm_mr.config.granularity == driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM - + # Test allocation with custom config buffer = vmm_mr.allocate(8192) assert buffer.size >= 8192 assert buffer.device_id == device.device_id - + # Test policy modification new_config = VMMConfig( allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, @@ -361,51 +370,50 @@ def test_vmm_allocator_policy_configuration(): self_access="r", # Read-only access peer_access="r", ) - + # Modify allocation policy modified_buffer = vmm_mr.modify_allocation(buffer, 16384, config=new_config) assert modified_buffer.size >= 16384 assert vmm_mr.config == new_config assert vmm_mr.config.self_access == "r" - + # Clean up modified_buffer.close() + def test_vmm_allocator_grow_allocation(): """Test VMM allocator's ability to grow existing allocations. - + This test verifies that VMMAllocatedMemoryResource can grow existing allocations while preserving the base pointer when possible. """ device = Device() device.set_current() - + vmm_mr = VMMAllocatedMemoryResource(device) - + # Create initial allocation - buffer = vmm_mr.allocate(4096) - original_ptr = buffer.handle + buffer = vmm_mr.allocate(2 * 1024 * 1024) original_size = buffer.size - + # Grow the allocation - grown_buffer = vmm_mr.modify_allocation(buffer, 8192) - + grown_buffer = vmm_mr.modify_allocation(buffer, 4 * 1024 * 1024) + # Verify growth - assert grown_buffer.size >= 8192 + assert grown_buffer.size >= 4 * 1024 * 1024 assert grown_buffer.size > original_size - + # The pointer should ideally be preserved (fast path) # but may change if contiguous extension fails (slow path) assert grown_buffer.handle is not None - + # Test growing to same size (should return original buffer) - same_buffer = vmm_mr.modify_allocation(grown_buffer, 8192) - assert same_buffer is grown_buffer - + same_buffer = vmm_mr.modify_allocation(grown_buffer, 4 * 1024 * 1024) + assert same_buffer.size == grown_buffer.size + # Test growing to smaller size (should return original buffer) - smaller_buffer = vmm_mr.modify_allocation(grown_buffer, 4096) - assert smaller_buffer is grown_buffer - + smaller_buffer = vmm_mr.modify_allocation(grown_buffer, 2 * 1024 * 1024) + assert smaller_buffer.size == grown_buffer.size + # Clean up grown_buffer.close() - From 4517ca8980d67f2cec215d3922eb336515f7559e Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 12 Sep 2025 15:38:40 -0700 Subject: [PATCH 06/35] Fix format with pre-commit hooks --- cuda_core/cuda/core/experimental/__init__.py | 9 ++- cuda_core/cuda/core/experimental/_memory.pyx | 62 ++++++++++---------- cuda_core/tests/test_memory.py | 2 +- 3 files changed, 40 insertions(+), 33 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index af06f4393..536899308 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -14,7 +14,14 @@ from cuda.core.experimental._launch_config import LaunchConfig from 
cuda.core.experimental._launcher import launch from cuda.core.experimental._linker import Linker, LinkerOptions -from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource, VMMAllocatedMemoryResource, VMMConfig +from cuda.core.experimental._memory import ( + Buffer, + DeviceMemoryResource, + LegacyPinnedMemoryResource, + MemoryResource, + VMMAllocatedMemoryResource, + VMMConfig, +) from cuda.core.experimental._module import Kernel, ObjectCode from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._stream import Stream, StreamOptions diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 0f4bd0efd..39a5f9d7c 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -553,7 +553,7 @@ class VMMConfig: if spec == "none": return 0 raise ValueError(f"Unknown access spec: {spec!r}") - + class VMMAllocatedMemoryResource(MemoryResource): """Create a device memory resource that uses the CUDA VMM APIs to allocate memory. @@ -564,7 +564,7 @@ class VMMAllocatedMemoryResource(MemoryResource): Device ordinal for which a memory resource is constructed. The mempool that is set to *current* on ``device_id`` is used. If no mempool is set to current yet, the driver would use the *default* mempool on the device. - + config : VMMConfig A configuration object for the VMMAllocatedMemoryResource """ @@ -581,10 +581,10 @@ class VMMAllocatedMemoryResource(MemoryResource): def modify_allocation(self, buf: Buffer, new_size: int, config: VMMConfig = None) -> Buffer: """ Grow an existing allocation using CUDA VMM, with a configurable policy. - + This implements true growing allocations that preserve the base pointer by extending the virtual address range and mapping additional physical memory. - + Parameters ---------- buf : Buffer @@ -593,19 +593,19 @@ class VMMAllocatedMemoryResource(MemoryResource): The new total size for the allocation config : VMMConfig, optional Configuration for the new physical memory chunks. If None, uses current config. 
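+
+        Note: when the existing virtual address range cannot be extended in
+        place, the slow path remaps the allocation and the returned Buffer
+        may hold a different pointer than ``buf``.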
- + Returns ------- Buffer The same buffer with updated size, preserving the original pointer - """ + """ if config is not None: self.config = config - + if new_size <= buf.size: # No growth needed, return original buffer return buf - + # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() prop.type = self.config.allocation_type @@ -613,20 +613,20 @@ class VMMAllocatedMemoryResource(MemoryResource): prop.location.id = self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 prop.requestedHandleTypes = self.config.handle_type - + # Query granularity gran_flag = self.config.granularity res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) if res != driver.CUresult.CUDA_SUCCESS: raise Exception(f"cuMemGetAllocationGranularity failed: {res}") - + # Calculate sizes additional_size = new_size - buf.size aligned_additional_size = self._align_up(additional_size, gran) total_aligned_size = self._align_up(new_size, gran) aligned_prev_size = total_aligned_size - aligned_additional_size addr_align = self.config.addr_align or gran - + # Try to extend the existing VA range first res, new_ptr = driver.cuMemAddressReserve( aligned_additional_size, @@ -634,7 +634,7 @@ class VMMAllocatedMemoryResource(MemoryResource): int(buf.handle) + aligned_prev_size, # fixedAddr hint - aligned end of current range 0 ) - + if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): # Fallback: couldn't extend contiguously, need full remapping return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) @@ -642,11 +642,11 @@ class VMMAllocatedMemoryResource(MemoryResource): # Success! We can extend the VA range contiguously return self._grow_allocation_fast_path(buf, new_size, prop, aligned_additional_size, new_ptr) - def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, + def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, aligned_additional_size: int, new_ptr: int) -> Buffer: """ Fast path: extend the VA range contiguously. - + This preserves the original pointer by mapping new physical memory to the extended portion of the virtual address range. """ @@ -655,14 +655,14 @@ class VMMAllocatedMemoryResource(MemoryResource): if res != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, aligned_additional_size) raise Exception(f"cuMemCreate failed: {res}") - + # Map the new physical memory to the extended VA range res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) if res != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, aligned_additional_size) driver.cuMemRelease(new_handle) raise Exception(f"cuMemMap failed: {res}") - + # Set access permissions for the new portion descs = self._build_access_descriptors(prop) if descs: @@ -672,17 +672,17 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemAddressFree(new_ptr, aligned_additional_size) driver.cuMemRelease(new_handle) raise Exception(f"cuMemSetAccess failed: {res}") - + # Update the buffer size (pointer stays the same!) buf._size = new_size - + return buf def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, aligned_additional_size: int, total_aligned_size: int, addr_align: int) -> Buffer: """ Slow path: full remapping when contiguous extension fails. 
- + This creates a new VA range and remaps both old and new physical memory. The buffer's pointer will change. """ @@ -690,13 +690,13 @@ class VMMAllocatedMemoryResource(MemoryResource): res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) if res != driver.CUresult.CUDA_SUCCESS: raise Exception(f"cuMemAddressReserve failed: {res}") - + # Get the old allocation handle for remapping result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) if result != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, total_aligned_size) raise Exception(f"Failed to retain old allocation handle: {result}") - + # Unmap the old VA range (aligned previous size) aligned_prev_size = total_aligned_size - aligned_additional_size result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) @@ -704,14 +704,14 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemAddressFree(new_ptr, total_aligned_size) driver.cuMemRelease(old_handle) raise Exception(f"Failed to unmap old allocation: {result}") - + # Remap the old physical memory to the new VA range (aligned previous size) res, = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) if res != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, total_aligned_size) driver.cuMemRelease(old_handle) raise Exception(f"cuMemMap failed for old memory: {res}") - + # Create new physical memory for the additional size res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) if res != driver.CUresult.CUDA_SUCCESS: @@ -719,7 +719,7 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemAddressFree(new_ptr, total_aligned_size) driver.cuMemRelease(old_handle) raise Exception(f"cuMemCreate failed for new memory: {res}") - + # Map the new physical memory to the extended portion (aligned offset) res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) if res != driver.CUresult.CUDA_SUCCESS: @@ -728,7 +728,7 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemRelease(old_handle) driver.cuMemRelease(new_handle) raise Exception(f"cuMemMap failed for new memory: {res}") - + # Set access permissions for the entire new range descs = self._build_access_descriptors(prop) if descs: @@ -739,7 +739,7 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemRelease(old_handle) driver.cuMemRelease(new_handle) raise Exception(f"cuMemSetAccess failed: {res}") - + # Free the old VA range (aligned previous size) driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) @@ -756,14 +756,14 @@ class VMMAllocatedMemoryResource(MemoryResource): def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: """ Build access descriptors for memory access permissions. 
-        
+
         Returns
         -------
         list
             List of CUmemAccessDesc objects for setting memory access
         """
         descs = []
-        
+
         # Owner access
         owner_flags = VMMConfig._access_to_flags(driver, self.config.self_access)
         if owner_flags:
             d = driver.CUmemAccessDesc()
             d.location.type = prop.location.type
             d.location.id = prop.location.id
             d.flags = owner_flags
             descs.append(d)
-        
+
         # Peer device access
         peer_flags = VMMConfig._access_to_flags(driver, self.config.peer_access)
         for peer_dev in self.config.peers:
             if peer_flags:
                 d = driver.CUmemAccessDesc()
                 d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
                 d.location.id = int(peer_dev)
                 d.flags = peer_flags
                 descs.append(d)
-        
+
         return descs
-        
+
     def allocate(self, size: int, stream: Stream = None) -> Buffer:
         """
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 71f523189..129f46825 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1,6 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-
+# Dummy change
 try:
     from cuda.bindings import driver
 except ImportError:

From aa4f8df1825086a0acf947691541f1ffdcf93235 Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Mon, 15 Sep 2025 11:02:42 -0700
Subject: [PATCH 07/35] Expose enumerator options through VMMAllocationOptions
 rather than exporting driver enums

---
 cuda_core/cuda/core/experimental/__init__.py |  2 +-
 cuda_core/cuda/core/experimental/_memory.pyx | 95 ++++++++++++++------
 cuda_core/tests/test_memory.py               | 25 +++---
 3 files changed, 83 insertions(+), 39 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py
index 536899308..0cb515e05 100644
--- a/cuda_core/cuda/core/experimental/__init__.py
+++ b/cuda_core/cuda/core/experimental/__init__.py
@@ -20,7 +20,7 @@
     LegacyPinnedMemoryResource,
     MemoryResource,
     VMMAllocatedMemoryResource,
-    VMMConfig,
+    VMMAllocationOptions,
 )
 from cuda.core.experimental._module import Kernel, ObjectCode
 from cuda.core.experimental._program import Program, ProgramOptions
diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 39a5f9d7c..d25d23854 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -511,7 +511,7 @@ class _SynchronousMemoryResource(MemoryResource):
         return self._dev_id
 
 @dataclass
-class VMMConfig:
+class VMMAllocationOptions:
     """A configuration object for the VMMAllocatedMemoryResource
     Stores configuration information which tells the resource how to use the CUDA VMM APIs
     """
@@ -531,11 +531,11 @@ class VMMConfig:
         self_access: Access flags for the owning device ('rw', 'r', or 'none').
         peer_access: Access flags for peers ('rw' or 'r').
     """
-    # TODO: for enums, do we re-expose them as cuda-core Enums or leave them as driver enums?
- allocation_type: driver.CUmemAllocationType = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED - location_type: driver.CUmemLocationType = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - handle_type: driver.CUmemAllocationHandleType = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR - granularity: driver.CUmemAllocationGranularity_flags = driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED + # Human-friendly strings; normalized in __post_init__ + allocation_type: str = "pinned" # pinned + location_type: str = "device" # device + handle_type: str = "posix-fd" # posix-fd | generic | none + granularity: str = "recommended" # minimum | recommended gpu_direct_rdma: bool = True addr_hint: Optional[int] = 0 addr_align: Optional[int] = None @@ -544,7 +544,7 @@ class VMMConfig: peer_access: str = "rw" # 'rw' | 'r' @staticmethod - def _access_to_flags(driver, spec: str): + def _access_to_flags(spec: str): f = driver.CUmemAccess_flags if spec == "rw": return f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE @@ -554,6 +554,51 @@ class VMMConfig: return 0 raise ValueError(f"Unknown access spec: {spec!r}") + @staticmethod + def _allocation_type_to_driver(spec: str): + if spec == "pinned": + return driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED + if spec == "managed": + return driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED + raise ValueError(f"Unsupported allocation_type: {spec!r}") + + @staticmethod + def _location_type_to_driver(spec: str): + if spec == "device": + return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + if spec == "host": + return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST + if spec == "host-numa": + return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA + if spec == "host-numa-current": + return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT + raise ValueError(f"Unsupported location_type: {spec!r}") + + @staticmethod + def _handle_type_to_driver(spec: str): + if spec == "posix-fd": + return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR + if spec == "generic": + return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC + if spec == "none": + return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + if spec == "win32": + return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32 + if spec == "win32-kmt": + return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32_KMT + if spec == "fabric": + return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_FABRIC + raise ValueError(f"Unsupported handle_type: {spec!r}") + + @staticmethod + def _granularity_to_driver(spec: str): + f = driver.CUmemAllocationGranularity_flags + if spec == "minimum": + return f.CU_MEM_ALLOC_GRANULARITY_MINIMUM + if spec == "recommended": + return f.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED + raise ValueError(f"Unsupported granularity: {spec!r}") + class VMMAllocatedMemoryResource(MemoryResource): """Create a device memory resource that uses the CUDA VMM APIs to allocate memory. @@ -565,12 +610,12 @@ class VMMAllocatedMemoryResource(MemoryResource): set to *current* on ``device_id`` is used. If no mempool is set to current yet, the driver would use the *default* mempool on the device. 
- config : VMMConfig + config : VMMAllocationOptions A configuration object for the VMMAllocatedMemoryResource """ - def __init__(self, device, config: VMMConfig = None): + def __init__(self, device, config: VMMAllocationOptions = None): self.device = device - self.config = config or VMMConfig() + self.config = config or VMMAllocationOptions() def _align_up(self, size: int, gran: int) -> int: """ @@ -578,7 +623,7 @@ class VMMAllocatedMemoryResource(MemoryResource): """ return (size + gran - 1) & ~(gran - 1) - def modify_allocation(self, buf: Buffer, new_size: int, config: VMMConfig = None) -> Buffer: + def modify_allocation(self, buf: Buffer, new_size: int, config: VMMAllocationOptions = None) -> Buffer: """ Grow an existing allocation using CUDA VMM, with a configurable policy. @@ -591,7 +636,7 @@ class VMMAllocatedMemoryResource(MemoryResource): The existing buffer to grow new_size : int The new total size for the allocation - config : VMMConfig, optional + config : VMMAllocationOptions, optional Configuration for the new physical memory chunks. If None, uses current config. Returns @@ -608,14 +653,14 @@ class VMMAllocatedMemoryResource(MemoryResource): # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() - prop.type = self.config.allocation_type - prop.location.type = self.config.location_type + prop.type = VMMAllocationOptions._allocation_type_to_driver(self.config.allocation_type) + prop.location.type = VMMAllocationOptions._location_type_to_driver(self.config.location_type) prop.location.id = self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 - prop.requestedHandleTypes = self.config.handle_type + prop.requestedHandleTypes = VMMAllocationOptions._handle_type_to_driver(self.config.handle_type) # Query granularity - gran_flag = self.config.granularity + gran_flag = VMMAllocationOptions._granularity_to_driver(self.config.granularity) res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) if res != driver.CUresult.CUDA_SUCCESS: raise Exception(f"cuMemGetAllocationGranularity failed: {res}") @@ -765,7 +810,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs = [] # Owner access - owner_flags = VMMConfig._access_to_flags(driver, self.config.self_access) + owner_flags = VMMAllocationOptions._access_to_flags(self.config.self_access) if owner_flags: d = driver.CUmemAccessDesc() d.location.type = prop.location.type @@ -774,7 +819,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs.append(d) # Peer device access - peer_flags = VMMConfig._access_to_flags(driver, self.config.peer_access) + peer_flags = VMMAllocationOptions._access_to_flags(self.config.peer_access) for peer_dev in self.config.peers: if peer_flags: d = driver.CUmemAccessDesc() @@ -793,18 +838,18 @@ class VMMAllocatedMemoryResource(MemoryResource): config = self.config # ---- Build allocation properties ---- prop = driver.CUmemAllocationProp() - prop.type = config.allocation_type + prop.type = VMMAllocationOptions._allocation_type_to_driver(config.allocation_type) # TODO: Support host alloation if required - if config.location_type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: + if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") - prop.location.type = config.location_type + prop.location.type = VMMAllocationOptions._location_type_to_driver(config.location_type) prop.location.id = 
self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 - prop.requestedHandleTypes = config.handle_type + prop.requestedHandleTypes = VMMAllocationOptions._handle_type_to_driver(config.handle_type) # ---- Query and apply granularity ---- # Choose min vs recommended granularity per config - gran_flag = config.granularity + gran_flag = VMMAllocationOptions._granularity_to_driver(config.granularity) res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) if res != driver.CUresult.CUDA_SUCCESS: raise Exception(f"cuMemGetAllocationGranularity failed: {res}") @@ -836,7 +881,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs = [] # Owner access - owner_flags = VMMConfig._access_to_flags(driver, config.self_access) + owner_flags = VMMAllocationOptions._access_to_flags(config.self_access) if owner_flags: d = driver.CUmemAccessDesc() d.location.type = prop.location.type @@ -845,7 +890,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs.append(d) # Peer device access - peer_flags = VMMConfig._access_to_flags(driver, config.peer_access) + peer_flags = VMMAllocationOptions._access_to_flags(config.peer_access) for peer_dev in config.peers: if peer_flags: d = driver.CUmemAccessDesc() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 129f46825..8bf024760 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1,6 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Dummy change try: from cuda.bindings import driver except ImportError: @@ -16,7 +15,7 @@ DeviceMemoryResource, MemoryResource, VMMAllocatedMemoryResource, - VMMConfig, + VMMAllocationOptions, ) from cuda.core.experimental._memory import DLDeviceType from cuda.core.experimental._utils.cuda_utils import handle_return @@ -336,12 +335,12 @@ def test_vmm_allocator_policy_configuration(): device.set_current() # Test with custom VMM config - custom_config = VMMConfig( - allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, - location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, - granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM, + custom_config = VMMAllocationOptions( + allocation_type="pinned", + location_type="device", + granularity="minimum", gpu_direct_rdma=True, - handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + handle_type="posix-fd", peers=(), self_access="rw", peer_access="rw", @@ -352,7 +351,7 @@ def test_vmm_allocator_policy_configuration(): # Verify configuration is applied assert vmm_mr.config == custom_config assert vmm_mr.config.gpu_direct_rdma is True - assert vmm_mr.config.granularity == driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM + assert vmm_mr.config.granularity == "minimum" # Test allocation with custom config buffer = vmm_mr.allocate(8192) @@ -360,12 +359,12 @@ def test_vmm_allocator_policy_configuration(): assert buffer.device_id == device.device_id # Test policy modification - new_config = VMMConfig( - allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, - location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, - granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, + new_config = VMMAllocationOptions( + allocation_type="pinned", + location_type="device", + granularity="recommended", 
gpu_direct_rdma=False, - handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + handle_type="posix-fd", peers=(), self_access="r", # Read-only access peer_access="r", From b1d99e55e0d43aa87b940dd2a6ed3d45b2f88281 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Thu, 18 Sep 2025 11:02:45 -0700 Subject: [PATCH 08/35] fix merge conflict --- cuda_core/tests/test_memory.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 9fcf63f94..21c8baf3d 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -10,7 +10,6 @@ import pytest -<<<<<<< HEAD from cuda.core.experimental import ( Buffer, Device, @@ -19,11 +18,8 @@ VMMAllocatedMemoryResource, VMMAllocationOptions, ) -from cuda.core.experimental._memory import DLDeviceType -======= from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor ->>>>>>> d8b4acc1838845d08eaa3f7248246af5244617a8 from cuda.core.experimental._utils.cuda_utils import handle_return POOL_SIZE = 2097152 # 2MB size From a9f41916d864c139e295a63550f90058c334f7f9 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Thu, 18 Sep 2025 11:44:12 -0700 Subject: [PATCH 09/35] fix pre-commit issues --- cuda_core/tests/test_memory.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 21c8baf3d..0887dd64d 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -9,6 +9,8 @@ import platform import pytest +from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor +from cuda.core.experimental._utils.cuda_utils import handle_return from cuda.core.experimental import ( Buffer, @@ -18,9 +20,6 @@ VMMAllocatedMemoryResource, VMMAllocationOptions, ) -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource -from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor -from cuda.core.experimental._utils.cuda_utils import handle_return POOL_SIZE = 2097152 # 2MB size From d1b3379d4e665e444e2c3b352b8f787693143578 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 18 Sep 2025 18:53:18 +0000 Subject: [PATCH 10/35] [pre-commit.ci] auto code formatting --- cuda_core/tests/test_memory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 0887dd64d..e66ef35c0 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -9,8 +9,6 @@ import platform import pytest -from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor -from cuda.core.experimental._utils.cuda_utils import handle_return from cuda.core.experimental import ( Buffer, @@ -20,6 +18,8 @@ VMMAllocatedMemoryResource, VMMAllocationOptions, ) +from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor +from cuda.core.experimental._utils.cuda_utils import handle_return POOL_SIZE = 2097152 # 2MB size From 071ab74d30c12c3e1023453bdc9bba45af65bd59 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Thu, 18 Sep 2025 14:54:42 -0700 Subject: [PATCH 11/35] Address Leo's first comments --- cuda_core/cuda/core/experimental/__init__.py | 4 +- cuda_core/cuda/core/experimental/_memory.pyx | 72 +++++++++++--------- 
cuda_core/tests/test_memory.py | 20 +++--- 3 files changed, 51 insertions(+), 45 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index d08a96538..bbf75ac85 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -20,8 +20,8 @@ IPCChannel, LegacyPinnedMemoryResource, MemoryResource, - VMMAllocatedMemoryResource, - VMMAllocationOptions, + VirtualMemoryResource, + VirtualMemoryResourceOptions, ) from cuda.core.experimental._module import Kernel, ObjectCode from cuda.core.experimental._program import Program, ProgramOptions diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index bdea9e823..fc7839d35 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -13,7 +13,7 @@ from cuda.core.experimental._utils.cuda_utils cimport ( from dataclasses import dataclass from typing import TypeVar, Union, TYPE_CHECKING import abc -from typing import TypeVar, Union, Optional, Iterable +from typing import TypeVar, Union, Optional, Iterable, Literal from dataclasses import dataclass, field import array import cython @@ -904,9 +904,15 @@ class _SynchronousMemoryResource(MemoryResource): def device_id(self) -> int: return self._dev_id +VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none"] +VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"] +VirtualMemoryGranularityT = Literal["minimum", "recommended"] +VirtualMemoryAccessTypeT = Literal["rw", "r", "none"] +VirtualMemoryAllocationTypeT = Literal["pinned", "managed"] + @dataclass -class VMMAllocationOptions: - """A configuration object for the VMMAllocatedMemoryResource +class VirtualMemoryResourceOptions: + """A configuration object for the VirtualMemoryResource Stores configuration information which tells the resource how to use the CUDA VMM APIs """ """ @@ -926,16 +932,16 @@ class VMMAllocationOptions: peer_access: Access flags for peers ('rw' or 'r'). 
""" # Human-friendly strings; normalized in __post_init__ - allocation_type: str = "pinned" # pinned - location_type: str = "device" # device - handle_type: str = "posix-fd" # posix-fd | generic | none - granularity: str = "recommended" # minimum | recommended + allocation_type: VirtualMemoryAllocationTypeT = "pinned" + location_type: VirtualMemoryLocationTypeT = "device" + handle_type: VirtualMemoryHandleTypeT = "posix_fd" + granularity: VirtualMemoryGranularityT = "recommended" gpu_direct_rdma: bool = True addr_hint: Optional[int] = 0 addr_align: Optional[int] = None peers: Iterable[int] = field(default_factory=tuple) - self_access: str = "rw" # 'rw' | 'r' | 'none' - peer_access: str = "rw" # 'rw' | 'r' + self_access: VirtualMemoryAccessTypeT = "rw" + peer_access: VirtualMemoryAccessTypeT = "rw" @staticmethod def _access_to_flags(spec: str): @@ -962,15 +968,15 @@ class VMMAllocationOptions: return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE if spec == "host": return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST - if spec == "host-numa": + if spec == "host_numa": return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA - if spec == "host-numa-current": + if spec == "host_numa_current": return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT raise ValueError(f"Unsupported location_type: {spec!r}") @staticmethod def _handle_type_to_driver(spec: str): - if spec == "posix-fd": + if spec == "posix_fd": return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR if spec == "generic": return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC @@ -978,7 +984,7 @@ class VMMAllocationOptions: return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE if spec == "win32": return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32 - if spec == "win32-kmt": + if spec == "win32_kmt": return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32_KMT if spec == "fabric": return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_FABRIC @@ -994,7 +1000,7 @@ class VMMAllocationOptions: raise ValueError(f"Unsupported granularity: {spec!r}") -class VMMAllocatedMemoryResource(MemoryResource): +class VirtualMemoryResource(MemoryResource): """Create a device memory resource that uses the CUDA VMM APIs to allocate memory. Parameters @@ -1004,12 +1010,12 @@ class VMMAllocatedMemoryResource(MemoryResource): set to *current* on ``device_id`` is used. If no mempool is set to current yet, the driver would use the *default* mempool on the device. - config : VMMAllocationOptions - A configuration object for the VMMAllocatedMemoryResource + config : VirtualMemoryResourceOptions + A configuration object for the VirtualMemoryResource """ - def __init__(self, device, config: VMMAllocationOptions = None): + def __init__(self, device, config: VirtualMemoryResourceOptions = None): self.device = device - self.config = config or VMMAllocationOptions() + self.config = config or VirtualMemoryResourceOptions() def _align_up(self, size: int, gran: int) -> int: """ @@ -1017,7 +1023,7 @@ class VMMAllocatedMemoryResource(MemoryResource): """ return (size + gran - 1) & ~(gran - 1) - def modify_allocation(self, buf: Buffer, new_size: int, config: VMMAllocationOptions = None) -> Buffer: + def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryResourceOptions = None) -> Buffer: """ Grow an existing allocation using CUDA VMM, with a configurable policy. 
@@ -1030,7 +1036,7 @@ class VMMAllocatedMemoryResource(MemoryResource): The existing buffer to grow new_size : int The new total size for the allocation - config : VMMAllocationOptions, optional + config : VirtualMemoryResourceOptions, optional Configuration for the new physical memory chunks. If None, uses current config. Returns @@ -1047,14 +1053,14 @@ class VMMAllocatedMemoryResource(MemoryResource): # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() - prop.type = VMMAllocationOptions._allocation_type_to_driver(self.config.allocation_type) - prop.location.type = VMMAllocationOptions._location_type_to_driver(self.config.location_type) + prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) + prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(self.config.location_type) prop.location.id = self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 - prop.requestedHandleTypes = VMMAllocationOptions._handle_type_to_driver(self.config.handle_type) + prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(self.config.handle_type) # Query granularity - gran_flag = VMMAllocationOptions._granularity_to_driver(self.config.granularity) + gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(self.config.granularity) res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) if res != driver.CUresult.CUDA_SUCCESS: raise Exception(f"cuMemGetAllocationGranularity failed: {res}") @@ -1204,7 +1210,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs = [] # Owner access - owner_flags = VMMAllocationOptions._access_to_flags(self.config.self_access) + owner_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.self_access) if owner_flags: d = driver.CUmemAccessDesc() d.location.type = prop.location.type @@ -1213,7 +1219,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs.append(d) # Peer device access - peer_flags = VMMAllocationOptions._access_to_flags(self.config.peer_access) + peer_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.peer_access) for peer_dev in self.config.peers: if peer_flags: d = driver.CUmemAccessDesc() @@ -1232,18 +1238,18 @@ class VMMAllocatedMemoryResource(MemoryResource): config = self.config # ---- Build allocation properties ---- prop = driver.CUmemAllocationProp() - prop.type = VMMAllocationOptions._allocation_type_to_driver(config.allocation_type) + prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) # TODO: Support host alloation if required if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") - prop.location.type = VMMAllocationOptions._location_type_to_driver(config.location_type) + prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) prop.location.id = self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 - prop.requestedHandleTypes = VMMAllocationOptions._handle_type_to_driver(config.handle_type) + prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(config.handle_type) # ---- Query and apply granularity ---- # Choose min vs recommended granularity per config - gran_flag = VMMAllocationOptions._granularity_to_driver(config.granularity) + gran_flag = 
VirtualMemoryResourceOptions._granularity_to_driver(config.granularity) res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) if res != driver.CUresult.CUDA_SUCCESS: raise Exception(f"cuMemGetAllocationGranularity failed: {res}") @@ -1275,7 +1281,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs = [] # Owner access - owner_flags = VMMAllocationOptions._access_to_flags(config.self_access) + owner_flags = VirtualMemoryResourceOptions._access_to_flags(config.self_access) if owner_flags: d = driver.CUmemAccessDesc() d.location.type = prop.location.type @@ -1284,7 +1290,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs.append(d) # Peer device access - peer_flags = VMMAllocationOptions._access_to_flags(config.peer_access) + peer_flags = VirtualMemoryResourceOptions._access_to_flags(config.peer_access) for peer_dev in config.peers: if peer_flags: d = driver.CUmemAccessDesc() @@ -1361,4 +1367,4 @@ class VMMAllocatedMemoryResource(MemoryResource): Returns: str: A string describing the object """ - return f"" + return f"" diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index e66ef35c0..b5cf0d60c 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -15,8 +15,8 @@ Device, DeviceMemoryResource, MemoryResource, - VMMAllocatedMemoryResource, - VMMAllocationOptions, + VirtualMemoryResource, + VirtualMemoryResourceOptions, ) from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor from cuda.core.experimental._utils.cuda_utils import handle_return @@ -310,14 +310,14 @@ def test_device_memory_resource_initialization(mempool_device, use_device_object def test_vmm_allocator_basic_allocation(): """Test basic VMM allocation functionality. - This test verifies that VMMAllocatedMemoryResource can allocate memory + This test verifies that VirtualMemoryResource can allocate memory using CUDA VMM APIs with default configuration. """ device = Device() device.set_current() # Create VMM allocator with default config - vmm_mr = VMMAllocatedMemoryResource(device) + vmm_mr = VirtualMemoryResource(device) # Test basic allocation buffer = vmm_mr.allocate(4096) @@ -343,7 +343,7 @@ def test_vmm_allocator_basic_allocation(): def test_vmm_allocator_policy_configuration(): """Test VMM allocator with different policy configurations. - This test verifies that VMMAllocatedMemoryResource can be configured + This test verifies that VirtualMemoryResource can be configured with different allocation policies and that the configuration affects the allocation behavior. 
""" @@ -351,7 +351,7 @@ def test_vmm_allocator_policy_configuration(): device.set_current() # Test with custom VMM config - custom_config = VMMAllocationOptions( + custom_config = VirtualMemoryResourceOptions( allocation_type="pinned", location_type="device", granularity="minimum", @@ -362,7 +362,7 @@ def test_vmm_allocator_policy_configuration(): peer_access="rw", ) - vmm_mr = VMMAllocatedMemoryResource(device, config=custom_config) + vmm_mr = VirtualMemoryResource(device, config=custom_config) # Verify configuration is applied assert vmm_mr.config == custom_config @@ -375,7 +375,7 @@ def test_vmm_allocator_policy_configuration(): assert buffer.device_id == device.device_id # Test policy modification - new_config = VMMAllocationOptions( + new_config = VirtualMemoryResourceOptions( allocation_type="pinned", location_type="device", granularity="recommended", @@ -399,13 +399,13 @@ def test_vmm_allocator_policy_configuration(): def test_vmm_allocator_grow_allocation(): """Test VMM allocator's ability to grow existing allocations. - This test verifies that VMMAllocatedMemoryResource can grow existing + This test verifies that VirtualMemoryResource can grow existing allocations while preserving the base pointer when possible. """ device = Device() device.set_current() - vmm_mr = VMMAllocatedMemoryResource(device) + vmm_mr = VirtualMemoryResource(device) # Create initial allocation buffer = vmm_mr.allocate(2 * 1024 * 1024) From 52f2644863a1d278ebcba09911ee55e729d8a806 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 19 Sep 2025 14:12:14 -0700 Subject: [PATCH 12/35] save state before I muck with error handling and it gets too messy --- cuda_core/cuda/core/experimental/_memory.pyx | 103 ++++++++----------- 1 file changed, 44 insertions(+), 59 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index fc7839d35..ae992afba 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -946,58 +946,47 @@ class VirtualMemoryResourceOptions: @staticmethod def _access_to_flags(spec: str): f = driver.CUmemAccess_flags - if spec == "rw": - return f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE - if spec == "r": - return f.CU_MEM_ACCESS_FLAGS_PROT_READ - if spec == "none": - return 0 - raise ValueError(f"Unknown access spec: {spec!r}") + _access_flags = {"rw": f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": f.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} + flags = _access_flags.get(string) + if not flags: + raise ValueError(f"Unknown access spec: {spec!r}") + return flags @staticmethod def _allocation_type_to_driver(spec: str): - if spec == "pinned": - return driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED - if spec == "managed": - return driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED - raise ValueError(f"Unsupported allocation_type: {spec!r}") + f = driver.CUmemAllocationType + _allocation_type = {"pinned": f.CU_MEM_ALLOCATION_TYPE_PINNED, "managed": f.CU_MEM_ALLOCATION_TYPE_MANAGED} + alloc_type = _allocation_type.get(spec) + if not alloc_type: + raise ValueError(f"Unsupported allocation_type: {spec!r}") + return alloc_type @staticmethod def _location_type_to_driver(spec: str): - if spec == "device": - return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - if spec == "host": - return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST - if spec == "host_numa": - return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA - if spec == "host_numa_current": - return 
driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT - raise ValueError(f"Unsupported location_type: {spec!r}") + f = driver.CUmemLocationType + _location_type = {"device": f.CU_MEM_LOCATION_TYPE_DEVICE, "host": f.CU_MEM_LOCATION_TYPE_HOST, "host_numa": f.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": f.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} + loc_type = _location_type.get(spec) + if not loc_type: + raise ValueError(f"Unsupported location_type: {spec!r}") + return loc_type @staticmethod def _handle_type_to_driver(spec: str): - if spec == "posix_fd": - return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR - if spec == "generic": - return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC - if spec == "none": - return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - if spec == "win32": - return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32 - if spec == "win32_kmt": - return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32_KMT - if spec == "fabric": - return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_FABRIC - raise ValueError(f"Unsupported handle_type: {spec!r}") + f = driver.CUmemAllocationHandleType + _handle_type = {"posix_fd": f.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "generic": f.CU_MEM_HANDLE_TYPE_GENERIC, "none": f.CU_MEM_HANDLE_TYPE_NONE, "win32": f.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": f.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": f.CU_MEM_HANDLE_TYPE_FABRIC} + handle_type = _handle_type.get(spec) + if not handle_type: + raise ValueError(f"Unsupported handle_type: {spec!r}") + return handle_type @staticmethod def _granularity_to_driver(spec: str): f = driver.CUmemAllocationGranularity_flags - if spec == "minimum": - return f.CU_MEM_ALLOC_GRANULARITY_MINIMUM - if spec == "recommended": - return f.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED - raise ValueError(f"Unsupported granularity: {spec!r}") + _granularity = {"minimum": f.CU_MEM_ALLOC_GRANULARITY_MINIMUM, "recommended": f.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED} + granularity = _granularity.get(spec) + if not granularity: + raise ValueError(f"Unsupported granularity: {spec!r}") + return granularity class VirtualMemoryResource(MemoryResource): @@ -1015,7 +1004,9 @@ class VirtualMemoryResource(MemoryResource): """ def __init__(self, device, config: VirtualMemoryResourceOptions = None): self.device = device - self.config = config or VirtualMemoryResourceOptions() + self.config = check_or_create_options( + VirtualMemoryResourceOptions, config, "VirtualMemoryResource options", keep_none=False + ) def _align_up(self, size: int, gran: int) -> int: """ @@ -1047,10 +1038,6 @@ class VirtualMemoryResource(MemoryResource): if config is not None: self.config = config - if new_size <= buf.size: - # No growth needed, return original buffer - return buf - # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) @@ -1062,8 +1049,7 @@ class VirtualMemoryResource(MemoryResource): # Query granularity gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(self.config.granularity) res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) - if res != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"cuMemGetAllocationGranularity failed: {res}") + raise_if_driver_error(res) # Calculate sizes additional_size = new_size - buf.size @@ -1097,9 +1083,9 @@ class VirtualMemoryResource(MemoryResource): """ # Create new physical memory for the 
additional size res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(new_ptr, aligned_additional_size) - raise Exception(f"cuMemCreate failed: {res}") + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(new_ptr, aligned_additional_size) + raise Exception(f"cuMemCreate failed: {res}") # Map the new physical memory to the extended VA range res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) @@ -1235,6 +1221,9 @@ class VirtualMemoryResource(MemoryResource): """ Allocate memory using CUDA VMM with a configurable policy. """ + if stream is not None: + raise NotImplementedError("Stream is not supported with VirtualMemoryResource") + config = self.config # ---- Build allocation properties ---- prop = driver.CUmemAllocationProp() @@ -1242,6 +1231,7 @@ class VirtualMemoryResource(MemoryResource): # TODO: Support host alloation if required if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") + prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) prop.location.id = self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 @@ -1251,8 +1241,7 @@ class VirtualMemoryResource(MemoryResource): # Choose min vs recommended granularity per config gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(config.granularity) res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) - if res != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"cuMemGetAllocationGranularity failed: {res}") + raise_if_driver_error(res) aligned_size = self._align_up(size, gran) addr_align = config.addr_align or gran @@ -1317,17 +1306,13 @@ class VirtualMemoryResource(MemoryResource): Deallocate memory on the device using CUDA VMM APIs. """ result, handle = driver.cuMemRetainAllocationHandle(ptr) - if result != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"Failed to retain allocation handle: {result}") + raise_if_driver_error(result) result, = driver.cuMemUnmap(ptr, size) - if result != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"Failed to unmap physical allocation: {result}") + raise_if_driver_error(result) result, = driver.cuMemAddressFree(ptr, size) - if result != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"Failed to free address: {result}") + raise_if_driver_error(result) result, = driver.cuMemRelease(handle) - if result != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"Failed to release physical allocation: {result}") + raise_if_driver_error(result) @property From 24888186e9d388850aefae11d3c66753a006cc21 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 19 Sep 2025 15:17:56 -0700 Subject: [PATCH 13/35] Overhaul the error handling and implement a Transaction() class to help with that. 
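
Transaction is a small undo stack built on contextlib.ExitStack: each driver
call that succeeds registers a compensating cleanup with append(), and
commit() disarms the stack once the whole sequence has succeeded. If any step
raises before commit(), the registered callbacks run in LIFO order and unwind
the partial work. A minimal usage sketch (illustrative only; it mirrors how
allocate() uses the helper in this diff):

    with Transaction() as trans:
        res, handle = driver.cuMemCreate(aligned_size, prop, 0)
        raise_if_driver_error(res)
        trans.append(driver.cuMemRelease, handle)  # undo if a later step fails
        res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, 0, 0)
        raise_if_driver_error(res)
        trans.append(driver.cuMemAddressFree, ptr, aligned_size)
        trans.commit()  # success: all undo actions are disarmed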
--- cuda_core/cuda/core/experimental/_memory.pyx | 251 +++++++++--------- .../core/experimental/_utils/cuda_utils.pyx | 33 +++ 2 files changed, 157 insertions(+), 127 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index ae992afba..8787ce7fe 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -21,9 +21,10 @@ import os import platform import weakref + from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream -from cuda.core.experimental._utils.cuda_utils import driver +from cuda.core.experimental._utils.cuda_utils import driver, Transaction if platform.system() == "Linux": import socket @@ -947,7 +948,7 @@ class VirtualMemoryResourceOptions: def _access_to_flags(spec: str): f = driver.CUmemAccess_flags _access_flags = {"rw": f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": f.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} - flags = _access_flags.get(string) + flags = _access_flags.get(spec) if not flags: raise ValueError(f"Unknown access spec: {spec!r}") return flags @@ -1081,32 +1082,31 @@ class VirtualMemoryResource(MemoryResource): This preserves the original pointer by mapping new physical memory to the extended portion of the virtual address range. """ - # Create new physical memory for the additional size - res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(new_ptr, aligned_additional_size) - raise Exception(f"cuMemCreate failed: {res}") - - # Map the new physical memory to the extended VA range - res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(new_ptr, aligned_additional_size) - driver.cuMemRelease(new_handle) - raise Exception(f"cuMemMap failed: {res}") - - # Set access permissions for the new portion - descs = self._build_access_descriptors(prop) - if descs: - res, = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemUnmap(new_ptr, aligned_additional_size) - driver.cuMemAddressFree(new_ptr, aligned_additional_size) - driver.cuMemRelease(new_handle) - raise Exception(f"cuMemSetAccess failed: {res}") - - # Update the buffer size (pointer stays the same!) 
+ with Transaction() as trans: + # Create new physical memory for the additional size + trans.append(lambda: driver.cuMemAddressFree(new_ptr, aligned_additional_size)) + res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) + raise_if_driver_error(res) + # Register undo for creation + trans.append(lambda: driver.cuMemRelease(new_handle)) + + # Map the new physical memory to the extended VA range + res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) + raise_if_driver_error(res) + # Register undo for mapping + trans.append(lambda: driver.cuMemUnmap(new_ptr, aligned_additional_size)) + + # Set access permissions for the new portion + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) + raise_if_driver_error(res) + + # All succeeded, cancel undo actions + trans.commit() + + # Update the buffer size (pointer stays the same) buf._size = new_size - return buf def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, @@ -1117,59 +1117,61 @@ class VirtualMemoryResource(MemoryResource): This creates a new VA range and remaps both old and new physical memory. The buffer's pointer will change. """ - # Reserve a completely new, larger VA range - res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) - if res != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"cuMemAddressReserve failed: {res}") - - # Get the old allocation handle for remapping - result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) - if result != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(new_ptr, total_aligned_size) - raise Exception(f"Failed to retain old allocation handle: {result}") - - # Unmap the old VA range (aligned previous size) - aligned_prev_size = total_aligned_size - aligned_additional_size - result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) - if result != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(new_ptr, total_aligned_size) - driver.cuMemRelease(old_handle) - raise Exception(f"Failed to unmap old allocation: {result}") - - # Remap the old physical memory to the new VA range (aligned previous size) - res, = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(new_ptr, total_aligned_size) - driver.cuMemRelease(old_handle) - raise Exception(f"cuMemMap failed for old memory: {res}") - - # Create new physical memory for the additional size - res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemUnmap(new_ptr, total_aligned_size) - driver.cuMemAddressFree(new_ptr, total_aligned_size) - driver.cuMemRelease(old_handle) - raise Exception(f"cuMemCreate failed for new memory: {res}") - - # Map the new physical memory to the extended portion (aligned offset) - res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemUnmap(new_ptr, total_aligned_size) - driver.cuMemAddressFree(new_ptr, total_aligned_size) - driver.cuMemRelease(old_handle) - driver.cuMemRelease(new_handle) - raise Exception(f"cuMemMap failed for new memory: {res}") - - # Set access permissions for the entire new range - descs = self._build_access_descriptors(prop) - if descs: - res, = driver.cuMemSetAccess(new_ptr, total_aligned_size, descs, len(descs)) - if res != 
driver.CUresult.CUDA_SUCCESS: - driver.cuMemUnmap(new_ptr, total_aligned_size) - driver.cuMemAddressFree(new_ptr, total_aligned_size) - driver.cuMemRelease(old_handle) - driver.cuMemRelease(new_handle) - raise Exception(f"cuMemSetAccess failed: {res}") + with Transaction() as trans: + # Reserve a completely new, larger VA range + res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) + raise_if_driver_error(res) + # Register undo for VA reservation + trans.append(lambda: driver.cuMemAddressFree(new_ptr, total_aligned_size)) + + # Get the old allocation handle for remapping + result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) + raise_if_driver_error(result) + # Register undo for old_handle + trans.append(lambda: driver.cuMemRelease(old_handle)) + + # Unmap the old VA range (aligned previous size) + aligned_prev_size = total_aligned_size - aligned_additional_size + result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) + raise_if_driver_error(result) + + def _remap_old(): + # Try to remap the old physical memory back to the original VA range + try: + driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) + except Exception: + pass + trans.append(_remap_old) + + # Remap the old physical memory to the new VA range (aligned previous size) + res, = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) + raise_if_driver_error(res) + + # Register undo for mapping + trans.append(lambda: driver.cuMemUnmap(new_ptr, aligned_prev_size)) + + # Create new physical memory for the additional size + res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) + raise_if_driver_error(res) + + # Register undo for new physical memory + trans.append(lambda: driver.cuMemRelease(new_handle)) + + # Map the new physical memory to the extended portion (aligned offset) + res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) + raise_if_driver_error(res) + + # Register undo for mapping + trans.append(lambda: driver.cuMemUnmap(int(new_ptr) + aligned_prev_size, aligned_additional_size)) + + # Set access permissions for the entire new range + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(new_ptr, total_aligned_size, descs, len(descs)) + raise_if_driver_error(res) + + # All succeeded, cancel undo actions + trans.commit() # Free the old VA range (aligned previous size) driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) @@ -1246,56 +1248,51 @@ class VirtualMemoryResource(MemoryResource): aligned_size = self._align_up(size, gran) addr_align = config.addr_align or gran - # ---- Create physical memory ---- - res, handle = driver.cuMemCreate(aligned_size, prop, 0) - if res != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"cuMemCreate failed: {res}") - - # ---- Reserve VA space ---- - # Potentially, use a separate size for the VA reservation from the physical allocation size - res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) - if res != driver.CUresult.CUDA_SUCCESS: - # tidy up physical handle on failure - driver.cuMemRelease(handle) - raise Exception(f"cuMemAddressReserve failed: {res}") - - # ---- Map physical memory into VA ---- - res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(ptr, aligned_size) - driver.cuMemRelease(handle) - raise Exception(f"cuMemMap failed: {res}") - - # ---- Set access for owner + peers ---- - descs = [] - - 
# Owner access - owner_flags = VirtualMemoryResourceOptions._access_to_flags(config.self_access) - if owner_flags: - d = driver.CUmemAccessDesc() - d.location.type = prop.location.type - d.location.id = prop.location.id - d.flags = owner_flags - descs.append(d) - - # Peer device access - peer_flags = VirtualMemoryResourceOptions._access_to_flags(config.peer_access) - for peer_dev in config.peers: - if peer_flags: + # ---- Transactional allocation ---- + with Transaction() as trans: + # ---- Create physical memory ---- + res, handle = driver.cuMemCreate(aligned_size, prop, 0) + raise_if_driver_error(res) + # Register undo for physical memory + trans.append(lambda: driver.cuMemRelease(handle)) + + # ---- Reserve VA space ---- + # Potentially, use a separate size for the VA reservation from the physical allocation size + res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) + raise_if_driver_error(res) + # Register undo for VA reservation + trans.append(lambda: driver.cuMemAddressFree(ptr, aligned_size)) + + # ---- Map physical memory into VA ---- + res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) + trans.append(lambda: driver.cuMemUnmap(ptr, aligned_size)) + raise_if_driver_error(res) + + # ---- Set access for owner + peers ---- + descs = [] + + # Owner access + owner_flags = VirtualMemoryResourceOptions._access_to_flags(config.self_access) + if owner_flags: d = driver.CUmemAccessDesc() - d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - d.location.id = int(peer_dev) - d.flags = peer_flags + d.location.type = prop.location.type + d.location.id = prop.location.id + d.flags = owner_flags descs.append(d) - if descs: - res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) - if res != driver.CUresult.CUDA_SUCCESS: - # Try to unwind on failure - driver.cuMemUnmap(ptr, aligned_size) - driver.cuMemAddressFree(ptr, aligned_size) - driver.cuMemRelease(handle) - raise Exception(f"cuMemSetAccess failed: {res}") + # Peer device access + peer_flags = VirtualMemoryResourceOptions._access_to_flags(config.peer_access) + for peer_dev in config.peers: + if peer_flags: + d = driver.CUmemAccessDesc() + d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + d.location.id = int(peer_dev) + d.flags = peer_flags + descs.append(d) + + if descs: + res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) + trans.commit() # Done — return a Buffer that tracks this VA range buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self) diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index 86588f733..620b7b95b 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -3,9 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 import functools +from functools import partial import importlib.metadata from collections import namedtuple from collections.abc import Sequence +from contextlib import ExitStack from typing import Callable try: @@ -222,3 +224,34 @@ def get_binding_version(): except importlib.metadata.PackageNotFoundError: major_minor = importlib.metadata.version("cuda-python").split(".")[:2] return tuple(int(v) for v in major_minor) + + +class Transaction: + def __init__(self): + self._stack = ExitStack() + self._entered = False + + def __enter__(self): + self._stack.__enter__() + self._entered = True + return self + + def __exit__(self, exc_type, exc, tb): + # If exit 
callbacks remain, they'll run in LIFO order.
+        return self._stack.__exit__(exc_type, exc, tb)
+
+    def append(self, fn, /, *args, **kwargs):
+        """
+        Register an undo action (runs if the with-block exits without commit()).
+        Values are bound now via partial so late mutations don't bite you.
+        """
+        if not self._entered:
+            raise RuntimeError("Transaction must be entered before append()")
+        self._stack.callback(partial(fn, *args, **kwargs))
+
+    def commit(self):
+        """
+        Disarm all undo actions. After this, exiting the with-block does nothing.
+        """
+        # pop_all() empties this stack so no callbacks are triggered on exit.
+        self._stack.pop_all()

From df6243da95a65dcc3c3e3179c92cf1495370906d Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Fri, 19 Sep 2025 15:43:36 -0700
Subject: [PATCH 14/35] fix re-importation of dataclasses

---
 cuda_core/cuda/core/experimental/_memory.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 8787ce7fe..47cd5cbbd 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -10,7 +10,6 @@ from cuda.core.experimental._utils.cuda_utils cimport (
     check_or_create_options,
 )
 
-from dataclasses import dataclass
 from typing import TypeVar, Union, TYPE_CHECKING
 import abc
 from typing import TypeVar, Union, Optional, Iterable, Literal

From d8b9af8f4d7050074a041438fd93d47099d2a709 Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Fri, 19 Sep 2025 17:11:33 -0700
Subject: [PATCH 15/35] Checkpoint: tests and code updated to address review
 comments; all passing

---
 cuda_core/cuda/core/experimental/_memory.pyx | 33 +++++++++++++------
 cuda_core/tests/test_memory.py               |  4 +--
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 47cd5cbbd..28918c6c8 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -23,7 +23,7 @@ import weakref
 
 from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
 from cuda.core.experimental._stream import Stream, default_stream
-from cuda.core.experimental._utils.cuda_utils import driver, Transaction
+from cuda.core.experimental._utils.cuda_utils import driver, Transaction, get_binding_version
 
 if platform.system() == "Linux":
     import socket
@@ -914,9 +914,6 @@ VirtualMemoryAllocationTypeT = Literal["pinned", "managed"]
 class VirtualMemoryResourceOptions:
     """A configuration object for the VirtualMemoryResource
     Stores configuration information which tells the resource how to use the CUDA VMM APIs
-    """
-    """
-    Configuration for CUDA VMM allocations.
 
     Args:
         handle_type: Export handle type for the physical allocation. 
Use @@ -948,16 +945,20 @@ class VirtualMemoryResourceOptions: f = driver.CUmemAccess_flags _access_flags = {"rw": f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": f.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} flags = _access_flags.get(spec) - if not flags: + if flags is None: raise ValueError(f"Unknown access spec: {spec!r}") return flags @staticmethod def _allocation_type_to_driver(spec: str): f = driver.CUmemAllocationType - _allocation_type = {"pinned": f.CU_MEM_ALLOCATION_TYPE_PINNED, "managed": f.CU_MEM_ALLOCATION_TYPE_MANAGED} + # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not + _allocation_type = {"pinned": f.CU_MEM_ALLOCATION_TYPE_PINNED} + ver_major, ver_minor = get_binding_version() + if ver_major >= 13: + _allocation_type["managed"] = f.CU_MEM_ALLOCATION_TYPE_MANAGED alloc_type = _allocation_type.get(spec) - if not alloc_type: + if alloc_type is None: raise ValueError(f"Unsupported allocation_type: {spec!r}") return alloc_type @@ -966,16 +967,22 @@ class VirtualMemoryResourceOptions: f = driver.CUmemLocationType _location_type = {"device": f.CU_MEM_LOCATION_TYPE_DEVICE, "host": f.CU_MEM_LOCATION_TYPE_HOST, "host_numa": f.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": f.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} loc_type = _location_type.get(spec) - if not loc_type: + if loc_type is None: raise ValueError(f"Unsupported location_type: {spec!r}") return loc_type @staticmethod def _handle_type_to_driver(spec: str): f = driver.CUmemAllocationHandleType - _handle_type = {"posix_fd": f.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "generic": f.CU_MEM_HANDLE_TYPE_GENERIC, "none": f.CU_MEM_HANDLE_TYPE_NONE, "win32": f.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": f.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": f.CU_MEM_HANDLE_TYPE_FABRIC} + _handle_type = { + "none": f.CU_MEM_HANDLE_TYPE_NONE, + "posix_fd": f.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + "win32": f.CU_MEM_HANDLE_TYPE_WIN32, + "win32_kmt": f.CU_MEM_HANDLE_TYPE_WIN32_KMT, + "fabric": f.CU_MEM_HANDLE_TYPE_FABRIC, + } handle_type = _handle_type.get(spec) - if not handle_type: + if handle_type is None: raise ValueError(f"Unsupported handle_type: {spec!r}") return handle_type @@ -984,7 +991,7 @@ class VirtualMemoryResourceOptions: f = driver.CUmemAllocationGranularity_flags _granularity = {"minimum": f.CU_MEM_ALLOC_GRANULARITY_MINIMUM, "recommended": f.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED} granularity = _granularity.get(spec) - if not granularity: + if granularity is None: raise ValueError(f"Unsupported granularity: {spec!r}") return granularity @@ -1038,6 +1045,10 @@ class VirtualMemoryResource(MemoryResource): if config is not None: self.config = config + # No-op if new size is less than or equal to the current size + if new_size <= buf.size: + return buf + # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index b5cf0d60c..8c5910142 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -356,7 +356,7 @@ def test_vmm_allocator_policy_configuration(): location_type="device", granularity="minimum", gpu_direct_rdma=True, - handle_type="posix-fd", + handle_type="posix_fd", peers=(), self_access="rw", peer_access="rw", @@ -380,7 +380,7 @@ def test_vmm_allocator_policy_configuration(): location_type="device", granularity="recommended", gpu_direct_rdma=False, - handle_type="posix-fd", + 
handle_type="posix_fd", peers=(), self_access="r", # Read-only access peer_access="r", From 6fa2e7ded7a1da5f5f263505c65a1ba296b9ab5d Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 19 Sep 2025 17:39:33 -0700 Subject: [PATCH 16/35] docstrings, a couple review comments --- cuda_core/cuda/core/experimental/_memory.pyx | 105 ++++++++++++------ .../core/experimental/_utils/cuda_utils.pyx | 17 +++ 2 files changed, 89 insertions(+), 33 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 28918c6c8..0f4298e4f 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -1028,6 +1028,9 @@ class VirtualMemoryResource(MemoryResource): This implements true growing allocations that preserve the base pointer by extending the virtual address range and mapping additional physical memory. + This function uses transactional allocation: if any step fails, the original buffer is not modified and + all steps the function took are rolled back so a new allocation is not created. + Parameters ---------- buf : Buffer @@ -1040,15 +1043,11 @@ class VirtualMemoryResource(MemoryResource): Returns ------- Buffer - The same buffer with updated size, preserving the original pointer + The same buffer with updated size and properties, preserving the original pointer """ if config is not None: self.config = config - # No-op if new size is less than or equal to the current size - if new_size <= buf.size: - return buf - # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) @@ -1064,6 +1063,14 @@ class VirtualMemoryResource(MemoryResource): # Calculate sizes additional_size = new_size - buf.size + if additional_size <= 0: + # Same size: only update access policy if needed; avoid zero-sized driver calls + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(int(buf.handle), buf.size, descs, len(descs)) + raise_if_driver_error(res) + return buf + aligned_additional_size = self._align_up(additional_size, gran) total_aligned_size = self._align_up(new_size, gran) aligned_prev_size = total_aligned_size - aligned_additional_size @@ -1087,10 +1094,22 @@ class VirtualMemoryResource(MemoryResource): def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, aligned_additional_size: int, new_ptr: int) -> Buffer: """ - Fast path: extend the VA range contiguously. + Fast path for growing a virtual memory allocation when the new region can be + reserved contiguously after the existing buffer. - This preserves the original pointer by mapping new physical memory - to the extended portion of the virtual address range. + This function creates and maps new physical memory for the additional size, + sets access permissions, and updates the buffer size in place (the pointer + remains unchanged). + + Args: + buf (Buffer): The buffer to grow. + new_size (int): The new total size in bytes. + prop (driver.CUmemAllocationProp): Allocation properties for the new memory. + aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. + new_ptr (int): The address of the newly reserved contiguous VA region (should be at the end of the current buffer). + + Returns: + Buffer: The same buffer object with its size updated to `new_size`. 
""" with Transaction() as trans: # Create new physical memory for the additional size @@ -1122,10 +1141,24 @@ class VirtualMemoryResource(MemoryResource): def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, aligned_additional_size: int, total_aligned_size: int, addr_align: int) -> Buffer: """ - Slow path: full remapping when contiguous extension fails. + Slow path for growing a virtual memory allocation when the new region cannot be + reserved contiguously after the existing buffer. + + This function reserves a new, larger virtual address (VA) range, remaps the old + physical memory to the beginning of the new VA range, creates and maps new physical + memory for the additional size, sets access permissions, and updates the buffer's + pointer and size. + + Args: + buf (Buffer): The buffer to grow. + new_size (int): The new total size in bytes. + prop (driver.CUmemAllocationProp): Allocation properties for the new memory. + aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. + total_aligned_size (int): The total new size to reserve, aligned to granularity. + addr_align (int): The required address alignment for the new VA range. - This creates a new VA range and remaps both old and new physical memory. - The buffer's pointer will change. + Returns: + Buffer: The buffer object updated with the new pointer and size. """ with Transaction() as trans: # Reserve a completely new, larger VA range @@ -1231,7 +1264,33 @@ class VirtualMemoryResource(MemoryResource): def allocate(self, size: int, stream: Stream = None) -> Buffer: """ - Allocate memory using CUDA VMM with a configurable policy. + Allocate a buffer of the given size using CUDA virtual memory. + + Parameters + ---------- + size : int + The size in bytes of the buffer to allocate. + stream : Stream, optional + CUDA stream to associate with the allocation (not currently supported). + + Returns + ------- + Buffer + A Buffer object representing the allocated virtual memory. + + Raises + ------ + NotImplementedError + If a stream is provided or if the location type is not device memory. + CUDAError + If any CUDA driver API call fails during allocation. + + Notes + ----- + This method uses transactional allocation: if any step fails, all resources + allocated so far are automatically cleaned up. The allocation is performed + with the configured granularity, access permissions, and peer access as + specified in the resource's configuration. 
""" if stream is not None: raise NotImplementedError("Stream is not supported with VirtualMemoryResource") @@ -1279,27 +1338,7 @@ class VirtualMemoryResource(MemoryResource): raise_if_driver_error(res) # ---- Set access for owner + peers ---- - descs = [] - - # Owner access - owner_flags = VirtualMemoryResourceOptions._access_to_flags(config.self_access) - if owner_flags: - d = driver.CUmemAccessDesc() - d.location.type = prop.location.type - d.location.id = prop.location.id - d.flags = owner_flags - descs.append(d) - - # Peer device access - peer_flags = VirtualMemoryResourceOptions._access_to_flags(config.peer_access) - for peer_dev in config.peers: - if peer_flags: - d = driver.CUmemAccessDesc() - d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - d.location.id = int(peer_dev) - d.flags = peer_flags - descs.append(d) - + descs = self._build_access_descriptors(prop) if descs: res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) trans.commit() diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index 620b7b95b..dedb8ac53 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -227,6 +227,23 @@ def get_binding_version(): class Transaction: + """ + A context manager for transactional operations with undo capability. + + The Transaction class allows you to register undo actions (callbacks) that will be executed + if the transaction is not committed before exiting the context. This is useful for managing + resources or operations that need to be rolled back in case of errors or early exits. + + Usage: + with Transaction() as txn: + txn.append(some_cleanup_function, arg1, arg2) + # ... perform operations ... + txn.commit() # Disarm undo actions; nothing will be rolled back on exit + + Methods: + append(fn, *args, **kwargs): Register an undo action to be called on rollback. + commit(): Disarm all undo actions; nothing will be rolled back on exit. 
+ """ def __init__(self): self._stack = ExitStack() self._entered = False From 4dd65f4e3adfd57d6cdc8bc8e541a423806377a6 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 19 Sep 2025 17:48:18 -0700 Subject: [PATCH 17/35] address a review comment --- cuda_core/tests/test_memory.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 8c5910142..553950f83 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -417,9 +417,9 @@ def test_vmm_allocator_grow_allocation(): # Verify growth assert grown_buffer.size >= 4 * 1024 * 1024 assert grown_buffer.size > original_size - - # The pointer should ideally be preserved (fast path) - # but may change if contiguous extension fails (slow path) + # Because of the slow path, the pointer may change + # We cannot assert that the new pointer is the same, + # but we can assert that a new pointer was assigned assert grown_buffer.handle is not None # Test growing to same size (should return original buffer) From 1b86916d461a0cc5ed56a268e8275d8705be0ba6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 20 Sep 2025 00:48:55 +0000 Subject: [PATCH 18/35] [pre-commit.ci] auto code formatting --- cuda_core/tests/test_memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 553950f83..303fd4dff 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -418,7 +418,7 @@ def test_vmm_allocator_grow_allocation(): assert grown_buffer.size >= 4 * 1024 * 1024 assert grown_buffer.size > original_size # Because of the slow path, the pointer may change - # We cannot assert that the new pointer is the same, + # We cannot assert that the new pointer is the same, # but we can assert that a new pointer was assigned assert grown_buffer.handle is not None From ebba14378cff5db4651c7498799bb09b127f4e35 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Mon, 29 Sep 2025 22:17:45 -0700 Subject: [PATCH 19/35] First pass on Keith's comments --- cuda_core/cuda/core/experimental/_memory.pyx | 66 +++++++++----------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index d03e48a80..67815874a 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -945,12 +945,20 @@ class VirtualMemoryResourceOptions: peers: Iterable[int] = field(default_factory=tuple) self_access: VirtualMemoryAccessTypeT = "rw" peer_access: VirtualMemoryAccessTypeT = "rw" + _access_flags = {"rw": f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": f.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} + _handle_types = {"none": f.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": f.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": f.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": f.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": f.CU_MEM_HANDLE_TYPE_FABRIC} + _granularity = {"recommended": f.CU_MEM_ALLOCATION_GRANULARITY_RECOMMENDED, "minimum": f.CU_MEM_ALLOCATION_GRANULARITY_MINIMUM} + _location_type = {"device": f.CU_MEM_LOCATION_TYPE_DEVICE, "host": f.CU_MEM_LOCATION_TYPE_HOST, "host_numa": f.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": f.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} + # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not + _allocation_type = {"pinned": 
f.CU_MEM_ALLOCATION_TYPE_PINNED} + ver_major, ver_minor = get_binding_version() + if ver_major >= 13: + _allocation_type["managed"] = f.CU_MEM_ALLOCATION_TYPE_MANAGED @staticmethod def _access_to_flags(spec: str): f = driver.CUmemAccess_flags - _access_flags = {"rw": f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": f.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} - flags = _access_flags.get(spec) + flags = VirtualMemoryResourceOptions._access_flags.get(spec) if flags is None: raise ValueError(f"Unknown access spec: {spec!r}") return flags @@ -958,12 +966,7 @@ class VirtualMemoryResourceOptions: @staticmethod def _allocation_type_to_driver(spec: str): f = driver.CUmemAllocationType - # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not - _allocation_type = {"pinned": f.CU_MEM_ALLOCATION_TYPE_PINNED} - ver_major, ver_minor = get_binding_version() - if ver_major >= 13: - _allocation_type["managed"] = f.CU_MEM_ALLOCATION_TYPE_MANAGED - alloc_type = _allocation_type.get(spec) + alloc_type = VirtualMemoryResourceOptions._allocation_type.get(spec) if alloc_type is None: raise ValueError(f"Unsupported allocation_type: {spec!r}") return alloc_type @@ -971,8 +974,7 @@ class VirtualMemoryResourceOptions: @staticmethod def _location_type_to_driver(spec: str): f = driver.CUmemLocationType - _location_type = {"device": f.CU_MEM_LOCATION_TYPE_DEVICE, "host": f.CU_MEM_LOCATION_TYPE_HOST, "host_numa": f.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": f.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} - loc_type = _location_type.get(spec) + loc_type = VirtualMemoryResourceOptions._location_type.get(spec) if loc_type is None: raise ValueError(f"Unsupported location_type: {spec!r}") return loc_type @@ -980,14 +982,7 @@ class VirtualMemoryResourceOptions: @staticmethod def _handle_type_to_driver(spec: str): f = driver.CUmemAllocationHandleType - _handle_type = { - "none": f.CU_MEM_HANDLE_TYPE_NONE, - "posix_fd": f.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, - "win32": f.CU_MEM_HANDLE_TYPE_WIN32, - "win32_kmt": f.CU_MEM_HANDLE_TYPE_WIN32_KMT, - "fabric": f.CU_MEM_HANDLE_TYPE_FABRIC, - } - handle_type = _handle_type.get(spec) + handle_type = VirtualMemoryResourceOptions._handle_types.get(spec) if handle_type is None: raise ValueError(f"Unsupported handle_type: {spec!r}") return handle_type @@ -995,8 +990,7 @@ class VirtualMemoryResourceOptions: @staticmethod def _granularity_to_driver(spec: str): f = driver.CUmemAllocationGranularity_flags - _granularity = {"minimum": f.CU_MEM_ALLOC_GRANULARITY_MINIMUM, "recommended": f.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED} - granularity = _granularity.get(spec) + granularity = VirtualMemoryResourceOptions._granularity.get(spec) if granularity is None: raise ValueError(f"Unsupported granularity: {spec!r}") return granularity @@ -1008,9 +1002,7 @@ class VirtualMemoryResource(MemoryResource): Parameters ---------- device_id : int - Device ordinal for which a memory resource is constructed. The mempool that is - set to *current* on ``device_id`` is used. If no mempool is set to current yet, - the driver would use the *default* mempool on the device. + Device ordinal for which a memory resource is constructed. config : VirtualMemoryResourceOptions A configuration object for the VirtualMemoryResource @@ -1021,6 +1013,7 @@ class VirtualMemoryResource(MemoryResource): VirtualMemoryResourceOptions, config, "VirtualMemoryResource options", keep_none=False ) + @staticmethod def _align_up(self, size: int, gran: int) -> int: """ Align a size up to the nearest multiple of a granularity. 
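
Before the next hunk, it is worth pinning down the arithmetic that `_align_up` and the grow paths build on. The following is a CUDA-free sketch; the 2 MiB value is a representative stand-in for whatever cuMemGetAllocationGranularity actually reports:

def align_up(size: int, gran: int) -> int:
    # Round size up to the next multiple of gran; CUDA granularities are powers of two.
    return ((size + gran - 1) // gran) * gran

GRAN = 2 * 1024 * 1024  # stand-in for the queried allocation granularity

# Growing a 2 MiB buffer to 5 MiB, mirroring the names used in the grow paths:
old_size, new_size = 2 * 1024 * 1024, 5 * 1024 * 1024
aligned_additional_size = align_up(new_size - old_size, GRAN)  # 4 MiB handed to cuMemCreate
total_aligned_size = align_up(new_size, GRAN)                  # 6 MiB VA reservation (slow path)
aligned_prev_size = total_aligned_size - aligned_additional_size  # 2 MiB of existing mapping

assert (aligned_additional_size, total_aligned_size, aligned_prev_size) == (
    4 * 1024 * 1024,
    6 * 1024 * 1024,
    2 * 1024 * 1024,
)
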
@@ -1077,8 +1070,8 @@ class VirtualMemoryResource(MemoryResource): raise_if_driver_error(res) return buf - aligned_additional_size = self._align_up(additional_size, gran) - total_aligned_size = self._align_up(new_size, gran) + aligned_additional_size = VirtualMemoryResource._align_up(additional_size, gran) + total_aligned_size = VirtualMemoryResource._align_up(new_size, gran) aligned_prev_size = total_aligned_size - aligned_additional_size addr_align = self.config.addr_align or gran @@ -1223,7 +1216,8 @@ class VirtualMemoryResource(MemoryResource): trans.commit() # Free the old VA range (aligned previous size) - driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + result, = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + raise_if_driver_error(result) # Invalidate the old buffer so its destructor won't try to free again buf._ptr = 0 @@ -1257,8 +1251,8 @@ class VirtualMemoryResource(MemoryResource): # Peer device access peer_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.peer_access) - for peer_dev in self.config.peers: - if peer_flags: + if peer_flags: + for peer_dev in self.config.peers: d = driver.CUmemAccessDesc() d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE d.location.id = int(peer_dev) @@ -1305,8 +1299,8 @@ class VirtualMemoryResource(MemoryResource): # ---- Build allocation properties ---- prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) - # TODO: Support host alloation if required - if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: + + if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) @@ -1320,7 +1314,7 @@ class VirtualMemoryResource(MemoryResource): res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) raise_if_driver_error(res) - aligned_size = self._align_up(size, gran) + aligned_size = VirtualMemoryResource._align_up(size, gran) addr_align = config.addr_align or gran # ---- Transactional allocation ---- @@ -1347,6 +1341,8 @@ class VirtualMemoryResource(MemoryResource): descs = self._build_access_descriptors(prop) if descs: res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) + raise_if_driver_error(res) + trans.commit() # Done — return a Buffer that tracks this VA range @@ -1371,9 +1367,6 @@ class VirtualMemoryResource(MemoryResource): def is_device_accessible(self) -> bool: """ Indicates whether the allocated memory is accessible from the device. - - Returns: - bool: Always True for NVSHMEM memory. """ return True @@ -1381,9 +1374,6 @@ class VirtualMemoryResource(MemoryResource): def is_host_accessible(self) -> bool: """ Indicates whether the allocated memory is accessible from the host. - - Returns: - bool: Always False for NVSHMEM memory. """ return False @@ -1399,7 +1389,7 @@ class VirtualMemoryResource(MemoryResource): def __repr__(self) -> str: """ - Return a string representation of the NvshmemResource. + Return a string representation of the VirtualMemoryResource. 
Returns: str: A string describing the object From e0c863d3a54d3933a82c530fc2811fa1e8141b24 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Mon, 29 Sep 2025 22:30:36 -0700 Subject: [PATCH 20/35] Second pass on Keith's comments --- cuda_core/cuda/core/experimental/_memory.pyx | 38 ++++++++++--------- .../core/experimental/_utils/cuda_utils.pyx | 8 ++++ 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 67815874a..9e66c16c5 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -1012,6 +1012,8 @@ class VirtualMemoryResource(MemoryResource): self.config = check_or_create_options( VirtualMemoryResourceOptions, config, "VirtualMemoryResource options", keep_none=False ) + if self.config.location_type == "host": + self.device = None @staticmethod def _align_up(self, size: int, gran: int) -> int: @@ -1112,17 +1114,17 @@ class VirtualMemoryResource(MemoryResource): """ with Transaction() as trans: # Create new physical memory for the additional size - trans.append(lambda: driver.cuMemAddressFree(new_ptr, aligned_additional_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemAddressFree(new_ptr, aligned_additional_size))) res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for creation - trans.append(lambda: driver.cuMemRelease(new_handle)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(new_handle))) # Map the new physical memory to the extended VA range res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda: driver.cuMemUnmap(new_ptr, aligned_additional_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(new_ptr, aligned_additional_size))) # Set access permissions for the new portion descs = self._build_access_descriptors(prop) @@ -1164,13 +1166,13 @@ class VirtualMemoryResource(MemoryResource): res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(lambda: driver.cuMemAddressFree(new_ptr, total_aligned_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemAddressFree(new_ptr, total_aligned_size))) # Get the old allocation handle for remapping result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) raise_if_driver_error(result) # Register undo for old_handle - trans.append(lambda: driver.cuMemRelease(old_handle)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(old_handle))) # Unmap the old VA range (aligned previous size) aligned_prev_size = total_aligned_size - aligned_additional_size @@ -1190,21 +1192,21 @@ class VirtualMemoryResource(MemoryResource): raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda: driver.cuMemUnmap(new_ptr, aligned_prev_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(new_ptr, aligned_prev_size))) # Create new physical memory for the additional size res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for new physical memory - trans.append(lambda: driver.cuMemRelease(new_handle)) + trans.append(lambda: 
wrap_driver_function_with_error_handling(driver.cuMemRelease(new_handle))) # Map the new physical memory to the extended portion (aligned offset) res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda: driver.cuMemUnmap(int(new_ptr) + aligned_prev_size, aligned_additional_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(int(new_ptr) + aligned_prev_size, aligned_additional_size))) # Set access permissions for the entire new range descs = self._build_access_descriptors(prop) @@ -1216,7 +1218,7 @@ class VirtualMemoryResource(MemoryResource): trans.commit() # Free the old VA range (aligned previous size) - result, = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + result, = wrap_driver_function_with_error_handling(driver.cuMemAddressFree(int(buf.handle), aligned_prev_size)) raise_if_driver_error(result) # Invalidate the old buffer so its destructor won't try to free again @@ -1299,12 +1301,12 @@ class VirtualMemoryResource(MemoryResource): # ---- Build allocation properties ---- prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) - + if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) - prop.location.id = self.device.device_id + prop.location.id = self.device.device_id if config.location_type == "device" else -1 prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(config.handle_type) @@ -1323,18 +1325,18 @@ class VirtualMemoryResource(MemoryResource): res, handle = driver.cuMemCreate(aligned_size, prop, 0) raise_if_driver_error(res) # Register undo for physical memory - trans.append(lambda: driver.cuMemRelease(handle)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(handle))) # ---- Reserve VA space ---- # Potentially, use a separate size for the VA reservation from the physical allocation size res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(lambda: driver.cuMemAddressFree(ptr, aligned_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemAddressFree(ptr, aligned_size))) # ---- Map physical memory into VA ---- res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) - trans.append(lambda: driver.cuMemUnmap(ptr, aligned_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(ptr, aligned_size))) raise_if_driver_error(res) # ---- Set access for owner + peers ---- @@ -1368,14 +1370,14 @@ class VirtualMemoryResource(MemoryResource): """ Indicates whether the allocated memory is accessible from the device. """ - return True + return self.config.location_type == "device" @property def is_host_accessible(self) -> bool: """ Indicates whether the allocated memory is accessible from the host. """ - return False + return self.config.location_type == "host" @property def device_id(self) -> int: @@ -1383,9 +1385,9 @@ class VirtualMemoryResource(MemoryResource): Get the device ID associated with this memory resource. 
Returns: - int: CUDA device ID. + int: CUDA device ID. -1 if the memory resource allocates host memory """ - return self.device.device_id + return self.device.device_id if self.config.location_type == "device" else -1 def __repr__(self) -> str: """ diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index dedb8ac53..f4f33b048 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -225,6 +225,14 @@ def get_binding_version(): major_minor = importlib.metadata.version("cuda-python").split(".")[:2] return tuple(int(v) for v in major_minor) +def wrap_driver_function_with_error_handling(func): + """ + A wrapper that handles driver errors and raises a CUDAError. + """ + def wrapper(*args, **kwargs): + res, = func(*args, **kwargs) + _check_driver_error(res) + return wrapper class Transaction: """ From 452160cf958702c836e6057a1cbe664a6e883709 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Mon, 29 Sep 2025 22:31:14 -0700 Subject: [PATCH 21/35] Added helper function for error handling --- cuda_core/cuda/core/experimental/_memory.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 9e66c16c5..27dd90a0c 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -7,7 +7,7 @@ from __future__ import annotations from libc.stdint cimport uintptr_t from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, - check_or_create_options, + check_or_create_options, wrap_driver_function_with_error_handling, ) import sys From 63bce2a06eb3f73dbfedaa829069f95d8ccb1a42 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 05:33:13 +0000 Subject: [PATCH 22/35] [pre-commit.ci] auto code formatting --- cuda_core/tests/test_memory.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 303fd4dff..e375a32d3 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -9,7 +9,6 @@ import platform import pytest - from cuda.core.experimental import ( Buffer, Device, From b630314923550bbd3b31a671467c7e9308e4fc7f Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Mon, 29 Sep 2025 22:37:16 -0700 Subject: [PATCH 23/35] need to import, not cimport --- cuda_core/cuda/core/experimental/_memory.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 27dd90a0c..04577a718 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -7,7 +7,7 @@ from __future__ import annotations from libc.stdint cimport uintptr_t from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, - check_or_create_options, wrap_driver_function_with_error_handling, + check_or_create_options, ) import sys @@ -24,7 +24,7 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream -from cuda.core.experimental._utils.cuda_utils import driver, Transaction, get_binding_version +from cuda.core.experimental._utils.cuda_utils import driver, 
Transaction, get_binding_version, wrap_driver_function_with_error_handling if platform.system() == "Linux": import socket From b2b824cebda1fc00d8bf537d0481ab642053674d Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Mon, 29 Sep 2025 22:38:45 -0700 Subject: [PATCH 24/35] need to import, not cimport --- cuda_core/cuda/core/experimental/_memory.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 04577a718..a12fd79c0 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -24,7 +24,9 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream -from cuda.core.experimental._utils.cuda_utils import driver, Transaction, get_binding_version, wrap_driver_function_with_error_handling +from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, + wrap_driver_function_with_error_handling, + ) if platform.system() == "Linux": import socket From efdae393688522f5aa5d6bef0dbcbfbea94f7483 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Mon, 29 Sep 2025 22:50:32 -0700 Subject: [PATCH 25/35] build and test fixes --- cuda_core/cuda/core/experimental/_memory.pyx | 49 ++++++++++---------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index a12fd79c0..624b6c40c 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -947,19 +947,23 @@ class VirtualMemoryResourceOptions: peers: Iterable[int] = field(default_factory=tuple) self_access: VirtualMemoryAccessTypeT = "rw" peer_access: VirtualMemoryAccessTypeT = "rw" - _access_flags = {"rw": f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": f.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} - _handle_types = {"none": f.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": f.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": f.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": f.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": f.CU_MEM_HANDLE_TYPE_FABRIC} - _granularity = {"recommended": f.CU_MEM_ALLOCATION_GRANULARITY_RECOMMENDED, "minimum": f.CU_MEM_ALLOCATION_GRANULARITY_MINIMUM} - _location_type = {"device": f.CU_MEM_LOCATION_TYPE_DEVICE, "host": f.CU_MEM_LOCATION_TYPE_HOST, "host_numa": f.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": f.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} + a = driver.CUmemAccess_flags + _access_flags = {"rw": a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": a.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} + h = driver.CUmemAllocationHandleType + _handle_types = {"none": h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": h.CU_MEM_HANDLE_TYPE_FABRIC} + g = driver.CUmemAllocationGranularity_flags + _granularity = {"recommended": g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": g.CU_MEM_ALLOC_GRANULARITY_MINIMUM} + l = driver.CUmemLocationType + _location_type = {"device": l.CU_MEM_LOCATION_TYPE_DEVICE, "host": l.CU_MEM_LOCATION_TYPE_HOST, "host_numa": l.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not - _allocation_type = {"pinned": f.CU_MEM_ALLOCATION_TYPE_PINNED} + a = 
driver.CUmemAllocationType + _allocation_type = {"pinned": a.CU_MEM_ALLOCATION_TYPE_PINNED} ver_major, ver_minor = get_binding_version() if ver_major >= 13: - _allocation_type["managed"] = f.CU_MEM_ALLOCATION_TYPE_MANAGED + _allocation_type["managed"] = a.CU_MEM_ALLOCATION_TYPE_MANAGED @staticmethod def _access_to_flags(spec: str): - f = driver.CUmemAccess_flags flags = VirtualMemoryResourceOptions._access_flags.get(spec) if flags is None: raise ValueError(f"Unknown access spec: {spec!r}") @@ -967,7 +971,6 @@ class VirtualMemoryResourceOptions: @staticmethod def _allocation_type_to_driver(spec: str): - f = driver.CUmemAllocationType alloc_type = VirtualMemoryResourceOptions._allocation_type.get(spec) if alloc_type is None: raise ValueError(f"Unsupported allocation_type: {spec!r}") @@ -975,7 +978,6 @@ class VirtualMemoryResourceOptions: @staticmethod def _location_type_to_driver(spec: str): - f = driver.CUmemLocationType loc_type = VirtualMemoryResourceOptions._location_type.get(spec) if loc_type is None: raise ValueError(f"Unsupported location_type: {spec!r}") @@ -983,7 +985,6 @@ class VirtualMemoryResourceOptions: @staticmethod def _handle_type_to_driver(spec: str): - f = driver.CUmemAllocationHandleType handle_type = VirtualMemoryResourceOptions._handle_types.get(spec) if handle_type is None: raise ValueError(f"Unsupported handle_type: {spec!r}") @@ -991,7 +992,6 @@ class VirtualMemoryResourceOptions: @staticmethod def _granularity_to_driver(spec: str): - f = driver.CUmemAllocationGranularity_flags granularity = VirtualMemoryResourceOptions._granularity.get(spec) if granularity is None: raise ValueError(f"Unsupported granularity: {spec!r}") @@ -1018,7 +1018,7 @@ class VirtualMemoryResource(MemoryResource): self.device = None @staticmethod - def _align_up(self, size: int, gran: int) -> int: + def _align_up(size: int, gran: int) -> int: """ Align a size up to the nearest multiple of a granularity. 
""" @@ -1116,17 +1116,17 @@ class VirtualMemoryResource(MemoryResource): """ with Transaction() as trans: # Create new physical memory for the additional size - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemAddressFree(new_ptr, aligned_additional_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemAddressFree), new_ptr, aligned_additional_size) res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for creation - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(new_handle))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), new_handle) # Map the new physical memory to the extended VA range res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(new_ptr, aligned_additional_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), new_ptr, aligned_additional_size) # Set access permissions for the new portion descs = self._build_access_descriptors(prop) @@ -1168,13 +1168,13 @@ class VirtualMemoryResource(MemoryResource): res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemAddressFree(new_ptr, total_aligned_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemAddressFree), new_ptr, total_aligned_size) # Get the old allocation handle for remapping result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) raise_if_driver_error(result) # Register undo for old_handle - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(old_handle))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), old_handle) # Unmap the old VA range (aligned previous size) aligned_prev_size = total_aligned_size - aligned_additional_size @@ -1194,21 +1194,21 @@ class VirtualMemoryResource(MemoryResource): raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(new_ptr, aligned_prev_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), new_ptr, aligned_prev_size) # Create new physical memory for the additional size res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for new physical memory - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(new_handle))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), new_handle) # Map the new physical memory to the extended portion (aligned offset) res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(int(new_ptr) + aligned_prev_size, aligned_additional_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), int(new_ptr) + aligned_prev_size, aligned_additional_size) # Set access permissions for the entire new range descs = self._build_access_descriptors(prop) @@ -1220,8 +1220,7 @@ class VirtualMemoryResource(MemoryResource): trans.commit() # Free the old VA 
range (aligned previous size) - result, = wrap_driver_function_with_error_handling(driver.cuMemAddressFree(int(buf.handle), aligned_prev_size)) - raise_if_driver_error(result) + wrap_driver_function_with_error_handling(driver.cuMemAddressFree)(int(buf.handle), aligned_prev_size) # Invalidate the old buffer so its destructor won't try to free again buf._ptr = 0 @@ -1327,18 +1326,18 @@ class VirtualMemoryResource(MemoryResource): res, handle = driver.cuMemCreate(aligned_size, prop, 0) raise_if_driver_error(res) # Register undo for physical memory - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(handle))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), handle) # ---- Reserve VA space ---- # Potentially, use a separate size for the VA reservation from the physical allocation size res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemAddressFree(ptr, aligned_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemAddressFree), ptr, aligned_size) # ---- Map physical memory into VA ---- res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(ptr, aligned_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), ptr, aligned_size) raise_if_driver_error(res) # ---- Set access for owner + peers ---- From 5283e5627db541fb491a1f5f3202635020cea604 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 05:59:45 +0000 Subject: [PATCH 26/35] [pre-commit.ci] auto code formatting --- cuda_core/cuda/core/experimental/_memory.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 624b6c40c..52968ca53 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -24,7 +24,7 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream -from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, +from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, wrap_driver_function_with_error_handling, ) From 49fd8dafdc078bee74321080f9e41299ac8136bd Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Tue, 30 Sep 2025 13:24:56 -0700 Subject: [PATCH 27/35] address next round of comments --- cuda_core/cuda/core/experimental/_memory.pyx | 34 ++++++++++++------- .../core/experimental/_utils/cuda_utils.pyx | 8 ----- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 624b6c40c..436f04cb0 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -25,11 +25,15 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, - wrap_driver_function_with_error_handling, + handle_return, ) if platform.system() == "Linux": import 
socket +# Helper to invoke CUDA driver functions with standard error handling. +def _driver_call(func, /, *args): + handle_return(func(*args)) + if TYPE_CHECKING: import cuda.bindings.driver @@ -1088,6 +1092,10 @@ class VirtualMemoryResource(MemoryResource): ) if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): + # Check for specific errors that are not recoverable with the slow path + if res in (driver.CUresult.CUDA_ERROR_INVALID_VALUE, driver.CUresult.CUDA_ERROR_NOT_PERMITTED, driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, driver.CUresult.CUDA_ERROR_NOT_SUPPORTED): + raise RuntimeError(f"Failed to extend VA range: {res}") + _driver_call(driver.cuMemAddressFree, new_ptr, aligned_additional_size) # Fallback: couldn't extend contiguously, need full remapping return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) else: @@ -1116,17 +1124,17 @@ class VirtualMemoryResource(MemoryResource): """ with Transaction() as trans: # Create new physical memory for the additional size - trans.append(wrap_driver_function_with_error_handling(driver.cuMemAddressFree), new_ptr, aligned_additional_size) + trans.append(_driver_call, driver.cuMemAddressFree, new_ptr, aligned_additional_size) res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for creation - trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), new_handle) + trans.append(_driver_call, driver.cuMemRelease, new_handle) # Map the new physical memory to the extended VA range res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), new_ptr, aligned_additional_size) + trans.append(_driver_call, driver.cuMemUnmap, new_ptr, aligned_additional_size) # Set access permissions for the new portion descs = self._build_access_descriptors(prop) @@ -1168,13 +1176,13 @@ class VirtualMemoryResource(MemoryResource): res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(wrap_driver_function_with_error_handling(driver.cuMemAddressFree), new_ptr, total_aligned_size) + trans.append(_driver_call, driver.cuMemAddressFree, new_ptr, total_aligned_size) # Get the old allocation handle for remapping result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) raise_if_driver_error(result) # Register undo for old_handle - trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), old_handle) + trans.append(_driver_call, driver.cuMemRelease, old_handle) # Unmap the old VA range (aligned previous size) aligned_prev_size = total_aligned_size - aligned_additional_size @@ -1194,21 +1202,21 @@ class VirtualMemoryResource(MemoryResource): raise_if_driver_error(res) # Register undo for mapping - trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), new_ptr, aligned_prev_size) + trans.append(_driver_call, driver.cuMemUnmap, new_ptr, aligned_prev_size) # Create new physical memory for the additional size res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for new physical memory - trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), new_handle) + trans.append(_driver_call, driver.cuMemRelease, new_handle) # Map the new physical 
memory to the extended portion (aligned offset) res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), int(new_ptr) + aligned_prev_size, aligned_additional_size) + trans.append(_driver_call, driver.cuMemUnmap, int(new_ptr) + aligned_prev_size, aligned_additional_size) # Set access permissions for the entire new range descs = self._build_access_descriptors(prop) @@ -1220,7 +1228,7 @@ class VirtualMemoryResource(MemoryResource): trans.commit() # Free the old VA range (aligned previous size) - wrap_driver_function_with_error_handling(driver.cuMemAddressFree)(int(buf.handle), aligned_prev_size) + handle_return(driver.cuMemAddressFree(int(buf.handle), aligned_prev_size)) # Invalidate the old buffer so its destructor won't try to free again buf._ptr = 0 @@ -1326,18 +1334,18 @@ class VirtualMemoryResource(MemoryResource): res, handle = driver.cuMemCreate(aligned_size, prop, 0) raise_if_driver_error(res) # Register undo for physical memory - trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), handle) + trans.append(_driver_call, driver.cuMemRelease, handle) # ---- Reserve VA space ---- # Potentially, use a separate size for the VA reservation from the physical allocation size res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(wrap_driver_function_with_error_handling(driver.cuMemAddressFree), ptr, aligned_size) + trans.append(_driver_call, driver.cuMemAddressFree, ptr, aligned_size) # ---- Map physical memory into VA ---- res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) - trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), ptr, aligned_size) + trans.append(_driver_call, driver.cuMemUnmap, ptr, aligned_size) raise_if_driver_error(res) # ---- Set access for owner + peers ---- diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index f4f33b048..dedb8ac53 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -225,14 +225,6 @@ def get_binding_version(): major_minor = importlib.metadata.version("cuda-python").split(".")[:2] return tuple(int(v) for v in major_minor) -def wrap_driver_function_with_error_handling(func): - """ - A wrapper that handles driver errors and raises a CUDAError. 
- """ - def wrapper(*args, **kwargs): - res, = func(*args, **kwargs) - _check_driver_error(res) - return wrapper class Transaction: """ From e09bda7e64fcec09e65095132e5673104cddef28 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 20:32:47 +0000 Subject: [PATCH 28/35] [pre-commit.ci] auto code formatting --- cuda_core/cuda/core/experimental/_memory.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 436f04cb0..3a1fb9d5b 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -24,7 +24,7 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream -from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, +from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, handle_return, ) From a086c3c0c79f9c13c10b63419b5f97479512ddad Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Wed, 1 Oct 2025 17:32:40 -0700 Subject: [PATCH 29/35] Next round of review comments --- cuda_core/cuda/core/experimental/_memory.pyx | 36 +++++++++---------- .../core/experimental/_utils/cuda_utils.pyx | 1 + 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 436f04cb0..b453e8b26 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -24,15 +24,11 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream -from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, - handle_return, - ) +from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version ) if platform.system() == "Linux": import socket -# Helper to invoke CUDA driver functions with standard error handling. -def _driver_call(func, /, *args): - handle_return(func(*args)) +# (Removed) helper for driver calls; use raise_if_driver_error with direct calls instead. 
if TYPE_CHECKING: @@ -1095,7 +1091,8 @@ class VirtualMemoryResource(MemoryResource): # Check for specific errors that are not recoverable with the slow path if res in (driver.CUresult.CUDA_ERROR_INVALID_VALUE, driver.CUresult.CUDA_ERROR_NOT_PERMITTED, driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, driver.CUresult.CUDA_ERROR_NOT_SUPPORTED): raise RuntimeError(f"Failed to extend VA range: {res}") - _driver_call(driver.cuMemAddressFree, new_ptr, aligned_additional_size) + res2, = driver.cuMemAddressFree(new_ptr, aligned_additional_size) + raise_if_driver_error(res2) # Fallback: couldn't extend contiguously, need full remapping return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) else: @@ -1124,17 +1121,17 @@ class VirtualMemoryResource(MemoryResource): """ with Transaction() as trans: # Create new physical memory for the additional size - trans.append(_driver_call, driver.cuMemAddressFree, new_ptr, aligned_additional_size) + trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for creation - trans.append(_driver_call, driver.cuMemRelease, new_handle) + trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) # Map the new physical memory to the extended VA range res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(_driver_call, driver.cuMemUnmap, new_ptr, aligned_additional_size) + trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) # Set access permissions for the new portion descs = self._build_access_descriptors(prop) @@ -1176,13 +1173,13 @@ class VirtualMemoryResource(MemoryResource): res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(_driver_call, driver.cuMemAddressFree, new_ptr, total_aligned_size) + trans.append(lambda np=new_ptr, s=total_aligned_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) # Get the old allocation handle for remapping result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) raise_if_driver_error(result) # Register undo for old_handle - trans.append(_driver_call, driver.cuMemRelease, old_handle) + trans.append(lambda h=old_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) # Unmap the old VA range (aligned previous size) aligned_prev_size = total_aligned_size - aligned_additional_size @@ -1202,21 +1199,21 @@ class VirtualMemoryResource(MemoryResource): raise_if_driver_error(res) # Register undo for mapping - trans.append(_driver_call, driver.cuMemUnmap, new_ptr, aligned_prev_size) + trans.append(lambda np=new_ptr, s=aligned_prev_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) # Create new physical memory for the additional size res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for new physical memory - trans.append(_driver_call, driver.cuMemRelease, new_handle) + trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) # Map the new physical memory to the extended portion (aligned offset) res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # 
Register undo for mapping - trans.append(_driver_call, driver.cuMemUnmap, int(new_ptr) + aligned_prev_size, aligned_additional_size) + trans.append(lambda base=int(new_ptr), offs=aligned_prev_size, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(base + offs, s)[0])) # Set access permissions for the entire new range descs = self._build_access_descriptors(prop) @@ -1228,7 +1225,8 @@ class VirtualMemoryResource(MemoryResource): trans.commit() # Free the old VA range (aligned previous size) - handle_return(driver.cuMemAddressFree(int(buf.handle), aligned_prev_size)) + res2, = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + raise_if_driver_error(res2) # Invalidate the old buffer so its destructor won't try to free again buf._ptr = 0 @@ -1334,18 +1332,18 @@ class VirtualMemoryResource(MemoryResource): res, handle = driver.cuMemCreate(aligned_size, prop, 0) raise_if_driver_error(res) # Register undo for physical memory - trans.append(_driver_call, driver.cuMemRelease, handle) + trans.append(lambda h=handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) # ---- Reserve VA space ---- # Potentially, use a separate size for the VA reservation from the physical allocation size res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(_driver_call, driver.cuMemAddressFree, ptr, aligned_size) + trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemAddressFree(p, s)[0])) # ---- Map physical memory into VA ---- res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) - trans.append(_driver_call, driver.cuMemUnmap, ptr, aligned_size) + trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemUnmap(p, s)[0])) raise_if_driver_error(res) # ---- Set access for owner + peers ---- diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index dedb8ac53..83eef2f33 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -255,6 +255,7 @@ class Transaction: def __exit__(self, exc_type, exc, tb): # If exit callbacks remain, they'll run in LIFO order. + self._entered = False return self._stack.__exit__(exc_type, exc, tb) def append(self, fn, /, *args, **kwargs): From e90b9b0999e069376752a80951a939241f73864f Mon Sep 17 00:00:00 2001 From: Ben Glick Date: Thu, 2 Oct 2025 09:42:35 -0700 Subject: [PATCH 30/35] Update cuda_core/cuda/core/experimental/_memory.pyx Co-authored-by: Keith Kraus --- cuda_core/cuda/core/experimental/_memory.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index b453e8b26..ba376b9c3 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -28,8 +28,6 @@ from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_ if platform.system() == "Linux": import socket -# (Removed) helper for driver calls; use raise_if_driver_error with direct calls instead. 
- if TYPE_CHECKING: import cuda.bindings.driver From ae8263c320f3fcb91d4631f9a6f6ba2b36ddf194 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Thu, 2 Oct 2025 10:43:03 -0700 Subject: [PATCH 31/35] Handle missing error check and address review comments --- cuda_core/cuda/core/experimental/_memory.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index ba376b9c3..2ff306ea4 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -1088,7 +1088,7 @@ class VirtualMemoryResource(MemoryResource): if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): # Check for specific errors that are not recoverable with the slow path if res in (driver.CUresult.CUDA_ERROR_INVALID_VALUE, driver.CUresult.CUDA_ERROR_NOT_PERMITTED, driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, driver.CUresult.CUDA_ERROR_NOT_SUPPORTED): - raise RuntimeError(f"Failed to extend VA range: {res}") + raise_if_driver_error(res) res2, = driver.cuMemAddressFree(new_ptr, aligned_additional_size) raise_if_driver_error(res2) # Fallback: couldn't extend contiguously, need full remapping @@ -1187,7 +1187,8 @@ class VirtualMemoryResource(MemoryResource): def _remap_old(): # Try to remap the old physical memory back to the original VA range try: - driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) + res, = driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) + raise_if_driver_error(res) except Exception: pass trans.append(_remap_old) @@ -1307,9 +1308,6 @@ class VirtualMemoryResource(MemoryResource): prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) - if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: - raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") - prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) prop.location.id = self.device.device_id if config.location_type == "device" else -1 prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 From fea55e0d680f2b6e409049501f9674ed97b0e940 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 7 Oct 2025 22:24:54 +0000 Subject: [PATCH 32/35] nit: hide non-public dataclass members --- cuda_core/cuda/core/experimental/_memory.pyx | 25 +++++++++++--------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 0fc6ef895..044d80196 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -1087,12 +1087,14 @@ class _SynchronousMemoryResource(MemoryResource): def device_id(self) -> int: return self._dev_id + VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none"] VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"] VirtualMemoryGranularityT = Literal["minimum", "recommended"] VirtualMemoryAccessTypeT = Literal["rw", "r", "none"] VirtualMemoryAllocationTypeT = Literal["pinned", "managed"] + @dataclass class VirtualMemoryResourceOptions: """A configuration object for the VirtualMemoryResource @@ -1122,20 +1124,21 @@ class VirtualMemoryResourceOptions: peers: Iterable[int] = field(default_factory=tuple) self_access: VirtualMemoryAccessTypeT = "rw" 
From fea55e0d680f2b6e409049501f9674ed97b0e940 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Tue, 7 Oct 2025 22:24:54 +0000
Subject: [PATCH 32/35] nit: hide non-public dataclass members

---
 cuda_core/cuda/core/experimental/_memory.pyx | 25 +++++++++++---------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 0fc6ef895..044d80196 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -1087,12 +1087,14 @@ class _SynchronousMemoryResource(MemoryResource):
     def device_id(self) -> int:
         return self._dev_id

+
 VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none"]
 VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"]
 VirtualMemoryGranularityT = Literal["minimum", "recommended"]
 VirtualMemoryAccessTypeT = Literal["rw", "r", "none"]
 VirtualMemoryAllocationTypeT = Literal["pinned", "managed"]

+
 @dataclass
 class VirtualMemoryResourceOptions:
     """A configuration object for the VirtualMemoryResource
@@ -1122,20 +1124,21 @@ class VirtualMemoryResourceOptions:
     peers: Iterable[int] = field(default_factory=tuple)
     self_access: VirtualMemoryAccessTypeT = "rw"
     peer_access: VirtualMemoryAccessTypeT = "rw"

-    a = driver.CUmemAccess_flags
-    _access_flags = {"rw": a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": a.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0}
-    h = driver.CUmemAllocationHandleType
-    _handle_types = {"none": h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": h.CU_MEM_HANDLE_TYPE_FABRIC}
-    g = driver.CUmemAllocationGranularity_flags
-    _granularity = {"recommended": g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": g.CU_MEM_ALLOC_GRANULARITY_MINIMUM}
-    l = driver.CUmemLocationType
-    _location_type = {"device": l.CU_MEM_LOCATION_TYPE_DEVICE, "host": l.CU_MEM_LOCATION_TYPE_HOST, "host_numa": l.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT}
+
+    _a = driver.CUmemAccess_flags
+    _access_flags = {"rw": _a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": _a.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0}
+    _h = driver.CUmemAllocationHandleType
+    _handle_types = {"none": _h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC}
+    _g = driver.CUmemAllocationGranularity_flags
+    _granularity = {"recommended": _g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": _g.CU_MEM_ALLOC_GRANULARITY_MINIMUM}
+    _l = driver.CUmemLocationType
+    _location_type = {"device": _l.CU_MEM_LOCATION_TYPE_DEVICE, "host": _l.CU_MEM_LOCATION_TYPE_HOST, "host_numa": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT}
     # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not
-    a = driver.CUmemAllocationType
-    _allocation_type = {"pinned": a.CU_MEM_ALLOCATION_TYPE_PINNED}
+    _a = driver.CUmemAllocationType
+    _allocation_type = {"pinned": _a.CU_MEM_ALLOCATION_TYPE_PINNED}
     ver_major, ver_minor = get_binding_version()
     if ver_major >= 13:
-        _allocation_type["managed"] = a.CU_MEM_ALLOCATION_TYPE_MANAGED
+        _allocation_type["managed"] = _a.CU_MEM_ALLOCATION_TYPE_MANAGED

     @staticmethod
     def _access_to_flags(spec: str):

From 6450712da7e5d1e43b48d0168e534605fa5f552b Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Tue, 7 Oct 2025 22:28:08 +0000
Subject: [PATCH 33/35] add basic docs

---
 cuda_core/docs/source/api.rst                 | 2 ++
 cuda_core/docs/source/release/0.X.Y-notes.rst | 1 +
 2 files changed, 3 insertions(+)

diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index f239c69cd..d7f4d3642 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -27,6 +27,7 @@ CUDA runtime
    MemoryResource
    DeviceMemoryResource
    LegacyPinnedMemoryResource
+   VirtualMemoryResource

    :template: dataclass.rst

@@ -36,6 +37,7 @@ CUDA runtime
    GraphDebugPrintOptions
    StreamOptions
    LaunchConfig
+   VirtualMemoryResourceOptions


 CUDA compilation toolchain

diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst
index 7c1487329..7907839e8 100644
--- a/cuda_core/docs/source/release/0.X.Y-notes.rst
+++ b/cuda_core/docs/source/release/0.X.Y-notes.rst
@@ -32,6 +32,7 @@ New features
 - Stream-ordered memory allocation can now be shared on Linux via :class:`DeviceMemoryResource`.
 - Added NVVM IR support to :class:`Program`. NVVM IR is now understood with ``code_type="nvvm"``.
 - Added an :attr:`ObjectCode.code_type` attribute for querying the code type.
+- Added :class:`VirtualMemoryResource` for low-level virtual memory management.

 New examples
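With the docs entries in place, end-to-end use looks roughly like the tests further below. This sketch assumes VirtualMemoryResource and VirtualMemoryResourceOptions are re-exported from cuda.core.experimental (as the api.rst listing implies), that Buffer.close() behaves as elsewhere in cuda.core, and that the platform is Linux, per the Windows restriction added in the final patch:

    from cuda.core.experimental import Device, VirtualMemoryResource, VirtualMemoryResourceOptions

    device = Device()
    device.set_current()

    # Defaults are fine on Linux; the dataclass fields above list the available knobs.
    options = VirtualMemoryResourceOptions()
    vmm_mr = VirtualMemoryResource(device, config=options)

    buffer = vmm_mr.allocate(4096)  # size is rounded up to the allocation granularity
    # ... launch work that reads/writes the buffer ...
    buffer.close()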
From 4af54ac5ec9015f19bcc27e5a083fc77318c1805 Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Tue, 7 Oct 2025 17:11:12 -0700
Subject: [PATCH 34/35] add windows support

---
 cuda_core/cuda/core/experimental/_memory.pyx |  2 +-
 cuda_core/tests/test_memory.py               | 13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 044d80196..a7ec0751c 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -1088,7 +1088,7 @@ class _SynchronousMemoryResource(MemoryResource):
         return self._dev_id


-VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none"]
+VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none", "win32"]
 VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"]
 VirtualMemoryGranularityT = Literal["minimum", "recommended"]
 VirtualMemoryAccessTypeT = Literal["rw", "r", "none"]
 VirtualMemoryAllocationTypeT = Literal["pinned", "managed"]

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 028055a54..f7d07260f 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -322,9 +322,11 @@ def test_vmm_allocator_basic_allocation():
     """
     device = Device()
     device.set_current()
-
+    options = VirtualMemoryResourceOptions()
+    if platform.system() == "Windows":
+        options.handle_type = "win32"
     # Create VMM allocator with default config
-    vmm_mr = VirtualMemoryResource(device)
+    vmm_mr = VirtualMemoryResource(device, config=options)

     # Test basic allocation
     buffer = vmm_mr.allocate(4096)
@@ -363,7 +365,7 @@ def test_vmm_allocator_policy_configuration():
         location_type="device",
         granularity="minimum",
         gpu_direct_rdma=True,
-        handle_type="posix_fd",
+        handle_type="posix_fd" if platform.system() != "Windows" else "win32",
         peers=(),
         self_access="rw",
         peer_access="rw",
@@ -412,7 +414,10 @@ def test_vmm_allocator_grow_allocation():
     device = Device()
     device.set_current()

-    vmm_mr = VirtualMemoryResource(device)
+    options = VirtualMemoryResourceOptions()
+    if platform.system() == "Windows":
+        options.handle_type = "win32"
+    vmm_mr = VirtualMemoryResource(device, config=options)

     # Create initial allocation
     buffer = vmm_mr.allocate(2 * 1024 * 1024)
From 9db04b10715f1c83c26cf8cbf1700bfe8f431bfa Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Tue, 7 Oct 2025 18:38:19 -0700
Subject: [PATCH 35/35] remove windows tests

---
 cuda_core/cuda/core/experimental/_memory.pyx |  4 +++-
 cuda_core/tests/test_memory.py               | 11 +++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index a7ec0751c..3786f066b 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -1088,7 +1088,7 @@ class _SynchronousMemoryResource(MemoryResource):
         return self._dev_id


-VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none", "win32"]
+VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none", "win32", "win32_kmt", "fabric"]
 VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"]
 VirtualMemoryGranularityT = Literal["minimum", "recommended"]
 VirtualMemoryAccessTypeT = Literal["rw", "r", "none"]
 VirtualMemoryAllocationTypeT = Literal["pinned", "managed"]
@@ -1194,6 +1194,8 @@ class VirtualMemoryResource(MemoryResource):
         )
         if self.config.location_type == "host":
             self.device = None
+        if platform.system() == "Windows":
+            raise NotImplementedError("VirtualMemoryResource is not supported on Windows")

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index f7d07260f..8c980837e 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -320,11 +320,11 @@ def test_vmm_allocator_basic_allocation():
     This test verifies that VirtualMemoryResource can allocate memory
     using CUDA VMM APIs with default configuration.
     """
+    if platform.system() == "Windows":
+        pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
     device = Device()
     device.set_current()
     options = VirtualMemoryResourceOptions()
-    if platform.system() == "Windows":
-        options.handle_type = "win32"
     # Create VMM allocator with default config
     vmm_mr = VirtualMemoryResource(device, config=options)

@@ -356,6 +356,8 @@ def test_vmm_allocator_policy_configuration():
     with different allocation policies and that the configuration
     affects the allocation behavior.
     """
+    if platform.system() == "Windows":
+        pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
     device = Device()
     device.set_current()

@@ -411,12 +413,13 @@ def test_vmm_allocator_grow_allocation():
    This test verifies that VirtualMemoryResource can grow existing allocations
    while preserving the base pointer when possible.
    """
+    if platform.system() == "Windows":
+        pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
     device = Device()
     device.set_current()

     options = VirtualMemoryResourceOptions()
-    if platform.system() == "Windows":
-        options.handle_type = "win32"
+
     vmm_mr = VirtualMemoryResource(device, config=options)

     # Create initial allocation