2 changes: 1 addition & 1 deletion cuda_core/cuda/core/experimental/__init__.py
@@ -14,7 +14,7 @@
from cuda.core.experimental._launch_config import LaunchConfig
from cuda.core.experimental._launcher import launch
from cuda.core.experimental._linker import Linker, LinkerOptions
-from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource
+from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource, VMMAllocatedMemoryResource
from cuda.core.experimental._module import Kernel, ObjectCode
from cuda.core.experimental._program import Program, ProgramOptions
from cuda.core.experimental._stream import Stream, StreamOptions
36 changes: 36 additions & 0 deletions cuda_core/cuda/core/experimental/_device.py
@@ -1312,6 +1312,42 @@ def allocate(self, size, stream: Optional[Stream] = None) -> Buffer:
            stream = default_stream()
        return self._mr.allocate(size, stream)

    def create_vmm_memory_resource(self, allocation_type=None) -> "VMMAllocatedMemoryResource":
        """Create a VMMAllocatedMemoryResource for this device.

        Creates a memory resource that uses CUDA's Virtual Memory Management APIs
        for fine-grained control over memory allocation and mapping. This is useful for:

        - NVSHMEM/NCCL external buffer registration
        - Growing allocations without changing pointer addresses
        - EGM (Extended GPU Memory) on Grace-Hopper or Grace-Blackwell systems
        - Custom memory access patterns and sharing between processes

        Parameters
        ----------
        allocation_type : driver.CUmemAllocationType, optional
            The type of memory allocation. Defaults to CU_MEM_ALLOCATION_TYPE_PINNED.

        Returns
        -------
        VMMAllocatedMemoryResource
            A newly-created VMMAllocatedMemoryResource for this device.

        Raises
        ------
        RuntimeError
            If this device does not support virtual memory management.

        Examples
        --------
        >>> device = Device()
        >>> vmm_mr = device.create_vmm_memory_resource()
        >>> device.memory_resource = vmm_mr  # Set as default for the device
        >>> buffer = device.allocate(1024)  # Now uses VMM allocation
        """
        from cuda.core.experimental._memory import VMMAllocatedMemoryResource

        return VMMAllocatedMemoryResource(self._id, allocation_type)

    def sync(self):
        """Synchronize the device.

169 changes: 169 additions & 0 deletions cuda_core/cuda/core/experimental/_memory.pyx
@@ -508,3 +508,172 @@ class _SynchronousMemoryResource(MemoryResource):
    @property
    def device_id(self) -> int:
        return self._dev_id


class VMMAllocatedMemoryResource(MemoryResource):
    """Create a memory resource that uses CUDA's Virtual Memory Management APIs.

    This memory resource uses cuMemCreate, cuMemAddressReserve, cuMemMap, and related
    APIs to provide fine-grained control over memory allocation and mapping. This is
    useful for:

    - NVSHMEM/NCCL external buffer registration
    - Growing allocations without changing pointer addresses
    - EGM (Extended GPU Memory) on Grace-Hopper or Grace-Blackwell systems
    - Custom memory access patterns and sharing between processes

    Parameters
    ----------
    device_id : int
        Device ordinal for which memory allocations will be created.
    allocation_type : driver.CUmemAllocationType, optional
        The type of memory allocation. Defaults to CU_MEM_ALLOCATION_TYPE_PINNED.
    """

    __slots__ = ("_dev_id", "_allocation_type", "_allocations")

    def __init__(self, device_id: int, allocation_type=None):
        if allocation_type is None:
            allocation_type = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED

        self._dev_id = device_id
        self._allocation_type = allocation_type
        self._allocations = {}  # Track allocations: ptr -> (handle, reserved_ptr, size)
        self._handle = None

        # Check if device supports virtual memory management
        err, vmm_supported = driver.cuDeviceGetAttribute(
            driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
            device_id
        )
        raise_if_driver_error(err)
        if not vmm_supported:
            raise RuntimeError(f"Device {device_id} does not support virtual memory management")

    def allocate(self, size_t size, stream: Stream = None) -> Buffer:
        """Allocate a buffer using virtual memory management APIs.

        Parameters
        ----------
        size : int
            The size of the buffer to allocate, in bytes.
        stream : Stream, optional
            Currently ignored as VMM operations are synchronous.

        Returns
        -------
        Buffer
            The allocated buffer object, which is accessible on the device.
        """
        # Get allocation granularity
        allocation_prop = driver.CUmemAllocationProp()
        allocation_prop.type = self._allocation_type
        allocation_prop.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
        allocation_prop.location.id = self._dev_id
        allocation_prop.requestedHandleTypes = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE

        err, granularity = driver.cuMemGetAllocationGranularity(
            allocation_prop,
            driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM
        )
        raise_if_driver_error(err)

        # Round size up to granularity
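        # (e.g., size=1000 with a 2 MiB granularity, a common minimum, yields aligned_size = 2 MiB)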
        aligned_size = ((size + granularity - 1) // granularity) * granularity

        # Create the memory allocation
        err, mem_handle = driver.cuMemCreate(aligned_size, allocation_prop, 0)
        raise_if_driver_error(err)

        # Reserve address space
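        # cuMemAddressReserve arguments: size, alignment (0 = default), fixed VA hint (0 = none), flags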
        err, reserved_ptr = driver.cuMemAddressReserve(aligned_size, 0, 0, 0)
        raise_if_driver_error(err)

        try:
            # Map the allocation to the reserved address
            err, = driver.cuMemMap(reserved_ptr, aligned_size, 0, mem_handle, 0)
            raise_if_driver_error(err)

            # Set access permissions
            access_desc = driver.CUmemAccessDesc()
            access_desc.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
            access_desc.location.id = self._dev_id
            access_desc.flags = driver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE

            err, = driver.cuMemSetAccess(reserved_ptr, aligned_size, [access_desc], 1)
            raise_if_driver_error(err)

            # Store allocation info for cleanup
            self._allocations[int(reserved_ptr)] = (mem_handle, reserved_ptr, aligned_size)

            return Buffer._init(reserved_ptr, size, self)

        except Exception:
            # Clean up on error
            try:
                driver.cuMemAddressFree(reserved_ptr, aligned_size)
            except Exception:
                pass
            try:
                driver.cuMemRelease(mem_handle)
            except Exception:
                pass
            raise

    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None):
        """Deallocate a buffer previously allocated by this resource.

        Parameters
        ----------
        ptr : DevicePointerT
            The pointer to the buffer to deallocate.
        size : int
            The size of the buffer to deallocate, in bytes.
        stream : Stream, optional
            Currently ignored as VMM operations are synchronous.
        """
        ptr_int = int(ptr)
        if ptr_int not in self._allocations:
            raise ValueError(f"Pointer 0x{ptr_int:x} was not allocated by this memory resource")

        mem_handle, reserved_ptr, aligned_size = self._allocations.pop(ptr_int)

        # Unmap the memory
        err, = driver.cuMemUnmap(reserved_ptr, aligned_size)
        raise_if_driver_error(err)

        # Free the address reservation
        err, = driver.cuMemAddressFree(reserved_ptr, aligned_size)
        raise_if_driver_error(err)

        # Release the memory handle
        err, = driver.cuMemRelease(mem_handle)
        raise_if_driver_error(err)

    @property
    def is_device_accessible(self) -> bool:
        """bool: this memory resource provides device-accessible buffers."""
        return True

    @property
    def is_host_accessible(self) -> bool:
        """bool: this memory resource does not provide host-accessible buffers by default."""
        # VMM allocations are typically device-only unless specifically configured for host access
        return False

    @property
    def device_id(self) -> int:
        """int: the associated device ordinal."""
        return self._dev_id

    def __del__(self):
        """Clean up any remaining allocations."""
        for ptr_int, (mem_handle, reserved_ptr, aligned_size) in list(self._allocations.items()):
            try:
                driver.cuMemUnmap(reserved_ptr, aligned_size)
                driver.cuMemAddressFree(reserved_ptr, aligned_size)
                driver.cuMemRelease(mem_handle)
            except Exception:
                pass  # Ignore errors during cleanup
        self._allocations.clear()
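To make the "growing allocations without changing pointer addresses" bullet concrete, here is a minimal sketch of that pattern, separate from the class above. It assumes an initialized CUDA context, a `chunk_size` that is a multiple of the allocation granularity (see cuMemGetAllocationGranularity), and the same `driver` binding used in this file; `make_growable` is a hypothetical helper name and error handling is elided:

from cuda.core.experimental._utils.cuda_utils import driver

def make_growable(dev_id: int, max_size: int, chunk_size: int):
    # Reserve the full virtual range once; the base address never changes.
    err, base = driver.cuMemAddressReserve(max_size, 0, 0, 0)

    prop = driver.CUmemAllocationProp()
    prop.type = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
    prop.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
    prop.location.id = dev_id

    access = driver.CUmemAccessDesc()
    access.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
    access.location.id = dev_id
    access.flags = driver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE

    mapped = 0

    def grow():
        # Back one more chunk of the reserved range with physical memory;
        # existing pointers into [base, base + mapped) remain valid.
        nonlocal mapped
        err, handle = driver.cuMemCreate(chunk_size, prop, 0)
        driver.cuMemMap(int(base) + mapped, chunk_size, 0, handle, 0)
        driver.cuMemSetAccess(int(base) + mapped, chunk_size, [access], 1)
        mapped += chunk_size

    return base, grow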
103 changes: 103 additions & 0 deletions cuda_core/examples/vmm_memory_example.py
@@ -0,0 +1,103 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""
Example demonstrating the VMMAllocatedMemoryResource for fine-grained memory management.

This example shows how to use CUDA's Virtual Memory Management APIs through the
VMMAllocatedMemoryResource class for advanced memory allocation scenarios.
"""

import sys

from cuda.core.experimental import Device
from cuda.core.experimental._utils.cuda_utils import driver


def main():
    """Demonstrate VMMAllocatedMemoryResource usage."""
    try:
        # Get the default device and establish a CUDA context on it
        # (required before any allocation or mapping call)
        device = Device()
        device.set_current()
        print(f"Using device {device.device_id}: {device.name}")

        # Check if device supports virtual memory management
        err, vmm_supported = driver.cuDeviceGetAttribute(
            driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
            device.device_id
        )

        if err != driver.CUresult.CUDA_SUCCESS or not vmm_supported:
            print(f"Device {device.device_id} does not support virtual memory management.")
            print("This feature requires a CUDA 10.2+ driver and a GPU that reports VMM support.")
            sys.exit(1)

        print(f"Device {device.device_id} supports virtual memory management!")

        # Create a VMMAllocatedMemoryResource using the convenience method
        vmm_mr = device.create_vmm_memory_resource()
        print(f"Created VMMAllocatedMemoryResource for device {device.device_id}")

        # Optionally set it as the default memory resource for the device
        # device.memory_resource = vmm_mr

        # Create a stream for operations
        stream = device.create_stream()

        # Allocate some memory using VMM
        sizes = [1024, 4096, 1024*1024]  # 1KB, 4KB, 1MB
        buffers = []

        print("\nAllocating buffers using VMM:")
        for i, size in enumerate(sizes):
            buffer = vmm_mr.allocate(size, stream)
            buffers.append(buffer)
            print(f"  Buffer {i+1}: {size:,} bytes at address 0x{int(buffer.handle):016x}")

        # Verify properties of the last allocated buffer
        assert buffer.is_device_accessible
        assert not buffer.is_host_accessible
        assert buffer.device_id == device.device_id
        assert buffer.memory_resource is vmm_mr

        # Demonstrate buffer copying; cuda.core requires src and dst sizes to match
        print("\nCopying buffer 1 into a new buffer of the same size...")
        # Note: In a real application, you would initialize buffer 1 with data first
        dst = vmm_mr.allocate(sizes[0], stream)
        dst.copy_from(buffers[0], stream=stream)
        stream.sync()  # Wait for the copy to complete
        dst.close()
        print("Copy completed!")

        # Clean up buffers
        print("\nCleaning up buffers:")
        for i, buffer in enumerate(buffers):
            buffer.close()
            print(f"  Buffer {i+1} deallocated")

        print("\nVMM memory management example completed successfully!")

        # Demonstrate advanced usage: custom allocation type
        print("\nDemonstrating custom allocation type:")
        try:
            # Create with managed memory type (if supported)
            vmm_mr_managed = device.create_vmm_memory_resource(
                driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED
            )

            managed_buffer = vmm_mr_managed.allocate(4096, stream)
            print(f"  Managed buffer: 4096 bytes at address 0x{int(managed_buffer.handle):016x}")
            managed_buffer.close()
            print("  Managed buffer deallocated")

        except Exception as e:
            print(f"  Managed memory allocation failed: {e}")
            print("  This is expected on some systems/drivers")

    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
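One more note on the "sharing between processes" bullet: the resource above requests CU_MEM_HANDLE_TYPE_NONE, so its allocations cannot be exported to other processes as-is. Below is a minimal sketch of what sharing requires, assuming a Linux system (POSIX file descriptor handles), an initialized CUDA context, and a granularity-aligned size; it is illustrative only and not part of this change:

from cuda.core.experimental._utils.cuda_utils import driver

prop = driver.CUmemAllocationProp()
prop.type = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
prop.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
prop.location.id = 0  # device ordinal
# Request an exportable handle type at creation time
prop.requestedHandleTypes = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR

size = 2 * 1024 * 1024  # must be a multiple of the allocation granularity
err, handle = driver.cuMemCreate(size, prop, 0)

# Export an OS file descriptor; a peer process imports it with
# cuMemImportFromShareableHandle and then maps it into its own VA range.
err, fd = driver.cuMemExportToShareableHandle(
    handle, driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0
)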