diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index fffb80a5c..5ceb9a022 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -14,7 +14,7 @@ from cuda.core.experimental._launch_config import LaunchConfig from cuda.core.experimental._launcher import launch from cuda.core.experimental._linker import Linker, LinkerOptions -from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource +from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource, VMMAllocatedMemoryResource from cuda.core.experimental._module import Kernel, ObjectCode from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._stream import Stream, StreamOptions diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 0499baa58..c3c84e690 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -1312,6 +1312,42 @@ def allocate(self, size, stream: Optional[Stream] = None) -> Buffer: stream = default_stream() return self._mr.allocate(size, stream) + def create_vmm_memory_resource(self, allocation_type=None) -> "VMMAllocatedMemoryResource": + """Create a VMMAllocatedMemoryResource for this device. + + Creates a memory resource that uses CUDA's Virtual Memory Management APIs + for fine-grained control over memory allocation and mapping. This is useful for: + + - NVSHMEM/NCCL external buffer registration + - Growing allocations without changing pointer addresses + - EGM (Extended GPU Memory) on Grace-Hopper or Grace-Blackwell systems + - Custom memory access patterns and sharing between processes + + Parameters + ---------- + allocation_type : driver.CUmemAllocationType, optional + The type of memory allocation. 
Defaults to CU_MEM_ALLOCATION_TYPE_PINNED. + + Returns + ------- + VMMAllocatedMemoryResource + A newly-created VMMAllocatedMemoryResource for this device. + + Raises + ------ + RuntimeError + If this device does not support virtual memory management. + + Examples + -------- + >>> device = Device() + >>> vmm_mr = device.create_vmm_memory_resource() + >>> device.memory_resource = vmm_mr # Set as default for the device + >>> buffer = device.allocate(1024) # Now uses VMM allocation + """ + from cuda.core.experimental._memory import VMMAllocatedMemoryResource + return VMMAllocatedMemoryResource(self._id, allocation_type) + def sync(self): """Synchronize the device. diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 44e7a77c7..eb1b58567 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -508,3 +508,172 @@ class _SynchronousMemoryResource(MemoryResource): @property def device_id(self) -> int: return self._dev_id + + +class VMMAllocatedMemoryResource(MemoryResource): + """Create a memory resource that uses CUDA's Virtual Memory Management APIs. + + This memory resource uses cuMemCreate, cuMemAddressReserve, cuMemMap, and related + APIs to provide fine-grained control over memory allocation and mapping. This is + useful for: + + - NVSHMEM/NCCL external buffer registration + - Growing allocations without changing pointer addresses + - EGM (Extended GPU Memory) on Grace-Hopper or Grace-Blackwell systems + - Custom memory access patterns and sharing between processes + + Parameters + ---------- + device_id : int + Device ordinal for which memory allocations will be created. + allocation_type : driver.CUmemAllocationType, optional + The type of memory allocation. Defaults to CU_MEM_ALLOCATION_TYPE_PINNED. 
+ """ + + __slots__ = ("_dev_id", "_allocation_type", "_allocations") + + def __init__(self, device_id: int, allocation_type=None): + if allocation_type is None: + allocation_type = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED + + self._dev_id = device_id + self._allocation_type = allocation_type + self._allocations = {} # Track allocations: ptr -> (handle, reserved_ptr, size) + self._handle = None + + # Check if device supports virtual memory management + err, vmm_supported = driver.cuDeviceGetAttribute( + driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, + device_id + ) + raise_if_driver_error(err) + if not vmm_supported: + raise RuntimeError(f"Device {device_id} does not support virtual memory management") + + def allocate(self, size_t size, stream: Stream = None) -> Buffer: + """Allocate a buffer using virtual memory management APIs. + + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : Stream, optional + Currently ignored as VMM operations are synchronous. + + Returns + ------- + Buffer + The allocated buffer object, which is accessible on the device. 
+ """ + # Get allocation granularity + allocation_prop = driver.CUmemAllocationProp() + allocation_prop.type = self._allocation_type + allocation_prop.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + allocation_prop.location.id = self._dev_id + allocation_prop.requestedHandleTypes = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + + err, granularity = driver.cuMemGetAllocationGranularity( + allocation_prop, + driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM + ) + raise_if_driver_error(err) + + # Round size up to granularity + aligned_size = ((size + granularity - 1) // granularity) * granularity + + # Create the memory allocation + err, mem_handle = driver.cuMemCreate(aligned_size, allocation_prop, 0) + raise_if_driver_error(err) + + # Reserve address space + err, reserved_ptr = driver.cuMemAddressReserve(aligned_size, 0, 0, 0) + raise_if_driver_error(err) + + try: + # Map the allocation to the reserved address + err, = driver.cuMemMap(reserved_ptr, aligned_size, 0, mem_handle, 0) + raise_if_driver_error(err) + + # Set access permissions + access_desc = driver.CUmemAccessDesc() + access_desc.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + access_desc.location.id = self._dev_id + access_desc.flags = driver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE + + err, = driver.cuMemSetAccess(reserved_ptr, aligned_size, [access_desc], 1) + raise_if_driver_error(err) + + # Store allocation info for cleanup + self._allocations[int(reserved_ptr)] = (mem_handle, reserved_ptr, aligned_size) + + return Buffer._init(reserved_ptr, size, self) + + except Exception: + # Clean up on error + try: + driver.cuMemAddressFree(reserved_ptr, aligned_size) + except: + pass + try: + driver.cuMemRelease(mem_handle) + except: + pass + raise + + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): + """Deallocate a buffer previously allocated by this resource. 
+ + Parameters + ---------- + ptr : DevicePointerT + The pointer to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. + stream : Stream, optional + Currently ignored as VMM operations are synchronous. + """ + ptr_int = int(ptr) + if ptr_int not in self._allocations: + raise ValueError(f"Pointer {ptr_int:x} was not allocated by this memory resource") + + mem_handle, reserved_ptr, aligned_size = self._allocations.pop(ptr_int) + + # Unmap the memory + err, = driver.cuMemUnmap(reserved_ptr, aligned_size) + raise_if_driver_error(err) + + # Free the address reservation + err, = driver.cuMemAddressFree(reserved_ptr, aligned_size) + raise_if_driver_error(err) + + # Release the memory handle + err, = driver.cuMemRelease(mem_handle) + raise_if_driver_error(err) + + @property + def is_device_accessible(self) -> bool: + """bool: this memory resource provides device-accessible buffers.""" + return True + + @property + def is_host_accessible(self) -> bool: + """bool: this memory resource does not provide host-accessible buffers by default.""" + # VMM allocations are typically device-only unless specifically configured for host access + return False + + @property + def device_id(self) -> int: + """int: the associated device ordinal.""" + return self._dev_id + + def __del__(self): + """Clean up any remaining allocations.""" + # Clean up any remaining allocations + for ptr_int, (mem_handle, reserved_ptr, aligned_size) in list(self._allocations.items()): + try: + driver.cuMemUnmap(reserved_ptr, aligned_size) + driver.cuMemAddressFree(reserved_ptr, aligned_size) + driver.cuMemRelease(mem_handle) + except: + pass # Ignore errors during cleanup + self._allocations.clear() diff --git a/cuda_core/examples/vmm_memory_example.py b/cuda_core/examples/vmm_memory_example.py new file mode 100644 index 000000000..07115d2e6 --- /dev/null +++ b/cuda_core/examples/vmm_memory_example.py @@ -0,0 +1,103 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""
Example demonstrating the VMMAllocatedMemoryResource for fine-grained memory management.

This example shows how to use CUDA's Virtual Memory Management APIs through the
VMMAllocatedMemoryResource class for advanced memory allocation scenarios.
"""

import sys

from cuda.core.experimental import Device
from cuda.core.experimental._utils.cuda_utils import driver


def _vmm_supported(device) -> bool:
    """Return True when *device* reports VMM support via the driver attribute."""
    err, supported = driver.cuDeviceGetAttribute(
        driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
        device.device_id,
    )
    return err == driver.CUresult.CUDA_SUCCESS and bool(supported)


def main():
    """Demonstrate VMMAllocatedMemoryResource usage."""
    device = Device()
    # A current context is required before any allocation or stream creation.
    device.set_current()
    print(f"Using device {device.device_id}: {device.properties.name}")

    if not _vmm_supported(device):
        print(f"Device {device.device_id} does not support virtual memory management.")
        sys.exit(1)

    print(f"Device {device.device_id} supports virtual memory management!")

    # Create a VMMAllocatedMemoryResource using the convenience method.
    vmm_mr = device.create_vmm_memory_resource()
    print(f"Created VMMAllocatedMemoryResource for device {device.device_id}")

    # Optionally set it as the default memory resource for the device:
    # device.memory_resource = vmm_mr

    # Streams come from the device; Stream objects are not constructed directly.
    stream = device.create_stream()

    # Allocate some memory using VMM.
    sizes = [1024, 4096, 1024 * 1024]  # 1KB, 4KB, 1MB
    buffers = []

    print("\nAllocating buffers using VMM:")
    for i, size in enumerate(sizes):
        buffer = vmm_mr.allocate(size, stream)
        buffers.append(buffer)
        print(f"  Buffer {i + 1}: {size:,} bytes at address 0x{int(buffer.handle):016x}")

        # Verify buffer properties.
        assert buffer.is_device_accessible
        assert not buffer.is_host_accessible
        assert buffer.device_id == device.device_id
        assert buffer.memory_resource is vmm_mr

    # Demonstrate buffer copying between two equal-sized buffers.
    print("\nCopying between two 4 KiB buffers...")
    src = vmm_mr.allocate(4096, stream)
    dst = vmm_mr.allocate(4096, stream)
    # Note: in a real application you would initialize src with data first.
    dst.copy_from(src, stream=stream)
    stream.sync()  # Wait for the copy to complete.
    src.close()
    dst.close()
    print("Copy completed!")

    # Clean up buffers.
    print("\nCleaning up buffers:")
    for i, buffer in enumerate(buffers):
        buffer.close()
        print(f"  Buffer {i + 1} deallocated")

    print("\nVMM memory management example completed successfully!")

    # Demonstrate advanced usage: custom allocation type.
    print("\nDemonstrating custom allocation type:")
    try:
        # NOTE(review): CU_MEM_ALLOCATION_TYPE_MANAGED is not available on all
        # driver versions -- the AttributeError/driver error is caught below.
        vmm_mr_managed = device.create_vmm_memory_resource(
            driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED
        )
        managed_buffer = vmm_mr_managed.allocate(4096, stream)
        print(f"  Managed buffer: 4096 bytes at address 0x{int(managed_buffer.handle):016x}")
        managed_buffer.close()
        print("  Managed buffer deallocated")
    except Exception as e:
        print(f"  Managed memory allocation failed: {e}")
        print("  This is expected on some systems/drivers")


if __name__ == "__main__":
    main()
#
# SPDX-License-Identifier: Apache-2.0

import pytest

from cuda.core.experimental import Device, VMMAllocatedMemoryResource
from cuda.core.experimental._utils.cuda_utils import driver


def _skip_unless_vmm_supported(device):
    """Skip the calling test when *device* does not support VMM.

    Centralizes the driver-attribute query that was previously duplicated in
    every test method.
    """
    err, vmm_supported = driver.cuDeviceGetAttribute(
        driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
        device.device_id,
    )
    if err != driver.CUresult.CUDA_SUCCESS or not vmm_supported:
        pytest.skip("Device does not support virtual memory management")


class TestVMMAllocatedMemoryResource:
    def test_vmm_memory_resource_creation(self):
        """Test creating a VMMAllocatedMemoryResource."""
        device = Device()
        _skip_unless_vmm_supported(device)

        mr = device.create_vmm_memory_resource()

        assert mr.device_id == device.device_id
        assert mr.is_device_accessible is True
        assert mr.is_host_accessible is False

    def test_vmm_memory_resource_allocation_deallocation(self):
        """Test allocating and deallocating memory with VMMAllocatedMemoryResource."""
        device = Device()
        _skip_unless_vmm_supported(device)

        mr = device.create_vmm_memory_resource()

        # Test allocation.
        size = 1024 * 1024  # 1 MB
        buffer = mr.allocate(size)

        assert buffer.size == size
        assert buffer.memory_resource is mr
        assert buffer.is_device_accessible is True
        assert buffer.is_host_accessible is False
        assert buffer.device_id == device.device_id

        # Test deallocation; the handle is cleared once the buffer is closed.
        buffer.close()
        assert buffer.handle is None

    def test_vmm_memory_resource_multiple_allocations(self):
        """Test multiple allocations with VMMAllocatedMemoryResource."""
        device = Device()
        _skip_unless_vmm_supported(device)

        mr = device.create_vmm_memory_resource()

        # Allocate several buffers of different sizes.
        buffers = []
        for i in range(5):
            size = (i + 1) * 1024
            buffer = mr.allocate(size)
            buffers.append(buffer)

            assert buffer.size == size
            assert buffer.memory_resource is mr

        # Deallocate all buffers.
        for buffer in buffers:
            buffer.close()

    def test_vmm_memory_resource_with_different_allocation_types(self):
        """Test VMMAllocatedMemoryResource with different allocation types."""
        device = Device()
        _skip_unless_vmm_supported(device)

        # Test with the pinned allocation type (also the default).
        mr_pinned = device.create_vmm_memory_resource(
            driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
        )

        buffer = mr_pinned.allocate(1024)
        assert buffer.size == 1024
        buffer.close()

    def test_vmm_memory_resource_invalid_device(self):
        """Test VMMAllocatedMemoryResource creation with an invalid device ordinal."""
        # Constructing directly with a bogus ordinal must fail in __init__
        # (the driver rejects the attribute query). No need to mutate a valid
        # Device's private state.
        with pytest.raises(Exception):
            VMMAllocatedMemoryResource(999)

    def test_vmm_memory_resource_deallocate_untracked_pointer(self):
        """Test deallocating a pointer that wasn't allocated by this resource."""
        device = Device()
        _skip_unless_vmm_supported(device)

        mr = device.create_vmm_memory_resource()

        # Try to deallocate a pointer this resource never handed out.
        with pytest.raises(ValueError, match="was not allocated by this memory resource"):
            mr.deallocate(0x12345678, 1024)