From 35d7dd56c3551965d83115577da8d931daed8868 Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Wed, 10 Sep 2025 13:22:18 -0700
Subject: [PATCH 01/35] commit initial draft

---
 cuda_core/cuda/core/experimental/_memory.pyx | 146 +++++++++++++++++++
 1 file changed, 146 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 44e7a77c7..22e6b87ae 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -508,3 +508,149 @@ class _SynchronousMemoryResource(MemoryResource):
     @property
     def device_id(self) -> int:
         return self._dev_id
+
+@dataclass
+class VMMConfig:
+    """A configuration object for the VMMAllocatedMemoryResource.
+
+    Stores configuration information which tells the resource how to use the
+    CUDA VMM APIs.
+
+    Args:
+        handle_type: Export handle type for the physical allocation. Use
+            CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR on Linux if you plan to
+            import/export the allocation (required for cuMemRetainAllocationHandle).
+            Use CU_MEM_HANDLE_TYPE_NONE if you don't need an exportable handle.
+        gpu_direct_rdma: Hint that the allocation should be GDR-capable (if supported).
+        granularity: 'recommended' or 'minimum'. Controls the granularity query and size rounding.
+        addr_hint: An optional virtual address hint to try to reserve at. 0 -> let CUDA choose.
+        addr_align: Alignment for the VA reservation. If None, use the queried granularity.
+        peers: Extra device IDs that should be granted access in addition to `device`.
+        self_access: Access flags for the owning device ('rw', 'r', or 'none').
+        peer_access: Access flags for peers ('rw' or 'r').
+    """
+    handle_type: int  # driver.CUmemAllocationHandleType
+    gpu_direct_rdma: bool = True
+    granularity: str = "recommended"  # or "minimum"
+    addr_hint: int = 0
+    addr_align: Optional[int] = None
+    peers: Iterable[int] = field(default_factory=tuple)
+    self_access: str = "rw"  # 'rw' | 'r' | 'none'
+    peer_access: str = "rw"  # 'rw' | 'r'
+
+    def _granularity_flag(self, driver) -> int:
+        # Prefer the recommended granularity unless the user asked for the minimum
+        try:
+            flags = driver.CUmemAllocationGranularity_flags
+            return (flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM
+                    if self.granularity == "minimum"
+                    else flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)
+        except AttributeError:
+            # Fallback if enum names differ in your bindings
+            return 0
+
+    @staticmethod
+    def _access_to_flags(driver, spec: str) -> int:
+        f = driver.CUmemAccess_flags
+        if spec == "rw":
+            return f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
+        if spec == "r":
+            return f.CU_MEM_ACCESS_FLAGS_PROT_READ
+        if spec == "none":
+            return 0
+        raise ValueError(f"Unknown access spec: {spec!r}")
+
+
+class VMMAllocatedMemoryResource(MemoryResource):
+    """Create a device memory resource that uses the CUDA VMM APIs to allocate memory.
+
+    Parameters
+    ----------
+    device_id : int
+        Device ordinal for which a memory resource is constructed. The mempool that is
+        set to *current* on ``device_id`` is used. If no mempool is set to current yet,
+        the driver would use the *default* mempool on the device.
+
+    config : VMMConfig
+    """
+
+    __slots__ = ("_dev_id",)
+
+    def __init__(self, device_id: int):
+        err, self._handle = driver.cuDeviceGetMemPool(device_id)
+        raise_if_driver_error(err)
+        self._dev_id = device_id
+
+        # Set a higher release threshold to improve performance when there are no active allocations.
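+        # (The release threshold is the amount of reserved memory, in bytes, that
+        # the pool holds on to before trying to release memory back to the OS.)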
+        # By default, the release threshold is 0, which means memory is immediately released back
+        # to the OS when there are no active suballocations, causing performance issues.
+        # Check the current release threshold
+        err, current_threshold = driver.cuMemPoolGetAttribute(
+            self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD
+        )
+        raise_if_driver_error(err)
+        # If the threshold is 0 (the default), set it to the maximum to retain memory in the pool
+        if int(current_threshold) == 0:
+            err, = driver.cuMemPoolSetAttribute(
+                self._handle,
+                driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+                driver.cuuint64_t(0xFFFFFFFFFFFFFFFF),
+            )
+            raise_if_driver_error(err)
+
+    def allocate(self, size_t size, stream: Stream = None) -> Buffer:
+        """Allocate a buffer of the requested size.
+
+        Parameters
+        ----------
+        size : int
+            The size of the buffer to allocate, in bytes.
+        stream : Stream, optional
+            The stream on which to perform the allocation asynchronously.
+            If None, an internal stream is used.
+
+        Returns
+        -------
+        Buffer
+            The allocated buffer object, which is accessible on the device that this memory
+            resource was created for.
+        """
+        if stream is None:
+            stream = default_stream()
+        err, ptr = driver.cuMemAllocFromPoolAsync(size, self._handle, stream.handle)
+        raise_if_driver_error(err)
+        return Buffer._init(ptr, size, self)
+
+    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None):
+        """Deallocate a buffer previously allocated by this resource.
+
+        Parameters
+        ----------
+        ptr : :obj:`~_memory.DevicePointerT`
+            The pointer or handle to the buffer to deallocate.
+        size : int
+            The size of the buffer to deallocate, in bytes.
+        stream : Stream, optional
+            The stream on which to perform the deallocation asynchronously.
+            If None, an internal stream is used.
+        """
+        if stream is None:
+            stream = default_stream()
+        err, = driver.cuMemFreeAsync(ptr, stream.handle)
+        raise_if_driver_error(err)
+
+    @property
+    def is_device_accessible(self) -> bool:
+        """bool: this memory resource provides device-accessible buffers."""
+        return True
+
+    @property
+    def is_host_accessible(self) -> bool:
+        """bool: this memory resource does not provide host-accessible buffers."""
+        return False
+
+    @property
+    def device_id(self) -> int:
+        """int: the associated device ordinal."""
+        return self._dev_id

From 1de97e2897f1a2f79dd84a87742ac992a6c0021b Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Fri, 12 Sep 2025 10:18:39 -0700
Subject: [PATCH 02/35] add modification/growing option

---
 cuda_core/cuda/core/experimental/_memory.pyx | 420 +++++++++++++++----
 1 file changed, 348 insertions(+), 72 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 22e6b87ae..70880a69b 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -530,26 +530,18 @@ class VMMConfig:
         self_access: Access flags for the owning device ('rw', 'r', or 'none').
         peer_access: Access flags for peers ('rw' or 'r').
     """
-    handle_type: int  # driver.CUmemAllocationHandleType
+    # TODO: for enums, do we re-expose them as cuda-core Enums or leave them as driver enums?
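+    # These fields take the raw enums from cuda.bindings.driver, e.g.
+    # driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED for allocation_type.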
+ allocation_type: driver.CUmemAllocationType + location_type: driver.CUmemLocationType # Only supports CU_MEM_LOCATION_TYPE_DEVICE + handle_type: driver.CUmemAllocationHandleType gpu_direct_rdma: bool = True - granularity: str = "recommended" # or "minimum" - addr_hint: int = 0 + granularity: driver.CUmemAllocationGranularity_flags + addr_hint: Optional[int] = 0 addr_align: Optional[int] = None peers: Iterable[int] = field(default_factory=tuple) self_access: str = "rw" # 'rw' | 'r' | 'none' peer_access: str = "rw" # 'rw' | 'r' - def _granularity_flag(self, driver) -> int: - # Prefer recommended granularity unless user asked for minimum - try: - flags = driver.CUmemAllocationGranularity_flags - return (flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM - if self.granularity == "minimum" - else flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED) - except AttributeError: - # Fallback if enum names differ in your bindings - return 0 - @staticmethod def _access_to_flags(driver, spec: str) -> int: f = driver.CUmemAccess_flags @@ -573,84 +565,368 @@ class VMMAllocatedMemoryResource(MemoryResource): the driver would use the *default* mempool on the device. config : VMMConfig + A configuration object for the VMMAllocatedMemoryResource """ - - __slots__ = ("_dev_id",) - - def __init__(self, device_id: int): - err, self._handle = driver.cuDeviceGetMemPool(device_id) - raise_if_driver_error(err) - self._dev_id = device_id - - # Set a higher release threshold to improve performance when there are no active allocations. - # By default, the release threshold is 0, which means memory is immediately released back - # to the OS when there are no active suballocations, causing performance issues. - # Check current release threshold - err, current_threshold = driver.cuMemPoolGetAttribute( - self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD - ) - raise_if_driver_error(err) - # If threshold is 0 (default), set it to maximum to retain memory in the pool - if int(current_threshold) == 0: - err, = driver.cuMemPoolSetAttribute( - self._handle, - driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, - driver.cuuint64_t(0xFFFFFFFFFFFFFFFF), + def __init__(self, device, config: VMMConfig = None): + self.device = device + if config is None: + config = VMMConfig( + allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, + location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + gpu_direct_rdma=True, + granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, + addr_hint=0, + addr_align=None, + peers=(), + self_access="rw", + peer_access="rw", ) - raise_if_driver_error(err) + self.config = config - def allocate(self, size_t size, stream: Stream = None) -> Buffer: - """Allocate a buffer of the requested size. + def _align_up(self, size: int, gran: int) -> int: + """ + Align a size up to the nearest multiple of a granularity. + """ + return (size + gran - 1) & ~(gran - 1) + def modify_allocation(self, buf: Buffer, new_size: int, config: VMMConfig = None) -> Buffer: + """ + Grow an existing allocation using CUDA VMM, with a configurable policy. + + This implements true growing allocations that preserve the base pointer + by extending the virtual address range and mapping additional physical memory. + Parameters ---------- - size : int - The size of the buffer to allocate, in bytes. 
- stream : Stream, optional - The stream on which to perform the allocation asynchronously. - If None, an internal stream is used. - + buf : Buffer + The existing buffer to grow + new_size : int + The new total size for the allocation + config : VMMConfig, optional + Configuration for the new physical memory chunks. If None, uses current config. + Returns ------- Buffer - The allocated buffer object, which is accessible on the device that this memory - resource was created for. + The same buffer with updated size, preserving the original pointer """ - if stream is None: - stream = default_stream() - err, ptr = driver.cuMemAllocFromPoolAsync(size, self._handle, stream.handle) - raise_if_driver_error(err) - return Buffer._init(ptr, size, self) - - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): - """Deallocate a buffer previously allocated by this resource. + if new_size <= buf.size: + # No growth needed, return original buffer + return buf + + if config is not None: + self.config = config + + # Build allocation properties for new chunks + prop = driver.CUmemAllocationProp() + prop.type = self.config.allocation_type + prop.location.type = self.config.location_type + prop.location.id = self.device.device_id + prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 + prop.requestedHandleTypes = self.config.handle_type + + # Query granularity + gran_flag = self.config.granularity + res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) + if res != driver.CUresult.CUDA_SUCCESS: + raise Exception(f"cuMemGetAllocationGranularity failed: {res}") + + # Calculate sizes + additional_size = new_size - buf.size + aligned_additional_size = self._align_up(additional_size, gran) + total_aligned_size = self._align_up(new_size, gran) + addr_align = self.config.addr_align or gran + + # Try to extend the existing VA range first + res, new_ptr = driver.cuMemAddressReserve( + aligned_additional_size, + addr_align, + buf.ptr + buf.size, # fixedAddr hint - try to extend at end of current range + 0 + ) + + if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (buf.ptr + buf.size): + # Fallback: couldn't extend contiguously, need full remapping + return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) + else: + # Success! We can extend the VA range contiguously + return self._grow_allocation_fast_path(buf, new_size, prop, aligned_additional_size, new_ptr) - Parameters - ---------- - ptr : :obj:`~_memory.DevicePointerT` - The pointer or handle to the buffer to deallocate. - size : int - The size of the buffer to deallocate, in bytes. - stream : Stream, optional - The stream on which to perform the deallocation asynchronously. - If None, an internal stream is used. + def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, + aligned_additional_size: int, new_ptr: int) -> Buffer: """ - if stream is None: - stream = default_stream() - err, = driver.cuMemFreeAsync(ptr, stream.handle) - raise_if_driver_error(err) + Fast path: extend the VA range contiguously. + + This preserves the original pointer by mapping new physical memory + to the extended portion of the virtual address range. 
+ """ + # Create new physical memory for the additional size + res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(new_ptr, aligned_additional_size) + raise Exception(f"cuMemCreate failed: {res}") + + # Map the new physical memory to the extended VA range + res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(new_ptr, aligned_additional_size) + driver.cuMemRelease(new_handle) + raise Exception(f"cuMemMap failed: {res}") + + # Set access permissions for the new portion + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemUnmap(new_ptr, aligned_additional_size) + driver.cuMemAddressFree(new_ptr, aligned_additional_size) + driver.cuMemRelease(new_handle) + raise Exception(f"cuMemSetAccess failed: {res}") + + # Update the buffer size (pointer stays the same!) + buf._size = new_size + + return buf + + def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, + aligned_additional_size: int, total_aligned_size: int, addr_align: int) -> Buffer: + """ + Slow path: full remapping when contiguous extension fails. + + This creates a new VA range and remaps both old and new physical memory. + The buffer's pointer will change. + """ + # Reserve a completely new, larger VA range + res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) + if res != driver.CUresult.CUDA_SUCCESS: + raise Exception(f"cuMemAddressReserve failed: {res}") + + # Get the old allocation handle for remapping + result, old_handle = driver.cuMemRetainAllocationHandle(buf.ptr) + if result != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(new_ptr, total_aligned_size) + raise Exception(f"Failed to retain old allocation handle: {result}") + + # Unmap the old VA range + result, = driver.cuMemUnmap(buf.ptr, buf.size) + if result != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(new_ptr, total_aligned_size) + driver.cuMemRelease(old_handle) + raise Exception(f"Failed to unmap old allocation: {result}") + + # Remap the old physical memory to the new VA range + res, = driver.cuMemMap(new_ptr, buf.size, 0, old_handle, 0) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(new_ptr, total_aligned_size) + driver.cuMemRelease(old_handle) + raise Exception(f"cuMemMap failed for old memory: {res}") + + # Create new physical memory for the additional size + res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemUnmap(new_ptr, total_aligned_size) + driver.cuMemAddressFree(new_ptr, total_aligned_size) + driver.cuMemRelease(old_handle) + raise Exception(f"cuMemCreate failed for new memory: {res}") + + # Map the new physical memory to the extended portion + res, = driver.cuMemMap(new_ptr + buf.size, aligned_additional_size, 0, new_handle, 0) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemUnmap(new_ptr, total_aligned_size) + driver.cuMemAddressFree(new_ptr, total_aligned_size) + driver.cuMemRelease(old_handle) + driver.cuMemRelease(new_handle) + raise Exception(f"cuMemMap failed for new memory: {res}") + + # Set access permissions for the entire new range + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(new_ptr, 
total_aligned_size, descs, len(descs)) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemUnmap(new_ptr, total_aligned_size) + driver.cuMemAddressFree(new_ptr, total_aligned_size) + driver.cuMemRelease(old_handle) + driver.cuMemRelease(new_handle) + raise Exception(f"cuMemSetAccess failed: {res}") + + # Free the old VA range + driver.cuMemAddressFree(buf.ptr, buf.size) + + # Update the buffer with new pointer and size + buf._ptr = new_ptr + buf._size = total_aligned_size + buf._ptr_obj = new_ptr + + return buf + + def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: + """ + Build access descriptors for memory access permissions. + + Returns + ------- + list + List of CUmemAccessDesc objects for setting memory access + """ + descs = [] + + # Owner access + owner_flags = VMMConfig._access_to_flags(driver, self.config.self_access) + if owner_flags: + d = driver.CUmemAccessDesc() + d.location.type = prop.location.type + d.location.id = prop.location.id + d.flags = owner_flags + descs.append(d) + + # Peer device access + peer_flags = VMMConfig._access_to_flags(driver, self.config.peer_access) + for peer_dev in self.config.peers: + if peer_flags: + d = driver.CUmemAccessDesc() + d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + d.location.id = int(peer_dev) + d.flags = peer_flags + descs.append(d) + + return descs + + + def allocate(self, size: int, stream: Stream = None) -> Buffer: + """ + Allocate memory using CUDA VMM with a configurable policy. + """ + config = self.config + # ---- Build allocation properties ---- + prop = driver.CUmemAllocationProp() + prop.type = config.allocation_type + # TODO: Support host alloation if required + if config.location_type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: + raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") + prop.location.type = config.location_type + prop.location.id = self.device.device_id + prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 + prop.requestedHandleTypes = config.handle_type + + # ---- Query and apply granularity ---- + # Choose min vs recommended granularity per config + gran_flag = config.granularity + res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) + if res != driver.CUresult.CUDA_SUCCESS: + raise Exception(f"cuMemGetAllocationGranularity failed: {res}") + + aligned_size = self._align_up(size, gran) + addr_align = config.addr_align or gran + + # ---- Create physical memory ---- + res, handle = driver.cuMemCreate(aligned_size, prop, 0) + if res != driver.CUresult.CUDA_SUCCESS: + raise Exception(f"cuMemCreate failed: {res}") + + # ---- Reserve VA space ---- + # Potentially, use a separate size for the VA reservation from the physical allocation size + res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) + if res != driver.CUresult.CUDA_SUCCESS: + # tidy up physical handle on failure + driver.cuMemRelease(handle) + raise Exception(f"cuMemAddressReserve failed: {res}") + + # ---- Map physical memory into VA ---- + res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(ptr, aligned_size) + driver.cuMemRelease(handle) + raise Exception(f"cuMemMap failed: {res}") + + # ---- Set access for owner + peers ---- + descs = [] + + # Owner access + owner_flags = VMMAllocationConfig._access_to_flags(driver, config.self_access) + if owner_flags: + d = driver.CUmemAccessDesc() + 
d.location.type = prop.location.type
+            d.location.id = prop.location.id
+            d.flags = owner_flags
+            descs.append(d)
+
+        # Peer device access
+        peer_flags = VMMAllocationConfig._access_to_flags(driver, config.peer_access)
+        for peer_dev in config.peers:
+            if peer_flags:
+                d = driver.CUmemAccessDesc()
+                d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+                d.location.id = int(peer_dev)
+                d.flags = peer_flags
+                descs.append(d)
+
+        if descs:
+            res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs))
+            if res != driver.CUresult.CUDA_SUCCESS:
+                # Try to unwind on failure
+                driver.cuMemUnmap(ptr, aligned_size)
+                driver.cuMemAddressFree(ptr, aligned_size)
+                driver.cuMemRelease(handle)
+                raise Exception(f"cuMemSetAccess failed: {res}")
+
+        # Done — return a Buffer that tracks this VA range
+        buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self)
+        return buf
+
+    def deallocate(self, ptr: int, size: int, stream: Stream = None) -> None:
+        """
+        Deallocate memory on the device using CUDA VMM APIs.
+        """
+        result, handle = driver.cuMemRetainAllocationHandle(ptr)
+        if result != driver.CUresult.CUDA_SUCCESS:
+            raise Exception(f"Failed to retain allocation handle: {result}")
+        result, = driver.cuMemUnmap(ptr, size)
+        if result != driver.CUresult.CUDA_SUCCESS:
+            raise Exception(f"Failed to unmap physical allocation: {result}")
+        result, = driver.cuMemAddressFree(ptr, size)
+        if result != driver.CUresult.CUDA_SUCCESS:
+            raise Exception(f"Failed to free address: {result}")
+        result, = driver.cuMemRelease(handle)
+        if result != driver.CUresult.CUDA_SUCCESS:
+            raise Exception(f"Failed to release physical allocation: {result}")
+
     @property
     def is_device_accessible(self) -> bool:
-        """bool: this memory resource provides device-accessible buffers."""
+        """
+        Indicates whether the allocated memory is accessible from the device.
+
+        Returns:
+            bool: Always True for VMM-allocated memory.
+        """
         return True
 
     @property
     def is_host_accessible(self) -> bool:
-        """bool: this memory resource does not provide host-accessible buffers."""
+        """
+        Indicates whether the allocated memory is accessible from the host.
+
+        Returns:
+            bool: Always False for VMM-allocated memory.
+        """
         return False
 
     @property
     def device_id(self) -> int:
-        """int: the associated device ordinal."""
-        return self._dev_id
+        """
+        Get the device ID associated with this memory resource.
+
+        Returns:
+            int: CUDA device ID.
+        """
+        return self.device.device_id
+
+    def __repr__(self) -> str:
+        """
+        Return a string representation of the VMMAllocatedMemoryResource.
+
+        Returns:
+            str: A string describing the object
+        """
+        return f"<VMMAllocatedMemoryResource(device={self.device})>"

From e7fd8d086efbd952a6ede09378f2e628ebfb64f8 Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Fri, 12 Sep 2025 10:22:02 -0700
Subject: [PATCH 03/35] add tests

---
 cuda_core/tests/test_memory.py | 126 +++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 491521ff9..bc9a8386a 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -283,3 +283,129 @@ def test_device_memory_resource_initialization():
     assert buffer.size == 1024
     assert buffer.device_id == device.device_id
     buffer.close()
+
+    def test_vmm_allocator_basic_allocation():
+        """Test basic VMM allocation functionality.
+
+        This test verifies that VMMAllocatedMemoryResource can allocate memory
+        using CUDA VMM APIs with default configuration.
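+
+        A minimal usage sketch (assumes a current CUDA device, names as in
+        this module):
+
+            vmm_mr = VMMAllocatedMemoryResource(device)
+            buf = vmm_mr.allocate(4096)  # may be rounded up to the granularity
+            buf.close()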
+ """ + device = Device() + device.set_current() + + # Create VMM allocator with default config + vmm_mr = VMMAllocatedMemoryResource(device) + + # Test basic allocation + buffer = vmm_mr.allocate(4096) + assert buffer.size >= 4096 # May be aligned up + assert buffer.device_id == device.device_id + assert buffer.memory_resource == vmm_mr + + # Test deallocation + buffer.close() + + # Test multiple allocations + buffers = [] + for i in range(5): + buf = vmm_mr.allocate(1024 * (i + 1)) + buffers.append(buf) + assert buf.size >= 1024 * (i + 1) + + # Clean up + for buf in buffers: + buf.close() + + def test_vmm_allocator_policy_configuration(): + """Test VMM allocator with different policy configurations. + + This test verifies that VMMAllocatedMemoryResource can be configured + with different allocation policies and that the configuration affects + the allocation behavior. + """ + device = Device() + device.set_current() + + # Test with custom VMM config + custom_config = VMMConfig( + allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, + location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + granularity=driver.CUmemAllocationGranularity.CU_MEM_ALLOC_GRANULARITY_MINIMUM, + gpu_direct_rdma=True, + handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC, + peers=(), + self_access="rw", + peer_access="rw", + ) + + vmm_mr = VMMAllocatedMemoryResource(device, config=custom_config) + + # Verify configuration is applied + assert vmm_mr.config == custom_config + assert vmm_mr.config.gpu_direct_rdma is True + assert vmm_mr.config.granularity == driver.CUmemAllocationGranularity.CU_MEM_ALLOC_GRANULARITY_MINIMUM + + # Test allocation with custom config + buffer = vmm_mr.allocate(8192) + assert buffer.size >= 8192 + assert buffer.device_id == device.device_id + + # Test policy modification + new_config = VMMConfig( + allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, + location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + granularity=driver.CUmemAllocationGranularity.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, + gpu_direct_rdma=False, + handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC, + peers=(), + self_access="r", # Read-only access + peer_access="r", + ) + + # Modify allocation policy + modified_buffer = vmm_mr.modify_allocation(buffer, 16384, config=new_config) + assert modified_buffer.size >= 16384 + assert vmm_mr.config == new_config + assert vmm_mr.config.self_access == "r" + + # Clean up + modified_buffer.close() + + def test_vmm_allocator_grow_allocation(): + """Test VMM allocator's ability to grow existing allocations. + + This test verifies that VMMAllocatedMemoryResource can grow existing + allocations while preserving the base pointer when possible. 
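+
+        The growth pattern exercised here, as a sketch:
+
+            buf = vmm_mr.allocate(4096)
+            buf = vmm_mr.modify_allocation(buf, 8192)
+            # the fast path keeps the original pointer; the slow path may remap
+            # the memory to a new virtual address range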
+ """ + device = Device() + device.set_current() + + vmm_mr = VMMAllocatedMemoryResource(device) + + # Create initial allocation + buffer = vmm_mr.allocate(4096) + original_ptr = buffer.handle + original_size = buffer.size + + # Grow the allocation + grown_buffer = vmm_mr.modify_allocation(buffer, 8192) + + # Verify growth + assert grown_buffer.size >= 8192 + assert grown_buffer.size > original_size + + # The pointer should ideally be preserved (fast path) + # but may change if contiguous extension fails (slow path) + assert grown_buffer.handle is not None + + # Test growing to same size (should return original buffer) + same_buffer = vmm_mr.modify_allocation(grown_buffer, 8192) + assert same_buffer is grown_buffer + + # Test growing to smaller size (should return original buffer) + smaller_buffer = vmm_mr.modify_allocation(grown_buffer, 4096) + assert smaller_buffer is grown_buffer + + # Clean up + grown_buffer.close() + From c941700438191acd689305fdf2f8c0d51465ff13 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 12 Sep 2025 15:17:20 -0700 Subject: [PATCH 04/35] Add tests and make them pass --- cuda_core/cuda/core/experimental/__init__.py | 2 +- cuda_core/cuda/core/experimental/_memory.pyx | 85 +++---- cuda_core/tests/test_memory.py | 250 +++++++++---------- 3 files changed, 165 insertions(+), 172 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index fffb80a5c..af06f4393 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -14,7 +14,7 @@ from cuda.core.experimental._launch_config import LaunchConfig from cuda.core.experimental._launcher import launch from cuda.core.experimental._linker import Linker, LinkerOptions -from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource +from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource, VMMAllocatedMemoryResource, VMMConfig from cuda.core.experimental._module import Kernel, ObjectCode from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._stream import Stream, StreamOptions diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 70880a69b..0f4bd0efd 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -11,7 +11,8 @@ from cuda.core.experimental._utils.cuda_utils cimport ( ) import abc -from typing import TypeVar, Union +from typing import TypeVar, Union, Optional, Iterable +from dataclasses import dataclass, field from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream @@ -531,11 +532,11 @@ class VMMConfig: peer_access: Access flags for peers ('rw' or 'r'). """ # TODO: for enums, do we re-expose them as cuda-core Enums or leave them as driver enums? 
- allocation_type: driver.CUmemAllocationType - location_type: driver.CUmemLocationType # Only supports CU_MEM_LOCATION_TYPE_DEVICE - handle_type: driver.CUmemAllocationHandleType + allocation_type: driver.CUmemAllocationType = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED + location_type: driver.CUmemLocationType = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + handle_type: driver.CUmemAllocationHandleType = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR + granularity: driver.CUmemAllocationGranularity_flags = driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED gpu_direct_rdma: bool = True - granularity: driver.CUmemAllocationGranularity_flags addr_hint: Optional[int] = 0 addr_align: Optional[int] = None peers: Iterable[int] = field(default_factory=tuple) @@ -543,7 +544,7 @@ class VMMConfig: peer_access: str = "rw" # 'rw' | 'r' @staticmethod - def _access_to_flags(driver, spec: str) -> int: + def _access_to_flags(driver, spec: str): f = driver.CUmemAccess_flags if spec == "rw": return f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE @@ -569,20 +570,7 @@ class VMMAllocatedMemoryResource(MemoryResource): """ def __init__(self, device, config: VMMConfig = None): self.device = device - if config is None: - config = VMMConfig( - allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, - location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, - handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, - gpu_direct_rdma=True, - granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, - addr_hint=0, - addr_align=None, - peers=(), - self_access="rw", - peer_access="rw", - ) - self.config = config + self.config = config or VMMConfig() def _align_up(self, size: int, gran: int) -> int: """ @@ -610,14 +598,14 @@ class VMMAllocatedMemoryResource(MemoryResource): ------- Buffer The same buffer with updated size, preserving the original pointer - """ + """ + if config is not None: + self.config = config + if new_size <= buf.size: # No growth needed, return original buffer return buf - if config is not None: - self.config = config - # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() prop.type = self.config.allocation_type @@ -636,17 +624,18 @@ class VMMAllocatedMemoryResource(MemoryResource): additional_size = new_size - buf.size aligned_additional_size = self._align_up(additional_size, gran) total_aligned_size = self._align_up(new_size, gran) + aligned_prev_size = total_aligned_size - aligned_additional_size addr_align = self.config.addr_align or gran # Try to extend the existing VA range first res, new_ptr = driver.cuMemAddressReserve( - aligned_additional_size, - addr_align, - buf.ptr + buf.size, # fixedAddr hint - try to extend at end of current range + aligned_additional_size, + addr_align, + int(buf.handle) + aligned_prev_size, # fixedAddr hint - aligned end of current range 0 ) - if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (buf.ptr + buf.size): + if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): # Fallback: couldn't extend contiguously, need full remapping return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) else: @@ -703,20 +692,21 @@ class VMMAllocatedMemoryResource(MemoryResource): raise Exception(f"cuMemAddressReserve failed: {res}") # Get the old allocation handle for remapping - result, old_handle = 
driver.cuMemRetainAllocationHandle(buf.ptr) + result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) if result != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, total_aligned_size) raise Exception(f"Failed to retain old allocation handle: {result}") - # Unmap the old VA range - result, = driver.cuMemUnmap(buf.ptr, buf.size) + # Unmap the old VA range (aligned previous size) + aligned_prev_size = total_aligned_size - aligned_additional_size + result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) if result != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, total_aligned_size) driver.cuMemRelease(old_handle) raise Exception(f"Failed to unmap old allocation: {result}") - # Remap the old physical memory to the new VA range - res, = driver.cuMemMap(new_ptr, buf.size, 0, old_handle, 0) + # Remap the old physical memory to the new VA range (aligned previous size) + res, = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) if res != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, total_aligned_size) driver.cuMemRelease(old_handle) @@ -730,8 +720,8 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemRelease(old_handle) raise Exception(f"cuMemCreate failed for new memory: {res}") - # Map the new physical memory to the extended portion - res, = driver.cuMemMap(new_ptr + buf.size, aligned_additional_size, 0, new_handle, 0) + # Map the new physical memory to the extended portion (aligned offset) + res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) if res != driver.CUresult.CUDA_SUCCESS: driver.cuMemUnmap(new_ptr, total_aligned_size) driver.cuMemAddressFree(new_ptr, total_aligned_size) @@ -750,15 +740,18 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemRelease(new_handle) raise Exception(f"cuMemSetAccess failed: {res}") - # Free the old VA range - driver.cuMemAddressFree(buf.ptr, buf.size) - - # Update the buffer with new pointer and size - buf._ptr = new_ptr - buf._size = total_aligned_size - buf._ptr_obj = new_ptr - - return buf + # Free the old VA range (aligned previous size) + driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + + # Invalidate the old buffer so its destructor won't try to free again + buf._ptr = 0 + buf._ptr_obj = None + buf._size = 0 + buf._mr = None + + # Return a new Buffer for the new mapping + return Buffer.from_handle(ptr=new_ptr, size=new_size, mr=self) + def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: """ @@ -843,7 +836,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs = [] # Owner access - owner_flags = VMMAllocationConfig._access_to_flags(driver, config.self_access) + owner_flags = VMMConfig._access_to_flags(driver, config.self_access) if owner_flags: d = driver.CUmemAccessDesc() d.location.type = prop.location.type @@ -852,7 +845,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs.append(d) # Peer device access - peer_flags = VMMAllocationConfig._access_to_flags(driver, config.peer_access) + peer_flags = VMMConfig._access_to_flags(driver, config.peer_access) for peer_dev in config.peers: if peer_flags: d = driver.CUmemAccessDesc() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index bc9a8386a..0ceef8f27 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -10,7 +10,7 @@ import pytest -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource +from 
cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource, VMMAllocatedMemoryResource, VMMConfig from cuda.core.experimental._memory import DLDeviceType from cuda.core.experimental._utils.cuda_utils import handle_return @@ -284,128 +284,128 @@ def test_device_memory_resource_initialization(): assert buffer.device_id == device.device_id buffer.close() - def test_vmm_allocator_basic_allocation(): - """Test basic VMM allocation functionality. - - This test verifies that VMMAllocatedMemoryResource can allocate memory - using CUDA VMM APIs with default configuration. - """ - device = Device() - device.set_current() - - # Create VMM allocator with default config - vmm_mr = VMMAllocatedMemoryResource(device) - - # Test basic allocation - buffer = vmm_mr.allocate(4096) - assert buffer.size >= 4096 # May be aligned up - assert buffer.device_id == device.device_id - assert buffer.memory_resource == vmm_mr - - # Test deallocation - buffer.close() - - # Test multiple allocations - buffers = [] - for i in range(5): - buf = vmm_mr.allocate(1024 * (i + 1)) - buffers.append(buf) - assert buf.size >= 1024 * (i + 1) - - # Clean up - for buf in buffers: - buf.close() - - def test_vmm_allocator_policy_configuration(): - """Test VMM allocator with different policy configurations. - - This test verifies that VMMAllocatedMemoryResource can be configured - with different allocation policies and that the configuration affects - the allocation behavior. - """ - device = Device() - device.set_current() - - # Test with custom VMM config - custom_config = VMMConfig( - allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, - location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, - granularity=driver.CUmemAllocationGranularity.CU_MEM_ALLOC_GRANULARITY_MINIMUM, - gpu_direct_rdma=True, - handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC, - peers=(), - self_access="rw", - peer_access="rw", - ) - - vmm_mr = VMMAllocatedMemoryResource(device, config=custom_config) - - # Verify configuration is applied - assert vmm_mr.config == custom_config - assert vmm_mr.config.gpu_direct_rdma is True - assert vmm_mr.config.granularity == driver.CUmemAllocationGranularity.CU_MEM_ALLOC_GRANULARITY_MINIMUM - - # Test allocation with custom config - buffer = vmm_mr.allocate(8192) - assert buffer.size >= 8192 - assert buffer.device_id == device.device_id - - # Test policy modification - new_config = VMMConfig( - allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, - location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, - granularity=driver.CUmemAllocationGranularity.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, - gpu_direct_rdma=False, - handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC, - peers=(), - self_access="r", # Read-only access - peer_access="r", - ) - - # Modify allocation policy - modified_buffer = vmm_mr.modify_allocation(buffer, 16384, config=new_config) - assert modified_buffer.size >= 16384 - assert vmm_mr.config == new_config - assert vmm_mr.config.self_access == "r" - - # Clean up - modified_buffer.close() - - def test_vmm_allocator_grow_allocation(): - """Test VMM allocator's ability to grow existing allocations. - - This test verifies that VMMAllocatedMemoryResource can grow existing - allocations while preserving the base pointer when possible. 
- """ - device = Device() - device.set_current() - - vmm_mr = VMMAllocatedMemoryResource(device) - - # Create initial allocation - buffer = vmm_mr.allocate(4096) - original_ptr = buffer.handle - original_size = buffer.size - - # Grow the allocation - grown_buffer = vmm_mr.modify_allocation(buffer, 8192) - - # Verify growth - assert grown_buffer.size >= 8192 - assert grown_buffer.size > original_size - - # The pointer should ideally be preserved (fast path) - # but may change if contiguous extension fails (slow path) - assert grown_buffer.handle is not None - - # Test growing to same size (should return original buffer) - same_buffer = vmm_mr.modify_allocation(grown_buffer, 8192) - assert same_buffer is grown_buffer - - # Test growing to smaller size (should return original buffer) - smaller_buffer = vmm_mr.modify_allocation(grown_buffer, 4096) - assert smaller_buffer is grown_buffer - - # Clean up - grown_buffer.close() +def test_vmm_allocator_basic_allocation(): + """Test basic VMM allocation functionality. + + This test verifies that VMMAllocatedMemoryResource can allocate memory + using CUDA VMM APIs with default configuration. + """ + device = Device() + device.set_current() + + # Create VMM allocator with default config + vmm_mr = VMMAllocatedMemoryResource(device) + + # Test basic allocation + buffer = vmm_mr.allocate(4096) + assert buffer.size >= 4096 # May be aligned up + assert buffer.device_id == device.device_id + assert buffer.memory_resource == vmm_mr + + # Test deallocation + buffer.close() + + # Test multiple allocations + buffers = [] + for i in range(5): + buf = vmm_mr.allocate(1024 * (i + 1)) + buffers.append(buf) + assert buf.size >= 1024 * (i + 1) + + # Clean up + for buf in buffers: + buf.close() + +def test_vmm_allocator_policy_configuration(): + """Test VMM allocator with different policy configurations. + + This test verifies that VMMAllocatedMemoryResource can be configured + with different allocation policies and that the configuration affects + the allocation behavior. 
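+
+    Configuration sketch (field values are illustrative; the enums come from
+    cuda.bindings.driver):
+
+        cfg = VMMConfig(
+            granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM,
+        )
+        vmm_mr = VMMAllocatedMemoryResource(device, config=cfg)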
+ """ + device = Device() + device.set_current() + + # Test with custom VMM config + custom_config = VMMConfig( + allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, + location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM, + gpu_direct_rdma=True, + handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + peers=(), + self_access="rw", + peer_access="rw", + ) + + vmm_mr = VMMAllocatedMemoryResource(device, config=custom_config) + + # Verify configuration is applied + assert vmm_mr.config == custom_config + assert vmm_mr.config.gpu_direct_rdma is True + assert vmm_mr.config.granularity == driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM + + # Test allocation with custom config + buffer = vmm_mr.allocate(8192) + assert buffer.size >= 8192 + assert buffer.device_id == device.device_id + + # Test policy modification + new_config = VMMConfig( + allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, + location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, + gpu_direct_rdma=False, + handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + peers=(), + self_access="r", # Read-only access + peer_access="r", + ) + + # Modify allocation policy + modified_buffer = vmm_mr.modify_allocation(buffer, 16384, config=new_config) + assert modified_buffer.size >= 16384 + assert vmm_mr.config == new_config + assert vmm_mr.config.self_access == "r" + + # Clean up + modified_buffer.close() + +def test_vmm_allocator_grow_allocation(): + """Test VMM allocator's ability to grow existing allocations. + + This test verifies that VMMAllocatedMemoryResource can grow existing + allocations while preserving the base pointer when possible. 
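+
+    Expected behavior, as a sketch:
+
+        grown = vmm_mr.modify_allocation(buf, new_size)  # grows when new_size > buf.size
+        same = vmm_mr.modify_allocation(grown, grown.size)  # shrink/no-op returns the buffer as-is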
+ """ + device = Device() + device.set_current() + + vmm_mr = VMMAllocatedMemoryResource(device) + + # Create initial allocation + buffer = vmm_mr.allocate(4096) + original_ptr = buffer.handle + original_size = buffer.size + + # Grow the allocation + grown_buffer = vmm_mr.modify_allocation(buffer, 8192) + + # Verify growth + assert grown_buffer.size >= 8192 + assert grown_buffer.size > original_size + + # The pointer should ideally be preserved (fast path) + # but may change if contiguous extension fails (slow path) + assert grown_buffer.handle is not None + + # Test growing to same size (should return original buffer) + same_buffer = vmm_mr.modify_allocation(grown_buffer, 8192) + assert same_buffer is grown_buffer + + # Test growing to smaller size (should return original buffer) + smaller_buffer = vmm_mr.modify_allocation(grown_buffer, 4096) + assert smaller_buffer is grown_buffer + + # Clean up + grown_buffer.close() From bb5de7f34b3f5d4083687054de806acf1d160b7d Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 12 Sep 2025 15:32:09 -0700 Subject: [PATCH 05/35] Fix format with pre-commit hooks --- cuda_core/tests/test_memory.py | 74 +++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 0ceef8f27..71f523189 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -10,7 +10,14 @@ import pytest -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource, VMMAllocatedMemoryResource, VMMConfig +from cuda.core.experimental import ( + Buffer, + Device, + DeviceMemoryResource, + MemoryResource, + VMMAllocatedMemoryResource, + VMMConfig, +) from cuda.core.experimental._memory import DLDeviceType from cuda.core.experimental._utils.cuda_utils import handle_return @@ -284,48 +291,50 @@ def test_device_memory_resource_initialization(): assert buffer.device_id == device.device_id buffer.close() + def test_vmm_allocator_basic_allocation(): """Test basic VMM allocation functionality. - + This test verifies that VMMAllocatedMemoryResource can allocate memory using CUDA VMM APIs with default configuration. """ device = Device() device.set_current() - + # Create VMM allocator with default config vmm_mr = VMMAllocatedMemoryResource(device) - + # Test basic allocation buffer = vmm_mr.allocate(4096) assert buffer.size >= 4096 # May be aligned up assert buffer.device_id == device.device_id assert buffer.memory_resource == vmm_mr - + # Test deallocation buffer.close() - + # Test multiple allocations buffers = [] for i in range(5): buf = vmm_mr.allocate(1024 * (i + 1)) buffers.append(buf) assert buf.size >= 1024 * (i + 1) - + # Clean up for buf in buffers: buf.close() + def test_vmm_allocator_policy_configuration(): """Test VMM allocator with different policy configurations. - + This test verifies that VMMAllocatedMemoryResource can be configured with different allocation policies and that the configuration affects the allocation behavior. 
""" device = Device() device.set_current() - + # Test with custom VMM config custom_config = VMMConfig( allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, @@ -337,19 +346,19 @@ def test_vmm_allocator_policy_configuration(): self_access="rw", peer_access="rw", ) - + vmm_mr = VMMAllocatedMemoryResource(device, config=custom_config) - + # Verify configuration is applied assert vmm_mr.config == custom_config assert vmm_mr.config.gpu_direct_rdma is True assert vmm_mr.config.granularity == driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM - + # Test allocation with custom config buffer = vmm_mr.allocate(8192) assert buffer.size >= 8192 assert buffer.device_id == device.device_id - + # Test policy modification new_config = VMMConfig( allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, @@ -361,51 +370,50 @@ def test_vmm_allocator_policy_configuration(): self_access="r", # Read-only access peer_access="r", ) - + # Modify allocation policy modified_buffer = vmm_mr.modify_allocation(buffer, 16384, config=new_config) assert modified_buffer.size >= 16384 assert vmm_mr.config == new_config assert vmm_mr.config.self_access == "r" - + # Clean up modified_buffer.close() + def test_vmm_allocator_grow_allocation(): """Test VMM allocator's ability to grow existing allocations. - + This test verifies that VMMAllocatedMemoryResource can grow existing allocations while preserving the base pointer when possible. """ device = Device() device.set_current() - + vmm_mr = VMMAllocatedMemoryResource(device) - + # Create initial allocation - buffer = vmm_mr.allocate(4096) - original_ptr = buffer.handle + buffer = vmm_mr.allocate(2 * 1024 * 1024) original_size = buffer.size - + # Grow the allocation - grown_buffer = vmm_mr.modify_allocation(buffer, 8192) - + grown_buffer = vmm_mr.modify_allocation(buffer, 4 * 1024 * 1024) + # Verify growth - assert grown_buffer.size >= 8192 + assert grown_buffer.size >= 4 * 1024 * 1024 assert grown_buffer.size > original_size - + # The pointer should ideally be preserved (fast path) # but may change if contiguous extension fails (slow path) assert grown_buffer.handle is not None - + # Test growing to same size (should return original buffer) - same_buffer = vmm_mr.modify_allocation(grown_buffer, 8192) - assert same_buffer is grown_buffer - + same_buffer = vmm_mr.modify_allocation(grown_buffer, 4 * 1024 * 1024) + assert same_buffer.size == grown_buffer.size + # Test growing to smaller size (should return original buffer) - smaller_buffer = vmm_mr.modify_allocation(grown_buffer, 4096) - assert smaller_buffer is grown_buffer - + smaller_buffer = vmm_mr.modify_allocation(grown_buffer, 2 * 1024 * 1024) + assert smaller_buffer.size == grown_buffer.size + # Clean up grown_buffer.close() - From 4517ca8980d67f2cec215d3922eb336515f7559e Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 12 Sep 2025 15:38:40 -0700 Subject: [PATCH 06/35] Fix format with pre-commit hooks --- cuda_core/cuda/core/experimental/__init__.py | 9 ++- cuda_core/cuda/core/experimental/_memory.pyx | 62 ++++++++++---------- cuda_core/tests/test_memory.py | 2 +- 3 files changed, 40 insertions(+), 33 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index af06f4393..536899308 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -14,7 +14,14 @@ from cuda.core.experimental._launch_config import LaunchConfig from 
cuda.core.experimental._launcher import launch from cuda.core.experimental._linker import Linker, LinkerOptions -from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource, VMMAllocatedMemoryResource, VMMConfig +from cuda.core.experimental._memory import ( + Buffer, + DeviceMemoryResource, + LegacyPinnedMemoryResource, + MemoryResource, + VMMAllocatedMemoryResource, + VMMConfig, +) from cuda.core.experimental._module import Kernel, ObjectCode from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._stream import Stream, StreamOptions diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 0f4bd0efd..39a5f9d7c 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -553,7 +553,7 @@ class VMMConfig: if spec == "none": return 0 raise ValueError(f"Unknown access spec: {spec!r}") - + class VMMAllocatedMemoryResource(MemoryResource): """Create a device memory resource that uses the CUDA VMM APIs to allocate memory. @@ -564,7 +564,7 @@ class VMMAllocatedMemoryResource(MemoryResource): Device ordinal for which a memory resource is constructed. The mempool that is set to *current* on ``device_id`` is used. If no mempool is set to current yet, the driver would use the *default* mempool on the device. - + config : VMMConfig A configuration object for the VMMAllocatedMemoryResource """ @@ -581,10 +581,10 @@ class VMMAllocatedMemoryResource(MemoryResource): def modify_allocation(self, buf: Buffer, new_size: int, config: VMMConfig = None) -> Buffer: """ Grow an existing allocation using CUDA VMM, with a configurable policy. - + This implements true growing allocations that preserve the base pointer by extending the virtual address range and mapping additional physical memory. - + Parameters ---------- buf : Buffer @@ -593,19 +593,19 @@ class VMMAllocatedMemoryResource(MemoryResource): The new total size for the allocation config : VMMConfig, optional Configuration for the new physical memory chunks. If None, uses current config. 
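+
+        Note: when the existing virtual address range cannot be extended in
+        place, the slow path remaps the allocation and the returned Buffer
+        may hold a different pointer than ``buf``.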
- + Returns ------- Buffer The same buffer with updated size, preserving the original pointer - """ + """ if config is not None: self.config = config - + if new_size <= buf.size: # No growth needed, return original buffer return buf - + # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() prop.type = self.config.allocation_type @@ -613,20 +613,20 @@ class VMMAllocatedMemoryResource(MemoryResource): prop.location.id = self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 prop.requestedHandleTypes = self.config.handle_type - + # Query granularity gran_flag = self.config.granularity res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) if res != driver.CUresult.CUDA_SUCCESS: raise Exception(f"cuMemGetAllocationGranularity failed: {res}") - + # Calculate sizes additional_size = new_size - buf.size aligned_additional_size = self._align_up(additional_size, gran) total_aligned_size = self._align_up(new_size, gran) aligned_prev_size = total_aligned_size - aligned_additional_size addr_align = self.config.addr_align or gran - + # Try to extend the existing VA range first res, new_ptr = driver.cuMemAddressReserve( aligned_additional_size, @@ -634,7 +634,7 @@ class VMMAllocatedMemoryResource(MemoryResource): int(buf.handle) + aligned_prev_size, # fixedAddr hint - aligned end of current range 0 ) - + if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): # Fallback: couldn't extend contiguously, need full remapping return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) @@ -642,11 +642,11 @@ class VMMAllocatedMemoryResource(MemoryResource): # Success! We can extend the VA range contiguously return self._grow_allocation_fast_path(buf, new_size, prop, aligned_additional_size, new_ptr) - def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, + def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, aligned_additional_size: int, new_ptr: int) -> Buffer: """ Fast path: extend the VA range contiguously. - + This preserves the original pointer by mapping new physical memory to the extended portion of the virtual address range. """ @@ -655,14 +655,14 @@ class VMMAllocatedMemoryResource(MemoryResource): if res != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, aligned_additional_size) raise Exception(f"cuMemCreate failed: {res}") - + # Map the new physical memory to the extended VA range res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) if res != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, aligned_additional_size) driver.cuMemRelease(new_handle) raise Exception(f"cuMemMap failed: {res}") - + # Set access permissions for the new portion descs = self._build_access_descriptors(prop) if descs: @@ -672,17 +672,17 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemAddressFree(new_ptr, aligned_additional_size) driver.cuMemRelease(new_handle) raise Exception(f"cuMemSetAccess failed: {res}") - + # Update the buffer size (pointer stays the same!) buf._size = new_size - + return buf def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, aligned_additional_size: int, total_aligned_size: int, addr_align: int) -> Buffer: """ Slow path: full remapping when contiguous extension fails. 
- + This creates a new VA range and remaps both old and new physical memory. The buffer's pointer will change. """ @@ -690,13 +690,13 @@ class VMMAllocatedMemoryResource(MemoryResource): res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) if res != driver.CUresult.CUDA_SUCCESS: raise Exception(f"cuMemAddressReserve failed: {res}") - + # Get the old allocation handle for remapping result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) if result != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, total_aligned_size) raise Exception(f"Failed to retain old allocation handle: {result}") - + # Unmap the old VA range (aligned previous size) aligned_prev_size = total_aligned_size - aligned_additional_size result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) @@ -704,14 +704,14 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemAddressFree(new_ptr, total_aligned_size) driver.cuMemRelease(old_handle) raise Exception(f"Failed to unmap old allocation: {result}") - + # Remap the old physical memory to the new VA range (aligned previous size) res, = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) if res != driver.CUresult.CUDA_SUCCESS: driver.cuMemAddressFree(new_ptr, total_aligned_size) driver.cuMemRelease(old_handle) raise Exception(f"cuMemMap failed for old memory: {res}") - + # Create new physical memory for the additional size res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) if res != driver.CUresult.CUDA_SUCCESS: @@ -719,7 +719,7 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemAddressFree(new_ptr, total_aligned_size) driver.cuMemRelease(old_handle) raise Exception(f"cuMemCreate failed for new memory: {res}") - + # Map the new physical memory to the extended portion (aligned offset) res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) if res != driver.CUresult.CUDA_SUCCESS: @@ -728,7 +728,7 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemRelease(old_handle) driver.cuMemRelease(new_handle) raise Exception(f"cuMemMap failed for new memory: {res}") - + # Set access permissions for the entire new range descs = self._build_access_descriptors(prop) if descs: @@ -739,7 +739,7 @@ class VMMAllocatedMemoryResource(MemoryResource): driver.cuMemRelease(old_handle) driver.cuMemRelease(new_handle) raise Exception(f"cuMemSetAccess failed: {res}") - + # Free the old VA range (aligned previous size) driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) @@ -756,14 +756,14 @@ class VMMAllocatedMemoryResource(MemoryResource): def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: """ Build access descriptors for memory access permissions. 
-        
+
         Returns
         -------
         list
             List of CUmemAccessDesc objects for setting memory access
         """
         descs = []
-        
+
         # Owner access
         owner_flags = VMMConfig._access_to_flags(driver, self.config.self_access)
         if owner_flags:
             d = driver.CUmemAccessDesc()
             d.location.type = prop.location.type
             d.location.id = prop.location.id
             d.flags = owner_flags
             descs.append(d)
-        
+
         # Peer device access
         peer_flags = VMMConfig._access_to_flags(driver, self.config.peer_access)
         for peer_dev in self.config.peers:
             if peer_flags:
                 d = driver.CUmemAccessDesc()
                 d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
                 d.location.id = int(peer_dev)
                 d.flags = peer_flags
                 descs.append(d)
-        
+
         return descs
-        
+
     def allocate(self, size: int, stream: Stream = None) -> Buffer:
         """
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 71f523189..129f46825 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1,6 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-
+# Dummy change
 try:
     from cuda.bindings import driver
 except ImportError:

From aa4f8df1825086a0acf947691541f1ffdcf93235 Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Mon, 15 Sep 2025 11:02:42 -0700
Subject: [PATCH 07/35] Expose enumerator options through VMMAllocationOptions
 rather than exporting driver enums

---
 cuda_core/cuda/core/experimental/__init__.py |  2 +-
 cuda_core/cuda/core/experimental/_memory.pyx | 95 ++++++++++++++------
 cuda_core/tests/test_memory.py               | 25 +++---
 3 files changed, 83 insertions(+), 39 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py
index 536899308..0cb515e05 100644
--- a/cuda_core/cuda/core/experimental/__init__.py
+++ b/cuda_core/cuda/core/experimental/__init__.py
@@ -20,7 +20,7 @@
     LegacyPinnedMemoryResource,
     MemoryResource,
     VMMAllocatedMemoryResource,
-    VMMConfig,
+    VMMAllocationOptions,
 )
 from cuda.core.experimental._module import Kernel, ObjectCode
 from cuda.core.experimental._program import Program, ProgramOptions
diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 39a5f9d7c..d25d23854 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -511,7 +511,7 @@ class _SynchronousMemoryResource(MemoryResource):
         return self._dev_id
 
 @dataclass
-class VMMConfig:
+class VMMAllocationOptions:
     """A configuration object for the VMMAllocatedMemoryResource
     Stores configuration information which tells the resource how to use the CUDA VMM APIs
     """
@@ -531,11 +531,11 @@ class VMMConfig:
         self_access: Access flags for the owning device ('rw', 'r', or 'none').
         peer_access: Access flags for peers ('rw' or 'r').
     """
-    # TODO: for enums, do we re-expose them as cuda-core Enums or leave them as driver enums?
- allocation_type: driver.CUmemAllocationType = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED - location_type: driver.CUmemLocationType = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - handle_type: driver.CUmemAllocationHandleType = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR - granularity: driver.CUmemAllocationGranularity_flags = driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED + # Human-friendly strings; normalized in __post_init__ + allocation_type: str = "pinned" # pinned + location_type: str = "device" # device + handle_type: str = "posix-fd" # posix-fd | generic | none + granularity: str = "recommended" # minimum | recommended gpu_direct_rdma: bool = True addr_hint: Optional[int] = 0 addr_align: Optional[int] = None @@ -544,7 +544,7 @@ class VMMConfig: peer_access: str = "rw" # 'rw' | 'r' @staticmethod - def _access_to_flags(driver, spec: str): + def _access_to_flags(spec: str): f = driver.CUmemAccess_flags if spec == "rw": return f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE @@ -554,6 +554,51 @@ class VMMConfig: return 0 raise ValueError(f"Unknown access spec: {spec!r}") + @staticmethod + def _allocation_type_to_driver(spec: str): + if spec == "pinned": + return driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED + if spec == "managed": + return driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED + raise ValueError(f"Unsupported allocation_type: {spec!r}") + + @staticmethod + def _location_type_to_driver(spec: str): + if spec == "device": + return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + if spec == "host": + return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST + if spec == "host-numa": + return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA + if spec == "host-numa-current": + return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT + raise ValueError(f"Unsupported location_type: {spec!r}") + + @staticmethod + def _handle_type_to_driver(spec: str): + if spec == "posix-fd": + return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR + if spec == "generic": + return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC + if spec == "none": + return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + if spec == "win32": + return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32 + if spec == "win32-kmt": + return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32_KMT + if spec == "fabric": + return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_FABRIC + raise ValueError(f"Unsupported handle_type: {spec!r}") + + @staticmethod + def _granularity_to_driver(spec: str): + f = driver.CUmemAllocationGranularity_flags + if spec == "minimum": + return f.CU_MEM_ALLOC_GRANULARITY_MINIMUM + if spec == "recommended": + return f.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED + raise ValueError(f"Unsupported granularity: {spec!r}") + class VMMAllocatedMemoryResource(MemoryResource): """Create a device memory resource that uses the CUDA VMM APIs to allocate memory. @@ -565,12 +610,12 @@ class VMMAllocatedMemoryResource(MemoryResource): set to *current* on ``device_id`` is used. If no mempool is set to current yet, the driver would use the *default* mempool on the device. 
- config : VMMConfig + config : VMMAllocationOptions A configuration object for the VMMAllocatedMemoryResource """ - def __init__(self, device, config: VMMConfig = None): + def __init__(self, device, config: VMMAllocationOptions = None): self.device = device - self.config = config or VMMConfig() + self.config = config or VMMAllocationOptions() def _align_up(self, size: int, gran: int) -> int: """ @@ -578,7 +623,7 @@ class VMMAllocatedMemoryResource(MemoryResource): """ return (size + gran - 1) & ~(gran - 1) - def modify_allocation(self, buf: Buffer, new_size: int, config: VMMConfig = None) -> Buffer: + def modify_allocation(self, buf: Buffer, new_size: int, config: VMMAllocationOptions = None) -> Buffer: """ Grow an existing allocation using CUDA VMM, with a configurable policy. @@ -591,7 +636,7 @@ class VMMAllocatedMemoryResource(MemoryResource): The existing buffer to grow new_size : int The new total size for the allocation - config : VMMConfig, optional + config : VMMAllocationOptions, optional Configuration for the new physical memory chunks. If None, uses current config. Returns @@ -608,14 +653,14 @@ class VMMAllocatedMemoryResource(MemoryResource): # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() - prop.type = self.config.allocation_type - prop.location.type = self.config.location_type + prop.type = VMMAllocationOptions._allocation_type_to_driver(self.config.allocation_type) + prop.location.type = VMMAllocationOptions._location_type_to_driver(self.config.location_type) prop.location.id = self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 - prop.requestedHandleTypes = self.config.handle_type + prop.requestedHandleTypes = VMMAllocationOptions._handle_type_to_driver(self.config.handle_type) # Query granularity - gran_flag = self.config.granularity + gran_flag = VMMAllocationOptions._granularity_to_driver(self.config.granularity) res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) if res != driver.CUresult.CUDA_SUCCESS: raise Exception(f"cuMemGetAllocationGranularity failed: {res}") @@ -765,7 +810,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs = [] # Owner access - owner_flags = VMMConfig._access_to_flags(driver, self.config.self_access) + owner_flags = VMMAllocationOptions._access_to_flags(self.config.self_access) if owner_flags: d = driver.CUmemAccessDesc() d.location.type = prop.location.type @@ -774,7 +819,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs.append(d) # Peer device access - peer_flags = VMMConfig._access_to_flags(driver, self.config.peer_access) + peer_flags = VMMAllocationOptions._access_to_flags(self.config.peer_access) for peer_dev in self.config.peers: if peer_flags: d = driver.CUmemAccessDesc() @@ -793,18 +838,18 @@ class VMMAllocatedMemoryResource(MemoryResource): config = self.config # ---- Build allocation properties ---- prop = driver.CUmemAllocationProp() - prop.type = config.allocation_type + prop.type = VMMAllocationOptions._allocation_type_to_driver(config.allocation_type) # TODO: Support host alloation if required - if config.location_type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: + if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") - prop.location.type = config.location_type + prop.location.type = VMMAllocationOptions._location_type_to_driver(config.location_type) prop.location.id = 
self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 - prop.requestedHandleTypes = config.handle_type + prop.requestedHandleTypes = VMMAllocationOptions._handle_type_to_driver(config.handle_type) # ---- Query and apply granularity ---- # Choose min vs recommended granularity per config - gran_flag = config.granularity + gran_flag = VMMAllocationOptions._granularity_to_driver(config.granularity) res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) if res != driver.CUresult.CUDA_SUCCESS: raise Exception(f"cuMemGetAllocationGranularity failed: {res}") @@ -836,7 +881,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs = [] # Owner access - owner_flags = VMMConfig._access_to_flags(driver, config.self_access) + owner_flags = VMMAllocationOptions._access_to_flags(config.self_access) if owner_flags: d = driver.CUmemAccessDesc() d.location.type = prop.location.type @@ -845,7 +890,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs.append(d) # Peer device access - peer_flags = VMMConfig._access_to_flags(driver, config.peer_access) + peer_flags = VMMAllocationOptions._access_to_flags(config.peer_access) for peer_dev in config.peers: if peer_flags: d = driver.CUmemAccessDesc() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 129f46825..8bf024760 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1,6 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Dummy change try: from cuda.bindings import driver except ImportError: @@ -16,7 +15,7 @@ DeviceMemoryResource, MemoryResource, VMMAllocatedMemoryResource, - VMMConfig, + VMMAllocationOptions, ) from cuda.core.experimental._memory import DLDeviceType from cuda.core.experimental._utils.cuda_utils import handle_return @@ -336,12 +335,12 @@ def test_vmm_allocator_policy_configuration(): device.set_current() # Test with custom VMM config - custom_config = VMMConfig( - allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, - location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, - granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM, + custom_config = VMMAllocationOptions( + allocation_type="pinned", + location_type="device", + granularity="minimum", gpu_direct_rdma=True, - handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + handle_type="posix-fd", peers=(), self_access="rw", peer_access="rw", @@ -352,7 +351,7 @@ def test_vmm_allocator_policy_configuration(): # Verify configuration is applied assert vmm_mr.config == custom_config assert vmm_mr.config.gpu_direct_rdma is True - assert vmm_mr.config.granularity == driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM + assert vmm_mr.config.granularity == "minimum" # Test allocation with custom config buffer = vmm_mr.allocate(8192) @@ -360,12 +359,12 @@ def test_vmm_allocator_policy_configuration(): assert buffer.device_id == device.device_id # Test policy modification - new_config = VMMConfig( - allocation_type=driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, - location_type=driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, - granularity=driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, + new_config = VMMAllocationOptions( + allocation_type="pinned", + location_type="device", + granularity="recommended", 
gpu_direct_rdma=False, - handle_type=driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + handle_type="posix-fd", peers=(), self_access="r", # Read-only access peer_access="r", From b1d99e55e0d43aa87b940dd2a6ed3d45b2f88281 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Thu, 18 Sep 2025 11:02:45 -0700 Subject: [PATCH 08/35] fix merge conflict --- cuda_core/tests/test_memory.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 9fcf63f94..21c8baf3d 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -10,7 +10,6 @@ import pytest -<<<<<<< HEAD from cuda.core.experimental import ( Buffer, Device, @@ -19,11 +18,8 @@ VMMAllocatedMemoryResource, VMMAllocationOptions, ) -from cuda.core.experimental._memory import DLDeviceType -======= from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor ->>>>>>> d8b4acc1838845d08eaa3f7248246af5244617a8 from cuda.core.experimental._utils.cuda_utils import handle_return POOL_SIZE = 2097152 # 2MB size From a9f41916d864c139e295a63550f90058c334f7f9 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Thu, 18 Sep 2025 11:44:12 -0700 Subject: [PATCH 09/35] fix pre-commit issues --- cuda_core/tests/test_memory.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 21c8baf3d..0887dd64d 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -9,6 +9,8 @@ import platform import pytest +from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor +from cuda.core.experimental._utils.cuda_utils import handle_return from cuda.core.experimental import ( Buffer, @@ -18,9 +20,6 @@ VMMAllocatedMemoryResource, VMMAllocationOptions, ) -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource -from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor -from cuda.core.experimental._utils.cuda_utils import handle_return POOL_SIZE = 2097152 # 2MB size From d1b3379d4e665e444e2c3b352b8f787693143578 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 18 Sep 2025 18:53:18 +0000 Subject: [PATCH 10/35] [pre-commit.ci] auto code formatting --- cuda_core/tests/test_memory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 0887dd64d..e66ef35c0 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -9,8 +9,6 @@ import platform import pytest -from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor -from cuda.core.experimental._utils.cuda_utils import handle_return from cuda.core.experimental import ( Buffer, @@ -20,6 +18,8 @@ VMMAllocatedMemoryResource, VMMAllocationOptions, ) +from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor +from cuda.core.experimental._utils.cuda_utils import handle_return POOL_SIZE = 2097152 # 2MB size From 071ab74d30c12c3e1023453bdc9bba45af65bd59 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Thu, 18 Sep 2025 14:54:42 -0700 Subject: [PATCH 11/35] Address Leo's first comments --- cuda_core/cuda/core/experimental/__init__.py | 4 +- cuda_core/cuda/core/experimental/_memory.pyx | 72 +++++++++++--------- 
cuda_core/tests/test_memory.py | 20 +++--- 3 files changed, 51 insertions(+), 45 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index d08a96538..bbf75ac85 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -20,8 +20,8 @@ IPCChannel, LegacyPinnedMemoryResource, MemoryResource, - VMMAllocatedMemoryResource, - VMMAllocationOptions, + VirtualMemoryResource, + VirtualMemoryResourceOptions, ) from cuda.core.experimental._module import Kernel, ObjectCode from cuda.core.experimental._program import Program, ProgramOptions diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index bdea9e823..fc7839d35 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -13,7 +13,7 @@ from cuda.core.experimental._utils.cuda_utils cimport ( from dataclasses import dataclass from typing import TypeVar, Union, TYPE_CHECKING import abc -from typing import TypeVar, Union, Optional, Iterable +from typing import TypeVar, Union, Optional, Iterable, Literal from dataclasses import dataclass, field import array import cython @@ -904,9 +904,15 @@ class _SynchronousMemoryResource(MemoryResource): def device_id(self) -> int: return self._dev_id +VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none"] +VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"] +VirtualMemoryGranularityT = Literal["minimum", "recommended"] +VirtualMemoryAccessTypeT = Literal["rw", "r", "none"] +VirtualMemoryAllocationTypeT = Literal["pinned", "managed"] + @dataclass -class VMMAllocationOptions: - """A configuration object for the VMMAllocatedMemoryResource +class VirtualMemoryResourceOptions: + """A configuration object for the VirtualMemoryResource Stores configuration information which tells the resource how to use the CUDA VMM APIs """ """ @@ -926,16 +932,16 @@ class VMMAllocationOptions: peer_access: Access flags for peers ('rw' or 'r'). 
""" # Human-friendly strings; normalized in __post_init__ - allocation_type: str = "pinned" # pinned - location_type: str = "device" # device - handle_type: str = "posix-fd" # posix-fd | generic | none - granularity: str = "recommended" # minimum | recommended + allocation_type: VirtualMemoryAllocationTypeT = "pinned" + location_type: VirtualMemoryLocationTypeT = "device" + handle_type: VirtualMemoryHandleTypeT = "posix_fd" + granularity: VirtualMemoryGranularityT = "recommended" gpu_direct_rdma: bool = True addr_hint: Optional[int] = 0 addr_align: Optional[int] = None peers: Iterable[int] = field(default_factory=tuple) - self_access: str = "rw" # 'rw' | 'r' | 'none' - peer_access: str = "rw" # 'rw' | 'r' + self_access: VirtualMemoryAccessTypeT = "rw" + peer_access: VirtualMemoryAccessTypeT = "rw" @staticmethod def _access_to_flags(spec: str): @@ -962,15 +968,15 @@ class VMMAllocationOptions: return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE if spec == "host": return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST - if spec == "host-numa": + if spec == "host_numa": return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA - if spec == "host-numa-current": + if spec == "host_numa_current": return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT raise ValueError(f"Unsupported location_type: {spec!r}") @staticmethod def _handle_type_to_driver(spec: str): - if spec == "posix-fd": + if spec == "posix_fd": return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR if spec == "generic": return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC @@ -978,7 +984,7 @@ class VMMAllocationOptions: return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE if spec == "win32": return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32 - if spec == "win32-kmt": + if spec == "win32_kmt": return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32_KMT if spec == "fabric": return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_FABRIC @@ -994,7 +1000,7 @@ class VMMAllocationOptions: raise ValueError(f"Unsupported granularity: {spec!r}") -class VMMAllocatedMemoryResource(MemoryResource): +class VirtualMemoryResource(MemoryResource): """Create a device memory resource that uses the CUDA VMM APIs to allocate memory. Parameters @@ -1004,12 +1010,12 @@ class VMMAllocatedMemoryResource(MemoryResource): set to *current* on ``device_id`` is used. If no mempool is set to current yet, the driver would use the *default* mempool on the device. - config : VMMAllocationOptions - A configuration object for the VMMAllocatedMemoryResource + config : VirtualMemoryResourceOptions + A configuration object for the VirtualMemoryResource """ - def __init__(self, device, config: VMMAllocationOptions = None): + def __init__(self, device, config: VirtualMemoryResourceOptions = None): self.device = device - self.config = config or VMMAllocationOptions() + self.config = config or VirtualMemoryResourceOptions() def _align_up(self, size: int, gran: int) -> int: """ @@ -1017,7 +1023,7 @@ class VMMAllocatedMemoryResource(MemoryResource): """ return (size + gran - 1) & ~(gran - 1) - def modify_allocation(self, buf: Buffer, new_size: int, config: VMMAllocationOptions = None) -> Buffer: + def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryResourceOptions = None) -> Buffer: """ Grow an existing allocation using CUDA VMM, with a configurable policy. 
@@ -1030,7 +1036,7 @@ class VMMAllocatedMemoryResource(MemoryResource): The existing buffer to grow new_size : int The new total size for the allocation - config : VMMAllocationOptions, optional + config : VirtualMemoryResourceOptions, optional Configuration for the new physical memory chunks. If None, uses current config. Returns @@ -1047,14 +1053,14 @@ class VMMAllocatedMemoryResource(MemoryResource): # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() - prop.type = VMMAllocationOptions._allocation_type_to_driver(self.config.allocation_type) - prop.location.type = VMMAllocationOptions._location_type_to_driver(self.config.location_type) + prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) + prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(self.config.location_type) prop.location.id = self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 - prop.requestedHandleTypes = VMMAllocationOptions._handle_type_to_driver(self.config.handle_type) + prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(self.config.handle_type) # Query granularity - gran_flag = VMMAllocationOptions._granularity_to_driver(self.config.granularity) + gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(self.config.granularity) res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) if res != driver.CUresult.CUDA_SUCCESS: raise Exception(f"cuMemGetAllocationGranularity failed: {res}") @@ -1204,7 +1210,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs = [] # Owner access - owner_flags = VMMAllocationOptions._access_to_flags(self.config.self_access) + owner_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.self_access) if owner_flags: d = driver.CUmemAccessDesc() d.location.type = prop.location.type @@ -1213,7 +1219,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs.append(d) # Peer device access - peer_flags = VMMAllocationOptions._access_to_flags(self.config.peer_access) + peer_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.peer_access) for peer_dev in self.config.peers: if peer_flags: d = driver.CUmemAccessDesc() @@ -1232,18 +1238,18 @@ class VMMAllocatedMemoryResource(MemoryResource): config = self.config # ---- Build allocation properties ---- prop = driver.CUmemAllocationProp() - prop.type = VMMAllocationOptions._allocation_type_to_driver(config.allocation_type) + prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) # TODO: Support host alloation if required if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") - prop.location.type = VMMAllocationOptions._location_type_to_driver(config.location_type) + prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) prop.location.id = self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 - prop.requestedHandleTypes = VMMAllocationOptions._handle_type_to_driver(config.handle_type) + prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(config.handle_type) # ---- Query and apply granularity ---- # Choose min vs recommended granularity per config - gran_flag = VMMAllocationOptions._granularity_to_driver(config.granularity) + gran_flag = 
VirtualMemoryResourceOptions._granularity_to_driver(config.granularity) res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) if res != driver.CUresult.CUDA_SUCCESS: raise Exception(f"cuMemGetAllocationGranularity failed: {res}") @@ -1275,7 +1281,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs = [] # Owner access - owner_flags = VMMAllocationOptions._access_to_flags(config.self_access) + owner_flags = VirtualMemoryResourceOptions._access_to_flags(config.self_access) if owner_flags: d = driver.CUmemAccessDesc() d.location.type = prop.location.type @@ -1284,7 +1290,7 @@ class VMMAllocatedMemoryResource(MemoryResource): descs.append(d) # Peer device access - peer_flags = VMMAllocationOptions._access_to_flags(config.peer_access) + peer_flags = VirtualMemoryResourceOptions._access_to_flags(config.peer_access) for peer_dev in config.peers: if peer_flags: d = driver.CUmemAccessDesc() @@ -1361,4 +1367,4 @@ class VMMAllocatedMemoryResource(MemoryResource): Returns: str: A string describing the object """ - return f"" + return f"" diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index e66ef35c0..b5cf0d60c 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -15,8 +15,8 @@ Device, DeviceMemoryResource, MemoryResource, - VMMAllocatedMemoryResource, - VMMAllocationOptions, + VirtualMemoryResource, + VirtualMemoryResourceOptions, ) from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor from cuda.core.experimental._utils.cuda_utils import handle_return @@ -310,14 +310,14 @@ def test_device_memory_resource_initialization(mempool_device, use_device_object def test_vmm_allocator_basic_allocation(): """Test basic VMM allocation functionality. - This test verifies that VMMAllocatedMemoryResource can allocate memory + This test verifies that VirtualMemoryResource can allocate memory using CUDA VMM APIs with default configuration. """ device = Device() device.set_current() # Create VMM allocator with default config - vmm_mr = VMMAllocatedMemoryResource(device) + vmm_mr = VirtualMemoryResource(device) # Test basic allocation buffer = vmm_mr.allocate(4096) @@ -343,7 +343,7 @@ def test_vmm_allocator_basic_allocation(): def test_vmm_allocator_policy_configuration(): """Test VMM allocator with different policy configurations. - This test verifies that VMMAllocatedMemoryResource can be configured + This test verifies that VirtualMemoryResource can be configured with different allocation policies and that the configuration affects the allocation behavior. 
""" @@ -351,7 +351,7 @@ def test_vmm_allocator_policy_configuration(): device.set_current() # Test with custom VMM config - custom_config = VMMAllocationOptions( + custom_config = VirtualMemoryResourceOptions( allocation_type="pinned", location_type="device", granularity="minimum", @@ -362,7 +362,7 @@ def test_vmm_allocator_policy_configuration(): peer_access="rw", ) - vmm_mr = VMMAllocatedMemoryResource(device, config=custom_config) + vmm_mr = VirtualMemoryResource(device, config=custom_config) # Verify configuration is applied assert vmm_mr.config == custom_config @@ -375,7 +375,7 @@ def test_vmm_allocator_policy_configuration(): assert buffer.device_id == device.device_id # Test policy modification - new_config = VMMAllocationOptions( + new_config = VirtualMemoryResourceOptions( allocation_type="pinned", location_type="device", granularity="recommended", @@ -399,13 +399,13 @@ def test_vmm_allocator_policy_configuration(): def test_vmm_allocator_grow_allocation(): """Test VMM allocator's ability to grow existing allocations. - This test verifies that VMMAllocatedMemoryResource can grow existing + This test verifies that VirtualMemoryResource can grow existing allocations while preserving the base pointer when possible. """ device = Device() device.set_current() - vmm_mr = VMMAllocatedMemoryResource(device) + vmm_mr = VirtualMemoryResource(device) # Create initial allocation buffer = vmm_mr.allocate(2 * 1024 * 1024) From 52f2644863a1d278ebcba09911ee55e729d8a806 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 19 Sep 2025 14:12:14 -0700 Subject: [PATCH 12/35] save state before I muck with error handling and it gets too messy --- cuda_core/cuda/core/experimental/_memory.pyx | 103 ++++++++----------- 1 file changed, 44 insertions(+), 59 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index fc7839d35..ae992afba 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -946,58 +946,47 @@ class VirtualMemoryResourceOptions: @staticmethod def _access_to_flags(spec: str): f = driver.CUmemAccess_flags - if spec == "rw": - return f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE - if spec == "r": - return f.CU_MEM_ACCESS_FLAGS_PROT_READ - if spec == "none": - return 0 - raise ValueError(f"Unknown access spec: {spec!r}") + _access_flags = {"rw": f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": f.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} + flags = _access_flags.get(string) + if not flags: + raise ValueError(f"Unknown access spec: {spec!r}") + return flags @staticmethod def _allocation_type_to_driver(spec: str): - if spec == "pinned": - return driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED - if spec == "managed": - return driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED - raise ValueError(f"Unsupported allocation_type: {spec!r}") + f = driver.CUmemAllocationType + _allocation_type = {"pinned": f.CU_MEM_ALLOCATION_TYPE_PINNED, "managed": f.CU_MEM_ALLOCATION_TYPE_MANAGED} + alloc_type = _allocation_type.get(spec) + if not alloc_type: + raise ValueError(f"Unsupported allocation_type: {spec!r}") + return alloc_type @staticmethod def _location_type_to_driver(spec: str): - if spec == "device": - return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - if spec == "host": - return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST - if spec == "host_numa": - return driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA - if spec == "host_numa_current": - return 
driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT - raise ValueError(f"Unsupported location_type: {spec!r}") + f = driver.CUmemLocationType + _location_type = {"device": f.CU_MEM_LOCATION_TYPE_DEVICE, "host": f.CU_MEM_LOCATION_TYPE_HOST, "host_numa": f.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": f.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} + loc_type = _location_type.get(spec) + if not loc_type: + raise ValueError(f"Unsupported location_type: {spec!r}") + return loc_type @staticmethod def _handle_type_to_driver(spec: str): - if spec == "posix_fd": - return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR - if spec == "generic": - return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_GENERIC - if spec == "none": - return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - if spec == "win32": - return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32 - if spec == "win32_kmt": - return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32_KMT - if spec == "fabric": - return driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_FABRIC - raise ValueError(f"Unsupported handle_type: {spec!r}") + f = driver.CUmemAllocationHandleType + _handle_type = {"posix_fd": f.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "generic": f.CU_MEM_HANDLE_TYPE_GENERIC, "none": f.CU_MEM_HANDLE_TYPE_NONE, "win32": f.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": f.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": f.CU_MEM_HANDLE_TYPE_FABRIC} + handle_type = _handle_type.get(spec) + if not handle_type: + raise ValueError(f"Unsupported handle_type: {spec!r}") + return handle_type @staticmethod def _granularity_to_driver(spec: str): f = driver.CUmemAllocationGranularity_flags - if spec == "minimum": - return f.CU_MEM_ALLOC_GRANULARITY_MINIMUM - if spec == "recommended": - return f.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED - raise ValueError(f"Unsupported granularity: {spec!r}") + _granularity = {"minimum": f.CU_MEM_ALLOC_GRANULARITY_MINIMUM, "recommended": f.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED} + granularity = _granularity.get(spec) + if not granularity: + raise ValueError(f"Unsupported granularity: {spec!r}") + return granularity class VirtualMemoryResource(MemoryResource): @@ -1015,7 +1004,9 @@ class VirtualMemoryResource(MemoryResource): """ def __init__(self, device, config: VirtualMemoryResourceOptions = None): self.device = device - self.config = config or VirtualMemoryResourceOptions() + self.config = check_or_create_options( + VirtualMemoryResourceOptions, config, "VirtualMemoryResource options", keep_none=False + ) def _align_up(self, size: int, gran: int) -> int: """ @@ -1047,10 +1038,6 @@ class VirtualMemoryResource(MemoryResource): if config is not None: self.config = config - if new_size <= buf.size: - # No growth needed, return original buffer - return buf - # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) @@ -1062,8 +1049,7 @@ class VirtualMemoryResource(MemoryResource): # Query granularity gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(self.config.granularity) res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) - if res != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"cuMemGetAllocationGranularity failed: {res}") + raise_if_driver_error(res) # Calculate sizes additional_size = new_size - buf.size @@ -1097,9 +1083,9 @@ class VirtualMemoryResource(MemoryResource): """ # Create new physical memory for the 
additional size res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(new_ptr, aligned_additional_size) - raise Exception(f"cuMemCreate failed: {res}") + if res != driver.CUresult.CUDA_SUCCESS: + driver.cuMemAddressFree(new_ptr, aligned_additional_size) + raise Exception(f"cuMemCreate failed: {res}") # Map the new physical memory to the extended VA range res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) @@ -1235,6 +1221,9 @@ class VirtualMemoryResource(MemoryResource): """ Allocate memory using CUDA VMM with a configurable policy. """ + if stream is not None: + raise NotImplementedError("Stream is not supported with VirtualMemoryResource") + config = self.config # ---- Build allocation properties ---- prop = driver.CUmemAllocationProp() @@ -1242,6 +1231,7 @@ class VirtualMemoryResource(MemoryResource): # TODO: Support host alloation if required if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") + prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) prop.location.id = self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 @@ -1251,8 +1241,7 @@ class VirtualMemoryResource(MemoryResource): # Choose min vs recommended granularity per config gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(config.granularity) res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) - if res != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"cuMemGetAllocationGranularity failed: {res}") + raise_if_driver_error(res) aligned_size = self._align_up(size, gran) addr_align = config.addr_align or gran @@ -1317,17 +1306,13 @@ class VirtualMemoryResource(MemoryResource): Deallocate memory on the device using CUDA VMM APIs. """ result, handle = driver.cuMemRetainAllocationHandle(ptr) - if result != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"Failed to retain allocation handle: {result}") + raise_if_driver_error(result) result, = driver.cuMemUnmap(ptr, size) - if result != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"Failed to unmap physical allocation: {result}") + raise_if_driver_error(result) result, = driver.cuMemAddressFree(ptr, size) - if result != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"Failed to free address: {result}") + raise_if_driver_error(result) result, = driver.cuMemRelease(handle) - if result != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"Failed to release physical allocation: {result}") + raise_if_driver_error(result) @property From 24888186e9d388850aefae11d3c66753a006cc21 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 19 Sep 2025 15:17:56 -0700 Subject: [PATCH 13/35] Overhaul the error handling and implement a Transaction() class to help with that. 
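
Transaction is a small undo stack built on contextlib.ExitStack: each driver
call that succeeds registers a compensating cleanup with append(), and
commit() disarms the stack once the whole sequence has succeeded. If any step
raises before commit(), the registered callbacks run in LIFO order and unwind
the partial work. A minimal usage sketch (illustrative only; it mirrors how
allocate() uses the helper in this diff):

    with Transaction() as trans:
        res, handle = driver.cuMemCreate(aligned_size, prop, 0)
        raise_if_driver_error(res)
        trans.append(driver.cuMemRelease, handle)  # undo if a later step fails
        res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, 0, 0)
        raise_if_driver_error(res)
        trans.append(driver.cuMemAddressFree, ptr, aligned_size)
        trans.commit()  # success: all undo actions are disarmed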
--- cuda_core/cuda/core/experimental/_memory.pyx | 251 +++++++++--------- .../core/experimental/_utils/cuda_utils.pyx | 33 +++ 2 files changed, 157 insertions(+), 127 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index ae992afba..8787ce7fe 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -21,9 +21,10 @@ import os import platform import weakref + from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream -from cuda.core.experimental._utils.cuda_utils import driver +from cuda.core.experimental._utils.cuda_utils import driver, Transaction if platform.system() == "Linux": import socket @@ -947,7 +948,7 @@ class VirtualMemoryResourceOptions: def _access_to_flags(spec: str): f = driver.CUmemAccess_flags _access_flags = {"rw": f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": f.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} - flags = _access_flags.get(string) + flags = _access_flags.get(spec) if not flags: raise ValueError(f"Unknown access spec: {spec!r}") return flags @@ -1081,32 +1082,31 @@ class VirtualMemoryResource(MemoryResource): This preserves the original pointer by mapping new physical memory to the extended portion of the virtual address range. """ - # Create new physical memory for the additional size - res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(new_ptr, aligned_additional_size) - raise Exception(f"cuMemCreate failed: {res}") - - # Map the new physical memory to the extended VA range - res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(new_ptr, aligned_additional_size) - driver.cuMemRelease(new_handle) - raise Exception(f"cuMemMap failed: {res}") - - # Set access permissions for the new portion - descs = self._build_access_descriptors(prop) - if descs: - res, = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemUnmap(new_ptr, aligned_additional_size) - driver.cuMemAddressFree(new_ptr, aligned_additional_size) - driver.cuMemRelease(new_handle) - raise Exception(f"cuMemSetAccess failed: {res}") - - # Update the buffer size (pointer stays the same!) 
+ with Transaction() as trans: + # Create new physical memory for the additional size + trans.append(lambda: driver.cuMemAddressFree(new_ptr, aligned_additional_size)) + res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) + raise_if_driver_error(res) + # Register undo for creation + trans.append(lambda: driver.cuMemRelease(new_handle)) + + # Map the new physical memory to the extended VA range + res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) + raise_if_driver_error(res) + # Register undo for mapping + trans.append(lambda: driver.cuMemUnmap(new_ptr, aligned_additional_size)) + + # Set access permissions for the new portion + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) + raise_if_driver_error(res) + + # All succeeded, cancel undo actions + trans.commit() + + # Update the buffer size (pointer stays the same) buf._size = new_size - return buf def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, @@ -1117,59 +1117,61 @@ class VirtualMemoryResource(MemoryResource): This creates a new VA range and remaps both old and new physical memory. The buffer's pointer will change. """ - # Reserve a completely new, larger VA range - res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) - if res != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"cuMemAddressReserve failed: {res}") - - # Get the old allocation handle for remapping - result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) - if result != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(new_ptr, total_aligned_size) - raise Exception(f"Failed to retain old allocation handle: {result}") - - # Unmap the old VA range (aligned previous size) - aligned_prev_size = total_aligned_size - aligned_additional_size - result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) - if result != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(new_ptr, total_aligned_size) - driver.cuMemRelease(old_handle) - raise Exception(f"Failed to unmap old allocation: {result}") - - # Remap the old physical memory to the new VA range (aligned previous size) - res, = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(new_ptr, total_aligned_size) - driver.cuMemRelease(old_handle) - raise Exception(f"cuMemMap failed for old memory: {res}") - - # Create new physical memory for the additional size - res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemUnmap(new_ptr, total_aligned_size) - driver.cuMemAddressFree(new_ptr, total_aligned_size) - driver.cuMemRelease(old_handle) - raise Exception(f"cuMemCreate failed for new memory: {res}") - - # Map the new physical memory to the extended portion (aligned offset) - res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemUnmap(new_ptr, total_aligned_size) - driver.cuMemAddressFree(new_ptr, total_aligned_size) - driver.cuMemRelease(old_handle) - driver.cuMemRelease(new_handle) - raise Exception(f"cuMemMap failed for new memory: {res}") - - # Set access permissions for the entire new range - descs = self._build_access_descriptors(prop) - if descs: - res, = driver.cuMemSetAccess(new_ptr, total_aligned_size, descs, len(descs)) - if res != 
driver.CUresult.CUDA_SUCCESS: - driver.cuMemUnmap(new_ptr, total_aligned_size) - driver.cuMemAddressFree(new_ptr, total_aligned_size) - driver.cuMemRelease(old_handle) - driver.cuMemRelease(new_handle) - raise Exception(f"cuMemSetAccess failed: {res}") + with Transaction() as trans: + # Reserve a completely new, larger VA range + res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) + raise_if_driver_error(res) + # Register undo for VA reservation + trans.append(lambda: driver.cuMemAddressFree(new_ptr, total_aligned_size)) + + # Get the old allocation handle for remapping + result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) + raise_if_driver_error(result) + # Register undo for old_handle + trans.append(lambda: driver.cuMemRelease(old_handle)) + + # Unmap the old VA range (aligned previous size) + aligned_prev_size = total_aligned_size - aligned_additional_size + result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) + raise_if_driver_error(result) + + def _remap_old(): + # Try to remap the old physical memory back to the original VA range + try: + driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) + except Exception: + pass + trans.append(_remap_old) + + # Remap the old physical memory to the new VA range (aligned previous size) + res, = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) + raise_if_driver_error(res) + + # Register undo for mapping + trans.append(lambda: driver.cuMemUnmap(new_ptr, aligned_prev_size)) + + # Create new physical memory for the additional size + res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) + raise_if_driver_error(res) + + # Register undo for new physical memory + trans.append(lambda: driver.cuMemRelease(new_handle)) + + # Map the new physical memory to the extended portion (aligned offset) + res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) + raise_if_driver_error(res) + + # Register undo for mapping + trans.append(lambda: driver.cuMemUnmap(int(new_ptr) + aligned_prev_size, aligned_additional_size)) + + # Set access permissions for the entire new range + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(new_ptr, total_aligned_size, descs, len(descs)) + raise_if_driver_error(res) + + # All succeeded, cancel undo actions + trans.commit() # Free the old VA range (aligned previous size) driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) @@ -1246,56 +1248,51 @@ class VirtualMemoryResource(MemoryResource): aligned_size = self._align_up(size, gran) addr_align = config.addr_align or gran - # ---- Create physical memory ---- - res, handle = driver.cuMemCreate(aligned_size, prop, 0) - if res != driver.CUresult.CUDA_SUCCESS: - raise Exception(f"cuMemCreate failed: {res}") - - # ---- Reserve VA space ---- - # Potentially, use a separate size for the VA reservation from the physical allocation size - res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) - if res != driver.CUresult.CUDA_SUCCESS: - # tidy up physical handle on failure - driver.cuMemRelease(handle) - raise Exception(f"cuMemAddressReserve failed: {res}") - - # ---- Map physical memory into VA ---- - res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) - if res != driver.CUresult.CUDA_SUCCESS: - driver.cuMemAddressFree(ptr, aligned_size) - driver.cuMemRelease(handle) - raise Exception(f"cuMemMap failed: {res}") - - # ---- Set access for owner + peers ---- - descs = [] - - 
# Owner access - owner_flags = VirtualMemoryResourceOptions._access_to_flags(config.self_access) - if owner_flags: - d = driver.CUmemAccessDesc() - d.location.type = prop.location.type - d.location.id = prop.location.id - d.flags = owner_flags - descs.append(d) - - # Peer device access - peer_flags = VirtualMemoryResourceOptions._access_to_flags(config.peer_access) - for peer_dev in config.peers: - if peer_flags: + # ---- Transactional allocation ---- + with Transaction() as trans: + # ---- Create physical memory ---- + res, handle = driver.cuMemCreate(aligned_size, prop, 0) + raise_if_driver_error(res) + # Register undo for physical memory + trans.append(lambda: driver.cuMemRelease(handle)) + + # ---- Reserve VA space ---- + # Potentially, use a separate size for the VA reservation from the physical allocation size + res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) + raise_if_driver_error(res) + # Register undo for VA reservation + trans.append(lambda: driver.cuMemAddressFree(ptr, aligned_size)) + + # ---- Map physical memory into VA ---- + res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) + trans.append(lambda: driver.cuMemUnmap(ptr, aligned_size)) + raise_if_driver_error(res) + + # ---- Set access for owner + peers ---- + descs = [] + + # Owner access + owner_flags = VirtualMemoryResourceOptions._access_to_flags(config.self_access) + if owner_flags: d = driver.CUmemAccessDesc() - d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - d.location.id = int(peer_dev) - d.flags = peer_flags + d.location.type = prop.location.type + d.location.id = prop.location.id + d.flags = owner_flags descs.append(d) - if descs: - res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) - if res != driver.CUresult.CUDA_SUCCESS: - # Try to unwind on failure - driver.cuMemUnmap(ptr, aligned_size) - driver.cuMemAddressFree(ptr, aligned_size) - driver.cuMemRelease(handle) - raise Exception(f"cuMemSetAccess failed: {res}") + # Peer device access + peer_flags = VirtualMemoryResourceOptions._access_to_flags(config.peer_access) + for peer_dev in config.peers: + if peer_flags: + d = driver.CUmemAccessDesc() + d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + d.location.id = int(peer_dev) + d.flags = peer_flags + descs.append(d) + + if descs: + res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) + trans.commit() # Done — return a Buffer that tracks this VA range buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self) diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index 86588f733..620b7b95b 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -3,9 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 import functools +from functools import partial import importlib.metadata from collections import namedtuple from collections.abc import Sequence +from contextlib import ExitStack from typing import Callable try: @@ -222,3 +224,34 @@ def get_binding_version(): except importlib.metadata.PackageNotFoundError: major_minor = importlib.metadata.version("cuda-python").split(".")[:2] return tuple(int(v) for v in major_minor) + + +class Transaction: + def __init__(self): + self._stack = ExitStack() + self._entered = False + + def __enter__(self): + self._stack.__enter__() + self._entered = True + return self + + def __exit__(self, exc_type, exc, tb): + # If exit 
callbacks remain, they'll run in LIFO order.
+        return self._stack.__exit__(exc_type, exc, tb)
+
+    def append(self, fn, /, *args, **kwargs):
+        """
+        Register an undo action (runs if the with-block exits without commit()).
+        Values are bound now via partial so late mutations don't bite you.
+        """
+        if not self._entered:
+            raise RuntimeError("Transaction must be entered before append()")
+        self._stack.callback(partial(fn, *args, **kwargs))
+
+    def commit(self):
+        """
+        Disarm all undo actions. After this, exiting the with-block does nothing.
+        """
+        # pop_all() empties this stack so no callbacks are triggered on exit.
+        self._stack.pop_all()

From df6243da95a65dcc3c3e3179c92cf1495370906d Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Fri, 19 Sep 2025 15:43:36 -0700
Subject: [PATCH 14/35] fix re-importation of dataclasses

---
 cuda_core/cuda/core/experimental/_memory.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 8787ce7fe..47cd5cbbd 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -10,7 +10,6 @@ from cuda.core.experimental._utils.cuda_utils cimport (
     check_or_create_options,
 )
 
-from dataclasses import dataclass
 from typing import TypeVar, Union, TYPE_CHECKING
 import abc
 from typing import TypeVar, Union, Optional, Iterable, Literal

From d8b9af8f4d7050074a041438fd93d47099d2a709 Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Fri, 19 Sep 2025 17:11:33 -0700
Subject: [PATCH 15/35] Checkpoint: tests and code updated to address review
 comments; all passing

---
 cuda_core/cuda/core/experimental/_memory.pyx | 33 +++++++++++++------
 cuda_core/tests/test_memory.py               |  4 +--
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 47cd5cbbd..28918c6c8 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -23,7 +23,7 @@ import weakref
 
 from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
 from cuda.core.experimental._stream import Stream, default_stream
-from cuda.core.experimental._utils.cuda_utils import driver, Transaction
+from cuda.core.experimental._utils.cuda_utils import driver, Transaction, get_binding_version
 
 if platform.system() == "Linux":
     import socket
@@ -914,9 +914,6 @@ VirtualMemoryAllocationTypeT = Literal["pinned", "managed"]
 class VirtualMemoryResourceOptions:
     """A configuration object for the VirtualMemoryResource
     Stores configuration information which tells the resource how to use the CUDA VMM APIs
-    """
-    """
-    Configuration for CUDA VMM allocations.
 
     Args:
         handle_type: Export handle type for the physical allocation. 
Use @@ -948,16 +945,20 @@ class VirtualMemoryResourceOptions: f = driver.CUmemAccess_flags _access_flags = {"rw": f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": f.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} flags = _access_flags.get(spec) - if not flags: + if flags is None: raise ValueError(f"Unknown access spec: {spec!r}") return flags @staticmethod def _allocation_type_to_driver(spec: str): f = driver.CUmemAllocationType - _allocation_type = {"pinned": f.CU_MEM_ALLOCATION_TYPE_PINNED, "managed": f.CU_MEM_ALLOCATION_TYPE_MANAGED} + # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not + _allocation_type = {"pinned": f.CU_MEM_ALLOCATION_TYPE_PINNED} + ver_major, ver_minor = get_binding_version() + if ver_major >= 13: + _allocation_type["managed"] = f.CU_MEM_ALLOCATION_TYPE_MANAGED alloc_type = _allocation_type.get(spec) - if not alloc_type: + if alloc_type is None: raise ValueError(f"Unsupported allocation_type: {spec!r}") return alloc_type @@ -966,16 +967,22 @@ class VirtualMemoryResourceOptions: f = driver.CUmemLocationType _location_type = {"device": f.CU_MEM_LOCATION_TYPE_DEVICE, "host": f.CU_MEM_LOCATION_TYPE_HOST, "host_numa": f.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": f.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} loc_type = _location_type.get(spec) - if not loc_type: + if loc_type is None: raise ValueError(f"Unsupported location_type: {spec!r}") return loc_type @staticmethod def _handle_type_to_driver(spec: str): f = driver.CUmemAllocationHandleType - _handle_type = {"posix_fd": f.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "generic": f.CU_MEM_HANDLE_TYPE_GENERIC, "none": f.CU_MEM_HANDLE_TYPE_NONE, "win32": f.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": f.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": f.CU_MEM_HANDLE_TYPE_FABRIC} + _handle_type = { + "none": f.CU_MEM_HANDLE_TYPE_NONE, + "posix_fd": f.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + "win32": f.CU_MEM_HANDLE_TYPE_WIN32, + "win32_kmt": f.CU_MEM_HANDLE_TYPE_WIN32_KMT, + "fabric": f.CU_MEM_HANDLE_TYPE_FABRIC, + } handle_type = _handle_type.get(spec) - if not handle_type: + if handle_type is None: raise ValueError(f"Unsupported handle_type: {spec!r}") return handle_type @@ -984,7 +991,7 @@ class VirtualMemoryResourceOptions: f = driver.CUmemAllocationGranularity_flags _granularity = {"minimum": f.CU_MEM_ALLOC_GRANULARITY_MINIMUM, "recommended": f.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED} granularity = _granularity.get(spec) - if not granularity: + if granularity is None: raise ValueError(f"Unsupported granularity: {spec!r}") return granularity @@ -1038,6 +1045,10 @@ class VirtualMemoryResource(MemoryResource): if config is not None: self.config = config + # No-op if new size is less than or equal to the current size + if new_size <= buf.size: + return buf + # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index b5cf0d60c..8c5910142 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -356,7 +356,7 @@ def test_vmm_allocator_policy_configuration(): location_type="device", granularity="minimum", gpu_direct_rdma=True, - handle_type="posix-fd", + handle_type="posix_fd", peers=(), self_access="rw", peer_access="rw", @@ -380,7 +380,7 @@ def test_vmm_allocator_policy_configuration(): location_type="device", granularity="recommended", gpu_direct_rdma=False, - handle_type="posix-fd", + 
handle_type="posix_fd", peers=(), self_access="r", # Read-only access peer_access="r", From 6fa2e7ded7a1da5f5f263505c65a1ba296b9ab5d Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 19 Sep 2025 17:39:33 -0700 Subject: [PATCH 16/35] docstrings, a couple review comments --- cuda_core/cuda/core/experimental/_memory.pyx | 105 ++++++++++++------ .../core/experimental/_utils/cuda_utils.pyx | 17 +++ 2 files changed, 89 insertions(+), 33 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 28918c6c8..0f4298e4f 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -1028,6 +1028,9 @@ class VirtualMemoryResource(MemoryResource): This implements true growing allocations that preserve the base pointer by extending the virtual address range and mapping additional physical memory. + This function uses transactional allocation: if any step fails, the original buffer is not modified and + all steps the function took are rolled back so a new allocation is not created. + Parameters ---------- buf : Buffer @@ -1040,15 +1043,11 @@ class VirtualMemoryResource(MemoryResource): Returns ------- Buffer - The same buffer with updated size, preserving the original pointer + The same buffer with updated size and properties, preserving the original pointer """ if config is not None: self.config = config - # No-op if new size is less than or equal to the current size - if new_size <= buf.size: - return buf - # Build allocation properties for new chunks prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) @@ -1064,6 +1063,14 @@ class VirtualMemoryResource(MemoryResource): # Calculate sizes additional_size = new_size - buf.size + if additional_size <= 0: + # Same size: only update access policy if needed; avoid zero-sized driver calls + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(int(buf.handle), buf.size, descs, len(descs)) + raise_if_driver_error(res) + return buf + aligned_additional_size = self._align_up(additional_size, gran) total_aligned_size = self._align_up(new_size, gran) aligned_prev_size = total_aligned_size - aligned_additional_size @@ -1087,10 +1094,22 @@ class VirtualMemoryResource(MemoryResource): def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, aligned_additional_size: int, new_ptr: int) -> Buffer: """ - Fast path: extend the VA range contiguously. + Fast path for growing a virtual memory allocation when the new region can be + reserved contiguously after the existing buffer. - This preserves the original pointer by mapping new physical memory - to the extended portion of the virtual address range. + This function creates and maps new physical memory for the additional size, + sets access permissions, and updates the buffer size in place (the pointer + remains unchanged). + + Args: + buf (Buffer): The buffer to grow. + new_size (int): The new total size in bytes. + prop (driver.CUmemAllocationProp): Allocation properties for the new memory. + aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. + new_ptr (int): The address of the newly reserved contiguous VA region (should be at the end of the current buffer). + + Returns: + Buffer: The same buffer object with its size updated to `new_size`. 
""" with Transaction() as trans: # Create new physical memory for the additional size @@ -1122,10 +1141,24 @@ class VirtualMemoryResource(MemoryResource): def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, aligned_additional_size: int, total_aligned_size: int, addr_align: int) -> Buffer: """ - Slow path: full remapping when contiguous extension fails. + Slow path for growing a virtual memory allocation when the new region cannot be + reserved contiguously after the existing buffer. + + This function reserves a new, larger virtual address (VA) range, remaps the old + physical memory to the beginning of the new VA range, creates and maps new physical + memory for the additional size, sets access permissions, and updates the buffer's + pointer and size. + + Args: + buf (Buffer): The buffer to grow. + new_size (int): The new total size in bytes. + prop (driver.CUmemAllocationProp): Allocation properties for the new memory. + aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. + total_aligned_size (int): The total new size to reserve, aligned to granularity. + addr_align (int): The required address alignment for the new VA range. - This creates a new VA range and remaps both old and new physical memory. - The buffer's pointer will change. + Returns: + Buffer: The buffer object updated with the new pointer and size. """ with Transaction() as trans: # Reserve a completely new, larger VA range @@ -1231,7 +1264,33 @@ class VirtualMemoryResource(MemoryResource): def allocate(self, size: int, stream: Stream = None) -> Buffer: """ - Allocate memory using CUDA VMM with a configurable policy. + Allocate a buffer of the given size using CUDA virtual memory. + + Parameters + ---------- + size : int + The size in bytes of the buffer to allocate. + stream : Stream, optional + CUDA stream to associate with the allocation (not currently supported). + + Returns + ------- + Buffer + A Buffer object representing the allocated virtual memory. + + Raises + ------ + NotImplementedError + If a stream is provided or if the location type is not device memory. + CUDAError + If any CUDA driver API call fails during allocation. + + Notes + ----- + This method uses transactional allocation: if any step fails, all resources + allocated so far are automatically cleaned up. The allocation is performed + with the configured granularity, access permissions, and peer access as + specified in the resource's configuration. 
""" if stream is not None: raise NotImplementedError("Stream is not supported with VirtualMemoryResource") @@ -1279,27 +1338,7 @@ class VirtualMemoryResource(MemoryResource): raise_if_driver_error(res) # ---- Set access for owner + peers ---- - descs = [] - - # Owner access - owner_flags = VirtualMemoryResourceOptions._access_to_flags(config.self_access) - if owner_flags: - d = driver.CUmemAccessDesc() - d.location.type = prop.location.type - d.location.id = prop.location.id - d.flags = owner_flags - descs.append(d) - - # Peer device access - peer_flags = VirtualMemoryResourceOptions._access_to_flags(config.peer_access) - for peer_dev in config.peers: - if peer_flags: - d = driver.CUmemAccessDesc() - d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - d.location.id = int(peer_dev) - d.flags = peer_flags - descs.append(d) - + descs = self._build_access_descriptors(prop) if descs: res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) trans.commit() diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index 620b7b95b..dedb8ac53 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -227,6 +227,23 @@ def get_binding_version(): class Transaction: + """ + A context manager for transactional operations with undo capability. + + The Transaction class allows you to register undo actions (callbacks) that will be executed + if the transaction is not committed before exiting the context. This is useful for managing + resources or operations that need to be rolled back in case of errors or early exits. + + Usage: + with Transaction() as txn: + txn.append(some_cleanup_function, arg1, arg2) + # ... perform operations ... + txn.commit() # Disarm undo actions; nothing will be rolled back on exit + + Methods: + append(fn, *args, **kwargs): Register an undo action to be called on rollback. + commit(): Disarm all undo actions; nothing will be rolled back on exit. 
+ """ def __init__(self): self._stack = ExitStack() self._entered = False From 4dd65f4e3adfd57d6cdc8bc8e541a423806377a6 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Fri, 19 Sep 2025 17:48:18 -0700 Subject: [PATCH 17/35] address a review comment --- cuda_core/tests/test_memory.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 8c5910142..553950f83 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -417,9 +417,9 @@ def test_vmm_allocator_grow_allocation(): # Verify growth assert grown_buffer.size >= 4 * 1024 * 1024 assert grown_buffer.size > original_size - - # The pointer should ideally be preserved (fast path) - # but may change if contiguous extension fails (slow path) + # Because of the slow path, the pointer may change + # We cannot assert that the new pointer is the same, + # but we can assert that a new pointer was assigned assert grown_buffer.handle is not None # Test growing to same size (should return original buffer) From 1b86916d461a0cc5ed56a268e8275d8705be0ba6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 20 Sep 2025 00:48:55 +0000 Subject: [PATCH 18/35] [pre-commit.ci] auto code formatting --- cuda_core/tests/test_memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 553950f83..303fd4dff 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -418,7 +418,7 @@ def test_vmm_allocator_grow_allocation(): assert grown_buffer.size >= 4 * 1024 * 1024 assert grown_buffer.size > original_size # Because of the slow path, the pointer may change - # We cannot assert that the new pointer is the same, + # We cannot assert that the new pointer is the same, # but we can assert that a new pointer was assigned assert grown_buffer.handle is not None From ebba14378cff5db4651c7498799bb09b127f4e35 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Mon, 29 Sep 2025 22:17:45 -0700 Subject: [PATCH 19/35] First pass on Keith's comments --- cuda_core/cuda/core/experimental/_memory.pyx | 66 +++++++++----------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index d03e48a80..67815874a 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -945,12 +945,20 @@ class VirtualMemoryResourceOptions: peers: Iterable[int] = field(default_factory=tuple) self_access: VirtualMemoryAccessTypeT = "rw" peer_access: VirtualMemoryAccessTypeT = "rw" + _access_flags = {"rw": f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": f.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} + _handle_types = {"none": f.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": f.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": f.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": f.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": f.CU_MEM_HANDLE_TYPE_FABRIC} + _granularity = {"recommended": f.CU_MEM_ALLOCATION_GRANULARITY_RECOMMENDED, "minimum": f.CU_MEM_ALLOCATION_GRANULARITY_MINIMUM} + _location_type = {"device": f.CU_MEM_LOCATION_TYPE_DEVICE, "host": f.CU_MEM_LOCATION_TYPE_HOST, "host_numa": f.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": f.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} + # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not + _allocation_type = {"pinned": 
f.CU_MEM_ALLOCATION_TYPE_PINNED} + ver_major, ver_minor = get_binding_version() + if ver_major >= 13: + _allocation_type["managed"] = f.CU_MEM_ALLOCATION_TYPE_MANAGED @staticmethod def _access_to_flags(spec: str): f = driver.CUmemAccess_flags - _access_flags = {"rw": f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": f.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} - flags = _access_flags.get(spec) + flags = VirtualMemoryResourceOptions._access_flags.get(spec) if flags is None: raise ValueError(f"Unknown access spec: {spec!r}") return flags @@ -958,12 +966,7 @@ class VirtualMemoryResourceOptions: @staticmethod def _allocation_type_to_driver(spec: str): f = driver.CUmemAllocationType - # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not - _allocation_type = {"pinned": f.CU_MEM_ALLOCATION_TYPE_PINNED} - ver_major, ver_minor = get_binding_version() - if ver_major >= 13: - _allocation_type["managed"] = f.CU_MEM_ALLOCATION_TYPE_MANAGED - alloc_type = _allocation_type.get(spec) + alloc_type = VirtualMemoryResourceOptions._allocation_type.get(spec) if alloc_type is None: raise ValueError(f"Unsupported allocation_type: {spec!r}") return alloc_type @@ -971,8 +974,7 @@ class VirtualMemoryResourceOptions: @staticmethod def _location_type_to_driver(spec: str): f = driver.CUmemLocationType - _location_type = {"device": f.CU_MEM_LOCATION_TYPE_DEVICE, "host": f.CU_MEM_LOCATION_TYPE_HOST, "host_numa": f.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": f.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} - loc_type = _location_type.get(spec) + loc_type = VirtualMemoryResourceOptions._location_type.get(spec) if loc_type is None: raise ValueError(f"Unsupported location_type: {spec!r}") return loc_type @@ -980,14 +982,7 @@ class VirtualMemoryResourceOptions: @staticmethod def _handle_type_to_driver(spec: str): f = driver.CUmemAllocationHandleType - _handle_type = { - "none": f.CU_MEM_HANDLE_TYPE_NONE, - "posix_fd": f.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, - "win32": f.CU_MEM_HANDLE_TYPE_WIN32, - "win32_kmt": f.CU_MEM_HANDLE_TYPE_WIN32_KMT, - "fabric": f.CU_MEM_HANDLE_TYPE_FABRIC, - } - handle_type = _handle_type.get(spec) + handle_type = VirtualMemoryResourceOptions._handle_types.get(spec) if handle_type is None: raise ValueError(f"Unsupported handle_type: {spec!r}") return handle_type @@ -995,8 +990,7 @@ class VirtualMemoryResourceOptions: @staticmethod def _granularity_to_driver(spec: str): f = driver.CUmemAllocationGranularity_flags - _granularity = {"minimum": f.CU_MEM_ALLOC_GRANULARITY_MINIMUM, "recommended": f.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED} - granularity = _granularity.get(spec) + granularity = VirtualMemoryResourceOptions._granularity.get(spec) if granularity is None: raise ValueError(f"Unsupported granularity: {spec!r}") return granularity @@ -1008,9 +1002,7 @@ class VirtualMemoryResource(MemoryResource): Parameters ---------- device_id : int - Device ordinal for which a memory resource is constructed. The mempool that is - set to *current* on ``device_id`` is used. If no mempool is set to current yet, - the driver would use the *default* mempool on the device. + Device ordinal for which a memory resource is constructed. config : VirtualMemoryResourceOptions A configuration object for the VirtualMemoryResource @@ -1021,6 +1013,7 @@ class VirtualMemoryResource(MemoryResource): VirtualMemoryResourceOptions, config, "VirtualMemoryResource options", keep_none=False ) + @staticmethod def _align_up(self, size: int, gran: int) -> int: """ Align a size up to the nearest multiple of a granularity. 
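
Before the next hunk, it is worth pinning down the arithmetic that `_align_up` and the grow paths build on. The following is a CUDA-free sketch; the 2 MiB value is a representative stand-in for whatever cuMemGetAllocationGranularity actually reports:

def align_up(size: int, gran: int) -> int:
    # Round size up to the next multiple of gran; CUDA granularities are powers of two.
    return ((size + gran - 1) // gran) * gran

GRAN = 2 * 1024 * 1024  # stand-in for the queried allocation granularity

# Growing a 2 MiB buffer to 5 MiB, mirroring the names used in the grow paths:
old_size, new_size = 2 * 1024 * 1024, 5 * 1024 * 1024
aligned_additional_size = align_up(new_size - old_size, GRAN)  # 4 MiB handed to cuMemCreate
total_aligned_size = align_up(new_size, GRAN)                  # 6 MiB VA reservation (slow path)
aligned_prev_size = total_aligned_size - aligned_additional_size  # 2 MiB of existing mapping

assert (aligned_additional_size, total_aligned_size, aligned_prev_size) == (
    4 * 1024 * 1024,
    6 * 1024 * 1024,
    2 * 1024 * 1024,
)
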
@@ -1077,8 +1070,8 @@ class VirtualMemoryResource(MemoryResource): raise_if_driver_error(res) return buf - aligned_additional_size = self._align_up(additional_size, gran) - total_aligned_size = self._align_up(new_size, gran) + aligned_additional_size = VirtualMemoryResource._align_up(additional_size, gran) + total_aligned_size = VirtualMemoryResource._align_up(new_size, gran) aligned_prev_size = total_aligned_size - aligned_additional_size addr_align = self.config.addr_align or gran @@ -1223,7 +1216,8 @@ class VirtualMemoryResource(MemoryResource): trans.commit() # Free the old VA range (aligned previous size) - driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + result, = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + raise_if_driver_error(result) # Invalidate the old buffer so its destructor won't try to free again buf._ptr = 0 @@ -1257,8 +1251,8 @@ class VirtualMemoryResource(MemoryResource): # Peer device access peer_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.peer_access) - for peer_dev in self.config.peers: - if peer_flags: + if peer_flags: + for peer_dev in self.config.peers: d = driver.CUmemAccessDesc() d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE d.location.id = int(peer_dev) @@ -1305,8 +1299,8 @@ class VirtualMemoryResource(MemoryResource): # ---- Build allocation properties ---- prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) - # TODO: Support host alloation if required - if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: + + if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) @@ -1320,7 +1314,7 @@ class VirtualMemoryResource(MemoryResource): res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) raise_if_driver_error(res) - aligned_size = self._align_up(size, gran) + aligned_size = VirtualMemoryResource._align_up(size, gran) addr_align = config.addr_align or gran # ---- Transactional allocation ---- @@ -1347,6 +1341,8 @@ class VirtualMemoryResource(MemoryResource): descs = self._build_access_descriptors(prop) if descs: res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) + raise_if_driver_error(res) + trans.commit() # Done — return a Buffer that tracks this VA range @@ -1371,9 +1367,6 @@ class VirtualMemoryResource(MemoryResource): def is_device_accessible(self) -> bool: """ Indicates whether the allocated memory is accessible from the device. - - Returns: - bool: Always True for NVSHMEM memory. """ return True @@ -1381,9 +1374,6 @@ class VirtualMemoryResource(MemoryResource): def is_host_accessible(self) -> bool: """ Indicates whether the allocated memory is accessible from the host. - - Returns: - bool: Always False for NVSHMEM memory. """ return False @@ -1399,7 +1389,7 @@ class VirtualMemoryResource(MemoryResource): def __repr__(self) -> str: """ - Return a string representation of the NvshmemResource. + Return a string representation of the VirtualMemoryResource. 
Returns: str: A string describing the object From e0c863d3a54d3933a82c530fc2811fa1e8141b24 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Mon, 29 Sep 2025 22:30:36 -0700 Subject: [PATCH 20/35] Second pass on Keith's comments --- cuda_core/cuda/core/experimental/_memory.pyx | 38 ++++++++++--------- .../core/experimental/_utils/cuda_utils.pyx | 8 ++++ 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 67815874a..9e66c16c5 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -1012,6 +1012,8 @@ class VirtualMemoryResource(MemoryResource): self.config = check_or_create_options( VirtualMemoryResourceOptions, config, "VirtualMemoryResource options", keep_none=False ) + if self.config.location_type == "host": + self.device = None @staticmethod def _align_up(self, size: int, gran: int) -> int: @@ -1112,17 +1114,17 @@ class VirtualMemoryResource(MemoryResource): """ with Transaction() as trans: # Create new physical memory for the additional size - trans.append(lambda: driver.cuMemAddressFree(new_ptr, aligned_additional_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemAddressFree(new_ptr, aligned_additional_size))) res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for creation - trans.append(lambda: driver.cuMemRelease(new_handle)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(new_handle))) # Map the new physical memory to the extended VA range res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda: driver.cuMemUnmap(new_ptr, aligned_additional_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(new_ptr, aligned_additional_size))) # Set access permissions for the new portion descs = self._build_access_descriptors(prop) @@ -1164,13 +1166,13 @@ class VirtualMemoryResource(MemoryResource): res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(lambda: driver.cuMemAddressFree(new_ptr, total_aligned_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemAddressFree(new_ptr, total_aligned_size))) # Get the old allocation handle for remapping result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) raise_if_driver_error(result) # Register undo for old_handle - trans.append(lambda: driver.cuMemRelease(old_handle)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(old_handle))) # Unmap the old VA range (aligned previous size) aligned_prev_size = total_aligned_size - aligned_additional_size @@ -1190,21 +1192,21 @@ class VirtualMemoryResource(MemoryResource): raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda: driver.cuMemUnmap(new_ptr, aligned_prev_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(new_ptr, aligned_prev_size))) # Create new physical memory for the additional size res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for new physical memory - trans.append(lambda: driver.cuMemRelease(new_handle)) + trans.append(lambda: 
wrap_driver_function_with_error_handling(driver.cuMemRelease(new_handle))) # Map the new physical memory to the extended portion (aligned offset) res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda: driver.cuMemUnmap(int(new_ptr) + aligned_prev_size, aligned_additional_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(int(new_ptr) + aligned_prev_size, aligned_additional_size))) # Set access permissions for the entire new range descs = self._build_access_descriptors(prop) @@ -1216,7 +1218,7 @@ class VirtualMemoryResource(MemoryResource): trans.commit() # Free the old VA range (aligned previous size) - result, = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + result, = wrap_driver_function_with_error_handling(driver.cuMemAddressFree(int(buf.handle), aligned_prev_size)) raise_if_driver_error(result) # Invalidate the old buffer so its destructor won't try to free again @@ -1299,12 +1301,12 @@ class VirtualMemoryResource(MemoryResource): # ---- Build allocation properties ---- prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) - + if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) - prop.location.id = self.device.device_id + prop.location.id = self.device.device_id if config.location_type == "device" else -1 prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(config.handle_type) @@ -1323,18 +1325,18 @@ class VirtualMemoryResource(MemoryResource): res, handle = driver.cuMemCreate(aligned_size, prop, 0) raise_if_driver_error(res) # Register undo for physical memory - trans.append(lambda: driver.cuMemRelease(handle)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(handle))) # ---- Reserve VA space ---- # Potentially, use a separate size for the VA reservation from the physical allocation size res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(lambda: driver.cuMemAddressFree(ptr, aligned_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemAddressFree(ptr, aligned_size))) # ---- Map physical memory into VA ---- res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) - trans.append(lambda: driver.cuMemUnmap(ptr, aligned_size)) + trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(ptr, aligned_size))) raise_if_driver_error(res) # ---- Set access for owner + peers ---- @@ -1368,14 +1370,14 @@ class VirtualMemoryResource(MemoryResource): """ Indicates whether the allocated memory is accessible from the device. """ - return True + return self.config.location_type == "device" @property def is_host_accessible(self) -> bool: """ Indicates whether the allocated memory is accessible from the host. """ - return False + return self.config.location_type == "host" @property def device_id(self) -> int: @@ -1383,9 +1385,9 @@ class VirtualMemoryResource(MemoryResource): Get the device ID associated with this memory resource. 
Returns: - int: CUDA device ID. + int: CUDA device ID. -1 if the memory resource allocates host memory """ - return self.device.device_id + return self.device.device_id if self.config.location_type == "device" else -1 def __repr__(self) -> str: """ diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index dedb8ac53..f4f33b048 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -225,6 +225,14 @@ def get_binding_version(): major_minor = importlib.metadata.version("cuda-python").split(".")[:2] return tuple(int(v) for v in major_minor) +def wrap_driver_function_with_error_handling(func): + """ + A wrapper that handles driver errors and raises a CUDAError. + """ + def wrapper(*args, **kwargs): + res, = func(*args, **kwargs) + _check_driver_error(res) + return wrapper class Transaction: """ From 452160cf958702c836e6057a1cbe664a6e883709 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Mon, 29 Sep 2025 22:31:14 -0700 Subject: [PATCH 21/35] Added helper function for error handling --- cuda_core/cuda/core/experimental/_memory.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 9e66c16c5..27dd90a0c 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -7,7 +7,7 @@ from __future__ import annotations from libc.stdint cimport uintptr_t from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, - check_or_create_options, + check_or_create_options, wrap_driver_function_with_error_handling, ) import sys From 63bce2a06eb3f73dbfedaa829069f95d8ccb1a42 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 05:33:13 +0000 Subject: [PATCH 22/35] [pre-commit.ci] auto code formatting --- cuda_core/tests/test_memory.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 303fd4dff..e375a32d3 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -9,7 +9,6 @@ import platform import pytest - from cuda.core.experimental import ( Buffer, Device, From b630314923550bbd3b31a671467c7e9308e4fc7f Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Mon, 29 Sep 2025 22:37:16 -0700 Subject: [PATCH 23/35] need to import, not cimport --- cuda_core/cuda/core/experimental/_memory.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 27dd90a0c..04577a718 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -7,7 +7,7 @@ from __future__ import annotations from libc.stdint cimport uintptr_t from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, - check_or_create_options, wrap_driver_function_with_error_handling, + check_or_create_options, ) import sys @@ -24,7 +24,7 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream -from cuda.core.experimental._utils.cuda_utils import driver, Transaction, get_binding_version +from cuda.core.experimental._utils.cuda_utils import driver, 
Transaction, get_binding_version, wrap_driver_function_with_error_handling if platform.system() == "Linux": import socket From b2b824cebda1fc00d8bf537d0481ab642053674d Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Mon, 29 Sep 2025 22:38:45 -0700 Subject: [PATCH 24/35] need to import, not cimport --- cuda_core/cuda/core/experimental/_memory.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 04577a718..a12fd79c0 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -24,7 +24,9 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream -from cuda.core.experimental._utils.cuda_utils import driver, Transaction, get_binding_version, wrap_driver_function_with_error_handling +from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, + wrap_driver_function_with_error_handling, + ) if platform.system() == "Linux": import socket From efdae393688522f5aa5d6bef0dbcbfbea94f7483 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Mon, 29 Sep 2025 22:50:32 -0700 Subject: [PATCH 25/35] build and test fixes --- cuda_core/cuda/core/experimental/_memory.pyx | 49 ++++++++++---------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index a12fd79c0..624b6c40c 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -947,19 +947,23 @@ class VirtualMemoryResourceOptions: peers: Iterable[int] = field(default_factory=tuple) self_access: VirtualMemoryAccessTypeT = "rw" peer_access: VirtualMemoryAccessTypeT = "rw" - _access_flags = {"rw": f.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": f.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} - _handle_types = {"none": f.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": f.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": f.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": f.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": f.CU_MEM_HANDLE_TYPE_FABRIC} - _granularity = {"recommended": f.CU_MEM_ALLOCATION_GRANULARITY_RECOMMENDED, "minimum": f.CU_MEM_ALLOCATION_GRANULARITY_MINIMUM} - _location_type = {"device": f.CU_MEM_LOCATION_TYPE_DEVICE, "host": f.CU_MEM_LOCATION_TYPE_HOST, "host_numa": f.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": f.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} + a = driver.CUmemAccess_flags + _access_flags = {"rw": a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": a.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0} + h = driver.CUmemAllocationHandleType + _handle_types = {"none": h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": h.CU_MEM_HANDLE_TYPE_FABRIC} + g = driver.CUmemAllocationGranularity_flags + _granularity = {"recommended": g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": g.CU_MEM_ALLOC_GRANULARITY_MINIMUM} + l = driver.CUmemLocationType + _location_type = {"device": l.CU_MEM_LOCATION_TYPE_DEVICE, "host": l.CU_MEM_LOCATION_TYPE_HOST, "host_numa": l.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not - _allocation_type = {"pinned": f.CU_MEM_ALLOCATION_TYPE_PINNED} + a = 
driver.CUmemAllocationType + _allocation_type = {"pinned": a.CU_MEM_ALLOCATION_TYPE_PINNED} ver_major, ver_minor = get_binding_version() if ver_major >= 13: - _allocation_type["managed"] = f.CU_MEM_ALLOCATION_TYPE_MANAGED + _allocation_type["managed"] = a.CU_MEM_ALLOCATION_TYPE_MANAGED @staticmethod def _access_to_flags(spec: str): - f = driver.CUmemAccess_flags flags = VirtualMemoryResourceOptions._access_flags.get(spec) if flags is None: raise ValueError(f"Unknown access spec: {spec!r}") @@ -967,7 +971,6 @@ class VirtualMemoryResourceOptions: @staticmethod def _allocation_type_to_driver(spec: str): - f = driver.CUmemAllocationType alloc_type = VirtualMemoryResourceOptions._allocation_type.get(spec) if alloc_type is None: raise ValueError(f"Unsupported allocation_type: {spec!r}") @@ -975,7 +978,6 @@ class VirtualMemoryResourceOptions: @staticmethod def _location_type_to_driver(spec: str): - f = driver.CUmemLocationType loc_type = VirtualMemoryResourceOptions._location_type.get(spec) if loc_type is None: raise ValueError(f"Unsupported location_type: {spec!r}") @@ -983,7 +985,6 @@ class VirtualMemoryResourceOptions: @staticmethod def _handle_type_to_driver(spec: str): - f = driver.CUmemAllocationHandleType handle_type = VirtualMemoryResourceOptions._handle_types.get(spec) if handle_type is None: raise ValueError(f"Unsupported handle_type: {spec!r}") @@ -991,7 +992,6 @@ class VirtualMemoryResourceOptions: @staticmethod def _granularity_to_driver(spec: str): - f = driver.CUmemAllocationGranularity_flags granularity = VirtualMemoryResourceOptions._granularity.get(spec) if granularity is None: raise ValueError(f"Unsupported granularity: {spec!r}") @@ -1018,7 +1018,7 @@ class VirtualMemoryResource(MemoryResource): self.device = None @staticmethod - def _align_up(self, size: int, gran: int) -> int: + def _align_up(size: int, gran: int) -> int: """ Align a size up to the nearest multiple of a granularity. 
""" @@ -1116,17 +1116,17 @@ class VirtualMemoryResource(MemoryResource): """ with Transaction() as trans: # Create new physical memory for the additional size - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemAddressFree(new_ptr, aligned_additional_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemAddressFree), new_ptr, aligned_additional_size) res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for creation - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(new_handle))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), new_handle) # Map the new physical memory to the extended VA range res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(new_ptr, aligned_additional_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), new_ptr, aligned_additional_size) # Set access permissions for the new portion descs = self._build_access_descriptors(prop) @@ -1168,13 +1168,13 @@ class VirtualMemoryResource(MemoryResource): res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemAddressFree(new_ptr, total_aligned_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemAddressFree), new_ptr, total_aligned_size) # Get the old allocation handle for remapping result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) raise_if_driver_error(result) # Register undo for old_handle - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(old_handle))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), old_handle) # Unmap the old VA range (aligned previous size) aligned_prev_size = total_aligned_size - aligned_additional_size @@ -1194,21 +1194,21 @@ class VirtualMemoryResource(MemoryResource): raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(new_ptr, aligned_prev_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), new_ptr, aligned_prev_size) # Create new physical memory for the additional size res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for new physical memory - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(new_handle))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), new_handle) # Map the new physical memory to the extended portion (aligned offset) res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(int(new_ptr) + aligned_prev_size, aligned_additional_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), int(new_ptr) + aligned_prev_size, aligned_additional_size) # Set access permissions for the entire new range descs = self._build_access_descriptors(prop) @@ -1220,8 +1220,7 @@ class VirtualMemoryResource(MemoryResource): trans.commit() # Free the old VA 
range (aligned previous size) - result, = wrap_driver_function_with_error_handling(driver.cuMemAddressFree(int(buf.handle), aligned_prev_size)) - raise_if_driver_error(result) + wrap_driver_function_with_error_handling(driver.cuMemAddressFree)(int(buf.handle), aligned_prev_size) # Invalidate the old buffer so its destructor won't try to free again buf._ptr = 0 @@ -1327,18 +1326,18 @@ class VirtualMemoryResource(MemoryResource): res, handle = driver.cuMemCreate(aligned_size, prop, 0) raise_if_driver_error(res) # Register undo for physical memory - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemRelease(handle))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), handle) # ---- Reserve VA space ---- # Potentially, use a separate size for the VA reservation from the physical allocation size res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemAddressFree(ptr, aligned_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemAddressFree), ptr, aligned_size) # ---- Map physical memory into VA ---- res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) - trans.append(lambda: wrap_driver_function_with_error_handling(driver.cuMemUnmap(ptr, aligned_size))) + trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), ptr, aligned_size) raise_if_driver_error(res) # ---- Set access for owner + peers ---- From 5283e5627db541fb491a1f5f3202635020cea604 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 05:59:45 +0000 Subject: [PATCH 26/35] [pre-commit.ci] auto code formatting --- cuda_core/cuda/core/experimental/_memory.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 624b6c40c..52968ca53 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -24,7 +24,7 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream -from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, +from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, wrap_driver_function_with_error_handling, ) From 49fd8dafdc078bee74321080f9e41299ac8136bd Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Tue, 30 Sep 2025 13:24:56 -0700 Subject: [PATCH 27/35] address next round of comments --- cuda_core/cuda/core/experimental/_memory.pyx | 34 ++++++++++++------- .../core/experimental/_utils/cuda_utils.pyx | 8 ----- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 624b6c40c..436f04cb0 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -25,11 +25,15 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, - wrap_driver_function_with_error_handling, + handle_return, ) if platform.system() == "Linux": import 
socket +# Helper to invoke CUDA driver functions with standard error handling. +def _driver_call(func, /, *args): + handle_return(func(*args)) + if TYPE_CHECKING: import cuda.bindings.driver @@ -1088,6 +1092,10 @@ class VirtualMemoryResource(MemoryResource): ) if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): + # Check for specific errors that are not recoverable with the slow path + if res in (driver.CUresult.CUDA_ERROR_INVALID_VALUE, driver.CUresult.CUDA_ERROR_NOT_PERMITTED, driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, driver.CUresult.CUDA_ERROR_NOT_SUPPORTED): + raise RuntimeError(f"Failed to extend VA range: {res}") + _driver_call(driver.cuMemAddressFree, new_ptr, aligned_additional_size) # Fallback: couldn't extend contiguously, need full remapping return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) else: @@ -1116,17 +1124,17 @@ class VirtualMemoryResource(MemoryResource): """ with Transaction() as trans: # Create new physical memory for the additional size - trans.append(wrap_driver_function_with_error_handling(driver.cuMemAddressFree), new_ptr, aligned_additional_size) + trans.append(_driver_call, driver.cuMemAddressFree, new_ptr, aligned_additional_size) res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for creation - trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), new_handle) + trans.append(_driver_call, driver.cuMemRelease, new_handle) # Map the new physical memory to the extended VA range res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), new_ptr, aligned_additional_size) + trans.append(_driver_call, driver.cuMemUnmap, new_ptr, aligned_additional_size) # Set access permissions for the new portion descs = self._build_access_descriptors(prop) @@ -1168,13 +1176,13 @@ class VirtualMemoryResource(MemoryResource): res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(wrap_driver_function_with_error_handling(driver.cuMemAddressFree), new_ptr, total_aligned_size) + trans.append(_driver_call, driver.cuMemAddressFree, new_ptr, total_aligned_size) # Get the old allocation handle for remapping result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) raise_if_driver_error(result) # Register undo for old_handle - trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), old_handle) + trans.append(_driver_call, driver.cuMemRelease, old_handle) # Unmap the old VA range (aligned previous size) aligned_prev_size = total_aligned_size - aligned_additional_size @@ -1194,21 +1202,21 @@ class VirtualMemoryResource(MemoryResource): raise_if_driver_error(res) # Register undo for mapping - trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), new_ptr, aligned_prev_size) + trans.append(_driver_call, driver.cuMemUnmap, new_ptr, aligned_prev_size) # Create new physical memory for the additional size res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for new physical memory - trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), new_handle) + trans.append(_driver_call, driver.cuMemRelease, new_handle) # Map the new physical 
memory to the extended portion (aligned offset) res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), int(new_ptr) + aligned_prev_size, aligned_additional_size) + trans.append(_driver_call, driver.cuMemUnmap, int(new_ptr) + aligned_prev_size, aligned_additional_size) # Set access permissions for the entire new range descs = self._build_access_descriptors(prop) @@ -1220,7 +1228,7 @@ class VirtualMemoryResource(MemoryResource): trans.commit() # Free the old VA range (aligned previous size) - wrap_driver_function_with_error_handling(driver.cuMemAddressFree)(int(buf.handle), aligned_prev_size) + handle_return(driver.cuMemAddressFree(int(buf.handle), aligned_prev_size)) # Invalidate the old buffer so its destructor won't try to free again buf._ptr = 0 @@ -1326,18 +1334,18 @@ class VirtualMemoryResource(MemoryResource): res, handle = driver.cuMemCreate(aligned_size, prop, 0) raise_if_driver_error(res) # Register undo for physical memory - trans.append(wrap_driver_function_with_error_handling(driver.cuMemRelease), handle) + trans.append(_driver_call, driver.cuMemRelease, handle) # ---- Reserve VA space ---- # Potentially, use a separate size for the VA reservation from the physical allocation size res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(wrap_driver_function_with_error_handling(driver.cuMemAddressFree), ptr, aligned_size) + trans.append(_driver_call, driver.cuMemAddressFree, ptr, aligned_size) # ---- Map physical memory into VA ---- res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) - trans.append(wrap_driver_function_with_error_handling(driver.cuMemUnmap), ptr, aligned_size) + trans.append(_driver_call, driver.cuMemUnmap, ptr, aligned_size) raise_if_driver_error(res) # ---- Set access for owner + peers ---- diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index f4f33b048..dedb8ac53 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -225,14 +225,6 @@ def get_binding_version(): major_minor = importlib.metadata.version("cuda-python").split(".")[:2] return tuple(int(v) for v in major_minor) -def wrap_driver_function_with_error_handling(func): - """ - A wrapper that handles driver errors and raises a CUDAError. 
- """ - def wrapper(*args, **kwargs): - res, = func(*args, **kwargs) - _check_driver_error(res) - return wrapper class Transaction: """ From e09bda7e64fcec09e65095132e5673104cddef28 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Sep 2025 20:32:47 +0000 Subject: [PATCH 28/35] [pre-commit.ci] auto code formatting --- cuda_core/cuda/core/experimental/_memory.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 436f04cb0..3a1fb9d5b 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -24,7 +24,7 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream -from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, +from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, handle_return, ) From a086c3c0c79f9c13c10b63419b5f97479512ddad Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Wed, 1 Oct 2025 17:32:40 -0700 Subject: [PATCH 29/35] Next round of review comments --- cuda_core/cuda/core/experimental/_memory.pyx | 36 +++++++++---------- .../core/experimental/_utils/cuda_utils.pyx | 1 + 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 436f04cb0..b453e8b26 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -24,15 +24,11 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream -from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version, - handle_return, - ) +from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version ) if platform.system() == "Linux": import socket -# Helper to invoke CUDA driver functions with standard error handling. -def _driver_call(func, /, *args): - handle_return(func(*args)) +# (Removed) helper for driver calls; use raise_if_driver_error with direct calls instead. 
if TYPE_CHECKING: @@ -1095,7 +1091,8 @@ class VirtualMemoryResource(MemoryResource): # Check for specific errors that are not recoverable with the slow path if res in (driver.CUresult.CUDA_ERROR_INVALID_VALUE, driver.CUresult.CUDA_ERROR_NOT_PERMITTED, driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, driver.CUresult.CUDA_ERROR_NOT_SUPPORTED): raise RuntimeError(f"Failed to extend VA range: {res}") - _driver_call(driver.cuMemAddressFree, new_ptr, aligned_additional_size) + res2, = driver.cuMemAddressFree(new_ptr, aligned_additional_size) + raise_if_driver_error(res2) # Fallback: couldn't extend contiguously, need full remapping return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) else: @@ -1124,17 +1121,17 @@ class VirtualMemoryResource(MemoryResource): """ with Transaction() as trans: # Create new physical memory for the additional size - trans.append(_driver_call, driver.cuMemAddressFree, new_ptr, aligned_additional_size) + trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for creation - trans.append(_driver_call, driver.cuMemRelease, new_handle) + trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) # Map the new physical memory to the extended VA range res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(_driver_call, driver.cuMemUnmap, new_ptr, aligned_additional_size) + trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) # Set access permissions for the new portion descs = self._build_access_descriptors(prop) @@ -1176,13 +1173,13 @@ class VirtualMemoryResource(MemoryResource): res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(_driver_call, driver.cuMemAddressFree, new_ptr, total_aligned_size) + trans.append(lambda np=new_ptr, s=total_aligned_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) # Get the old allocation handle for remapping result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) raise_if_driver_error(result) # Register undo for old_handle - trans.append(_driver_call, driver.cuMemRelease, old_handle) + trans.append(lambda h=old_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) # Unmap the old VA range (aligned previous size) aligned_prev_size = total_aligned_size - aligned_additional_size @@ -1202,21 +1199,21 @@ class VirtualMemoryResource(MemoryResource): raise_if_driver_error(res) # Register undo for mapping - trans.append(_driver_call, driver.cuMemUnmap, new_ptr, aligned_prev_size) + trans.append(lambda np=new_ptr, s=aligned_prev_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) # Create new physical memory for the additional size res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for new physical memory - trans.append(_driver_call, driver.cuMemRelease, new_handle) + trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) # Map the new physical memory to the extended portion (aligned offset) res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # 
Register undo for mapping - trans.append(_driver_call, driver.cuMemUnmap, int(new_ptr) + aligned_prev_size, aligned_additional_size) + trans.append(lambda base=int(new_ptr), offs=aligned_prev_size, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(base + offs, s)[0])) # Set access permissions for the entire new range descs = self._build_access_descriptors(prop) @@ -1228,7 +1225,8 @@ class VirtualMemoryResource(MemoryResource): trans.commit() # Free the old VA range (aligned previous size) - handle_return(driver.cuMemAddressFree(int(buf.handle), aligned_prev_size)) + res2, = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + raise_if_driver_error(res2) # Invalidate the old buffer so its destructor won't try to free again buf._ptr = 0 @@ -1334,18 +1332,18 @@ class VirtualMemoryResource(MemoryResource): res, handle = driver.cuMemCreate(aligned_size, prop, 0) raise_if_driver_error(res) # Register undo for physical memory - trans.append(_driver_call, driver.cuMemRelease, handle) + trans.append(lambda h=handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) # ---- Reserve VA space ---- # Potentially, use a separate size for the VA reservation from the physical allocation size res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(_driver_call, driver.cuMemAddressFree, ptr, aligned_size) + trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemAddressFree(p, s)[0])) # ---- Map physical memory into VA ---- res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) - trans.append(_driver_call, driver.cuMemUnmap, ptr, aligned_size) + trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemUnmap(p, s)[0])) raise_if_driver_error(res) # ---- Set access for owner + peers ---- diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index dedb8ac53..83eef2f33 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -255,6 +255,7 @@ class Transaction: def __exit__(self, exc_type, exc, tb): # If exit callbacks remain, they'll run in LIFO order. + self._entered = False return self._stack.__exit__(exc_type, exc, tb) def append(self, fn, /, *args, **kwargs): From e90b9b0999e069376752a80951a939241f73864f Mon Sep 17 00:00:00 2001 From: Ben Glick Date: Thu, 2 Oct 2025 09:42:35 -0700 Subject: [PATCH 30/35] Update cuda_core/cuda/core/experimental/_memory.pyx Co-authored-by: Keith Kraus --- cuda_core/cuda/core/experimental/_memory.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index b453e8b26..ba376b9c3 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -28,8 +28,6 @@ from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_ if platform.system() == "Linux": import socket -# (Removed) helper for driver calls; use raise_if_driver_error with direct calls instead. 
- if TYPE_CHECKING: import cuda.bindings.driver From ae8263c320f3fcb91d4631f9a6f6ba2b36ddf194 Mon Sep 17 00:00:00 2001 From: Benjamin Glick Date: Thu, 2 Oct 2025 10:43:03 -0700 Subject: [PATCH 31/35] Handle missing error check and address review comments --- cuda_core/cuda/core/experimental/_memory.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index ba376b9c3..2ff306ea4 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -1088,7 +1088,7 @@ class VirtualMemoryResource(MemoryResource): if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): # Check for specific errors that are not recoverable with the slow path if res in (driver.CUresult.CUDA_ERROR_INVALID_VALUE, driver.CUresult.CUDA_ERROR_NOT_PERMITTED, driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, driver.CUresult.CUDA_ERROR_NOT_SUPPORTED): - raise RuntimeError(f"Failed to extend VA range: {res}") + raise_if_driver_error(res) res2, = driver.cuMemAddressFree(new_ptr, aligned_additional_size) raise_if_driver_error(res2) # Fallback: couldn't extend contiguously, need full remapping @@ -1187,7 +1187,8 @@ class VirtualMemoryResource(MemoryResource): def _remap_old(): # Try to remap the old physical memory back to the original VA range try: - driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) + res, = driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) + raise_if_driver_error(res) except Exception: pass trans.append(_remap_old) @@ -1307,9 +1308,6 @@ class VirtualMemoryResource(MemoryResource): prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) - if prop.type != driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: - raise NotImplementedError(f"Location type must be CU_MEM_LOCATION_TYPE_DEVICE, got {config.location_type}") - prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) prop.location.id = self.device.device_id if config.location_type == "device" else -1 prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 From fea55e0d680f2b6e409049501f9674ed97b0e940 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 7 Oct 2025 22:24:54 +0000 Subject: [PATCH 32/35] nit: hide non-public dataclass members --- cuda_core/cuda/core/experimental/_memory.pyx | 25 +++++++++++--------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 0fc6ef895..044d80196 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -1087,12 +1087,14 @@ class _SynchronousMemoryResource(MemoryResource): def device_id(self) -> int: return self._dev_id + VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none"] VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"] VirtualMemoryGranularityT = Literal["minimum", "recommended"] VirtualMemoryAccessTypeT = Literal["rw", "r", "none"] VirtualMemoryAllocationTypeT = Literal["pinned", "managed"] + @dataclass class VirtualMemoryResourceOptions: """A configuration object for the VirtualMemoryResource @@ -1122,20 +1124,21 @@ class VirtualMemoryResourceOptions: peers: Iterable[int] = field(default_factory=tuple) self_access: VirtualMemoryAccessTypeT = "rw" 
From fea55e0d680f2b6e409049501f9674ed97b0e940 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Tue, 7 Oct 2025 22:24:54 +0000
Subject: [PATCH 32/35] nit: hide non-public dataclass members

---
 cuda_core/cuda/core/experimental/_memory.pyx | 25 +++++++++++---------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 0fc6ef895..044d80196 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -1087,12 +1087,14 @@ class _SynchronousMemoryResource(MemoryResource):
     def device_id(self) -> int:
         return self._dev_id

+
 VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none"]
 VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"]
 VirtualMemoryGranularityT = Literal["minimum", "recommended"]
 VirtualMemoryAccessTypeT = Literal["rw", "r", "none"]
 VirtualMemoryAllocationTypeT = Literal["pinned", "managed"]

+
 @dataclass
 class VirtualMemoryResourceOptions:
     """A configuration object for the VirtualMemoryResource
@@ -1122,20 +1124,21 @@ class VirtualMemoryResourceOptions:
     peers: Iterable[int] = field(default_factory=tuple)
     self_access: VirtualMemoryAccessTypeT = "rw"
     peer_access: VirtualMemoryAccessTypeT = "rw"

-    a = driver.CUmemAccess_flags
-    _access_flags = {"rw": a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": a.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0}
-    h = driver.CUmemAllocationHandleType
-    _handle_types = {"none": h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": h.CU_MEM_HANDLE_TYPE_FABRIC}
-    g = driver.CUmemAllocationGranularity_flags
-    _granularity = {"recommended": g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": g.CU_MEM_ALLOC_GRANULARITY_MINIMUM}
-    l = driver.CUmemLocationType
-    _location_type = {"device": l.CU_MEM_LOCATION_TYPE_DEVICE, "host": l.CU_MEM_LOCATION_TYPE_HOST, "host_numa": l.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT}
+
+    _a = driver.CUmemAccess_flags
+    _access_flags = {"rw": _a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": _a.CU_MEM_ACCESS_FLAGS_PROT_READ, "none": 0}
+    _h = driver.CUmemAllocationHandleType
+    _handle_types = {"none": _h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC}
+    _g = driver.CUmemAllocationGranularity_flags
+    _granularity = {"recommended": _g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": _g.CU_MEM_ALLOC_GRANULARITY_MINIMUM}
+    _l = driver.CUmemLocationType
+    _location_type = {"device": _l.CU_MEM_LOCATION_TYPE_DEVICE, "host": _l.CU_MEM_LOCATION_TYPE_HOST, "host_numa": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT}
     # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not
-    a = driver.CUmemAllocationType
-    _allocation_type = {"pinned": a.CU_MEM_ALLOCATION_TYPE_PINNED}
+    _a = driver.CUmemAllocationType
+    _allocation_type = {"pinned": _a.CU_MEM_ALLOCATION_TYPE_PINNED}
     ver_major, ver_minor = get_binding_version()
     if ver_major >= 13:
-        _allocation_type["managed"] = a.CU_MEM_ALLOCATION_TYPE_MANAGED
+        _allocation_type["managed"] = _a.CU_MEM_ALLOCATION_TYPE_MANAGED

     @staticmethod
     def _access_to_flags(spec: str):

From 6450712da7e5d1e43b48d0168e534605fa5f552b Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Tue, 7 Oct 2025 22:28:08 +0000
Subject: [PATCH 33/35] add basic docs

---
 cuda_core/docs/source/api.rst                 | 2 ++
 cuda_core/docs/source/release/0.X.Y-notes.rst | 1 +
 2 files changed, 3 insertions(+)

diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index f239c69cd..d7f4d3642 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -27,6 +27,7 @@ CUDA runtime
    MemoryResource
    DeviceMemoryResource
    LegacyPinnedMemoryResource
+   VirtualMemoryResource

    :template: dataclass.rst

@@ -36,6 +37,7 @@ CUDA runtime
    GraphDebugPrintOptions
    StreamOptions
    LaunchConfig
+   VirtualMemoryResourceOptions


 CUDA compilation toolchain

diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst
index 7c1487329..7907839e8 100644
--- a/cuda_core/docs/source/release/0.X.Y-notes.rst
+++ b/cuda_core/docs/source/release/0.X.Y-notes.rst
@@ -32,6 +32,7 @@ New features
 - Stream-ordered memory allocation can now be shared on Linux via :class:`DeviceMemoryResource`.
 - Added NVVM IR support to :class:`Program`. NVVM IR is now understood with ``code_type="nvvm"``.
 - Added an :attr:`ObjectCode.code_type` attribute for querying the code type.
+- Added :class:`VirtualMemoryResource` for low-level virtual memory management.

 New examples
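With the docs entries in place, end-to-end use looks roughly like the tests further below. This sketch assumes VirtualMemoryResource and VirtualMemoryResourceOptions are re-exported from cuda.core.experimental (as the api.rst listing implies), that Buffer.close() behaves as elsewhere in cuda.core, and that the platform is Linux, per the Windows restriction added in the final patch:

    from cuda.core.experimental import Device, VirtualMemoryResource, VirtualMemoryResourceOptions

    device = Device()
    device.set_current()

    # Defaults are fine on Linux; the dataclass fields above list the available knobs.
    options = VirtualMemoryResourceOptions()
    vmm_mr = VirtualMemoryResource(device, config=options)

    buffer = vmm_mr.allocate(4096)  # size is rounded up to the allocation granularity
    # ... launch work that reads/writes the buffer ...
    buffer.close()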
From 4af54ac5ec9015f19bcc27e5a083fc77318c1805 Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Tue, 7 Oct 2025 17:11:12 -0700
Subject: [PATCH 34/35] add windows support

---
 cuda_core/cuda/core/experimental/_memory.pyx |  2 +-
 cuda_core/tests/test_memory.py               | 13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index 044d80196..a7ec0751c 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -1088,7 +1088,7 @@ class _SynchronousMemoryResource(MemoryResource):
         return self._dev_id


-VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none"]
+VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none", "win32"]
 VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"]
 VirtualMemoryGranularityT = Literal["minimum", "recommended"]
 VirtualMemoryAccessTypeT = Literal["rw", "r", "none"]
 VirtualMemoryAllocationTypeT = Literal["pinned", "managed"]

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 028055a54..f7d07260f 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -322,9 +322,11 @@ def test_vmm_allocator_basic_allocation():
     """
     device = Device()
     device.set_current()
-
+    options = VirtualMemoryResourceOptions()
+    if platform.system() == "Windows":
+        options.handle_type = "win32"
     # Create VMM allocator with default config
-    vmm_mr = VirtualMemoryResource(device)
+    vmm_mr = VirtualMemoryResource(device, config=options)

     # Test basic allocation
     buffer = vmm_mr.allocate(4096)
@@ -363,7 +365,7 @@ def test_vmm_allocator_policy_configuration():
         location_type="device",
         granularity="minimum",
         gpu_direct_rdma=True,
-        handle_type="posix_fd",
+        handle_type="posix_fd" if platform.system() != "Windows" else "win32",
         peers=(),
         self_access="rw",
         peer_access="rw",
@@ -412,7 +414,10 @@ def test_vmm_allocator_grow_allocation():
     device = Device()
     device.set_current()

-    vmm_mr = VirtualMemoryResource(device)
+    options = VirtualMemoryResourceOptions()
+    if platform.system() == "Windows":
+        options.handle_type = "win32"
+    vmm_mr = VirtualMemoryResource(device, config=options)

     # Create initial allocation
     buffer = vmm_mr.allocate(2 * 1024 * 1024)
From 9db04b10715f1c83c26cf8cbf1700bfe8f431bfa Mon Sep 17 00:00:00 2001
From: Benjamin Glick
Date: Tue, 7 Oct 2025 18:38:19 -0700
Subject: [PATCH 35/35] remove windows tests

---
 cuda_core/cuda/core/experimental/_memory.pyx |  4 +++-
 cuda_core/tests/test_memory.py               | 11 +++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx
index a7ec0751c..3786f066b 100644
--- a/cuda_core/cuda/core/experimental/_memory.pyx
+++ b/cuda_core/cuda/core/experimental/_memory.pyx
@@ -1088,7 +1088,7 @@ class _SynchronousMemoryResource(MemoryResource):
         return self._dev_id


-VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none", "win32"]
+VirtualMemoryHandleTypeT = Literal["posix_fd", "generic", "none", "win32", "win32_kmt", "fabric"]
 VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"]
 VirtualMemoryGranularityT = Literal["minimum", "recommended"]
 VirtualMemoryAccessTypeT = Literal["rw", "r", "none"]
 VirtualMemoryAllocationTypeT = Literal["pinned", "managed"]
@@ -1194,6 +1194,8 @@ class VirtualMemoryResource(MemoryResource):
         )
         if self.config.location_type == "host":
             self.device = None
+        if platform.system() == "Windows":
+            raise NotImplementedError("VirtualMemoryResource is not supported on Windows")

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index f7d07260f..8c980837e 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -320,11 +320,11 @@ def test_vmm_allocator_basic_allocation():
     This test verifies that VirtualMemoryResource can allocate memory
     using CUDA VMM APIs with default configuration.
     """
+    if platform.system() == "Windows":
+        pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
     device = Device()
     device.set_current()
     options = VirtualMemoryResourceOptions()
-    if platform.system() == "Windows":
-        options.handle_type = "win32"
     # Create VMM allocator with default config
     vmm_mr = VirtualMemoryResource(device, config=options)

@@ -356,6 +356,8 @@ def test_vmm_allocator_policy_configuration():
     with different allocation policies and that the configuration
     affects the allocation behavior.
     """
+    if platform.system() == "Windows":
+        pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
     device = Device()
     device.set_current()

@@ -411,12 +413,13 @@ def test_vmm_allocator_grow_allocation():
    This test verifies that VirtualMemoryResource can grow existing allocations
    while preserving the base pointer when possible.
    """
+    if platform.system() == "Windows":
+        pytest.skip("VirtualMemoryResource is not supported on Windows TCC")
     device = Device()
     device.set_current()

     options = VirtualMemoryResourceOptions()
-    if platform.system() == "Windows":
-        options.handle_type = "win32"
+
     vmm_mr = VirtualMemoryResource(device, config=options)

     # Create initial allocation