2 changes: 1 addition & 1 deletion cuda_core/cuda/core/experimental/__init__.py
@@ -14,7 +14,7 @@
from cuda.core.experimental._launch_config import LaunchConfig
from cuda.core.experimental._launcher import launch
from cuda.core.experimental._linker import Linker, LinkerOptions
-from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource
+from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, LegacyPinnedMemoryResource, MemoryResource, VMMAllocatedMemoryResource
from cuda.core.experimental._module import Kernel, ObjectCode
from cuda.core.experimental._program import Program, ProgramOptions
from cuda.core.experimental._stream import Stream, StreamOptions
36 changes: 36 additions & 0 deletions cuda_core/cuda/core/experimental/_device.py
@@ -1312,6 +1312,42 @@ def allocate(self, size, stream: Optional[Stream] = None) -> Buffer:
            stream = default_stream()
        return self._mr.allocate(size, stream)

    def create_vmm_memory_resource(self, allocation_type=None) -> "VMMAllocatedMemoryResource":
        """Create a VMMAllocatedMemoryResource for this device.

        Creates a memory resource that uses CUDA's Virtual Memory Management APIs
        for fine-grained control over memory allocation and mapping. This is useful for:

        - NVSHMEM/NCCL external buffer registration
        - Growing allocations without changing pointer addresses
        - EGM (Extended GPU Memory) on Grace-Hopper or Grace-Blackwell systems
        - Custom memory access patterns and sharing between processes

        Parameters
        ----------
        allocation_type : driver.CUmemAllocationType, optional
            The type of memory allocation. Defaults to CU_MEM_ALLOCATION_TYPE_PINNED.

        Returns
        -------
        VMMAllocatedMemoryResource
            A newly-created VMMAllocatedMemoryResource for this device.

        Raises
        ------
        RuntimeError
            If this device does not support virtual memory management.

        Examples
        --------
        >>> device = Device()
        >>> vmm_mr = device.create_vmm_memory_resource()
        >>> device.memory_resource = vmm_mr  # Set as default for the device
        >>> buffer = device.allocate(1024)  # Now uses VMM allocation
        """
        from cuda.core.experimental._memory import VMMAllocatedMemoryResource

        return VMMAllocatedMemoryResource(self._id, allocation_type)

    def sync(self):
        """Synchronize the device.

169 changes: 169 additions & 0 deletions cuda_core/cuda/core/experimental/_memory.pyx
@@ -508,3 +508,172 @@ class _SynchronousMemoryResource(MemoryResource):
    @property
    def device_id(self) -> int:
        return self._dev_id


class VMMAllocatedMemoryResource(MemoryResource):
    """Create a memory resource that uses CUDA's Virtual Memory Management APIs.

    This memory resource uses cuMemCreate, cuMemAddressReserve, cuMemMap, and related
    APIs to provide fine-grained control over memory allocation and mapping. This is
    useful for:

    - NVSHMEM/NCCL external buffer registration
    - Growing allocations without changing pointer addresses
    - EGM (Extended GPU Memory) on Grace-Hopper or Grace-Blackwell systems
    - Custom memory access patterns and sharing between processes

    Parameters
    ----------
    device_id : int
        Device ordinal for which memory allocations will be created.
    allocation_type : driver.CUmemAllocationType, optional
        The type of memory allocation. Defaults to CU_MEM_ALLOCATION_TYPE_PINNED.
    """

    __slots__ = ("_dev_id", "_allocation_type", "_allocations")

    def __init__(self, device_id: int, allocation_type=None):
        if allocation_type is None:
            allocation_type = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED

        self._dev_id = device_id
        self._allocation_type = allocation_type
        self._allocations = {}  # Track allocations: ptr -> (handle, reserved_ptr, size)
        self._handle = None

        # Check if device supports virtual memory management
        err, vmm_supported = driver.cuDeviceGetAttribute(
            driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
            device_id
        )
        raise_if_driver_error(err)
        if not vmm_supported:
            raise RuntimeError(f"Device {device_id} does not support virtual memory management")

    def allocate(self, size_t size, stream: Stream = None) -> Buffer:
        """Allocate a buffer using virtual memory management APIs.

        Parameters
        ----------
        size : int
            The size of the buffer to allocate, in bytes.
        stream : Stream, optional
            Currently ignored as VMM operations are synchronous.

        Returns
        -------
        Buffer
            The allocated buffer object, which is accessible on the device.
        """
        # Get allocation granularity
        allocation_prop = driver.CUmemAllocationProp()
        allocation_prop.type = self._allocation_type
        allocation_prop.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
        allocation_prop.location.id = self._dev_id
        allocation_prop.requestedHandleTypes = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE

        err, granularity = driver.cuMemGetAllocationGranularity(
            allocation_prop,
            driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM
        )
        raise_if_driver_error(err)

        # Round size up to granularity
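        # (e.g., size=1000 with a 2 MiB granularity, a common minimum, yields aligned_size = 2 MiB)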
        aligned_size = ((size + granularity - 1) // granularity) * granularity

        # Create the memory allocation
        err, mem_handle = driver.cuMemCreate(aligned_size, allocation_prop, 0)
        raise_if_driver_error(err)

        # Reserve address space
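        # cuMemAddressReserve arguments: size, alignment (0 = default), fixed VA hint (0 = none), flags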
        err, reserved_ptr = driver.cuMemAddressReserve(aligned_size, 0, 0, 0)
        raise_if_driver_error(err)

        try:
            # Map the allocation to the reserved address
            err, = driver.cuMemMap(reserved_ptr, aligned_size, 0, mem_handle, 0)
            raise_if_driver_error(err)

            # Set access permissions
            access_desc = driver.CUmemAccessDesc()
            access_desc.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
            access_desc.location.id = self._dev_id
            access_desc.flags = driver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE

            err, = driver.cuMemSetAccess(reserved_ptr, aligned_size, [access_desc], 1)
            raise_if_driver_error(err)

            # Store allocation info for cleanup
            self._allocations[int(reserved_ptr)] = (mem_handle, reserved_ptr, aligned_size)

            return Buffer._init(reserved_ptr, size, self)

        except Exception:
            # Clean up on error
            try:
                driver.cuMemAddressFree(reserved_ptr, aligned_size)
            except Exception:
                pass
            try:
                driver.cuMemRelease(mem_handle)
            except Exception:
                pass
            raise

    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None):
        """Deallocate a buffer previously allocated by this resource.

        Parameters
        ----------
        ptr : DevicePointerT
            The pointer to the buffer to deallocate.
        size : int
            The size of the buffer to deallocate, in bytes.
        stream : Stream, optional
            Currently ignored as VMM operations are synchronous.
        """
        ptr_int = int(ptr)
        if ptr_int not in self._allocations:
            raise ValueError(f"Pointer 0x{ptr_int:x} was not allocated by this memory resource")

        mem_handle, reserved_ptr, aligned_size = self._allocations.pop(ptr_int)

        # Unmap the memory
        err, = driver.cuMemUnmap(reserved_ptr, aligned_size)
        raise_if_driver_error(err)

        # Free the address reservation
        err, = driver.cuMemAddressFree(reserved_ptr, aligned_size)
        raise_if_driver_error(err)

        # Release the memory handle
        err, = driver.cuMemRelease(mem_handle)
        raise_if_driver_error(err)

    @property
    def is_device_accessible(self) -> bool:
        """bool: this memory resource provides device-accessible buffers."""
        return True

    @property
    def is_host_accessible(self) -> bool:
        """bool: this memory resource does not provide host-accessible buffers by default."""
        # VMM allocations are typically device-only unless specifically configured for host access
        return False

    @property
    def device_id(self) -> int:
        """int: the associated device ordinal."""
        return self._dev_id

    def __del__(self):
        """Clean up any remaining allocations."""
        for ptr_int, (mem_handle, reserved_ptr, aligned_size) in list(self._allocations.items()):
            try:
                driver.cuMemUnmap(reserved_ptr, aligned_size)
                driver.cuMemAddressFree(reserved_ptr, aligned_size)
                driver.cuMemRelease(mem_handle)
            except Exception:
                pass  # Ignore errors during cleanup
        self._allocations.clear()
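To make the "growing allocations without changing pointer addresses" bullet concrete, here is a minimal sketch of that pattern, separate from the class above. It assumes an initialized CUDA context, a `chunk_size` that is a multiple of the allocation granularity (see cuMemGetAllocationGranularity), and the same `driver` binding used in this file; `make_growable` is a hypothetical helper name and error handling is elided:

from cuda.core.experimental._utils.cuda_utils import driver

def make_growable(dev_id: int, max_size: int, chunk_size: int):
    # Reserve the full virtual range once; the base address never changes.
    err, base = driver.cuMemAddressReserve(max_size, 0, 0, 0)

    prop = driver.CUmemAllocationProp()
    prop.type = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
    prop.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
    prop.location.id = dev_id

    access = driver.CUmemAccessDesc()
    access.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
    access.location.id = dev_id
    access.flags = driver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE

    mapped = 0

    def grow():
        # Back one more chunk of the reserved range with physical memory;
        # existing pointers into [base, base + mapped) remain valid.
        nonlocal mapped
        err, handle = driver.cuMemCreate(chunk_size, prop, 0)
        driver.cuMemMap(int(base) + mapped, chunk_size, 0, handle, 0)
        driver.cuMemSetAccess(int(base) + mapped, chunk_size, [access], 1)
        mapped += chunk_size

    return base, grow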
103 changes: 103 additions & 0 deletions cuda_core/examples/vmm_memory_example.py
@@ -0,0 +1,103 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""
Example demonstrating the VMMAllocatedMemoryResource for fine-grained memory management.

This example shows how to use CUDA's Virtual Memory Management APIs through the
VMMAllocatedMemoryResource class for advanced memory allocation scenarios.
"""

import sys

from cuda.core.experimental import Device
from cuda.core.experimental._utils.cuda_utils import driver


def main():
    """Demonstrate VMMAllocatedMemoryResource usage."""
    try:
        # Get the default device and establish a CUDA context on it
        # (required before any allocation or mapping call)
        device = Device()
        device.set_current()
        print(f"Using device {device.device_id}: {device.name}")

        # Check if device supports virtual memory management
        err, vmm_supported = driver.cuDeviceGetAttribute(
            driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
            device.device_id
        )

        if err != driver.CUresult.CUDA_SUCCESS or not vmm_supported:
            print(f"Device {device.device_id} does not support virtual memory management.")
            print("This feature requires a CUDA 10.2+ driver and a GPU that reports VMM support.")
            sys.exit(1)

        print(f"Device {device.device_id} supports virtual memory management!")

        # Create a VMMAllocatedMemoryResource using the convenience method
        vmm_mr = device.create_vmm_memory_resource()
        print(f"Created VMMAllocatedMemoryResource for device {device.device_id}")

        # Optionally set it as the default memory resource for the device
        # device.memory_resource = vmm_mr

        # Create a stream for operations
        stream = device.create_stream()

        # Allocate some memory using VMM
        sizes = [1024, 4096, 1024*1024]  # 1KB, 4KB, 1MB
        buffers = []

        print("\nAllocating buffers using VMM:")
        for i, size in enumerate(sizes):
            buffer = vmm_mr.allocate(size, stream)
            buffers.append(buffer)
            print(f"  Buffer {i+1}: {size:,} bytes at address 0x{int(buffer.handle):016x}")

        # Verify properties of the last allocated buffer
        assert buffer.is_device_accessible
        assert not buffer.is_host_accessible
        assert buffer.device_id == device.device_id
        assert buffer.memory_resource is vmm_mr

        # Demonstrate buffer copying; cuda.core requires src and dst sizes to match
        print("\nCopying buffer 1 into a new buffer of the same size...")
        # Note: In a real application, you would initialize buffer 1 with data first
        dst = vmm_mr.allocate(sizes[0], stream)
        dst.copy_from(buffers[0], stream=stream)
        stream.sync()  # Wait for the copy to complete
        dst.close()
        print("Copy completed!")

        # Clean up buffers
        print("\nCleaning up buffers:")
        for i, buffer in enumerate(buffers):
            buffer.close()
            print(f"  Buffer {i+1} deallocated")

        print("\nVMM memory management example completed successfully!")

        # Demonstrate advanced usage: custom allocation type
        print("\nDemonstrating custom allocation type:")
        try:
            # Create with managed memory type (if supported)
            vmm_mr_managed = device.create_vmm_memory_resource(
                driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED
            )

            managed_buffer = vmm_mr_managed.allocate(4096, stream)
            print(f"  Managed buffer: 4096 bytes at address 0x{int(managed_buffer.handle):016x}")
            managed_buffer.close()
            print("  Managed buffer deallocated")

        except Exception as e:
            print(f"  Managed memory allocation failed: {e}")
            print("  This is expected on some systems/drivers")

    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
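One more note on the "sharing between processes" bullet: the resource above requests CU_MEM_HANDLE_TYPE_NONE, so its allocations cannot be exported to other processes as-is. Below is a minimal sketch of what sharing requires, assuming a Linux system (POSIX file descriptor handles), an initialized CUDA context, and a granularity-aligned size; it is illustrative only and not part of this change:

from cuda.core.experimental._utils.cuda_utils import driver

prop = driver.CUmemAllocationProp()
prop.type = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
prop.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
prop.location.id = 0  # device ordinal
# Request an exportable handle type at creation time
prop.requestedHandleTypes = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR

size = 2 * 1024 * 1024  # must be a multiple of the allocation granularity
err, handle = driver.cuMemCreate(size, prop, 0)

# Export an OS file descriptor; a peer process imports it with
# cuMemImportFromShareableHandle and then maps it into its own VA range.
err, fd = driver.cuMemExportToShareableHandle(
    handle, driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0
)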