From 075022750945b72ca4bafd73e28b569a7dcacf53 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 22 Oct 2025 14:44:40 -0700 Subject: [PATCH 1/4] Checking for RDMA support before allocating via VMM --- cuda_core/tests/test_memory.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 904997f11..8400c99cb 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -28,7 +28,7 @@ from cuda.core.experimental._utils.cuda_utils import handle_return from cuda.core.experimental.utils import StridedMemoryView -from cuda_python_test_helpers import IS_WSL, supports_ipc_mempool +from cuda_python_test_helpers import supports_ipc_mempool POOL_SIZE = 2097152 # 2MB size @@ -324,11 +324,15 @@ def test_vmm_allocator_basic_allocation(): """ if platform.system() == "Windows": pytest.skip("VirtualMemoryResource is not supported on Windows TCC") - if IS_WSL: - pytest.skip("VirtualMemoryResource is not supported on WSL") + device = Device() device.set_current() + + # Validate GPU Direct RDMA support before using it + if not device.properties.gpu_direct_rdma_supported: + pytest.skip("GPU Direct RDMA is not supported on this device") + options = VirtualMemoryResourceOptions() # Create VMM allocator with default config vmm_mr = VirtualMemoryResource(device, config=options) @@ -363,11 +367,14 @@ def test_vmm_allocator_policy_configuration(): """ if platform.system() == "Windows": pytest.skip("VirtualMemoryResource is not supported on Windows TCC") - if IS_WSL: - pytest.skip("VirtualMemoryResource is not supported on WSL") + device = Device() device.set_current() + # Validate GPU Direct RDMA support before using it + if not device.properties.gpu_direct_rdma_supported: + pytest.skip("GPU Direct RDMA is not supported on this device") + # Test with custom VMM config custom_config = VirtualMemoryResourceOptions( allocation_type="pinned", @@ -422,11 +429,14 @@ def test_vmm_allocator_grow_allocation(): """ if platform.system() == "Windows": pytest.skip("VirtualMemoryResource is not supported on Windows TCC") - if IS_WSL: - pytest.skip("VirtualMemoryResource is not supported on WSL") + device = Device() device.set_current() + # Validate GPU Direct RDMA support before using it + if not device.properties.gpu_direct_rdma_supported: + pytest.skip("GPU Direct RDMA is not supported on this device") + options = VirtualMemoryResourceOptions() vmm_mr = VirtualMemoryResource(device, config=options) From aba3a171d9e75782816d8a6c2d4d18aacbefda55 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 22 Oct 2025 14:49:54 -0700 Subject: [PATCH 2/4] whitespace --- cuda_core/tests/test_memory.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 8400c99cb..6682f19e2 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -325,7 +325,6 @@ def test_vmm_allocator_basic_allocation(): if platform.system() == "Windows": pytest.skip("VirtualMemoryResource is not supported on Windows TCC") - device = Device() device.set_current() From 50224bba165926c7ebc83b55a14e7b3fc18ec746 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 22 Oct 2025 15:59:23 -0700 Subject: [PATCH 3/4] Improving the test_memory suite. --- cuda_core/cuda/core/experimental/_memory.pyx | 5 ++ cuda_core/tests/test_memory.py | 74 +++++++++++++++++--- 2 files changed, 70 insertions(+), 9 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 024ffa2ae..6c891315a 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -1197,6 +1197,11 @@ class VirtualMemoryResource(MemoryResource): self.device = None if platform.system() == "Windows": raise NotImplementedError("VirtualMemoryResource is not supported on Windows") + + # Validate RDMA support if requested + if self.config.gpu_direct_rdma and self.device is not None: + if not self.device.properties.gpu_direct_rdma_supported: + raise RuntimeError("GPU Direct RDMA is not supported on this device") @staticmethod def _align_up(size: int, gran: int) -> int: diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 6682f19e2..8299af6dd 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -322,9 +322,6 @@ def test_vmm_allocator_basic_allocation(): This test verifies that VirtualMemoryResource can allocate memory using CUDA VMM APIs with default configuration. """ - if platform.system() == "Windows": - pytest.skip("VirtualMemoryResource is not supported on Windows TCC") - device = Device() device.set_current() @@ -364,9 +361,6 @@ def test_vmm_allocator_policy_configuration(): with different allocation policies and that the configuration affects the allocation behavior. """ - if platform.system() == "Windows": - pytest.skip("VirtualMemoryResource is not supported on Windows TCC") - device = Device() device.set_current() @@ -426,9 +420,6 @@ def test_vmm_allocator_grow_allocation(): This test verifies that VirtualMemoryResource can grow existing allocations while preserving the base pointer when possible. """ - if platform.system() == "Windows": - pytest.skip("VirtualMemoryResource is not supported on Windows TCC") - device = Device() device.set_current() @@ -467,6 +458,71 @@ def test_vmm_allocator_grow_allocation(): grown_buffer.close() +def test_vmm_allocator_rdma_validation(): + """Test that VirtualMemoryResource properly handles RDMA configuration. + + This test verifies that the VirtualMemoryResource constructor properly validates + RDMA support when gpu_direct_rdma=True is requested. + """ + device = Device() + device.set_current() + + # Skip if virtual memory management is not supported + if not device.properties.virtual_memory_management_supported: + pytest.skip("Virtual memory management is not supported on this device") + + # Skip if GPU Direct RDMA is not supported (we need it for this test) + if not device.properties.gpu_direct_rdma_supported: + pytest.skip("GPU Direct RDMA is not supported on this device") + + # Test that RDMA works when device supports it + options = VirtualMemoryResourceOptions(gpu_direct_rdma=True) + vmm_mr = VirtualMemoryResource(device, config=options) + + # Test basic allocation with RDMA enabled + buffer = vmm_mr.allocate(4096) + assert buffer.size >= 4096 + assert buffer.device_id == device.device_id + + # Clean up + buffer.close() + + # Test that RDMA=False also works + options_no_rdma = VirtualMemoryResourceOptions(gpu_direct_rdma=False) + vmm_mr_no_rdma = VirtualMemoryResource(device, config=options_no_rdma) + + # Test basic allocation without RDMA + buffer_no_rdma = vmm_mr_no_rdma.allocate(4096) + assert buffer_no_rdma.size >= 4096 + assert buffer_no_rdma.device_id == device.device_id + + # Clean up + buffer_no_rdma.close() + + +def test_vmm_allocator_rdma_unsupported_exception(): + """Test that VirtualMemoryResource throws an exception when RDMA is requested but device doesn't support it. + + This test verifies that the VirtualMemoryResource constructor throws a RuntimeError + when gpu_direct_rdma=True is requested but the device doesn't support virtual memory management. + """ + device = Device() + device.set_current() + + # Skip if virtual memory management is not supported (we need it for VMM) + if not device.properties.virtual_memory_management_supported: + pytest.skip("Virtual memory management is not supported on this device") + + # Skip if GPU Direct RDMA is supported (we want to test the unsupported case) + if device.properties.gpu_direct_rdma_supported: + pytest.skip("This test requires a device that doesn't support GPU Direct RDMA") + + # Test that requesting RDMA on an unsupported device throws an exception + options = VirtualMemoryResourceOptions(gpu_direct_rdma=True) + with pytest.raises(RuntimeError, match="GPU Direct RDMA is not supported on this device"): + VirtualMemoryResource(device, config=options) + + def test_mempool(mempool_device): device = mempool_device From ebc681869673d108f21a3585b95974d8c4854c4f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 22 Oct 2025 16:15:41 -0700 Subject: [PATCH 4/4] improving tests and skip checks --- cuda_core/cuda/core/experimental/_memory.pyx | 4 +- cuda_core/tests/test_memory.py | 70 +++++--------------- 2 files changed, 18 insertions(+), 56 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 6c891315a..7c79b775f 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -1119,7 +1119,7 @@ class VirtualMemoryResourceOptions: location_type: VirtualMemoryLocationTypeT = "device" handle_type: VirtualMemoryHandleTypeT = "posix_fd" granularity: VirtualMemoryGranularityT = "recommended" - gpu_direct_rdma: bool = True + gpu_direct_rdma: bool = False addr_hint: Optional[int] = 0 addr_align: Optional[int] = None peers: Iterable[int] = field(default_factory=tuple) @@ -1197,7 +1197,7 @@ class VirtualMemoryResource(MemoryResource): self.device = None if platform.system() == "Windows": raise NotImplementedError("VirtualMemoryResource is not supported on Windows") - + # Validate RDMA support if requested if self.config.gpu_direct_rdma and self.device is not None: if not self.device.properties.gpu_direct_rdma_supported: diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 8299af6dd..3e69bab42 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -325,9 +325,9 @@ def test_vmm_allocator_basic_allocation(): device = Device() device.set_current() - # Validate GPU Direct RDMA support before using it - if not device.properties.gpu_direct_rdma_supported: - pytest.skip("GPU Direct RDMA is not supported on this device") + # Skip if virtual memory management is not supported + if not device.properties.virtual_memory_management_supported: + pytest.skip("Virtual memory management is not supported on this device") options = VirtualMemoryResourceOptions() # Create VMM allocator with default config @@ -364,9 +364,13 @@ def test_vmm_allocator_policy_configuration(): device = Device() device.set_current() - # Validate GPU Direct RDMA support before using it + # Skip if virtual memory management is not supported + if not device.properties.virtual_memory_management_supported: + pytest.skip("Virtual memory management is not supported on this device") + + # Skip if GPU Direct RDMA is supported (we want to test the unsupported case) if not device.properties.gpu_direct_rdma_supported: - pytest.skip("GPU Direct RDMA is not supported on this device") + pytest.skip("This test requires a device that doesn't support GPU Direct RDMA") # Test with custom VMM config custom_config = VirtualMemoryResourceOptions( @@ -423,9 +427,9 @@ def test_vmm_allocator_grow_allocation(): device = Device() device.set_current() - # Validate GPU Direct RDMA support before using it - if not device.properties.gpu_direct_rdma_supported: - pytest.skip("GPU Direct RDMA is not supported on this device") + # Skip if virtual memory management is not supported (we need it for VMM) + if not device.properties.virtual_memory_management_supported: + pytest.skip("Virtual memory management is not supported on this device") options = VirtualMemoryResourceOptions() @@ -458,65 +462,23 @@ def test_vmm_allocator_grow_allocation(): grown_buffer.close() -def test_vmm_allocator_rdma_validation(): - """Test that VirtualMemoryResource properly handles RDMA configuration. - - This test verifies that the VirtualMemoryResource constructor properly validates - RDMA support when gpu_direct_rdma=True is requested. - """ - device = Device() - device.set_current() - - # Skip if virtual memory management is not supported - if not device.properties.virtual_memory_management_supported: - pytest.skip("Virtual memory management is not supported on this device") - - # Skip if GPU Direct RDMA is not supported (we need it for this test) - if not device.properties.gpu_direct_rdma_supported: - pytest.skip("GPU Direct RDMA is not supported on this device") - - # Test that RDMA works when device supports it - options = VirtualMemoryResourceOptions(gpu_direct_rdma=True) - vmm_mr = VirtualMemoryResource(device, config=options) - - # Test basic allocation with RDMA enabled - buffer = vmm_mr.allocate(4096) - assert buffer.size >= 4096 - assert buffer.device_id == device.device_id - - # Clean up - buffer.close() - - # Test that RDMA=False also works - options_no_rdma = VirtualMemoryResourceOptions(gpu_direct_rdma=False) - vmm_mr_no_rdma = VirtualMemoryResource(device, config=options_no_rdma) - - # Test basic allocation without RDMA - buffer_no_rdma = vmm_mr_no_rdma.allocate(4096) - assert buffer_no_rdma.size >= 4096 - assert buffer_no_rdma.device_id == device.device_id - - # Clean up - buffer_no_rdma.close() - - def test_vmm_allocator_rdma_unsupported_exception(): """Test that VirtualMemoryResource throws an exception when RDMA is requested but device doesn't support it. - + This test verifies that the VirtualMemoryResource constructor throws a RuntimeError when gpu_direct_rdma=True is requested but the device doesn't support virtual memory management. """ device = Device() device.set_current() - + # Skip if virtual memory management is not supported (we need it for VMM) if not device.properties.virtual_memory_management_supported: pytest.skip("Virtual memory management is not supported on this device") - + # Skip if GPU Direct RDMA is supported (we want to test the unsupported case) if device.properties.gpu_direct_rdma_supported: pytest.skip("This test requires a device that doesn't support GPU Direct RDMA") - + # Test that requesting RDMA on an unsupported device throws an exception options = VirtualMemoryResourceOptions(gpu_direct_rdma=True) with pytest.raises(RuntimeError, match="GPU Direct RDMA is not supported on this device"):