From 846c75cf16572673f20c940d7a25a46db23ad808 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Sep 2025 10:48:54 -0700 Subject: [PATCH 01/25] Restructures IPC mempool tests into a subdirectory. --- cuda_core/tests/ipc/conftest.py | 33 +++++ cuda_core/tests/ipc/test_ipc_errors.py | 45 ++++++ cuda_core/tests/ipc/test_ipc_mempool.py | 51 +++++++ cuda_core/tests/ipc/utility.py | 70 +++++++++ cuda_core/tests/test_ipc_mempool.py | 179 ------------------------ 5 files changed, 199 insertions(+), 179 deletions(-) create mode 100644 cuda_core/tests/ipc/conftest.py create mode 100644 cuda_core/tests/ipc/test_ipc_errors.py create mode 100644 cuda_core/tests/ipc/test_ipc_mempool.py create mode 100644 cuda_core/tests/ipc/utility.py delete mode 100644 cuda_core/tests/test_ipc_mempool.py diff --git a/cuda_core/tests/ipc/conftest.py b/cuda_core/tests/ipc/conftest.py new file mode 100644 index 000000000..2ac6d858b --- /dev/null +++ b/cuda_core/tests/ipc/conftest.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from cuda.core.experimental import Device, DeviceMemoryResource + +POOL_SIZE = 2097152 + +@pytest.fixture(scope="function") +def device(): + """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" + # Check if IPC is supported on this platform/device + device = Device() + device.set_current() + + if not device.properties.memory_pools_supported: + pytest.skip("Device does not support mempool operations") + + # Note: Linux specific. Once Windows support for IPC is implemented, this + # test should be updated. 
+ if not device.properties.handle_type_posix_file_descriptor_supported: + pytest.skip("Device does not support IPC") + + return device + + +@pytest.fixture(scope="function") +def ipc_memory_resource(device): + mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) + assert mr.is_ipc_enabled + return mr + + diff --git a/cuda_core/tests/ipc/test_ipc_errors.py b/cuda_core/tests/ipc/test_ipc_errors.py new file mode 100644 index 000000000..4ac3277e6 --- /dev/null +++ b/cuda_core/tests/ipc/test_ipc_errors.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import Device, DeviceMemoryResource, IPCChannel +import multiprocessing +import pytest + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 + +def test_ipc_errors(device, ipc_memory_resource): + """Test expected errors with allocating from a shared IPC memory pool.""" + mr = ipc_memory_resource + # Set up the IPC-enabled memory pool and share it. + channel = IPCChannel() + mr.share_to_channel(channel) + + # Start a child process to generate error info. + queue = multiprocessing.Queue() + process = multiprocessing.Process(target=child_main, args=(channel, queue)) + process.start() + + # Check the errors. + exc_type, exc_msg = queue.get(timeout=CHILD_TIMEOUT_SEC) + assert exc_type is TypeError + assert exc_msg == "Cannot allocate from shared memory pool imported via IPC" + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + +def child_main(channel, queue): + """Child process that pushes IPC errors to a shared queue for testing.""" + device = Device() + device.set_current() + + mr = DeviceMemoryResource.from_shared_channel(device, channel) + + # Allocating from an imported pool. 
+ try: + mr.allocate(NBYTES) + except Exception as e: + exc_info = type(e), str(e) + queue.put(exc_info) diff --git a/cuda_core/tests/ipc/test_ipc_mempool.py b/cuda_core/tests/ipc/test_ipc_mempool.py new file mode 100644 index 000000000..6b2dc1d7f --- /dev/null +++ b/cuda_core/tests/ipc/test_ipc_mempool.py @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel +from utility import IPCBufferTestHelper +import multiprocessing +import pytest + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 + +def test_ipc_mempool(device, ipc_memory_resource): + """Test IPC with memory pools.""" + # Set up the IPC-enabled memory pool and share it. + mr = ipc_memory_resource + channel = IPCChannel() + mr.share_to_channel(channel) + + # Start the child process. + queue = multiprocessing.Queue() + process = multiprocessing.Process(target=child_main, args=(channel, queue)) + process.start() + + # Allocate and fill memory. + buffer = mr.allocate(NBYTES) + helper = IPCBufferTestHelper(device, buffer, NBYTES) + helper.fill_buffer(flipped=False) + + # Export the buffer via IPC. + handle = buffer.export() + queue.put(handle) + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + # Verify that the buffer was modified. 
+ helper.verify_buffer(flipped=True) + + +def child_main(channel, queue): + device = Device() + device.set_current() + + mr = DeviceMemoryResource.from_shared_channel(device, channel) + handle = queue.get() # Get exported buffer data + buffer = Buffer.import_(mr, handle) + + helper = IPCBufferTestHelper(device, buffer, NBYTES) + helper.verify_buffer(flipped=False) + helper.fill_buffer(flipped=True) diff --git a/cuda_core/tests/ipc/utility.py b/cuda_core/tests/ipc/utility.py new file mode 100644 index 000000000..f778578bd --- /dev/null +++ b/cuda_core/tests/ipc/utility.py @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +try: + from cuda.bindings import driver +except ImportError: + from cuda import cuda as driver + +import ctypes +from cuda.core.experimental import Buffer, MemoryResource +from cuda.core.experimental._utils.cuda_utils import handle_return + +class DummyUnifiedMemoryResource(MemoryResource): + def __init__(self, device): + self.device = device + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) + return Buffer.from_handle(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + handle_return(driver.cuMemFree(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return True + + @property + def device_id(self) -> int: + return self.device + + +class IPCBufferTestHelper: + """A helper for manipulating memory buffers in IPC tests. + + Provides methods to fill a buffer with one of two test patterns and verify + the expected values. 
+ """ + + def __init__(self, device, buffer, nbytes): + self.device = device + self.buffer = buffer + self.nbytes = nbytes + self.scratch_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.nbytes) + self.stream = device.create_stream() + + def fill_buffer(self, flipped=False): + """Fill a device buffer with test pattern using unified memory.""" + ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) + op = (lambda i: 255 - i) if flipped else (lambda i: i) + for i in range(self.nbytes): + ptr[i] = ctypes.c_byte(op(i)) + self.buffer.copy_from(self.scratch_buffer, stream=self.stream) + self.device.sync() + + def verify_buffer(self, flipped=False): + """Verify the buffer contents.""" + self.scratch_buffer.copy_from(self.buffer, stream=self.stream) + self.device.sync() + ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) + op = (lambda i: 255 - i) if flipped else (lambda i: i) + for i in range(self.nbytes): + assert ctypes.c_byte(ptr[i]).value == ctypes.c_byte(op(i)).value, ( + f"Buffer contains incorrect data at index {i}" + ) + diff --git a/cuda_core/tests/test_ipc_mempool.py b/cuda_core/tests/test_ipc_mempool.py deleted file mode 100644 index 5c4c38275..000000000 --- a/cuda_core/tests/test_ipc_mempool.py +++ /dev/null @@ -1,179 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -try: - from cuda.bindings import driver -except ImportError: - from cuda import cuda as driver - -import ctypes -import multiprocessing - -import pytest - -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel, MemoryResource -from cuda.core.experimental._utils.cuda_utils import handle_return - -CHILD_TIMEOUT_SEC = 10 -NBYTES = 64 -POOL_SIZE = 2097152 - - -@pytest.fixture(scope="function") -def ipc_device(): - """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" - # Check if IPC is supported on this platform/device - device = Device() - device.set_current() - - if not device.properties.memory_pools_supported: - pytest.skip("Device does not support mempool operations") - - # Note: Linux specific. Once Windows support for IPC is implemented, this - # test should be updated. - if not device.properties.handle_type_posix_file_descriptor_supported: - pytest.skip("Device does not support IPC") - - return device - - -def test_ipc_mempool(ipc_device): - """Test IPC with memory pools.""" - # Set up the IPC-enabled memory pool and share it. - stream = ipc_device.create_stream() - mr = DeviceMemoryResource(ipc_device, dict(max_size=POOL_SIZE, ipc_enabled=True)) - assert mr.is_ipc_enabled - channel = IPCChannel() - mr.share_to_channel(channel) - - # Start the child process. - queue = multiprocessing.Queue() - process = multiprocessing.Process(target=child_main1, args=(channel, queue)) - process.start() - - # Allocate and fill memory. - buffer = mr.allocate(NBYTES, stream=stream) - protocol = IPCBufferTestProtocol(ipc_device, buffer, stream=stream) - protocol.fill_buffer(flipped=False) - stream.sync() - - # Export the buffer via IPC. - handle = buffer.export() - queue.put(handle) - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - # Verify that the buffer was modified. 
- protocol.verify_buffer(flipped=True) - - -def child_main1(channel, queue): - device = Device() - device.set_current() - stream = device.create_stream() - - mr = DeviceMemoryResource.from_shared_channel(device, channel) - handle = queue.get() # Get exported buffer data - buffer = Buffer.import_(mr, handle) - - protocol = IPCBufferTestProtocol(device, buffer, stream=stream) - protocol.verify_buffer(flipped=False) - protocol.fill_buffer(flipped=True) - stream.sync() - - -def test_shared_pool_errors(ipc_device): - """Test expected errors with allocating from a shared IPC memory pool.""" - # Set up the IPC-enabled memory pool and share it. - mr = DeviceMemoryResource(ipc_device, dict(max_size=POOL_SIZE, ipc_enabled=True)) - channel = IPCChannel() - mr.share_to_channel(channel) - - # Start a child process to generate error info. - queue = multiprocessing.Queue() - process = multiprocessing.Process(target=child_main2, args=(channel, queue)) - process.start() - - # Check the errors. - exc_type, exc_msg = queue.get(timeout=CHILD_TIMEOUT_SEC) - assert exc_type is TypeError - assert exc_msg == "Cannot allocate from shared memory pool imported via IPC" - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - -def child_main2(channel, queue): - """Child process that pushes IPC errors to a shared queue for testing.""" - device = Device() - device.set_current() - - mr = DeviceMemoryResource.from_shared_channel(device, channel) - - # Allocating from an imported pool. 
- try: - mr.allocate(NBYTES) - except Exception as e: - exc_info = type(e), str(e) - queue.put(exc_info) - - -class DummyUnifiedMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) - return Buffer.from_handle(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFree(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return True - - @property - def device_id(self) -> int: - return self.device - - -class IPCBufferTestProtocol: - """The protocol for verifying IPC. - - Provides methods to fill a buffer with one of two test patterns and verify - the expected values. - """ - - def __init__(self, device, buffer, nbytes=NBYTES, stream=None): - self.device = device - self.buffer = buffer - self.nbytes = nbytes - self.stream = stream if stream is not None else device.create_stream() - self.scratch_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.nbytes, stream=self.stream) - - def fill_buffer(self, flipped=False): - """Fill a device buffer with test pattern using unified memory.""" - ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) - op = (lambda i: 255 - i) if flipped else (lambda i: i) - for i in range(self.nbytes): - ptr[i] = ctypes.c_byte(op(i)) - self.buffer.copy_from(self.scratch_buffer, stream=self.stream) - - def verify_buffer(self, flipped=False): - """Verify the buffer contents.""" - self.scratch_buffer.copy_from(self.buffer, stream=self.stream) - self.stream.sync() - ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) - op = (lambda i: 255 - i) if flipped else (lambda i: i) - for i in range(self.nbytes): - assert ctypes.c_byte(ptr[i]).value == 
ctypes.c_byte(op(i)).value, ( - f"Buffer contains incorrect data at index {i}" - ) From 238db00c33ff185e54565799920797929a68d43e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Sep 2025 11:10:54 -0700 Subject: [PATCH 02/25] Simplify the IPC interface, adding create_ipc_channel and import_/export methods. --- cuda_core/cuda/core/experimental/_memory.pyx | 32 +++++++++++++++++++- cuda_core/tests/conftest.py | 2 ++ cuda_core/tests/ipc/test_ipc_mempool.py | 19 ++++-------- 3 files changed, 39 insertions(+), 14 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 41a506a58..fcf40ab7e 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -11,10 +11,12 @@ from cuda.core.experimental._utils.cuda_utils cimport ( ) from dataclasses import dataclass -from typing import TypeVar, Union, TYPE_CHECKING +from typing import Optional, TypeVar, Union, TYPE_CHECKING import abc import array +import collections import cython +import multiprocessing import os import platform import weakref @@ -436,12 +438,34 @@ cdef class IPCChannel: cdef: object _proxy + object _queue + object _mr def __init__(self): if platform.system() == "Linux": self._proxy = IPCChannelUnixSocket._init() else: raise RuntimeError("IPC is not available on {platform.system()}") + self._queue = multiprocessing.Queue() + self._mr = None + + def export(self, buffer: Buffer | collections.abc.Sequence): + if not isinstance(buffer, collections.abc.Sequence): + buffer = [buffer] + + for buf in buffer: + handle = buf.export() + self._queue.put(handle) + + def import_(self, device: Optional[Device] = None): + if self._mr is None: + if device is None: + from cuda.core.experimental._device import Device + device = Device() + self._mr = DeviceMemoryResource.from_shared_channel(device, self) + + handle = self._queue.get() + return Buffer.import_(self._mr, handle) cdef class IPCChannelUnixSocket: @@ 
-658,6 +682,12 @@ class DeviceMemoryResource(MemoryResource): self._mempool_owned = False self._is_imported = False + def create_ipc_channel(self): + """Create an IPC memory channel for sharing allocations.""" + channel = IPCChannel() + self.share_to_channel(channel) + return channel + @classmethod def from_shared_channel(cls, device_id: int | Device, channel: IPCChannel) -> DeviceMemoryResource: """Create a device memory resource from a memory pool shared over an IPC channel.""" diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index c800aae3e..5f42c35c1 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -72,3 +72,5 @@ def pop_all_contexts(): skipif_need_cuda_headers = pytest.mark.skipif(helpers.CUDA_INCLUDE_PATH is None, reason="need CUDA header") + + diff --git a/cuda_core/tests/ipc/test_ipc_mempool.py b/cuda_core/tests/ipc/test_ipc_mempool.py index 6b2dc1d7f..e6a1688b6 100644 --- a/cuda_core/tests/ipc/test_ipc_mempool.py +++ b/cuda_core/tests/ipc/test_ipc_mempool.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel +from cuda.core.experimental import Device from utility import IPCBufferTestHelper import multiprocessing import pytest @@ -13,12 +13,10 @@ def test_ipc_mempool(device, ipc_memory_resource): """Test IPC with memory pools.""" # Set up the IPC-enabled memory pool and share it. mr = ipc_memory_resource - channel = IPCChannel() - mr.share_to_channel(channel) + channel = mr.create_ipc_channel() # Start the child process. - queue = multiprocessing.Queue() - process = multiprocessing.Process(target=child_main, args=(channel, queue)) + process = multiprocessing.Process(target=child_main, args=(channel,)) process.start() # Allocate and fill memory. 
@@ -27,8 +25,7 @@ def test_ipc_mempool(device, ipc_memory_resource): helper.fill_buffer(flipped=False) # Export the buffer via IPC. - handle = buffer.export() - queue.put(handle) + channel.export(buffer) # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) @@ -38,14 +35,10 @@ def test_ipc_mempool(device, ipc_memory_resource): helper.verify_buffer(flipped=True) -def child_main(channel, queue): +def child_main(channel): device = Device() device.set_current() - - mr = DeviceMemoryResource.from_shared_channel(device, channel) - handle = queue.get() # Get exported buffer data - buffer = Buffer.import_(mr, handle) - + buffer = channel.import_() helper = IPCBufferTestHelper(device, buffer, NBYTES) helper.verify_buffer(flipped=False) helper.fill_buffer(flipped=True) From f2ea8c93fd0c215c56e408b5106eefbb32da1fa5 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Sep 2025 11:13:37 -0700 Subject: [PATCH 03/25] Simply the interface to IPCBufferTestHelper. --- cuda_core/tests/ipc/test_ipc_mempool.py | 4 ++-- cuda_core/tests/ipc/utility.py | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cuda_core/tests/ipc/test_ipc_mempool.py b/cuda_core/tests/ipc/test_ipc_mempool.py index e6a1688b6..25379b1ca 100644 --- a/cuda_core/tests/ipc/test_ipc_mempool.py +++ b/cuda_core/tests/ipc/test_ipc_mempool.py @@ -21,7 +21,7 @@ def test_ipc_mempool(device, ipc_memory_resource): # Allocate and fill memory. buffer = mr.allocate(NBYTES) - helper = IPCBufferTestHelper(device, buffer, NBYTES) + helper = IPCBufferTestHelper(device, buffer) helper.fill_buffer(flipped=False) # Export the buffer via IPC. 
@@ -39,6 +39,6 @@ def child_main(channel): device = Device() device.set_current() buffer = channel.import_() - helper = IPCBufferTestHelper(device, buffer, NBYTES) + helper = IPCBufferTestHelper(device, buffer) helper.verify_buffer(flipped=False) helper.fill_buffer(flipped=True) diff --git a/cuda_core/tests/ipc/utility.py b/cuda_core/tests/ipc/utility.py index f778578bd..3ca177dfe 100644 --- a/cuda_core/tests/ipc/utility.py +++ b/cuda_core/tests/ipc/utility.py @@ -41,18 +41,17 @@ class IPCBufferTestHelper: the expected values. """ - def __init__(self, device, buffer, nbytes): + def __init__(self, device, buffer): self.device = device self.buffer = buffer - self.nbytes = nbytes - self.scratch_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.nbytes) + self.scratch_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.buffer.size) self.stream = device.create_stream() def fill_buffer(self, flipped=False): """Fill a device buffer with test pattern using unified memory.""" ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) op = (lambda i: 255 - i) if flipped else (lambda i: i) - for i in range(self.nbytes): + for i in range(self.buffer.size): ptr[i] = ctypes.c_byte(op(i)) self.buffer.copy_from(self.scratch_buffer, stream=self.stream) self.device.sync() @@ -63,7 +62,7 @@ def verify_buffer(self, flipped=False): self.device.sync() ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) op = (lambda i: 255 - i) if flipped else (lambda i: i) - for i in range(self.nbytes): + for i in range(self.buffer.size): assert ctypes.c_byte(ptr[i]).value == ctypes.c_byte(op(i)).value, ( f"Buffer contains incorrect data at index {i}" ) From 827466ead2c17c06b0de0aa4d0f1fcf06fdfa406 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 19 Sep 2025 11:35:40 -0700 Subject: [PATCH 04/25] Adds more tests. 
--- cuda_core/cuda/core/experimental/_memory.pyx | 36 ++++-- .../tests/ipc/test_ipc_mempool_multiple.py | 53 +++++++++ .../ipc/test_ipc_shared_allocation_handle.py | 106 ++++++++++++++++++ cuda_core/tests/ipc/utility.py | 8 +- 4 files changed, 192 insertions(+), 11 deletions(-) create mode 100644 cuda_core/tests/ipc/test_ipc_mempool_multiple.py create mode 100644 cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index fcf40ab7e..b4def295d 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -152,6 +152,18 @@ cdef class Buffer: raise_if_driver_error(err) return Buffer.from_handle(ptr, ipc_buffer.size, mr) + def export_to_channel(self, channel: IPCChannel | Sequence[IPCChannel]): + seq = channel if isinstance(channel, collections.abc.Sequence) else [channel] + for ch in seq: + ch.export(self); + + @classmethod + def import_from_channel(cls, channel: IPCChannel | Sequence[IPCChannel]): + if isinstance(channel, collections.abc.Sequence): + return [ch.import_() for ch in channel] + else: + return channel.import_() + def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. 
@@ -467,6 +479,16 @@ cdef class IPCChannel: handle = self._queue.get() return Buffer.import_(self._mr, handle) + def send_allocation_handle(self, alloc_handle: IPCAllocationHandle): + """Sends over this channel an allocation handle for exporting a + shared memory pool.""" + self._proxy.send_allocation_handle(alloc_handle) + + def receive_allocation_handle(self) -> IPCAllocationHandle: + """Receives over this channel an allocation handle for importing a + shared memory pool.""" + return self._proxy.receive_allocation_handle() + cdef class IPCChannelUnixSocket: """Unix-specific channel for sharing memory pools over sockets.""" @@ -484,7 +506,7 @@ cdef class IPCChannelUnixSocket: self._sock_out, self._sock_in = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) return self - cpdef _send_allocation_handle(self, alloc_handle: IPCAllocationHandle): + cpdef send_allocation_handle(self, alloc_handle: IPCAllocationHandle): """Sends over this channel an allocation handle for exporting a shared memory pool.""" self._sock_out.sendmsg( @@ -492,7 +514,7 @@ cdef class IPCChannelUnixSocket: [(socket.SOL_SOCKET, socket.SCM_RIGHTS, array.array("i", [int(alloc_handle)]))] ) - cpdef IPCAllocationHandle _receive_allocation_handle(self): + cpdef IPCAllocationHandle receive_allocation_handle(self): """Receives over this channel an allocation handle for importing a shared memory pool.""" fds = array.array("i") @@ -692,11 +714,11 @@ class DeviceMemoryResource(MemoryResource): def from_shared_channel(cls, device_id: int | Device, channel: IPCChannel) -> DeviceMemoryResource: """Create a device memory resource from a memory pool shared over an IPC channel.""" device_id = getattr(device_id, 'device_id', device_id) - alloc_handle = channel._proxy._receive_allocation_handle() - return cls._from_allocation_handle(device_id, alloc_handle) + alloc_handle = channel.receive_allocation_handle() + return cls.from_allocation_handle(device_id, alloc_handle) @classmethod - def 
_from_allocation_handle(cls, device_id: int | Device, alloc_handle: IPCAllocationHandle) -> DeviceMemoryResource: + def from_allocation_handle(cls, device_id: int | Device, alloc_handle: IPCAllocationHandle) -> DeviceMemoryResource: """Create a device memory resource from an allocation handle. Construct a new `DeviceMemoryResource` instance that imports a memory @@ -734,9 +756,9 @@ class DeviceMemoryResource(MemoryResource): def share_to_channel(self, channel : IPCChannel): if not self.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") - channel._proxy._send_allocation_handle(self._get_allocation_handle()) + channel.send_allocation_handle(self.get_allocation_handle()) - def _get_allocation_handle(self) -> IPCAllocationHandle: + def get_allocation_handle(self) -> IPCAllocationHandle: """Export the memory pool handle to be shared (requires IPC). The handle can be used to share the memory pool with other processes. diff --git a/cuda_core/tests/ipc/test_ipc_mempool_multiple.py b/cuda_core/tests/ipc/test_ipc_mempool_multiple.py new file mode 100644 index 000000000..792966b95 --- /dev/null +++ b/cuda_core/tests/ipc/test_ipc_mempool_multiple.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import Device +from utility import IPCBufferTestHelper +import multiprocessing +import pytest + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 + + +def test_ipc_mempool_multiple(device, ipc_memory_resource): + """Test IPC with memory pools using multiple processes.""" + # Construct an IPC-enabled memory resource and share it over two channels. + mr = ipc_memory_resource + ch1, ch2 = (mr.create_ipc_channel() for _ in range(2)) + + # Allocate memory buffers and export them to each channel. 
+ buffer1 = mr.allocate(NBYTES) + ch1.export(buffer1) + ch2.export(buffer1) + buffer2 = mr.allocate(NBYTES) + ch1.export(buffer2) + ch2.export(buffer2) + + # Start the child processes. + p1 = multiprocessing.Process(target=child_main, args=(1, ch1)) + p2 = multiprocessing.Process(target=child_main, args=(2, ch2)) + p1.start() + p2.start() + + # Wait for the child processes. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify that the buffers were modified. + IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) + IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) + + +def child_main(idx, channel): + device = Device() + device.set_current() + buffer1 = channel.import_() # implicitly set up the shared memory pool + buffer2 = channel.import_() + if idx == 1: + IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=False) + elif idx == 2: + IPCBufferTestHelper(device, buffer2).fill_buffer(flipped=True) + diff --git a/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py b/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py new file mode 100644 index 000000000..323254150 --- /dev/null +++ b/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel +from utility import IPCBufferTestHelper +import multiprocessing +import pytest +from itertools import cycle + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 +NWORKERS = 2 +NTASKS = 2 + +def test_ipc_shared_allocation_handle(device, ipc_memory_resource): + """Demonstrate that a memory pool allocation handle can be reused for IPC + with multiple processes.""" + # Set up communication. 
+ ch1 = IPCChannel() + ch2 = IPCChannel() + q1 = multiprocessing.Queue() + q2 = multiprocessing.Queue() + + # Start children. + p1 = multiprocessing.Process(target=child_main, args=(1, ch1, q1)) + p2 = multiprocessing.Process(target=child_main, args=(2, ch2, q2)) + p1.start() + p2.start() + + # Set up the IPC-enabled memory pool and share it using one handle. + mr = ipc_memory_resource + alloc_handle = mr.get_allocation_handle() + ch1.send_allocation_handle(alloc_handle) + ch2.send_allocation_handle(alloc_handle) + + # Allocate a share memory. + buf1 = mr.allocate(NBYTES) + buf2 = mr.allocate(NBYTES) + q1.put(buf1.export()) + q2.put(buf2.export()) + + # Wait for children. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify results. + IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) + IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) + + +def child_main(idx, channel, queue): + """Fills a shared memory buffer.""" + device = Device() + device.set_current() + alloc_handle = channel.receive_allocation_handle() + mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + buffer_descriptor = queue.get() + buffer = Buffer.import_(mr, buffer_descriptor) + IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) + + +def test_ipc_shared_allocation_handle2(device, ipc_memory_resource): + """Demonstrate that a memory pool allocation handle can be reused for IPC + with multiple processes (simplified).""" + # Set up communication. + ch1 = IPCChannel() + ch2 = IPCChannel() + + # Start children. + p1 = multiprocessing.Process(target=child_main2, args=(1, ch1)) + p2 = multiprocessing.Process(target=child_main2, args=(2, ch2)) + p1.start() + p2.start() + + # Set up the IPC-enabled memory pool and share it using one handle. 
+ mr = ipc_memory_resource + alloc_handle = mr.get_allocation_handle() + ch1.send_allocation_handle(alloc_handle) + ch2.send_allocation_handle(alloc_handle) + + # Allocate a share memory. + buf1 = mr.allocate(NBYTES) + buf2 = mr.allocate(NBYTES) + ch1.export(buf1) + ch2.export(buf2) + + # Wait for children. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify results. + IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) + IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) + + +def child_main2(idx, channel): + """Fills a shared memory buffer.""" + device = Device() + device.set_current() + buffer = channel.import_() + IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) + diff --git a/cuda_core/tests/ipc/utility.py b/cuda_core/tests/ipc/utility.py index 3ca177dfe..781790a9d 100644 --- a/cuda_core/tests/ipc/utility.py +++ b/cuda_core/tests/ipc/utility.py @@ -47,23 +47,23 @@ def __init__(self, device, buffer): self.scratch_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.buffer.size) self.stream = device.create_stream() - def fill_buffer(self, flipped=False): + def fill_buffer(self, flipped=False, starting_from=0): """Fill a device buffer with test pattern using unified memory.""" ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) op = (lambda i: 255 - i) if flipped else (lambda i: i) for i in range(self.buffer.size): - ptr[i] = ctypes.c_byte(op(i)) + ptr[i] = ctypes.c_byte(op(starting_from + i)) self.buffer.copy_from(self.scratch_buffer, stream=self.stream) self.device.sync() - def verify_buffer(self, flipped=False): + def verify_buffer(self, flipped=False, starting_from=0): """Verify the buffer contents.""" self.scratch_buffer.copy_from(self.buffer, stream=self.stream) self.device.sync() ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) op = (lambda i: 255 - i) if 
flipped else (lambda i: i) for i in range(self.buffer.size): - assert ctypes.c_byte(ptr[i]).value == ctypes.c_byte(op(i)).value, ( + assert ctypes.c_byte(ptr[i]).value == ctypes.c_byte(op(starting_from + i)).value, ( f"Buffer contains incorrect data at index {i}" ) From 93d921738792632180612ac93948e2b9b9994303 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 19 Sep 2025 14:17:00 -0700 Subject: [PATCH 05/25] Removes sequence forms of certain function (exception behavior was unclear). Added a test for an error case. --- cuda_core/cuda/core/experimental/_memory.pyx | 24 ++++++-------------- cuda_core/tests/ipc/test_ipc_errors.py | 17 +++++++++++++- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index b4def295d..fd752b84f 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -14,7 +14,6 @@ from dataclasses import dataclass from typing import Optional, TypeVar, Union, TYPE_CHECKING import abc import array -import collections import cython import multiprocessing import os @@ -152,17 +151,12 @@ cdef class Buffer: raise_if_driver_error(err) return Buffer.from_handle(ptr, ipc_buffer.size, mr) - def export_to_channel(self, channel: IPCChannel | Sequence[IPCChannel]): - seq = channel if isinstance(channel, collections.abc.Sequence) else [channel] - for ch in seq: - ch.export(self); + def export_to_channel(self, channel: IPCChannel): + channel.export(self); @classmethod - def import_from_channel(cls, channel: IPCChannel | Sequence[IPCChannel]): - if isinstance(channel, collections.abc.Sequence): - return [ch.import_() for ch in channel] - else: - return channel.import_() + def import_from_channel(cls, channel: IPCChannel): + return channel.import_() def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. 
@@ -461,13 +455,9 @@ cdef class IPCChannel: self._queue = multiprocessing.Queue() self._mr = None - def export(self, buffer: Buffer | collections.abc.Sequence): - if not isinstance(buffer, collections.abc.Sequence): - buffer = [buffer] - - for buf in buffer: - handle = buf.export() - self._queue.put(handle) + def export(self, buffer: Buffer): + handle = buffer.export() + self._queue.put(handle) def import_(self, device: Optional[Device] = None): if self._mr is None: diff --git a/cuda_core/tests/ipc/test_ipc_errors.py b/cuda_core/tests/ipc/test_ipc_errors.py index 4ac3277e6..08cd41baf 100644 --- a/cuda_core/tests/ipc/test_ipc_errors.py +++ b/cuda_core/tests/ipc/test_ipc_errors.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from cuda.core.experimental._utils.cuda_utils import CUDAError from cuda.core.experimental import Device, DeviceMemoryResource, IPCChannel import multiprocessing import pytest @@ -8,7 +9,21 @@ CHILD_TIMEOUT_SEC = 10 NBYTES = 64 -def test_ipc_errors(device, ipc_memory_resource): +def test_share_to_wrong_channel(device, ipc_memory_resource): + mr1 = ipc_memory_resource + mr2 = DeviceMemoryResource(device, dict(ipc_enabled=True)) + + channel1 = mr1.create_ipc_channel() + buffer1 = mr1.allocate(NBYTES) + buffer2 = mr2.allocate(NBYTES) + + channel1.export(buffer1) # ok + + with pytest.raises(CUDAError): + channel1.export(buffer2) + + +def test_ipc_child_errors(device, ipc_memory_resource): """Test expected errors with allocating from a shared IPC memory pool.""" mr = ipc_memory_resource # Set up the IPC-enabled memory pool and share it. From 476349ee68e087ad729ce8c47940bf57101346ed Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 19 Sep 2025 14:21:04 -0700 Subject: [PATCH 06/25] Changes channel methods export/import_ to send_buffer/receive_buffer, for clarity. 
--- cuda_core/cuda/core/experimental/_memory.pyx | 4 ++-- cuda_core/tests/ipc/test_ipc_errors.py | 4 ++-- cuda_core/tests/ipc/test_ipc_mempool.py | 4 ++-- cuda_core/tests/ipc/test_ipc_mempool_multiple.py | 12 ++++++------ .../tests/ipc/test_ipc_shared_allocation_handle.py | 6 +++--- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index fd752b84f..c628aaf45 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -455,11 +455,11 @@ cdef class IPCChannel: self._queue = multiprocessing.Queue() self._mr = None - def export(self, buffer: Buffer): + def send_buffer(self, buffer: Buffer): handle = buffer.export() self._queue.put(handle) - def import_(self, device: Optional[Device] = None): + def receive_buffer(self, device: Optional[Device] = None): if self._mr is None: if device is None: from cuda.core.experimental._device import Device diff --git a/cuda_core/tests/ipc/test_ipc_errors.py b/cuda_core/tests/ipc/test_ipc_errors.py index 08cd41baf..710c87ffb 100644 --- a/cuda_core/tests/ipc/test_ipc_errors.py +++ b/cuda_core/tests/ipc/test_ipc_errors.py @@ -17,10 +17,10 @@ def test_share_to_wrong_channel(device, ipc_memory_resource): buffer1 = mr1.allocate(NBYTES) buffer2 = mr2.allocate(NBYTES) - channel1.export(buffer1) # ok + channel1.send_buffer(buffer1) # ok with pytest.raises(CUDAError): - channel1.export(buffer2) + channel1.send_buffer(buffer2) def test_ipc_child_errors(device, ipc_memory_resource): diff --git a/cuda_core/tests/ipc/test_ipc_mempool.py b/cuda_core/tests/ipc/test_ipc_mempool.py index 25379b1ca..582cbc823 100644 --- a/cuda_core/tests/ipc/test_ipc_mempool.py +++ b/cuda_core/tests/ipc/test_ipc_mempool.py @@ -25,7 +25,7 @@ def test_ipc_mempool(device, ipc_memory_resource): helper.fill_buffer(flipped=False) # Export the buffer via IPC. 
- channel.export(buffer) + channel.send_buffer(buffer) # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) @@ -38,7 +38,7 @@ def test_ipc_mempool(device, ipc_memory_resource): def child_main(channel): device = Device() device.set_current() - buffer = channel.import_() + buffer = channel.receive_buffer() helper = IPCBufferTestHelper(device, buffer) helper.verify_buffer(flipped=False) helper.fill_buffer(flipped=True) diff --git a/cuda_core/tests/ipc/test_ipc_mempool_multiple.py b/cuda_core/tests/ipc/test_ipc_mempool_multiple.py index 792966b95..5edcb6f3a 100644 --- a/cuda_core/tests/ipc/test_ipc_mempool_multiple.py +++ b/cuda_core/tests/ipc/test_ipc_mempool_multiple.py @@ -18,11 +18,11 @@ def test_ipc_mempool_multiple(device, ipc_memory_resource): # Allocate memory buffers and export them to each channel. buffer1 = mr.allocate(NBYTES) - ch1.export(buffer1) - ch2.export(buffer1) + ch1.send_buffer(buffer1) + ch2.send_buffer(buffer1) buffer2 = mr.allocate(NBYTES) - ch1.export(buffer2) - ch2.export(buffer2) + ch1.send_buffer(buffer2) + ch2.send_buffer(buffer2) # Start the child processes. 
p1 = multiprocessing.Process(target=child_main, args=(1, ch1)) @@ -44,8 +44,8 @@ def test_ipc_mempool_multiple(device, ipc_memory_resource): def child_main(idx, channel): device = Device() device.set_current() - buffer1 = channel.import_() # implicitly set up the shared memory pool - buffer2 = channel.import_() + buffer1 = channel.receive_buffer() # implicitly set up the shared memory pool + buffer2 = channel.receive_buffer() if idx == 1: IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=False) elif idx == 2: diff --git a/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py b/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py index 323254150..644052b24 100644 --- a/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py +++ b/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py @@ -83,8 +83,8 @@ def test_ipc_shared_allocation_handle2(device, ipc_memory_resource): # Allocate a share memory. buf1 = mr.allocate(NBYTES) buf2 = mr.allocate(NBYTES) - ch1.export(buf1) - ch2.export(buf2) + ch1.send_buffer(buf1) + ch2.send_buffer(buf2) # Wait for children. p1.join(timeout=CHILD_TIMEOUT_SEC) @@ -101,6 +101,6 @@ def child_main2(idx, channel): """Fills a shared memory buffer.""" device = Device() device.set_current() - buffer = channel.import_() + buffer = channel.receive_buffer() IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) From 2ed5be7bd926a0d3bf692183c391f4af03b276bf Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 23 Sep 2025 11:29:25 -0700 Subject: [PATCH 07/25] Implement serialization methods for Device, Buffer, and DeviceMemoryResource. Add tests for buffer IPC through serialization. 
--- cuda_core/cuda/core/experimental/_device.py | 10 ++ cuda_core/cuda/core/experimental/_memory.pyx | 43 +++++- cuda_core/tests/conftest.py | 2 - cuda_core/tests/ipc/test_ipc_errors.py | 60 --------- cuda_core/tests/ipc/test_ipc_mempool.py | 44 ------ .../tests/ipc/test_ipc_mempool_multiple.py | 53 -------- .../tests/{ipc => memory_ipc}/conftest.py | 4 +- .../test_channel.py} | 99 ++++++++++++-- cuda_core/tests/memory_ipc/test_errors.py | 81 +++++++++++ .../tests/memory_ipc/test_send_buffers.py | 70 ++++++++++ cuda_core/tests/memory_ipc/test_serialize.py | 127 ++++++++++++++++++ cuda_core/tests/memory_ipc/test_workerpool.py | 48 +++++++ .../tests/{ipc => memory_ipc}/utility.py | 3 +- cuda_core/tests/test_memory.py | 2 +- 14 files changed, 469 insertions(+), 177 deletions(-) delete mode 100644 cuda_core/tests/ipc/test_ipc_errors.py delete mode 100644 cuda_core/tests/ipc/test_ipc_mempool.py delete mode 100644 cuda_core/tests/ipc/test_ipc_mempool_multiple.py rename cuda_core/tests/{ipc => memory_ipc}/conftest.py (100%) rename cuda_core/tests/{ipc/test_ipc_shared_allocation_handle.py => memory_ipc/test_channel.py} (54%) create mode 100644 cuda_core/tests/memory_ipc/test_errors.py create mode 100644 cuda_core/tests/memory_ipc/test_send_buffers.py create mode 100644 cuda_core/tests/memory_ipc/test_serialize.py create mode 100644 cuda_core/tests/memory_ipc/test_workerpool.py rename cuda_core/tests/{ipc => memory_ipc}/utility.py (99%) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 0499baa58..d94e44b5b 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -1160,6 +1160,16 @@ def __int__(self): def __repr__(self): return f"" + def __reduce__(self): + return Device._reconstruct, (self.device_id,) + + @staticmethod + def _reconstruct(device_id): + device = Device(device_id) + if not device._has_inited: + device.set_current() + return device + def 
set_current(self, ctx: Context = None) -> Union[Context, None]: """Set device to be used for GPU executions. diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index c628aaf45..7f5b9e54d 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -16,8 +16,10 @@ import abc import array import cython import multiprocessing +import multiprocessing.reduction import os import platform +import sys import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream @@ -28,7 +30,7 @@ if platform.system() == "Linux": if TYPE_CHECKING: import cuda.bindings.driver - from cuda.core.experimental._device import Device + from ._device import Device # TODO: define a memory property mixin class and make Buffer and # MemoryResource both inherit from it @@ -72,6 +74,13 @@ cdef class Buffer: def __del__(self): self.close() + def __reduce__(self): + return Buffer._reconstruct, (self.memory_resource, self.export()) + + @staticmethod + def _reconstruct(mr, desc): + return Buffer.import_(mr, desc) + cpdef close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. 
@@ -427,6 +436,15 @@ cdef class IPCAllocationHandle: """Close the handle.""" self.close() + def __reduce__(self): + df = multiprocessing.reduction.DupFd(self.handle) + return IPCAllocationHandle._reconstruct, (df,) + + @staticmethod + def _reconstruct(df): + self = IPCAllocationHandle._init(df.detach()) + return self + def __int__(self) -> int: if self._handle < 0: raise ValueError( @@ -434,6 +452,11 @@ cdef class IPCAllocationHandle: ) return self._handle + def detach(self): + handle = self._handle + self._handle = -1 + return handle + @property def handle(self) -> int: return self._handle @@ -462,7 +485,7 @@ cdef class IPCChannel: def receive_buffer(self, device: Optional[Device] = None): if self._mr is None: if device is None: - from cuda.core.experimental._device import Device + from ._device import Device device = Device() self._mr = DeviceMemoryResource.from_shared_channel(device, self) @@ -678,6 +701,9 @@ class DeviceMemoryResource(MemoryResource): err, self._mempool_handle = driver.cuMemPoolCreate(properties) raise_if_driver_error(err) + if opts.ipc_enabled: + self.get_allocation_handle() # enables Buffer.export + def __del__(self): self.close() @@ -694,6 +720,18 @@ class DeviceMemoryResource(MemoryResource): self._mempool_owned = False self._is_imported = False + def __reduce__(self): + from ._device import Device + device = Device(self.device_id) + alloc_handle = self.get_allocation_handle() + df = multiprocessing.reduction.DupFd(alloc_handle.detach()) + return DeviceMemoryResource._reconstruct, (device, df) + + @staticmethod + def _reconstruct(device, df): + alloc_handle = IPCAllocationHandle._init(df.detach()) + return DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + def create_ipc_channel(self): """Create an IPC memory channel for sharing allocations.""" channel = IPCChannel() @@ -740,7 +778,6 @@ class DeviceMemoryResource(MemoryResource): err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), 
_IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) - return self def share_to_channel(self, channel : IPCChannel): diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 5f42c35c1..c800aae3e 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -72,5 +72,3 @@ def pop_all_contexts(): skipif_need_cuda_headers = pytest.mark.skipif(helpers.CUDA_INCLUDE_PATH is None, reason="need CUDA header") - - diff --git a/cuda_core/tests/ipc/test_ipc_errors.py b/cuda_core/tests/ipc/test_ipc_errors.py deleted file mode 100644 index 710c87ffb..000000000 --- a/cuda_core/tests/ipc/test_ipc_errors.py +++ /dev/null @@ -1,60 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from cuda.core.experimental._utils.cuda_utils import CUDAError -from cuda.core.experimental import Device, DeviceMemoryResource, IPCChannel -import multiprocessing -import pytest - -CHILD_TIMEOUT_SEC = 10 -NBYTES = 64 - -def test_share_to_wrong_channel(device, ipc_memory_resource): - mr1 = ipc_memory_resource - mr2 = DeviceMemoryResource(device, dict(ipc_enabled=True)) - - channel1 = mr1.create_ipc_channel() - buffer1 = mr1.allocate(NBYTES) - buffer2 = mr2.allocate(NBYTES) - - channel1.send_buffer(buffer1) # ok - - with pytest.raises(CUDAError): - channel1.send_buffer(buffer2) - - -def test_ipc_child_errors(device, ipc_memory_resource): - """Test expected errors with allocating from a shared IPC memory pool.""" - mr = ipc_memory_resource - # Set up the IPC-enabled memory pool and share it. - channel = IPCChannel() - mr.share_to_channel(channel) - - # Start a child process to generate error info. - queue = multiprocessing.Queue() - process = multiprocessing.Process(target=child_main, args=(channel, queue)) - process.start() - - # Check the errors. 
- exc_type, exc_msg = queue.get(timeout=CHILD_TIMEOUT_SEC) - assert exc_type is TypeError - assert exc_msg == "Cannot allocate from shared memory pool imported via IPC" - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - -def child_main(channel, queue): - """Child process that pushes IPC errors to a shared queue for testing.""" - device = Device() - device.set_current() - - mr = DeviceMemoryResource.from_shared_channel(device, channel) - - # Allocating from an imported pool. - try: - mr.allocate(NBYTES) - except Exception as e: - exc_info = type(e), str(e) - queue.put(exc_info) diff --git a/cuda_core/tests/ipc/test_ipc_mempool.py b/cuda_core/tests/ipc/test_ipc_mempool.py deleted file mode 100644 index 582cbc823..000000000 --- a/cuda_core/tests/ipc/test_ipc_mempool.py +++ /dev/null @@ -1,44 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from cuda.core.experimental import Device -from utility import IPCBufferTestHelper -import multiprocessing -import pytest - -CHILD_TIMEOUT_SEC = 10 -NBYTES = 64 - -def test_ipc_mempool(device, ipc_memory_resource): - """Test IPC with memory pools.""" - # Set up the IPC-enabled memory pool and share it. - mr = ipc_memory_resource - channel = mr.create_ipc_channel() - - # Start the child process. - process = multiprocessing.Process(target=child_main, args=(channel,)) - process.start() - - # Allocate and fill memory. - buffer = mr.allocate(NBYTES) - helper = IPCBufferTestHelper(device, buffer) - helper.fill_buffer(flipped=False) - - # Export the buffer via IPC. - channel.send_buffer(buffer) - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - # Verify that the buffer was modified. 
- helper.verify_buffer(flipped=True) - - -def child_main(channel): - device = Device() - device.set_current() - buffer = channel.receive_buffer() - helper = IPCBufferTestHelper(device, buffer) - helper.verify_buffer(flipped=False) - helper.fill_buffer(flipped=True) diff --git a/cuda_core/tests/ipc/test_ipc_mempool_multiple.py b/cuda_core/tests/ipc/test_ipc_mempool_multiple.py deleted file mode 100644 index 5edcb6f3a..000000000 --- a/cuda_core/tests/ipc/test_ipc_mempool_multiple.py +++ /dev/null @@ -1,53 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from cuda.core.experimental import Device -from utility import IPCBufferTestHelper -import multiprocessing -import pytest - -CHILD_TIMEOUT_SEC = 10 -NBYTES = 64 - - -def test_ipc_mempool_multiple(device, ipc_memory_resource): - """Test IPC with memory pools using multiple processes.""" - # Construct an IPC-enabled memory resource and share it over two channels. - mr = ipc_memory_resource - ch1, ch2 = (mr.create_ipc_channel() for _ in range(2)) - - # Allocate memory buffers and export them to each channel. - buffer1 = mr.allocate(NBYTES) - ch1.send_buffer(buffer1) - ch2.send_buffer(buffer1) - buffer2 = mr.allocate(NBYTES) - ch1.send_buffer(buffer2) - ch2.send_buffer(buffer2) - - # Start the child processes. - p1 = multiprocessing.Process(target=child_main, args=(1, ch1)) - p2 = multiprocessing.Process(target=child_main, args=(2, ch2)) - p1.start() - p2.start() - - # Wait for the child processes. - p1.join(timeout=CHILD_TIMEOUT_SEC) - p2.join(timeout=CHILD_TIMEOUT_SEC) - assert p1.exitcode == 0 - assert p2.exitcode == 0 - - # Verify that the buffers were modified. 
- IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) - IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) - - -def child_main(idx, channel): - device = Device() - device.set_current() - buffer1 = channel.receive_buffer() # implicitly set up the shared memory pool - buffer2 = channel.receive_buffer() - if idx == 1: - IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=False) - elif idx == 2: - IPCBufferTestHelper(device, buffer2).fill_buffer(flipped=True) - diff --git a/cuda_core/tests/ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py similarity index 100% rename from cuda_core/tests/ipc/conftest.py rename to cuda_core/tests/memory_ipc/conftest.py index 2ac6d858b..39f787eb0 100644 --- a/cuda_core/tests/ipc/conftest.py +++ b/cuda_core/tests/memory_ipc/conftest.py @@ -2,10 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 import pytest + from cuda.core.experimental import Device, DeviceMemoryResource POOL_SIZE = 2097152 + @pytest.fixture(scope="function") def device(): """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" @@ -29,5 +31,3 @@ def ipc_memory_resource(device): mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) assert mr.is_ipc_enabled return mr - - diff --git a/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py b/cuda_core/tests/memory_ipc/test_channel.py similarity index 54% rename from cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py rename to cuda_core/tests/memory_ipc/test_channel.py index 644052b24..c118bc122 100644 --- a/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py +++ b/cuda_core/tests/memory_ipc/test_channel.py @@ -1,17 +1,95 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel -from utility import IPCBufferTestHelper import multiprocessing -import pytest -from itertools import cycle + +from utility import IPCBufferTestHelper + +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel CHILD_TIMEOUT_SEC = 10 NBYTES = 64 NWORKERS = 2 NTASKS = 2 + +def test_ipc_mempool(device, ipc_memory_resource): + """Test IPC with memory pools.""" + # Set up the IPC-enabled memory pool and share it. + mr = ipc_memory_resource + channel = mr.create_ipc_channel() + + # Start the child process. + process = multiprocessing.Process(target=child_main1, args=(channel,)) + process.start() + + # Allocate and fill memory. + buffer = mr.allocate(NBYTES) + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=False) + + # Export the buffer via IPC. + channel.send_buffer(buffer) + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + # Verify that the buffer was modified. + helper.verify_buffer(flipped=True) + + +def child_main1(channel): + device = Device() + device.set_current() + buffer = channel.receive_buffer() + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=False) + helper.fill_buffer(flipped=True) + + +def test_ipc_mempool_multiple(device, ipc_memory_resource): + """Test IPC with memory pools using multiple processes.""" + # Construct an IPC-enabled memory resource and share it over two channels. + mr = ipc_memory_resource + ch1, ch2 = (mr.create_ipc_channel() for _ in range(2)) + + # Allocate memory buffers and export them to each channel. + buffer1 = mr.allocate(NBYTES) + ch1.send_buffer(buffer1) + ch2.send_buffer(buffer1) + buffer2 = mr.allocate(NBYTES) + ch1.send_buffer(buffer2) + ch2.send_buffer(buffer2) + + # Start the child processes. 
+ p1 = multiprocessing.Process(target=child_main2, args=(1, ch1)) + p2 = multiprocessing.Process(target=child_main2, args=(2, ch2)) + p1.start() + p2.start() + + # Wait for the child processes. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify that the buffers were modified. + IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) + IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) + + +def child_main2(idx, channel): + device = Device() + device.set_current() + buffer1 = channel.receive_buffer() # implicitly set up the shared memory pool + buffer2 = channel.receive_buffer() + if idx == 1: + IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=False) + elif idx == 2: + IPCBufferTestHelper(device, buffer2).fill_buffer(flipped=True) + + def test_ipc_shared_allocation_handle(device, ipc_memory_resource): """Demonstrate that a memory pool allocation handle can be reused for IPC with multiple processes.""" @@ -22,8 +100,8 @@ def test_ipc_shared_allocation_handle(device, ipc_memory_resource): q2 = multiprocessing.Queue() # Start children. - p1 = multiprocessing.Process(target=child_main, args=(1, ch1, q1)) - p2 = multiprocessing.Process(target=child_main, args=(2, ch2, q2)) + p1 = multiprocessing.Process(target=child_main3, args=(1, ch1, q1)) + p2 = multiprocessing.Process(target=child_main3, args=(2, ch2, q2)) p1.start() p2.start() @@ -50,7 +128,7 @@ def test_ipc_shared_allocation_handle(device, ipc_memory_resource): IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) -def child_main(idx, channel, queue): +def child_main3(idx, channel, queue): """Fills a shared memory buffer.""" device = Device() device.set_current() @@ -69,8 +147,8 @@ def test_ipc_shared_allocation_handle2(device, ipc_memory_resource): ch2 = IPCChannel() # Start children. 
- p1 = multiprocessing.Process(target=child_main2, args=(1, ch1)) - p2 = multiprocessing.Process(target=child_main2, args=(2, ch2)) + p1 = multiprocessing.Process(target=child_main4, args=(1, ch1)) + p2 = multiprocessing.Process(target=child_main4, args=(2, ch2)) p1.start() p2.start() @@ -97,10 +175,9 @@ def test_ipc_shared_allocation_handle2(device, ipc_memory_resource): IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) -def child_main2(idx, channel): +def child_main4(idx, channel): """Fills a shared memory buffer.""" device = Device() device.set_current() buffer = channel.receive_buffer() IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) - diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py new file mode 100644 index 000000000..b8bb9bd4b --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import multiprocessing + +from cuda.core.experimental import Buffer, DeviceMemoryResource +from cuda.core.experimental._utils.cuda_utils import CUDAError + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 +POOL_SIZE = 2097152 + + +class ChildErrorHarness: + """Test harness for checking errors in child processes. Subclasses override + PARENT_ACTION, CHILD_ACTION, and ASSERT (see below for examples).""" + + def test_main(self, device, ipc_memory_resource): + """Parent process that checks child errors.""" + # Attach fixtures to this object for convenience. These can be accessed + # from PARENT_ACTION. + self.device = device + self.mr = ipc_memory_resource + + # Start a child process to generate error info. + pipe = [multiprocessing.Queue() for _ in range(2)] + process = multiprocessing.Process(target=self.child_main, args=(pipe,)) + process.start() + + # Interact. + self.PARENT_ACTION(pipe[0]) + + # Check the error. 
+ exc_type, exc_msg = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) + self.ASSERT(exc_type, exc_msg) + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + def child_main(self, pipe): + """Child process that pushes IPC errors to a shared pipe for testing.""" + try: + self.CHILD_ACTION(pipe[0]) + except Exception as e: + exc_info = type(e), str(e) + else: + exc_info = None, None + pipe[1].put(exc_info) + + +class TestAllocFromImportedMr(ChildErrorHarness): + """Error when attempting to allocate from an import memory resource.""" + + def PARENT_ACTION(self, queue): + queue.put(self.mr) + + def CHILD_ACTION(self, queue): + mr = queue.get() + mr.allocate(NBYTES) + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is TypeError + assert exc_msg == "Cannot allocate from shared memory pool imported via IPC" + + +class TestImportWrongMR(ChildErrorHarness): + """Error when importing a buffer from the wrong memory resource.""" + + def PARENT_ACTION(self, queue): + mr2 = DeviceMemoryResource(self.device, dict(max_size=POOL_SIZE, ipc_enabled=True)) + buffer = mr2.allocate(NBYTES) + queue.put([self.mr, buffer.export()]) # Note: mr does not own this buffer + + def CHILD_ACTION(self, queue): + mr, buffer_desc = queue.get() + Buffer.import_(mr, buffer_desc) + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is CUDAError + assert "CUDA_ERROR_INVALID_VALUE" in exc_msg diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py new file mode 100644 index 000000000..e835e53f3 --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import Device, DeviceMemoryResource +from itertools import cycle +from utility import IPCBufferTestHelper +import multiprocessing + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 +NMRS = 3 +NTASKS = 7 +POOL_SIZE = 2097152 + +def test_ipc_send_buffers(device, ipc_memory_resource): + """Test passing buffers directly to a child separately from a memory resource.""" + mr = ipc_memory_resource + + # Allocate and fill memory. + buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] + for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=False) + + # Start the child process. Send the buffer directly. + process = multiprocessing.Process(target=child_main, args=(buffers,)) + process.start() + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + # Verify that the buffers were modified. + for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=True) + +def test_ipc_send_buffers_multi(device, ipc_memory_resource): + """Test passing buffers sourced from multiple memory resources.""" + # Set up several IPC-enabled memory pools. + mrs = [ipc_memory_resource] + [ + DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) + ] + + # Allocate and fill memory. + buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] + for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=False) + + # Start the child process. + process = multiprocessing.Process(target=child_main, args=(buffers,)) + process.start() + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + # Verify that the buffers were modified. 
+ for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=True) + + +def child_main(buffers): + device = Device() + for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=False) + helper.fill_buffer(flipped=True) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py new file mode 100644 index 000000000..f12489dea --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -0,0 +1,127 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import multiprocessing + +import pytest +from utility import IPCBufferTestHelper + +from cuda.core.experimental import Buffer, DeviceMemoryResource + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 +POOL_SIZE = 2097152 + + +class TestObjectSerialization: + @pytest.mark.parametrize("use_alloc_handle", [True, False]) + def test_main(self, use_alloc_handle, device, ipc_memory_resource): + """Test sending IPC memory objects to a child through a queue.""" + mr = ipc_memory_resource + + # Start the child process. + pipe = [multiprocessing.Queue() for _ in range(2)] + process = multiprocessing.Process(target=self.child_main, args=(pipe, use_alloc_handle)) + process.start() + + # Send a device description. + pipe[0].put(device) + device_id = pipe[1].get() + assert device_id == device.device_id + + # Send a memory resource directly or by allocation handle. + # Note: there is no apparent way to check the ID between processes. + if use_alloc_handle: + # Send MR by a handle. + alloc_handle = mr.get_allocation_handle() + pipe[0].put(alloc_handle) + else: + # Send MR directly. + pipe[0].put(mr) + + # Send a buffer. 
+ buffer = mr.allocate(NBYTES) + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=False) + pipe[0].put(buffer) + pipe[1].get() # signal done + helper.verify_buffer(flipped=True) + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + def child_main(self, pipe, use_alloc_handle): + # Device. + device = pipe[0].get() + pipe[1].put(device.device_id) + + # Memory resource. + if use_alloc_handle: + alloc_handle = pipe[0].get() + mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + else: + mr = pipe[0].get() + + # Buffer. + buffer = pipe[0].get() + assert buffer.memory_resource.handle == mr.handle + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=False) + helper.fill_buffer(flipped=True) + pipe[1].put(None) + + +def test_object_passing(device, ipc_memory_resource): + """Test sending objects as arguments when starting a process.""" + # Define the objects. + mr = ipc_memory_resource + alloc_handle = mr.get_allocation_handle() + buffer = mr.allocate(NBYTES) + buffer_desc = buffer.export() + + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=False) + + # Start the child process. + process = multiprocessing.Process(target=child_main, args=(device, alloc_handle, mr, buffer_desc, buffer)) + process.start() + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + helper.verify_buffer(flipped=True) + + +def child_main(device, alloc_handle, mr1, buffer_desc, buffer1): + mr2 = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + + # OK to build the buffer from either mr and descriptor. + # These all point to the same buffer. 
+ buffer2 = Buffer.import_(mr1, buffer_desc) + buffer3 = Buffer.import_(mr2, buffer_desc) + + helper1 = IPCBufferTestHelper(device, buffer1) + helper2 = IPCBufferTestHelper(device, buffer2) + helper3 = IPCBufferTestHelper(device, buffer3) + + helper1.verify_buffer(flipped=False) + helper2.verify_buffer(flipped=False) + helper3.verify_buffer(flipped=False) + + helper1.fill_buffer(flipped=True) + + helper1.verify_buffer(flipped=True) + helper2.verify_buffer(flipped=True) + helper3.verify_buffer(flipped=True) + + helper2.fill_buffer(flipped=False) + + helper1.verify_buffer(flipped=False) + helper2.verify_buffer(flipped=False) + helper3.verify_buffer(flipped=False) + + helper3.fill_buffer(flipped=True) + + helper1.verify_buffer(flipped=True) + helper2.verify_buffer(flipped=True) + helper3.verify_buffer(flipped=True) diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py new file mode 100644 index 000000000..2dc29da8b --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import multiprocessing +from itertools import cycle + +from utility import IPCBufferTestHelper + +from cuda.core.experimental import Device, DeviceMemoryResource + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 +NWORKERS = 2 +NMRS = 3 +NTASKS = 20 +POOL_SIZE = 2097152 + + +def test_ipc_workerpool(device, ipc_memory_resource): + """Test IPC with a worker pool.""" + mr = ipc_memory_resource + buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] + with multiprocessing.Pool(processes=NWORKERS) as pool: + pool.map(process_buffer, buffers) + + for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=True) + + +def test_ipc_workerpool_multi_mr(device, ipc_memory_resource): + """Test IPC with a worker pool using multiple memory resources.""" + mrs = [ipc_memory_resource] + [ + DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) + ] + buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] + with multiprocessing.Pool(processes=NWORKERS) as pool: + pool.map(process_buffer, buffers) + + for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=True) + + +def process_buffer(buffer): + device = Device() + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=True) diff --git a/cuda_core/tests/ipc/utility.py b/cuda_core/tests/memory_ipc/utility.py similarity index 99% rename from cuda_core/tests/ipc/utility.py rename to cuda_core/tests/memory_ipc/utility.py index 781790a9d..766188d10 100644 --- a/cuda_core/tests/ipc/utility.py +++ b/cuda_core/tests/memory_ipc/utility.py @@ -7,9 +7,11 @@ from cuda import cuda as driver import ctypes + from cuda.core.experimental import Buffer, MemoryResource from cuda.core.experimental._utils.cuda_utils import handle_return + class DummyUnifiedMemoryResource(MemoryResource): def __init__(self, device): self.device = device @@ -66,4 +68,3 @@ 
def verify_buffer(self, flipped=False, starting_from=0): assert ctypes.c_byte(ptr[i]).value == ctypes.c_byte(op(starting_from + i)).value, ( f"Buffer contains incorrect data at index {i}" ) - diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index c14de8585..497ed751e 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -348,7 +348,7 @@ def test_mempool(mempool_device): ipc_error_msg = "Memory resource is not IPC-enabled" with pytest.raises(RuntimeError, match=ipc_error_msg): - mr._get_allocation_handle() + mr.get_allocation_handle() with pytest.raises(RuntimeError, match=ipc_error_msg): buffer.export() From 7f7f80fe2ecdf3f8a6eb80f79882aede77f072f5 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 24 Sep 2025 14:49:51 -0700 Subject: [PATCH 08/25] Protects serialization where needed to avoid resource leaks. Adds a registry from imported memory resources so that buffers can be serialized using an mr key. Test updates. --- cuda_core/cuda/core/experimental/_memory.pyx | 85 ++++++++--- cuda_core/tests/memory_ipc/test_channel.py | 4 +- cuda_core/tests/memory_ipc/test_errors.py | 45 +++++- .../tests/memory_ipc/test_send_buffers.py | 10 +- cuda_core/tests/memory_ipc/test_serialize.py | 135 +++++++++++++----- cuda_core/tests/memory_ipc/test_workerpool.py | 107 ++++++++++---- 6 files changed, 290 insertions(+), 96 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 7f5b9e54d..4dc5c52e5 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -16,6 +16,7 @@ import abc import array import cython import multiprocessing +import multiprocessing.context import multiprocessing.reduction import os import platform @@ -437,7 +438,9 @@ cdef class IPCAllocationHandle: self.close() def __reduce__(self): - df = multiprocessing.reduction.DupFd(self.handle) + multiprocessing.context.assert_spawning(self) + 
fd = os.dup(self.handle) + df = multiprocessing.reduction.DupFd(fd) return IPCAllocationHandle._reconstruct, (df,) @staticmethod @@ -617,6 +620,12 @@ class DeviceMemoryResourceAttributes: del mempool_property +# Holds DeviceMemoryResource objects imported by this process. +# This enables buffer serialization, as buffers can reduce to a pair +# of comprising the memory resource `remote_id` (the key into this registry) +# and the serialized buffer descriptor. +_ipc_registry = {} + class DeviceMemoryResource(MemoryResource): """Create a device memory resource managing a stream-ordered memory pool. @@ -640,7 +649,7 @@ class DeviceMemoryResource(MemoryResource): device memory resource does not own the pool (`is_handle_owned` is `False`), and closing the resource has no effect. """ - __slots__ = "_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", "_mempool_owned", "_is_imported" + __slots__ = "_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", "_mempool_owned", "_is_imported", "_remote_id" def __init__(self, device_id: int | Device, options=None): device_id = getattr(device_id, 'device_id', device_id) @@ -656,6 +665,7 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = _NOIPC_HANDLE_TYPE self._mempool_owned = False self._is_imported = False + self._remote_id = None err, self._mempool_handle = driver.cuDeviceGetMemPool(self.device_id) raise_if_driver_error(err) @@ -697,6 +707,7 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = properties.handleTypes self._mempool_owned = True self._is_imported = False + self._remote_id = None err, self._mempool_handle = driver.cuMemPoolCreate(properties) raise_if_driver_error(err) @@ -709,28 +720,57 @@ class DeviceMemoryResource(MemoryResource): def close(self): """Close the device memory resource and destroy the associated memory pool if owned.""" - if self._mempool_handle is not None and self._mempool_owned: - err, = driver.cuMemPoolDestroy(self._mempool_handle) - 
raise_if_driver_error(err) + if self._mempool_handle is not None: + try: + if self._mempool_owned: + err, = driver.cuMemPoolDestroy(self._mempool_handle) + raise_if_driver_error(err) + finally: + self._dev_id = None + self._mempool_handle = None + self._attributes = None + self._ipc_handle_type = _NOIPC_HANDLE_TYPE + self._mempool_owned = False + self._is_imported = False + self._remote_id = None - self._dev_id = None - self._mempool_handle = None - self._attributes = None - self._ipc_handle_type = _NOIPC_HANDLE_TYPE - self._mempool_owned = False - self._is_imported = False def __reduce__(self): - from ._device import Device - device = Device(self.device_id) - alloc_handle = self.get_allocation_handle() - df = multiprocessing.reduction.DupFd(alloc_handle.detach()) - return DeviceMemoryResource._reconstruct, (device, df) + # If spawning a new process, serialize the resources; otherwise, just + # send the remote_id, using the registry on the receiving end. + is_spawning = multiprocessing.context.get_spawning_popen() is not None + if is_spawning: + from ._device import Device + device = Device(self.device_id) + alloc_handle = self.get_allocation_handle() + return DeviceMemoryResource._reconstruct, (device, alloc_handle, self.remote_id) + else: + return DeviceMemoryResource.from_registry, (self.remote_id,) + + @staticmethod + def _reconstruct(device, alloc_handle, remote_id): + self = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + self.register(remote_id) + return self @staticmethod - def _reconstruct(device, df): - alloc_handle = IPCAllocationHandle._init(df.detach()) - return DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + def from_registry(remote_id): + try: + return _ipc_registry[remote_id] + except KeyError: + raise RuntimeError(f"Memory resource with {remote_id=} was not found") + + def register(self, remote_id: int): + if remote_id not in _ipc_registry: + assert self._remote_id is None or self._remote_id == remote_id + 
_ipc_registry[remote_id] = self + self._remote_id = remote_id + + @property + def remote_id(self): + if self._remote_id is None and not self._is_imported: + self._remote_id = int(self._mempool_handle) + return self._remote_id def create_ipc_channel(self): """Create an IPC memory channel for sharing allocations.""" @@ -746,7 +786,7 @@ class DeviceMemoryResource(MemoryResource): return cls.from_allocation_handle(device_id, alloc_handle) @classmethod - def from_allocation_handle(cls, device_id: int | Device, alloc_handle: IPCAllocationHandle) -> DeviceMemoryResource: + def from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: """Create a device memory resource from an allocation handle. Construct a new `DeviceMemoryResource` instance that imports a memory @@ -759,7 +799,7 @@ class DeviceMemoryResource(MemoryResource): The ID of the device or a Device object for which the memory resource is created. - alloc_handle : int + alloc_handle : int | IPCAllocationHandle The shareable handle of the device memory resource to import. 
Returns @@ -775,6 +815,7 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = _IPC_HANDLE_TYPE self._mempool_owned = True self._is_imported = True + self._remote_id = None err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) @@ -797,6 +838,8 @@ class DeviceMemoryResource(MemoryResource): """ if not self.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") + if self._is_imported: + raise RuntimeError("Imported memory resource cannot be exported") err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) return IPCAllocationHandle._init(alloc_handle) diff --git a/cuda_core/tests/memory_ipc/test_channel.py b/cuda_core/tests/memory_ipc/test_channel.py index c118bc122..49e8f3be2 100644 --- a/cuda_core/tests/memory_ipc/test_channel.py +++ b/cuda_core/tests/memory_ipc/test_channel.py @@ -7,7 +7,7 @@ from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel -CHILD_TIMEOUT_SEC = 10 +CHILD_TIMEOUT_SEC = 4 NBYTES = 64 NWORKERS = 2 NTASKS = 2 @@ -134,7 +134,7 @@ def child_main3(idx, channel, queue): device.set_current() alloc_handle = channel.receive_allocation_handle() mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - buffer_descriptor = queue.get() + buffer_descriptor = queue.get(timeout=CHILD_TIMEOUT_SEC) buffer = Buffer.import_(mr, buffer_descriptor) IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index b8bb9bd4b..a6003d19b 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -6,7 +6,7 @@ from cuda.core.experimental import Buffer, DeviceMemoryResource from cuda.core.experimental._utils.cuda_utils import CUDAError -CHILD_TIMEOUT_SEC = 10 +CHILD_TIMEOUT_SEC = 4 NBYTES 
= 64 POOL_SIZE = 2097152 @@ -24,7 +24,7 @@ def test_main(self, device, ipc_memory_resource): # Start a child process to generate error info. pipe = [multiprocessing.Queue() for _ in range(2)] - process = multiprocessing.Process(target=self.child_main, args=(pipe,)) + process = multiprocessing.Process(target=self.child_main, args=(pipe, self.device, self.mr)) process.start() # Interact. @@ -38,8 +38,10 @@ def test_main(self, device, ipc_memory_resource): process.join(timeout=CHILD_TIMEOUT_SEC) assert process.exitcode == 0 - def child_main(self, pipe): + def child_main(self, pipe, device, mr): """Child process that pushes IPC errors to a shared pipe for testing.""" + self.device = device + self.mr = mr try: self.CHILD_ACTION(pipe[0]) except Exception as e: @@ -56,7 +58,7 @@ def PARENT_ACTION(self, queue): queue.put(self.mr) def CHILD_ACTION(self, queue): - mr = queue.get() + mr = queue.get(timeout=CHILD_TIMEOUT_SEC) mr.allocate(NBYTES) def ASSERT(self, exc_type, exc_msg): @@ -73,9 +75,42 @@ def PARENT_ACTION(self, queue): queue.put([self.mr, buffer.export()]) # Note: mr does not own this buffer def CHILD_ACTION(self, queue): - mr, buffer_desc = queue.get() + mr, buffer_desc = queue.get(timeout=CHILD_TIMEOUT_SEC) Buffer.import_(mr, buffer_desc) def ASSERT(self, exc_type, exc_msg): assert exc_type is CUDAError assert "CUDA_ERROR_INVALID_VALUE" in exc_msg + + +class TestExportImportedMR(ChildErrorHarness): + """Error when exporting a memory resource that was imported.""" + + def PARENT_ACTION(self, queue): + queue.put(self.mr) + + def CHILD_ACTION(self, queue): + mr = queue.get(timeout=CHILD_TIMEOUT_SEC) + mr.get_allocation_handle() + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is RuntimeError + assert exc_msg == "Imported memory resource cannot be exported" + + +class TestImportBuffer(ChildErrorHarness): + """Error when using a buffer as a buffer descriptor.""" + + def PARENT_ACTION(self, queue): + # Note: if the buffer is not attached to something to 
prolong its life, + # CUDA_ERROR_INVALID_CONTEXT is raised from Buffer.__del__ + self.buffer = self.mr.allocate(NBYTES) + queue.put(self.buffer) + + def CHILD_ACTION(self, queue): + buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) + Buffer.import_(self.mr, buffer) + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is TypeError + assert exc_msg.startswith("Argument 'ipc_buffer' has incorrect type") diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index e835e53f3..496f32553 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -1,17 +1,20 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental import Device, DeviceMemoryResource +import multiprocessing from itertools import cycle + from utility import IPCBufferTestHelper -import multiprocessing -CHILD_TIMEOUT_SEC = 10 +from cuda.core.experimental import Device, DeviceMemoryResource + +CHILD_TIMEOUT_SEC = 4 NBYTES = 64 NMRS = 3 NTASKS = 7 POOL_SIZE = 2097152 + def test_ipc_send_buffers(device, ipc_memory_resource): """Test passing buffers directly to a child separately from a memory resource.""" mr = ipc_memory_resource @@ -35,6 +38,7 @@ def test_ipc_send_buffers(device, ipc_memory_resource): helper = IPCBufferTestHelper(device, buffer) helper.verify_buffer(flipped=True) + def test_ipc_send_buffers_multi(device, ipc_memory_resource): """Test passing buffers sourced from multiple memory resources.""" # Set up several IPC-enabled memory pools. diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index f12489dea..97e4620cf 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -1,79 +1,133 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import multiprocessing +import multiprocessing as mp +import multiprocessing.reduction +import os -import pytest from utility import IPCBufferTestHelper -from cuda.core.experimental import Buffer, DeviceMemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource -CHILD_TIMEOUT_SEC = 10 +CHILD_TIMEOUT_SEC = 4 NBYTES = 64 POOL_SIZE = 2097152 -class TestObjectSerialization: - @pytest.mark.parametrize("use_alloc_handle", [True, False]) - def test_main(self, use_alloc_handle, device, ipc_memory_resource): +class TestObjectSerializationDirect: + """ + Test the low-level interface for sharing memory resources. + + Send a memory resource over a connection via Python's `send_handle`. Reconstruct + it on the other end and demonstrate buffer sharing. + """ + + def test_main(self, device, ipc_memory_resource): + mr = ipc_memory_resource + + # Start the child process. + parent_conn, child_conn = mp.Pipe() + process = mp.Process(target=self.child_main, args=(child_conn,)) + process.start() + + # Send a memory resource by allocation handle. + alloc_handle = mr.get_allocation_handle() + mp.reduction.send_handle(parent_conn, alloc_handle.handle, process.pid) + parent_conn.send(mr.remote_id) + + # Send a buffer. + buffer1 = mr.allocate(NBYTES) + parent_conn.send(buffer1) # directly + + buffer2 = mr.allocate(NBYTES) + parent_conn.send(buffer2.export()) # by descriptor + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + # Confirm buffers were modified. + IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=True) + IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) + + def child_main(self, conn): + # Set up the device. + device = Device() + device.set_current() + + # Receive the memory resource. 
+ handle = mp.reduction.recv_handle(conn) + remote_id = conn.recv() + mr = DeviceMemoryResource.from_allocation_handle(device, handle) + mr.register(remote_id) + os.close(handle) + + # Receive the buffers. + buffer1 = conn.recv() # directly + buffer_desc = conn.recv() + buffer2 = Buffer.import_(mr, buffer_desc) # by descriptor + + # Modify the buffers. + IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=True) + IPCBufferTestHelper(device, buffer2).fill_buffer(flipped=True) + + +class TestObjectSerializationWithMR: + def test_main(self, device, ipc_memory_resource): """Test sending IPC memory objects to a child through a queue.""" mr = ipc_memory_resource # Start the child process. - pipe = [multiprocessing.Queue() for _ in range(2)] - process = multiprocessing.Process(target=self.child_main, args=(pipe, use_alloc_handle)) + pipe = [mp.Queue() for _ in range(2)] + process = mp.Process(target=self.child_main, args=(pipe, mr)) process.start() # Send a device description. pipe[0].put(device) - device_id = pipe[1].get() + device_id = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) assert device_id == device.device_id - # Send a memory resource directly or by allocation handle. - # Note: there is no apparent way to check the ID between processes. - if use_alloc_handle: - # Send MR by a handle. - alloc_handle = mr.get_allocation_handle() - pipe[0].put(alloc_handle) - else: - # Send MR directly. - pipe[0].put(mr) + # Send a memory resource directly. This relies on the mr already + # being passed when spawning the child. + pipe[0].put(mr) + remote_id = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) + assert remote_id == mr.remote_id # Send a buffer. buffer = mr.allocate(NBYTES) - helper = IPCBufferTestHelper(device, buffer) - helper.fill_buffer(flipped=False) pipe[0].put(buffer) - pipe[1].get() # signal done - helper.verify_buffer(flipped=True) # Wait for the child process. 
process.join(timeout=CHILD_TIMEOUT_SEC) assert process.exitcode == 0 - def child_main(self, pipe, use_alloc_handle): + # Confirm buffer was modified. + IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) + + def child_main(self, pipe, _): # Device. - device = pipe[0].get() + device = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) pipe[1].put(device.device_id) # Memory resource. - if use_alloc_handle: - alloc_handle = pipe[0].get() - mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - else: - mr = pipe[0].get() + mr = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) + pipe[1].put(mr.remote_id) # Buffer. - buffer = pipe[0].get() + buffer = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) assert buffer.memory_resource.handle == mr.handle - helper = IPCBufferTestHelper(device, buffer) - helper.verify_buffer(flipped=False) - helper.fill_buffer(flipped=True) - pipe[1].put(None) + IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) def test_object_passing(device, ipc_memory_resource): - """Test sending objects as arguments when starting a process.""" + """ + Test sending objects as arguments when starting a process. + + True pickling of allocation handles and memory resources is enabled only + when spawning a process. This is similar to the way sockets and various objects + in multiprocessing (e.g., Queue) work. + """ + # Define the objects. mr = ipc_memory_resource alloc_handle = mr.get_allocation_handle() @@ -84,7 +138,7 @@ def test_object_passing(device, ipc_memory_resource): helper.fill_buffer(flipped=False) # Start the child process. 
- process = multiprocessing.Process(target=child_main, args=(device, alloc_handle, mr, buffer_desc, buffer)) + process = mp.Process(target=child_main, args=(device, alloc_handle, mr, buffer_desc, buffer)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) assert process.exitcode == 0 @@ -95,8 +149,8 @@ def test_object_passing(device, ipc_memory_resource): def child_main(device, alloc_handle, mr1, buffer_desc, buffer1): mr2 = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - # OK to build the buffer from either mr and descriptor. - # These all point to the same buffer. + # OK to build the buffer from either mr and the descriptor. + # All buffer* objects point to the same memory. buffer2 = Buffer.import_(mr1, buffer_desc) buffer3 = Buffer.import_(mr2, buffer_desc) @@ -108,18 +162,21 @@ def child_main(device, alloc_handle, mr1, buffer_desc, buffer1): helper2.verify_buffer(flipped=False) helper3.verify_buffer(flipped=False) + # Modify 1. helper1.fill_buffer(flipped=True) helper1.verify_buffer(flipped=True) helper2.verify_buffer(flipped=True) helper3.verify_buffer(flipped=True) + # Modify 2. helper2.fill_buffer(flipped=False) helper1.verify_buffer(flipped=False) helper2.verify_buffer(flipped=False) helper3.verify_buffer(flipped=False) + # Modify 3. helper3.fill_buffer(flipped=True) helper1.verify_buffer(flipped=True) diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index 2dc29da8b..cf2fefd07 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -6,43 +6,98 @@ from utility import IPCBufferTestHelper -from cuda.core.experimental import Device, DeviceMemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource -CHILD_TIMEOUT_SEC = 10 +CHILD_TIMEOUT_SEC = 4 NBYTES = 64 NWORKERS = 2 NMRS = 3 NTASKS = 20 POOL_SIZE = 2097152 +# Global memory resources, set in children. 
+g_mrs = None -def test_ipc_workerpool(device, ipc_memory_resource): - """Test IPC with a worker pool.""" - mr = ipc_memory_resource - buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] - with multiprocessing.Pool(processes=NWORKERS) as pool: - pool.map(process_buffer, buffers) - for buffer in buffers: - helper = IPCBufferTestHelper(device, buffer) - helper.verify_buffer(flipped=True) +class TestIpcWorkerPoolUsingExport: + """ + Test buffer sharing using export handles. + The memory resources need to be passed to subprocesses at startup. Buffers + are passed by their handles and reconstructed using the corresponding mr. + """ -def test_ipc_workerpool_multi_mr(device, ipc_memory_resource): - """Test IPC with a worker pool using multiple memory resources.""" - mrs = [ipc_memory_resource] + [ - DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) - ] - buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] - with multiprocessing.Pool(processes=NWORKERS) as pool: - pool.map(process_buffer, buffers) + @staticmethod + def init_worker(mrs): + global g_mrs + g_mrs = mrs - for buffer in buffers: - helper = IPCBufferTestHelper(device, buffer) - helper.verify_buffer(flipped=True) + def test_ipc_workerpool(self, device, ipc_memory_resource): + """Test IPC with a worker pool.""" + mr = ipc_memory_resource + buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] + with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: + pool.starmap(self.process_buffer, [(0, buffer.export()) for buffer in buffers]) + for buffer in buffers: + IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) -def process_buffer(buffer): - device = Device() - helper = IPCBufferTestHelper(device, buffer) - helper.fill_buffer(flipped=True) + def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): + """Test IPC with a worker pool using multiple memory resources.""" + mrs = 
[ipc_memory_resource] + [ + DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) + ] + buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] + with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: + pool.starmap( + self.process_buffer, [(mrs.index(buffer.memory_resource), buffer.export()) for buffer in buffers] + ) + + for buffer in buffers: + IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) + + def process_buffer(self, mr_idx, buffer_desc): + device = Device() + buffer = Buffer.import_(g_mrs[mr_idx], buffer_desc) + IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) + + +class TestIpcWorkerPool: + """ + Test buffer sharing without using export handles. + + The memory resources need to be passed to subprocesses at startup. Buffers + are serialized with the `remote_id` of the corresponding mr, and the + import/export is handled automatically. + """ + + @staticmethod + def init_worker(mrs): + global g_mrs + g_mrs = mrs + + def test_ipc_workerpool(self, device, ipc_memory_resource): + """Test IPC with a worker pool.""" + mr = ipc_memory_resource + buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] + with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: + pool.map(self.process_buffer, buffers) + + for buffer in buffers: + IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) + + def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): + """Test IPC with a worker pool using multiple memory resources.""" + mrs = [ipc_memory_resource] + [ + DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) + ] + buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] + with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: + pool.map(self.process_buffer, buffers) + 
+ for buffer in buffers: + IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) + + def process_buffer(self, buffer): + device = Device() + IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) From e8822b3c2bf3307d0de8d429f62c49cd6bd09c9b Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 24 Sep 2025 17:32:19 -0700 Subject: [PATCH 09/25] Add tests for leaked file descriptors and fix leaks. --- cuda_core/cuda/core/experimental/_memory.pyx | 26 ++-- cuda_core/tests/memory_ipc/test_leaks.py | 129 +++++++++++++++++++ 2 files changed, 145 insertions(+), 10 deletions(-) create mode 100644 cuda_core/tests/memory_ipc/test_leaks.py diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 4dc5c52e5..d69dd58f2 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -439,8 +439,7 @@ cdef class IPCAllocationHandle: def __reduce__(self): multiprocessing.context.assert_spawning(self) - fd = os.dup(self.handle) - df = multiprocessing.reduction.DupFd(fd) + df = multiprocessing.reduction.DupFd(self.handle) return IPCAllocationHandle._reconstruct, (df,) @staticmethod @@ -649,7 +648,8 @@ class DeviceMemoryResource(MemoryResource): device memory resource does not own the pool (`is_handle_owned` is `False`), and closing the resource has no effect. 
""" - __slots__ = "_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", "_mempool_owned", "_is_imported", "_remote_id" + __slots__ = ("_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", + "_mempool_owned", "_is_imported", "_remote_id", "_alloc_handle") def __init__(self, device_id: int | Device, options=None): device_id = getattr(device_id, 'device_id', device_id) @@ -666,6 +666,7 @@ class DeviceMemoryResource(MemoryResource): self._mempool_owned = False self._is_imported = False self._remote_id = None + self._alloc_handle = None err, self._mempool_handle = driver.cuDeviceGetMemPool(self.device_id) raise_if_driver_error(err) @@ -708,6 +709,7 @@ class DeviceMemoryResource(MemoryResource): self._mempool_owned = True self._is_imported = False self._remote_id = None + self._alloc_handle = None err, self._mempool_handle = driver.cuMemPoolCreate(properties) raise_if_driver_error(err) @@ -733,6 +735,7 @@ class DeviceMemoryResource(MemoryResource): self._mempool_owned = False self._is_imported = False self._remote_id = None + self._alloc_handle = None def __reduce__(self): @@ -816,6 +819,7 @@ class DeviceMemoryResource(MemoryResource): self._mempool_owned = True self._is_imported = True self._remote_id = None + self._alloc_handle = None # only used for non-imported err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) @@ -836,13 +840,15 @@ class DeviceMemoryResource(MemoryResource): ------- The shareable handle for the memory pool. 
""" - if not self.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - if self._is_imported: - raise RuntimeError("Imported memory resource cannot be exported") - err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) - raise_if_driver_error(err) - return IPCAllocationHandle._init(alloc_handle) + if self._alloc_handle is None: + if not self.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + if self._is_imported: + raise RuntimeError("Imported memory resource cannot be exported") + err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) + raise_if_driver_error(err) + self._alloc_handle = IPCAllocationHandle._init(alloc_handle) + return self._alloc_handle def allocate(self, size_t size, stream: Stream = None) -> Buffer: """Allocate a buffer of the requested size. diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py new file mode 100644 index 000000000..77e4ef2c5 --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -0,0 +1,129 @@ +import gc +import multiprocessing as mp + +import psutil +import pytest + +from cuda.core.experimental import _memory +from cuda.core.experimental._utils.cuda_utils import driver + +CHILD_TIMEOUT_SEC = 4 +NBYTES = 64 + +USING_FDS = _memory._IPC_HANDLE_TYPE == driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR +skip_unless_using_fds = pytest.mark.skipif(not USING_FDS, reason="mempool allocation handle is not using fds") + + +@skip_unless_using_fds +def test_alloc_handle(ipc_memory_resource): + """Check for fd leaks in get_allocation_handle.""" + mr = ipc_memory_resource + with CheckFDLeaks(): + [mr.get_allocation_handle() for _ in range(10)] + + +def exec_with_object(obj, number=1): + """Succesfully run a child process.""" + for _ in range(number): + process = mp.Process(target=child_main, args=(obj,)) + process.start() + 
process.join() + assert process.exitcode == 0 + + +def child_main(obj, *args): + pass + + +def exec_launch_failure(obj, number=1): + """ + Unsuccesfully try to launch a child process. This fails when + after the child starts. + """ + for _ in range(number): + process = mp.Process(target=child_main_bad, args=(obj,)) + process.start() + process.join() + assert process.exitcode != 0 + + +def child_main_bad(): + """Fails when passed arguments.""" + pass + + +def exec_reduce_failure(obj, number=1): + """ + Unsuccesfully try to launch a child process. This fails before + the child starts but after the resource-owning object is serialized. + """ + for _ in range(number): + fails_to_reduce = Irreducible() + try: + mp.Process(target=child_main, args=(obj, fails_to_reduce)).start() + except RuntimeError: + pass + + +class Irreducible: + """A class that cannot be serialized.""" + def __reduce__(self): + raise RuntimeError("Irreducible") + + +@skip_unless_using_fds +@pytest.mark.parametrize( + "getobject", + [ + lambda mr: mr.get_allocation_handle(), + lambda mr: mr, + lambda mr: mr.allocate(NBYTES), + lambda mr: mr.allocate(NBYTES).export(), + ], + ids=["alloc_handle", "mr", "buffer", "buffer_desc"], +) +@pytest.mark.parametrize( + "launcher", [exec_with_object, exec_launch_failure, exec_reduce_failure] +) +def test_pass_object(ipc_memory_resource, launcher, getobject): + """Check for fd leaks when an object is sent as a subprocess argument.""" + mr = ipc_memory_resource + with CheckFDLeaks(): + obj = getobject(mr) + try: + launcher(obj, number=2) + finally: + del obj + + +class CheckFDLeaks: + """ + Context manager to check for file descriptor leaks. + Ensures the number of open file descriptors is the same before and after the block. 
+ """ + + def __init__(self): + self.process = psutil.Process() + + def __enter__(self): + self.prime() + gc.collect() + self.initial_fds = self.process.num_fds() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is not None: + gc.collect() + final_fds = self.process.num_fds() + assert final_fds == self.initial_fds + return False + + def prime(self, latch=[]): + """Multiprocessing consumes a file descriptor on first launch.""" + assert mp.get_start_method() == "spawn" + if not latch: + process = mp.Process() + process.start() + process.join() + assert process.exitcode == 0 + latch.append(None) From 708e2b5a0dc7724845c6b896c46b88043c1c4481 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 25 Sep 2025 08:58:33 -0700 Subject: [PATCH 10/25] Eliminates IPCChannel. --- cuda_core/cuda/core/experimental/__init__.py | 1 - cuda_core/cuda/core/experimental/_memory.pyx | 102 ---------- cuda_core/tests/memory_ipc/test_channel.py | 183 ------------------ cuda_core/tests/memory_ipc/test_memory_ipc.py | 182 +++++++++++++++++ 4 files changed, 182 insertions(+), 286 deletions(-) delete mode 100644 cuda_core/tests/memory_ipc/test_channel.py create mode 100644 cuda_core/tests/memory_ipc/test_memory_ipc.py diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index a06119321..9a86459d2 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -17,7 +17,6 @@ from cuda.core.experimental._memory import ( Buffer, DeviceMemoryResource, - IPCChannel, LegacyPinnedMemoryResource, MemoryResource, ) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index d69dd58f2..039998220 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -161,13 +161,6 @@ cdef class Buffer: raise_if_driver_error(err) return Buffer.from_handle(ptr, ipc_buffer.size, mr) - def 
export_to_channel(self, channel: IPCChannel): - channel.export(self); - - @classmethod - def import_from_channel(cls, channel: IPCChannel): - return channel.import_() - def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. @@ -464,83 +457,6 @@ cdef class IPCAllocationHandle: return self._handle -cdef class IPCChannel: - """Communication channel for sharing IPC-enabled memory pools.""" - - cdef: - object _proxy - object _queue - object _mr - - def __init__(self): - if platform.system() == "Linux": - self._proxy = IPCChannelUnixSocket._init() - else: - raise RuntimeError("IPC is not available on {platform.system()}") - self._queue = multiprocessing.Queue() - self._mr = None - - def send_buffer(self, buffer: Buffer): - handle = buffer.export() - self._queue.put(handle) - - def receive_buffer(self, device: Optional[Device] = None): - if self._mr is None: - if device is None: - from ._device import Device - device = Device() - self._mr = DeviceMemoryResource.from_shared_channel(device, self) - - handle = self._queue.get() - return Buffer.import_(self._mr, handle) - - def send_allocation_handle(self, alloc_handle: IPCAllocationHandle): - """Sends over this channel an allocation handle for exporting a - shared memory pool.""" - self._proxy.send_allocation_handle(alloc_handle) - - def receive_allocation_handle(self) -> IPCAllocationHandle: - """Receives over this channel an allocation handle for importing a - shared memory pool.""" - return self._proxy.receive_allocation_handle() - - -cdef class IPCChannelUnixSocket: - """Unix-specific channel for sharing memory pools over sockets.""" - - cdef: - object _sock_out - object _sock_in - - def __init__(self, *arg, **kwargs): - raise RuntimeError("IPCChannelUnixSocket objects cannot be instantiated directly. 
Please use MemoryResource APIs.") - - @classmethod - def _init(cls): - cdef IPCChannelUnixSocket self = IPCChannelUnixSocket.__new__(cls) - self._sock_out, self._sock_in = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) - return self - - cpdef send_allocation_handle(self, alloc_handle: IPCAllocationHandle): - """Sends over this channel an allocation handle for exporting a - shared memory pool.""" - self._sock_out.sendmsg( - [], - [(socket.SOL_SOCKET, socket.SCM_RIGHTS, array.array("i", [int(alloc_handle)]))] - ) - - cpdef IPCAllocationHandle receive_allocation_handle(self): - """Receives over this channel an allocation handle for importing a - shared memory pool.""" - fds = array.array("i") - _, ancillary_data, _, _ = self._sock_in.recvmsg(0, socket.CMSG_LEN(fds.itemsize)) - assert len(ancillary_data) == 1 - cmsg_level, cmsg_type, cmsg_data = ancillary_data[0] - assert cmsg_level == socket.SOL_SOCKET and cmsg_type == socket.SCM_RIGHTS - fds.frombytes(cmsg_data[: len(cmsg_data) - (len(cmsg_data) % fds.itemsize)]) - return IPCAllocationHandle._init(int(fds[0])) - - @dataclass cdef class DeviceMemoryResourceOptions: """Customizable :obj:`~_memory.DeviceMemoryResource` options. 
@@ -775,19 +691,6 @@ class DeviceMemoryResource(MemoryResource): self._remote_id = int(self._mempool_handle) return self._remote_id - def create_ipc_channel(self): - """Create an IPC memory channel for sharing allocations.""" - channel = IPCChannel() - self.share_to_channel(channel) - return channel - - @classmethod - def from_shared_channel(cls, device_id: int | Device, channel: IPCChannel) -> DeviceMemoryResource: - """Create a device memory resource from a memory pool shared over an IPC channel.""" - device_id = getattr(device_id, 'device_id', device_id) - alloc_handle = channel.receive_allocation_handle() - return cls.from_allocation_handle(device_id, alloc_handle) - @classmethod def from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: """Create a device memory resource from an allocation handle. @@ -825,11 +728,6 @@ class DeviceMemoryResource(MemoryResource): raise_if_driver_error(err) return self - def share_to_channel(self, channel : IPCChannel): - if not self.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - channel.send_allocation_handle(self.get_allocation_handle()) - def get_allocation_handle(self) -> IPCAllocationHandle: """Export the memory pool handle to be shared (requires IPC). diff --git a/cuda_core/tests/memory_ipc/test_channel.py b/cuda_core/tests/memory_ipc/test_channel.py deleted file mode 100644 index 49e8f3be2..000000000 --- a/cuda_core/tests/memory_ipc/test_channel.py +++ /dev/null @@ -1,183 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import multiprocessing - -from utility import IPCBufferTestHelper - -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel - -CHILD_TIMEOUT_SEC = 4 -NBYTES = 64 -NWORKERS = 2 -NTASKS = 2 - - -def test_ipc_mempool(device, ipc_memory_resource): - """Test IPC with memory pools.""" - # Set up the IPC-enabled memory pool and share it. - mr = ipc_memory_resource - channel = mr.create_ipc_channel() - - # Start the child process. - process = multiprocessing.Process(target=child_main1, args=(channel,)) - process.start() - - # Allocate and fill memory. - buffer = mr.allocate(NBYTES) - helper = IPCBufferTestHelper(device, buffer) - helper.fill_buffer(flipped=False) - - # Export the buffer via IPC. - channel.send_buffer(buffer) - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - # Verify that the buffer was modified. - helper.verify_buffer(flipped=True) - - -def child_main1(channel): - device = Device() - device.set_current() - buffer = channel.receive_buffer() - helper = IPCBufferTestHelper(device, buffer) - helper.verify_buffer(flipped=False) - helper.fill_buffer(flipped=True) - - -def test_ipc_mempool_multiple(device, ipc_memory_resource): - """Test IPC with memory pools using multiple processes.""" - # Construct an IPC-enabled memory resource and share it over two channels. - mr = ipc_memory_resource - ch1, ch2 = (mr.create_ipc_channel() for _ in range(2)) - - # Allocate memory buffers and export them to each channel. - buffer1 = mr.allocate(NBYTES) - ch1.send_buffer(buffer1) - ch2.send_buffer(buffer1) - buffer2 = mr.allocate(NBYTES) - ch1.send_buffer(buffer2) - ch2.send_buffer(buffer2) - - # Start the child processes. - p1 = multiprocessing.Process(target=child_main2, args=(1, ch1)) - p2 = multiprocessing.Process(target=child_main2, args=(2, ch2)) - p1.start() - p2.start() - - # Wait for the child processes. 
- p1.join(timeout=CHILD_TIMEOUT_SEC) - p2.join(timeout=CHILD_TIMEOUT_SEC) - assert p1.exitcode == 0 - assert p2.exitcode == 0 - - # Verify that the buffers were modified. - IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) - IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) - - -def child_main2(idx, channel): - device = Device() - device.set_current() - buffer1 = channel.receive_buffer() # implicitly set up the shared memory pool - buffer2 = channel.receive_buffer() - if idx == 1: - IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=False) - elif idx == 2: - IPCBufferTestHelper(device, buffer2).fill_buffer(flipped=True) - - -def test_ipc_shared_allocation_handle(device, ipc_memory_resource): - """Demonstrate that a memory pool allocation handle can be reused for IPC - with multiple processes.""" - # Set up communication. - ch1 = IPCChannel() - ch2 = IPCChannel() - q1 = multiprocessing.Queue() - q2 = multiprocessing.Queue() - - # Start children. - p1 = multiprocessing.Process(target=child_main3, args=(1, ch1, q1)) - p2 = multiprocessing.Process(target=child_main3, args=(2, ch2, q2)) - p1.start() - p2.start() - - # Set up the IPC-enabled memory pool and share it using one handle. - mr = ipc_memory_resource - alloc_handle = mr.get_allocation_handle() - ch1.send_allocation_handle(alloc_handle) - ch2.send_allocation_handle(alloc_handle) - - # Allocate a share memory. - buf1 = mr.allocate(NBYTES) - buf2 = mr.allocate(NBYTES) - q1.put(buf1.export()) - q2.put(buf2.export()) - - # Wait for children. - p1.join(timeout=CHILD_TIMEOUT_SEC) - p2.join(timeout=CHILD_TIMEOUT_SEC) - assert p1.exitcode == 0 - assert p2.exitcode == 0 - - # Verify results. 
- IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) - IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - - -def child_main3(idx, channel, queue): - """Fills a shared memory buffer.""" - device = Device() - device.set_current() - alloc_handle = channel.receive_allocation_handle() - mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - buffer_descriptor = queue.get(timeout=CHILD_TIMEOUT_SEC) - buffer = Buffer.import_(mr, buffer_descriptor) - IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) - - -def test_ipc_shared_allocation_handle2(device, ipc_memory_resource): - """Demonstrate that a memory pool allocation handle can be reused for IPC - with multiple processes (simplified).""" - # Set up communication. - ch1 = IPCChannel() - ch2 = IPCChannel() - - # Start children. - p1 = multiprocessing.Process(target=child_main4, args=(1, ch1)) - p2 = multiprocessing.Process(target=child_main4, args=(2, ch2)) - p1.start() - p2.start() - - # Set up the IPC-enabled memory pool and share it using one handle. - mr = ipc_memory_resource - alloc_handle = mr.get_allocation_handle() - ch1.send_allocation_handle(alloc_handle) - ch2.send_allocation_handle(alloc_handle) - - # Allocate a share memory. - buf1 = mr.allocate(NBYTES) - buf2 = mr.allocate(NBYTES) - ch1.send_buffer(buf1) - ch2.send_buffer(buf2) - - # Wait for children. - p1.join(timeout=CHILD_TIMEOUT_SEC) - p2.join(timeout=CHILD_TIMEOUT_SEC) - assert p1.exitcode == 0 - assert p2.exitcode == 0 - - # Verify results. 
- IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) - IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - - -def child_main4(idx, channel): - """Fills a shared memory buffer.""" - device = Device() - device.set_current() - buffer = channel.receive_buffer() - IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py new file mode 100644 index 000000000..9a527bf0d --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -0,0 +1,182 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import multiprocessing as mp + +from utility import IPCBufferTestHelper + +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource + +CHILD_TIMEOUT_SEC = 4 +NBYTES = 64 +NWORKERS = 2 +NTASKS = 2 + + +class TestIpcMempool: + def test_main(self, device, ipc_memory_resource): + """Test IPC with memory pools.""" + # Set up the IPC-enabled memory pool and share it. + mr = ipc_memory_resource + + # Start the child process. + queue = mp.Queue() + process = mp.Process(target=self.child_main, args=(mr, queue)) + process.start() + + # Allocate and fill memory. + buffer = mr.allocate(NBYTES) + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=False) + + # Export the buffer via IPC. + queue.put(buffer) + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + # Verify that the buffer was modified. 
+ helper.verify_buffer(flipped=True) + + + def child_main(self, mr, queue): + device = Device() + buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=False) + helper.fill_buffer(flipped=True) + + +class TestIPCMempoolMultiple: + def test_main(self, device, ipc_memory_resource): + """Test IPC with memory pools using multiple processes.""" + # Construct an IPC-enabled memory resource and share it with two children. + mr = ipc_memory_resource + q1, q2 = (mp.Queue() for _ in range(2)) + + # Allocate memory buffers and export them to each child. + buffer1 = mr.allocate(NBYTES) + q1.put(buffer1) + q2.put(buffer1) + buffer2 = mr.allocate(NBYTES) + q1.put(buffer2) + q2.put(buffer2) + + # Start the child processes. + p1 = mp.Process(target=self.child_main, args=(mr, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(mr, 2, q2)) + p1.start() + p2.start() + + # Wait for the child processes. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify that the buffers were modified. + IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) + IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) + + + def child_main(self, mr, idx, queue): + # Note: passing the mr registers it so that buffers can be passed + # directly. + device = Device() + buffer1 = queue.get(timeout=CHILD_TIMEOUT_SEC) + buffer2 = queue.get(timeout=CHILD_TIMEOUT_SEC) + if idx == 1: + IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=False) + elif idx == 2: + IPCBufferTestHelper(device, buffer2).fill_buffer(flipped=True) + + +class TestIPCSharedAllocationHandleAndBufferDescriptors: + def test_main(self, device, ipc_memory_resource): + """ + Demonstrate that a memory pool allocation handle can be reused for IPC + with multiple processes. Uses buffer descriptors. 
+ """ + # Set up the IPC-enabled memory pool and share it using one handle. + mr = ipc_memory_resource + alloc_handle = mr.get_allocation_handle() + + # Start children. + q1, q2 = (mp.Queue() for _ in range(2)) + p1 = mp.Process(target=self.child_main, args=(alloc_handle, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(alloc_handle, 2, q2)) + p1.start() + p2.start() + + # Allocate and share memory. + buf1 = mr.allocate(NBYTES) + buf2 = mr.allocate(NBYTES) + q1.put(buf1.export()) + q2.put(buf2.export()) + + # Wait for children. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify results. + IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) + IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) + + + def child_main(self, alloc_handle, idx, queue): + """Fills a shared memory buffer.""" + # In this case, the device needs to be set up (passing the mr does it + # implicitly in other tests). + device = Device() + device.set_current() + mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + buffer_descriptor = queue.get(timeout=CHILD_TIMEOUT_SEC) + buffer = Buffer.import_(mr, buffer_descriptor) + IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) + + +class TestIPCSharedAllocationHandleAndBufferObjects: + def test_main(self, device, ipc_memory_resource): + """ + Demonstrate that a memory pool allocation handle can be reused for IPC + with multiple processes. Uses buffer objects (not descriptors). + """ + mr = ipc_memory_resource + alloc_handle = mr.get_allocation_handle() + + # Start children. + q1, q2 = (mp.Queue() for _ in range(2)) + p1 = mp.Process(target=self.child_main, args=(alloc_handle, mr.remote_id, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(alloc_handle, mr.remote_id, 2, q2)) + p1.start() + p2.start() + + # Allocate and share memory. 
+ buf1 = mr.allocate(NBYTES) + buf2 = mr.allocate(NBYTES) + q1.put(buf1) + q2.put(buf2) + + # Wait for children. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify results. + IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) + IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) + + + def child_main(self, alloc_handle, remote_id, idx, queue): + """Fills a shared memory buffer.""" + device = Device() + device.set_current() + mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + mr.register(remote_id) + buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) + IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) + From b31d8490ffcd8fec50f38a15a6cf540e4e589f6f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 25 Sep 2025 09:50:46 -0700 Subject: [PATCH 11/25] Changes DeviceMemoryResource remote_id to uuid. --- cuda_core/cuda/core/experimental/_memory.pyx | 49 ++++++++++--------- cuda_core/tests/memory_ipc/test_memory_ipc.py | 8 +-- cuda_core/tests/memory_ipc/test_serialize.py | 12 ++--- cuda_core/tests/memory_ipc/test_workerpool.py | 2 +- 4 files changed, 36 insertions(+), 35 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 039998220..52588dbd2 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -21,6 +21,7 @@ import multiprocessing.reduction import os import platform import sys +import uuid as uuid_module import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream @@ -537,7 +538,7 @@ class DeviceMemoryResourceAttributes: # Holds DeviceMemoryResource objects imported by this process. 
# This enables buffer serialization, as buffers can reduce to a pair -# of comprising the memory resource `remote_id` (the key into this registry) +# of comprising the memory resource UUID (the key into this registry) # and the serialized buffer descriptor. _ipc_registry = {} @@ -565,7 +566,7 @@ class DeviceMemoryResource(MemoryResource): `False`), and closing the resource has no effect. """ __slots__ = ("_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", - "_mempool_owned", "_is_imported", "_remote_id", "_alloc_handle") + "_mempool_owned", "_is_imported", "_uuid", "_alloc_handle") def __init__(self, device_id: int | Device, options=None): device_id = getattr(device_id, 'device_id', device_id) @@ -581,7 +582,7 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = _NOIPC_HANDLE_TYPE self._mempool_owned = False self._is_imported = False - self._remote_id = None + self._uuid = None self._alloc_handle = None err, self._mempool_handle = driver.cuDeviceGetMemPool(self.device_id) @@ -624,14 +625,14 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = properties.handleTypes self._mempool_owned = True self._is_imported = False - self._remote_id = None + self._uuid = None self._alloc_handle = None err, self._mempool_handle = driver.cuMemPoolCreate(properties) raise_if_driver_error(err) if opts.ipc_enabled: - self.get_allocation_handle() # enables Buffer.export + self.get_allocation_handle() # enables Buffer.export, sets uuid def __del__(self): self.close() @@ -650,46 +651,44 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = _NOIPC_HANDLE_TYPE self._mempool_owned = False self._is_imported = False - self._remote_id = None + self._uuid = None self._alloc_handle = None def __reduce__(self): # If spawning a new process, serialize the resources; otherwise, just - # send the remote_id, using the registry on the receiving end. + # send the UUID, using the registry on the receiving end. 
is_spawning = multiprocessing.context.get_spawning_popen() is not None if is_spawning: from ._device import Device device = Device(self.device_id) alloc_handle = self.get_allocation_handle() - return DeviceMemoryResource._reconstruct, (device, alloc_handle, self.remote_id) + return DeviceMemoryResource._reconstruct, (device, alloc_handle, self.uuid) else: - return DeviceMemoryResource.from_registry, (self.remote_id,) + return DeviceMemoryResource.from_registry, (self.uuid,) @staticmethod - def _reconstruct(device, alloc_handle, remote_id): + def _reconstruct(device, alloc_handle, uuid): self = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - self.register(remote_id) + self.register(uuid) return self @staticmethod - def from_registry(remote_id): + def from_registry(uuid: uuid_module.UUID): try: - return _ipc_registry[remote_id] + return _ipc_registry[uuid] except KeyError: - raise RuntimeError(f"Memory resource with {remote_id=} was not found") + raise RuntimeError(f"Memory resource with {uuid=} was not found") - def register(self, remote_id: int): - if remote_id not in _ipc_registry: - assert self._remote_id is None or self._remote_id == remote_id - _ipc_registry[remote_id] = self - self._remote_id = remote_id + def register(self, uuid: uuid_module.UUID): + if uuid not in _ipc_registry: + assert self._uuid is None or self._uuid == uuid + _ipc_registry[uuid] = self + self._uuid = uuid @property - def remote_id(self): - if self._remote_id is None and not self._is_imported: - self._remote_id = int(self._mempool_handle) - return self._remote_id + def uuid(self): + return self._uuid @classmethod def from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: @@ -721,7 +720,7 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = _IPC_HANDLE_TYPE self._mempool_owned = True self._is_imported = True - self._remote_id = None + self._uuid = None self._alloc_handle = None # only used 
for non-imported err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), _IPC_HANDLE_TYPE, 0) @@ -746,6 +745,8 @@ class DeviceMemoryResource(MemoryResource): err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) self._alloc_handle = IPCAllocationHandle._init(alloc_handle) + assert self._uuid is None + self._uuid = uuid_module.uuid4() return self._alloc_handle def allocate(self, size_t size, stream: Stream = None) -> Buffer: diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 9a527bf0d..c0be05188 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -149,8 +149,8 @@ def test_main(self, device, ipc_memory_resource): # Start children. q1, q2 = (mp.Queue() for _ in range(2)) - p1 = mp.Process(target=self.child_main, args=(alloc_handle, mr.remote_id, 1, q1)) - p2 = mp.Process(target=self.child_main, args=(alloc_handle, mr.remote_id, 2, q2)) + p1 = mp.Process(target=self.child_main, args=(alloc_handle, mr.uuid, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(alloc_handle, mr.uuid, 2, q2)) p1.start() p2.start() @@ -171,12 +171,12 @@ def test_main(self, device, ipc_memory_resource): IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - def child_main(self, alloc_handle, remote_id, idx, queue): + def child_main(self, alloc_handle, uuid, idx, queue): """Fills a shared memory buffer.""" device = Device() device.set_current() mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - mr.register(remote_id) + mr.register(uuid) buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 97e4620cf..cd17bf366 100644 --- 
a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -33,7 +33,7 @@ def test_main(self, device, ipc_memory_resource): # Send a memory resource by allocation handle. alloc_handle = mr.get_allocation_handle() mp.reduction.send_handle(parent_conn, alloc_handle.handle, process.pid) - parent_conn.send(mr.remote_id) + parent_conn.send(mr.uuid) # Send a buffer. buffer1 = mr.allocate(NBYTES) @@ -57,9 +57,9 @@ def child_main(self, conn): # Receive the memory resource. handle = mp.reduction.recv_handle(conn) - remote_id = conn.recv() + uuid = conn.recv() mr = DeviceMemoryResource.from_allocation_handle(device, handle) - mr.register(remote_id) + mr.register(uuid) os.close(handle) # Receive the buffers. @@ -90,8 +90,8 @@ def test_main(self, device, ipc_memory_resource): # Send a memory resource directly. This relies on the mr already # being passed when spawning the child. pipe[0].put(mr) - remote_id = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) - assert remote_id == mr.remote_id + uuid = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) + assert uuid == mr.uuid # Send a buffer. buffer = mr.allocate(NBYTES) @@ -111,7 +111,7 @@ def child_main(self, pipe, _): # Memory resource. mr = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) - pipe[1].put(mr.remote_id) + pipe[1].put(mr.uuid) # Buffer. buffer = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index cf2fefd07..50fa1d509 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -67,7 +67,7 @@ class TestIpcWorkerPool: Test buffer sharing without using export handles. The memory resources need to be passed to subprocesses at startup. Buffers - are serialized with the `remote_id` of the corresponding mr, and the + are serialized with the `uuid` of the corresponding mr, and the import/export is handled automatically. 
""" From 6c53cb0c8fe36db1d7a1c4bb6f660fd00ad39f92 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 25 Sep 2025 10:20:23 -0700 Subject: [PATCH 12/25] Embeds the memory resource UUID into allocation handles. --- cuda_core/cuda/core/experimental/_memory.pyx | 62 +++++++++---------- cuda_core/tests/memory_ipc/conftest.py | 4 +- cuda_core/tests/memory_ipc/test_errors.py | 23 ++++++- cuda_core/tests/memory_ipc/test_leaks.py | 38 +++++++----- cuda_core/tests/memory_ipc/test_memory_ipc.py | 18 +++--- cuda_core/tests/memory_ipc/test_serialize.py | 3 +- 6 files changed, 86 insertions(+), 62 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 52588dbd2..3e75967cc 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -77,11 +77,7 @@ cdef class Buffer: self.close() def __reduce__(self): - return Buffer._reconstruct, (self.memory_resource, self.export()) - - @staticmethod - def _reconstruct(mr, desc): - return Buffer.import_(mr, desc) + return Buffer.import_, (self.memory_resource, self.export()) cpdef close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. @@ -390,33 +386,29 @@ cdef class IPCBufferDescriptor: return self def __reduce__(self): - # This is subject to change if the CUmemPoolPtrExportData struct/object changes. - return (self._reconstruct, (self._reserved, self._size)) + return self._init, (self._reserved, self._size) @property def size(self): return self._size - @classmethod - def _reconstruct(cls, reserved, size): - instance = cls._init(reserved, size) - return instance - cdef class IPCAllocationHandle: """Shareable handle to an IPC-enabled device memory pool.""" cdef: int _handle + object _uuid def __init__(self, *arg, **kwargs): raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. 
Please use MemoryResource APIs.") @classmethod - def _init(cls, handle: int): + def _init(cls, handle: int, uuid: uuid_module.UUID): cdef IPCAllocationHandle self = IPCAllocationHandle.__new__(cls) assert handle >= 0 self._handle = handle + self._uuid = uuid return self cpdef close(self): @@ -426,6 +418,7 @@ cdef class IPCAllocationHandle: os.close(self._handle) finally: self._handle = -1 + self._uuid = None def __del__(self): """Close the handle.""" @@ -434,12 +427,11 @@ cdef class IPCAllocationHandle: def __reduce__(self): multiprocessing.context.assert_spawning(self) df = multiprocessing.reduction.DupFd(self.handle) - return IPCAllocationHandle._reconstruct, (df,) + return self._reconstruct, (df, self._uuid) - @staticmethod - def _reconstruct(df): - self = IPCAllocationHandle._init(df.detach()) - return self + @classmethod + def _reconstruct(cls, df, uuid): + return cls._init(df.detach(), uuid) def __int__(self) -> int: if self._handle < 0: @@ -449,14 +441,19 @@ cdef class IPCAllocationHandle: return self._handle def detach(self): - handle = self._handle - self._handle = -1 - return handle + handle = self._handle + self._handle = -1 + self._uuid = None + return handle @property def handle(self) -> int: return self._handle + @property + def uuid(self) -> uuid_module.UUID: + return self._uuid + @dataclass cdef class DeviceMemoryResourceOptions: @@ -663,22 +660,16 @@ class DeviceMemoryResource(MemoryResource): from ._device import Device device = Device(self.device_id) alloc_handle = self.get_allocation_handle() - return DeviceMemoryResource._reconstruct, (device, alloc_handle, self.uuid) + return DeviceMemoryResource.from_allocation_handle, (device, alloc_handle) else: return DeviceMemoryResource.from_registry, (self.uuid,) - @staticmethod - def _reconstruct(device, alloc_handle, uuid): - self = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - self.register(uuid) - return self - @staticmethod def from_registry(uuid: uuid_module.UUID): try: 
return _ipc_registry[uuid] except KeyError: - raise RuntimeError(f"Memory resource with {uuid=} was not found") + raise RuntimeError(f"Memory resource {uuid} was not found") def register(self, uuid: uuid_module.UUID): if uuid not in _ipc_registry: @@ -725,6 +716,9 @@ class DeviceMemoryResource(MemoryResource): err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) + uuid = getattr(alloc_handle, 'uuid', None) + if uuid is not None: + self.register(uuid) return self def get_allocation_handle(self) -> IPCAllocationHandle: @@ -744,9 +738,13 @@ class DeviceMemoryResource(MemoryResource): raise RuntimeError("Imported memory resource cannot be exported") err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) - self._alloc_handle = IPCAllocationHandle._init(alloc_handle) - assert self._uuid is None - self._uuid = uuid_module.uuid4() + try: + assert self._uuid is None + self._uuid = uuid_module.uuid4() + self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) + except: + os.close(alloc_handle) + raise return self._alloc_handle def allocate(self, size_t size, stream: Stream = None) -> Buffer: diff --git a/cuda_core/tests/memory_ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py index 39f787eb0..ea8b7a347 100644 --- a/cuda_core/tests/memory_ipc/conftest.py +++ b/cuda_core/tests/memory_ipc/conftest.py @@ -8,7 +8,7 @@ POOL_SIZE = 2097152 -@pytest.fixture(scope="function") +@pytest.fixture def device(): """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" # Check if IPC is supported on this platform/device @@ -26,7 +26,7 @@ def device(): return device -@pytest.fixture(scope="function") +@pytest.fixture def ipc_memory_resource(device): mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) assert mr.is_ipc_enabled diff --git 
a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index a6003d19b..b151f0edf 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -2,8 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import multiprocessing +import re -from cuda.core.experimental import Buffer, DeviceMemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource from cuda.core.experimental._utils.cuda_utils import CUDAError CHILD_TIMEOUT_SEC = 4 @@ -114,3 +115,23 @@ def CHILD_ACTION(self, queue): def ASSERT(self, exc_type, exc_msg): assert exc_type is TypeError assert exc_msg.startswith("Argument 'ipc_buffer' has incorrect type") + + +class TestDanglingBuffer(ChildErrorHarness): + """ + Error when importing a buffer object without registering its memory + resource. + """ + + def PARENT_ACTION(self, queue): + mr2 = DeviceMemoryResource(self.device, dict(max_size=POOL_SIZE, ipc_enabled=True)) + self.buffer = mr2.allocate(NBYTES) + queue.put(self.buffer) # Note: mr2 not sent + + def CHILD_ACTION(self, queue): + Device().set_current() + queue.get(timeout=CHILD_TIMEOUT_SEC) + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is RuntimeError + assert re.match(r"Memory resource [a-z0-9-]+ was not found", exc_msg) diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index 77e4ef2c5..c7a9b0b53 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import contextlib import gc import multiprocessing as mp @@ -59,14 +63,13 @@ def exec_reduce_failure(obj, number=1): """ for _ in range(number): fails_to_reduce = Irreducible() - try: + with contextlib.suppress(RuntimeError): mp.Process(target=child_main, args=(obj, fails_to_reduce)).start() - except RuntimeError: - pass class Irreducible: """A class that cannot be serialized.""" + def __reduce__(self): raise RuntimeError("Irreducible") @@ -82,9 +85,7 @@ def __reduce__(self): ], ids=["alloc_handle", "mr", "buffer", "buffer_desc"], ) -@pytest.mark.parametrize( - "launcher", [exec_with_object, exec_launch_failure, exec_reduce_failure] -) +@pytest.mark.parametrize("launcher", [exec_with_object, exec_launch_failure, exec_reduce_failure]) def test_pass_object(ipc_memory_resource, launcher, getobject): """Check for fd leaks when an object is sent as a subprocess argument.""" mr = ipc_memory_resource @@ -106,7 +107,7 @@ def __init__(self): self.process = psutil.Process() def __enter__(self): - self.prime() + prime() gc.collect() self.initial_fds = self.process.num_fds() return self @@ -118,12 +119,17 @@ def __exit__(self, exc_type, exc_val, exc_tb): assert final_fds == self.initial_fds return False - def prime(self, latch=[]): - """Multiprocessing consumes a file descriptor on first launch.""" - assert mp.get_start_method() == "spawn" - if not latch: - process = mp.Process() - process.start() - process.join() - assert process.exitcode == 0 - latch.append(None) + +prime_was_run = False + + +def prime(): + """Multiprocessing consumes a file descriptor on first launch.""" + assert mp.get_start_method() == "spawn" + global prime_was_run + if not prime_was_run: + process = mp.Process() + process.start() + process.join() + assert process.exitcode == 0 + prime_was_run = True diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index c0be05188..c0ff64aa1 100644 --- 
a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -39,7 +39,6 @@ def test_main(self, device, ipc_memory_resource): # Verify that the buffer was modified. helper.verify_buffer(flipped=True) - def child_main(self, mr, queue): device = Device() buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) @@ -79,7 +78,6 @@ def test_main(self, device, ipc_memory_resource): IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) - def child_main(self, mr, idx, queue): # Note: passing the mr registers it so that buffers can be passed # directly. @@ -125,7 +123,6 @@ def test_main(self, device, ipc_memory_resource): IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - def child_main(self, alloc_handle, idx, queue): """Fills a shared memory buffer.""" # In this case, the device needs to be set up (passing the mr does it @@ -149,8 +146,8 @@ def test_main(self, device, ipc_memory_resource): # Start children. q1, q2 = (mp.Queue() for _ in range(2)) - p1 = mp.Process(target=self.child_main, args=(alloc_handle, mr.uuid, 1, q1)) - p2 = mp.Process(target=self.child_main, args=(alloc_handle, mr.uuid, 2, q2)) + p1 = mp.Process(target=self.child_main, args=(alloc_handle, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(alloc_handle, 2, q2)) p1.start() p2.start() @@ -170,13 +167,14 @@ def test_main(self, device, ipc_memory_resource): IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - - def child_main(self, alloc_handle, uuid, idx, queue): + def child_main(self, alloc_handle, idx, queue): """Fills a shared memory buffer.""" device = Device() device.set_current() - mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - mr.register(uuid) + + # Register the memory resource. 
+ DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + + # Now get buffers. buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) - diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index cd17bf366..62674767c 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -77,7 +77,8 @@ def test_main(self, device, ipc_memory_resource): """Test sending IPC memory objects to a child through a queue.""" mr = ipc_memory_resource - # Start the child process. + # Start the child process. Sending the memory resource registers it so + # that buffers can be handled automatically. pipe = [mp.Queue() for _ in range(2)] process = mp.Process(target=self.child_main, args=(pipe, mr)) process.start() From ed0b35654e877f50a57b0f95e752b718c74e8354 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 26 Sep 2025 11:04:08 -0700 Subject: [PATCH 13/25] Minor changes to address feedback. 
--- cuda_core/cuda/core/experimental/__init__.py | 1 + cuda_core/cuda/core/experimental/_memory.pyx | 14 +++++++------- cuda_core/tests/memory_ipc/conftest.py | 1 - cuda_core/tests/memory_ipc/test_leaks.py | 1 - cuda_core/tests/memory_ipc/test_memory_ipc.py | 3 +-- cuda_core/tests/memory_ipc/test_send_buffers.py | 3 +-- cuda_core/tests/memory_ipc/test_serialize.py | 3 +-- cuda_core/tests/memory_ipc/test_workerpool.py | 3 +-- cuda_core/tests/memory_ipc/utility.py | 7 +------ 9 files changed, 13 insertions(+), 23 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 9a86459d2..a01134373 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -17,6 +17,7 @@ from cuda.core.experimental._memory import ( Buffer, DeviceMemoryResource, + DeviceMemoryResourceOptions, LegacyPinnedMemoryResource, MemoryResource, ) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 3e75967cc..c30ca6784 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -14,6 +14,7 @@ from dataclasses import dataclass from typing import Optional, TypeVar, Union, TYPE_CHECKING import abc import array +import contextlib import cython import multiprocessing import multiprocessing.context @@ -440,12 +441,6 @@ cdef class IPCAllocationHandle: ) return self._handle - def detach(self): - handle = self._handle - self._handle = -1 - self._uuid = None - return handle - @property def handle(self) -> int: return self._handle @@ -642,6 +637,7 @@ class DeviceMemoryResource(MemoryResource): err, = driver.cuMemPoolDestroy(self._mempool_handle) raise_if_driver_error(err) finally: + self.unregister() self._dev_id = None self._mempool_handle = None self._attributes = None @@ -669,7 +665,7 @@ class DeviceMemoryResource(MemoryResource): try: return _ipc_registry[uuid] except KeyError: - 
raise RuntimeError(f"Memory resource {uuid} was not found") + raise RuntimeError(f"Memory resource {uuid} was not found") from None def register(self, uuid: uuid_module.UUID): if uuid not in _ipc_registry: @@ -677,6 +673,10 @@ class DeviceMemoryResource(MemoryResource): _ipc_registry[uuid] = self self._uuid = uuid + def unregister(self): + with contextlib.suppress(KeyError): + del _ipc_registry[self.uuid] + @property def uuid(self): return self._uuid diff --git a/cuda_core/tests/memory_ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py index ea8b7a347..2c3c881e3 100644 --- a/cuda_core/tests/memory_ipc/conftest.py +++ b/cuda_core/tests/memory_ipc/conftest.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import pytest - from cuda.core.experimental import Device, DeviceMemoryResource POOL_SIZE = 2097152 diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index c7a9b0b53..b5607097e 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -7,7 +7,6 @@ import psutil import pytest - from cuda.core.experimental import _memory from cuda.core.experimental._utils.cuda_utils import driver diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index c0ff64aa1..aa9aacef9 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -3,9 +3,8 @@ import multiprocessing as mp -from utility import IPCBufferTestHelper - from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 4 NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 496f32553..4e2a9600d 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -4,9 +4,8 @@ import multiprocessing from itertools import cycle 
-from utility import IPCBufferTestHelper - from cuda.core.experimental import Device, DeviceMemoryResource +from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 4 NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 62674767c..984bb16e2 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -5,9 +5,8 @@ import multiprocessing.reduction import os -from utility import IPCBufferTestHelper - from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 4 NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index 50fa1d509..aeacd1707 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -4,9 +4,8 @@ import multiprocessing from itertools import cycle -from utility import IPCBufferTestHelper - from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 4 NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/utility.py b/cuda_core/tests/memory_ipc/utility.py index 766188d10..7ce7752b6 100644 --- a/cuda_core/tests/memory_ipc/utility.py +++ b/cuda_core/tests/memory_ipc/utility.py @@ -1,15 +1,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -try: - from cuda.bindings import driver -except ImportError: - from cuda import cuda as driver - import ctypes from cuda.core.experimental import Buffer, MemoryResource -from cuda.core.experimental._utils.cuda_utils import handle_return +from cuda.core.experimental._utils.cuda_utils import driver, handle_return class DummyUnifiedMemoryResource(MemoryResource): From b40a21346ab2e1bc7021339e81a9ca7d6d07cbb2 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 26 Sep 2025 14:41:33 -0700 Subject: [PATCH 14/25] Removes obsolte tests. Moves imports to nested contexts. --- cuda_core/cuda/core/experimental/_device.py | 2 + cuda_core/cuda/core/experimental/_memory.pyx | 21 ++- cuda_core/tests/memory_ipc/test_serialize.py | 10 +- cuda_core/tests/test_ipc_mempool.py | 178 ------------------- 4 files changed, 14 insertions(+), 197 deletions(-) delete mode 100644 cuda_core/tests/test_ipc_mempool.py diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index d94e44b5b..d5814c13a 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -1161,6 +1161,8 @@ def __repr__(self): return f"" def __reduce__(self): + import multiprocessing + multiprocessing.context.assert_spawning(self) return Device._reconstruct, (self.device_id,) @staticmethod diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index c30ca6784..081900ce7 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -16,13 +16,9 @@ import abc import array import contextlib import cython -import multiprocessing -import multiprocessing.context -import multiprocessing.reduction import os import platform import sys -import uuid as uuid_module import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, 
default_stream @@ -32,8 +28,9 @@ if platform.system() == "Linux": import socket if TYPE_CHECKING: - import cuda.bindings.driver from ._device import Device + import cuda.bindings.driver + import uuid # TODO: define a memory property mixin class and make Buffer and # MemoryResource both inherit from it @@ -405,7 +402,7 @@ cdef class IPCAllocationHandle: raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. Please use MemoryResource APIs.") @classmethod - def _init(cls, handle: int, uuid: uuid_module.UUID): + def _init(cls, handle: int, uuid: uuid.UUID): cdef IPCAllocationHandle self = IPCAllocationHandle.__new__(cls) assert handle >= 0 self._handle = handle @@ -426,6 +423,7 @@ cdef class IPCAllocationHandle: self.close() def __reduce__(self): + import multiprocessing multiprocessing.context.assert_spawning(self) df = multiprocessing.reduction.DupFd(self.handle) return self._reconstruct, (df, self._uuid) @@ -446,7 +444,7 @@ cdef class IPCAllocationHandle: return self._handle @property - def uuid(self) -> uuid_module.UUID: + def uuid(self) -> uuid.UUID: return self._uuid @@ -527,7 +525,6 @@ class DeviceMemoryResourceAttributes: del mempool_property - # Holds DeviceMemoryResource objects imported by this process. # This enables buffer serialization, as buffers can reduce to a pair # of comprising the memory resource UUID (the key into this registry) @@ -651,6 +648,7 @@ class DeviceMemoryResource(MemoryResource): def __reduce__(self): # If spawning a new process, serialize the resources; otherwise, just # send the UUID, using the registry on the receiving end. 
+ import multiprocessing is_spawning = multiprocessing.context.get_spawning_popen() is not None if is_spawning: from ._device import Device @@ -661,13 +659,13 @@ class DeviceMemoryResource(MemoryResource): return DeviceMemoryResource.from_registry, (self.uuid,) @staticmethod - def from_registry(uuid: uuid_module.UUID): + def from_registry(uuid: uuid.UUID): try: return _ipc_registry[uuid] except KeyError: raise RuntimeError(f"Memory resource {uuid} was not found") from None - def register(self, uuid: uuid_module.UUID): + def register(self, uuid: uuid.UUID): if uuid not in _ipc_registry: assert self._uuid is None or self._uuid == uuid _ipc_registry[uuid] = self @@ -740,7 +738,8 @@ class DeviceMemoryResource(MemoryResource): raise_if_driver_error(err) try: assert self._uuid is None - self._uuid = uuid_module.uuid4() + import uuid as uuid + self._uuid = uuid.uuid4() self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) except: os.close(alloc_handle) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 984bb16e2..61e99091b 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -82,11 +82,6 @@ def test_main(self, device, ipc_memory_resource): process = mp.Process(target=self.child_main, args=(pipe, mr)) process.start() - # Send a device description. - pipe[0].put(device) - device_id = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) - assert device_id == device.device_id - # Send a memory resource directly. This relies on the mr already # being passed when spawning the child. pipe[0].put(mr) @@ -105,9 +100,8 @@ def test_main(self, device, ipc_memory_resource): IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) def child_main(self, pipe, _): - # Device. - device = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) - pipe[1].put(device.device_id) + device = Device() + device.set_current() # Memory resource. 
mr = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) diff --git a/cuda_core/tests/test_ipc_mempool.py b/cuda_core/tests/test_ipc_mempool.py deleted file mode 100644 index de436fd48..000000000 --- a/cuda_core/tests/test_ipc_mempool.py +++ /dev/null @@ -1,178 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -try: - from cuda.bindings import driver -except ImportError: - from cuda import cuda as driver - -import ctypes -import multiprocessing - -import pytest -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel, MemoryResource -from cuda.core.experimental._utils.cuda_utils import handle_return - -CHILD_TIMEOUT_SEC = 10 -NBYTES = 64 -POOL_SIZE = 2097152 - - -@pytest.fixture(scope="function") -def ipc_device(): - """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" - # Check if IPC is supported on this platform/device - device = Device() - device.set_current() - - if not device.properties.memory_pools_supported: - pytest.skip("Device does not support mempool operations") - - # Note: Linux specific. Once Windows support for IPC is implemented, this - # test should be updated. - if not device.properties.handle_type_posix_file_descriptor_supported: - pytest.skip("Device does not support IPC") - - return device - - -def test_ipc_mempool(ipc_device): - """Test IPC with memory pools.""" - # Set up the IPC-enabled memory pool and share it. - stream = ipc_device.create_stream() - mr = DeviceMemoryResource(ipc_device, dict(max_size=POOL_SIZE, ipc_enabled=True)) - assert mr.is_ipc_enabled - channel = IPCChannel() - mr.share_to_channel(channel) - - # Start the child process. - queue = multiprocessing.Queue() - process = multiprocessing.Process(target=child_main1, args=(channel, queue)) - process.start() - - # Allocate and fill memory. 
- buffer = mr.allocate(NBYTES, stream=stream) - protocol = IPCBufferTestProtocol(ipc_device, buffer, stream=stream) - protocol.fill_buffer(flipped=False) - stream.sync() - - # Export the buffer via IPC. - handle = buffer.export() - queue.put(handle) - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - # Verify that the buffer was modified. - protocol.verify_buffer(flipped=True) - - -def child_main1(channel, queue): - device = Device() - device.set_current() - stream = device.create_stream() - - mr = DeviceMemoryResource.from_shared_channel(device, channel) - handle = queue.get() # Get exported buffer data - buffer = Buffer.import_(mr, handle) - - protocol = IPCBufferTestProtocol(device, buffer, stream=stream) - protocol.verify_buffer(flipped=False) - protocol.fill_buffer(flipped=True) - stream.sync() - - -def test_shared_pool_errors(ipc_device): - """Test expected errors with allocating from a shared IPC memory pool.""" - # Set up the IPC-enabled memory pool and share it. - mr = DeviceMemoryResource(ipc_device, dict(max_size=POOL_SIZE, ipc_enabled=True)) - channel = IPCChannel() - mr.share_to_channel(channel) - - # Start a child process to generate error info. - queue = multiprocessing.Queue() - process = multiprocessing.Process(target=child_main2, args=(channel, queue)) - process.start() - - # Check the errors. - exc_type, exc_msg = queue.get(timeout=CHILD_TIMEOUT_SEC) - assert exc_type is TypeError - assert exc_msg == "Cannot allocate from shared memory pool imported via IPC" - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - -def child_main2(channel, queue): - """Child process that pushes IPC errors to a shared queue for testing.""" - device = Device() - device.set_current() - - mr = DeviceMemoryResource.from_shared_channel(device, channel) - - # Allocating from an imported pool. 
- try: - mr.allocate(NBYTES) - except Exception as e: - exc_info = type(e), str(e) - queue.put(exc_info) - - -class DummyUnifiedMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) - return Buffer.from_handle(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFree(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return True - - @property - def device_id(self) -> int: - return self.device - - -class IPCBufferTestProtocol: - """The protocol for verifying IPC. - - Provides methods to fill a buffer with one of two test patterns and verify - the expected values. - """ - - def __init__(self, device, buffer, nbytes=NBYTES, stream=None): - self.device = device - self.buffer = buffer - self.nbytes = nbytes - self.stream = stream if stream is not None else device.create_stream() - self.scratch_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.nbytes, stream=self.stream) - - def fill_buffer(self, flipped=False): - """Fill a device buffer with test pattern using unified memory.""" - ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) - op = (lambda i: 255 - i) if flipped else (lambda i: i) - for i in range(self.nbytes): - ptr[i] = ctypes.c_byte(op(i)) - self.buffer.copy_from(self.scratch_buffer, stream=self.stream) - - def verify_buffer(self, flipped=False): - """Verify the buffer contents.""" - self.scratch_buffer.copy_from(self.buffer, stream=self.stream) - self.stream.sync() - ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) - op = (lambda i: 255 - i) if flipped else (lambda i: i) - for i in range(self.nbytes): - assert ctypes.c_byte(ptr[i]).value == 
ctypes.c_byte(op(i)).value, ( - f"Buffer contains incorrect data at index {i}" - ) From 4fb3d47b490457ad0adcef0b6ad2caa3ddfbef68 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 26 Sep 2025 15:28:25 -0700 Subject: [PATCH 15/25] Removes pickling for Device objects. Registers the pickle method with multiprocessing instead. --- cuda_core/cuda/core/experimental/_device.py | 26 +++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index d5814c13a..6abaee9f3 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import multiprocessing import threading from typing import Optional, Union @@ -1160,18 +1161,6 @@ def __int__(self): def __repr__(self): return f"" - def __reduce__(self): - import multiprocessing - multiprocessing.context.assert_spawning(self) - return Device._reconstruct, (self.device_id,) - - @staticmethod - def _reconstruct(device_id): - device = Device(device_id) - if not device._has_inited: - device.set_current() - return device - def set_current(self, ctx: Context = None) -> Union[Context, None]: """Set device to be used for GPU executions. @@ -1346,3 +1335,16 @@ def create_graph_builder(self) -> GraphBuilder: """ self._check_context_initialized() return GraphBuilder._init(stream=self.create_stream(), is_stream_owner=True) + + +def _reconstruct_device(device_id): + device = Device(device_id) + if not device._has_inited: + device.set_current() + return device + +def _reduce_device(device): + return _reconstruct_device, (device.device_id,) + +multiprocessing.reduction.register(Device, _reduce_device) + From c9f8c911c5e4a467a35531e24a08804a4471bab3 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 29 Sep 2025 12:14:31 -0700 Subject: [PATCH 16/25] Updates register function to return registered object. 
Avoids possible early deregistration. --- cuda_core/cuda/core/experimental/_memory.pyx | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 081900ce7..19f8eb4be 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -666,10 +666,13 @@ class DeviceMemoryResource(MemoryResource): raise RuntimeError(f"Memory resource {uuid} was not found") from None def register(self, uuid: uuid.UUID): - if uuid not in _ipc_registry: - assert self._uuid is None or self._uuid == uuid - _ipc_registry[uuid] = self - self._uuid = uuid + existing = _ipc_registry.get(uuid) + if existing is not None: + return existing + assert self._uuid is None or self._uuid == uuid + _ipc_registry[uuid] = self + self._uuid = uuid + return self def unregister(self): with contextlib.suppress(KeyError): @@ -716,7 +719,7 @@ class DeviceMemoryResource(MemoryResource): raise_if_driver_error(err) uuid = getattr(alloc_handle, 'uuid', None) if uuid is not None: - self.register(uuid) + self = self.register(uuid) return self def get_allocation_handle(self) -> IPCAllocationHandle: From 5dda1964a9905f4346fbe5b1ffb19d8361dd5870 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 29 Sep 2025 15:28:41 -0700 Subject: [PATCH 17/25] Renames Buffer import_/export methods. 
--- cuda_core/cuda/core/experimental/_device.py | 3 ++- cuda_core/cuda/core/experimental/_memory.pyx | 14 +++++++------- cuda_core/tests/memory_ipc/test_errors.py | 6 +++--- cuda_core/tests/memory_ipc/test_leaks.py | 2 +- cuda_core/tests/memory_ipc/test_memory_ipc.py | 6 +++--- cuda_core/tests/memory_ipc/test_serialize.py | 10 +++++----- cuda_core/tests/memory_ipc/test_workerpool.py | 7 ++++--- cuda_core/tests/test_memory.py | 4 ++-- 8 files changed, 27 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 6abaee9f3..1ae659e8d 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -1343,8 +1343,9 @@ def _reconstruct_device(device_id): device.set_current() return device + def _reduce_device(device): return _reconstruct_device, (device.device_id,) -multiprocessing.reduction.register(Device, _reduce_device) +multiprocessing.reduction.register(Device, _reduce_device) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 19f8eb4be..5ce6cd79d 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -29,7 +29,6 @@ if platform.system() == "Linux": if TYPE_CHECKING: from ._device import Device - import cuda.bindings.driver import uuid # TODO: define a memory property mixin class and make Buffer and @@ -75,7 +74,7 @@ cdef class Buffer: self.close() def __reduce__(self): - return Buffer.import_, (self.memory_resource, self.export()) + return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor()) cpdef close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. 
@@ -137,7 +136,7 @@ cdef class Buffer: return self._mr.device_id raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - def export(self) -> IPCBufferDescriptor: + def get_ipc_descriptor(self) -> IPCBufferDescriptor: """Export a buffer allocated for sharing between processes.""" if not self._mr.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") @@ -146,7 +145,7 @@ cdef class Buffer: return IPCBufferDescriptor._init(ptr.reserved, self.size) @classmethod - def import_(cls, mr: MemoryResource, ipc_buffer: IPCBufferDescriptor) -> Buffer: + def from_ipc_descriptor(cls, mr: MemoryResource, ipc_buffer: IPCBufferDescriptor) -> Buffer: """Import a buffer that was exported from another process.""" if not mr.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") @@ -621,7 +620,7 @@ class DeviceMemoryResource(MemoryResource): raise_if_driver_error(err) if opts.ipc_enabled: - self.get_allocation_handle() # enables Buffer.export, sets uuid + self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid def __del__(self): self.close() @@ -675,8 +674,9 @@ class DeviceMemoryResource(MemoryResource): return self def unregister(self): - with contextlib.suppress(KeyError): - del _ipc_registry[self.uuid] + if _ipc_registry is not None: + with contextlib.suppress(KeyError): + del _ipc_registry[self.uuid] @property def uuid(self): diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index b151f0edf..c2654b7f7 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -73,11 +73,11 @@ class TestImportWrongMR(ChildErrorHarness): def PARENT_ACTION(self, queue): mr2 = DeviceMemoryResource(self.device, dict(max_size=POOL_SIZE, ipc_enabled=True)) buffer = mr2.allocate(NBYTES) - queue.put([self.mr, buffer.export()]) # Note: mr does not own this buffer + queue.put([self.mr, 
buffer.get_ipc_descriptor()]) # Note: mr does not own this buffer def CHILD_ACTION(self, queue): mr, buffer_desc = queue.get(timeout=CHILD_TIMEOUT_SEC) - Buffer.import_(mr, buffer_desc) + Buffer.from_ipc_descriptor(mr, buffer_desc) def ASSERT(self, exc_type, exc_msg): assert exc_type is CUDAError @@ -110,7 +110,7 @@ def PARENT_ACTION(self, queue): def CHILD_ACTION(self, queue): buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) - Buffer.import_(self.mr, buffer) + Buffer.from_ipc_descriptor(self.mr, buffer) def ASSERT(self, exc_type, exc_msg): assert exc_type is TypeError diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index b5607097e..a9d09a672 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -80,7 +80,7 @@ def __reduce__(self): lambda mr: mr.get_allocation_handle(), lambda mr: mr, lambda mr: mr.allocate(NBYTES), - lambda mr: mr.allocate(NBYTES).export(), + lambda mr: mr.allocate(NBYTES).get_ipc_descriptor(), ], ids=["alloc_handle", "mr", "buffer", "buffer_desc"], ) diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index aa9aacef9..c5e10805d 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -109,8 +109,8 @@ def test_main(self, device, ipc_memory_resource): # Allocate and share memory. buf1 = mr.allocate(NBYTES) buf2 = mr.allocate(NBYTES) - q1.put(buf1.export()) - q2.put(buf2.export()) + q1.put(buf1.get_ipc_descriptor()) + q2.put(buf2.get_ipc_descriptor()) # Wait for children. 
p1.join(timeout=CHILD_TIMEOUT_SEC) @@ -130,7 +130,7 @@ def child_main(self, alloc_handle, idx, queue): device.set_current() mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) buffer_descriptor = queue.get(timeout=CHILD_TIMEOUT_SEC) - buffer = Buffer.import_(mr, buffer_descriptor) + buffer = Buffer.from_ipc_descriptor(mr, buffer_descriptor) IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 61e99091b..94338a55a 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -39,7 +39,7 @@ def test_main(self, device, ipc_memory_resource): parent_conn.send(buffer1) # directly buffer2 = mr.allocate(NBYTES) - parent_conn.send(buffer2.export()) # by descriptor + parent_conn.send(buffer2.get_ipc_descriptor()) # by descriptor # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) @@ -64,7 +64,7 @@ def child_main(self, conn): # Receive the buffers. buffer1 = conn.recv() # directly buffer_desc = conn.recv() - buffer2 = Buffer.import_(mr, buffer_desc) # by descriptor + buffer2 = Buffer.from_ipc_descriptor(mr, buffer_desc) # by descriptor # Modify the buffers. IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=True) @@ -126,7 +126,7 @@ def test_object_passing(device, ipc_memory_resource): mr = ipc_memory_resource alloc_handle = mr.get_allocation_handle() buffer = mr.allocate(NBYTES) - buffer_desc = buffer.export() + buffer_desc = buffer.get_ipc_descriptor() helper = IPCBufferTestHelper(device, buffer) helper.fill_buffer(flipped=False) @@ -145,8 +145,8 @@ def child_main(device, alloc_handle, mr1, buffer_desc, buffer1): # OK to build the buffer from either mr and the descriptor. # All buffer* objects point to the same memory. 
- buffer2 = Buffer.import_(mr1, buffer_desc) - buffer3 = Buffer.import_(mr2, buffer_desc) + buffer2 = Buffer.from_ipc_descriptor(mr1, buffer_desc) + buffer3 = Buffer.from_ipc_descriptor(mr2, buffer_desc) helper1 = IPCBufferTestHelper(device, buffer1) helper2 = IPCBufferTestHelper(device, buffer2) diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index aeacd1707..6c9d9f2d8 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -36,7 +36,7 @@ def test_ipc_workerpool(self, device, ipc_memory_resource): mr = ipc_memory_resource buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: - pool.starmap(self.process_buffer, [(0, buffer.export()) for buffer in buffers]) + pool.starmap(self.process_buffer, [(0, buffer.get_ipc_descriptor()) for buffer in buffers]) for buffer in buffers: IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) @@ -49,7 +49,8 @@ def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: pool.starmap( - self.process_buffer, [(mrs.index(buffer.memory_resource), buffer.export()) for buffer in buffers] + self.process_buffer, + [(mrs.index(buffer.memory_resource), buffer.get_ipc_descriptor()) for buffer in buffers], ) for buffer in buffers: @@ -57,7 +58,7 @@ def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): def process_buffer(self, mr_idx, buffer_desc): device = Device() - buffer = Buffer.import_(g_mrs[mr_idx], buffer_desc) + buffer = Buffer.from_ipc_descriptor(g_mrs[mr_idx], buffer_desc) IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py 
index b23cd6d4b..f0b305f55 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -350,11 +350,11 @@ def test_mempool(mempool_device): mr.get_allocation_handle() with pytest.raises(RuntimeError, match=ipc_error_msg): - buffer.export() + buffer.get_ipc_descriptor() with pytest.raises(RuntimeError, match=ipc_error_msg): handle = IPCBufferDescriptor._init(b"", 0) - Buffer.import_(mr, handle) + Buffer.from_ipc_descriptor(mr, handle) buffer.close() From e54cb5b64b8824bd33569556863ee011b9ec19f8 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 30 Sep 2025 09:33:07 -0700 Subject: [PATCH 18/25] Moves AllocationHandle serialization to a registration with multiprocessing, since it depends on DupFd. --- cuda_core/cuda/core/experimental/_device.py | 8 +++---- cuda_core/cuda/core/experimental/_memory.pyx | 22 +++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 1ae659e8d..91ae7829c 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -1337,6 +1337,10 @@ def create_graph_builder(self) -> GraphBuilder: return GraphBuilder._init(stream=self.create_stream(), is_stream_owner=True) +def _reduce_device(device): + return _reconstruct_device, (device.device_id,) + + def _reconstruct_device(device_id): device = Device(device_id) if not device._has_inited: @@ -1344,8 +1348,4 @@ def _reconstruct_device(device_id): return device -def _reduce_device(device): - return _reconstruct_device, (device.device_id,) - - multiprocessing.reduction.register(Device, _reduce_device) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 5ce6cd79d..af4970704 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -16,6 +16,7 @@ import abc import array import contextlib import cython 
+import multiprocessing import os import platform import sys @@ -421,16 +422,6 @@ cdef class IPCAllocationHandle: """Close the handle.""" self.close() - def __reduce__(self): - import multiprocessing - multiprocessing.context.assert_spawning(self) - df = multiprocessing.reduction.DupFd(self.handle) - return self._reconstruct, (df, self._uuid) - - @classmethod - def _reconstruct(cls, df, uuid): - return cls._init(df.detach(), uuid) - def __int__(self) -> int: if self._handle < 0: raise ValueError( @@ -447,6 +438,17 @@ cdef class IPCAllocationHandle: return self._uuid +def _reduce_allocation_handle(alloc_handle): + df = multiprocessing.reduction.DupFd(alloc_handle.handle) + return _reconstruct_allocation_handle, (type(alloc_handle), df, alloc_handle.uuid) + +def _reconstruct_allocation_handle(cls, df, uuid): + return cls._init(df.detach(), uuid) + + +multiprocessing.reduction.register(IPCAllocationHandle, _reduce_allocation_handle) + + @dataclass cdef class DeviceMemoryResourceOptions: """Customizable :obj:`~_memory.DeviceMemoryResource` options. From 948af33273fbaf6fe71435c2ea3ccc317d96755e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 30 Sep 2025 09:46:00 -0700 Subject: [PATCH 19/25] Use DeviceMemoryResourceOptions throughout tests. 
--- cuda_core/tests/memory_ipc/conftest.py | 5 +++-- cuda_core/tests/memory_ipc/test_errors.py | 8 +++++--- cuda_core/tests/memory_ipc/test_send_buffers.py | 7 +++---- cuda_core/tests/memory_ipc/test_workerpool.py | 12 +++++------- cuda_core/tests/test_memory.py | 8 +++++--- 5 files changed, 21 insertions(+), 19 deletions(-) diff --git a/cuda_core/tests/memory_ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py index 2c3c881e3..0d4ada510 100644 --- a/cuda_core/tests/memory_ipc/conftest.py +++ b/cuda_core/tests/memory_ipc/conftest.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import pytest -from cuda.core.experimental import Device, DeviceMemoryResource +from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions POOL_SIZE = 2097152 @@ -27,6 +27,7 @@ def device(): @pytest.fixture def ipc_memory_resource(device): - mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mr = DeviceMemoryResource(device, options=options) assert mr.is_ipc_enabled return mr diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index c2654b7f7..d8e2af177 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -4,7 +4,7 @@ import multiprocessing import re -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from cuda.core.experimental._utils.cuda_utils import CUDAError CHILD_TIMEOUT_SEC = 4 @@ -71,7 +71,8 @@ class TestImportWrongMR(ChildErrorHarness): """Error when importing a buffer from the wrong memory resource.""" def PARENT_ACTION(self, queue): - mr2 = DeviceMemoryResource(self.device, dict(max_size=POOL_SIZE, ipc_enabled=True)) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mr2 = 
DeviceMemoryResource(self.device, options=options) buffer = mr2.allocate(NBYTES) queue.put([self.mr, buffer.get_ipc_descriptor()]) # Note: mr does not own this buffer @@ -124,7 +125,8 @@ class TestDanglingBuffer(ChildErrorHarness): """ def PARENT_ACTION(self, queue): - mr2 = DeviceMemoryResource(self.device, dict(max_size=POOL_SIZE, ipc_enabled=True)) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mr2 = DeviceMemoryResource(self.device, options=options) self.buffer = mr2.allocate(NBYTES) queue.put(self.buffer) # Note: mr2 not sent diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 4e2a9600d..f2fc5d070 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -4,7 +4,7 @@ import multiprocessing from itertools import cycle -from cuda.core.experimental import Device, DeviceMemoryResource +from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 4 @@ -41,9 +41,8 @@ def test_ipc_send_buffers(device, ipc_memory_resource): def test_ipc_send_buffers_multi(device, ipc_memory_resource): """Test passing buffers sourced from multiple memory resources.""" # Set up several IPC-enabled memory pools. - mrs = [ipc_memory_resource] + [ - DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) - ] + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] # Allocate and fill memory. 
buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index 6c9d9f2d8..f79a3ce32 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -4,7 +4,7 @@ import multiprocessing from itertools import cycle -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 4 @@ -43,9 +43,8 @@ def test_ipc_workerpool(self, device, ipc_memory_resource): def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): """Test IPC with a worker pool using multiple memory resources.""" - mrs = [ipc_memory_resource] + [ - DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) - ] + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: pool.starmap( @@ -88,9 +87,8 @@ def test_ipc_workerpool(self, device, ipc_memory_resource): def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): """Test IPC with a worker pool using multiple memory resources.""" - mrs = [ipc_memory_resource] + [ - DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) - ] + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] with 
multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: pool.map(self.process_buffer, buffers) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index f0b305f55..922db5d5b 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -10,7 +10,7 @@ import platform import pytest -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions, MemoryResource from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor from cuda.core.experimental._utils.cuda_utils import handle_return @@ -304,7 +304,8 @@ def test_mempool(mempool_device): device = mempool_device # Test basic pool creation - mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=False)) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False) + mr = DeviceMemoryResource(device, options=options) assert mr.device_id == device.device_id assert mr.is_device_accessible assert not mr.is_host_accessible @@ -379,7 +380,8 @@ def test_mempool_attributes(ipc_enabled, mempool_device, property_name, expected if platform.system() == "Windows": return # IPC not implemented for Windows - mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=ipc_enabled)) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=ipc_enabled) + mr = DeviceMemoryResource(device, options=options) assert mr.is_ipc_enabled == ipc_enabled # Get the property value From b91d98b0aa450a71bebec74739643bfa7bbd07ca Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 30 Sep 2025 11:37:02 -0700 Subject: [PATCH 20/25] Merged tests/memory_ipc/conftest.py into tests/conftest.py because certain configurations could not resolve it consistently. 
--- cuda_core/tests/conftest.py | 29 +++++++++++++++- cuda_core/tests/memory_ipc/conftest.py | 33 ------------------- cuda_core/tests/memory_ipc/test_errors.py | 4 +-- cuda_core/tests/memory_ipc/test_memory_ipc.py | 12 ++++--- .../tests/memory_ipc/test_send_buffers.py | 6 ++-- cuda_core/tests/memory_ipc/test_serialize.py | 9 +++-- cuda_core/tests/memory_ipc/test_workerpool.py | 12 ++++--- 7 files changed, 56 insertions(+), 49 deletions(-) delete mode 100644 cuda_core/tests/memory_ipc/conftest.py diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index c56c0a972..db9761a3c 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -10,7 +10,7 @@ import multiprocessing import pytest -from cuda.core.experimental import Device, _device +from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions, _device from cuda.core.experimental._utils.cuda_utils import handle_return @@ -70,4 +70,31 @@ def pop_all_contexts(): return pop_all_contexts +@pytest.fixture +def ipc_device(): + """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" + # Check if IPC is supported on this platform/device + device = Device() + device.set_current() + + if not device.properties.memory_pools_supported: + pytest.skip("Device does not support mempool operations") + + # Note: Linux specific. Once Windows support for IPC is implemented, this + # test should be updated. 
+ if not device.properties.handle_type_posix_file_descriptor_supported: + pytest.skip("Device does not support IPC") + + return device + + +@pytest.fixture +def ipc_memory_resource(ipc_device): + POOL_SIZE = 2097152 + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mr = DeviceMemoryResource(ipc_device, options=options) + assert mr.is_ipc_enabled + return mr + + skipif_need_cuda_headers = pytest.mark.skipif(helpers.CUDA_INCLUDE_PATH is None, reason="need CUDA header") diff --git a/cuda_core/tests/memory_ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py deleted file mode 100644 index 0d4ada510..000000000 --- a/cuda_core/tests/memory_ipc/conftest.py +++ /dev/null @@ -1,33 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -import pytest -from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions - -POOL_SIZE = 2097152 - - -@pytest.fixture -def device(): - """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" - # Check if IPC is supported on this platform/device - device = Device() - device.set_current() - - if not device.properties.memory_pools_supported: - pytest.skip("Device does not support mempool operations") - - # Note: Linux specific. Once Windows support for IPC is implemented, this - # test should be updated. 
- if not device.properties.handle_type_posix_file_descriptor_supported: - pytest.skip("Device does not support IPC") - - return device - - -@pytest.fixture -def ipc_memory_resource(device): - options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) - mr = DeviceMemoryResource(device, options=options) - assert mr.is_ipc_enabled - return mr diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index d8e2af177..aa103ebf9 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -16,11 +16,11 @@ class ChildErrorHarness: """Test harness for checking errors in child processes. Subclasses override PARENT_ACTION, CHILD_ACTION, and ASSERT (see below for examples).""" - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): """Parent process that checks child errors.""" # Attach fixtures to this object for convenience. These can be accessed # from PARENT_ACTION. - self.device = device + self.device = ipc_device self.mr = ipc_memory_resource # Start a child process to generate error info. diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index c5e10805d..85d8292cd 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -13,9 +13,10 @@ class TestIpcMempool: - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): """Test IPC with memory pools.""" # Set up the IPC-enabled memory pool and share it. + device = ipc_device mr = ipc_memory_resource # Start the child process. 
@@ -47,9 +48,10 @@ def child_main(self, mr, queue): class TestIPCMempoolMultiple: - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): """Test IPC with memory pools using multiple processes.""" # Construct an IPC-enabled memory resource and share it with two children. + device = ipc_device mr = ipc_memory_resource q1, q2 = (mp.Queue() for _ in range(2)) @@ -90,12 +92,13 @@ def child_main(self, mr, idx, queue): class TestIPCSharedAllocationHandleAndBufferDescriptors: - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): """ Demonstrate that a memory pool allocation handle can be reused for IPC with multiple processes. Uses buffer descriptors. """ # Set up the IPC-enabled memory pool and share it using one handle. + device = ipc_device mr = ipc_memory_resource alloc_handle = mr.get_allocation_handle() @@ -135,11 +138,12 @@ def child_main(self, alloc_handle, idx, queue): class TestIPCSharedAllocationHandleAndBufferObjects: - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): """ Demonstrate that a memory pool allocation handle can be reused for IPC with multiple processes. Uses buffer objects (not descriptors). """ + device = ipc_device mr = ipc_memory_resource alloc_handle = mr.get_allocation_handle() diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index f2fc5d070..b6cc631d8 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -14,8 +14,9 @@ POOL_SIZE = 2097152 -def test_ipc_send_buffers(device, ipc_memory_resource): +def test_ipc_send_buffers(ipc_device, ipc_memory_resource): """Test passing buffers directly to a child separately from a memory resource.""" + device = ipc_device mr = ipc_memory_resource # Allocate and fill memory. 
@@ -38,9 +39,10 @@ def test_ipc_send_buffers(device, ipc_memory_resource): helper.verify_buffer(flipped=True) -def test_ipc_send_buffers_multi(device, ipc_memory_resource): +def test_ipc_send_buffers_multi(ipc_device, ipc_memory_resource): """Test passing buffers sourced from multiple memory resources.""" # Set up several IPC-enabled memory pools. + device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 94338a55a..df9a85633 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -21,7 +21,8 @@ class TestObjectSerializationDirect: it on the other end and demonstrate buffer sharing. """ - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): + device = ipc_device mr = ipc_memory_resource # Start the child process. @@ -72,8 +73,9 @@ def child_main(self, conn): class TestObjectSerializationWithMR: - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): """Test sending IPC memory objects to a child through a queue.""" + device = ipc_device mr = ipc_memory_resource # Start the child process. Sending the memory resource registers it so @@ -113,7 +115,7 @@ def child_main(self, pipe, _): IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) -def test_object_passing(device, ipc_memory_resource): +def test_object_passing(ipc_device, ipc_memory_resource): """ Test sending objects as arguments when starting a process. @@ -123,6 +125,7 @@ def test_object_passing(device, ipc_memory_resource): """ # Define the objects. 
+ device = ipc_device mr = ipc_memory_resource alloc_handle = mr.get_allocation_handle() buffer = mr.allocate(NBYTES) diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index f79a3ce32..aeaeaa69d 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -31,8 +31,9 @@ def init_worker(mrs): global g_mrs g_mrs = mrs - def test_ipc_workerpool(self, device, ipc_memory_resource): + def test_ipc_workerpool(self, ipc_device, ipc_memory_resource): """Test IPC with a worker pool.""" + device = ipc_device mr = ipc_memory_resource buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: @@ -41,8 +42,9 @@ def test_ipc_workerpool(self, device, ipc_memory_resource): for buffer in buffers: IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) - def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): + def test_ipc_workerpool_multi_mr(self, ipc_device, ipc_memory_resource): """Test IPC with a worker pool using multiple memory resources.""" + device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] @@ -75,8 +77,9 @@ def init_worker(mrs): global g_mrs g_mrs = mrs - def test_ipc_workerpool(self, device, ipc_memory_resource): + def test_ipc_workerpool(self, ipc_device, ipc_memory_resource): """Test IPC with a worker pool.""" + device = ipc_device mr = ipc_memory_resource buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: @@ -85,8 +88,9 @@ def test_ipc_workerpool(self, device, ipc_memory_resource): for buffer in buffers: 
IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) - def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): + def test_ipc_workerpool_multi_mr(self, ipc_device, ipc_memory_resource): """Test IPC with a worker pool using multiple memory resources.""" + device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] From d28b52fcff5d41e7bf431e798e9ba08202b3d2ce Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 30 Sep 2025 12:13:28 -0700 Subject: [PATCH 21/25] Makes the psutil module an optional dependency for testing. --- cuda_core/tests/memory_ipc/test_leaks.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index a9d09a672..823da36db 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -5,7 +5,13 @@ import gc import multiprocessing as mp -import psutil +try: + import psutil +except ImportError: + HAVE_PSUTIL = False +else: + HAVE_PSUTIL = True + import pytest from cuda.core.experimental import _memory from cuda.core.experimental._utils.cuda_utils import driver @@ -14,10 +20,12 @@ NBYTES = 64 USING_FDS = _memory._IPC_HANDLE_TYPE == driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR -skip_unless_using_fds = pytest.mark.skipif(not USING_FDS, reason="mempool allocation handle is not using fds") +skip_if_unrunnable = pytest.mark.skipif( + not USING_FDS or not HAVE_PSUTIL, reason="mempool allocation handle is not using fds or psutil is unavailable" +) -@skip_unless_using_fds +@skip_if_unrunnable def test_alloc_handle(ipc_memory_resource): """Check for fd leaks in get_allocation_handle.""" mr = ipc_memory_resource @@ -73,7 +81,7 @@ def 
__reduce__(self): raise RuntimeError("Irreducible") -@skip_unless_using_fds +@skip_if_unrunnable @pytest.mark.parametrize( "getobject", [ From e0d0bf449d330db629f3c6325a9416bc7bdb951b Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 1 Oct 2025 15:13:59 -0700 Subject: [PATCH 22/25] Bump the child timeout for IPC tests. --- cuda_core/tests/memory_ipc/test_errors.py | 2 +- cuda_core/tests/memory_ipc/test_leaks.py | 2 +- cuda_core/tests/memory_ipc/test_memory_ipc.py | 2 +- cuda_core/tests/memory_ipc/test_send_buffers.py | 2 +- cuda_core/tests/memory_ipc/test_serialize.py | 2 +- cuda_core/tests/memory_ipc/test_workerpool.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index aa103ebf9..e0d87a0d0 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -7,7 +7,7 @@ from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from cuda.core.experimental._utils.cuda_utils import CUDAError -CHILD_TIMEOUT_SEC = 4 +CHILD_TIMEOUT_SEC = 20 NBYTES = 64 POOL_SIZE = 2097152 diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index 823da36db..387ca7042 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -16,7 +16,7 @@ from cuda.core.experimental import _memory from cuda.core.experimental._utils.cuda_utils import driver -CHILD_TIMEOUT_SEC = 4 +CHILD_TIMEOUT_SEC = 20 NBYTES = 64 USING_FDS = _memory._IPC_HANDLE_TYPE == driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 85d8292cd..da4678afe 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -6,7 +6,7 @@ from cuda.core.experimental import Buffer, 
Device, DeviceMemoryResource from utility import IPCBufferTestHelper -CHILD_TIMEOUT_SEC = 4 +CHILD_TIMEOUT_SEC = 20 NBYTES = 64 NWORKERS = 2 NTASKS = 2 diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index b6cc631d8..3e3f44865 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -7,7 +7,7 @@ from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper -CHILD_TIMEOUT_SEC = 4 +CHILD_TIMEOUT_SEC = 20 NBYTES = 64 NMRS = 3 NTASKS = 7 diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index df9a85633..ba0ac326f 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -8,7 +8,7 @@ from cuda.core.experimental import Buffer, Device, DeviceMemoryResource from utility import IPCBufferTestHelper -CHILD_TIMEOUT_SEC = 4 +CHILD_TIMEOUT_SEC = 20 NBYTES = 64 POOL_SIZE = 2097152 diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index aeaeaa69d..cd2259c4e 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -7,7 +7,7 @@ from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper -CHILD_TIMEOUT_SEC = 4 +CHILD_TIMEOUT_SEC = 20 NBYTES = 64 NWORKERS = 2 NMRS = 3 From fbcf3b311f851f74b3c2b83adf777df02e4c4430 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 2 Oct 2025 14:21:22 -0700 Subject: [PATCH 23/25] Add docstrings. Change is_imported to is_mapped. Register DeviceMemoryResource reduction with multiprocessing. Add a quick exit to from_allocation_handle. Simplify the worker pool tests based on the new reduction method. 
--- cuda_core/cuda/core/experimental/_memory.pyx | 151 ++++++++++++++---- cuda_core/docs/source/api.rst | 1 + cuda_core/docs/source/api_private.rst | 9 +- cuda_core/tests/memory_ipc/test_errors.py | 9 +- cuda_core/tests/memory_ipc/test_workerpool.py | 109 +++++++------ 5 files changed, 192 insertions(+), 87 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 78d738908..024ed3c3f 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -532,6 +532,7 @@ class DeviceMemoryResourceAttributes: del mempool_property + # Holds DeviceMemoryResource objects imported by this process. # This enables buffer serialization, as buffers can reduce to a pair # of comprising the memory resource UUID (the key into this registry) @@ -539,7 +540,8 @@ class DeviceMemoryResourceAttributes: _ipc_registry = {} class DeviceMemoryResource(MemoryResource): - """Create a device memory resource managing a stream-ordered memory pool. + """ + Create a device memory resource managing a stream-ordered memory pool. Parameters ---------- @@ -560,9 +562,63 @@ class DeviceMemoryResource(MemoryResource): When using an existing (current or default) memory pool, the returned device memory resource does not own the pool (`is_handle_owned` is `False`), and closing the resource has no effect. + + IPC-Enabled Memory Resources + ---------------------------- + If ``ipc_enabled=True`` is specified as an initializer option, the memory + resource constructed will be capable of sharing allocations between + processes. Sharing an allocation is a two-step procedure that involves + mapping a memory resource and then mapping buffers owned by that resource. + These steps can be accomplished in several ways. + + An IPC-enabled memory resource (MR) can allocate memory buffers but cannot + receive shared buffers. Mapping an MR to another process creates a "mapped + memory resource" (MMR). 
An MMR cannot allocate memory buffers and can only + receive shared buffers. MRs and MMRs are both of type + :class:`DeviceMemoryResource` and can be distinguished via + :attr:`DeviceMemoryResource.is_mapped`. + + An MR is shared via an allocation handle obtained by calling + :meth:`DeviceMemoryResource.get_allocation_handle`. The allocation handle + has a platform-specific interpretation; however, memory IPC is currently + only supported for Linux, and in that case allocation handles are file + descriptors. After sending an allocation handle to another process, it can + be used to create an MMR by invoking + :meth:`DeviceMemoryResource.from_allocation_handle`. + + Buffers can be shared as serializable descriptors obtained by calling + :meth:`Buffer.get_ipc_descriptor`. In a receiving process, a shared buffer is + created by invoking :meth:`Buffer.from_ipc_descriptor` with an MMR and + buffer descriptor, where the MMR corresponds to the MR that created the + described buffer. + + To help manage the association between memory resources and buffers, a + registry is provided. Every MR has a unique identifier (UUID). MMRs can be + registered by calling :meth:`DeviceMemoryResource.register` with the UUID + of the corresponding MR. Registered MMRs can be looked up via + :meth:`DeviceMemoryResource.from_registry`. When registering MMRs in this + way, the use of buffer descriptors can be avoided. Instead, buffer objects + can themselves be serialized and transferred directly. Serialization embeds + the UUID, which is used to locate the correct MMR during reconstruction. + + IPC-enabled memory resources interoperate with the :mod:`multiprocessing` + module to provide a simplified interface. This approach can avoid direct + use of allocation handles, buffer descriptors, MMRs, and the registry. 
When + using :mod:`multiprocessing` to spawn processes or send objects through + communication channels such as :class:`multiprocessing.Queue`, + :class:`multiprocessing.Pipe`, or :class:`multiprocessing.Connection`, + :class:`Buffer` objects may be sent directly, and in such cases the process + for creating MMRs and mapping buffers will be handled automatically. + + For greater efficiency when transferring many buffers, one may also send + MRs and buffers separately. When an MR is sent via :mod:`multiprocessing`, + an MMR is created and registered in the receiving process. Subsequently, + buffers may be serialized and transferred using ordinary :mod:`pickle` + methods. The reconstruction procedure uses the registry to find the + associated MMR. """ __slots__ = ("_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", - "_mempool_owned", "_is_imported", "_uuid", "_alloc_handle") + "_mempool_owned", "_is_mapped", "_uuid", "_alloc_handle") def __init__(self, device_id: int | Device, options=None): device_id = getattr(device_id, 'device_id', device_id) @@ -577,7 +633,7 @@ class DeviceMemoryResource(MemoryResource): self._attributes = None self._ipc_handle_type = _NOIPC_HANDLE_TYPE self._mempool_owned = False - self._is_imported = False + self._is_mapped = False self._uuid = None self._alloc_handle = None @@ -620,7 +676,7 @@ class DeviceMemoryResource(MemoryResource): self._attributes = None self._ipc_handle_type = properties.handleTypes self._mempool_owned = True - self._is_imported = False + self._is_mapped = False self._uuid = None self._alloc_handle = None @@ -641,38 +697,46 @@ class DeviceMemoryResource(MemoryResource): err, = driver.cuMemPoolDestroy(self._mempool_handle) raise_if_driver_error(err) finally: - self.unregister() + if self.is_mapped: + self.unregister() self._dev_id = None self._mempool_handle = None self._attributes = None self._ipc_handle_type = _NOIPC_HANDLE_TYPE self._mempool_owned = False - self._is_imported = False + self._is_mapped = 
False self._uuid = None self._alloc_handle = None def __reduce__(self): - # If spawning a new process, serialize the resources; otherwise, just - # send the UUID, using the registry on the receiving end. - import multiprocessing - is_spawning = multiprocessing.context.get_spawning_popen() is not None - if is_spawning: - from ._device import Device - device = Device(self.device_id) - alloc_handle = self.get_allocation_handle() - return DeviceMemoryResource.from_allocation_handle, (device, alloc_handle) - else: - return DeviceMemoryResource.from_registry, (self.uuid,) + return DeviceMemoryResource.from_registry, (self.uuid,) @staticmethod - def from_registry(uuid: uuid.UUID): + def from_registry(uuid: uuid.UUID) -> DeviceMemoryResource: + """ + Obtain a registered mapped memory resource. + + Raises + ------ + RuntimeError + If no mapped memory resource is found in the registry. + """ + try: return _ipc_registry[uuid] except KeyError: raise RuntimeError(f"Memory resource {uuid} was not found") from None - def register(self, uuid: uuid.UUID): + def register(self, uuid: uuid.UUID) -> DeviceMemoryResource: + """ + Register a mapped memory resource. + + Returns + ------- + The registered mapped memory resource. If one was previously registered + with the given key, it is returned. + """ existing = _ipc_registry.get(uuid) if existing is not None: return existing @@ -682,12 +746,18 @@ class DeviceMemoryResource(MemoryResource): return self def unregister(self): - if _ipc_registry is not None: + """Unregister this mapped memory resource.""" + assert self.is_mapped + if _ipc_registry is not None: # can occur during shutdown catastrophe with contextlib.suppress(KeyError): del _ipc_registry[self.uuid] @property - def uuid(self): + def uuid(self) -> Optional[uuid.UUID]: + """ + A universally unique identifier for this memory resource. Meaningful + only for IPC-enabled memory resources. 
+ """ return self._uuid @classmethod @@ -711,6 +781,12 @@ class DeviceMemoryResource(MemoryResource): ------- A new device memory resource instance with the imported handle. """ + # Quick exit for registry hits. + uuid = getattr(alloc_handle, 'uuid', None) + self = _ipc_registry.get(uuid) + if self is not None: + return self + device_id = getattr(device_id, 'device_id', device_id) self = cls.__new__(cls) @@ -719,15 +795,15 @@ class DeviceMemoryResource(MemoryResource): self._attributes = None self._ipc_handle_type = _IPC_HANDLE_TYPE self._mempool_owned = True - self._is_imported = True + self._is_mapped = True self._uuid = None self._alloc_handle = None # only used for non-imported err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) - uuid = getattr(alloc_handle, 'uuid', None) if uuid is not None: - self = self.register(uuid) + registered = self.register(uuid) + assert registered is self return self def get_allocation_handle(self) -> IPCAllocationHandle: @@ -743,13 +819,13 @@ class DeviceMemoryResource(MemoryResource): if self._alloc_handle is None: if not self.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") - if self._is_imported: + if self._is_mapped: raise RuntimeError("Imported memory resource cannot be exported") err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) try: assert self._uuid is None - import uuid as uuid + import uuid self._uuid = uuid.uuid4() self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) except: @@ -774,8 +850,8 @@ class DeviceMemoryResource(MemoryResource): The allocated buffer object, which is accessible on the device that this memory resource was created for. 
""" - if self._is_imported: - raise TypeError("Cannot allocate from shared memory pool imported via IPC") + if self._is_mapped: + raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") if stream is None: stream = default_stream() err, ptr = driver.cuMemAllocFromPoolAsync(size, self._mempool_handle, stream.handle) @@ -823,9 +899,12 @@ class DeviceMemoryResource(MemoryResource): return self._mempool_owned @property - def is_imported(self) -> bool: - """Whether the memory resource was imported from another process. If True, allocation is not permitted.""" - return self._is_imported + def is_mapped(self) -> bool: + """ + Whether this is a mapping of an IPC-enabled memory resource from + another process. If True, allocation is not permitted. + """ + return self._is_mapped @property def is_device_accessible(self) -> bool: @@ -843,6 +922,16 @@ class DeviceMemoryResource(MemoryResource): return self._ipc_handle_type != _NOIPC_HANDLE_TYPE +def _deep_reduce_device_memory_resource(mr): + from ._device import Device + device = Device(mr.device_id) + alloc_handle = mr.get_allocation_handle() + return mr.from_allocation_handle, (device, alloc_handle) + + +multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource) + + class LegacyPinnedMemoryResource(MemoryResource): """Create a pinned memory resource that uses legacy cuMemAllocHost/cudaMallocHost APIs. diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 9c93d0f75..f239c69cd 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -30,6 +30,7 @@ CUDA runtime :template: dataclass.rst + DeviceMemoryResourceOptions EventOptions GraphCompleteOptions GraphDebugPrintOptions diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index fb36e0a30..917b7101d 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -4,9 +4,9 @@ :orphan: .. 
This page is to generate documentation for private classes exposed to users, - i.e., users cannot instantiate it by themselves but may use it's properties - or methods via returned values from public APIs. These classes must be referred - in public APIs returning their instances. + i.e., users cannot instantiate them but may use their properties or methods + via returned values from public APIs. These classes must be referred in + public APIs returning their instances. .. currentmodule:: cuda.core.experimental @@ -18,8 +18,9 @@ CUDA runtime _memory.PyCapsule _memory.DevicePointerT - _memory.IPCBufferDescriptor _device.DeviceProperties + _memory.IPCAllocationHandle + _memory.IPCBufferDescriptor _module.KernelAttributes _module.KernelOccupancy _module.ParamInfo diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index e0d87a0d0..d6b1dc86d 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import multiprocessing +import pickle import re from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions @@ -64,7 +65,7 @@ def CHILD_ACTION(self, queue): def ASSERT(self, exc_type, exc_msg): assert exc_type is TypeError - assert exc_msg == "Cannot allocate from shared memory pool imported via IPC" + assert exc_msg == "Cannot allocate from a mapped IPC-enabled memory resource" class TestImportWrongMR(ChildErrorHarness): @@ -128,11 +129,13 @@ def PARENT_ACTION(self, queue): options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mr2 = DeviceMemoryResource(self.device, options=options) self.buffer = mr2.allocate(NBYTES) - queue.put(self.buffer) # Note: mr2 not sent + buffer_s = pickle.dumps(self.buffer) + queue.put(buffer_s) # Note: mr2 not sent def CHILD_ACTION(self, queue): Device().set_current() - queue.get(timeout=CHILD_TIMEOUT_SEC) + buffer_s = 
queue.get(timeout=CHILD_TIMEOUT_SEC) + pickle.loads(buffer_s) def ASSERT(self, exc_type, exc_msg): assert exc_type is TypeError assert exc_msg == "Cannot allocate from a mapped IPC-enabled memory resource" class TestImportWrongMR(ChildErrorHarness): diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index cd2259c4e..b7be23267 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -1,8 +1,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import multiprocessing from itertools import cycle +import multiprocessing as mp +import pickle +import pytest from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper @@ -14,44 +16,58 @@ NTASKS = 20 POOL_SIZE = 2097152 -# Global memory resources, set in children. -g_mrs = None - -class TestIpcWorkerPoolUsingExport: +class TestIpcWorkerPool: """ - Test buffer sharing using export handles. + Map a function over shared buffers using a worker pool to distribute work. - The memory resources need to be passed to subprocesses at startup. Buffers - are passed by their handles and reconstructed using the corresponding mr. + This demonstrates the simplest interface, though not the most efficient + one. Each buffer transfer involves a deep transfer of the associated memory + resource (duplicates are ignored on the receiving end). 
""" - @staticmethod - def init_worker(mrs): - global g_mrs - g_mrs = mrs - - def test_ipc_workerpool(self, ipc_device, ipc_memory_resource): - """Test IPC with a worker pool.""" + @pytest.mark.parametrize("nmrs", (1, NMRS)) + def test_main(self, ipc_device, nmrs): device = ipc_device - mr = ipc_memory_resource - buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] - with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: - pool.starmap(self.process_buffer, [(0, buffer.get_ipc_descriptor()) for buffer in buffers]) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)] + buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] + + with mp.Pool(NWORKERS) as pool: + pool.map(self.process_buffer, buffers) for buffer in buffers: IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) - def test_ipc_workerpool_multi_mr(self, ipc_device, ipc_memory_resource): - """Test IPC with a worker pool using multiple memory resources.""" + def process_buffer(self, buffer): + device = Device() + IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) + + +class TestIpcWorkerPoolUsingIPCDescriptors: + """ + Test buffer sharing using IPC descriptors. + + The memory resources are passed to subprocesses at startup. Buffers are + passed by their handles and reconstructed using the corresponding resource. 
+ """ + + @staticmethod + def init_worker(mrs): + """Called during child process initialization to store received memory resources.""" + TestIpcWorkerPoolUsingIPCDescriptors.mrs = mrs + + @pytest.mark.parametrize("nmrs", (1, NMRS)) + def test_main(self, ipc_device, nmrs): device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) - mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] + mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)] buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] - with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: + + with mp.Pool(NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: pool.starmap( self.process_buffer, - [(mrs.index(buffer.memory_resource), buffer.get_ipc_descriptor()) for buffer in buffers], + [(mrs.index(buffer.memory_resource), buffer.get_ipc_descriptor()) for buffer in buffers] ) for buffer in buffers: @@ -59,47 +75,42 @@ def test_ipc_workerpool_multi_mr(self, ipc_device, ipc_memory_resource): def process_buffer(self, mr_idx, buffer_desc): device = Device() - buffer = Buffer.from_ipc_descriptor(g_mrs[mr_idx], buffer_desc) + buffer = Buffer.from_ipc_descriptor(self.mrs[mr_idx], buffer_desc) IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) -class TestIpcWorkerPool: +class TestIpcWorkerPoolUsingRegistry: """ - Test buffer sharing without using export handles. + Test buffer sharing using the memory resource registry. - The memory resources need to be passed to subprocesses at startup. Buffers - are serialized with the `uuid` of the corresponding mr, and the - import/export is handled automatically. + The memory resources are passed to subprocesses at startup, which + implicitly registers them. Buffers are passed via serialization and matched + to the corresponding memory resource through the registry. 
This is more + complicated than the simple example (first, above) but passes buffers more + efficiently. """ @staticmethod def init_worker(mrs): - global g_mrs - g_mrs = mrs + # Passing mrs implicitly registers them. + pass - def test_ipc_workerpool(self, ipc_device, ipc_memory_resource): - """Test IPC with a worker pool.""" - device = ipc_device - mr = ipc_memory_resource - buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] - with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: - pool.map(self.process_buffer, buffers) - - for buffer in buffers: - IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) - - def test_ipc_workerpool_multi_mr(self, ipc_device, ipc_memory_resource): - """Test IPC with a worker pool using multiple memory resources.""" + @pytest.mark.parametrize("nmrs", (1, NMRS)) + def test_main(self, ipc_device, nmrs): device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) - mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] + mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)] buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] - with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: - pool.map(self.process_buffer, buffers) + + with mp.Pool(NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: + pool.map(self.process_buffer, [pickle.dumps(buffer) for buffer in buffers] + ) for buffer in buffers: IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) - def process_buffer(self, buffer): + def process_buffer(self, buffer_s): device = Device() + buffer = pickle.loads(buffer_s) IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) + From e5b8542a86bbdff1011ada605494e4d7e5f7330a Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 2 Oct 2025 16:56:04 -0700 Subject: [PATCH 24/25] Remove 
call to set_current in Device reconstruction. Add device set-up to tests. --- cuda_core/cuda/core/experimental/_device.py | 18 ++----- cuda_core/cuda/core/experimental/_memory.pyx | 2 +- cuda_core/tests/memory_ipc/test_errors.py | 5 +- cuda_core/tests/memory_ipc/test_leaks.py | 4 +- cuda_core/tests/memory_ipc/test_memory_ipc.py | 30 ++++++------ .../tests/memory_ipc/test_send_buffers.py | 47 ++++++------------- cuda_core/tests/memory_ipc/test_serialize.py | 9 ++-- cuda_core/tests/memory_ipc/test_workerpool.py | 25 +++++----- 8 files changed, 55 insertions(+), 85 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 91ae7829c..be8c5170a 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -import multiprocessing import threading from typing import Optional, Union @@ -1161,6 +1160,9 @@ def __int__(self): def __repr__(self): return f"" + def __reduce__(self): + return Device, (self.device_id,) + def set_current(self, ctx: Context = None) -> Union[Context, None]: """Set device to be used for GPU executions. 
@@ -1335,17 +1337,3 @@ def create_graph_builder(self) -> GraphBuilder: """ self._check_context_initialized() return GraphBuilder._init(stream=self.create_stream(), is_stream_owner=True) - - -def _reduce_device(device): - return _reconstruct_device, (device.device_id,) - - -def _reconstruct_device(device_id): - device = Device(device_id) - if not device._has_inited: - device.set_current() - return device - - -multiprocessing.reduction.register(Device, _reduce_device) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 024ed3c3f..0b9f8a28b 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -923,7 +923,7 @@ class DeviceMemoryResource(MemoryResource): def _deep_reduce_device_memory_resource(mr): - from ._device import Device + from . import Device device = Device(mr.device_id) alloc_handle = mr.get_allocation_handle() return mr.from_allocation_handle, (device, alloc_handle) diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index d6b1dc86d..3e8265b39 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -43,6 +43,7 @@ def test_main(self, ipc_device, ipc_memory_resource): def child_main(self, pipe, device, mr): """Child process that pushes IPC errors to a shared pipe for testing.""" self.device = device + self.device.set_current() self.mr = mr try: self.CHILD_ACTION(pipe[0]) @@ -129,13 +130,13 @@ def PARENT_ACTION(self, queue): options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mr2 = DeviceMemoryResource(self.device, options=options) self.buffer = mr2.allocate(NBYTES) - buffer_s = pickle.dumps(self.buffer) + buffer_s = pickle.dumps(self.buffer) # noqa: S301 queue.put(buffer_s) # Note: mr2 not sent def CHILD_ACTION(self, queue): Device().set_current() buffer_s = queue.get(timeout=CHILD_TIMEOUT_SEC) - pickle.loads(buffer_s) + 
pickle.loads(buffer_s) # noqa: S301 def ASSERT(self, exc_type, exc_msg): assert exc_type is RuntimeError diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index 387ca7042..bfead7dd3 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -33,7 +33,7 @@ def test_alloc_handle(ipc_memory_resource): [mr.get_allocation_handle() for _ in range(10)] -def exec_with_object(obj, number=1): +def exec_success(obj, number=1): """Succesfully run a child process.""" for _ in range(number): process = mp.Process(target=child_main, args=(obj,)) @@ -92,7 +92,7 @@ def __reduce__(self): ], ids=["alloc_handle", "mr", "buffer", "buffer_desc"], ) -@pytest.mark.parametrize("launcher", [exec_with_object, exec_launch_failure, exec_reduce_failure]) +@pytest.mark.parametrize("launcher", [exec_success, exec_launch_failure, exec_reduce_failure]) def test_pass_object(ipc_memory_resource, launcher, getobject): """Check for fd leaks when an object is sent as a subprocess argument.""" mr = ipc_memory_resource diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index da4678afe..9ed24792b 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -3,7 +3,7 @@ import multiprocessing as mp -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from cuda.core.experimental import Buffer, DeviceMemoryResource from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 20 @@ -21,7 +21,7 @@ def test_main(self, ipc_device, ipc_memory_resource): # Start the child process. queue = mp.Queue() - process = mp.Process(target=self.child_main, args=(mr, queue)) + process = mp.Process(target=self.child_main, args=(device, mr, queue)) process.start() # Allocate and fill memory. @@ -39,8 +39,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Verify that the buffer was modified. 
helper.verify_buffer(flipped=True) - def child_main(self, mr, queue): - device = Device() + def child_main(self, device, mr, queue): + device.set_current() buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) helper = IPCBufferTestHelper(device, buffer) helper.verify_buffer(flipped=False) @@ -64,8 +64,8 @@ def test_main(self, ipc_device, ipc_memory_resource): q2.put(buffer2) # Start the child processes. - p1 = mp.Process(target=self.child_main, args=(mr, 1, q1)) - p2 = mp.Process(target=self.child_main, args=(mr, 2, q2)) + p1 = mp.Process(target=self.child_main, args=(device, mr, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(device, mr, 2, q2)) p1.start() p2.start() @@ -79,10 +79,10 @@ def test_main(self, ipc_device, ipc_memory_resource): IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) - def child_main(self, mr, idx, queue): + def child_main(self, device, mr, idx, queue): # Note: passing the mr registers it so that buffers can be passed # directly. - device = Device() + device.set_current() buffer1 = queue.get(timeout=CHILD_TIMEOUT_SEC) buffer2 = queue.get(timeout=CHILD_TIMEOUT_SEC) if idx == 1: @@ -104,8 +104,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Start children. 
q1, q2 = (mp.Queue() for _ in range(2)) - p1 = mp.Process(target=self.child_main, args=(alloc_handle, 1, q1)) - p2 = mp.Process(target=self.child_main, args=(alloc_handle, 2, q2)) + p1 = mp.Process(target=self.child_main, args=(device, alloc_handle, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(device, alloc_handle, 2, q2)) p1.start() p2.start() @@ -125,11 +125,10 @@ def test_main(self, ipc_device, ipc_memory_resource): IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - def child_main(self, alloc_handle, idx, queue): + def child_main(self, device, alloc_handle, idx, queue): """Fills a shared memory buffer.""" # In this case, the device needs to be set up (passing the mr does it # implicitly in other tests). - device = Device() device.set_current() mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) buffer_descriptor = queue.get(timeout=CHILD_TIMEOUT_SEC) @@ -149,8 +148,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Start children. q1, q2 = (mp.Queue() for _ in range(2)) - p1 = mp.Process(target=self.child_main, args=(alloc_handle, 1, q1)) - p2 = mp.Process(target=self.child_main, args=(alloc_handle, 2, q2)) + p1 = mp.Process(target=self.child_main, args=(device, alloc_handle, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(device, alloc_handle, 2, q2)) p1.start() p2.start() @@ -170,9 +169,8 @@ def test_main(self, ipc_device, ipc_memory_resource): IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - def child_main(self, alloc_handle, idx, queue): + def child_main(self, device, alloc_handle, idx, queue): """Fills a shared memory buffer.""" - device = Device() device.set_current() # Register the memory resource. 
diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 3e3f44865..966b6eafc 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -1,10 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import multiprocessing +import multiprocessing as mp from itertools import cycle -from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions +import pytest +from cuda.core.experimental import DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 20 @@ -14,37 +15,13 @@ POOL_SIZE = 2097152 -def test_ipc_send_buffers(ipc_device, ipc_memory_resource): - """Test passing buffers directly to a child separately from a memory resource.""" - device = ipc_device - mr = ipc_memory_resource - - # Allocate and fill memory. - buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] - for buffer in buffers: - helper = IPCBufferTestHelper(device, buffer) - helper.fill_buffer(flipped=False) - - # Start the child process. Send the buffer directly. - process = multiprocessing.Process(target=child_main, args=(buffers,)) - process.start() - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - # Verify that the buffers were modified. - for buffer in buffers: - helper = IPCBufferTestHelper(device, buffer) - helper.verify_buffer(flipped=True) - - -def test_ipc_send_buffers_multi(ipc_device, ipc_memory_resource): +@pytest.mark.parametrize("nmrs", (1, NMRS)) +def test_ipc_send_buffers(ipc_device, nmrs): """Test passing buffers sourced from multiple memory resources.""" # Set up several IPC-enabled memory pools. 
device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) - mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] + mrs = [DeviceMemoryResource(device, options=options) for _ in range(NMRS)] # Allocate and fill memory. buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] @@ -53,7 +30,13 @@ def test_ipc_send_buffers_multi(ipc_device, ipc_memory_resource): helper.fill_buffer(flipped=False) # Start the child process. - process = multiprocessing.Process(target=child_main, args=(buffers,)) + process = mp.Process( + target=child_main, + args=( + device, + buffers, + ), + ) process.start() # Wait for the child process. @@ -66,8 +49,8 @@ def test_ipc_send_buffers_multi(ipc_device, ipc_memory_resource): helper.verify_buffer(flipped=True) -def child_main(buffers): - device = Device() +def child_main(device, buffers): + device.set_current() for buffer in buffers: helper = IPCBufferTestHelper(device, buffer) helper.verify_buffer(flipped=False) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index ba0ac326f..2d88bcd03 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -33,7 +33,6 @@ def test_main(self, ipc_device, ipc_memory_resource): # Send a memory resource by allocation handle. alloc_handle = mr.get_allocation_handle() mp.reduction.send_handle(parent_conn, alloc_handle.handle, process.pid) - parent_conn.send(mr.uuid) # Send a buffer. buffer1 = mr.allocate(NBYTES) @@ -57,9 +56,7 @@ def child_main(self, conn): # Receive the memory resource. handle = mp.reduction.recv_handle(conn) - uuid = conn.recv() mr = DeviceMemoryResource.from_allocation_handle(device, handle) - mr.register(uuid) os.close(handle) # Receive the buffers. 
@@ -135,7 +132,7 @@ def test_object_passing(ipc_device, ipc_memory_resource): helper.fill_buffer(flipped=False) # Start the child process. - process = mp.Process(target=child_main, args=(device, alloc_handle, mr, buffer_desc, buffer)) + process = mp.Process(target=child_main, args=(alloc_handle, mr, buffer_desc, buffer)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) assert process.exitcode == 0 @@ -143,7 +140,9 @@ def test_object_passing(ipc_device, ipc_memory_resource): helper.verify_buffer(flipped=True) -def child_main(device, alloc_handle, mr1, buffer_desc, buffer1): +def child_main(alloc_handle, mr1, buffer_desc, buffer1): + device = Device() + device.set_current() mr2 = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) # OK to build the buffer from either mr and the descriptor. diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index b7be23267..401324e05 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -1,11 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from itertools import cycle import multiprocessing as mp import pickle -import pytest +from itertools import cycle +import pytest from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper @@ -40,7 +40,8 @@ def test_main(self, ipc_device, nmrs): IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) def process_buffer(self, buffer): - device = Device() + device = Device(buffer.memory_resource.device_id) + device.set_current() IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) @@ -67,15 +68,17 @@ def test_main(self, ipc_device, nmrs): with mp.Pool(NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: pool.starmap( self.process_buffer, - [(mrs.index(buffer.memory_resource), buffer.get_ipc_descriptor()) for buffer in buffers] + [(mrs.index(buffer.memory_resource), buffer.get_ipc_descriptor()) for buffer in buffers], ) for buffer in buffers: IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) def process_buffer(self, mr_idx, buffer_desc): - device = Device() - buffer = Buffer.from_ipc_descriptor(self.mrs[mr_idx], buffer_desc) + mr = self.mrs[mr_idx] + device = Device(mr.device_id) + device.set_current() + buffer = Buffer.from_ipc_descriptor(mr, buffer_desc) IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) @@ -103,14 +106,12 @@ def test_main(self, ipc_device, nmrs): buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] with mp.Pool(NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: - pool.map(self.process_buffer, [pickle.dumps(buffer) for buffer in buffers] - ) + pool.starmap(self.process_buffer, [(device, pickle.dumps(buffer)) for buffer in buffers]) for buffer in buffers: IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) - def process_buffer(self, buffer_s): - device = Device() - buffer = pickle.loads(buffer_s) + def process_buffer(self, 
device, buffer_s): + device.set_current() + buffer = pickle.loads(buffer_s) # noqa: S301 IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) - From 534b16ae9a117b131a2f9d953c20d1d388baae1c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 3 Oct 2025 01:49:24 +0000 Subject: [PATCH 25/25] fix docstring rendering --- cuda_core/cuda/core/experimental/_memory.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 0b9f8a28b..3fdc1410f 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -563,15 +563,15 @@ class DeviceMemoryResource(MemoryResource): device memory resource does not own the pool (`is_handle_owned` is `False`), and closing the resource has no effect. - IPC-Enabled Memory Resources - ---------------------------- - If ``ipc_enabled=True`` is specified as an initializer option, the memory - resource constructed will be capable of sharing allocations between - processes. Sharing an allocation is a two-step procedure that involves + Notes + ----- + To create an IPC-Enabled memory resource (MR) that is capable of sharing + allocations between processes, specify ``ipc_enabled=True`` in the initializer + option. Sharing an allocation is a two-step procedure that involves mapping a memory resource and then mapping buffers owned by that resource. These steps can be accomplished in several ways. - An IPC-enabled memory resource (MR) can allocate memory buffers but cannot + An IPC-enabled memory resource can allocate memory buffers but cannot receive shared buffers. Mapping an MR to another process creates a "mapped memory resource" (MMR). An MMR cannot allocate memory buffers and can only receive shared buffers. MRs and MMRs are both of type