From 846c75cf16572673f20c940d7a25a46db23ad808 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Sep 2025 10:48:54 -0700 Subject: [PATCH 01/25] Restructures IPC mempool tests into a subdirectory. --- cuda_core/tests/ipc/conftest.py | 33 +++++ cuda_core/tests/ipc/test_ipc_errors.py | 45 ++++++ cuda_core/tests/ipc/test_ipc_mempool.py | 51 +++++++ cuda_core/tests/ipc/utility.py | 70 +++++++++ cuda_core/tests/test_ipc_mempool.py | 179 ------------------------ 5 files changed, 199 insertions(+), 179 deletions(-) create mode 100644 cuda_core/tests/ipc/conftest.py create mode 100644 cuda_core/tests/ipc/test_ipc_errors.py create mode 100644 cuda_core/tests/ipc/test_ipc_mempool.py create mode 100644 cuda_core/tests/ipc/utility.py delete mode 100644 cuda_core/tests/test_ipc_mempool.py diff --git a/cuda_core/tests/ipc/conftest.py b/cuda_core/tests/ipc/conftest.py new file mode 100644 index 000000000..2ac6d858b --- /dev/null +++ b/cuda_core/tests/ipc/conftest.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from cuda.core.experimental import Device, DeviceMemoryResource + +POOL_SIZE = 2097152 + +@pytest.fixture(scope="function") +def device(): + """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" + # Check if IPC is supported on this platform/device + device = Device() + device.set_current() + + if not device.properties.memory_pools_supported: + pytest.skip("Device does not support mempool operations") + + # Note: Linux specific. Once Windows support for IPC is implemented, this + # test should be updated. 
+ if not device.properties.handle_type_posix_file_descriptor_supported: + pytest.skip("Device does not support IPC") + + return device + + +@pytest.fixture(scope="function") +def ipc_memory_resource(device): + mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) + assert mr.is_ipc_enabled + return mr + + diff --git a/cuda_core/tests/ipc/test_ipc_errors.py b/cuda_core/tests/ipc/test_ipc_errors.py new file mode 100644 index 000000000..4ac3277e6 --- /dev/null +++ b/cuda_core/tests/ipc/test_ipc_errors.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import Device, DeviceMemoryResource, IPCChannel +import multiprocessing +import pytest + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 + +def test_ipc_errors(device, ipc_memory_resource): + """Test expected errors with allocating from a shared IPC memory pool.""" + mr = ipc_memory_resource + # Set up the IPC-enabled memory pool and share it. + channel = IPCChannel() + mr.share_to_channel(channel) + + # Start a child process to generate error info. + queue = multiprocessing.Queue() + process = multiprocessing.Process(target=child_main, args=(channel, queue)) + process.start() + + # Check the errors. + exc_type, exc_msg = queue.get(timeout=CHILD_TIMEOUT_SEC) + assert exc_type is TypeError + assert exc_msg == "Cannot allocate from shared memory pool imported via IPC" + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + +def child_main(channel, queue): + """Child process that pushes IPC errors to a shared queue for testing.""" + device = Device() + device.set_current() + + mr = DeviceMemoryResource.from_shared_channel(device, channel) + + # Allocating from an imported pool. 
+ try: + mr.allocate(NBYTES) + except Exception as e: + exc_info = type(e), str(e) + queue.put(exc_info) diff --git a/cuda_core/tests/ipc/test_ipc_mempool.py b/cuda_core/tests/ipc/test_ipc_mempool.py new file mode 100644 index 000000000..6b2dc1d7f --- /dev/null +++ b/cuda_core/tests/ipc/test_ipc_mempool.py @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel +from utility import IPCBufferTestHelper +import multiprocessing +import pytest + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 + +def test_ipc_mempool(device, ipc_memory_resource): + """Test IPC with memory pools.""" + # Set up the IPC-enabled memory pool and share it. + mr = ipc_memory_resource + channel = IPCChannel() + mr.share_to_channel(channel) + + # Start the child process. + queue = multiprocessing.Queue() + process = multiprocessing.Process(target=child_main, args=(channel, queue)) + process.start() + + # Allocate and fill memory. + buffer = mr.allocate(NBYTES) + helper = IPCBufferTestHelper(device, buffer, NBYTES) + helper.fill_buffer(flipped=False) + + # Export the buffer via IPC. + handle = buffer.export() + queue.put(handle) + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + # Verify that the buffer was modified. 
+ helper.verify_buffer(flipped=True) + + +def child_main(channel, queue): + device = Device() + device.set_current() + + mr = DeviceMemoryResource.from_shared_channel(device, channel) + handle = queue.get() # Get exported buffer data + buffer = Buffer.import_(mr, handle) + + helper = IPCBufferTestHelper(device, buffer, NBYTES) + helper.verify_buffer(flipped=False) + helper.fill_buffer(flipped=True) diff --git a/cuda_core/tests/ipc/utility.py b/cuda_core/tests/ipc/utility.py new file mode 100644 index 000000000..f778578bd --- /dev/null +++ b/cuda_core/tests/ipc/utility.py @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +try: + from cuda.bindings import driver +except ImportError: + from cuda import cuda as driver + +import ctypes +from cuda.core.experimental import Buffer, MemoryResource +from cuda.core.experimental._utils.cuda_utils import handle_return + +class DummyUnifiedMemoryResource(MemoryResource): + def __init__(self, device): + self.device = device + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) + return Buffer.from_handle(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + handle_return(driver.cuMemFree(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return True + + @property + def device_id(self) -> int: + return self.device + + +class IPCBufferTestHelper: + """A helper for manipulating memory buffers in IPC tests. + + Provides methods to fill a buffer with one of two test patterns and verify + the expected values. 
+ """ + + def __init__(self, device, buffer, nbytes): + self.device = device + self.buffer = buffer + self.nbytes = nbytes + self.scratch_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.nbytes) + self.stream = device.create_stream() + + def fill_buffer(self, flipped=False): + """Fill a device buffer with test pattern using unified memory.""" + ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) + op = (lambda i: 255 - i) if flipped else (lambda i: i) + for i in range(self.nbytes): + ptr[i] = ctypes.c_byte(op(i)) + self.buffer.copy_from(self.scratch_buffer, stream=self.stream) + self.device.sync() + + def verify_buffer(self, flipped=False): + """Verify the buffer contents.""" + self.scratch_buffer.copy_from(self.buffer, stream=self.stream) + self.device.sync() + ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) + op = (lambda i: 255 - i) if flipped else (lambda i: i) + for i in range(self.nbytes): + assert ctypes.c_byte(ptr[i]).value == ctypes.c_byte(op(i)).value, ( + f"Buffer contains incorrect data at index {i}" + ) + diff --git a/cuda_core/tests/test_ipc_mempool.py b/cuda_core/tests/test_ipc_mempool.py deleted file mode 100644 index 5c4c38275..000000000 --- a/cuda_core/tests/test_ipc_mempool.py +++ /dev/null @@ -1,179 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -try: - from cuda.bindings import driver -except ImportError: - from cuda import cuda as driver - -import ctypes -import multiprocessing - -import pytest - -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel, MemoryResource -from cuda.core.experimental._utils.cuda_utils import handle_return - -CHILD_TIMEOUT_SEC = 10 -NBYTES = 64 -POOL_SIZE = 2097152 - - -@pytest.fixture(scope="function") -def ipc_device(): - """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" - # Check if IPC is supported on this platform/device - device = Device() - device.set_current() - - if not device.properties.memory_pools_supported: - pytest.skip("Device does not support mempool operations") - - # Note: Linux specific. Once Windows support for IPC is implemented, this - # test should be updated. - if not device.properties.handle_type_posix_file_descriptor_supported: - pytest.skip("Device does not support IPC") - - return device - - -def test_ipc_mempool(ipc_device): - """Test IPC with memory pools.""" - # Set up the IPC-enabled memory pool and share it. - stream = ipc_device.create_stream() - mr = DeviceMemoryResource(ipc_device, dict(max_size=POOL_SIZE, ipc_enabled=True)) - assert mr.is_ipc_enabled - channel = IPCChannel() - mr.share_to_channel(channel) - - # Start the child process. - queue = multiprocessing.Queue() - process = multiprocessing.Process(target=child_main1, args=(channel, queue)) - process.start() - - # Allocate and fill memory. - buffer = mr.allocate(NBYTES, stream=stream) - protocol = IPCBufferTestProtocol(ipc_device, buffer, stream=stream) - protocol.fill_buffer(flipped=False) - stream.sync() - - # Export the buffer via IPC. - handle = buffer.export() - queue.put(handle) - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - # Verify that the buffer was modified. 
- protocol.verify_buffer(flipped=True) - - -def child_main1(channel, queue): - device = Device() - device.set_current() - stream = device.create_stream() - - mr = DeviceMemoryResource.from_shared_channel(device, channel) - handle = queue.get() # Get exported buffer data - buffer = Buffer.import_(mr, handle) - - protocol = IPCBufferTestProtocol(device, buffer, stream=stream) - protocol.verify_buffer(flipped=False) - protocol.fill_buffer(flipped=True) - stream.sync() - - -def test_shared_pool_errors(ipc_device): - """Test expected errors with allocating from a shared IPC memory pool.""" - # Set up the IPC-enabled memory pool and share it. - mr = DeviceMemoryResource(ipc_device, dict(max_size=POOL_SIZE, ipc_enabled=True)) - channel = IPCChannel() - mr.share_to_channel(channel) - - # Start a child process to generate error info. - queue = multiprocessing.Queue() - process = multiprocessing.Process(target=child_main2, args=(channel, queue)) - process.start() - - # Check the errors. - exc_type, exc_msg = queue.get(timeout=CHILD_TIMEOUT_SEC) - assert exc_type is TypeError - assert exc_msg == "Cannot allocate from shared memory pool imported via IPC" - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - -def child_main2(channel, queue): - """Child process that pushes IPC errors to a shared queue for testing.""" - device = Device() - device.set_current() - - mr = DeviceMemoryResource.from_shared_channel(device, channel) - - # Allocating from an imported pool. 
- try: - mr.allocate(NBYTES) - except Exception as e: - exc_info = type(e), str(e) - queue.put(exc_info) - - -class DummyUnifiedMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) - return Buffer.from_handle(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFree(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return True - - @property - def device_id(self) -> int: - return self.device - - -class IPCBufferTestProtocol: - """The protocol for verifying IPC. - - Provides methods to fill a buffer with one of two test patterns and verify - the expected values. - """ - - def __init__(self, device, buffer, nbytes=NBYTES, stream=None): - self.device = device - self.buffer = buffer - self.nbytes = nbytes - self.stream = stream if stream is not None else device.create_stream() - self.scratch_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.nbytes, stream=self.stream) - - def fill_buffer(self, flipped=False): - """Fill a device buffer with test pattern using unified memory.""" - ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) - op = (lambda i: 255 - i) if flipped else (lambda i: i) - for i in range(self.nbytes): - ptr[i] = ctypes.c_byte(op(i)) - self.buffer.copy_from(self.scratch_buffer, stream=self.stream) - - def verify_buffer(self, flipped=False): - """Verify the buffer contents.""" - self.scratch_buffer.copy_from(self.buffer, stream=self.stream) - self.stream.sync() - ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) - op = (lambda i: 255 - i) if flipped else (lambda i: i) - for i in range(self.nbytes): - assert ctypes.c_byte(ptr[i]).value == 
ctypes.c_byte(op(i)).value, ( - f"Buffer contains incorrect data at index {i}" - ) From 238db00c33ff185e54565799920797929a68d43e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Sep 2025 11:10:54 -0700 Subject: [PATCH 02/25] Simplify the IPC interface, adding create_ipc_channel and import_/export methods. --- cuda_core/cuda/core/experimental/_memory.pyx | 32 +++++++++++++++++++- cuda_core/tests/conftest.py | 2 ++ cuda_core/tests/ipc/test_ipc_mempool.py | 19 ++++-------- 3 files changed, 39 insertions(+), 14 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 41a506a58..fcf40ab7e 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -11,10 +11,12 @@ from cuda.core.experimental._utils.cuda_utils cimport ( ) from dataclasses import dataclass -from typing import TypeVar, Union, TYPE_CHECKING +from typing import Optional, TypeVar, Union, TYPE_CHECKING import abc import array +import collections import cython +import multiprocessing import os import platform import weakref @@ -436,12 +438,34 @@ cdef class IPCChannel: cdef: object _proxy + object _queue + object _mr def __init__(self): if platform.system() == "Linux": self._proxy = IPCChannelUnixSocket._init() else: raise RuntimeError("IPC is not available on {platform.system()}") + self._queue = multiprocessing.Queue() + self._mr = None + + def export(self, buffer: Buffer | collections.abc.Sequence): + if not isinstance(buffer, collections.abc.Sequence): + buffer = [buffer] + + for buf in buffer: + handle = buf.export() + self._queue.put(handle) + + def import_(self, device: Optional[Device] = None): + if self._mr is None: + if device is None: + from cuda.core.experimental._device import Device + device = Device() + self._mr = DeviceMemoryResource.from_shared_channel(device, self) + + handle = self._queue.get() + return Buffer.import_(self._mr, handle) cdef class IPCChannelUnixSocket: @@ 
-658,6 +682,12 @@ class DeviceMemoryResource(MemoryResource): self._mempool_owned = False self._is_imported = False + def create_ipc_channel(self): + """Create an IPC memory channel for sharing allocations.""" + channel = IPCChannel() + self.share_to_channel(channel) + return channel + @classmethod def from_shared_channel(cls, device_id: int | Device, channel: IPCChannel) -> DeviceMemoryResource: """Create a device memory resource from a memory pool shared over an IPC channel.""" diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index c800aae3e..5f42c35c1 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -72,3 +72,5 @@ def pop_all_contexts(): skipif_need_cuda_headers = pytest.mark.skipif(helpers.CUDA_INCLUDE_PATH is None, reason="need CUDA header") + + diff --git a/cuda_core/tests/ipc/test_ipc_mempool.py b/cuda_core/tests/ipc/test_ipc_mempool.py index 6b2dc1d7f..e6a1688b6 100644 --- a/cuda_core/tests/ipc/test_ipc_mempool.py +++ b/cuda_core/tests/ipc/test_ipc_mempool.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel +from cuda.core.experimental import Device from utility import IPCBufferTestHelper import multiprocessing import pytest @@ -13,12 +13,10 @@ def test_ipc_mempool(device, ipc_memory_resource): """Test IPC with memory pools.""" # Set up the IPC-enabled memory pool and share it. mr = ipc_memory_resource - channel = IPCChannel() - mr.share_to_channel(channel) + channel = mr.create_ipc_channel() # Start the child process. - queue = multiprocessing.Queue() - process = multiprocessing.Process(target=child_main, args=(channel, queue)) + process = multiprocessing.Process(target=child_main, args=(channel,)) process.start() # Allocate and fill memory. 
@@ -27,8 +25,7 @@ def test_ipc_mempool(device, ipc_memory_resource): helper.fill_buffer(flipped=False) # Export the buffer via IPC. - handle = buffer.export() - queue.put(handle) + channel.export(buffer) # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) @@ -38,14 +35,10 @@ def test_ipc_mempool(device, ipc_memory_resource): helper.verify_buffer(flipped=True) -def child_main(channel, queue): +def child_main(channel): device = Device() device.set_current() - - mr = DeviceMemoryResource.from_shared_channel(device, channel) - handle = queue.get() # Get exported buffer data - buffer = Buffer.import_(mr, handle) - + buffer = channel.import_() helper = IPCBufferTestHelper(device, buffer, NBYTES) helper.verify_buffer(flipped=False) helper.fill_buffer(flipped=True) From f2ea8c93fd0c215c56e408b5106eefbb32da1fa5 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Sep 2025 11:13:37 -0700 Subject: [PATCH 03/25] Simply the interface to IPCBufferTestHelper. --- cuda_core/tests/ipc/test_ipc_mempool.py | 4 ++-- cuda_core/tests/ipc/utility.py | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cuda_core/tests/ipc/test_ipc_mempool.py b/cuda_core/tests/ipc/test_ipc_mempool.py index e6a1688b6..25379b1ca 100644 --- a/cuda_core/tests/ipc/test_ipc_mempool.py +++ b/cuda_core/tests/ipc/test_ipc_mempool.py @@ -21,7 +21,7 @@ def test_ipc_mempool(device, ipc_memory_resource): # Allocate and fill memory. buffer = mr.allocate(NBYTES) - helper = IPCBufferTestHelper(device, buffer, NBYTES) + helper = IPCBufferTestHelper(device, buffer) helper.fill_buffer(flipped=False) # Export the buffer via IPC. 
@@ -39,6 +39,6 @@ def child_main(channel): device = Device() device.set_current() buffer = channel.import_() - helper = IPCBufferTestHelper(device, buffer, NBYTES) + helper = IPCBufferTestHelper(device, buffer) helper.verify_buffer(flipped=False) helper.fill_buffer(flipped=True) diff --git a/cuda_core/tests/ipc/utility.py b/cuda_core/tests/ipc/utility.py index f778578bd..3ca177dfe 100644 --- a/cuda_core/tests/ipc/utility.py +++ b/cuda_core/tests/ipc/utility.py @@ -41,18 +41,17 @@ class IPCBufferTestHelper: the expected values. """ - def __init__(self, device, buffer, nbytes): + def __init__(self, device, buffer): self.device = device self.buffer = buffer - self.nbytes = nbytes - self.scratch_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.nbytes) + self.scratch_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.buffer.size) self.stream = device.create_stream() def fill_buffer(self, flipped=False): """Fill a device buffer with test pattern using unified memory.""" ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) op = (lambda i: 255 - i) if flipped else (lambda i: i) - for i in range(self.nbytes): + for i in range(self.buffer.size): ptr[i] = ctypes.c_byte(op(i)) self.buffer.copy_from(self.scratch_buffer, stream=self.stream) self.device.sync() @@ -63,7 +62,7 @@ def verify_buffer(self, flipped=False): self.device.sync() ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) op = (lambda i: 255 - i) if flipped else (lambda i: i) - for i in range(self.nbytes): + for i in range(self.buffer.size): assert ctypes.c_byte(ptr[i]).value == ctypes.c_byte(op(i)).value, ( f"Buffer contains incorrect data at index {i}" ) From 827466ead2c17c06b0de0aa4d0f1fcf06fdfa406 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 19 Sep 2025 11:35:40 -0700 Subject: [PATCH 04/25] Adds more tests. 
--- cuda_core/cuda/core/experimental/_memory.pyx | 36 ++++-- .../tests/ipc/test_ipc_mempool_multiple.py | 53 +++++++++ .../ipc/test_ipc_shared_allocation_handle.py | 106 ++++++++++++++++++ cuda_core/tests/ipc/utility.py | 8 +- 4 files changed, 192 insertions(+), 11 deletions(-) create mode 100644 cuda_core/tests/ipc/test_ipc_mempool_multiple.py create mode 100644 cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index fcf40ab7e..b4def295d 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -152,6 +152,18 @@ cdef class Buffer: raise_if_driver_error(err) return Buffer.from_handle(ptr, ipc_buffer.size, mr) + def export_to_channel(self, channel: IPCChannel | Sequence[IPCChannel]): + seq = channel if isinstance(channel, collections.abc.Sequence) else [channel] + for ch in seq: + ch.export(self); + + @classmethod + def import_from_channel(cls, channel: IPCChannel | Sequence[IPCChannel]): + if isinstance(channel, collections.abc.Sequence): + return [ch.import_() for ch in channel] + else: + return channel.import_() + def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. 
@@ -467,6 +479,16 @@ cdef class IPCChannel: handle = self._queue.get() return Buffer.import_(self._mr, handle) + def send_allocation_handle(self, alloc_handle: IPCAllocationHandle): + """Sends over this channel an allocation handle for exporting a + shared memory pool.""" + self._proxy.send_allocation_handle(alloc_handle) + + def receive_allocation_handle(self) -> IPCAllocationHandle: + """Receives over this channel an allocation handle for importing a + shared memory pool.""" + return self._proxy.receive_allocation_handle() + cdef class IPCChannelUnixSocket: """Unix-specific channel for sharing memory pools over sockets.""" @@ -484,7 +506,7 @@ cdef class IPCChannelUnixSocket: self._sock_out, self._sock_in = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) return self - cpdef _send_allocation_handle(self, alloc_handle: IPCAllocationHandle): + cpdef send_allocation_handle(self, alloc_handle: IPCAllocationHandle): """Sends over this channel an allocation handle for exporting a shared memory pool.""" self._sock_out.sendmsg( @@ -492,7 +514,7 @@ cdef class IPCChannelUnixSocket: [(socket.SOL_SOCKET, socket.SCM_RIGHTS, array.array("i", [int(alloc_handle)]))] ) - cpdef IPCAllocationHandle _receive_allocation_handle(self): + cpdef IPCAllocationHandle receive_allocation_handle(self): """Receives over this channel an allocation handle for importing a shared memory pool.""" fds = array.array("i") @@ -692,11 +714,11 @@ class DeviceMemoryResource(MemoryResource): def from_shared_channel(cls, device_id: int | Device, channel: IPCChannel) -> DeviceMemoryResource: """Create a device memory resource from a memory pool shared over an IPC channel.""" device_id = getattr(device_id, 'device_id', device_id) - alloc_handle = channel._proxy._receive_allocation_handle() - return cls._from_allocation_handle(device_id, alloc_handle) + alloc_handle = channel.receive_allocation_handle() + return cls.from_allocation_handle(device_id, alloc_handle) @classmethod - def 
_from_allocation_handle(cls, device_id: int | Device, alloc_handle: IPCAllocationHandle) -> DeviceMemoryResource: + def from_allocation_handle(cls, device_id: int | Device, alloc_handle: IPCAllocationHandle) -> DeviceMemoryResource: """Create a device memory resource from an allocation handle. Construct a new `DeviceMemoryResource` instance that imports a memory @@ -734,9 +756,9 @@ class DeviceMemoryResource(MemoryResource): def share_to_channel(self, channel : IPCChannel): if not self.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") - channel._proxy._send_allocation_handle(self._get_allocation_handle()) + channel.send_allocation_handle(self.get_allocation_handle()) - def _get_allocation_handle(self) -> IPCAllocationHandle: + def get_allocation_handle(self) -> IPCAllocationHandle: """Export the memory pool handle to be shared (requires IPC). The handle can be used to share the memory pool with other processes. diff --git a/cuda_core/tests/ipc/test_ipc_mempool_multiple.py b/cuda_core/tests/ipc/test_ipc_mempool_multiple.py new file mode 100644 index 000000000..792966b95 --- /dev/null +++ b/cuda_core/tests/ipc/test_ipc_mempool_multiple.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import Device +from utility import IPCBufferTestHelper +import multiprocessing +import pytest + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 + + +def test_ipc_mempool_multiple(device, ipc_memory_resource): + """Test IPC with memory pools using multiple processes.""" + # Construct an IPC-enabled memory resource and share it over two channels. + mr = ipc_memory_resource + ch1, ch2 = (mr.create_ipc_channel() for _ in range(2)) + + # Allocate memory buffers and export them to each channel. 
+ buffer1 = mr.allocate(NBYTES) + ch1.export(buffer1) + ch2.export(buffer1) + buffer2 = mr.allocate(NBYTES) + ch1.export(buffer2) + ch2.export(buffer2) + + # Start the child processes. + p1 = multiprocessing.Process(target=child_main, args=(1, ch1)) + p2 = multiprocessing.Process(target=child_main, args=(2, ch2)) + p1.start() + p2.start() + + # Wait for the child processes. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify that the buffers were modified. + IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) + IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) + + +def child_main(idx, channel): + device = Device() + device.set_current() + buffer1 = channel.import_() # implicitly set up the shared memory pool + buffer2 = channel.import_() + if idx == 1: + IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=False) + elif idx == 2: + IPCBufferTestHelper(device, buffer2).fill_buffer(flipped=True) + diff --git a/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py b/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py new file mode 100644 index 000000000..323254150 --- /dev/null +++ b/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel +from utility import IPCBufferTestHelper +import multiprocessing +import pytest +from itertools import cycle + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 +NWORKERS = 2 +NTASKS = 2 + +def test_ipc_shared_allocation_handle(device, ipc_memory_resource): + """Demonstrate that a memory pool allocation handle can be reused for IPC + with multiple processes.""" + # Set up communication. 
+ ch1 = IPCChannel() + ch2 = IPCChannel() + q1 = multiprocessing.Queue() + q2 = multiprocessing.Queue() + + # Start children. + p1 = multiprocessing.Process(target=child_main, args=(1, ch1, q1)) + p2 = multiprocessing.Process(target=child_main, args=(2, ch2, q2)) + p1.start() + p2.start() + + # Set up the IPC-enabled memory pool and share it using one handle. + mr = ipc_memory_resource + alloc_handle = mr.get_allocation_handle() + ch1.send_allocation_handle(alloc_handle) + ch2.send_allocation_handle(alloc_handle) + + # Allocate a share memory. + buf1 = mr.allocate(NBYTES) + buf2 = mr.allocate(NBYTES) + q1.put(buf1.export()) + q2.put(buf2.export()) + + # Wait for children. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify results. + IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) + IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) + + +def child_main(idx, channel, queue): + """Fills a shared memory buffer.""" + device = Device() + device.set_current() + alloc_handle = channel.receive_allocation_handle() + mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + buffer_descriptor = queue.get() + buffer = Buffer.import_(mr, buffer_descriptor) + IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) + + +def test_ipc_shared_allocation_handle2(device, ipc_memory_resource): + """Demonstrate that a memory pool allocation handle can be reused for IPC + with multiple processes (simplified).""" + # Set up communication. + ch1 = IPCChannel() + ch2 = IPCChannel() + + # Start children. + p1 = multiprocessing.Process(target=child_main2, args=(1, ch1)) + p2 = multiprocessing.Process(target=child_main2, args=(2, ch2)) + p1.start() + p2.start() + + # Set up the IPC-enabled memory pool and share it using one handle. 
+ mr = ipc_memory_resource + alloc_handle = mr.get_allocation_handle() + ch1.send_allocation_handle(alloc_handle) + ch2.send_allocation_handle(alloc_handle) + + # Allocate a share memory. + buf1 = mr.allocate(NBYTES) + buf2 = mr.allocate(NBYTES) + ch1.export(buf1) + ch2.export(buf2) + + # Wait for children. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify results. + IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) + IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) + + +def child_main2(idx, channel): + """Fills a shared memory buffer.""" + device = Device() + device.set_current() + buffer = channel.import_() + IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) + diff --git a/cuda_core/tests/ipc/utility.py b/cuda_core/tests/ipc/utility.py index 3ca177dfe..781790a9d 100644 --- a/cuda_core/tests/ipc/utility.py +++ b/cuda_core/tests/ipc/utility.py @@ -47,23 +47,23 @@ def __init__(self, device, buffer): self.scratch_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.buffer.size) self.stream = device.create_stream() - def fill_buffer(self, flipped=False): + def fill_buffer(self, flipped=False, starting_from=0): """Fill a device buffer with test pattern using unified memory.""" ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) op = (lambda i: 255 - i) if flipped else (lambda i: i) for i in range(self.buffer.size): - ptr[i] = ctypes.c_byte(op(i)) + ptr[i] = ctypes.c_byte(op(starting_from + i)) self.buffer.copy_from(self.scratch_buffer, stream=self.stream) self.device.sync() - def verify_buffer(self, flipped=False): + def verify_buffer(self, flipped=False, starting_from=0): """Verify the buffer contents.""" self.scratch_buffer.copy_from(self.buffer, stream=self.stream) self.device.sync() ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) op = (lambda i: 255 - i) if 
flipped else (lambda i: i) for i in range(self.buffer.size): - assert ctypes.c_byte(ptr[i]).value == ctypes.c_byte(op(i)).value, ( + assert ctypes.c_byte(ptr[i]).value == ctypes.c_byte(op(starting_from + i)).value, ( f"Buffer contains incorrect data at index {i}" ) From 93d921738792632180612ac93948e2b9b9994303 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 19 Sep 2025 14:17:00 -0700 Subject: [PATCH 05/25] Removes sequence forms of certain function (exception behavior was unclear). Added a test for an error case. --- cuda_core/cuda/core/experimental/_memory.pyx | 24 ++++++-------------- cuda_core/tests/ipc/test_ipc_errors.py | 17 +++++++++++++- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index b4def295d..fd752b84f 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -14,7 +14,6 @@ from dataclasses import dataclass from typing import Optional, TypeVar, Union, TYPE_CHECKING import abc import array -import collections import cython import multiprocessing import os @@ -152,17 +151,12 @@ cdef class Buffer: raise_if_driver_error(err) return Buffer.from_handle(ptr, ipc_buffer.size, mr) - def export_to_channel(self, channel: IPCChannel | Sequence[IPCChannel]): - seq = channel if isinstance(channel, collections.abc.Sequence) else [channel] - for ch in seq: - ch.export(self); + def export_to_channel(self, channel: IPCChannel): + channel.export(self); @classmethod - def import_from_channel(cls, channel: IPCChannel | Sequence[IPCChannel]): - if isinstance(channel, collections.abc.Sequence): - return [ch.import_() for ch in channel] - else: - return channel.import_() + def import_from_channel(cls, channel: IPCChannel): + return channel.import_() def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. 
@@ -461,13 +455,9 @@ cdef class IPCChannel: self._queue = multiprocessing.Queue() self._mr = None - def export(self, buffer: Buffer | collections.abc.Sequence): - if not isinstance(buffer, collections.abc.Sequence): - buffer = [buffer] - - for buf in buffer: - handle = buf.export() - self._queue.put(handle) + def export(self, buffer: Buffer): + handle = buffer.export() + self._queue.put(handle) def import_(self, device: Optional[Device] = None): if self._mr is None: diff --git a/cuda_core/tests/ipc/test_ipc_errors.py b/cuda_core/tests/ipc/test_ipc_errors.py index 4ac3277e6..08cd41baf 100644 --- a/cuda_core/tests/ipc/test_ipc_errors.py +++ b/cuda_core/tests/ipc/test_ipc_errors.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from cuda.core.experimental._utils.cuda_utils import CUDAError from cuda.core.experimental import Device, DeviceMemoryResource, IPCChannel import multiprocessing import pytest @@ -8,7 +9,21 @@ CHILD_TIMEOUT_SEC = 10 NBYTES = 64 -def test_ipc_errors(device, ipc_memory_resource): +def test_share_to_wrong_channel(device, ipc_memory_resource): + mr1 = ipc_memory_resource + mr2 = DeviceMemoryResource(device, dict(ipc_enabled=True)) + + channel1 = mr1.create_ipc_channel() + buffer1 = mr1.allocate(NBYTES) + buffer2 = mr2.allocate(NBYTES) + + channel1.export(buffer1) # ok + + with pytest.raises(CUDAError): + channel1.export(buffer2) + + +def test_ipc_child_errors(device, ipc_memory_resource): """Test expected errors with allocating from a shared IPC memory pool.""" mr = ipc_memory_resource # Set up the IPC-enabled memory pool and share it. From 476349ee68e087ad729ce8c47940bf57101346ed Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 19 Sep 2025 14:21:04 -0700 Subject: [PATCH 06/25] Changes channel methods export/import_ to send_buffer/receive_buffer, for clarity. 
--- cuda_core/cuda/core/experimental/_memory.pyx | 4 ++-- cuda_core/tests/ipc/test_ipc_errors.py | 4 ++-- cuda_core/tests/ipc/test_ipc_mempool.py | 4 ++-- cuda_core/tests/ipc/test_ipc_mempool_multiple.py | 12 ++++++------ .../tests/ipc/test_ipc_shared_allocation_handle.py | 6 +++--- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index fd752b84f..c628aaf45 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -455,11 +455,11 @@ cdef class IPCChannel: self._queue = multiprocessing.Queue() self._mr = None - def export(self, buffer: Buffer): + def send_buffer(self, buffer: Buffer): handle = buffer.export() self._queue.put(handle) - def import_(self, device: Optional[Device] = None): + def receive_buffer(self, device: Optional[Device] = None): if self._mr is None: if device is None: from cuda.core.experimental._device import Device diff --git a/cuda_core/tests/ipc/test_ipc_errors.py b/cuda_core/tests/ipc/test_ipc_errors.py index 08cd41baf..710c87ffb 100644 --- a/cuda_core/tests/ipc/test_ipc_errors.py +++ b/cuda_core/tests/ipc/test_ipc_errors.py @@ -17,10 +17,10 @@ def test_share_to_wrong_channel(device, ipc_memory_resource): buffer1 = mr1.allocate(NBYTES) buffer2 = mr2.allocate(NBYTES) - channel1.export(buffer1) # ok + channel1.send_buffer(buffer1) # ok with pytest.raises(CUDAError): - channel1.export(buffer2) + channel1.send_buffer(buffer2) def test_ipc_child_errors(device, ipc_memory_resource): diff --git a/cuda_core/tests/ipc/test_ipc_mempool.py b/cuda_core/tests/ipc/test_ipc_mempool.py index 25379b1ca..582cbc823 100644 --- a/cuda_core/tests/ipc/test_ipc_mempool.py +++ b/cuda_core/tests/ipc/test_ipc_mempool.py @@ -25,7 +25,7 @@ def test_ipc_mempool(device, ipc_memory_resource): helper.fill_buffer(flipped=False) # Export the buffer via IPC. 
- channel.export(buffer) + channel.send_buffer(buffer) # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) @@ -38,7 +38,7 @@ def test_ipc_mempool(device, ipc_memory_resource): def child_main(channel): device = Device() device.set_current() - buffer = channel.import_() + buffer = channel.receive_buffer() helper = IPCBufferTestHelper(device, buffer) helper.verify_buffer(flipped=False) helper.fill_buffer(flipped=True) diff --git a/cuda_core/tests/ipc/test_ipc_mempool_multiple.py b/cuda_core/tests/ipc/test_ipc_mempool_multiple.py index 792966b95..5edcb6f3a 100644 --- a/cuda_core/tests/ipc/test_ipc_mempool_multiple.py +++ b/cuda_core/tests/ipc/test_ipc_mempool_multiple.py @@ -18,11 +18,11 @@ def test_ipc_mempool_multiple(device, ipc_memory_resource): # Allocate memory buffers and export them to each channel. buffer1 = mr.allocate(NBYTES) - ch1.export(buffer1) - ch2.export(buffer1) + ch1.send_buffer(buffer1) + ch2.send_buffer(buffer1) buffer2 = mr.allocate(NBYTES) - ch1.export(buffer2) - ch2.export(buffer2) + ch1.send_buffer(buffer2) + ch2.send_buffer(buffer2) # Start the child processes. 
p1 = multiprocessing.Process(target=child_main, args=(1, ch1)) @@ -44,8 +44,8 @@ def test_ipc_mempool_multiple(device, ipc_memory_resource): def child_main(idx, channel): device = Device() device.set_current() - buffer1 = channel.import_() # implicitly set up the shared memory pool - buffer2 = channel.import_() + buffer1 = channel.receive_buffer() # implicitly set up the shared memory pool + buffer2 = channel.receive_buffer() if idx == 1: IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=False) elif idx == 2: diff --git a/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py b/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py index 323254150..644052b24 100644 --- a/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py +++ b/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py @@ -83,8 +83,8 @@ def test_ipc_shared_allocation_handle2(device, ipc_memory_resource): # Allocate a share memory. buf1 = mr.allocate(NBYTES) buf2 = mr.allocate(NBYTES) - ch1.export(buf1) - ch2.export(buf2) + ch1.send_buffer(buf1) + ch2.send_buffer(buf2) # Wait for children. p1.join(timeout=CHILD_TIMEOUT_SEC) @@ -101,6 +101,6 @@ def child_main2(idx, channel): """Fills a shared memory buffer.""" device = Device() device.set_current() - buffer = channel.import_() + buffer = channel.receive_buffer() IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) From 2ed5be7bd926a0d3bf692183c391f4af03b276bf Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 23 Sep 2025 11:29:25 -0700 Subject: [PATCH 07/25] Implement serialization methods for Device, Buffer, and DeviceMemoryResource. Add tests for buffer IPC through serialization. 
--- cuda_core/cuda/core/experimental/_device.py | 10 ++ cuda_core/cuda/core/experimental/_memory.pyx | 43 +++++- cuda_core/tests/conftest.py | 2 - cuda_core/tests/ipc/test_ipc_errors.py | 60 --------- cuda_core/tests/ipc/test_ipc_mempool.py | 44 ------ .../tests/ipc/test_ipc_mempool_multiple.py | 53 -------- .../tests/{ipc => memory_ipc}/conftest.py | 4 +- .../test_channel.py} | 99 ++++++++++++-- cuda_core/tests/memory_ipc/test_errors.py | 81 +++++++++++ .../tests/memory_ipc/test_send_buffers.py | 70 ++++++++++ cuda_core/tests/memory_ipc/test_serialize.py | 127 ++++++++++++++++++ cuda_core/tests/memory_ipc/test_workerpool.py | 48 +++++++ .../tests/{ipc => memory_ipc}/utility.py | 3 +- cuda_core/tests/test_memory.py | 2 +- 14 files changed, 469 insertions(+), 177 deletions(-) delete mode 100644 cuda_core/tests/ipc/test_ipc_errors.py delete mode 100644 cuda_core/tests/ipc/test_ipc_mempool.py delete mode 100644 cuda_core/tests/ipc/test_ipc_mempool_multiple.py rename cuda_core/tests/{ipc => memory_ipc}/conftest.py (100%) rename cuda_core/tests/{ipc/test_ipc_shared_allocation_handle.py => memory_ipc/test_channel.py} (54%) create mode 100644 cuda_core/tests/memory_ipc/test_errors.py create mode 100644 cuda_core/tests/memory_ipc/test_send_buffers.py create mode 100644 cuda_core/tests/memory_ipc/test_serialize.py create mode 100644 cuda_core/tests/memory_ipc/test_workerpool.py rename cuda_core/tests/{ipc => memory_ipc}/utility.py (99%) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 0499baa58..d94e44b5b 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -1160,6 +1160,16 @@ def __int__(self): def __repr__(self): return f"" + def __reduce__(self): + return Device._reconstruct, (self.device_id,) + + @staticmethod + def _reconstruct(device_id): + device = Device(device_id) + if not device._has_inited: + device.set_current() + return device + def 
set_current(self, ctx: Context = None) -> Union[Context, None]: """Set device to be used for GPU executions. diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index c628aaf45..7f5b9e54d 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -16,8 +16,10 @@ import abc import array import cython import multiprocessing +import multiprocessing.reduction import os import platform +import sys import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream @@ -28,7 +30,7 @@ if platform.system() == "Linux": if TYPE_CHECKING: import cuda.bindings.driver - from cuda.core.experimental._device import Device + from ._device import Device # TODO: define a memory property mixin class and make Buffer and # MemoryResource both inherit from it @@ -72,6 +74,13 @@ cdef class Buffer: def __del__(self): self.close() + def __reduce__(self): + return Buffer._reconstruct, (self.memory_resource, self.export()) + + @staticmethod + def _reconstruct(mr, desc): + return Buffer.import_(mr, desc) + cpdef close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. 
@@ -427,6 +436,15 @@ cdef class IPCAllocationHandle: """Close the handle.""" self.close() + def __reduce__(self): + df = multiprocessing.reduction.DupFd(self.handle) + return IPCAllocationHandle._reconstruct, (df,) + + @staticmethod + def _reconstruct(df): + self = IPCAllocationHandle._init(df.detach()) + return self + def __int__(self) -> int: if self._handle < 0: raise ValueError( @@ -434,6 +452,11 @@ cdef class IPCAllocationHandle: ) return self._handle + def detach(self): + handle = self._handle + self._handle = -1 + return handle + @property def handle(self) -> int: return self._handle @@ -462,7 +485,7 @@ cdef class IPCChannel: def receive_buffer(self, device: Optional[Device] = None): if self._mr is None: if device is None: - from cuda.core.experimental._device import Device + from ._device import Device device = Device() self._mr = DeviceMemoryResource.from_shared_channel(device, self) @@ -678,6 +701,9 @@ class DeviceMemoryResource(MemoryResource): err, self._mempool_handle = driver.cuMemPoolCreate(properties) raise_if_driver_error(err) + if opts.ipc_enabled: + self.get_allocation_handle() # enables Buffer.export + def __del__(self): self.close() @@ -694,6 +720,18 @@ class DeviceMemoryResource(MemoryResource): self._mempool_owned = False self._is_imported = False + def __reduce__(self): + from ._device import Device + device = Device(self.device_id) + alloc_handle = self.get_allocation_handle() + df = multiprocessing.reduction.DupFd(alloc_handle.detach()) + return DeviceMemoryResource._reconstruct, (device, df) + + @staticmethod + def _reconstruct(device, df): + alloc_handle = IPCAllocationHandle._init(df.detach()) + return DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + def create_ipc_channel(self): """Create an IPC memory channel for sharing allocations.""" channel = IPCChannel() @@ -740,7 +778,6 @@ class DeviceMemoryResource(MemoryResource): err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), 
_IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) - return self def share_to_channel(self, channel : IPCChannel): diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 5f42c35c1..c800aae3e 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -72,5 +72,3 @@ def pop_all_contexts(): skipif_need_cuda_headers = pytest.mark.skipif(helpers.CUDA_INCLUDE_PATH is None, reason="need CUDA header") - - diff --git a/cuda_core/tests/ipc/test_ipc_errors.py b/cuda_core/tests/ipc/test_ipc_errors.py deleted file mode 100644 index 710c87ffb..000000000 --- a/cuda_core/tests/ipc/test_ipc_errors.py +++ /dev/null @@ -1,60 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from cuda.core.experimental._utils.cuda_utils import CUDAError -from cuda.core.experimental import Device, DeviceMemoryResource, IPCChannel -import multiprocessing -import pytest - -CHILD_TIMEOUT_SEC = 10 -NBYTES = 64 - -def test_share_to_wrong_channel(device, ipc_memory_resource): - mr1 = ipc_memory_resource - mr2 = DeviceMemoryResource(device, dict(ipc_enabled=True)) - - channel1 = mr1.create_ipc_channel() - buffer1 = mr1.allocate(NBYTES) - buffer2 = mr2.allocate(NBYTES) - - channel1.send_buffer(buffer1) # ok - - with pytest.raises(CUDAError): - channel1.send_buffer(buffer2) - - -def test_ipc_child_errors(device, ipc_memory_resource): - """Test expected errors with allocating from a shared IPC memory pool.""" - mr = ipc_memory_resource - # Set up the IPC-enabled memory pool and share it. - channel = IPCChannel() - mr.share_to_channel(channel) - - # Start a child process to generate error info. - queue = multiprocessing.Queue() - process = multiprocessing.Process(target=child_main, args=(channel, queue)) - process.start() - - # Check the errors. 
- exc_type, exc_msg = queue.get(timeout=CHILD_TIMEOUT_SEC) - assert exc_type is TypeError - assert exc_msg == "Cannot allocate from shared memory pool imported via IPC" - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - -def child_main(channel, queue): - """Child process that pushes IPC errors to a shared queue for testing.""" - device = Device() - device.set_current() - - mr = DeviceMemoryResource.from_shared_channel(device, channel) - - # Allocating from an imported pool. - try: - mr.allocate(NBYTES) - except Exception as e: - exc_info = type(e), str(e) - queue.put(exc_info) diff --git a/cuda_core/tests/ipc/test_ipc_mempool.py b/cuda_core/tests/ipc/test_ipc_mempool.py deleted file mode 100644 index 582cbc823..000000000 --- a/cuda_core/tests/ipc/test_ipc_mempool.py +++ /dev/null @@ -1,44 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from cuda.core.experimental import Device -from utility import IPCBufferTestHelper -import multiprocessing -import pytest - -CHILD_TIMEOUT_SEC = 10 -NBYTES = 64 - -def test_ipc_mempool(device, ipc_memory_resource): - """Test IPC with memory pools.""" - # Set up the IPC-enabled memory pool and share it. - mr = ipc_memory_resource - channel = mr.create_ipc_channel() - - # Start the child process. - process = multiprocessing.Process(target=child_main, args=(channel,)) - process.start() - - # Allocate and fill memory. - buffer = mr.allocate(NBYTES) - helper = IPCBufferTestHelper(device, buffer) - helper.fill_buffer(flipped=False) - - # Export the buffer via IPC. - channel.send_buffer(buffer) - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - # Verify that the buffer was modified. 
- helper.verify_buffer(flipped=True) - - -def child_main(channel): - device = Device() - device.set_current() - buffer = channel.receive_buffer() - helper = IPCBufferTestHelper(device, buffer) - helper.verify_buffer(flipped=False) - helper.fill_buffer(flipped=True) diff --git a/cuda_core/tests/ipc/test_ipc_mempool_multiple.py b/cuda_core/tests/ipc/test_ipc_mempool_multiple.py deleted file mode 100644 index 5edcb6f3a..000000000 --- a/cuda_core/tests/ipc/test_ipc_mempool_multiple.py +++ /dev/null @@ -1,53 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from cuda.core.experimental import Device -from utility import IPCBufferTestHelper -import multiprocessing -import pytest - -CHILD_TIMEOUT_SEC = 10 -NBYTES = 64 - - -def test_ipc_mempool_multiple(device, ipc_memory_resource): - """Test IPC with memory pools using multiple processes.""" - # Construct an IPC-enabled memory resource and share it over two channels. - mr = ipc_memory_resource - ch1, ch2 = (mr.create_ipc_channel() for _ in range(2)) - - # Allocate memory buffers and export them to each channel. - buffer1 = mr.allocate(NBYTES) - ch1.send_buffer(buffer1) - ch2.send_buffer(buffer1) - buffer2 = mr.allocate(NBYTES) - ch1.send_buffer(buffer2) - ch2.send_buffer(buffer2) - - # Start the child processes. - p1 = multiprocessing.Process(target=child_main, args=(1, ch1)) - p2 = multiprocessing.Process(target=child_main, args=(2, ch2)) - p1.start() - p2.start() - - # Wait for the child processes. - p1.join(timeout=CHILD_TIMEOUT_SEC) - p2.join(timeout=CHILD_TIMEOUT_SEC) - assert p1.exitcode == 0 - assert p2.exitcode == 0 - - # Verify that the buffers were modified. 
- IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) - IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) - - -def child_main(idx, channel): - device = Device() - device.set_current() - buffer1 = channel.receive_buffer() # implicitly set up the shared memory pool - buffer2 = channel.receive_buffer() - if idx == 1: - IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=False) - elif idx == 2: - IPCBufferTestHelper(device, buffer2).fill_buffer(flipped=True) - diff --git a/cuda_core/tests/ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py similarity index 100% rename from cuda_core/tests/ipc/conftest.py rename to cuda_core/tests/memory_ipc/conftest.py index 2ac6d858b..39f787eb0 100644 --- a/cuda_core/tests/ipc/conftest.py +++ b/cuda_core/tests/memory_ipc/conftest.py @@ -2,10 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 import pytest + from cuda.core.experimental import Device, DeviceMemoryResource POOL_SIZE = 2097152 + @pytest.fixture(scope="function") def device(): """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" @@ -29,5 +31,3 @@ def ipc_memory_resource(device): mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) assert mr.is_ipc_enabled return mr - - diff --git a/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py b/cuda_core/tests/memory_ipc/test_channel.py similarity index 54% rename from cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py rename to cuda_core/tests/memory_ipc/test_channel.py index 644052b24..c118bc122 100644 --- a/cuda_core/tests/ipc/test_ipc_shared_allocation_handle.py +++ b/cuda_core/tests/memory_ipc/test_channel.py @@ -1,17 +1,95 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel -from utility import IPCBufferTestHelper import multiprocessing -import pytest -from itertools import cycle + +from utility import IPCBufferTestHelper + +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel CHILD_TIMEOUT_SEC = 10 NBYTES = 64 NWORKERS = 2 NTASKS = 2 + +def test_ipc_mempool(device, ipc_memory_resource): + """Test IPC with memory pools.""" + # Set up the IPC-enabled memory pool and share it. + mr = ipc_memory_resource + channel = mr.create_ipc_channel() + + # Start the child process. + process = multiprocessing.Process(target=child_main1, args=(channel,)) + process.start() + + # Allocate and fill memory. + buffer = mr.allocate(NBYTES) + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=False) + + # Export the buffer via IPC. + channel.send_buffer(buffer) + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + # Verify that the buffer was modified. + helper.verify_buffer(flipped=True) + + +def child_main1(channel): + device = Device() + device.set_current() + buffer = channel.receive_buffer() + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=False) + helper.fill_buffer(flipped=True) + + +def test_ipc_mempool_multiple(device, ipc_memory_resource): + """Test IPC with memory pools using multiple processes.""" + # Construct an IPC-enabled memory resource and share it over two channels. + mr = ipc_memory_resource + ch1, ch2 = (mr.create_ipc_channel() for _ in range(2)) + + # Allocate memory buffers and export them to each channel. + buffer1 = mr.allocate(NBYTES) + ch1.send_buffer(buffer1) + ch2.send_buffer(buffer1) + buffer2 = mr.allocate(NBYTES) + ch1.send_buffer(buffer2) + ch2.send_buffer(buffer2) + + # Start the child processes. 
+ p1 = multiprocessing.Process(target=child_main2, args=(1, ch1)) + p2 = multiprocessing.Process(target=child_main2, args=(2, ch2)) + p1.start() + p2.start() + + # Wait for the child processes. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify that the buffers were modified. + IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) + IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) + + +def child_main2(idx, channel): + device = Device() + device.set_current() + buffer1 = channel.receive_buffer() # implicitly set up the shared memory pool + buffer2 = channel.receive_buffer() + if idx == 1: + IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=False) + elif idx == 2: + IPCBufferTestHelper(device, buffer2).fill_buffer(flipped=True) + + def test_ipc_shared_allocation_handle(device, ipc_memory_resource): """Demonstrate that a memory pool allocation handle can be reused for IPC with multiple processes.""" @@ -22,8 +100,8 @@ def test_ipc_shared_allocation_handle(device, ipc_memory_resource): q2 = multiprocessing.Queue() # Start children. - p1 = multiprocessing.Process(target=child_main, args=(1, ch1, q1)) - p2 = multiprocessing.Process(target=child_main, args=(2, ch2, q2)) + p1 = multiprocessing.Process(target=child_main3, args=(1, ch1, q1)) + p2 = multiprocessing.Process(target=child_main3, args=(2, ch2, q2)) p1.start() p2.start() @@ -50,7 +128,7 @@ def test_ipc_shared_allocation_handle(device, ipc_memory_resource): IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) -def child_main(idx, channel, queue): +def child_main3(idx, channel, queue): """Fills a shared memory buffer.""" device = Device() device.set_current() @@ -69,8 +147,8 @@ def test_ipc_shared_allocation_handle2(device, ipc_memory_resource): ch2 = IPCChannel() # Start children. 
- p1 = multiprocessing.Process(target=child_main2, args=(1, ch1)) - p2 = multiprocessing.Process(target=child_main2, args=(2, ch2)) + p1 = multiprocessing.Process(target=child_main4, args=(1, ch1)) + p2 = multiprocessing.Process(target=child_main4, args=(2, ch2)) p1.start() p2.start() @@ -97,10 +175,9 @@ def test_ipc_shared_allocation_handle2(device, ipc_memory_resource): IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) -def child_main2(idx, channel): +def child_main4(idx, channel): """Fills a shared memory buffer.""" device = Device() device.set_current() buffer = channel.receive_buffer() IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) - diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py new file mode 100644 index 000000000..b8bb9bd4b --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import multiprocessing + +from cuda.core.experimental import Buffer, DeviceMemoryResource +from cuda.core.experimental._utils.cuda_utils import CUDAError + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 +POOL_SIZE = 2097152 + + +class ChildErrorHarness: + """Test harness for checking errors in child processes. Subclasses override + PARENT_ACTION, CHILD_ACTION, and ASSERT (see below for examples).""" + + def test_main(self, device, ipc_memory_resource): + """Parent process that checks child errors.""" + # Attach fixtures to this object for convenience. These can be accessed + # from PARENT_ACTION. + self.device = device + self.mr = ipc_memory_resource + + # Start a child process to generate error info. + pipe = [multiprocessing.Queue() for _ in range(2)] + process = multiprocessing.Process(target=self.child_main, args=(pipe,)) + process.start() + + # Interact. + self.PARENT_ACTION(pipe[0]) + + # Check the error. 
+ exc_type, exc_msg = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) + self.ASSERT(exc_type, exc_msg) + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + def child_main(self, pipe): + """Child process that pushes IPC errors to a shared pipe for testing.""" + try: + self.CHILD_ACTION(pipe[0]) + except Exception as e: + exc_info = type(e), str(e) + else: + exc_info = None, None + pipe[1].put(exc_info) + + +class TestAllocFromImportedMr(ChildErrorHarness): + """Error when attempting to allocate from an import memory resource.""" + + def PARENT_ACTION(self, queue): + queue.put(self.mr) + + def CHILD_ACTION(self, queue): + mr = queue.get() + mr.allocate(NBYTES) + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is TypeError + assert exc_msg == "Cannot allocate from shared memory pool imported via IPC" + + +class TestImportWrongMR(ChildErrorHarness): + """Error when importing a buffer from the wrong memory resource.""" + + def PARENT_ACTION(self, queue): + mr2 = DeviceMemoryResource(self.device, dict(max_size=POOL_SIZE, ipc_enabled=True)) + buffer = mr2.allocate(NBYTES) + queue.put([self.mr, buffer.export()]) # Note: mr does not own this buffer + + def CHILD_ACTION(self, queue): + mr, buffer_desc = queue.get() + Buffer.import_(mr, buffer_desc) + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is CUDAError + assert "CUDA_ERROR_INVALID_VALUE" in exc_msg diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py new file mode 100644 index 000000000..e835e53f3 --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import Device, DeviceMemoryResource +from itertools import cycle +from utility import IPCBufferTestHelper +import multiprocessing + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 +NMRS = 3 +NTASKS = 7 +POOL_SIZE = 2097152 + +def test_ipc_send_buffers(device, ipc_memory_resource): + """Test passing buffers directly to a child separately from a memory resource.""" + mr = ipc_memory_resource + + # Allocate and fill memory. + buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] + for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=False) + + # Start the child process. Send the buffer directly. + process = multiprocessing.Process(target=child_main, args=(buffers,)) + process.start() + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + # Verify that the buffers were modified. + for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=True) + +def test_ipc_send_buffers_multi(device, ipc_memory_resource): + """Test passing buffers sourced from multiple memory resources.""" + # Set up several IPC-enabled memory pools. + mrs = [ipc_memory_resource] + [ + DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) + ] + + # Allocate and fill memory. + buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] + for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=False) + + # Start the child process. + process = multiprocessing.Process(target=child_main, args=(buffers,)) + process.start() + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + # Verify that the buffers were modified. 
+ for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=True) + + +def child_main(buffers): + device = Device() + for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=False) + helper.fill_buffer(flipped=True) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py new file mode 100644 index 000000000..f12489dea --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -0,0 +1,127 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import multiprocessing + +import pytest +from utility import IPCBufferTestHelper + +from cuda.core.experimental import Buffer, DeviceMemoryResource + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 +POOL_SIZE = 2097152 + + +class TestObjectSerialization: + @pytest.mark.parametrize("use_alloc_handle", [True, False]) + def test_main(self, use_alloc_handle, device, ipc_memory_resource): + """Test sending IPC memory objects to a child through a queue.""" + mr = ipc_memory_resource + + # Start the child process. + pipe = [multiprocessing.Queue() for _ in range(2)] + process = multiprocessing.Process(target=self.child_main, args=(pipe, use_alloc_handle)) + process.start() + + # Send a device description. + pipe[0].put(device) + device_id = pipe[1].get() + assert device_id == device.device_id + + # Send a memory resource directly or by allocation handle. + # Note: there is no apparent way to check the ID between processes. + if use_alloc_handle: + # Send MR by a handle. + alloc_handle = mr.get_allocation_handle() + pipe[0].put(alloc_handle) + else: + # Send MR directly. + pipe[0].put(mr) + + # Send a buffer. 
+ buffer = mr.allocate(NBYTES) + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=False) + pipe[0].put(buffer) + pipe[1].get() # signal done + helper.verify_buffer(flipped=True) + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + def child_main(self, pipe, use_alloc_handle): + # Device. + device = pipe[0].get() + pipe[1].put(device.device_id) + + # Memory resource. + if use_alloc_handle: + alloc_handle = pipe[0].get() + mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + else: + mr = pipe[0].get() + + # Buffer. + buffer = pipe[0].get() + assert buffer.memory_resource.handle == mr.handle + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=False) + helper.fill_buffer(flipped=True) + pipe[1].put(None) + + +def test_object_passing(device, ipc_memory_resource): + """Test sending objects as arguments when starting a process.""" + # Define the objects. + mr = ipc_memory_resource + alloc_handle = mr.get_allocation_handle() + buffer = mr.allocate(NBYTES) + buffer_desc = buffer.export() + + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=False) + + # Start the child process. + process = multiprocessing.Process(target=child_main, args=(device, alloc_handle, mr, buffer_desc, buffer)) + process.start() + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + helper.verify_buffer(flipped=True) + + +def child_main(device, alloc_handle, mr1, buffer_desc, buffer1): + mr2 = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + + # OK to build the buffer from either mr and descriptor. + # These all point to the same buffer. 
+ buffer2 = Buffer.import_(mr1, buffer_desc) + buffer3 = Buffer.import_(mr2, buffer_desc) + + helper1 = IPCBufferTestHelper(device, buffer1) + helper2 = IPCBufferTestHelper(device, buffer2) + helper3 = IPCBufferTestHelper(device, buffer3) + + helper1.verify_buffer(flipped=False) + helper2.verify_buffer(flipped=False) + helper3.verify_buffer(flipped=False) + + helper1.fill_buffer(flipped=True) + + helper1.verify_buffer(flipped=True) + helper2.verify_buffer(flipped=True) + helper3.verify_buffer(flipped=True) + + helper2.fill_buffer(flipped=False) + + helper1.verify_buffer(flipped=False) + helper2.verify_buffer(flipped=False) + helper3.verify_buffer(flipped=False) + + helper3.fill_buffer(flipped=True) + + helper1.verify_buffer(flipped=True) + helper2.verify_buffer(flipped=True) + helper3.verify_buffer(flipped=True) diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py new file mode 100644 index 000000000..2dc29da8b --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import multiprocessing +from itertools import cycle + +from utility import IPCBufferTestHelper + +from cuda.core.experimental import Device, DeviceMemoryResource + +CHILD_TIMEOUT_SEC = 10 +NBYTES = 64 +NWORKERS = 2 +NMRS = 3 +NTASKS = 20 +POOL_SIZE = 2097152 + + +def test_ipc_workerpool(device, ipc_memory_resource): + """Test IPC with a worker pool.""" + mr = ipc_memory_resource + buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] + with multiprocessing.Pool(processes=NWORKERS) as pool: + pool.map(process_buffer, buffers) + + for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=True) + + +def test_ipc_workerpool_multi_mr(device, ipc_memory_resource): + """Test IPC with a worker pool using multiple memory resources.""" + mrs = [ipc_memory_resource] + [ + DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) + ] + buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] + with multiprocessing.Pool(processes=NWORKERS) as pool: + pool.map(process_buffer, buffers) + + for buffer in buffers: + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=True) + + +def process_buffer(buffer): + device = Device() + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=True) diff --git a/cuda_core/tests/ipc/utility.py b/cuda_core/tests/memory_ipc/utility.py similarity index 99% rename from cuda_core/tests/ipc/utility.py rename to cuda_core/tests/memory_ipc/utility.py index 781790a9d..766188d10 100644 --- a/cuda_core/tests/ipc/utility.py +++ b/cuda_core/tests/memory_ipc/utility.py @@ -7,9 +7,11 @@ from cuda import cuda as driver import ctypes + from cuda.core.experimental import Buffer, MemoryResource from cuda.core.experimental._utils.cuda_utils import handle_return + class DummyUnifiedMemoryResource(MemoryResource): def __init__(self, device): self.device = device @@ -66,4 +68,3 @@ 
def verify_buffer(self, flipped=False, starting_from=0): assert ctypes.c_byte(ptr[i]).value == ctypes.c_byte(op(starting_from + i)).value, ( f"Buffer contains incorrect data at index {i}" ) - diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index c14de8585..497ed751e 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -348,7 +348,7 @@ def test_mempool(mempool_device): ipc_error_msg = "Memory resource is not IPC-enabled" with pytest.raises(RuntimeError, match=ipc_error_msg): - mr._get_allocation_handle() + mr.get_allocation_handle() with pytest.raises(RuntimeError, match=ipc_error_msg): buffer.export() From 7f7f80fe2ecdf3f8a6eb80f79882aede77f072f5 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 24 Sep 2025 14:49:51 -0700 Subject: [PATCH 08/25] Protects serialization where needed to avoid resource leaks. Adds a registry from imported memory resources so that buffers can be serialized using an mr key. Test updates. --- cuda_core/cuda/core/experimental/_memory.pyx | 85 ++++++++--- cuda_core/tests/memory_ipc/test_channel.py | 4 +- cuda_core/tests/memory_ipc/test_errors.py | 45 +++++- .../tests/memory_ipc/test_send_buffers.py | 10 +- cuda_core/tests/memory_ipc/test_serialize.py | 135 +++++++++++++----- cuda_core/tests/memory_ipc/test_workerpool.py | 107 ++++++++++---- 6 files changed, 290 insertions(+), 96 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 7f5b9e54d..4dc5c52e5 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -16,6 +16,7 @@ import abc import array import cython import multiprocessing +import multiprocessing.context import multiprocessing.reduction import os import platform @@ -437,7 +438,9 @@ cdef class IPCAllocationHandle: self.close() def __reduce__(self): - df = multiprocessing.reduction.DupFd(self.handle) + multiprocessing.context.assert_spawning(self) + 
fd = os.dup(self.handle) + df = multiprocessing.reduction.DupFd(fd) return IPCAllocationHandle._reconstruct, (df,) @staticmethod @@ -617,6 +620,12 @@ class DeviceMemoryResourceAttributes: del mempool_property +# Holds DeviceMemoryResource objects imported by this process. +# This enables buffer serialization, as buffers can reduce to a pair +# of comprising the memory resource `remote_id` (the key into this registry) +# and the serialized buffer descriptor. +_ipc_registry = {} + class DeviceMemoryResource(MemoryResource): """Create a device memory resource managing a stream-ordered memory pool. @@ -640,7 +649,7 @@ class DeviceMemoryResource(MemoryResource): device memory resource does not own the pool (`is_handle_owned` is `False`), and closing the resource has no effect. """ - __slots__ = "_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", "_mempool_owned", "_is_imported" + __slots__ = "_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", "_mempool_owned", "_is_imported", "_remote_id" def __init__(self, device_id: int | Device, options=None): device_id = getattr(device_id, 'device_id', device_id) @@ -656,6 +665,7 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = _NOIPC_HANDLE_TYPE self._mempool_owned = False self._is_imported = False + self._remote_id = None err, self._mempool_handle = driver.cuDeviceGetMemPool(self.device_id) raise_if_driver_error(err) @@ -697,6 +707,7 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = properties.handleTypes self._mempool_owned = True self._is_imported = False + self._remote_id = None err, self._mempool_handle = driver.cuMemPoolCreate(properties) raise_if_driver_error(err) @@ -709,28 +720,57 @@ class DeviceMemoryResource(MemoryResource): def close(self): """Close the device memory resource and destroy the associated memory pool if owned.""" - if self._mempool_handle is not None and self._mempool_owned: - err, = driver.cuMemPoolDestroy(self._mempool_handle) - 
raise_if_driver_error(err) + if self._mempool_handle is not None: + try: + if self._mempool_owned: + err, = driver.cuMemPoolDestroy(self._mempool_handle) + raise_if_driver_error(err) + finally: + self._dev_id = None + self._mempool_handle = None + self._attributes = None + self._ipc_handle_type = _NOIPC_HANDLE_TYPE + self._mempool_owned = False + self._is_imported = False + self._remote_id = None - self._dev_id = None - self._mempool_handle = None - self._attributes = None - self._ipc_handle_type = _NOIPC_HANDLE_TYPE - self._mempool_owned = False - self._is_imported = False def __reduce__(self): - from ._device import Device - device = Device(self.device_id) - alloc_handle = self.get_allocation_handle() - df = multiprocessing.reduction.DupFd(alloc_handle.detach()) - return DeviceMemoryResource._reconstruct, (device, df) + # If spawning a new process, serialize the resources; otherwise, just + # send the remote_id, using the registry on the receiving end. + is_spawning = multiprocessing.context.get_spawning_popen() is not None + if is_spawning: + from ._device import Device + device = Device(self.device_id) + alloc_handle = self.get_allocation_handle() + return DeviceMemoryResource._reconstruct, (device, alloc_handle, self.remote_id) + else: + return DeviceMemoryResource.from_registry, (self.remote_id,) + + @staticmethod + def _reconstruct(device, alloc_handle, remote_id): + self = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + self.register(remote_id) + return self @staticmethod - def _reconstruct(device, df): - alloc_handle = IPCAllocationHandle._init(df.detach()) - return DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + def from_registry(remote_id): + try: + return _ipc_registry[remote_id] + except KeyError: + raise RuntimeError(f"Memory resource with {remote_id=} was not found") + + def register(self, remote_id: int): + if remote_id not in _ipc_registry: + assert self._remote_id is None or self._remote_id == remote_id + 
_ipc_registry[remote_id] = self + self._remote_id = remote_id + + @property + def remote_id(self): + if self._remote_id is None and not self._is_imported: + self._remote_id = int(self._mempool_handle) + return self._remote_id def create_ipc_channel(self): """Create an IPC memory channel for sharing allocations.""" @@ -746,7 +786,7 @@ class DeviceMemoryResource(MemoryResource): return cls.from_allocation_handle(device_id, alloc_handle) @classmethod - def from_allocation_handle(cls, device_id: int | Device, alloc_handle: IPCAllocationHandle) -> DeviceMemoryResource: + def from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: """Create a device memory resource from an allocation handle. Construct a new `DeviceMemoryResource` instance that imports a memory @@ -759,7 +799,7 @@ class DeviceMemoryResource(MemoryResource): The ID of the device or a Device object for which the memory resource is created. - alloc_handle : int + alloc_handle : int | IPCAllocationHandle The shareable handle of the device memory resource to import. 
Returns @@ -775,6 +815,7 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = _IPC_HANDLE_TYPE self._mempool_owned = True self._is_imported = True + self._remote_id = None err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) @@ -797,6 +838,8 @@ class DeviceMemoryResource(MemoryResource): """ if not self.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") + if self._is_imported: + raise RuntimeError("Imported memory resource cannot be exported") err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) return IPCAllocationHandle._init(alloc_handle) diff --git a/cuda_core/tests/memory_ipc/test_channel.py b/cuda_core/tests/memory_ipc/test_channel.py index c118bc122..49e8f3be2 100644 --- a/cuda_core/tests/memory_ipc/test_channel.py +++ b/cuda_core/tests/memory_ipc/test_channel.py @@ -7,7 +7,7 @@ from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel -CHILD_TIMEOUT_SEC = 10 +CHILD_TIMEOUT_SEC = 4 NBYTES = 64 NWORKERS = 2 NTASKS = 2 @@ -134,7 +134,7 @@ def child_main3(idx, channel, queue): device.set_current() alloc_handle = channel.receive_allocation_handle() mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - buffer_descriptor = queue.get() + buffer_descriptor = queue.get(timeout=CHILD_TIMEOUT_SEC) buffer = Buffer.import_(mr, buffer_descriptor) IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index b8bb9bd4b..a6003d19b 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -6,7 +6,7 @@ from cuda.core.experimental import Buffer, DeviceMemoryResource from cuda.core.experimental._utils.cuda_utils import CUDAError -CHILD_TIMEOUT_SEC = 10 +CHILD_TIMEOUT_SEC = 4 NBYTES 
= 64 POOL_SIZE = 2097152 @@ -24,7 +24,7 @@ def test_main(self, device, ipc_memory_resource): # Start a child process to generate error info. pipe = [multiprocessing.Queue() for _ in range(2)] - process = multiprocessing.Process(target=self.child_main, args=(pipe,)) + process = multiprocessing.Process(target=self.child_main, args=(pipe, self.device, self.mr)) process.start() # Interact. @@ -38,8 +38,10 @@ def test_main(self, device, ipc_memory_resource): process.join(timeout=CHILD_TIMEOUT_SEC) assert process.exitcode == 0 - def child_main(self, pipe): + def child_main(self, pipe, device, mr): """Child process that pushes IPC errors to a shared pipe for testing.""" + self.device = device + self.mr = mr try: self.CHILD_ACTION(pipe[0]) except Exception as e: @@ -56,7 +58,7 @@ def PARENT_ACTION(self, queue): queue.put(self.mr) def CHILD_ACTION(self, queue): - mr = queue.get() + mr = queue.get(timeout=CHILD_TIMEOUT_SEC) mr.allocate(NBYTES) def ASSERT(self, exc_type, exc_msg): @@ -73,9 +75,42 @@ def PARENT_ACTION(self, queue): queue.put([self.mr, buffer.export()]) # Note: mr does not own this buffer def CHILD_ACTION(self, queue): - mr, buffer_desc = queue.get() + mr, buffer_desc = queue.get(timeout=CHILD_TIMEOUT_SEC) Buffer.import_(mr, buffer_desc) def ASSERT(self, exc_type, exc_msg): assert exc_type is CUDAError assert "CUDA_ERROR_INVALID_VALUE" in exc_msg + + +class TestExportImportedMR(ChildErrorHarness): + """Error when exporting a memory resource that was imported.""" + + def PARENT_ACTION(self, queue): + queue.put(self.mr) + + def CHILD_ACTION(self, queue): + mr = queue.get(timeout=CHILD_TIMEOUT_SEC) + mr.get_allocation_handle() + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is RuntimeError + assert exc_msg == "Imported memory resource cannot be exported" + + +class TestImportBuffer(ChildErrorHarness): + """Error when using a buffer as a buffer descriptor.""" + + def PARENT_ACTION(self, queue): + # Note: if the buffer is not attached to something to 
prolong its life, + # CUDA_ERROR_INVALID_CONTEXT is raised from Buffer.__del__ + self.buffer = self.mr.allocate(NBYTES) + queue.put(self.buffer) + + def CHILD_ACTION(self, queue): + buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) + Buffer.import_(self.mr, buffer) + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is TypeError + assert exc_msg.startswith("Argument 'ipc_buffer' has incorrect type") diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index e835e53f3..496f32553 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -1,17 +1,20 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental import Device, DeviceMemoryResource +import multiprocessing from itertools import cycle + from utility import IPCBufferTestHelper -import multiprocessing -CHILD_TIMEOUT_SEC = 10 +from cuda.core.experimental import Device, DeviceMemoryResource + +CHILD_TIMEOUT_SEC = 4 NBYTES = 64 NMRS = 3 NTASKS = 7 POOL_SIZE = 2097152 + def test_ipc_send_buffers(device, ipc_memory_resource): """Test passing buffers directly to a child separately from a memory resource.""" mr = ipc_memory_resource @@ -35,6 +38,7 @@ def test_ipc_send_buffers(device, ipc_memory_resource): helper = IPCBufferTestHelper(device, buffer) helper.verify_buffer(flipped=True) + def test_ipc_send_buffers_multi(device, ipc_memory_resource): """Test passing buffers sourced from multiple memory resources.""" # Set up several IPC-enabled memory pools. diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index f12489dea..97e4620cf 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -1,79 +1,133 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import multiprocessing +import multiprocessing as mp +import multiprocessing.reduction +import os -import pytest from utility import IPCBufferTestHelper -from cuda.core.experimental import Buffer, DeviceMemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource -CHILD_TIMEOUT_SEC = 10 +CHILD_TIMEOUT_SEC = 4 NBYTES = 64 POOL_SIZE = 2097152 -class TestObjectSerialization: - @pytest.mark.parametrize("use_alloc_handle", [True, False]) - def test_main(self, use_alloc_handle, device, ipc_memory_resource): +class TestObjectSerializationDirect: + """ + Test the low-level interface for sharing memory resources. + + Send a memory resource over a connection via Python's `send_handle`. Reconstruct + it on the other end and demonstrate buffer sharing. + """ + + def test_main(self, device, ipc_memory_resource): + mr = ipc_memory_resource + + # Start the child process. + parent_conn, child_conn = mp.Pipe() + process = mp.Process(target=self.child_main, args=(child_conn,)) + process.start() + + # Send a memory resource by allocation handle. + alloc_handle = mr.get_allocation_handle() + mp.reduction.send_handle(parent_conn, alloc_handle.handle, process.pid) + parent_conn.send(mr.remote_id) + + # Send a buffer. + buffer1 = mr.allocate(NBYTES) + parent_conn.send(buffer1) # directly + + buffer2 = mr.allocate(NBYTES) + parent_conn.send(buffer2.export()) # by descriptor + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + # Confirm buffers were modified. + IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=True) + IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) + + def child_main(self, conn): + # Set up the device. + device = Device() + device.set_current() + + # Receive the memory resource. 
+ handle = mp.reduction.recv_handle(conn) + remote_id = conn.recv() + mr = DeviceMemoryResource.from_allocation_handle(device, handle) + mr.register(remote_id) + os.close(handle) + + # Receive the buffers. + buffer1 = conn.recv() # directly + buffer_desc = conn.recv() + buffer2 = Buffer.import_(mr, buffer_desc) # by descriptor + + # Modify the buffers. + IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=True) + IPCBufferTestHelper(device, buffer2).fill_buffer(flipped=True) + + +class TestObjectSerializationWithMR: + def test_main(self, device, ipc_memory_resource): """Test sending IPC memory objects to a child through a queue.""" mr = ipc_memory_resource # Start the child process. - pipe = [multiprocessing.Queue() for _ in range(2)] - process = multiprocessing.Process(target=self.child_main, args=(pipe, use_alloc_handle)) + pipe = [mp.Queue() for _ in range(2)] + process = mp.Process(target=self.child_main, args=(pipe, mr)) process.start() # Send a device description. pipe[0].put(device) - device_id = pipe[1].get() + device_id = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) assert device_id == device.device_id - # Send a memory resource directly or by allocation handle. - # Note: there is no apparent way to check the ID between processes. - if use_alloc_handle: - # Send MR by a handle. - alloc_handle = mr.get_allocation_handle() - pipe[0].put(alloc_handle) - else: - # Send MR directly. - pipe[0].put(mr) + # Send a memory resource directly. This relies on the mr already + # being passed when spawning the child. + pipe[0].put(mr) + remote_id = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) + assert remote_id == mr.remote_id # Send a buffer. buffer = mr.allocate(NBYTES) - helper = IPCBufferTestHelper(device, buffer) - helper.fill_buffer(flipped=False) pipe[0].put(buffer) - pipe[1].get() # signal done - helper.verify_buffer(flipped=True) # Wait for the child process. 
process.join(timeout=CHILD_TIMEOUT_SEC) assert process.exitcode == 0 - def child_main(self, pipe, use_alloc_handle): + # Confirm buffer was modified. + IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) + + def child_main(self, pipe, _): # Device. - device = pipe[0].get() + device = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) pipe[1].put(device.device_id) # Memory resource. - if use_alloc_handle: - alloc_handle = pipe[0].get() - mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - else: - mr = pipe[0].get() + mr = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) + pipe[1].put(mr.remote_id) # Buffer. - buffer = pipe[0].get() + buffer = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) assert buffer.memory_resource.handle == mr.handle - helper = IPCBufferTestHelper(device, buffer) - helper.verify_buffer(flipped=False) - helper.fill_buffer(flipped=True) - pipe[1].put(None) + IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) def test_object_passing(device, ipc_memory_resource): - """Test sending objects as arguments when starting a process.""" + """ + Test sending objects as arguments when starting a process. + + True pickling of allocation handles and memory resources is enabled only + when spawning a process. This is similar to the way sockets and various objects + in multiprocessing (e.g., Queue) work. + """ + # Define the objects. mr = ipc_memory_resource alloc_handle = mr.get_allocation_handle() @@ -84,7 +138,7 @@ def test_object_passing(device, ipc_memory_resource): helper.fill_buffer(flipped=False) # Start the child process. 
- process = multiprocessing.Process(target=child_main, args=(device, alloc_handle, mr, buffer_desc, buffer)) + process = mp.Process(target=child_main, args=(device, alloc_handle, mr, buffer_desc, buffer)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) assert process.exitcode == 0 @@ -95,8 +149,8 @@ def test_object_passing(device, ipc_memory_resource): def child_main(device, alloc_handle, mr1, buffer_desc, buffer1): mr2 = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - # OK to build the buffer from either mr and descriptor. - # These all point to the same buffer. + # OK to build the buffer from either mr and the descriptor. + # All buffer* objects point to the same memory. buffer2 = Buffer.import_(mr1, buffer_desc) buffer3 = Buffer.import_(mr2, buffer_desc) @@ -108,18 +162,21 @@ def child_main(device, alloc_handle, mr1, buffer_desc, buffer1): helper2.verify_buffer(flipped=False) helper3.verify_buffer(flipped=False) + # Modify 1. helper1.fill_buffer(flipped=True) helper1.verify_buffer(flipped=True) helper2.verify_buffer(flipped=True) helper3.verify_buffer(flipped=True) + # Modify 2. helper2.fill_buffer(flipped=False) helper1.verify_buffer(flipped=False) helper2.verify_buffer(flipped=False) helper3.verify_buffer(flipped=False) + # Modify 3. helper3.fill_buffer(flipped=True) helper1.verify_buffer(flipped=True) diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index 2dc29da8b..cf2fefd07 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -6,43 +6,98 @@ from utility import IPCBufferTestHelper -from cuda.core.experimental import Device, DeviceMemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource -CHILD_TIMEOUT_SEC = 10 +CHILD_TIMEOUT_SEC = 4 NBYTES = 64 NWORKERS = 2 NMRS = 3 NTASKS = 20 POOL_SIZE = 2097152 +# Global memory resources, set in children. 
+g_mrs = None -def test_ipc_workerpool(device, ipc_memory_resource): - """Test IPC with a worker pool.""" - mr = ipc_memory_resource - buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] - with multiprocessing.Pool(processes=NWORKERS) as pool: - pool.map(process_buffer, buffers) - for buffer in buffers: - helper = IPCBufferTestHelper(device, buffer) - helper.verify_buffer(flipped=True) +class TestIpcWorkerPoolUsingExport: + """ + Test buffer sharing using export handles. + The memory resources need to be passed to subprocesses at startup. Buffers + are passed by their handles and reconstructed using the corresponding mr. + """ -def test_ipc_workerpool_multi_mr(device, ipc_memory_resource): - """Test IPC with a worker pool using multiple memory resources.""" - mrs = [ipc_memory_resource] + [ - DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) - ] - buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] - with multiprocessing.Pool(processes=NWORKERS) as pool: - pool.map(process_buffer, buffers) + @staticmethod + def init_worker(mrs): + global g_mrs + g_mrs = mrs - for buffer in buffers: - helper = IPCBufferTestHelper(device, buffer) - helper.verify_buffer(flipped=True) + def test_ipc_workerpool(self, device, ipc_memory_resource): + """Test IPC with a worker pool.""" + mr = ipc_memory_resource + buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] + with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: + pool.starmap(self.process_buffer, [(0, buffer.export()) for buffer in buffers]) + for buffer in buffers: + IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) -def process_buffer(buffer): - device = Device() - helper = IPCBufferTestHelper(device, buffer) - helper.fill_buffer(flipped=True) + def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): + """Test IPC with a worker pool using multiple memory resources.""" + mrs = 
[ipc_memory_resource] + [ + DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) + ] + buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] + with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: + pool.starmap( + self.process_buffer, [(mrs.index(buffer.memory_resource), buffer.export()) for buffer in buffers] + ) + + for buffer in buffers: + IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) + + def process_buffer(self, mr_idx, buffer_desc): + device = Device() + buffer = Buffer.import_(g_mrs[mr_idx], buffer_desc) + IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) + + +class TestIpcWorkerPool: + """ + Test buffer sharing without using export handles. + + The memory resources need to be passed to subprocesses at startup. Buffers + are serialized with the `remote_id` of the corresponding mr, and the + import/export is handled automatically. + """ + + @staticmethod + def init_worker(mrs): + global g_mrs + g_mrs = mrs + + def test_ipc_workerpool(self, device, ipc_memory_resource): + """Test IPC with a worker pool.""" + mr = ipc_memory_resource + buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] + with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: + pool.map(self.process_buffer, buffers) + + for buffer in buffers: + IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) + + def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): + """Test IPC with a worker pool using multiple memory resources.""" + mrs = [ipc_memory_resource] + [ + DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) + ] + buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] + with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: + pool.map(self.process_buffer, buffers) + 
+ for buffer in buffers: + IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) + + def process_buffer(self, buffer): + device = Device() + IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) From e8822b3c2bf3307d0de8d429f62c49cd6bd09c9b Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 24 Sep 2025 17:32:19 -0700 Subject: [PATCH 09/25] Add tests for leaked file descriptors and fix leaks. --- cuda_core/cuda/core/experimental/_memory.pyx | 26 ++-- cuda_core/tests/memory_ipc/test_leaks.py | 129 +++++++++++++++++++ 2 files changed, 145 insertions(+), 10 deletions(-) create mode 100644 cuda_core/tests/memory_ipc/test_leaks.py diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 4dc5c52e5..d69dd58f2 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -439,8 +439,7 @@ cdef class IPCAllocationHandle: def __reduce__(self): multiprocessing.context.assert_spawning(self) - fd = os.dup(self.handle) - df = multiprocessing.reduction.DupFd(fd) + df = multiprocessing.reduction.DupFd(self.handle) return IPCAllocationHandle._reconstruct, (df,) @staticmethod @@ -649,7 +648,8 @@ class DeviceMemoryResource(MemoryResource): device memory resource does not own the pool (`is_handle_owned` is `False`), and closing the resource has no effect. 
""" - __slots__ = "_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", "_mempool_owned", "_is_imported", "_remote_id" + __slots__ = ("_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", + "_mempool_owned", "_is_imported", "_remote_id", "_alloc_handle") def __init__(self, device_id: int | Device, options=None): device_id = getattr(device_id, 'device_id', device_id) @@ -666,6 +666,7 @@ class DeviceMemoryResource(MemoryResource): self._mempool_owned = False self._is_imported = False self._remote_id = None + self._alloc_handle = None err, self._mempool_handle = driver.cuDeviceGetMemPool(self.device_id) raise_if_driver_error(err) @@ -708,6 +709,7 @@ class DeviceMemoryResource(MemoryResource): self._mempool_owned = True self._is_imported = False self._remote_id = None + self._alloc_handle = None err, self._mempool_handle = driver.cuMemPoolCreate(properties) raise_if_driver_error(err) @@ -733,6 +735,7 @@ class DeviceMemoryResource(MemoryResource): self._mempool_owned = False self._is_imported = False self._remote_id = None + self._alloc_handle = None def __reduce__(self): @@ -816,6 +819,7 @@ class DeviceMemoryResource(MemoryResource): self._mempool_owned = True self._is_imported = True self._remote_id = None + self._alloc_handle = None # only used for non-imported err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) @@ -836,13 +840,15 @@ class DeviceMemoryResource(MemoryResource): ------- The shareable handle for the memory pool. 
""" - if not self.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - if self._is_imported: - raise RuntimeError("Imported memory resource cannot be exported") - err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) - raise_if_driver_error(err) - return IPCAllocationHandle._init(alloc_handle) + if self._alloc_handle is None: + if not self.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + if self._is_imported: + raise RuntimeError("Imported memory resource cannot be exported") + err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) + raise_if_driver_error(err) + self._alloc_handle = IPCAllocationHandle._init(alloc_handle) + return self._alloc_handle def allocate(self, size_t size, stream: Stream = None) -> Buffer: """Allocate a buffer of the requested size. diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py new file mode 100644 index 000000000..77e4ef2c5 --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -0,0 +1,129 @@ +import gc +import multiprocessing as mp + +import psutil +import pytest + +from cuda.core.experimental import _memory +from cuda.core.experimental._utils.cuda_utils import driver + +CHILD_TIMEOUT_SEC = 4 +NBYTES = 64 + +USING_FDS = _memory._IPC_HANDLE_TYPE == driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR +skip_unless_using_fds = pytest.mark.skipif(not USING_FDS, reason="mempool allocation handle is not using fds") + + +@skip_unless_using_fds +def test_alloc_handle(ipc_memory_resource): + """Check for fd leaks in get_allocation_handle.""" + mr = ipc_memory_resource + with CheckFDLeaks(): + [mr.get_allocation_handle() for _ in range(10)] + + +def exec_with_object(obj, number=1): + """Succesfully run a child process.""" + for _ in range(number): + process = mp.Process(target=child_main, args=(obj,)) + process.start() + 
process.join() + assert process.exitcode == 0 + + +def child_main(obj, *args): + pass + + +def exec_launch_failure(obj, number=1): + """ + Unsuccesfully try to launch a child process. This fails when + after the child starts. + """ + for _ in range(number): + process = mp.Process(target=child_main_bad, args=(obj,)) + process.start() + process.join() + assert process.exitcode != 0 + + +def child_main_bad(): + """Fails when passed arguments.""" + pass + + +def exec_reduce_failure(obj, number=1): + """ + Unsuccesfully try to launch a child process. This fails before + the child starts but after the resource-owning object is serialized. + """ + for _ in range(number): + fails_to_reduce = Irreducible() + try: + mp.Process(target=child_main, args=(obj, fails_to_reduce)).start() + except RuntimeError: + pass + + +class Irreducible: + """A class that cannot be serialized.""" + def __reduce__(self): + raise RuntimeError("Irreducible") + + +@skip_unless_using_fds +@pytest.mark.parametrize( + "getobject", + [ + lambda mr: mr.get_allocation_handle(), + lambda mr: mr, + lambda mr: mr.allocate(NBYTES), + lambda mr: mr.allocate(NBYTES).export(), + ], + ids=["alloc_handle", "mr", "buffer", "buffer_desc"], +) +@pytest.mark.parametrize( + "launcher", [exec_with_object, exec_launch_failure, exec_reduce_failure] +) +def test_pass_object(ipc_memory_resource, launcher, getobject): + """Check for fd leaks when an object is sent as a subprocess argument.""" + mr = ipc_memory_resource + with CheckFDLeaks(): + obj = getobject(mr) + try: + launcher(obj, number=2) + finally: + del obj + + +class CheckFDLeaks: + """ + Context manager to check for file descriptor leaks. + Ensures the number of open file descriptors is the same before and after the block. 
+ """ + + def __init__(self): + self.process = psutil.Process() + + def __enter__(self): + self.prime() + gc.collect() + self.initial_fds = self.process.num_fds() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is not None: + gc.collect() + final_fds = self.process.num_fds() + assert final_fds == self.initial_fds + return False + + def prime(self, latch=[]): + """Multiprocessing consumes a file descriptor on first launch.""" + assert mp.get_start_method() == "spawn" + if not latch: + process = mp.Process() + process.start() + process.join() + assert process.exitcode == 0 + latch.append(None) From 708e2b5a0dc7724845c6b896c46b88043c1c4481 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 25 Sep 2025 08:58:33 -0700 Subject: [PATCH 10/25] Eliminates IPCChannel. --- cuda_core/cuda/core/experimental/__init__.py | 1 - cuda_core/cuda/core/experimental/_memory.pyx | 102 ---------- cuda_core/tests/memory_ipc/test_channel.py | 183 ------------------ cuda_core/tests/memory_ipc/test_memory_ipc.py | 182 +++++++++++++++++ 4 files changed, 182 insertions(+), 286 deletions(-) delete mode 100644 cuda_core/tests/memory_ipc/test_channel.py create mode 100644 cuda_core/tests/memory_ipc/test_memory_ipc.py diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index a06119321..9a86459d2 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -17,7 +17,6 @@ from cuda.core.experimental._memory import ( Buffer, DeviceMemoryResource, - IPCChannel, LegacyPinnedMemoryResource, MemoryResource, ) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index d69dd58f2..039998220 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -161,13 +161,6 @@ cdef class Buffer: raise_if_driver_error(err) return Buffer.from_handle(ptr, ipc_buffer.size, mr) - def 
export_to_channel(self, channel: IPCChannel): - channel.export(self); - - @classmethod - def import_from_channel(cls, channel: IPCChannel): - return channel.import_() - def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. @@ -464,83 +457,6 @@ cdef class IPCAllocationHandle: return self._handle -cdef class IPCChannel: - """Communication channel for sharing IPC-enabled memory pools.""" - - cdef: - object _proxy - object _queue - object _mr - - def __init__(self): - if platform.system() == "Linux": - self._proxy = IPCChannelUnixSocket._init() - else: - raise RuntimeError("IPC is not available on {platform.system()}") - self._queue = multiprocessing.Queue() - self._mr = None - - def send_buffer(self, buffer: Buffer): - handle = buffer.export() - self._queue.put(handle) - - def receive_buffer(self, device: Optional[Device] = None): - if self._mr is None: - if device is None: - from ._device import Device - device = Device() - self._mr = DeviceMemoryResource.from_shared_channel(device, self) - - handle = self._queue.get() - return Buffer.import_(self._mr, handle) - - def send_allocation_handle(self, alloc_handle: IPCAllocationHandle): - """Sends over this channel an allocation handle for exporting a - shared memory pool.""" - self._proxy.send_allocation_handle(alloc_handle) - - def receive_allocation_handle(self) -> IPCAllocationHandle: - """Receives over this channel an allocation handle for importing a - shared memory pool.""" - return self._proxy.receive_allocation_handle() - - -cdef class IPCChannelUnixSocket: - """Unix-specific channel for sharing memory pools over sockets.""" - - cdef: - object _sock_out - object _sock_in - - def __init__(self, *arg, **kwargs): - raise RuntimeError("IPCChannelUnixSocket objects cannot be instantiated directly. 
Please use MemoryResource APIs.") - - @classmethod - def _init(cls): - cdef IPCChannelUnixSocket self = IPCChannelUnixSocket.__new__(cls) - self._sock_out, self._sock_in = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) - return self - - cpdef send_allocation_handle(self, alloc_handle: IPCAllocationHandle): - """Sends over this channel an allocation handle for exporting a - shared memory pool.""" - self._sock_out.sendmsg( - [], - [(socket.SOL_SOCKET, socket.SCM_RIGHTS, array.array("i", [int(alloc_handle)]))] - ) - - cpdef IPCAllocationHandle receive_allocation_handle(self): - """Receives over this channel an allocation handle for importing a - shared memory pool.""" - fds = array.array("i") - _, ancillary_data, _, _ = self._sock_in.recvmsg(0, socket.CMSG_LEN(fds.itemsize)) - assert len(ancillary_data) == 1 - cmsg_level, cmsg_type, cmsg_data = ancillary_data[0] - assert cmsg_level == socket.SOL_SOCKET and cmsg_type == socket.SCM_RIGHTS - fds.frombytes(cmsg_data[: len(cmsg_data) - (len(cmsg_data) % fds.itemsize)]) - return IPCAllocationHandle._init(int(fds[0])) - - @dataclass cdef class DeviceMemoryResourceOptions: """Customizable :obj:`~_memory.DeviceMemoryResource` options. 
@@ -775,19 +691,6 @@ class DeviceMemoryResource(MemoryResource): self._remote_id = int(self._mempool_handle) return self._remote_id - def create_ipc_channel(self): - """Create an IPC memory channel for sharing allocations.""" - channel = IPCChannel() - self.share_to_channel(channel) - return channel - - @classmethod - def from_shared_channel(cls, device_id: int | Device, channel: IPCChannel) -> DeviceMemoryResource: - """Create a device memory resource from a memory pool shared over an IPC channel.""" - device_id = getattr(device_id, 'device_id', device_id) - alloc_handle = channel.receive_allocation_handle() - return cls.from_allocation_handle(device_id, alloc_handle) - @classmethod def from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: """Create a device memory resource from an allocation handle. @@ -825,11 +728,6 @@ class DeviceMemoryResource(MemoryResource): raise_if_driver_error(err) return self - def share_to_channel(self, channel : IPCChannel): - if not self.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - channel.send_allocation_handle(self.get_allocation_handle()) - def get_allocation_handle(self) -> IPCAllocationHandle: """Export the memory pool handle to be shared (requires IPC). diff --git a/cuda_core/tests/memory_ipc/test_channel.py b/cuda_core/tests/memory_ipc/test_channel.py deleted file mode 100644 index 49e8f3be2..000000000 --- a/cuda_core/tests/memory_ipc/test_channel.py +++ /dev/null @@ -1,183 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import multiprocessing - -from utility import IPCBufferTestHelper - -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel - -CHILD_TIMEOUT_SEC = 4 -NBYTES = 64 -NWORKERS = 2 -NTASKS = 2 - - -def test_ipc_mempool(device, ipc_memory_resource): - """Test IPC with memory pools.""" - # Set up the IPC-enabled memory pool and share it. - mr = ipc_memory_resource - channel = mr.create_ipc_channel() - - # Start the child process. - process = multiprocessing.Process(target=child_main1, args=(channel,)) - process.start() - - # Allocate and fill memory. - buffer = mr.allocate(NBYTES) - helper = IPCBufferTestHelper(device, buffer) - helper.fill_buffer(flipped=False) - - # Export the buffer via IPC. - channel.send_buffer(buffer) - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - # Verify that the buffer was modified. - helper.verify_buffer(flipped=True) - - -def child_main1(channel): - device = Device() - device.set_current() - buffer = channel.receive_buffer() - helper = IPCBufferTestHelper(device, buffer) - helper.verify_buffer(flipped=False) - helper.fill_buffer(flipped=True) - - -def test_ipc_mempool_multiple(device, ipc_memory_resource): - """Test IPC with memory pools using multiple processes.""" - # Construct an IPC-enabled memory resource and share it over two channels. - mr = ipc_memory_resource - ch1, ch2 = (mr.create_ipc_channel() for _ in range(2)) - - # Allocate memory buffers and export them to each channel. - buffer1 = mr.allocate(NBYTES) - ch1.send_buffer(buffer1) - ch2.send_buffer(buffer1) - buffer2 = mr.allocate(NBYTES) - ch1.send_buffer(buffer2) - ch2.send_buffer(buffer2) - - # Start the child processes. - p1 = multiprocessing.Process(target=child_main2, args=(1, ch1)) - p2 = multiprocessing.Process(target=child_main2, args=(2, ch2)) - p1.start() - p2.start() - - # Wait for the child processes. 
- p1.join(timeout=CHILD_TIMEOUT_SEC) - p2.join(timeout=CHILD_TIMEOUT_SEC) - assert p1.exitcode == 0 - assert p2.exitcode == 0 - - # Verify that the buffers were modified. - IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) - IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) - - -def child_main2(idx, channel): - device = Device() - device.set_current() - buffer1 = channel.receive_buffer() # implicitly set up the shared memory pool - buffer2 = channel.receive_buffer() - if idx == 1: - IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=False) - elif idx == 2: - IPCBufferTestHelper(device, buffer2).fill_buffer(flipped=True) - - -def test_ipc_shared_allocation_handle(device, ipc_memory_resource): - """Demonstrate that a memory pool allocation handle can be reused for IPC - with multiple processes.""" - # Set up communication. - ch1 = IPCChannel() - ch2 = IPCChannel() - q1 = multiprocessing.Queue() - q2 = multiprocessing.Queue() - - # Start children. - p1 = multiprocessing.Process(target=child_main3, args=(1, ch1, q1)) - p2 = multiprocessing.Process(target=child_main3, args=(2, ch2, q2)) - p1.start() - p2.start() - - # Set up the IPC-enabled memory pool and share it using one handle. - mr = ipc_memory_resource - alloc_handle = mr.get_allocation_handle() - ch1.send_allocation_handle(alloc_handle) - ch2.send_allocation_handle(alloc_handle) - - # Allocate a share memory. - buf1 = mr.allocate(NBYTES) - buf2 = mr.allocate(NBYTES) - q1.put(buf1.export()) - q2.put(buf2.export()) - - # Wait for children. - p1.join(timeout=CHILD_TIMEOUT_SEC) - p2.join(timeout=CHILD_TIMEOUT_SEC) - assert p1.exitcode == 0 - assert p2.exitcode == 0 - - # Verify results. 
- IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) - IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - - -def child_main3(idx, channel, queue): - """Fills a shared memory buffer.""" - device = Device() - device.set_current() - alloc_handle = channel.receive_allocation_handle() - mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - buffer_descriptor = queue.get(timeout=CHILD_TIMEOUT_SEC) - buffer = Buffer.import_(mr, buffer_descriptor) - IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) - - -def test_ipc_shared_allocation_handle2(device, ipc_memory_resource): - """Demonstrate that a memory pool allocation handle can be reused for IPC - with multiple processes (simplified).""" - # Set up communication. - ch1 = IPCChannel() - ch2 = IPCChannel() - - # Start children. - p1 = multiprocessing.Process(target=child_main4, args=(1, ch1)) - p2 = multiprocessing.Process(target=child_main4, args=(2, ch2)) - p1.start() - p2.start() - - # Set up the IPC-enabled memory pool and share it using one handle. - mr = ipc_memory_resource - alloc_handle = mr.get_allocation_handle() - ch1.send_allocation_handle(alloc_handle) - ch2.send_allocation_handle(alloc_handle) - - # Allocate a share memory. - buf1 = mr.allocate(NBYTES) - buf2 = mr.allocate(NBYTES) - ch1.send_buffer(buf1) - ch2.send_buffer(buf2) - - # Wait for children. - p1.join(timeout=CHILD_TIMEOUT_SEC) - p2.join(timeout=CHILD_TIMEOUT_SEC) - assert p1.exitcode == 0 - assert p2.exitcode == 0 - - # Verify results. 
- IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) - IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - - -def child_main4(idx, channel): - """Fills a shared memory buffer.""" - device = Device() - device.set_current() - buffer = channel.receive_buffer() - IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py new file mode 100644 index 000000000..9a527bf0d --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -0,0 +1,182 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import multiprocessing as mp + +from utility import IPCBufferTestHelper + +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource + +CHILD_TIMEOUT_SEC = 4 +NBYTES = 64 +NWORKERS = 2 +NTASKS = 2 + + +class TestIpcMempool: + def test_main(self, device, ipc_memory_resource): + """Test IPC with memory pools.""" + # Set up the IPC-enabled memory pool and share it. + mr = ipc_memory_resource + + # Start the child process. + queue = mp.Queue() + process = mp.Process(target=self.child_main, args=(mr, queue)) + process.start() + + # Allocate and fill memory. + buffer = mr.allocate(NBYTES) + helper = IPCBufferTestHelper(device, buffer) + helper.fill_buffer(flipped=False) + + # Export the buffer via IPC. + queue.put(buffer) + + # Wait for the child process. + process.join(timeout=CHILD_TIMEOUT_SEC) + assert process.exitcode == 0 + + # Verify that the buffer was modified. 
+ helper.verify_buffer(flipped=True) + + + def child_main(self, mr, queue): + device = Device() + buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) + helper = IPCBufferTestHelper(device, buffer) + helper.verify_buffer(flipped=False) + helper.fill_buffer(flipped=True) + + +class TestIPCMempoolMultiple: + def test_main(self, device, ipc_memory_resource): + """Test IPC with memory pools using multiple processes.""" + # Construct an IPC-enabled memory resource and share it with two children. + mr = ipc_memory_resource + q1, q2 = (mp.Queue() for _ in range(2)) + + # Allocate memory buffers and export them to each child. + buffer1 = mr.allocate(NBYTES) + q1.put(buffer1) + q2.put(buffer1) + buffer2 = mr.allocate(NBYTES) + q1.put(buffer2) + q2.put(buffer2) + + # Start the child processes. + p1 = mp.Process(target=self.child_main, args=(mr, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(mr, 2, q2)) + p1.start() + p2.start() + + # Wait for the child processes. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify that the buffers were modified. + IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) + IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) + + + def child_main(self, mr, idx, queue): + # Note: passing the mr registers it so that buffers can be passed + # directly. + device = Device() + buffer1 = queue.get(timeout=CHILD_TIMEOUT_SEC) + buffer2 = queue.get(timeout=CHILD_TIMEOUT_SEC) + if idx == 1: + IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=False) + elif idx == 2: + IPCBufferTestHelper(device, buffer2).fill_buffer(flipped=True) + + +class TestIPCSharedAllocationHandleAndBufferDescriptors: + def test_main(self, device, ipc_memory_resource): + """ + Demonstrate that a memory pool allocation handle can be reused for IPC + with multiple processes. Uses buffer descriptors. 
+ """ + # Set up the IPC-enabled memory pool and share it using one handle. + mr = ipc_memory_resource + alloc_handle = mr.get_allocation_handle() + + # Start children. + q1, q2 = (mp.Queue() for _ in range(2)) + p1 = mp.Process(target=self.child_main, args=(alloc_handle, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(alloc_handle, 2, q2)) + p1.start() + p2.start() + + # Allocate and share memory. + buf1 = mr.allocate(NBYTES) + buf2 = mr.allocate(NBYTES) + q1.put(buf1.export()) + q2.put(buf2.export()) + + # Wait for children. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify results. + IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) + IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) + + + def child_main(self, alloc_handle, idx, queue): + """Fills a shared memory buffer.""" + # In this case, the device needs to be set up (passing the mr does it + # implicitly in other tests). + device = Device() + device.set_current() + mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + buffer_descriptor = queue.get(timeout=CHILD_TIMEOUT_SEC) + buffer = Buffer.import_(mr, buffer_descriptor) + IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) + + +class TestIPCSharedAllocationHandleAndBufferObjects: + def test_main(self, device, ipc_memory_resource): + """ + Demonstrate that a memory pool allocation handle can be reused for IPC + with multiple processes. Uses buffer objects (not descriptors). + """ + mr = ipc_memory_resource + alloc_handle = mr.get_allocation_handle() + + # Start children. + q1, q2 = (mp.Queue() for _ in range(2)) + p1 = mp.Process(target=self.child_main, args=(alloc_handle, mr.remote_id, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(alloc_handle, mr.remote_id, 2, q2)) + p1.start() + p2.start() + + # Allocate and share memory. 
+ buf1 = mr.allocate(NBYTES) + buf2 = mr.allocate(NBYTES) + q1.put(buf1) + q2.put(buf2) + + # Wait for children. + p1.join(timeout=CHILD_TIMEOUT_SEC) + p2.join(timeout=CHILD_TIMEOUT_SEC) + assert p1.exitcode == 0 + assert p2.exitcode == 0 + + # Verify results. + IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) + IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) + + + def child_main(self, alloc_handle, remote_id, idx, queue): + """Fills a shared memory buffer.""" + device = Device() + device.set_current() + mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + mr.register(remote_id) + buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) + IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) + From b31d8490ffcd8fec50f38a15a6cf540e4e589f6f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 25 Sep 2025 09:50:46 -0700 Subject: [PATCH 11/25] Changes DeviceMemoryResource remote_id to uuid. --- cuda_core/cuda/core/experimental/_memory.pyx | 49 ++++++++++--------- cuda_core/tests/memory_ipc/test_memory_ipc.py | 8 +-- cuda_core/tests/memory_ipc/test_serialize.py | 12 ++--- cuda_core/tests/memory_ipc/test_workerpool.py | 2 +- 4 files changed, 36 insertions(+), 35 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 039998220..52588dbd2 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -21,6 +21,7 @@ import multiprocessing.reduction import os import platform import sys +import uuid as uuid_module import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream @@ -537,7 +538,7 @@ class DeviceMemoryResourceAttributes: # Holds DeviceMemoryResource objects imported by this process. 
# This enables buffer serialization, as buffers can reduce to a pair -# of comprising the memory resource `remote_id` (the key into this registry) +# of comprising the memory resource UUID (the key into this registry) # and the serialized buffer descriptor. _ipc_registry = {} @@ -565,7 +566,7 @@ class DeviceMemoryResource(MemoryResource): `False`), and closing the resource has no effect. """ __slots__ = ("_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", - "_mempool_owned", "_is_imported", "_remote_id", "_alloc_handle") + "_mempool_owned", "_is_imported", "_uuid", "_alloc_handle") def __init__(self, device_id: int | Device, options=None): device_id = getattr(device_id, 'device_id', device_id) @@ -581,7 +582,7 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = _NOIPC_HANDLE_TYPE self._mempool_owned = False self._is_imported = False - self._remote_id = None + self._uuid = None self._alloc_handle = None err, self._mempool_handle = driver.cuDeviceGetMemPool(self.device_id) @@ -624,14 +625,14 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = properties.handleTypes self._mempool_owned = True self._is_imported = False - self._remote_id = None + self._uuid = None self._alloc_handle = None err, self._mempool_handle = driver.cuMemPoolCreate(properties) raise_if_driver_error(err) if opts.ipc_enabled: - self.get_allocation_handle() # enables Buffer.export + self.get_allocation_handle() # enables Buffer.export, sets uuid def __del__(self): self.close() @@ -650,46 +651,44 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = _NOIPC_HANDLE_TYPE self._mempool_owned = False self._is_imported = False - self._remote_id = None + self._uuid = None self._alloc_handle = None def __reduce__(self): # If spawning a new process, serialize the resources; otherwise, just - # send the remote_id, using the registry on the receiving end. + # send the UUID, using the registry on the receiving end. 
is_spawning = multiprocessing.context.get_spawning_popen() is not None if is_spawning: from ._device import Device device = Device(self.device_id) alloc_handle = self.get_allocation_handle() - return DeviceMemoryResource._reconstruct, (device, alloc_handle, self.remote_id) + return DeviceMemoryResource._reconstruct, (device, alloc_handle, self.uuid) else: - return DeviceMemoryResource.from_registry, (self.remote_id,) + return DeviceMemoryResource.from_registry, (self.uuid,) @staticmethod - def _reconstruct(device, alloc_handle, remote_id): + def _reconstruct(device, alloc_handle, uuid): self = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - self.register(remote_id) + self.register(uuid) return self @staticmethod - def from_registry(remote_id): + def from_registry(uuid: uuid_module.UUID): try: - return _ipc_registry[remote_id] + return _ipc_registry[uuid] except KeyError: - raise RuntimeError(f"Memory resource with {remote_id=} was not found") + raise RuntimeError(f"Memory resource with {uuid=} was not found") - def register(self, remote_id: int): - if remote_id not in _ipc_registry: - assert self._remote_id is None or self._remote_id == remote_id - _ipc_registry[remote_id] = self - self._remote_id = remote_id + def register(self, uuid: uuid_module.UUID): + if uuid not in _ipc_registry: + assert self._uuid is None or self._uuid == uuid + _ipc_registry[uuid] = self + self._uuid = uuid @property - def remote_id(self): - if self._remote_id is None and not self._is_imported: - self._remote_id = int(self._mempool_handle) - return self._remote_id + def uuid(self): + return self._uuid @classmethod def from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: @@ -721,7 +720,7 @@ class DeviceMemoryResource(MemoryResource): self._ipc_handle_type = _IPC_HANDLE_TYPE self._mempool_owned = True self._is_imported = True - self._remote_id = None + self._uuid = None self._alloc_handle = None # only used 
for non-imported err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), _IPC_HANDLE_TYPE, 0) @@ -746,6 +745,8 @@ class DeviceMemoryResource(MemoryResource): err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) self._alloc_handle = IPCAllocationHandle._init(alloc_handle) + assert self._uuid is None + self._uuid = uuid_module.uuid4() return self._alloc_handle def allocate(self, size_t size, stream: Stream = None) -> Buffer: diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 9a527bf0d..c0be05188 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -149,8 +149,8 @@ def test_main(self, device, ipc_memory_resource): # Start children. q1, q2 = (mp.Queue() for _ in range(2)) - p1 = mp.Process(target=self.child_main, args=(alloc_handle, mr.remote_id, 1, q1)) - p2 = mp.Process(target=self.child_main, args=(alloc_handle, mr.remote_id, 2, q2)) + p1 = mp.Process(target=self.child_main, args=(alloc_handle, mr.uuid, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(alloc_handle, mr.uuid, 2, q2)) p1.start() p2.start() @@ -171,12 +171,12 @@ def test_main(self, device, ipc_memory_resource): IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - def child_main(self, alloc_handle, remote_id, idx, queue): + def child_main(self, alloc_handle, uuid, idx, queue): """Fills a shared memory buffer.""" device = Device() device.set_current() mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - mr.register(remote_id) + mr.register(uuid) buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 97e4620cf..cd17bf366 100644 --- 
a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -33,7 +33,7 @@ def test_main(self, device, ipc_memory_resource): # Send a memory resource by allocation handle. alloc_handle = mr.get_allocation_handle() mp.reduction.send_handle(parent_conn, alloc_handle.handle, process.pid) - parent_conn.send(mr.remote_id) + parent_conn.send(mr.uuid) # Send a buffer. buffer1 = mr.allocate(NBYTES) @@ -57,9 +57,9 @@ def child_main(self, conn): # Receive the memory resource. handle = mp.reduction.recv_handle(conn) - remote_id = conn.recv() + uuid = conn.recv() mr = DeviceMemoryResource.from_allocation_handle(device, handle) - mr.register(remote_id) + mr.register(uuid) os.close(handle) # Receive the buffers. @@ -90,8 +90,8 @@ def test_main(self, device, ipc_memory_resource): # Send a memory resource directly. This relies on the mr already # being passed when spawning the child. pipe[0].put(mr) - remote_id = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) - assert remote_id == mr.remote_id + uuid = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) + assert uuid == mr.uuid # Send a buffer. buffer = mr.allocate(NBYTES) @@ -111,7 +111,7 @@ def child_main(self, pipe, _): # Memory resource. mr = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) - pipe[1].put(mr.remote_id) + pipe[1].put(mr.uuid) # Buffer. buffer = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index cf2fefd07..50fa1d509 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -67,7 +67,7 @@ class TestIpcWorkerPool: Test buffer sharing without using export handles. The memory resources need to be passed to subprocesses at startup. Buffers - are serialized with the `remote_id` of the corresponding mr, and the + are serialized with the `uuid` of the corresponding mr, and the import/export is handled automatically. 
""" From 6c53cb0c8fe36db1d7a1c4bb6f660fd00ad39f92 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 25 Sep 2025 10:20:23 -0700 Subject: [PATCH 12/25] Embeds the memory resource UUID into allocation handles. --- cuda_core/cuda/core/experimental/_memory.pyx | 62 +++++++++---------- cuda_core/tests/memory_ipc/conftest.py | 4 +- cuda_core/tests/memory_ipc/test_errors.py | 23 ++++++- cuda_core/tests/memory_ipc/test_leaks.py | 38 +++++++----- cuda_core/tests/memory_ipc/test_memory_ipc.py | 18 +++--- cuda_core/tests/memory_ipc/test_serialize.py | 3 +- 6 files changed, 86 insertions(+), 62 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 52588dbd2..3e75967cc 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -77,11 +77,7 @@ cdef class Buffer: self.close() def __reduce__(self): - return Buffer._reconstruct, (self.memory_resource, self.export()) - - @staticmethod - def _reconstruct(mr, desc): - return Buffer.import_(mr, desc) + return Buffer.import_, (self.memory_resource, self.export()) cpdef close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. @@ -390,33 +386,29 @@ cdef class IPCBufferDescriptor: return self def __reduce__(self): - # This is subject to change if the CUmemPoolPtrExportData struct/object changes. - return (self._reconstruct, (self._reserved, self._size)) + return self._init, (self._reserved, self._size) @property def size(self): return self._size - @classmethod - def _reconstruct(cls, reserved, size): - instance = cls._init(reserved, size) - return instance - cdef class IPCAllocationHandle: """Shareable handle to an IPC-enabled device memory pool.""" cdef: int _handle + object _uuid def __init__(self, *arg, **kwargs): raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. 
Please use MemoryResource APIs.") @classmethod - def _init(cls, handle: int): + def _init(cls, handle: int, uuid: uuid_module.UUID): cdef IPCAllocationHandle self = IPCAllocationHandle.__new__(cls) assert handle >= 0 self._handle = handle + self._uuid = uuid return self cpdef close(self): @@ -426,6 +418,7 @@ cdef class IPCAllocationHandle: os.close(self._handle) finally: self._handle = -1 + self._uuid = None def __del__(self): """Close the handle.""" @@ -434,12 +427,11 @@ cdef class IPCAllocationHandle: def __reduce__(self): multiprocessing.context.assert_spawning(self) df = multiprocessing.reduction.DupFd(self.handle) - return IPCAllocationHandle._reconstruct, (df,) + return self._reconstruct, (df, self._uuid) - @staticmethod - def _reconstruct(df): - self = IPCAllocationHandle._init(df.detach()) - return self + @classmethod + def _reconstruct(cls, df, uuid): + return cls._init(df.detach(), uuid) def __int__(self) -> int: if self._handle < 0: @@ -449,14 +441,19 @@ cdef class IPCAllocationHandle: return self._handle def detach(self): - handle = self._handle - self._handle = -1 - return handle + handle = self._handle + self._handle = -1 + self._uuid = None + return handle @property def handle(self) -> int: return self._handle + @property + def uuid(self) -> uuid_module.UUID: + return self._uuid + @dataclass cdef class DeviceMemoryResourceOptions: @@ -663,22 +660,16 @@ class DeviceMemoryResource(MemoryResource): from ._device import Device device = Device(self.device_id) alloc_handle = self.get_allocation_handle() - return DeviceMemoryResource._reconstruct, (device, alloc_handle, self.uuid) + return DeviceMemoryResource.from_allocation_handle, (device, alloc_handle) else: return DeviceMemoryResource.from_registry, (self.uuid,) - @staticmethod - def _reconstruct(device, alloc_handle, uuid): - self = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - self.register(uuid) - return self - @staticmethod def from_registry(uuid: uuid_module.UUID): try: 
return _ipc_registry[uuid] except KeyError: - raise RuntimeError(f"Memory resource with {uuid=} was not found") + raise RuntimeError(f"Memory resource {uuid} was not found") def register(self, uuid: uuid_module.UUID): if uuid not in _ipc_registry: @@ -725,6 +716,9 @@ class DeviceMemoryResource(MemoryResource): err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) + uuid = getattr(alloc_handle, 'uuid', None) + if uuid is not None: + self.register(uuid) return self def get_allocation_handle(self) -> IPCAllocationHandle: @@ -744,9 +738,13 @@ class DeviceMemoryResource(MemoryResource): raise RuntimeError("Imported memory resource cannot be exported") err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) - self._alloc_handle = IPCAllocationHandle._init(alloc_handle) - assert self._uuid is None - self._uuid = uuid_module.uuid4() + try: + assert self._uuid is None + self._uuid = uuid_module.uuid4() + self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) + except: + os.close(alloc_handle) + raise return self._alloc_handle def allocate(self, size_t size, stream: Stream = None) -> Buffer: diff --git a/cuda_core/tests/memory_ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py index 39f787eb0..ea8b7a347 100644 --- a/cuda_core/tests/memory_ipc/conftest.py +++ b/cuda_core/tests/memory_ipc/conftest.py @@ -8,7 +8,7 @@ POOL_SIZE = 2097152 -@pytest.fixture(scope="function") +@pytest.fixture def device(): """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" # Check if IPC is supported on this platform/device @@ -26,7 +26,7 @@ def device(): return device -@pytest.fixture(scope="function") +@pytest.fixture def ipc_memory_resource(device): mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) assert mr.is_ipc_enabled diff --git 
a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index a6003d19b..b151f0edf 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -2,8 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import multiprocessing +import re -from cuda.core.experimental import Buffer, DeviceMemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource from cuda.core.experimental._utils.cuda_utils import CUDAError CHILD_TIMEOUT_SEC = 4 @@ -114,3 +115,23 @@ def CHILD_ACTION(self, queue): def ASSERT(self, exc_type, exc_msg): assert exc_type is TypeError assert exc_msg.startswith("Argument 'ipc_buffer' has incorrect type") + + +class TestDanglingBuffer(ChildErrorHarness): + """ + Error when importing a buffer object without registering its memory + resource. + """ + + def PARENT_ACTION(self, queue): + mr2 = DeviceMemoryResource(self.device, dict(max_size=POOL_SIZE, ipc_enabled=True)) + self.buffer = mr2.allocate(NBYTES) + queue.put(self.buffer) # Note: mr2 not sent + + def CHILD_ACTION(self, queue): + Device().set_current() + queue.get(timeout=CHILD_TIMEOUT_SEC) + + def ASSERT(self, exc_type, exc_msg): + assert exc_type is RuntimeError + assert re.match(r"Memory resource [a-z0-9-]+ was not found", exc_msg) diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index 77e4ef2c5..c7a9b0b53 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import contextlib import gc import multiprocessing as mp @@ -59,14 +63,13 @@ def exec_reduce_failure(obj, number=1): """ for _ in range(number): fails_to_reduce = Irreducible() - try: + with contextlib.suppress(RuntimeError): mp.Process(target=child_main, args=(obj, fails_to_reduce)).start() - except RuntimeError: - pass class Irreducible: """A class that cannot be serialized.""" + def __reduce__(self): raise RuntimeError("Irreducible") @@ -82,9 +85,7 @@ def __reduce__(self): ], ids=["alloc_handle", "mr", "buffer", "buffer_desc"], ) -@pytest.mark.parametrize( - "launcher", [exec_with_object, exec_launch_failure, exec_reduce_failure] -) +@pytest.mark.parametrize("launcher", [exec_with_object, exec_launch_failure, exec_reduce_failure]) def test_pass_object(ipc_memory_resource, launcher, getobject): """Check for fd leaks when an object is sent as a subprocess argument.""" mr = ipc_memory_resource @@ -106,7 +107,7 @@ def __init__(self): self.process = psutil.Process() def __enter__(self): - self.prime() + prime() gc.collect() self.initial_fds = self.process.num_fds() return self @@ -118,12 +119,17 @@ def __exit__(self, exc_type, exc_val, exc_tb): assert final_fds == self.initial_fds return False - def prime(self, latch=[]): - """Multiprocessing consumes a file descriptor on first launch.""" - assert mp.get_start_method() == "spawn" - if not latch: - process = mp.Process() - process.start() - process.join() - assert process.exitcode == 0 - latch.append(None) + +prime_was_run = False + + +def prime(): + """Multiprocessing consumes a file descriptor on first launch.""" + assert mp.get_start_method() == "spawn" + global prime_was_run + if not prime_was_run: + process = mp.Process() + process.start() + process.join() + assert process.exitcode == 0 + prime_was_run = True diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index c0be05188..c0ff64aa1 100644 --- 
a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -39,7 +39,6 @@ def test_main(self, device, ipc_memory_resource): # Verify that the buffer was modified. helper.verify_buffer(flipped=True) - def child_main(self, mr, queue): device = Device() buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) @@ -79,7 +78,6 @@ def test_main(self, device, ipc_memory_resource): IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) - def child_main(self, mr, idx, queue): # Note: passing the mr registers it so that buffers can be passed # directly. @@ -125,7 +123,6 @@ def test_main(self, device, ipc_memory_resource): IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - def child_main(self, alloc_handle, idx, queue): """Fills a shared memory buffer.""" # In this case, the device needs to be set up (passing the mr does it @@ -149,8 +146,8 @@ def test_main(self, device, ipc_memory_resource): # Start children. q1, q2 = (mp.Queue() for _ in range(2)) - p1 = mp.Process(target=self.child_main, args=(alloc_handle, mr.uuid, 1, q1)) - p2 = mp.Process(target=self.child_main, args=(alloc_handle, mr.uuid, 2, q2)) + p1 = mp.Process(target=self.child_main, args=(alloc_handle, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(alloc_handle, 2, q2)) p1.start() p2.start() @@ -170,13 +167,14 @@ def test_main(self, device, ipc_memory_resource): IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - - def child_main(self, alloc_handle, uuid, idx, queue): + def child_main(self, alloc_handle, idx, queue): """Fills a shared memory buffer.""" device = Device() device.set_current() - mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) - mr.register(uuid) + + # Register the memory resource. 
+ DeviceMemoryResource.from_allocation_handle(device, alloc_handle) + + # Now get buffers. buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) - diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index cd17bf366..62674767c 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -77,7 +77,8 @@ def test_main(self, device, ipc_memory_resource): """Test sending IPC memory objects to a child through a queue.""" mr = ipc_memory_resource - # Start the child process. + # Start the child process. Sending the memory resource registers it so + # that buffers can be handled automatically. pipe = [mp.Queue() for _ in range(2)] process = mp.Process(target=self.child_main, args=(pipe, mr)) process.start() From ed0b35654e877f50a57b0f95e752b718c74e8354 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 26 Sep 2025 11:04:08 -0700 Subject: [PATCH 13/25] Minor changes to address feedback. 
--- cuda_core/cuda/core/experimental/__init__.py | 1 + cuda_core/cuda/core/experimental/_memory.pyx | 14 +++++++------- cuda_core/tests/memory_ipc/conftest.py | 1 - cuda_core/tests/memory_ipc/test_leaks.py | 1 - cuda_core/tests/memory_ipc/test_memory_ipc.py | 3 +-- cuda_core/tests/memory_ipc/test_send_buffers.py | 3 +-- cuda_core/tests/memory_ipc/test_serialize.py | 3 +-- cuda_core/tests/memory_ipc/test_workerpool.py | 3 +-- cuda_core/tests/memory_ipc/utility.py | 7 +------ 9 files changed, 13 insertions(+), 23 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 9a86459d2..a01134373 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -17,6 +17,7 @@ from cuda.core.experimental._memory import ( Buffer, DeviceMemoryResource, + DeviceMemoryResourceOptions, LegacyPinnedMemoryResource, MemoryResource, ) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 3e75967cc..c30ca6784 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -14,6 +14,7 @@ from dataclasses import dataclass from typing import Optional, TypeVar, Union, TYPE_CHECKING import abc import array +import contextlib import cython import multiprocessing import multiprocessing.context @@ -440,12 +441,6 @@ cdef class IPCAllocationHandle: ) return self._handle - def detach(self): - handle = self._handle - self._handle = -1 - self._uuid = None - return handle - @property def handle(self) -> int: return self._handle @@ -642,6 +637,7 @@ class DeviceMemoryResource(MemoryResource): err, = driver.cuMemPoolDestroy(self._mempool_handle) raise_if_driver_error(err) finally: + self.unregister() self._dev_id = None self._mempool_handle = None self._attributes = None @@ -669,7 +665,7 @@ class DeviceMemoryResource(MemoryResource): try: return _ipc_registry[uuid] except KeyError: - 
raise RuntimeError(f"Memory resource {uuid} was not found") + raise RuntimeError(f"Memory resource {uuid} was not found") from None def register(self, uuid: uuid_module.UUID): if uuid not in _ipc_registry: @@ -677,6 +673,10 @@ class DeviceMemoryResource(MemoryResource): _ipc_registry[uuid] = self self._uuid = uuid + def unregister(self): + with contextlib.suppress(KeyError): + del _ipc_registry[self.uuid] + @property def uuid(self): return self._uuid diff --git a/cuda_core/tests/memory_ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py index ea8b7a347..2c3c881e3 100644 --- a/cuda_core/tests/memory_ipc/conftest.py +++ b/cuda_core/tests/memory_ipc/conftest.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import pytest - from cuda.core.experimental import Device, DeviceMemoryResource POOL_SIZE = 2097152 diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index c7a9b0b53..b5607097e 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -7,7 +7,6 @@ import psutil import pytest - from cuda.core.experimental import _memory from cuda.core.experimental._utils.cuda_utils import driver diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index c0ff64aa1..aa9aacef9 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -3,9 +3,8 @@ import multiprocessing as mp -from utility import IPCBufferTestHelper - from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 4 NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 496f32553..4e2a9600d 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -4,9 +4,8 @@ import multiprocessing from itertools import cycle 
-from utility import IPCBufferTestHelper - from cuda.core.experimental import Device, DeviceMemoryResource +from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 4 NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 62674767c..984bb16e2 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -5,9 +5,8 @@ import multiprocessing.reduction import os -from utility import IPCBufferTestHelper - from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 4 NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index 50fa1d509..aeacd1707 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -4,9 +4,8 @@ import multiprocessing from itertools import cycle -from utility import IPCBufferTestHelper - from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 4 NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/utility.py b/cuda_core/tests/memory_ipc/utility.py index 766188d10..7ce7752b6 100644 --- a/cuda_core/tests/memory_ipc/utility.py +++ b/cuda_core/tests/memory_ipc/utility.py @@ -1,15 +1,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -try: - from cuda.bindings import driver -except ImportError: - from cuda import cuda as driver - import ctypes from cuda.core.experimental import Buffer, MemoryResource -from cuda.core.experimental._utils.cuda_utils import handle_return +from cuda.core.experimental._utils.cuda_utils import driver, handle_return class DummyUnifiedMemoryResource(MemoryResource): From b40a21346ab2e1bc7021339e81a9ca7d6d07cbb2 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 26 Sep 2025 14:41:33 -0700 Subject: [PATCH 14/25] Removes obsolte tests. Moves imports to nested contexts. --- cuda_core/cuda/core/experimental/_device.py | 2 + cuda_core/cuda/core/experimental/_memory.pyx | 21 ++- cuda_core/tests/memory_ipc/test_serialize.py | 10 +- cuda_core/tests/test_ipc_mempool.py | 178 ------------------- 4 files changed, 14 insertions(+), 197 deletions(-) delete mode 100644 cuda_core/tests/test_ipc_mempool.py diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index d94e44b5b..d5814c13a 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -1161,6 +1161,8 @@ def __repr__(self): return f"" def __reduce__(self): + import multiprocessing + multiprocessing.context.assert_spawning(self) return Device._reconstruct, (self.device_id,) @staticmethod diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index c30ca6784..081900ce7 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -16,13 +16,9 @@ import abc import array import contextlib import cython -import multiprocessing -import multiprocessing.context -import multiprocessing.reduction import os import platform import sys -import uuid as uuid_module import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, 
default_stream @@ -32,8 +28,9 @@ if platform.system() == "Linux": import socket if TYPE_CHECKING: - import cuda.bindings.driver from ._device import Device + import cuda.bindings.driver + import uuid # TODO: define a memory property mixin class and make Buffer and # MemoryResource both inherit from it @@ -405,7 +402,7 @@ cdef class IPCAllocationHandle: raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. Please use MemoryResource APIs.") @classmethod - def _init(cls, handle: int, uuid: uuid_module.UUID): + def _init(cls, handle: int, uuid: uuid.UUID): cdef IPCAllocationHandle self = IPCAllocationHandle.__new__(cls) assert handle >= 0 self._handle = handle @@ -426,6 +423,7 @@ cdef class IPCAllocationHandle: self.close() def __reduce__(self): + import multiprocessing multiprocessing.context.assert_spawning(self) df = multiprocessing.reduction.DupFd(self.handle) return self._reconstruct, (df, self._uuid) @@ -446,7 +444,7 @@ cdef class IPCAllocationHandle: return self._handle @property - def uuid(self) -> uuid_module.UUID: + def uuid(self) -> uuid.UUID: return self._uuid @@ -527,7 +525,6 @@ class DeviceMemoryResourceAttributes: del mempool_property - # Holds DeviceMemoryResource objects imported by this process. # This enables buffer serialization, as buffers can reduce to a pair # of comprising the memory resource UUID (the key into this registry) @@ -651,6 +648,7 @@ class DeviceMemoryResource(MemoryResource): def __reduce__(self): # If spawning a new process, serialize the resources; otherwise, just # send the UUID, using the registry on the receiving end. 
+ import multiprocessing is_spawning = multiprocessing.context.get_spawning_popen() is not None if is_spawning: from ._device import Device @@ -661,13 +659,13 @@ class DeviceMemoryResource(MemoryResource): return DeviceMemoryResource.from_registry, (self.uuid,) @staticmethod - def from_registry(uuid: uuid_module.UUID): + def from_registry(uuid: uuid.UUID): try: return _ipc_registry[uuid] except KeyError: raise RuntimeError(f"Memory resource {uuid} was not found") from None - def register(self, uuid: uuid_module.UUID): + def register(self, uuid: uuid.UUID): if uuid not in _ipc_registry: assert self._uuid is None or self._uuid == uuid _ipc_registry[uuid] = self @@ -740,7 +738,8 @@ class DeviceMemoryResource(MemoryResource): raise_if_driver_error(err) try: assert self._uuid is None - self._uuid = uuid_module.uuid4() + import uuid as uuid + self._uuid = uuid.uuid4() self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) except: os.close(alloc_handle) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 984bb16e2..61e99091b 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -82,11 +82,6 @@ def test_main(self, device, ipc_memory_resource): process = mp.Process(target=self.child_main, args=(pipe, mr)) process.start() - # Send a device description. - pipe[0].put(device) - device_id = pipe[1].get(timeout=CHILD_TIMEOUT_SEC) - assert device_id == device.device_id - # Send a memory resource directly. This relies on the mr already # being passed when spawning the child. pipe[0].put(mr) @@ -105,9 +100,8 @@ def test_main(self, device, ipc_memory_resource): IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) def child_main(self, pipe, _): - # Device. - device = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) - pipe[1].put(device.device_id) + device = Device() + device.set_current() # Memory resource. 
mr = pipe[0].get(timeout=CHILD_TIMEOUT_SEC) diff --git a/cuda_core/tests/test_ipc_mempool.py b/cuda_core/tests/test_ipc_mempool.py deleted file mode 100644 index de436fd48..000000000 --- a/cuda_core/tests/test_ipc_mempool.py +++ /dev/null @@ -1,178 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -try: - from cuda.bindings import driver -except ImportError: - from cuda import cuda as driver - -import ctypes -import multiprocessing - -import pytest -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, IPCChannel, MemoryResource -from cuda.core.experimental._utils.cuda_utils import handle_return - -CHILD_TIMEOUT_SEC = 10 -NBYTES = 64 -POOL_SIZE = 2097152 - - -@pytest.fixture(scope="function") -def ipc_device(): - """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" - # Check if IPC is supported on this platform/device - device = Device() - device.set_current() - - if not device.properties.memory_pools_supported: - pytest.skip("Device does not support mempool operations") - - # Note: Linux specific. Once Windows support for IPC is implemented, this - # test should be updated. - if not device.properties.handle_type_posix_file_descriptor_supported: - pytest.skip("Device does not support IPC") - - return device - - -def test_ipc_mempool(ipc_device): - """Test IPC with memory pools.""" - # Set up the IPC-enabled memory pool and share it. - stream = ipc_device.create_stream() - mr = DeviceMemoryResource(ipc_device, dict(max_size=POOL_SIZE, ipc_enabled=True)) - assert mr.is_ipc_enabled - channel = IPCChannel() - mr.share_to_channel(channel) - - # Start the child process. - queue = multiprocessing.Queue() - process = multiprocessing.Process(target=child_main1, args=(channel, queue)) - process.start() - - # Allocate and fill memory. 
- buffer = mr.allocate(NBYTES, stream=stream) - protocol = IPCBufferTestProtocol(ipc_device, buffer, stream=stream) - protocol.fill_buffer(flipped=False) - stream.sync() - - # Export the buffer via IPC. - handle = buffer.export() - queue.put(handle) - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - # Verify that the buffer was modified. - protocol.verify_buffer(flipped=True) - - -def child_main1(channel, queue): - device = Device() - device.set_current() - stream = device.create_stream() - - mr = DeviceMemoryResource.from_shared_channel(device, channel) - handle = queue.get() # Get exported buffer data - buffer = Buffer.import_(mr, handle) - - protocol = IPCBufferTestProtocol(device, buffer, stream=stream) - protocol.verify_buffer(flipped=False) - protocol.fill_buffer(flipped=True) - stream.sync() - - -def test_shared_pool_errors(ipc_device): - """Test expected errors with allocating from a shared IPC memory pool.""" - # Set up the IPC-enabled memory pool and share it. - mr = DeviceMemoryResource(ipc_device, dict(max_size=POOL_SIZE, ipc_enabled=True)) - channel = IPCChannel() - mr.share_to_channel(channel) - - # Start a child process to generate error info. - queue = multiprocessing.Queue() - process = multiprocessing.Process(target=child_main2, args=(channel, queue)) - process.start() - - # Check the errors. - exc_type, exc_msg = queue.get(timeout=CHILD_TIMEOUT_SEC) - assert exc_type is TypeError - assert exc_msg == "Cannot allocate from shared memory pool imported via IPC" - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - -def child_main2(channel, queue): - """Child process that pushes IPC errors to a shared queue for testing.""" - device = Device() - device.set_current() - - mr = DeviceMemoryResource.from_shared_channel(device, channel) - - # Allocating from an imported pool. 
- try: - mr.allocate(NBYTES) - except Exception as e: - exc_info = type(e), str(e) - queue.put(exc_info) - - -class DummyUnifiedMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) - return Buffer.from_handle(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFree(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return True - - @property - def device_id(self) -> int: - return self.device - - -class IPCBufferTestProtocol: - """The protocol for verifying IPC. - - Provides methods to fill a buffer with one of two test patterns and verify - the expected values. - """ - - def __init__(self, device, buffer, nbytes=NBYTES, stream=None): - self.device = device - self.buffer = buffer - self.nbytes = nbytes - self.stream = stream if stream is not None else device.create_stream() - self.scratch_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.nbytes, stream=self.stream) - - def fill_buffer(self, flipped=False): - """Fill a device buffer with test pattern using unified memory.""" - ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) - op = (lambda i: 255 - i) if flipped else (lambda i: i) - for i in range(self.nbytes): - ptr[i] = ctypes.c_byte(op(i)) - self.buffer.copy_from(self.scratch_buffer, stream=self.stream) - - def verify_buffer(self, flipped=False): - """Verify the buffer contents.""" - self.scratch_buffer.copy_from(self.buffer, stream=self.stream) - self.stream.sync() - ptr = ctypes.cast(int(self.scratch_buffer.handle), ctypes.POINTER(ctypes.c_byte)) - op = (lambda i: 255 - i) if flipped else (lambda i: i) - for i in range(self.nbytes): - assert ctypes.c_byte(ptr[i]).value == 
ctypes.c_byte(op(i)).value, ( - f"Buffer contains incorrect data at index {i}" - ) From 4fb3d47b490457ad0adcef0b6ad2caa3ddfbef68 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 26 Sep 2025 15:28:25 -0700 Subject: [PATCH 15/25] Removes pickling for Device objects. Registers the pickle method with multiprocessing instead. --- cuda_core/cuda/core/experimental/_device.py | 26 +++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index d5814c13a..6abaee9f3 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import multiprocessing import threading from typing import Optional, Union @@ -1160,18 +1161,6 @@ def __int__(self): def __repr__(self): return f"" - def __reduce__(self): - import multiprocessing - multiprocessing.context.assert_spawning(self) - return Device._reconstruct, (self.device_id,) - - @staticmethod - def _reconstruct(device_id): - device = Device(device_id) - if not device._has_inited: - device.set_current() - return device - def set_current(self, ctx: Context = None) -> Union[Context, None]: """Set device to be used for GPU executions. @@ -1346,3 +1335,16 @@ def create_graph_builder(self) -> GraphBuilder: """ self._check_context_initialized() return GraphBuilder._init(stream=self.create_stream(), is_stream_owner=True) + + +def _reconstruct_device(device_id): + device = Device(device_id) + if not device._has_inited: + device.set_current() + return device + +def _reduce_device(device): + return _reconstruct_device, (device.device_id,) + +multiprocessing.reduction.register(Device, _reduce_device) + From c9f8c911c5e4a467a35531e24a08804a4471bab3 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 29 Sep 2025 12:14:31 -0700 Subject: [PATCH 16/25] Updates register function to return registered object. 
Avoids possible early deregistration. --- cuda_core/cuda/core/experimental/_memory.pyx | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 081900ce7..19f8eb4be 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -666,10 +666,13 @@ class DeviceMemoryResource(MemoryResource): raise RuntimeError(f"Memory resource {uuid} was not found") from None def register(self, uuid: uuid.UUID): - if uuid not in _ipc_registry: - assert self._uuid is None or self._uuid == uuid - _ipc_registry[uuid] = self - self._uuid = uuid + existing = _ipc_registry.get(uuid) + if existing is not None: + return existing + assert self._uuid is None or self._uuid == uuid + _ipc_registry[uuid] = self + self._uuid = uuid + return self def unregister(self): with contextlib.suppress(KeyError): @@ -716,7 +719,7 @@ class DeviceMemoryResource(MemoryResource): raise_if_driver_error(err) uuid = getattr(alloc_handle, 'uuid', None) if uuid is not None: - self.register(uuid) + self = self.register(uuid) return self def get_allocation_handle(self) -> IPCAllocationHandle: From 5dda1964a9905f4346fbe5b1ffb19d8361dd5870 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 29 Sep 2025 15:28:41 -0700 Subject: [PATCH 17/25] Renames Buffer import_/export methods. 
--- cuda_core/cuda/core/experimental/_device.py | 3 ++- cuda_core/cuda/core/experimental/_memory.pyx | 14 +++++++------- cuda_core/tests/memory_ipc/test_errors.py | 6 +++--- cuda_core/tests/memory_ipc/test_leaks.py | 2 +- cuda_core/tests/memory_ipc/test_memory_ipc.py | 6 +++--- cuda_core/tests/memory_ipc/test_serialize.py | 10 +++++----- cuda_core/tests/memory_ipc/test_workerpool.py | 7 ++++--- cuda_core/tests/test_memory.py | 4 ++-- 8 files changed, 27 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 6abaee9f3..1ae659e8d 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -1343,8 +1343,9 @@ def _reconstruct_device(device_id): device.set_current() return device + def _reduce_device(device): return _reconstruct_device, (device.device_id,) -multiprocessing.reduction.register(Device, _reduce_device) +multiprocessing.reduction.register(Device, _reduce_device) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 19f8eb4be..5ce6cd79d 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -29,7 +29,6 @@ if platform.system() == "Linux": if TYPE_CHECKING: from ._device import Device - import cuda.bindings.driver import uuid # TODO: define a memory property mixin class and make Buffer and @@ -75,7 +74,7 @@ cdef class Buffer: self.close() def __reduce__(self): - return Buffer.import_, (self.memory_resource, self.export()) + return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor()) cpdef close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. 
@@ -137,7 +136,7 @@ cdef class Buffer: return self._mr.device_id raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - def export(self) -> IPCBufferDescriptor: + def get_ipc_descriptor(self) -> IPCBufferDescriptor: """Export a buffer allocated for sharing between processes.""" if not self._mr.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") @@ -146,7 +145,7 @@ cdef class Buffer: return IPCBufferDescriptor._init(ptr.reserved, self.size) @classmethod - def import_(cls, mr: MemoryResource, ipc_buffer: IPCBufferDescriptor) -> Buffer: + def from_ipc_descriptor(cls, mr: MemoryResource, ipc_buffer: IPCBufferDescriptor) -> Buffer: """Import a buffer that was exported from another process.""" if not mr.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") @@ -621,7 +620,7 @@ class DeviceMemoryResource(MemoryResource): raise_if_driver_error(err) if opts.ipc_enabled: - self.get_allocation_handle() # enables Buffer.export, sets uuid + self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid def __del__(self): self.close() @@ -675,8 +674,9 @@ class DeviceMemoryResource(MemoryResource): return self def unregister(self): - with contextlib.suppress(KeyError): - del _ipc_registry[self.uuid] + if _ipc_registry is not None: + with contextlib.suppress(KeyError): + del _ipc_registry[self.uuid] @property def uuid(self): diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index b151f0edf..c2654b7f7 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -73,11 +73,11 @@ class TestImportWrongMR(ChildErrorHarness): def PARENT_ACTION(self, queue): mr2 = DeviceMemoryResource(self.device, dict(max_size=POOL_SIZE, ipc_enabled=True)) buffer = mr2.allocate(NBYTES) - queue.put([self.mr, buffer.export()]) # Note: mr does not own this buffer + queue.put([self.mr, 
buffer.get_ipc_descriptor()]) # Note: mr does not own this buffer def CHILD_ACTION(self, queue): mr, buffer_desc = queue.get(timeout=CHILD_TIMEOUT_SEC) - Buffer.import_(mr, buffer_desc) + Buffer.from_ipc_descriptor(mr, buffer_desc) def ASSERT(self, exc_type, exc_msg): assert exc_type is CUDAError @@ -110,7 +110,7 @@ def PARENT_ACTION(self, queue): def CHILD_ACTION(self, queue): buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) - Buffer.import_(self.mr, buffer) + Buffer.from_ipc_descriptor(self.mr, buffer) def ASSERT(self, exc_type, exc_msg): assert exc_type is TypeError diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index b5607097e..a9d09a672 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -80,7 +80,7 @@ def __reduce__(self): lambda mr: mr.get_allocation_handle(), lambda mr: mr, lambda mr: mr.allocate(NBYTES), - lambda mr: mr.allocate(NBYTES).export(), + lambda mr: mr.allocate(NBYTES).get_ipc_descriptor(), ], ids=["alloc_handle", "mr", "buffer", "buffer_desc"], ) diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index aa9aacef9..c5e10805d 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -109,8 +109,8 @@ def test_main(self, device, ipc_memory_resource): # Allocate and share memory. buf1 = mr.allocate(NBYTES) buf2 = mr.allocate(NBYTES) - q1.put(buf1.export()) - q2.put(buf2.export()) + q1.put(buf1.get_ipc_descriptor()) + q2.put(buf2.get_ipc_descriptor()) # Wait for children. 
p1.join(timeout=CHILD_TIMEOUT_SEC) @@ -130,7 +130,7 @@ def child_main(self, alloc_handle, idx, queue): device.set_current() mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) buffer_descriptor = queue.get(timeout=CHILD_TIMEOUT_SEC) - buffer = Buffer.import_(mr, buffer_descriptor) + buffer = Buffer.from_ipc_descriptor(mr, buffer_descriptor) IPCBufferTestHelper(device, buffer).fill_buffer(starting_from=idx) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 61e99091b..94338a55a 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -39,7 +39,7 @@ def test_main(self, device, ipc_memory_resource): parent_conn.send(buffer1) # directly buffer2 = mr.allocate(NBYTES) - parent_conn.send(buffer2.export()) # by descriptor + parent_conn.send(buffer2.get_ipc_descriptor()) # by descriptor # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) @@ -64,7 +64,7 @@ def child_main(self, conn): # Receive the buffers. buffer1 = conn.recv() # directly buffer_desc = conn.recv() - buffer2 = Buffer.import_(mr, buffer_desc) # by descriptor + buffer2 = Buffer.from_ipc_descriptor(mr, buffer_desc) # by descriptor # Modify the buffers. IPCBufferTestHelper(device, buffer1).fill_buffer(flipped=True) @@ -126,7 +126,7 @@ def test_object_passing(device, ipc_memory_resource): mr = ipc_memory_resource alloc_handle = mr.get_allocation_handle() buffer = mr.allocate(NBYTES) - buffer_desc = buffer.export() + buffer_desc = buffer.get_ipc_descriptor() helper = IPCBufferTestHelper(device, buffer) helper.fill_buffer(flipped=False) @@ -145,8 +145,8 @@ def child_main(device, alloc_handle, mr1, buffer_desc, buffer1): # OK to build the buffer from either mr and the descriptor. # All buffer* objects point to the same memory. 
- buffer2 = Buffer.import_(mr1, buffer_desc) - buffer3 = Buffer.import_(mr2, buffer_desc) + buffer2 = Buffer.from_ipc_descriptor(mr1, buffer_desc) + buffer3 = Buffer.from_ipc_descriptor(mr2, buffer_desc) helper1 = IPCBufferTestHelper(device, buffer1) helper2 = IPCBufferTestHelper(device, buffer2) diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index aeacd1707..6c9d9f2d8 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -36,7 +36,7 @@ def test_ipc_workerpool(self, device, ipc_memory_resource): mr = ipc_memory_resource buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: - pool.starmap(self.process_buffer, [(0, buffer.export()) for buffer in buffers]) + pool.starmap(self.process_buffer, [(0, buffer.get_ipc_descriptor()) for buffer in buffers]) for buffer in buffers: IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) @@ -49,7 +49,8 @@ def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: pool.starmap( - self.process_buffer, [(mrs.index(buffer.memory_resource), buffer.export()) for buffer in buffers] + self.process_buffer, + [(mrs.index(buffer.memory_resource), buffer.get_ipc_descriptor()) for buffer in buffers], ) for buffer in buffers: @@ -57,7 +58,7 @@ def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): def process_buffer(self, mr_idx, buffer_desc): device = Device() - buffer = Buffer.import_(g_mrs[mr_idx], buffer_desc) + buffer = Buffer.from_ipc_descriptor(g_mrs[mr_idx], buffer_desc) IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py 
index b23cd6d4b..f0b305f55 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -350,11 +350,11 @@ def test_mempool(mempool_device): mr.get_allocation_handle() with pytest.raises(RuntimeError, match=ipc_error_msg): - buffer.export() + buffer.get_ipc_descriptor() with pytest.raises(RuntimeError, match=ipc_error_msg): handle = IPCBufferDescriptor._init(b"", 0) - Buffer.import_(mr, handle) + Buffer.from_ipc_descriptor(mr, handle) buffer.close() From e54cb5b64b8824bd33569556863ee011b9ec19f8 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 30 Sep 2025 09:33:07 -0700 Subject: [PATCH 18/25] Moves AllocationHandle serialization to a registration with multiprocessing, since it depends on DupFd. --- cuda_core/cuda/core/experimental/_device.py | 8 +++---- cuda_core/cuda/core/experimental/_memory.pyx | 22 +++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 1ae659e8d..91ae7829c 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -1337,6 +1337,10 @@ def create_graph_builder(self) -> GraphBuilder: return GraphBuilder._init(stream=self.create_stream(), is_stream_owner=True) +def _reduce_device(device): + return _reconstruct_device, (device.device_id,) + + def _reconstruct_device(device_id): device = Device(device_id) if not device._has_inited: @@ -1344,8 +1348,4 @@ def _reconstruct_device(device_id): return device -def _reduce_device(device): - return _reconstruct_device, (device.device_id,) - - multiprocessing.reduction.register(Device, _reduce_device) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 5ce6cd79d..af4970704 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -16,6 +16,7 @@ import abc import array import contextlib import cython 
+import multiprocessing import os import platform import sys @@ -421,16 +422,6 @@ cdef class IPCAllocationHandle: """Close the handle.""" self.close() - def __reduce__(self): - import multiprocessing - multiprocessing.context.assert_spawning(self) - df = multiprocessing.reduction.DupFd(self.handle) - return self._reconstruct, (df, self._uuid) - - @classmethod - def _reconstruct(cls, df, uuid): - return cls._init(df.detach(), uuid) - def __int__(self) -> int: if self._handle < 0: raise ValueError( @@ -447,6 +438,17 @@ cdef class IPCAllocationHandle: return self._uuid +def _reduce_allocation_handle(alloc_handle): + df = multiprocessing.reduction.DupFd(alloc_handle.handle) + return _reconstruct_allocation_handle, (type(alloc_handle), df, alloc_handle.uuid) + +def _reconstruct_allocation_handle(cls, df, uuid): + return cls._init(df.detach(), uuid) + + +multiprocessing.reduction.register(IPCAllocationHandle, _reduce_allocation_handle) + + @dataclass cdef class DeviceMemoryResourceOptions: """Customizable :obj:`~_memory.DeviceMemoryResource` options. From 948af33273fbaf6fe71435c2ea3ccc317d96755e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 30 Sep 2025 09:46:00 -0700 Subject: [PATCH 19/25] Use DeviceMemoryResourceOptions throughout tests. 
--- cuda_core/tests/memory_ipc/conftest.py | 5 +++-- cuda_core/tests/memory_ipc/test_errors.py | 8 +++++--- cuda_core/tests/memory_ipc/test_send_buffers.py | 7 +++---- cuda_core/tests/memory_ipc/test_workerpool.py | 12 +++++------- cuda_core/tests/test_memory.py | 8 +++++--- 5 files changed, 21 insertions(+), 19 deletions(-) diff --git a/cuda_core/tests/memory_ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py index 2c3c881e3..0d4ada510 100644 --- a/cuda_core/tests/memory_ipc/conftest.py +++ b/cuda_core/tests/memory_ipc/conftest.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import pytest -from cuda.core.experimental import Device, DeviceMemoryResource +from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions POOL_SIZE = 2097152 @@ -27,6 +27,7 @@ def device(): @pytest.fixture def ipc_memory_resource(device): - mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mr = DeviceMemoryResource(device, options=options) assert mr.is_ipc_enabled return mr diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index c2654b7f7..d8e2af177 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -4,7 +4,7 @@ import multiprocessing import re -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from cuda.core.experimental._utils.cuda_utils import CUDAError CHILD_TIMEOUT_SEC = 4 @@ -71,7 +71,8 @@ class TestImportWrongMR(ChildErrorHarness): """Error when importing a buffer from the wrong memory resource.""" def PARENT_ACTION(self, queue): - mr2 = DeviceMemoryResource(self.device, dict(max_size=POOL_SIZE, ipc_enabled=True)) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mr2 = 
DeviceMemoryResource(self.device, options=options) buffer = mr2.allocate(NBYTES) queue.put([self.mr, buffer.get_ipc_descriptor()]) # Note: mr does not own this buffer @@ -124,7 +125,8 @@ class TestDanglingBuffer(ChildErrorHarness): """ def PARENT_ACTION(self, queue): - mr2 = DeviceMemoryResource(self.device, dict(max_size=POOL_SIZE, ipc_enabled=True)) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mr2 = DeviceMemoryResource(self.device, options=options) self.buffer = mr2.allocate(NBYTES) queue.put(self.buffer) # Note: mr2 not sent diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 4e2a9600d..f2fc5d070 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -4,7 +4,7 @@ import multiprocessing from itertools import cycle -from cuda.core.experimental import Device, DeviceMemoryResource +from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 4 @@ -41,9 +41,8 @@ def test_ipc_send_buffers(device, ipc_memory_resource): def test_ipc_send_buffers_multi(device, ipc_memory_resource): """Test passing buffers sourced from multiple memory resources.""" # Set up several IPC-enabled memory pools. - mrs = [ipc_memory_resource] + [ - DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) - ] + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] # Allocate and fill memory. 
buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index 6c9d9f2d8..f79a3ce32 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -4,7 +4,7 @@ import multiprocessing from itertools import cycle -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 4 @@ -43,9 +43,8 @@ def test_ipc_workerpool(self, device, ipc_memory_resource): def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): """Test IPC with a worker pool using multiple memory resources.""" - mrs = [ipc_memory_resource] + [ - DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) - ] + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: pool.starmap( @@ -88,9 +87,8 @@ def test_ipc_workerpool(self, device, ipc_memory_resource): def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): """Test IPC with a worker pool using multiple memory resources.""" - mrs = [ipc_memory_resource] + [ - DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=True)) for _ in range(NMRS - 1) - ] + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] with 
multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: pool.map(self.process_buffer, buffers) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index f0b305f55..922db5d5b 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -10,7 +10,7 @@ import platform import pytest -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions, MemoryResource from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor from cuda.core.experimental._utils.cuda_utils import handle_return @@ -304,7 +304,8 @@ def test_mempool(mempool_device): device = mempool_device # Test basic pool creation - mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=False)) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False) + mr = DeviceMemoryResource(device, options=options) assert mr.device_id == device.device_id assert mr.is_device_accessible assert not mr.is_host_accessible @@ -379,7 +380,8 @@ def test_mempool_attributes(ipc_enabled, mempool_device, property_name, expected if platform.system() == "Windows": return # IPC not implemented for Windows - mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE, ipc_enabled=ipc_enabled)) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=ipc_enabled) + mr = DeviceMemoryResource(device, options=options) assert mr.is_ipc_enabled == ipc_enabled # Get the property value From b91d98b0aa450a71bebec74739643bfa7bbd07ca Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 30 Sep 2025 11:37:02 -0700 Subject: [PATCH 20/25] Merged tests/memory_ipc/conftest.py into tests/conftest.py because certain configurations could not resolve it consistently. 
--- cuda_core/tests/conftest.py | 29 +++++++++++++++- cuda_core/tests/memory_ipc/conftest.py | 33 ------------------- cuda_core/tests/memory_ipc/test_errors.py | 4 +-- cuda_core/tests/memory_ipc/test_memory_ipc.py | 12 ++++--- .../tests/memory_ipc/test_send_buffers.py | 6 ++-- cuda_core/tests/memory_ipc/test_serialize.py | 9 +++-- cuda_core/tests/memory_ipc/test_workerpool.py | 12 ++++--- 7 files changed, 56 insertions(+), 49 deletions(-) delete mode 100644 cuda_core/tests/memory_ipc/conftest.py diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index c56c0a972..db9761a3c 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -10,7 +10,7 @@ import multiprocessing import pytest -from cuda.core.experimental import Device, _device +from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions, _device from cuda.core.experimental._utils.cuda_utils import handle_return @@ -70,4 +70,31 @@ def pop_all_contexts(): return pop_all_contexts +@pytest.fixture +def ipc_device(): + """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" + # Check if IPC is supported on this platform/device + device = Device() + device.set_current() + + if not device.properties.memory_pools_supported: + pytest.skip("Device does not support mempool operations") + + # Note: Linux specific. Once Windows support for IPC is implemented, this + # test should be updated. 
+ if not device.properties.handle_type_posix_file_descriptor_supported: + pytest.skip("Device does not support IPC") + + return device + + +@pytest.fixture +def ipc_memory_resource(ipc_device): + POOL_SIZE = 2097152 + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mr = DeviceMemoryResource(ipc_device, options=options) + assert mr.is_ipc_enabled + return mr + + skipif_need_cuda_headers = pytest.mark.skipif(helpers.CUDA_INCLUDE_PATH is None, reason="need CUDA header") diff --git a/cuda_core/tests/memory_ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py deleted file mode 100644 index 0d4ada510..000000000 --- a/cuda_core/tests/memory_ipc/conftest.py +++ /dev/null @@ -1,33 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -import pytest -from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions - -POOL_SIZE = 2097152 - - -@pytest.fixture -def device(): - """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" - # Check if IPC is supported on this platform/device - device = Device() - device.set_current() - - if not device.properties.memory_pools_supported: - pytest.skip("Device does not support mempool operations") - - # Note: Linux specific. Once Windows support for IPC is implemented, this - # test should be updated. 
- if not device.properties.handle_type_posix_file_descriptor_supported: - pytest.skip("Device does not support IPC") - - return device - - -@pytest.fixture -def ipc_memory_resource(device): - options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) - mr = DeviceMemoryResource(device, options=options) - assert mr.is_ipc_enabled - return mr diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index d8e2af177..aa103ebf9 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -16,11 +16,11 @@ class ChildErrorHarness: """Test harness for checking errors in child processes. Subclasses override PARENT_ACTION, CHILD_ACTION, and ASSERT (see below for examples).""" - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): """Parent process that checks child errors.""" # Attach fixtures to this object for convenience. These can be accessed # from PARENT_ACTION. - self.device = device + self.device = ipc_device self.mr = ipc_memory_resource # Start a child process to generate error info. diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index c5e10805d..85d8292cd 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -13,9 +13,10 @@ class TestIpcMempool: - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): """Test IPC with memory pools.""" # Set up the IPC-enabled memory pool and share it. + device = ipc_device mr = ipc_memory_resource # Start the child process. 
@@ -47,9 +48,10 @@ def child_main(self, mr, queue): class TestIPCMempoolMultiple: - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): """Test IPC with memory pools using multiple processes.""" # Construct an IPC-enabled memory resource and share it with two children. + device = ipc_device mr = ipc_memory_resource q1, q2 = (mp.Queue() for _ in range(2)) @@ -90,12 +92,13 @@ def child_main(self, mr, idx, queue): class TestIPCSharedAllocationHandleAndBufferDescriptors: - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): """ Demonstrate that a memory pool allocation handle can be reused for IPC with multiple processes. Uses buffer descriptors. """ # Set up the IPC-enabled memory pool and share it using one handle. + device = ipc_device mr = ipc_memory_resource alloc_handle = mr.get_allocation_handle() @@ -135,11 +138,12 @@ def child_main(self, alloc_handle, idx, queue): class TestIPCSharedAllocationHandleAndBufferObjects: - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): """ Demonstrate that a memory pool allocation handle can be reused for IPC with multiple processes. Uses buffer objects (not descriptors). """ + device = ipc_device mr = ipc_memory_resource alloc_handle = mr.get_allocation_handle() diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index f2fc5d070..b6cc631d8 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -14,8 +14,9 @@ POOL_SIZE = 2097152 -def test_ipc_send_buffers(device, ipc_memory_resource): +def test_ipc_send_buffers(ipc_device, ipc_memory_resource): """Test passing buffers directly to a child separately from a memory resource.""" + device = ipc_device mr = ipc_memory_resource # Allocate and fill memory. 
@@ -38,9 +39,10 @@ def test_ipc_send_buffers(device, ipc_memory_resource): helper.verify_buffer(flipped=True) -def test_ipc_send_buffers_multi(device, ipc_memory_resource): +def test_ipc_send_buffers_multi(ipc_device, ipc_memory_resource): """Test passing buffers sourced from multiple memory resources.""" # Set up several IPC-enabled memory pools. + device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 94338a55a..df9a85633 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -21,7 +21,8 @@ class TestObjectSerializationDirect: it on the other end and demonstrate buffer sharing. """ - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): + device = ipc_device mr = ipc_memory_resource # Start the child process. @@ -72,8 +73,9 @@ def child_main(self, conn): class TestObjectSerializationWithMR: - def test_main(self, device, ipc_memory_resource): + def test_main(self, ipc_device, ipc_memory_resource): """Test sending IPC memory objects to a child through a queue.""" + device = ipc_device mr = ipc_memory_resource # Start the child process. Sending the memory resource registers it so @@ -113,7 +115,7 @@ def child_main(self, pipe, _): IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) -def test_object_passing(device, ipc_memory_resource): +def test_object_passing(ipc_device, ipc_memory_resource): """ Test sending objects as arguments when starting a process. @@ -123,6 +125,7 @@ def test_object_passing(device, ipc_memory_resource): """ # Define the objects. 
+ device = ipc_device mr = ipc_memory_resource alloc_handle = mr.get_allocation_handle() buffer = mr.allocate(NBYTES) diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index f79a3ce32..aeaeaa69d 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -31,8 +31,9 @@ def init_worker(mrs): global g_mrs g_mrs = mrs - def test_ipc_workerpool(self, device, ipc_memory_resource): + def test_ipc_workerpool(self, ipc_device, ipc_memory_resource): """Test IPC with a worker pool.""" + device = ipc_device mr = ipc_memory_resource buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: @@ -41,8 +42,9 @@ def test_ipc_workerpool(self, device, ipc_memory_resource): for buffer in buffers: IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) - def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): + def test_ipc_workerpool_multi_mr(self, ipc_device, ipc_memory_resource): """Test IPC with a worker pool using multiple memory resources.""" + device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] @@ -75,8 +77,9 @@ def init_worker(mrs): global g_mrs g_mrs = mrs - def test_ipc_workerpool(self, device, ipc_memory_resource): + def test_ipc_workerpool(self, ipc_device, ipc_memory_resource): """Test IPC with a worker pool.""" + device = ipc_device mr = ipc_memory_resource buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: @@ -85,8 +88,9 @@ def test_ipc_workerpool(self, device, ipc_memory_resource): for buffer in buffers: 
IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) - def test_ipc_workerpool_multi_mr(self, device, ipc_memory_resource): + def test_ipc_workerpool_multi_mr(self, ipc_device, ipc_memory_resource): """Test IPC with a worker pool using multiple memory resources.""" + device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] From d28b52fcff5d41e7bf431e798e9ba08202b3d2ce Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 30 Sep 2025 12:13:28 -0700 Subject: [PATCH 21/25] Makes the psutil module an optional dependency for testing. --- cuda_core/tests/memory_ipc/test_leaks.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index a9d09a672..823da36db 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -5,7 +5,13 @@ import gc import multiprocessing as mp -import psutil +try: + import psutil +except ImportError: + HAVE_PSUTIL = False +else: + HAVE_PSUTIL = True + import pytest from cuda.core.experimental import _memory from cuda.core.experimental._utils.cuda_utils import driver @@ -14,10 +20,12 @@ NBYTES = 64 USING_FDS = _memory._IPC_HANDLE_TYPE == driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR -skip_unless_using_fds = pytest.mark.skipif(not USING_FDS, reason="mempool allocation handle is not using fds") +skip_if_unrunnable = pytest.mark.skipif( + not USING_FDS or not HAVE_PSUTIL, reason="mempool allocation handle is not using fds or psutil is unavailable" +) -@skip_unless_using_fds +@skip_if_unrunnable def test_alloc_handle(ipc_memory_resource): """Check for fd leaks in get_allocation_handle.""" mr = ipc_memory_resource @@ -73,7 +81,7 @@ def 
__reduce__(self): raise RuntimeError("Irreducible") -@skip_unless_using_fds +@skip_if_unrunnable @pytest.mark.parametrize( "getobject", [ From e0d0bf449d330db629f3c6325a9416bc7bdb951b Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 1 Oct 2025 15:13:59 -0700 Subject: [PATCH 22/25] Bump the child timeout for IPC tests. --- cuda_core/tests/memory_ipc/test_errors.py | 2 +- cuda_core/tests/memory_ipc/test_leaks.py | 2 +- cuda_core/tests/memory_ipc/test_memory_ipc.py | 2 +- cuda_core/tests/memory_ipc/test_send_buffers.py | 2 +- cuda_core/tests/memory_ipc/test_serialize.py | 2 +- cuda_core/tests/memory_ipc/test_workerpool.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index aa103ebf9..e0d87a0d0 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -7,7 +7,7 @@ from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from cuda.core.experimental._utils.cuda_utils import CUDAError -CHILD_TIMEOUT_SEC = 4 +CHILD_TIMEOUT_SEC = 20 NBYTES = 64 POOL_SIZE = 2097152 diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index 823da36db..387ca7042 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -16,7 +16,7 @@ from cuda.core.experimental import _memory from cuda.core.experimental._utils.cuda_utils import driver -CHILD_TIMEOUT_SEC = 4 +CHILD_TIMEOUT_SEC = 20 NBYTES = 64 USING_FDS = _memory._IPC_HANDLE_TYPE == driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 85d8292cd..da4678afe 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -6,7 +6,7 @@ from cuda.core.experimental import Buffer, 
Device, DeviceMemoryResource from utility import IPCBufferTestHelper -CHILD_TIMEOUT_SEC = 4 +CHILD_TIMEOUT_SEC = 20 NBYTES = 64 NWORKERS = 2 NTASKS = 2 diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index b6cc631d8..3e3f44865 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -7,7 +7,7 @@ from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper -CHILD_TIMEOUT_SEC = 4 +CHILD_TIMEOUT_SEC = 20 NBYTES = 64 NMRS = 3 NTASKS = 7 diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index df9a85633..ba0ac326f 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -8,7 +8,7 @@ from cuda.core.experimental import Buffer, Device, DeviceMemoryResource from utility import IPCBufferTestHelper -CHILD_TIMEOUT_SEC = 4 +CHILD_TIMEOUT_SEC = 20 NBYTES = 64 POOL_SIZE = 2097152 diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index aeaeaa69d..cd2259c4e 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -7,7 +7,7 @@ from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper -CHILD_TIMEOUT_SEC = 4 +CHILD_TIMEOUT_SEC = 20 NBYTES = 64 NWORKERS = 2 NMRS = 3 From fbcf3b311f851f74b3c2b83adf777df02e4c4430 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 2 Oct 2025 14:21:22 -0700 Subject: [PATCH 23/25] Add docstrings. Change is_imported to is_mapped. Register DeviceMemoryResource reduction with multiprocessing. Add a quick exit to from_allocation_handle. Simplify the worker pool tests based on the new reduction method. 
--- cuda_core/cuda/core/experimental/_memory.pyx | 151 ++++++++++++++---- cuda_core/docs/source/api.rst | 1 + cuda_core/docs/source/api_private.rst | 9 +- cuda_core/tests/memory_ipc/test_errors.py | 9 +- cuda_core/tests/memory_ipc/test_workerpool.py | 109 +++++++------ 5 files changed, 192 insertions(+), 87 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 78d738908..024ed3c3f 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -532,6 +532,7 @@ class DeviceMemoryResourceAttributes: del mempool_property + # Holds DeviceMemoryResource objects imported by this process. # This enables buffer serialization, as buffers can reduce to a pair # of comprising the memory resource UUID (the key into this registry) @@ -539,7 +540,8 @@ class DeviceMemoryResourceAttributes: _ipc_registry = {} class DeviceMemoryResource(MemoryResource): - """Create a device memory resource managing a stream-ordered memory pool. + """ + Create a device memory resource managing a stream-ordered memory pool. Parameters ---------- @@ -560,9 +562,63 @@ class DeviceMemoryResource(MemoryResource): When using an existing (current or default) memory pool, the returned device memory resource does not own the pool (`is_handle_owned` is `False`), and closing the resource has no effect. + + IPC-Enabled Memory Resources + ---------------------------- + If ``ipc_enabled=True`` is specified as an initializer option, the memory + resource constructed will be capable of sharing allocations between + processes. Sharing an allocation is a two-step procedure that involves + mapping a memory resource and then mapping buffers owned by that resource. + These steps can be accomplished in several ways. + + An IPC-enabled memory resource (MR) can allocate memory buffers but cannot + receive shared buffers. Mapping an MR to another process creates a "mapped + memory resource" (MMR). 
An MMR cannot allocate memory buffers and can only + receive shared buffers. MRs and MMRs are both of type + :class:`DeviceMemoryResource` and can be distinguished via + :attr:`DeviceMemoryResource.is_mapped`. + + An MR is shared via an allocation handle obtained by calling + :meth:`DeviceMemoryResource.get_allocation_handle`. The allocation handle + has a platform-specific interpretation; however, memory IPC is currently + only supported for Linux, and in that case allocation handles are file + descriptors. After sending an allocation handle to another process, it can + be used to create an MMR by invoking + :meth:`DeviceMemoryResource.from_allocation_handle`. + + Buffers can be shared as serializable descriptors obtained by calling + :meth:`Buffer.get_ipc_descriptor`. In a receiving process, a shared buffer is + created by invoking :meth:`Buffer.from_ipc_descriptor` with an MMR and + buffer descriptor, where the MMR corresponds to the MR that created the + described buffer. + + To help manage the association between memory resources and buffers, a + registry is provided. Every MR has a unique identifier (UUID). MMRs can be + registered by calling :meth:`DeviceMemoryResource.register` with the UUID + of the corresponding MR. Registered MMRs can be looked up via + :meth:`DeviceMemoryResource.from_registry`. When registering MMRs in this + way, the use of buffer descriptors can be avoided. Instead, buffer objects + can themselves be serialized and transferred directly. Serialization embeds + the UUID, which is used to locate the correct MMR during reconstruction. + + IPC-enabled memory resources interoperate with the :mod:`multiprocessing` + module to provide a simplified interface. This approach can avoid direct + use of allocation handles, buffer descriptors, MMRs, and the registry. 
When + using :mod:`multiprocessing` to spawn processes or send objects through + communication channels such as :class:`multiprocessing.Queue`, + :class:`multiprocessing.Pipe`, or :class:`multiprocessing.Connection`, + :class:`Buffer` objects may be sent directly, and in such cases the process + for creating MMRs and mapping buffers will be handled automatically. + + For greater efficiency when transferring many buffers, one may also send + MRs and buffers separately. When an MR is sent via :mod:`multiprocessing`, + an MMR is created and registered in the receiving process. Subsequently, + buffers may be serialized and transferred using ordinary :mod:`pickle` + methods. The reconstruction procedure uses the registry to find the + associated MMR. """ __slots__ = ("_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", - "_mempool_owned", "_is_imported", "_uuid", "_alloc_handle") + "_mempool_owned", "_is_mapped", "_uuid", "_alloc_handle") def __init__(self, device_id: int | Device, options=None): device_id = getattr(device_id, 'device_id', device_id) @@ -577,7 +633,7 @@ class DeviceMemoryResource(MemoryResource): self._attributes = None self._ipc_handle_type = _NOIPC_HANDLE_TYPE self._mempool_owned = False - self._is_imported = False + self._is_mapped = False self._uuid = None self._alloc_handle = None @@ -620,7 +676,7 @@ class DeviceMemoryResource(MemoryResource): self._attributes = None self._ipc_handle_type = properties.handleTypes self._mempool_owned = True - self._is_imported = False + self._is_mapped = False self._uuid = None self._alloc_handle = None @@ -641,38 +697,46 @@ class DeviceMemoryResource(MemoryResource): err, = driver.cuMemPoolDestroy(self._mempool_handle) raise_if_driver_error(err) finally: - self.unregister() + if self.is_mapped: + self.unregister() self._dev_id = None self._mempool_handle = None self._attributes = None self._ipc_handle_type = _NOIPC_HANDLE_TYPE self._mempool_owned = False - self._is_imported = False + self._is_mapped = 
False self._uuid = None self._alloc_handle = None def __reduce__(self): - # If spawning a new process, serialize the resources; otherwise, just - # send the UUID, using the registry on the receiving end. - import multiprocessing - is_spawning = multiprocessing.context.get_spawning_popen() is not None - if is_spawning: - from ._device import Device - device = Device(self.device_id) - alloc_handle = self.get_allocation_handle() - return DeviceMemoryResource.from_allocation_handle, (device, alloc_handle) - else: - return DeviceMemoryResource.from_registry, (self.uuid,) + return DeviceMemoryResource.from_registry, (self.uuid,) @staticmethod - def from_registry(uuid: uuid.UUID): + def from_registry(uuid: uuid.UUID) -> DeviceMemoryResource: + """ + Obtain a registered mapped memory resource. + + Raises + ------ + RuntimeError + If no mapped memory resource is found in the registry. + """ + try: return _ipc_registry[uuid] except KeyError: raise RuntimeError(f"Memory resource {uuid} was not found") from None - def register(self, uuid: uuid.UUID): + def register(self, uuid: uuid.UUID) -> DeviceMemoryResource: + """ + Register a mapped memory resource. + + Returns + ------- + The registered mapped memory resource. If one was previously registered + with the given key, it is returned. + """ existing = _ipc_registry.get(uuid) if existing is not None: return existing @@ -682,12 +746,18 @@ class DeviceMemoryResource(MemoryResource): return self def unregister(self): - if _ipc_registry is not None: + """Unregister this mapped memory resource.""" + assert self.is_mapped + if _ipc_registry is not None: # can occur during shutdown catastrophe with contextlib.suppress(KeyError): del _ipc_registry[self.uuid] @property - def uuid(self): + def uuid(self) -> Optional[uuid.UUID]: + """ + A universally unique identifier for this memory resource. Meaningful + only for IPC-enabled memory resources. 
+ """ return self._uuid @classmethod @@ -711,6 +781,12 @@ class DeviceMemoryResource(MemoryResource): ------- A new device memory resource instance with the imported handle. """ + # Quick exit for registry hits. + uuid = getattr(alloc_handle, 'uuid', None) + self = _ipc_registry.get(uuid) + if self is not None: + return self + device_id = getattr(device_id, 'device_id', device_id) self = cls.__new__(cls) @@ -719,15 +795,15 @@ class DeviceMemoryResource(MemoryResource): self._attributes = None self._ipc_handle_type = _IPC_HANDLE_TYPE self._mempool_owned = True - self._is_imported = True + self._is_mapped = True self._uuid = None self._alloc_handle = None # only used for non-imported err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) - uuid = getattr(alloc_handle, 'uuid', None) if uuid is not None: - self = self.register(uuid) + registered = self.register(uuid) + assert registered is self return self def get_allocation_handle(self) -> IPCAllocationHandle: @@ -743,13 +819,13 @@ class DeviceMemoryResource(MemoryResource): if self._alloc_handle is None: if not self.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") - if self._is_imported: + if self._is_mapped: raise RuntimeError("Imported memory resource cannot be exported") err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) raise_if_driver_error(err) try: assert self._uuid is None - import uuid as uuid + import uuid self._uuid = uuid.uuid4() self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) except: @@ -774,8 +850,8 @@ class DeviceMemoryResource(MemoryResource): The allocated buffer object, which is accessible on the device that this memory resource was created for. 
""" - if self._is_imported: - raise TypeError("Cannot allocate from shared memory pool imported via IPC") + if self._is_mapped: + raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") if stream is None: stream = default_stream() err, ptr = driver.cuMemAllocFromPoolAsync(size, self._mempool_handle, stream.handle) @@ -823,9 +899,12 @@ class DeviceMemoryResource(MemoryResource): return self._mempool_owned @property - def is_imported(self) -> bool: - """Whether the memory resource was imported from another process. If True, allocation is not permitted.""" - return self._is_imported + def is_mapped(self) -> bool: + """ + Whether this is a mapping of an IPC-enabled memory resource from + another process. If True, allocation is not permitted. + """ + return self._is_mapped @property def is_device_accessible(self) -> bool: @@ -843,6 +922,16 @@ class DeviceMemoryResource(MemoryResource): return self._ipc_handle_type != _NOIPC_HANDLE_TYPE +def _deep_reduce_device_memory_resource(mr): + from ._device import Device + device = Device(mr.device_id) + alloc_handle = mr.get_allocation_handle() + return mr.from_allocation_handle, (device, alloc_handle) + + +multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource) + + class LegacyPinnedMemoryResource(MemoryResource): """Create a pinned memory resource that uses legacy cuMemAllocHost/cudaMallocHost APIs. diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 9c93d0f75..f239c69cd 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -30,6 +30,7 @@ CUDA runtime :template: dataclass.rst + DeviceMemoryResourceOptions EventOptions GraphCompleteOptions GraphDebugPrintOptions diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index fb36e0a30..917b7101d 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -4,9 +4,9 @@ :orphan: .. 
This page is to generate documentation for private classes exposed to users, - i.e., users cannot instantiate it by themselves but may use it's properties - or methods via returned values from public APIs. These classes must be referred - in public APIs returning their instances. + i.e., users cannot instantiate them but may use their properties or methods + via returned values from public APIs. These classes must be referred in + public APIs returning their instances. .. currentmodule:: cuda.core.experimental @@ -18,8 +18,9 @@ CUDA runtime _memory.PyCapsule _memory.DevicePointerT - _memory.IPCBufferDescriptor _device.DeviceProperties + _memory.IPCAllocationHandle + _memory.IPCBufferDescriptor _module.KernelAttributes _module.KernelOccupancy _module.ParamInfo diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index e0d87a0d0..d6b1dc86d 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import multiprocessing +import pickle import re from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions @@ -64,7 +65,7 @@ def CHILD_ACTION(self, queue): def ASSERT(self, exc_type, exc_msg): assert exc_type is TypeError - assert exc_msg == "Cannot allocate from shared memory pool imported via IPC" + assert exc_msg == "Cannot allocate from a mapped IPC-enabled memory resource" class TestImportWrongMR(ChildErrorHarness): @@ -128,11 +129,13 @@ def PARENT_ACTION(self, queue): options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mr2 = DeviceMemoryResource(self.device, options=options) self.buffer = mr2.allocate(NBYTES) - queue.put(self.buffer) # Note: mr2 not sent + buffer_s = pickle.dumps(self.buffer) + queue.put(buffer_s) # Note: mr2 not sent def CHILD_ACTION(self, queue): Device().set_current() - queue.get(timeout=CHILD_TIMEOUT_SEC) + buffer_s = 
queue.get(timeout=CHILD_TIMEOUT_SEC) + pickle.loads(buffer_s) def ASSERT(self, exc_type, exc_msg): assert exc_type is TypeError assert exc_msg == "Cannot allocate from a mapped IPC-enabled memory resource" class TestImportWrongMR(ChildErrorHarness): diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index cd2259c4e..b7be23267 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -1,8 +1,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import multiprocessing from itertools import cycle +import multiprocessing as mp +import pickle +import pytest from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper @@ -14,44 +16,58 @@ NTASKS = 20 POOL_SIZE = 2097152 -# Global memory resources, set in children. -g_mrs = None - -class TestIpcWorkerPoolUsingExport: +class TestIpcWorkerPool: """ - Test buffer sharing using export handles. + Map a function over shared buffers using a worker pool to distribute work. - The memory resources need to be passed to subprocesses at startup. Buffers - are passed by their handles and reconstructed using the corresponding mr. + This demonstrates the simplest interface, though not the most efficient + one. Each buffer transfer involves a deep transfer of the associated memory + resource (duplicates are ignored on the receiving end). 
""" - @staticmethod - def init_worker(mrs): - global g_mrs - g_mrs = mrs - - def test_ipc_workerpool(self, ipc_device, ipc_memory_resource): - """Test IPC with a worker pool.""" + @pytest.mark.parametrize("nmrs", (1, NMRS)) + def test_main(self, ipc_device, nmrs): device = ipc_device - mr = ipc_memory_resource - buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] - with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: - pool.starmap(self.process_buffer, [(0, buffer.get_ipc_descriptor()) for buffer in buffers]) + options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) + mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)] + buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] + + with mp.Pool(NWORKERS) as pool: + pool.map(self.process_buffer, buffers) for buffer in buffers: IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) - def test_ipc_workerpool_multi_mr(self, ipc_device, ipc_memory_resource): - """Test IPC with a worker pool using multiple memory resources.""" + def process_buffer(self, buffer): + device = Device() + IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) + + +class TestIpcWorkerPoolUsingIPCDescriptors: + """ + Test buffer sharing using IPC descriptors. + + The memory resources are passed to subprocesses at startup. Buffers are + passed by their handles and reconstructed using the corresponding resource. 
+ """ + + @staticmethod + def init_worker(mrs): + """Called during child process initialization to store received memory resources.""" + TestIpcWorkerPoolUsingIPCDescriptors.mrs = mrs + + @pytest.mark.parametrize("nmrs", (1, NMRS)) + def test_main(self, ipc_device, nmrs): device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) - mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] + mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)] buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] - with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: + + with mp.Pool(NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: pool.starmap( self.process_buffer, - [(mrs.index(buffer.memory_resource), buffer.get_ipc_descriptor()) for buffer in buffers], + [(mrs.index(buffer.memory_resource), buffer.get_ipc_descriptor()) for buffer in buffers] ) for buffer in buffers: @@ -59,47 +75,42 @@ def test_ipc_workerpool_multi_mr(self, ipc_device, ipc_memory_resource): def process_buffer(self, mr_idx, buffer_desc): device = Device() - buffer = Buffer.from_ipc_descriptor(g_mrs[mr_idx], buffer_desc) + buffer = Buffer.from_ipc_descriptor(self.mrs[mr_idx], buffer_desc) IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) -class TestIpcWorkerPool: +class TestIpcWorkerPoolUsingRegistry: """ - Test buffer sharing without using export handles. + Test buffer sharing using the memory resource registry. - The memory resources need to be passed to subprocesses at startup. Buffers - are serialized with the `uuid` of the corresponding mr, and the - import/export is handled automatically. + The memory resources are passed to subprocesses at startup, which + implicitly registers them. Buffers are passed via serialization and matched + to the corresponding memory resource through the registry. 
This is more + complicated than the simple example (first, above) but passes buffers more + efficiently. """ @staticmethod def init_worker(mrs): - global g_mrs - g_mrs = mrs + # Passing mrs implicitly registers them. + pass - def test_ipc_workerpool(self, ipc_device, ipc_memory_resource): - """Test IPC with a worker pool.""" - device = ipc_device - mr = ipc_memory_resource - buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] - with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=([mr],)) as pool: - pool.map(self.process_buffer, buffers) - - for buffer in buffers: - IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) - - def test_ipc_workerpool_multi_mr(self, ipc_device, ipc_memory_resource): - """Test IPC with a worker pool using multiple memory resources.""" + @pytest.mark.parametrize("nmrs", (1, NMRS)) + def test_main(self, ipc_device, nmrs): device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) - mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] + mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)] buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] - with multiprocessing.Pool(processes=NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: - pool.map(self.process_buffer, buffers) + + with mp.Pool(NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: + pool.map(self.process_buffer, [pickle.dumps(buffer) for buffer in buffers] + ) for buffer in buffers: IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) - def process_buffer(self, buffer): + def process_buffer(self, buffer_s): device = Device() + buffer = pickle.loads(buffer_s) IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) + From e5b8542a86bbdff1011ada605494e4d7e5f7330a Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 2 Oct 2025 16:56:04 -0700 Subject: [PATCH 24/25] Remove 
call to set_current in Device reconstruction. Add device set-up to tests. --- cuda_core/cuda/core/experimental/_device.py | 18 ++----- cuda_core/cuda/core/experimental/_memory.pyx | 2 +- cuda_core/tests/memory_ipc/test_errors.py | 5 +- cuda_core/tests/memory_ipc/test_leaks.py | 4 +- cuda_core/tests/memory_ipc/test_memory_ipc.py | 30 ++++++------ .../tests/memory_ipc/test_send_buffers.py | 47 ++++++------------- cuda_core/tests/memory_ipc/test_serialize.py | 9 ++-- cuda_core/tests/memory_ipc/test_workerpool.py | 25 +++++----- 8 files changed, 55 insertions(+), 85 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 91ae7829c..be8c5170a 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -import multiprocessing import threading from typing import Optional, Union @@ -1161,6 +1160,9 @@ def __int__(self): def __repr__(self): return f"" + def __reduce__(self): + return Device, (self.device_id,) + def set_current(self, ctx: Context = None) -> Union[Context, None]: """Set device to be used for GPU executions. 
@@ -1335,17 +1337,3 @@ def create_graph_builder(self) -> GraphBuilder: """ self._check_context_initialized() return GraphBuilder._init(stream=self.create_stream(), is_stream_owner=True) - - -def _reduce_device(device): - return _reconstruct_device, (device.device_id,) - - -def _reconstruct_device(device_id): - device = Device(device_id) - if not device._has_inited: - device.set_current() - return device - - -multiprocessing.reduction.register(Device, _reduce_device) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 024ed3c3f..0b9f8a28b 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -923,7 +923,7 @@ class DeviceMemoryResource(MemoryResource): def _deep_reduce_device_memory_resource(mr): - from ._device import Device + from . import Device device = Device(mr.device_id) alloc_handle = mr.get_allocation_handle() return mr.from_allocation_handle, (device, alloc_handle) diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index d6b1dc86d..3e8265b39 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -43,6 +43,7 @@ def test_main(self, ipc_device, ipc_memory_resource): def child_main(self, pipe, device, mr): """Child process that pushes IPC errors to a shared pipe for testing.""" self.device = device + self.device.set_current() self.mr = mr try: self.CHILD_ACTION(pipe[0]) @@ -129,13 +130,13 @@ def PARENT_ACTION(self, queue): options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mr2 = DeviceMemoryResource(self.device, options=options) self.buffer = mr2.allocate(NBYTES) - buffer_s = pickle.dumps(self.buffer) + buffer_s = pickle.dumps(self.buffer) # noqa: S301 queue.put(buffer_s) # Note: mr2 not sent def CHILD_ACTION(self, queue): Device().set_current() buffer_s = queue.get(timeout=CHILD_TIMEOUT_SEC) - pickle.loads(buffer_s) + 
pickle.loads(buffer_s) # noqa: S301 def ASSERT(self, exc_type, exc_msg): assert exc_type is RuntimeError diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index 387ca7042..bfead7dd3 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -33,7 +33,7 @@ def test_alloc_handle(ipc_memory_resource): [mr.get_allocation_handle() for _ in range(10)] -def exec_with_object(obj, number=1): +def exec_success(obj, number=1): """Succesfully run a child process.""" for _ in range(number): process = mp.Process(target=child_main, args=(obj,)) @@ -92,7 +92,7 @@ def __reduce__(self): ], ids=["alloc_handle", "mr", "buffer", "buffer_desc"], ) -@pytest.mark.parametrize("launcher", [exec_with_object, exec_launch_failure, exec_reduce_failure]) +@pytest.mark.parametrize("launcher", [exec_success, exec_launch_failure, exec_reduce_failure]) def test_pass_object(ipc_memory_resource, launcher, getobject): """Check for fd leaks when an object is sent as a subprocess argument.""" mr = ipc_memory_resource diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index da4678afe..9ed24792b 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -3,7 +3,7 @@ import multiprocessing as mp -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from cuda.core.experimental import Buffer, DeviceMemoryResource from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 20 @@ -21,7 +21,7 @@ def test_main(self, ipc_device, ipc_memory_resource): # Start the child process. queue = mp.Queue() - process = mp.Process(target=self.child_main, args=(mr, queue)) + process = mp.Process(target=self.child_main, args=(device, mr, queue)) process.start() # Allocate and fill memory. @@ -39,8 +39,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Verify that the buffer was modified. 
helper.verify_buffer(flipped=True) - def child_main(self, mr, queue): - device = Device() + def child_main(self, device, mr, queue): + device.set_current() buffer = queue.get(timeout=CHILD_TIMEOUT_SEC) helper = IPCBufferTestHelper(device, buffer) helper.verify_buffer(flipped=False) @@ -64,8 +64,8 @@ def test_main(self, ipc_device, ipc_memory_resource): q2.put(buffer2) # Start the child processes. - p1 = mp.Process(target=self.child_main, args=(mr, 1, q1)) - p2 = mp.Process(target=self.child_main, args=(mr, 2, q2)) + p1 = mp.Process(target=self.child_main, args=(device, mr, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(device, mr, 2, q2)) p1.start() p2.start() @@ -79,10 +79,10 @@ def test_main(self, ipc_device, ipc_memory_resource): IPCBufferTestHelper(device, buffer1).verify_buffer(flipped=False) IPCBufferTestHelper(device, buffer2).verify_buffer(flipped=True) - def child_main(self, mr, idx, queue): + def child_main(self, device, mr, idx, queue): # Note: passing the mr registers it so that buffers can be passed # directly. - device = Device() + device.set_current() buffer1 = queue.get(timeout=CHILD_TIMEOUT_SEC) buffer2 = queue.get(timeout=CHILD_TIMEOUT_SEC) if idx == 1: @@ -104,8 +104,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Start children. 
q1, q2 = (mp.Queue() for _ in range(2)) - p1 = mp.Process(target=self.child_main, args=(alloc_handle, 1, q1)) - p2 = mp.Process(target=self.child_main, args=(alloc_handle, 2, q2)) + p1 = mp.Process(target=self.child_main, args=(device, alloc_handle, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(device, alloc_handle, 2, q2)) p1.start() p2.start() @@ -125,11 +125,10 @@ def test_main(self, ipc_device, ipc_memory_resource): IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - def child_main(self, alloc_handle, idx, queue): + def child_main(self, device, alloc_handle, idx, queue): """Fills a shared memory buffer.""" # In this case, the device needs to be set up (passing the mr does it # implicitly in other tests). - device = Device() device.set_current() mr = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) buffer_descriptor = queue.get(timeout=CHILD_TIMEOUT_SEC) @@ -149,8 +148,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Start children. q1, q2 = (mp.Queue() for _ in range(2)) - p1 = mp.Process(target=self.child_main, args=(alloc_handle, 1, q1)) - p2 = mp.Process(target=self.child_main, args=(alloc_handle, 2, q2)) + p1 = mp.Process(target=self.child_main, args=(device, alloc_handle, 1, q1)) + p2 = mp.Process(target=self.child_main, args=(device, alloc_handle, 2, q2)) p1.start() p2.start() @@ -170,9 +169,8 @@ def test_main(self, ipc_device, ipc_memory_resource): IPCBufferTestHelper(device, buf1).verify_buffer(starting_from=1) IPCBufferTestHelper(device, buf2).verify_buffer(starting_from=2) - def child_main(self, alloc_handle, idx, queue): + def child_main(self, device, alloc_handle, idx, queue): """Fills a shared memory buffer.""" - device = Device() device.set_current() # Register the memory resource. 
diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 3e3f44865..966b6eafc 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -1,10 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import multiprocessing +import multiprocessing as mp from itertools import cycle -from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions +import pytest +from cuda.core.experimental import DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper CHILD_TIMEOUT_SEC = 20 @@ -14,37 +15,13 @@ POOL_SIZE = 2097152 -def test_ipc_send_buffers(ipc_device, ipc_memory_resource): - """Test passing buffers directly to a child separately from a memory resource.""" - device = ipc_device - mr = ipc_memory_resource - - # Allocate and fill memory. - buffers = [mr.allocate(NBYTES) for _ in range(NTASKS)] - for buffer in buffers: - helper = IPCBufferTestHelper(device, buffer) - helper.fill_buffer(flipped=False) - - # Start the child process. Send the buffer directly. - process = multiprocessing.Process(target=child_main, args=(buffers,)) - process.start() - - # Wait for the child process. - process.join(timeout=CHILD_TIMEOUT_SEC) - assert process.exitcode == 0 - - # Verify that the buffers were modified. - for buffer in buffers: - helper = IPCBufferTestHelper(device, buffer) - helper.verify_buffer(flipped=True) - - -def test_ipc_send_buffers_multi(ipc_device, ipc_memory_resource): +@pytest.mark.parametrize("nmrs", (1, NMRS)) +def test_ipc_send_buffers(ipc_device, nmrs): """Test passing buffers sourced from multiple memory resources.""" # Set up several IPC-enabled memory pools. 
device = ipc_device options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) - mrs = [ipc_memory_resource] + [DeviceMemoryResource(device, options=options) for _ in range(NMRS - 1)] + mrs = [DeviceMemoryResource(device, options=options) for _ in range(NMRS)] # Allocate and fill memory. buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] @@ -53,7 +30,13 @@ def test_ipc_send_buffers_multi(ipc_device, ipc_memory_resource): helper.fill_buffer(flipped=False) # Start the child process. - process = multiprocessing.Process(target=child_main, args=(buffers,)) + process = mp.Process( + target=child_main, + args=( + device, + buffers, + ), + ) process.start() # Wait for the child process. @@ -66,8 +49,8 @@ def test_ipc_send_buffers_multi(ipc_device, ipc_memory_resource): helper.verify_buffer(flipped=True) -def child_main(buffers): - device = Device() +def child_main(device, buffers): + device.set_current() for buffer in buffers: helper = IPCBufferTestHelper(device, buffer) helper.verify_buffer(flipped=False) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index ba0ac326f..2d88bcd03 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -33,7 +33,6 @@ def test_main(self, ipc_device, ipc_memory_resource): # Send a memory resource by allocation handle. alloc_handle = mr.get_allocation_handle() mp.reduction.send_handle(parent_conn, alloc_handle.handle, process.pid) - parent_conn.send(mr.uuid) # Send a buffer. buffer1 = mr.allocate(NBYTES) @@ -57,9 +56,7 @@ def child_main(self, conn): # Receive the memory resource. handle = mp.reduction.recv_handle(conn) - uuid = conn.recv() mr = DeviceMemoryResource.from_allocation_handle(device, handle) - mr.register(uuid) os.close(handle) # Receive the buffers. 
@@ -135,7 +132,7 @@ def test_object_passing(ipc_device, ipc_memory_resource): helper.fill_buffer(flipped=False) # Start the child process. - process = mp.Process(target=child_main, args=(device, alloc_handle, mr, buffer_desc, buffer)) + process = mp.Process(target=child_main, args=(alloc_handle, mr, buffer_desc, buffer)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) assert process.exitcode == 0 @@ -143,7 +140,9 @@ def test_object_passing(ipc_device, ipc_memory_resource): helper.verify_buffer(flipped=True) -def child_main(device, alloc_handle, mr1, buffer_desc, buffer1): +def child_main(alloc_handle, mr1, buffer_desc, buffer1): + device = Device() + device.set_current() mr2 = DeviceMemoryResource.from_allocation_handle(device, alloc_handle) # OK to build the buffer from either mr and the descriptor. diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index b7be23267..401324e05 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -1,11 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from itertools import cycle import multiprocessing as mp import pickle -import pytest +from itertools import cycle +import pytest from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from utility import IPCBufferTestHelper @@ -40,7 +40,8 @@ def test_main(self, ipc_device, nmrs): IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) def process_buffer(self, buffer): - device = Device() + device = Device(buffer.memory_resource.device_id) + device.set_current() IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) @@ -67,15 +68,17 @@ def test_main(self, ipc_device, nmrs): with mp.Pool(NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: pool.starmap( self.process_buffer, - [(mrs.index(buffer.memory_resource), buffer.get_ipc_descriptor()) for buffer in buffers] + [(mrs.index(buffer.memory_resource), buffer.get_ipc_descriptor()) for buffer in buffers], ) for buffer in buffers: IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) def process_buffer(self, mr_idx, buffer_desc): - device = Device() - buffer = Buffer.from_ipc_descriptor(self.mrs[mr_idx], buffer_desc) + mr = self.mrs[mr_idx] + device = Device(mr.device_id) + device.set_current() + buffer = Buffer.from_ipc_descriptor(mr, buffer_desc) IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) @@ -103,14 +106,12 @@ def test_main(self, ipc_device, nmrs): buffers = [mr.allocate(NBYTES) for mr, _ in zip(cycle(mrs), range(NTASKS))] with mp.Pool(NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: - pool.map(self.process_buffer, [pickle.dumps(buffer) for buffer in buffers] - ) + pool.starmap(self.process_buffer, [(device, pickle.dumps(buffer)) for buffer in buffers]) for buffer in buffers: IPCBufferTestHelper(device, buffer).verify_buffer(flipped=True) - def process_buffer(self, buffer_s): - device = Device() - buffer = pickle.loads(buffer_s) + def process_buffer(self, 
device, buffer_s): + device.set_current() + buffer = pickle.loads(buffer_s) # noqa: S301 IPCBufferTestHelper(device, buffer).fill_buffer(flipped=True) - From 534b16ae9a117b131a2f9d953c20d1d388baae1c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Fri, 3 Oct 2025 01:49:24 +0000 Subject: [PATCH 25/25] fix docstring rendering --- cuda_core/cuda/core/experimental/_memory.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 0b9f8a28b..3fdc1410f 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -563,15 +563,15 @@ class DeviceMemoryResource(MemoryResource): device memory resource does not own the pool (`is_handle_owned` is `False`), and closing the resource has no effect. - IPC-Enabled Memory Resources - ---------------------------- - If ``ipc_enabled=True`` is specified as an initializer option, the memory - resource constructed will be capable of sharing allocations between - processes. Sharing an allocation is a two-step procedure that involves + Notes + ----- + To create an IPC-Enabled memory resource (MR) that is capable of sharing + allocations between processes, specify ``ipc_enabled=True`` in the initializer + option. Sharing an allocation is a two-step procedure that involves mapping a memory resource and then mapping buffers owned by that resource. These steps can be accomplished in several ways. - An IPC-enabled memory resource (MR) can allocate memory buffers but cannot + An IPC-enabled memory resource can allocate memory buffers but cannot receive shared buffers. Mapping an MR to another process creates a "mapped memory resource" (MMR). An MMR cannot allocate memory buffers and can only receive shared buffers. MRs and MMRs are both of type