Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 65 additions & 1 deletion cuda_core/tests/memory_ipc/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
import pickle
import re

import pytest
from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions
from cuda.core.experimental._utils.cuda_utils import CUDAError
from cuda.core.experimental._utils.cuda_utils import CUDAError, driver

from cuda_python_test_helpers import supports_ipc_mempool

Expand Down Expand Up @@ -147,3 +148,66 @@ def CHILD_ACTION(self, queue):
def ASSERT(self, exc_type, exc_msg):
assert exc_type is RuntimeError
assert re.match(r"Memory resource [a-z0-9-]+ was not found", exc_msg)


def test_error_in_close_memory_resource(ipc_memory_resource):
"""Test that errors when closing a memory resource are raised."""
mr = ipc_memory_resource
driver.cuMemPoolDestroy(mr.handle)
with pytest.raises(CUDAError, match=".*CUDA_ERROR_INVALID_VALUE.*"):
mr.close()
Comment on lines +153 to +158
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is illegal and I disagree we need to test this. This is Python and we can't possibly guard against all kinds of bizarre ways of trying to mutate the state of our Python objects behind our back. In particular, as noted in both #1074 (comment) and offline discussion, errors like CUDA_ERROR_INVALID_VALUE are due to multiple frees. I thought we've moved on?

This test is just another instance of the same class of errors: We free the handle of an object through a direct C API call, bypassing our safeguard mechanism (under the hood we do check if the handle is already null before freeing, and then after free we set the handle to null to avoid double free), so our destructor kicks in again, either through an explicit close() call or implicitly when going out of scope.

Copy link
Contributor

@cpcloud cpcloud Oct 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that this test is asserting that different levels of APIs offer the same guarantees, which would make developing layers of APIs really really difficult.

It seems roughly analogous to calling into the Python C API through ctypes, and expecting Python to somehow know you didn't mean to cause a segmentation violation:

❯ python3.13 -q
>>> x = 1
>>> import ctypes
>>> ctypes.pythonapi.Py_DecRef(x)
zsh: segmentation fault (core dumped)  python3.13 -q

Copy link
Contributor

@cpcloud cpcloud Oct 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would guess there's probably another way to write this test such that the behavior is buggy without crossing into the C abyss of naked bindings.

Would it be enough to just call close() twice? That seems like something we should perhaps be robust to if we're not already:

❯ python -q
>>> f = open('/tmp/x', 'w')
>>> f.close()
>>> f.close()



@pytest.mark.xfail
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this work here?

@pytest.mark.xfail(reason="Issue #1118", strict=True)

The important part is strict=True.

@pytest.mark.filterwarnings("ignore") # Test incorrectly warns
def test_error_in_close_buffer(ipc_memory_resource):
"""Test that errors when closing a memory buffer are raised."""
mr = ipc_memory_resource
buffer = mr.allocate(NBYTES)
driver.cuMemFree(buffer.handle)
with pytest.raises(CUDAError, match=".*CUDA_ERROR_INVALID_VALUE.*"):
buffer.close()
Comment on lines +167 to +169
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto



@pytest.mark.xfail
@pytest.mark.filterwarnings("ignore") # Test incorrectly warns
class TestErrorInCloseImportedMemoryResource(ChildErrorHarness):
"""Test that errors when closing an imported memory resource are raised."""

def PARENT_ACTION(self, queue):
pass

def CHILD_ACTION(self, queue):
try:
driver.cuMemPoolDestroy(self.mr.handle)
except Exception: # noqa: S110
pass
else:
self.mr.close()
Comment on lines +181 to +186
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto


def ASSERT(self, exc_type, exc_msg):
assert exc_type is CUDAError
assert re.match(r"CUDA_ERROR_INVALID_VALUE", exc_msg)


@pytest.mark.xfail
@pytest.mark.filterwarnings("ignore") # Test incorrectly warns
class TestErrorInCloseImportedBuffer(ChildErrorHarness):
"""Test that errors when closing an imported buffer are raised."""

def PARENT_ACTION(self, queue):
self.buffer = self.mr.allocate(NBYTES)
queue.put(self.buffer)

def CHILD_ACTION(self, queue):
buffer = queue.get(timeout=CHILD_TIMEOUT_SEC)
try:
driver.cuMemFree(buffer.handle)
except Exception: # noqa: S110
pass
else:
buffer.close()

def ASSERT(self, exc_type, exc_msg):
assert exc_type is CUDAError
assert re.match(r"CUDA_ERROR_INVALID_VALUE", exc_msg)