From df61e4942217dda845d89bb959047631e6490bc5 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 21 May 2026 12:01:20 -0400 Subject: [PATCH 1/4] tests: kill zombie IPC child processes after join timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Python 3.12 CI runs, env-vars enables compute-sanitizer with --target-processes=all, which attaches to every mp.Process child the tests spawn. On CUDA 12.9.1 the sanitizer analysis of IPC buffer teardown gets stuck, so child processes never exit. The existing join(timeout=CHILD_TIMEOUT_SEC) returns but leaves the child alive. That zombie keeps its IPC handle open. When pytest teardown runs ipc_memory_resource's mr.close(), it blocks waiting for the handle to be released — tying up the runner for hours until GitHub Actions force-cancels the job. This is the exact pattern in issue #2004 (always Python 3.12 + CUDA 12.9.1 local). Fix: after join(timeout=...), kill any process still alive so the IPC handle is released before fixture teardown. Tests still fail (exit code is non-zero or completed is False), just in seconds rather than hours. Fixes #2004 --- .../tests/memory_ipc/test_send_buffers.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 041a8539da3..e0a92e6ee33 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -39,6 +39,13 @@ def test_main(self, ipc_device, nmrs): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + # Child is stuck (CUDA context teardown can hang under certain + # driver/Python combos — see issue #2004). Kill it so the + # IPC handle is released and the fixture teardown doesn't + # block the runner for hours. 
+ process.kill() + process.join() assert process.exitcode == 0 # Verify that the buffers were modified. @@ -96,10 +103,20 @@ def test_main(self, ipc_device, ipc_memory_resource): proc_c.start() # Wait for C to signal completion then clean up. - event_c.wait(timeout=CHILD_TIMEOUT_SEC) + completed = event_c.wait(timeout=CHILD_TIMEOUT_SEC) event_b.set() # b can finish now proc_b.join(timeout=CHILD_TIMEOUT_SEC) proc_c.join(timeout=CHILD_TIMEOUT_SEC) + + # Kill any processes that are still alive. Without this, a child stuck + # in CUDA context teardown (issue #2004: Python 3.12 + CUDA 12.9.1) + # holds IPC handles and blocks fixture teardown indefinitely. + for p in (proc_b, proc_c): + if p.is_alive(): + p.kill() + p.join() + + assert completed, "process C did not complete within timeout" assert proc_b.exitcode == 0 assert proc_c.exitcode == 0 From 5b802a0f02cfaf01d18c7fc6b3e64e8b4e9dc961 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 21 May 2026 12:27:32 -0400 Subject: [PATCH 2/4] tests: kill zombie IPC child processes after join timeout When Python 3.12 CI runs, env-vars enables compute-sanitizer with --target-processes=all, which attaches to every mp.Process child the tests spawn. On CUDA 12.9.1 the sanitizer analysis of IPC buffer teardown gets stuck, so child processes never exit. The existing join(timeout=CHILD_TIMEOUT_SEC) returns but leaves the child alive. That zombie keeps its IPC handle open. When pytest teardown runs ipc_memory_resource's mr.close(), it blocks waiting for the handle to be released -- tying up the runner for hours until GitHub Actions force-cancels the job. This is the exact pattern in issue #2004 (always Python 3.12 + CUDA 12.9.1 local). Fix: after join(timeout=...), kill any process still alive so the IPC handle is released before fixture teardown. A stderr warning is printed when kill() fires so the failure is clearly attributable to the sanitizer/IPC deadlock rather than appearing as a generic exitcode != 0. 
Tests still fail (exit code is non-zero or completed is False), just in seconds rather than hours. Fixes #2004 --- .../tests/memory_ipc/test_send_buffers.py | 32 +++++++++++++++---- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index e0a92e6ee33..9914b857b97 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import multiprocessing as mp +import sys from itertools import cycle import pytest @@ -40,10 +41,19 @@ def test_main(self, ipc_device, nmrs): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) if process.is_alive(): - # Child is stuck (CUDA context teardown can hang under certain - # driver/Python combos — see issue #2004). Kill it so the - # IPC handle is released and the fixture teardown doesn't - # block the runner for hours. + # Child did not exit within the timeout. Under compute-sanitizer + # with --target-processes=all (active for Python 3.12 + local + # CTK on Linux), IPC memory teardown inside the sanitizer can + # deadlock on CUDA 12.9.1, leaving the child alive indefinitely. + # SIGKILL forces the kernel to reclaim all IPC handles so that + # fixture teardown (mr.close()) does not block the runner for + # hours. See issue #2004. + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) process.kill() process.join() assert process.exitcode == 0 @@ -108,11 +118,19 @@ def test_main(self, ipc_device, ipc_memory_resource): proc_b.join(timeout=CHILD_TIMEOUT_SEC) proc_c.join(timeout=CHILD_TIMEOUT_SEC) - # Kill any processes that are still alive. 
Without this, a child stuck - # in CUDA context teardown (issue #2004: Python 3.12 + CUDA 12.9.1) - # holds IPC handles and blocks fixture teardown indefinitely. + # Kill any processes that are still alive. Under compute-sanitizer with + # --target-processes=all, IPC teardown deadlocks on CUDA 12.9.1 so + # children never exit. SIGKILL forces kernel IPC handle release so + # fixture teardown (mr.close()) does not block the runner for hours. + # See issue #2004. for p in (proc_b, proc_c): if p.is_alive(): + print( + f"[WARN] child process {p.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) p.kill() p.join() From 929cd6fd6c28d5b8059675fbfceb47ce550aeb30 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 21 May 2026 13:57:43 -0400 Subject: [PATCH 3/4] tests: extend kill-after-join to remaining memory_ipc tests The fix in the initial commit only covered test_send_buffers.py. The 63-minute CI hang confirmed the deadlock was occurring in one of the other test files. Applies the same kill() after join(timeout=...) pattern to all remaining memory_ipc tests: - test_errors.py (1 join) - test_event_ipc.py (2 joins) - test_ipc_duplicate_import.py (1 join) - test_leaks.py (3 joins) - test_memory_ipc.py (7 joins across 4 test classes) - test_peer_access.py (2 joins) - test_serialize.py (2 joins) See issue #2004. 
--- cuda_core/tests/memory_ipc/test_errors.py | 10 +++++ cuda_core/tests/memory_ipc/test_event_ipc.py | 18 +++++++++ .../memory_ipc/test_ipc_duplicate_import.py | 9 +++++ cuda_core/tests/memory_ipc/test_leaks.py | 28 +++++++++++++ cuda_core/tests/memory_ipc/test_memory_ipc.py | 40 +++++++++++++++++++ .../tests/memory_ipc/test_peer_access.py | 18 +++++++++ cuda_core/tests/memory_ipc/test_serialize.py | 19 +++++++++ 7 files changed, 142 insertions(+) diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index d17e63dc90a..4320bee1b68 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -4,6 +4,7 @@ import multiprocessing import pickle import re +import sys import pytest @@ -43,6 +44,15 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 finally: for mr in self._extra_mrs: diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py index e1bb45efcfb..5016c7ca4b3 100644 --- a/cuda_core/tests/memory_ipc/test_event_ipc.py +++ b/cuda_core/tests/memory_ipc/test_event_ipc.py @@ -67,6 +67,15 @@ def test_main(self, ipc_device, ipc_memory_resource): log("releasing stream1") latch.release() process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 log("done") @@ -162,6 +171,15 @@ def test_main(self, ipc_device, 
blocking_sync, use_options_cls, use_option_kw): assert props[5] is None process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 def child_main(self, q_in, q_out): diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py index f0c4951e8e6..bb99df01314 100644 --- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py +++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -84,6 +84,15 @@ def test_main(self, ipc_device, ipc_memory_resource): log("waiting for child") process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() log(f"child exit code: {process.exitcode}") assert process.exitcode == 0, f"Child process failed with exit code {process.exitcode}" log("done") diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index e2a7e8d096b..26cd202f9a2 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -5,6 +5,7 @@ import gc import multiprocessing as mp import platform +import sys try: import psutil @@ -38,6 +39,15 @@ def exec_success(obj, number=1): process = mp.Process(target=child_main, args=(obj,)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + 
process.join() assert process.exitcode == 0 @@ -54,6 +64,15 @@ def exec_launch_failure(obj, number=1): process = mp.Process(target=child_main_bad, args=(obj,)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode != 0 @@ -137,5 +156,14 @@ def prime(): process = mp.Process() process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 prime_was_run = True diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 54159d81130..77af8701dc0 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import multiprocessing as mp +import sys import pytest from helpers.buffers import PatternGen @@ -39,6 +40,15 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 # Verify that the buffer was modified. @@ -82,6 +92,16 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child processes. 
p1.join(timeout=CHILD_TIMEOUT_SEC) p2.join(timeout=CHILD_TIMEOUT_SEC) + for p in (p1, p2): + if p.is_alive(): + print( + f"[WARN] child process {p.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + p.kill() + p.join() assert p1.exitcode == 0 assert p2.exitcode == 0 @@ -135,6 +155,16 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for children. p1.join(timeout=CHILD_TIMEOUT_SEC) p2.join(timeout=CHILD_TIMEOUT_SEC) + for p in (p1, p2): + if p.is_alive(): + print( + f"[WARN] child process {p.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + p.kill() + p.join() assert p1.exitcode == 0 assert p2.exitcode == 0 @@ -185,6 +215,16 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for children. p1.join(timeout=CHILD_TIMEOUT_SEC) p2.join(timeout=CHILD_TIMEOUT_SEC) + for p in (p1, p2): + if p.is_alive(): + print( + f"[WARN] child process {p.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + p.kill() + p.join() assert p1.exitcode == 0 assert p2.exitcode == 0 diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index 0c644c25ed4..c0a92f40cc8 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -35,6 +35,15 @@ def test_main(self, ipc_mempool_device_x2): process = mp.Process(target=self.child_main, args=(mr,)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() 
assert process.exitcode == 0 # Verify parent's MR still has peer access set (independent state) @@ -81,6 +90,15 @@ def test_main(self, ipc_mempool_device_x2, grant_access_in_parent): process = mp.Process(target=self.child_main, args=(mr, buffer)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 buffer.close() diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 78d26387c8a..130dde44320 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -4,6 +4,7 @@ import multiprocessing as mp import multiprocessing.reduction import os +import sys import pytest from helpers.buffers import PatternGen @@ -46,6 +47,15 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 # Confirm buffers were modified. @@ -103,6 +113,15 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 # Confirm buffer was modified. 
From 4cbc5310ee52dda75a6ed789377dcbeed4e260af Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 21 May 2026 14:07:00 -0400 Subject: [PATCH 4/4] tests: add missing sys import to event_ipc, dup_import, peer_access The kill() calls print to sys.stderr -- these three files were missing the import and would have raised NameError at runtime under the hang path. --- cuda_core/tests/memory_ipc/test_event_ipc.py | 1 + cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py | 1 + cuda_core/tests/memory_ipc/test_peer_access.py | 1 + 3 files changed, 3 insertions(+) diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py index 5016c7ca4b3..601e6c913d2 100644 --- a/cuda_core/tests/memory_ipc/test_event_ipc.py +++ b/cuda_core/tests/memory_ipc/test_event_ipc.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import multiprocessing as mp +import sys import pytest from helpers.buffers import compare_equal_buffers, make_scratch_buffer diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py index bb99df01314..03b7d46a5d4 100644 --- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py +++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -11,6 +11,7 @@ import contextlib import multiprocessing as mp +import sys import pytest from helpers.logging import TimestampedLogger diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index c0a92f40cc8..bd0ceb287d9 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import multiprocessing as mp +import sys import pytest from helpers.buffers import PatternGen