diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index d17e63dc90a..4320bee1b68 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -4,6 +4,7 @@ import multiprocessing import pickle import re +import sys import pytest @@ -43,6 +44,15 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 finally: for mr in self._extra_mrs: diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py index e1bb45efcfb..601e6c913d2 100644 --- a/cuda_core/tests/memory_ipc/test_event_ipc.py +++ b/cuda_core/tests/memory_ipc/test_event_ipc.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import multiprocessing as mp +import sys import pytest from helpers.buffers import compare_equal_buffers, make_scratch_buffer @@ -67,6 +68,15 @@ def test_main(self, ipc_device, ipc_memory_resource): log("releasing stream1") latch.release() process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 log("done") @@ -162,6 +172,15 @@ def test_main(self, ipc_device, blocking_sync, use_options_cls, use_option_kw): assert props[5] is None process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 def child_main(self, q_in, q_out): diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py index f0c4951e8e6..03b7d46a5d4 100644 --- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py +++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -11,6 +11,7 @@ import contextlib import multiprocessing as mp +import sys import pytest from helpers.logging import TimestampedLogger @@ -84,6 +85,15 @@ def test_main(self, ipc_device, ipc_memory_resource): log("waiting for child") process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() log(f"child exit code: {process.exitcode}") assert process.exitcode == 0, f"Child process failed with exit code {process.exitcode}" log("done") diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index e2a7e8d096b..26cd202f9a2 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -5,6 +5,7 @@ import gc import multiprocessing as mp import platform +import sys try: import psutil @@ -38,6 +39,15 @@ def exec_success(obj, number=1): process = mp.Process(target=child_main, args=(obj,)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 @@ -54,6 +64,15 @@ def exec_launch_failure(obj, number=1): process = mp.Process(target=child_main_bad, args=(obj,)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode != 0 @@ -137,5 +156,14 @@ def prime(): process = mp.Process() process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 prime_was_run = True diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 54159d81130..77af8701dc0 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import multiprocessing as mp +import sys import pytest from helpers.buffers import PatternGen @@ -39,6 +40,15 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 # Verify that the buffer was modified. @@ -82,6 +92,16 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child processes. p1.join(timeout=CHILD_TIMEOUT_SEC) p2.join(timeout=CHILD_TIMEOUT_SEC) + for p in (p1, p2): + if p.is_alive(): + print( + f"[WARN] child process {p.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + p.kill() + p.join() assert p1.exitcode == 0 assert p2.exitcode == 0 @@ -135,6 +155,16 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for children. p1.join(timeout=CHILD_TIMEOUT_SEC) p2.join(timeout=CHILD_TIMEOUT_SEC) + for p in (p1, p2): + if p.is_alive(): + print( + f"[WARN] child process {p.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + p.kill() + p.join() assert p1.exitcode == 0 assert p2.exitcode == 0 @@ -185,6 +215,16 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for children. p1.join(timeout=CHILD_TIMEOUT_SEC) p2.join(timeout=CHILD_TIMEOUT_SEC) + for p in (p1, p2): + if p.is_alive(): + print( + f"[WARN] child process {p.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + p.kill() + p.join() assert p1.exitcode == 0 assert p2.exitcode == 0 diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index 0c644c25ed4..bd0ceb287d9 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import multiprocessing as mp +import sys import pytest from helpers.buffers import PatternGen @@ -35,6 +36,15 @@ def test_main(self, ipc_mempool_device_x2): process = mp.Process(target=self.child_main, args=(mr,)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 # Verify parent's MR still has peer access set (independent state) @@ -81,6 +91,15 @@ def test_main(self, ipc_mempool_device_x2, grant_access_in_parent): process = mp.Process(target=self.child_main, args=(mr, buffer)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 buffer.close() diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 041a8539da3..9914b857b97 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import multiprocessing as mp +import sys from itertools import cycle import pytest @@ -39,6 +40,22 @@ def test_main(self, ipc_device, nmrs): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + # Child did not exit within the timeout. Under compute-sanitizer + # with --target-processes=all (active for Python 3.12 + local + # CTK on Linux), IPC memory teardown inside the sanitizer can + # deadlock on CUDA 12.9.1, leaving the child alive indefinitely. + # SIGKILL forces the kernel to reclaim all IPC handles so that + # fixture teardown (mr.close()) does not block the runner for + # hours. See issue #2004. + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 # Verify that the buffers were modified. @@ -96,10 +113,28 @@ def test_main(self, ipc_device, ipc_memory_resource): proc_c.start() # Wait for C to signal completion then clean up. - event_c.wait(timeout=CHILD_TIMEOUT_SEC) + completed = event_c.wait(timeout=CHILD_TIMEOUT_SEC) event_b.set() # b can finish now proc_b.join(timeout=CHILD_TIMEOUT_SEC) proc_c.join(timeout=CHILD_TIMEOUT_SEC) + + # Kill any processes that are still alive. Under compute-sanitizer with + # --target-processes=all, IPC teardown deadlocks on CUDA 12.9.1 so + # children never exit. SIGKILL forces kernel IPC handle release so + # fixture teardown (mr.close()) does not block the runner for hours. + # See issue #2004. + for p in (proc_b, proc_c): + if p.is_alive(): + print( + f"[WARN] child process {p.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + p.kill() + p.join() + + assert completed, "process C did not complete within timeout" assert proc_b.exitcode == 0 assert proc_c.exitcode == 0 diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 78d26387c8a..130dde44320 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -4,6 +4,7 @@ import multiprocessing as mp import multiprocessing.reduction import os +import sys import pytest from helpers.buffers import PatternGen @@ -46,6 +47,15 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 # Confirm buffers were modified. @@ -103,6 +113,15 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + if process.is_alive(): + print( + f"[WARN] child process {process.pid} still alive after " + f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL " + f"(likely compute-sanitizer IPC deadlock, see issue #2004)", + file=sys.stderr, + ) + process.kill() + process.join() assert process.exitcode == 0 # Confirm buffer was modified.