Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cuda_core/tests/memory_ipc/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import multiprocessing
import pickle
import re
import sys

import pytest

Expand Down Expand Up @@ -43,6 +44,15 @@ def test_main(self, ipc_device, ipc_memory_resource):

# Wait for the child process.
process.join(timeout=CHILD_TIMEOUT_SEC)
if process.is_alive():
print(
f"[WARN] child process {process.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
process.kill()
process.join()
assert process.exitcode == 0
finally:
for mr in self._extra_mrs:
Expand Down
19 changes: 19 additions & 0 deletions cuda_core/tests/memory_ipc/test_event_ipc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import multiprocessing as mp
import sys

import pytest
from helpers.buffers import compare_equal_buffers, make_scratch_buffer
Expand Down Expand Up @@ -67,6 +68,15 @@ def test_main(self, ipc_device, ipc_memory_resource):
log("releasing stream1")
latch.release()
process.join(timeout=CHILD_TIMEOUT_SEC)
if process.is_alive():
print(
f"[WARN] child process {process.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
process.kill()
process.join()
assert process.exitcode == 0
log("done")

Expand Down Expand Up @@ -162,6 +172,15 @@ def test_main(self, ipc_device, blocking_sync, use_options_cls, use_option_kw):
assert props[5] is None

process.join(timeout=CHILD_TIMEOUT_SEC)
if process.is_alive():
print(
f"[WARN] child process {process.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
process.kill()
process.join()
assert process.exitcode == 0

def child_main(self, q_in, q_out):
Expand Down
10 changes: 10 additions & 0 deletions cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import contextlib
import multiprocessing as mp
import sys

import pytest
from helpers.logging import TimestampedLogger
Expand Down Expand Up @@ -84,6 +85,15 @@ def test_main(self, ipc_device, ipc_memory_resource):

log("waiting for child")
process.join(timeout=CHILD_TIMEOUT_SEC)
if process.is_alive():
print(
f"[WARN] child process {process.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
process.kill()
process.join()
log(f"child exit code: {process.exitcode}")
assert process.exitcode == 0, f"Child process failed with exit code {process.exitcode}"
log("done")
28 changes: 28 additions & 0 deletions cuda_core/tests/memory_ipc/test_leaks.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import gc
import multiprocessing as mp
import platform
import sys

try:
import psutil
Expand Down Expand Up @@ -38,6 +39,15 @@ def exec_success(obj, number=1):
process = mp.Process(target=child_main, args=(obj,))
process.start()
process.join(timeout=CHILD_TIMEOUT_SEC)
if process.is_alive():
print(
f"[WARN] child process {process.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
process.kill()
process.join()
assert process.exitcode == 0


Expand All @@ -54,6 +64,15 @@ def exec_launch_failure(obj, number=1):
process = mp.Process(target=child_main_bad, args=(obj,))
process.start()
process.join(timeout=CHILD_TIMEOUT_SEC)
if process.is_alive():
print(
f"[WARN] child process {process.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
process.kill()
process.join()
assert process.exitcode != 0


Expand Down Expand Up @@ -137,5 +156,14 @@ def prime():
process = mp.Process()
process.start()
process.join(timeout=CHILD_TIMEOUT_SEC)
if process.is_alive():
print(
f"[WARN] child process {process.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
process.kill()
process.join()
assert process.exitcode == 0
prime_was_run = True
40 changes: 40 additions & 0 deletions cuda_core/tests/memory_ipc/test_memory_ipc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import multiprocessing as mp
import sys

import pytest
from helpers.buffers import PatternGen
Expand Down Expand Up @@ -39,6 +40,15 @@ def test_main(self, ipc_device, ipc_memory_resource):

# Wait for the child process.
process.join(timeout=CHILD_TIMEOUT_SEC)
if process.is_alive():
print(
f"[WARN] child process {process.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
process.kill()
process.join()
assert process.exitcode == 0

# Verify that the buffer was modified.
Expand Down Expand Up @@ -82,6 +92,16 @@ def test_main(self, ipc_device, ipc_memory_resource):
# Wait for the child processes.
p1.join(timeout=CHILD_TIMEOUT_SEC)
p2.join(timeout=CHILD_TIMEOUT_SEC)
for p in (p1, p2):
if p.is_alive():
print(
f"[WARN] child process {p.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
p.kill()
p.join()
assert p1.exitcode == 0
assert p2.exitcode == 0

Expand Down Expand Up @@ -135,6 +155,16 @@ def test_main(self, ipc_device, ipc_memory_resource):
# Wait for children.
p1.join(timeout=CHILD_TIMEOUT_SEC)
p2.join(timeout=CHILD_TIMEOUT_SEC)
for p in (p1, p2):
if p.is_alive():
print(
f"[WARN] child process {p.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
p.kill()
p.join()
assert p1.exitcode == 0
assert p2.exitcode == 0

Expand Down Expand Up @@ -185,6 +215,16 @@ def test_main(self, ipc_device, ipc_memory_resource):
# Wait for children.
p1.join(timeout=CHILD_TIMEOUT_SEC)
p2.join(timeout=CHILD_TIMEOUT_SEC)
for p in (p1, p2):
if p.is_alive():
print(
f"[WARN] child process {p.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
p.kill()
p.join()
assert p1.exitcode == 0
assert p2.exitcode == 0

Expand Down
19 changes: 19 additions & 0 deletions cuda_core/tests/memory_ipc/test_peer_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import multiprocessing as mp
import sys

import pytest
from helpers.buffers import PatternGen
Expand Down Expand Up @@ -35,6 +36,15 @@ def test_main(self, ipc_mempool_device_x2):
process = mp.Process(target=self.child_main, args=(mr,))
process.start()
process.join(timeout=CHILD_TIMEOUT_SEC)
if process.is_alive():
print(
f"[WARN] child process {process.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
process.kill()
process.join()
assert process.exitcode == 0

# Verify parent's MR still has peer access set (independent state)
Expand Down Expand Up @@ -81,6 +91,15 @@ def test_main(self, ipc_mempool_device_x2, grant_access_in_parent):
process = mp.Process(target=self.child_main, args=(mr, buffer))
process.start()
process.join(timeout=CHILD_TIMEOUT_SEC)
if process.is_alive():
print(
f"[WARN] child process {process.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
process.kill()
process.join()
assert process.exitcode == 0

buffer.close()
Expand Down
37 changes: 36 additions & 1 deletion cuda_core/tests/memory_ipc/test_send_buffers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import multiprocessing as mp
import sys
from itertools import cycle

import pytest
Expand Down Expand Up @@ -39,6 +40,22 @@ def test_main(self, ipc_device, nmrs):

# Wait for the child process.
process.join(timeout=CHILD_TIMEOUT_SEC)
if process.is_alive():
# Child did not exit within the timeout. Under compute-sanitizer
# with --target-processes=all (active for Python 3.12 + local
# CTK on Linux), IPC memory teardown inside the sanitizer can
# deadlock on CUDA 12.9.1, leaving the child alive indefinitely.
# SIGKILL forces the kernel to reclaim all IPC handles so that
# fixture teardown (mr.close()) does not block the runner for
# hours. See issue #2004.
print(
f"[WARN] child process {process.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
process.kill()
process.join()
assert process.exitcode == 0

# Verify that the buffers were modified.
Expand Down Expand Up @@ -96,10 +113,28 @@ def test_main(self, ipc_device, ipc_memory_resource):
proc_c.start()

# Wait for C to signal completion then clean up.
event_c.wait(timeout=CHILD_TIMEOUT_SEC)
completed = event_c.wait(timeout=CHILD_TIMEOUT_SEC)
event_b.set() # b can finish now
proc_b.join(timeout=CHILD_TIMEOUT_SEC)
proc_c.join(timeout=CHILD_TIMEOUT_SEC)

# Kill any child that is still alive after the join timeouts above. Under
# compute-sanitizer with --target-processes=all, IPC teardown can deadlock
# on CUDA 12.9.1, in which case the children never exit. SIGKILL forces
# the kernel to release their IPC handles so that fixture teardown
# (mr.close()) does not block the runner for hours. See issue #2004.
for p in (proc_b, proc_c):
if p.is_alive():
print(
f"[WARN] child process {p.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
p.kill()
p.join()

assert completed, "process C did not complete within timeout"
assert proc_b.exitcode == 0
assert proc_c.exitcode == 0

Expand Down
19 changes: 19 additions & 0 deletions cuda_core/tests/memory_ipc/test_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import multiprocessing as mp
import multiprocessing.reduction
import os
import sys

import pytest
from helpers.buffers import PatternGen
Expand Down Expand Up @@ -46,6 +47,15 @@ def test_main(self, ipc_device, ipc_memory_resource):

# Wait for the child process.
process.join(timeout=CHILD_TIMEOUT_SEC)
if process.is_alive():
print(
f"[WARN] child process {process.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
process.kill()
process.join()
assert process.exitcode == 0

# Confirm buffers were modified.
Expand Down Expand Up @@ -103,6 +113,15 @@ def test_main(self, ipc_device, ipc_memory_resource):

# Wait for the child process.
process.join(timeout=CHILD_TIMEOUT_SEC)
if process.is_alive():
print(
f"[WARN] child process {process.pid} still alive after "
f"{CHILD_TIMEOUT_SEC}s — sending SIGKILL "
f"(likely compute-sanitizer IPC deadlock, see issue #2004)",
file=sys.stderr,
)
process.kill()
process.join()
assert process.exitcode == 0

# Confirm buffer was modified.
Expand Down