From bc41afed43e16f1efb06f9fe7325b00b70cd31cc Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 23 Sep 2025 15:06:54 -0400 Subject: [PATCH 1/5] test: add failing test of free-threading loading of SO --- cuda_bindings/tests/test_cudart.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/cuda_bindings/tests/test_cudart.py b/cuda_bindings/tests/test_cudart.py index 6f8fc009e..08be9f3b4 100644 --- a/cuda_bindings/tests/test_cudart.py +++ b/cuda_bindings/tests/test_cudart.py @@ -1411,3 +1411,14 @@ def test_getLocalRuntimeVersion(): else: assertSuccess(err) assert version >= 12000 # CUDA 12.0 + + +def test_getLocalRuntimeVersion_can_be_called_multiple_times(): + try: + cudart.getLocalRuntimeVersion() + err, version = cudart.getLocalRuntimeVersion() + except pathfinder.DynamicLibNotFoundError: + pytest.skip("cudart dynamic lib not available") + else: + assertSuccess(err) + assert version >= 12000 # CUDA 12.0 From 09a594c7c7280a63feeaefa500e1b2abf4c47c5c Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 23 Sep 2025 15:33:50 -0400 Subject: [PATCH 2/5] fix(library): avoid spurious close of cached shared library --- cuda_bindings/cuda/bindings/cyruntime.pyx.in | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/cuda_bindings/cuda/bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/cyruntime.pyx.in index df85a806c..3031e43d2 100644 --- a/cuda_bindings/cuda/bindings/cyruntime.pyx.in +++ b/cuda_bindings/cuda/bindings/cyruntime.pyx.in @@ -1917,13 +1917,19 @@ cdef cudaError_t getLocalRuntimeVersion(int* runtimeVersion) except ?cudaErrorCa cdef cudaError_t err = cudaSuccess err = ( __cudaRuntimeGetVersion)(runtimeVersion) - # Unload - {{if 'Windows' == platform.system()}} - windll.FreeLibrary(handle) - {{else}} - dlfcn.dlclose(handle) - {{endif}} + # We explicitly do *NOT* cleanup the library handle here, acknowledging + # that, yes, 
the handle leaks. The reason is that there's a + # `functools.cache` on the top-level caller of this function. + # + # This means this library would be opened once and then immediately closed, + # all the while remaining in the cache lurking there for people to call. + # + # Since we open the library one time (technically once per unique library name), + # there's not a ton of leakage, which we deem acceptable for the 1000x speedup + # achieved by caching (ultimately) `ctypes.CDLL` calls. + # + # Long(er)-term we can explore cleaning up the library using higher-level + # Python mechanisms, like `__del__` or `weakref.finalize`. - # Return return err {{endif}} From 8cf16e216e7a630813d13ff0b45484df94793a1d Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 23 Sep 2025 16:57:37 -0400 Subject: [PATCH 3/5] test: deduplicate --- cuda_bindings/tests/test_cudart.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/cuda_bindings/tests/test_cudart.py b/cuda_bindings/tests/test_cudart.py index 08be9f3b4..441645a8d 100644 --- a/cuda_bindings/tests/test_cudart.py +++ b/cuda_bindings/tests/test_cudart.py @@ -1404,21 +1404,12 @@ def test_struct_pointer_comparison(target): def test_getLocalRuntimeVersion(): - try: - err, version = cudart.getLocalRuntimeVersion() - except pathfinder.DynamicLibNotFoundError: - pytest.skip("cudart dynamic lib not available") - else: - assertSuccess(err) - assert version >= 12000 # CUDA 12.0 - - -def test_getLocalRuntimeVersion_can_be_called_multiple_times(): - try: - cudart.getLocalRuntimeVersion() - err, version = cudart.getLocalRuntimeVersion() - except pathfinder.DynamicLibNotFoundError: - pytest.skip("cudart dynamic lib not available") - else: - assertSuccess(err) - assert version >= 12000 # CUDA 12.0 + # verify that successive calls do not segfault the interpreter + for _ in range(10): + try: + err, version = cudart.getLocalRuntimeVersion() + except 
pathfinder.DynamicLibNotFoundError: + pytest.skip("cudart dynamic lib not available") + else: + assertSuccess(err) + assert version >= 12000 # CUDA 12.0 From 73c43aea889b9a3a25617b63f6264a25a9b1a237 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 23 Sep 2025 17:04:30 -0400 Subject: [PATCH 4/5] fix(library): alternative implementation that cleans up using a finalizer --- .../pathfinder/_dynamic_libs/load_dl_linux.py | 9 +++++++++ .../pathfinder/_dynamic_libs/load_dl_windows.py | 9 +++++++++ .../_dynamic_libs/load_nvidia_dynamic_lib.py | 15 ++++++++++++++- 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py index a7de858b7..6761d1af8 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py @@ -38,11 +38,20 @@ def _load_libdl() -> ctypes.CDLL: LIBDL.dlerror.argtypes = [] LIBDL.dlerror.restype = ctypes.c_char_p +LIBDL.dlclose.argtypes = [ctypes.c_void_p] +LIBDL.dlclose.restype = ctypes.c_int + # First appeared in 2004-era glibc. Universally correct on Linux for all practical purposes. RTLD_DI_LINKMAP = 2 RTLD_DI_ORIGIN = 6 +def unload_dl(handle: ctypes.c_void_p) -> None: + result = LIBDL.dlclose(handle) + if result: + raise RuntimeError(LIBDL.dlerror()) + + class _LinkMapLNameView(ctypes.Structure): """ Prefix-only view of glibc's `struct link_map` used **solely** to read `l_name`. 
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py index 5da6d9b84..0898754dd 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py @@ -46,6 +46,9 @@ kernel32.AddDllDirectory.argtypes = [ctypes.wintypes.LPCWSTR] kernel32.AddDllDirectory.restype = ctypes.c_void_p # DLL_DIRECTORY_COOKIE +kernel32.FreeLibrary.argtypes = [ctypes.wintypes.HMODULE] +kernel32.FreeLibrary.restype = ctypes.c_bool + def ctypes_handle_to_unsigned_int(handle: ctypes.wintypes.HMODULE) -> int: """Convert ctypes HMODULE to unsigned int.""" @@ -157,3 +160,9 @@ def load_with_abs_path(libname: str, found_path: str) -> LoadedDL: raise RuntimeError(f"Failed to load DLL at {found_path}: Windows error {error_code}") return LoadedDL(found_path, False, ctypes_handle_to_unsigned_int(handle)) + + +def unload_dl(handle: ctypes.c_void_p) -> None: + result = kernel32.FreeLibrary(handle) + if not result: + raise RuntimeError(f"Failed to load windows DLL with error code: {ctypes.GetLastError()}") diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py index 3160333aa..a9d1402fd 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py @@ -1,9 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +import ctypes import functools import struct import sys +import weakref from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import _FindNvidiaDynamicLib from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL, load_dependencies @@ -14,12 +16,14 @@ check_if_already_loaded_from_elsewhere, load_with_abs_path, load_with_system_search, + unload_dl, ) else: from cuda.pathfinder._dynamic_libs.load_dl_linux import ( check_if_already_loaded_from_elsewhere, load_with_abs_path, load_with_system_search, + unload_dl, ) @@ -117,4 +121,13 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL: f" Currently running: {pointer_size_bits}-bit Python" f" {sys.version_info.major}.{sys.version_info.minor}" ) - return _load_lib_no_cache(libname) + + library = _load_lib_no_cache(libname) + + # Ensure that the library is unloaded after GC runs on `library` + # + # We only need the address, so the rest of whatever is in `library` is free + # to be cleaned up. The integer address is immutable, so it gets copied + # upon being referenced here + weakref.finalize(library, unload_dl, ctypes.c_void_p(library._handle_uint)) + return library From cd89f39292062e5379fd3f9c73d8654849c02441 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 24 Sep 2025 16:20:45 -0400 Subject: [PATCH 5/5] revert: fix(library): alternative implementation that cleans up using a finalizer This reverts commit 73c43aea889b9a3a25617b63f6264a25a9b1a237. 
--- .../pathfinder/_dynamic_libs/load_dl_linux.py | 9 --------- .../pathfinder/_dynamic_libs/load_dl_windows.py | 9 --------- .../_dynamic_libs/load_nvidia_dynamic_lib.py | 15 +-------------- 3 files changed, 1 insertion(+), 32 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py index 6761d1af8..a7de858b7 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py @@ -38,20 +38,11 @@ def _load_libdl() -> ctypes.CDLL: LIBDL.dlerror.argtypes = [] LIBDL.dlerror.restype = ctypes.c_char_p -LIBDL.dlclose.argtypes = [ctypes.c_void_p] -LIBDL.dlclose.restype = ctypes.c_int - # First appeared in 2004-era glibc. Universally correct on Linux for all practical purposes. RTLD_DI_LINKMAP = 2 RTLD_DI_ORIGIN = 6 -def unload_dl(handle: ctypes.c_void_p) -> None: - result = LIBDL.dlclose(handle) - if result: - raise RuntimeError(LIBDL.dlerror()) - - class _LinkMapLNameView(ctypes.Structure): """ Prefix-only view of glibc's `struct link_map` used **solely** to read `l_name`. 
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py index 0898754dd..5da6d9b84 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_windows.py @@ -46,9 +46,6 @@ kernel32.AddDllDirectory.argtypes = [ctypes.wintypes.LPCWSTR] kernel32.AddDllDirectory.restype = ctypes.c_void_p # DLL_DIRECTORY_COOKIE -kernel32.FreeLibrary.argtypes = [ctypes.wintypes.HMODULE] -kernel32.FreeLibrary.restype = ctypes.c_bool - def ctypes_handle_to_unsigned_int(handle: ctypes.wintypes.HMODULE) -> int: """Convert ctypes HMODULE to unsigned int.""" @@ -160,9 +157,3 @@ def load_with_abs_path(libname: str, found_path: str) -> LoadedDL: raise RuntimeError(f"Failed to load DLL at {found_path}: Windows error {error_code}") return LoadedDL(found_path, False, ctypes_handle_to_unsigned_int(handle)) - - -def unload_dl(handle: ctypes.c_void_p) -> None: - result = kernel32.FreeLibrary(handle) - if not result: - raise RuntimeError(f"Failed to load windows DLL with error code: {ctypes.GetLastError()}") diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py index a9d1402fd..3160333aa 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py @@ -1,11 +1,9 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -import ctypes import functools import struct import sys -import weakref from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import _FindNvidiaDynamicLib from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL, load_dependencies @@ -16,14 +14,12 @@ check_if_already_loaded_from_elsewhere, load_with_abs_path, load_with_system_search, - unload_dl, ) else: from cuda.pathfinder._dynamic_libs.load_dl_linux import ( check_if_already_loaded_from_elsewhere, load_with_abs_path, load_with_system_search, - unload_dl, ) @@ -121,13 +117,4 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL: f" Currently running: {pointer_size_bits}-bit Python" f" {sys.version_info.major}.{sys.version_info.minor}" ) - - library = _load_lib_no_cache(libname) - - # Ensure that the library is unloaded after GC runs on `library` - # - # We only need the address, so the rest of whatever is in `library` is free - # to be cleaned up. The integer address is immutable, so it gets copied - # upon being referenced here - weakref.finalize(library, unload_dl, ctypes.c_void_p(library._handle_uint)) - return library + return _load_lib_no_cache(libname)