From 293258d9d60620c091ece04a7a89c43ba52d24e6 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 28 May 2026 10:33:38 -0700 Subject: [PATCH 1/2] test: fail checkpoint coverage on missing bindings Ensure checkpoint tests distinguish missing required cuda.bindings symbols from genuinely unsupported environments. --- cuda_bindings/tests/test_cuda.py | 20 +++++++++++++ cuda_core/tests/test_checkpoint.py | 46 ++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py index 32b05f638fe..c346329c31c 100644 --- a/cuda_bindings/tests/test_cuda.py +++ b/cuda_bindings/tests/test_cuda.py @@ -861,6 +861,26 @@ def test_cuCheckpointProcessGetState_failure(): assert state is None +@pytest.mark.skipif(not supportsCudaAPI("cuCheckpointProcessGetState"), reason="When API was introduced") +def test_cuCheckpoint_required_bindings_present(): + required_bindings = ( + "cuCheckpointProcessCheckpoint", + "cuCheckpointProcessGetRestoreThreadId", + "cuCheckpointProcessGetState", + "cuCheckpointProcessLock", + "cuCheckpointProcessRestore", + "cuCheckpointProcessUnlock", + "CUcheckpointGpuPair", + "CUcheckpointLockArgs", + "CUprocessState", + "CUcheckpointRestoreArgs", + ) + + missing = [name for name in required_bindings if not hasattr(cuda, name)] + + assert missing == [] + + def test_private_function_pointer_inspector(): from cuda.bindings._bindings.cydriver import _inspect_function_pointer diff --git a/cuda_core/tests/test_checkpoint.py b/cuda_core/tests/test_checkpoint.py index fd683aedcde..d1ade5fd2bd 100644 --- a/cuda_core/tests/test_checkpoint.py +++ b/cuda_core/tests/test_checkpoint.py @@ -33,8 +33,19 @@ def _checkpoint_available(): try: checkpoint._get_driver() return True - except RuntimeError: - return False + except RuntimeError as exc: + if _checkpoint_unavailable_can_skip(str(exc)): + return False + raise + + +def _checkpoint_unavailable_can_skip(message): + return message.startswith( + ( + "CUDA checkpointing is not supported by the installed NVIDIA driver.", + "CUDA checkpointing requires cuda.bindings with CUDA checkpoint API support. Found cuda.bindings ", + ) + ) needs_checkpoint = pytest.mark.skipif( @@ -384,6 +395,37 @@ def test_public_symbols(self): assert checkpoint.__all__ == ["Process"] assert not hasattr(checkpoint, "ProcessStateType") + def test_checkpoint_available_skips_unsupported_driver(self, monkeypatch): + def raise_unsupported_driver(): + raise RuntimeError("CUDA checkpointing is not supported by the installed NVIDIA driver.") + + monkeypatch.setattr(checkpoint, "_get_driver", raise_unsupported_driver) + + assert not _checkpoint_available() + + def test_checkpoint_available_skips_old_bindings(self, monkeypatch): + def raise_old_bindings(): + raise RuntimeError( + "CUDA checkpointing requires cuda.bindings with CUDA checkpoint API support. " + "Found cuda.bindings 12.7.0." + ) + + monkeypatch.setattr(checkpoint, "_get_driver", raise_old_bindings) + + assert not _checkpoint_available() + + def test_checkpoint_available_fails_missing_required_bindings(self, monkeypatch): + def raise_missing_binding(): + raise RuntimeError( + "CUDA checkpointing requires cuda.bindings with CUDA checkpoint API support. " + "Missing: CUcheckpointRestoreArgs" + ) + + monkeypatch.setattr(checkpoint, "_get_driver", raise_missing_binding) + + with pytest.raises(RuntimeError, match="Missing: CUcheckpointRestoreArgs"): + _checkpoint_available() + def test_pid_is_read_only(self): proc = checkpoint.Process(1) assert proc.pid == 1 From cd730c11b9162423df5e2d8748242fd6075ef60d Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 28 May 2026 12:13:10 -0700 Subject: [PATCH 2/2] test: split checkpoint GPU mapping coverage Keep baseline CUDA checkpoint coverage active for CUDA versions whose headers do not expose GPU remapping structs, while still failing when required base checkpoint bindings such as CUcheckpointRestoreArgs are missing. Gate only the GPU migration path on CUcheckpointGpuPair so CUDA 12.9 can exercise state, lock, checkpoint, restore-without-mapping, and unlock. --- cuda_bindings/tests/test_cuda.py | 3 ++- cuda_core/cuda/core/checkpoint.py | 18 ++++++++++++---- cuda_core/tests/test_checkpoint.py | 33 +++++++++++++++++++++++++++++- 3 files changed, 48 insertions(+), 6 deletions(-) diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py index c346329c31c..b5d09b7f448 100644 --- a/cuda_bindings/tests/test_cuda.py +++ b/cuda_bindings/tests/test_cuda.py @@ -870,11 +870,12 @@ def test_cuCheckpoint_required_bindings_present(): "cuCheckpointProcessLock", "cuCheckpointProcessRestore", "cuCheckpointProcessUnlock", - "CUcheckpointGpuPair", "CUcheckpointLockArgs", "CUprocessState", "CUcheckpointRestoreArgs", ) + if cuda.CUDA_VERSION >= 13000: + required_bindings += ("CUcheckpointGpuPair",) missing = [name for name in required_bindings if not hasattr(cuda, name)] diff --git a/cuda_core/cuda/core/checkpoint.py b/cuda_core/cuda/core/checkpoint.py index 7f811013d19..034afd24662 100644 --- a/cuda_core/cuda/core/checkpoint.py +++ b/cuda_core/cuda/core/checkpoint.py @@ -31,11 +31,11 @@ "cuCheckpointProcessLock", "cuCheckpointProcessRestore", "cuCheckpointProcessUnlock", - "CUcheckpointGpuPair", "CUcheckpointLockArgs", "CUprocessState", "CUcheckpointRestoreArgs", ) +_GPU_MAPPING_BINDING_ATTRS = ("CUcheckpointGpuPair",) _REQUIRED_DRIVER_VERSION = (12, 8, 0) _driver_capability_checked = False @@ -215,7 +215,11 @@ def _make_restore_args(driver, gpu_mapping: _Mapping[_Any, _Any] | None): if not isinstance(gpu_mapping, _Mapping): raise TypeError("gpu_mapping must be a mapping from checkpointed GPU UUID to restore GPU UUID") + if not gpu_mapping: + return None + pairs = [] + _require_gpu_mapping_bindings(driver) for old_uuid, new_uuid in gpu_mapping.items(): pair = driver.CUcheckpointGpuPair() buffers = [] @@ -223,15 +227,21 @@ def _make_restore_args(driver, gpu_mapping: _Mapping[_Any, _Any] | None): pair.newUuid = _as_cuuuid(driver, new_uuid, buffers) pairs.append(pair) - if not pairs: - return None - args = driver.CUcheckpointRestoreArgs() args.gpuPairs = pairs args.gpuPairsCount = len(pairs) return args +def _require_gpu_mapping_bindings(driver) -> None: + missing = [name for name in _GPU_MAPPING_BINDING_ATTRS if not hasattr(driver, name)] + if missing: + raise RuntimeError( + "CUDA checkpoint GPU remapping requires cuda.bindings with GPU remapping support. " + f"Missing: {', '.join(missing)}" + ) + + def _as_cuuuid(driver, value, buffers): """Convert *value* to a ``CUuuid``. diff --git a/cuda_core/tests/test_checkpoint.py b/cuda_core/tests/test_checkpoint.py index d1ade5fd2bd..098da2950dc 100644 --- a/cuda_core/tests/test_checkpoint.py +++ b/cuda_core/tests/test_checkpoint.py @@ -39,6 +39,19 @@ def _checkpoint_available(): raise +def _checkpoint_gpu_mapping_available(): + """Return True if checkpoint restore GPU remapping is usable on this system.""" + if not _checkpoint_available(): + return False + try: + checkpoint._require_gpu_mapping_bindings(checkpoint._get_driver()) + return True + except RuntimeError as exc: + if _checkpoint_gpu_mapping_unavailable_can_skip(str(exc)): + return False + raise + + def _checkpoint_unavailable_can_skip(message): return message.startswith( ( @@ -48,10 +61,20 @@ def _checkpoint_unavailable_can_skip(message): ) +def _checkpoint_gpu_mapping_unavailable_can_skip(message): + return message.startswith( + "CUDA checkpoint GPU remapping requires cuda.bindings with GPU remapping support. Missing: CUcheckpointGpuPair" + ) + + needs_checkpoint = pytest.mark.skipif( sys.platform != "linux" or not _checkpoint_available(), reason="CUDA checkpoint API requires Linux and a supported driver/bindings", ) +needs_checkpoint_gpu_mapping = pytest.mark.skipif( + sys.platform != "linux" or not _checkpoint_gpu_mapping_available(), + reason="CUDA checkpoint GPU remapping requires Linux and supported driver/bindings", +) # -- Helpers --------------------------------------------------------------- @@ -426,6 +449,14 @@ def raise_missing_binding(): with pytest.raises(RuntimeError, match="Missing: CUcheckpointRestoreArgs"): _checkpoint_available() + def test_checkpoint_gpu_mapping_available_skips_missing_gpu_pair(self, monkeypatch): + class Driver: + pass + + monkeypatch.setattr(checkpoint, "_get_driver", lambda: Driver) + + assert not _checkpoint_gpu_mapping_available() + def test_pid_is_read_only(self): proc = checkpoint.Process(1) assert proc.pid == 1 @@ -462,7 +493,7 @@ def test_full_cycle_no_migration(self): # -- GPU migration (>= 2 same-chip GPUs, real driver) --------------------- -@needs_checkpoint +@needs_checkpoint_gpu_mapping class TestCheckpointGpuMigration: """GPU UUID remapping tests following the r580-migration-api.c pattern.