From a6124ef2f7c993d7df762890c911c7b902e9fb47 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 20 Nov 2025 10:58:00 -0800 Subject: [PATCH 01/22] First step adding cuda_core/cuda/core/experimental/_utils/hags_status.c --- .gitattributes | 2 + cuda_core/build_hooks.py | 18 +++- cuda_core/cuda/core/experimental/_event.pyx | 18 ++++ .../core/experimental/_utils/hags_status.c | 84 +++++++++++++++++++ .../core/experimental/_utils/hags_status.h | 23 +++++ cuda_core/tests/test_event.py | 7 ++ 6 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_utils/hags_status.c create mode 100644 cuda_core/cuda/core/experimental/_utils/hags_status.h diff --git a/.gitattributes b/.gitattributes index cf17ba9d5e..3bff10eeff 100644 --- a/.gitattributes +++ b/.gitattributes @@ -9,3 +9,5 @@ cuda/_version.py export-subst *.png binary # SCM syntax highlighting & preventing 3-way merges pixi.lock merge=binary linguist-language=YAML linguist-generated=true + +cuda_core/cuda/core/experimental/_utils/hags_status.h text eol=lf diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index e38f5676df..d42463bcb9 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -84,11 +84,25 @@ def get_cuda_paths(): print("CUDA paths:", CUDA_PATH) return CUDA_PATH + common_include_dirs = [ + # CUDA include paths (for driver/runtime headers) + *(os.path.join(root, "include") for root in get_cuda_paths()), + # Local experimental utils headers (for hags_status.h, etc.) 
+ os.path.join("cuda", "core", "experimental", "_utils"), + ] + + def get_sources(mod): + sources = [f"cuda/core/experimental/{mod}.pyx"] + # Add hags_status.c for _event module + if mod == "_event": + sources.append("cuda/core/experimental/_utils/hags_status.c") + return sources + ext_modules = tuple( Extension( f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", - sources=[f"cuda/core/experimental/{mod}.pyx"], - include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()), + sources=get_sources(mod), + include_dirs=common_include_dirs, language="c++", ) for mod in module_names diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 98a45d0043..3cd94e7ea9 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -13,6 +13,9 @@ from cuda.core.experimental._utils.cuda_utils cimport ( HANDLE_RETURN ) +cdef extern from "hags_status.h": + int _hags_status_impl "hags_status"() + import cython from dataclasses import dataclass import multiprocessing @@ -303,3 +306,18 @@ def _reduce_event(event): return event.from_ipc_descriptor, (event.get_ipc_descriptor(),) multiprocessing.reduction.register(Event, _reduce_event) + +cpdef int hags_status(): + """Check Hardware Accelerated GPU Scheduling (HAGS) status on Windows. 
+ + Returns + ------- + int + Status code indicating HAGS state: + + - -1: Not available on this platform (not compiled with MSVC on Windows) + - 0: Failure obtaining HwSchSupported/HwSchEnabled + - 1: HwSchSupported == 0 or HwSchEnabled == 0 (HAGS not fully enabled) + - 2: HwSchSupported == 1 and HwSchEnabled == 1 (HAGS fully enabled) + """ + return _hags_status_impl() diff --git a/cuda_core/cuda/core/experimental/_utils/hags_status.c b/cuda_core/cuda/core/experimental/_utils/hags_status.c new file mode 100644 index 0000000000..e034dcf855 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_utils/hags_status.c @@ -0,0 +1,84 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Note, this may or may not exist, but is NOT the ground truth: +// reg query "HKLM\SYSTEM\CurrentControlSet\Control\GraphicsDrivers" /v HwSchMode +// The HwSchMode registry value is only a user override (force on/off). +// If absent, Windows uses the driver's WDDM caps defaults. +// Actual HAGS state comes from D3DKMT_WDDM_2_7_CAPS, not the registry. 
+ +// Possibly useful for experimentation: +// reg delete "HKLM\SYSTEM\CurrentControlSet\Control\GraphicsDrivers" /v HwSchMode /f + +#ifdef _MSC_VER +#include +#include +#include +#endif + +int hags_status(void) +{ +#ifdef _MSC_VER + DISPLAY_DEVICEW dd; + HDC hdc; + int i; + BOOL foundPrimary = FALSE; + NTSTATUS status; + + D3DKMT_OPENADAPTERFROMHDC openData; + D3DKMT_QUERYADAPTERINFO query; + D3DKMT_WDDM_2_7_CAPS caps; + D3DKMT_CLOSEADAPTER closeData; + + // Find the primary display device + ZeroMemory(&dd, sizeof(dd)); + dd.cb = sizeof(dd); + + for (i = 0; EnumDisplayDevicesW(NULL, i, &dd, 0); ++i) { + if (dd.StateFlags & DISPLAY_DEVICE_PRIMARY_DEVICE) { + foundPrimary = TRUE; + break; + } + } + + if (!foundPrimary) + return 0; + + hdc = CreateDCW(NULL, dd.DeviceName, NULL, NULL); + if (!hdc) + return 0; + + ZeroMemory(&openData, sizeof(openData)); + openData.hDc = hdc; + status = D3DKMTOpenAdapterFromHdc(&openData); + + DeleteDC(hdc); + + if (status != 0) + return 0; + + ZeroMemory(&caps, sizeof(caps)); + ZeroMemory(&query, sizeof(query)); + + query.hAdapter = openData.hAdapter; + query.Type = KMTQAITYPE_WDDM_2_7_CAPS; + query.pPrivateDriverData = &caps; + query.PrivateDriverDataSize = sizeof(caps); + + status = D3DKMTQueryAdapterInfo(&query); + + ZeroMemory(&closeData, sizeof(closeData)); + closeData.hAdapter = openData.hAdapter; + D3DKMTCloseAdapter(&closeData); + + if (status != 0) + return 0; + + if (!caps.HwSchSupported || !caps.HwSchEnabled) + return 1; + + return 2; +#else + return -1; +#endif +} diff --git a/cuda_core/cuda/core/experimental/_utils/hags_status.h b/cuda_core/cuda/core/experimental/_utils/hags_status.h new file mode 100644 index 0000000000..e2194ea769 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_utils/hags_status.h @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hags_status + * + * Return codes: + * -1 : Not available on this platform (not compiled with MSVC on Windows) + * 0 : Failure obtaining HwSchSupported/HwSchEnabled + * 1 : HwSchSupported == 0 or HwSchEnabled == 0 (HAGS not fully enabled) + * 2 : HwSchSupported == 1 and HwSchEnabled == 1 (HAGS fully enabled) + */ +int hags_status(void); + +#ifdef __cplusplus +} /* extern "C" */ +#endif diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 992a78e92e..de0c74a6e3 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -11,17 +11,24 @@ Event, EventOptions, ) +from cuda.core.experimental._event import hags_status from helpers.latch import LatchKernel from cuda_python_test_helpers import IS_WSL +def inspect_hags_status(): + stat = hags_status() + print(f"\nLOOOK {stat=!r}", flush=True) + + def test_event_init_disabled(): with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."): cuda.core.experimental._event.Event() # Ensure back door is locked. def test_timing_success(init_cuda): + inspect_hags_status() options = EventOptions(enable_timing=True) stream = Device().create_stream() delay_seconds = 0.5 From 3dbb0fcea6835512178283aa3b941d8912bfd88f Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Thu, 20 Nov 2025 12:01:55 -0800 Subject: [PATCH 02/22] cuda_core/build_hooks.py: Extension get_libraries() --- cuda_core/build_hooks.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index d42463bcb9..6e856013bd 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -98,12 +98,18 @@ def get_sources(mod): sources.append("cuda/core/experimental/_utils/hags_status.c") return sources + def get_libraries(mod): + if os.name == "nt" and mod == "_event": + return ["user32", "gdi32"] # for hags_status.c + return None + ext_modules = tuple( Extension( f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", sources=get_sources(mod), include_dirs=common_include_dirs, language="c++", + libraries=get_libraries(mod), ) for mod in module_names ) From 3b19ff6cbe37ab05f1a73eeef32d26fd4207f299 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 20 Nov 2025 12:40:25 -0800 Subject: [PATCH 03/22] pre-commit: exclude soft-linked cuda_python/README.md from end-of-file-fixer On Linux (including WSL) the file cuda_python/README.md is a real symlink, whereas on Windows Git it is checked out as a plain file containing "../README.md" (without a trailing LF). When pre-commit runs under WSL, the end-of-file-fixer hook rewrites this file, and Git Bash can no longer handle the symlink-emulation file correctly, resulting in errors on subsequent git operations. 
--- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 20ce44c44a..3609d58c21 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -53,7 +53,7 @@ repos: - id: check-yaml - id: debug-statements - id: end-of-file-fixer - exclude: &gen_exclude '^(?:cuda_bindings/cuda/bindings/.*\.in?|cuda_bindings/docs/source/module/.*\.rst?)$' + exclude: &gen_exclude '^(?:cuda_python/README\.md|cuda_bindings/cuda/bindings/.*\.in?|cuda_bindings/docs/source/module/.*\.rst?)$' - id: mixed-line-ending - id: trailing-whitespace exclude: *gen_exclude From 2b08492bd23829c78d553bf2c04b057a1d1bac18 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 20 Nov 2025 14:18:50 -0800 Subject: [PATCH 04/22] Add drvmodel.c prototype (with main) --- .../cuda/core/experimental/_utils/drvmodel.c | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 cuda_core/cuda/core/experimental/_utils/drvmodel.c diff --git a/cuda_core/cuda/core/experimental/_utils/drvmodel.c b/cuda_core/cuda/core/experimental/_utils/drvmodel.c new file mode 100644 index 0000000000..7825157c2d --- /dev/null +++ b/cuda_core/cuda/core/experimental/_utils/drvmodel.c @@ -0,0 +1,99 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +// drvmodel.c +// Query NVML for the Windows driver model (WDDM / WDM(TCC) / MCDM) of each GPU. 
+// +// Build example (MSVC, adjust paths as needed): +// cl /nologo /W3 drvmodel.c /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\include" /link /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\lib\x64" nvml.lib +// +// On success, prints something like: +// GPU 0: NVIDIA RTX A6000 +// Current driver model: WDDM +// Pending driver model: WDDM + +#include +#include + +#include "nvml.h" // from NVIDIA NVML package / CUDA toolkit + +static const char *driverModelToString(nvmlDriverModel_t m) +{ + switch (m) { + case NVML_DRIVER_WDDM: + return "WDDM (display device)"; + case NVML_DRIVER_WDM: + return "WDM (TCC, compute device)"; +#ifdef NVML_DRIVER_MCDM + case NVML_DRIVER_MCDM: + return "MCDM (Microsoft compute device)"; +#endif + default: + return "Unknown"; + } +} + +int main(void) +{ + nvmlReturn_t result; + unsigned int deviceCount = 0; + unsigned int i; + + result = nvmlInit_v2(); + if (result != NVML_SUCCESS) { + fprintf(stderr, "nvmlInit_v2() failed: %s\n", nvmlErrorString(result)); + return EXIT_FAILURE; + } + + result = nvmlDeviceGetCount_v2(&deviceCount); + if (result != NVML_SUCCESS) { + fprintf(stderr, "nvmlDeviceGetCount_v2() failed: %s\n", nvmlErrorString(result)); + nvmlShutdown(); + return EXIT_FAILURE; + } + + if (deviceCount == 0) { + printf("No NVIDIA GPUs found.\n"); + nvmlShutdown(); + return EXIT_SUCCESS; + } + + for (i = 0; i < deviceCount; ++i) { + nvmlDevice_t device; + char name[NVML_DEVICE_NAME_BUFFER_SIZE] = {0}; + nvmlDriverModel_t currentModel = 0; + nvmlDriverModel_t pendingModel = 0; + + result = nvmlDeviceGetHandleByIndex_v2(i, &device); + if (result != NVML_SUCCESS) { + fprintf(stderr, + "nvmlDeviceGetHandleByIndex_v2(%u) failed: %s\n", + i, nvmlErrorString(result)); + continue; + } + + result = nvmlDeviceGetName(device, name, sizeof(name)); + if (result != NVML_SUCCESS) { + snprintf(name, sizeof(name), ""); + } + + result = nvmlDeviceGetDriverModel(device, &currentModel, &pendingModel); + if (result ==
NVML_ERROR_NOT_SUPPORTED) { + printf("GPU %u: %s\n", i, name); + printf(" Driver model query not supported (non-Windows or unsupported device).\n"); + continue; + } else if (result != NVML_SUCCESS) { + fprintf(stderr, + "nvmlDeviceGetDriverModel(%u) failed: %s\n", + i, nvmlErrorString(result)); + continue; + } + + printf("GPU %u: %s\n", i, name); + printf(" Current driver model: %s\n", driverModelToString(currentModel)); + printf(" Pending driver model: %s\n", driverModelToString(pendingModel)); + } + + nvmlShutdown(); + return EXIT_SUCCESS; +} From 235c1222b633efaa1872f1e88d1003eb5f3f8a70 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 21 Nov 2025 11:13:35 -0800 Subject: [PATCH 05/22] Add wddm_driver_model_is_in_use.c --- cuda_core/build_hooks.py | 10 +- .../cuda/core/experimental/_utils/drvmodel.c | 99 ------------------- .../_utils/wddm_driver_model_is_in_use.c | 48 +++++++++ .../_utils/wddm_driver_model_is_in_use.h | 14 +++ 4 files changed, 68 insertions(+), 103 deletions(-) delete mode 100644 cuda_core/cuda/core/experimental/_utils/drvmodel.c create mode 100644 cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c create mode 100644 cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 6e856013bd..67428e7579 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -85,17 +85,19 @@ def get_cuda_paths(): return CUDA_PATH common_include_dirs = [ - # CUDA include paths (for driver/runtime headers) *(os.path.join(root, "include") for root in get_cuda_paths()), - # Local experimental utils headers (for hags_status.h, etc.) 
os.path.join("cuda", "core", "experimental", "_utils"), ] def get_sources(mod): sources = [f"cuda/core/experimental/{mod}.pyx"] - # Add hags_status.c for _event module if mod == "_event": - sources.append("cuda/core/experimental/_utils/hags_status.c") + sources.extend( + [ + "cuda/core/experimental/_utils/hags_status.c", + "cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c", + ] + ) return sources def get_libraries(mod): diff --git a/cuda_core/cuda/core/experimental/_utils/drvmodel.c b/cuda_core/cuda/core/experimental/_utils/drvmodel.c deleted file mode 100644 index 7825157c2d..0000000000 --- a/cuda_core/cuda/core/experimental/_utils/drvmodel.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -// drvmodel.c -// Query NVML for the Windows driver model (WDDM / WDM(TCC) / MCDM) of each GPU. -// -// Build example (MSVC, adjust paths as needed): -// cl /nologo /W3 drvmodel.c /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\include" /link /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\lib\x64" nvml.lib -// -// On success, prints something like: -// GPU 0: NVIDIA RTX A6000 -// Current driver model: WDDM -// Pending driver model: WDDM - -#include -#include - -#include "nvml.h" // from NVIDIA NVML package / CUDA toolkit - -static const char *driverModelToString(nvmlDriverModel_t m) -{ - switch (m) { - case NVML_DRIVER_WDDM: - return "WDDM (display device)"; - case NVML_DRIVER_WDM: - return "WDM (TCC, compute device)"; -#ifdef NVML_DRIVER_MCDM - case NVML_DRIVER_MCDM: - return "MCDM (Microsoft compute device)"; -#endif - default: - return "Unknown"; - } -} - -int main(void) -{ - nvmlReturn_t result; - unsigned int deviceCount = 0; - unsigned int i; - - result = nvmlInit_v2(); - if (result != NVML_SUCCESS) { - fprintf(stderr, "nvmlInit_v2() failed: %s\n", nvmlErrorString(result)); - return EXIT_FAILURE; - } - - 
result = nvmlDeviceGetCount_v2(&deviceCount); - if (result != NVML_SUCCESS) { - fprintf(stderr, "nvmlDeviceGetCount_v2() failed: %s\n", nvmlErrorString(result)); - nvmlShutdown(); - return EXIT_FAILURE; - } - - if (deviceCount == 0) { - printf("No NVIDIA GPUs found.\n"); - nvmlShutdown(); - return EXIT_SUCCESS; - } - - for (i = 0; i < deviceCount; ++i) { - nvmlDevice_t device; - char name[NVML_DEVICE_NAME_BUFFER_SIZE] = {0}; - nvmlDriverModel_t currentModel = 0; - nvmlDriverModel_t pendingModel = 0; - - result = nvmlDeviceGetHandleByIndex_v2(i, &device); - if (result != NVML_SUCCESS) { - fprintf(stderr, - "nvmlDeviceGetHandleByIndex_v2(%u) failed: %s\n", - i, nvmlErrorString(result)); - continue; - } - - result = nvmlDeviceGetName(device, name, sizeof(name)); - if (result != NVML_SUCCESS) { - snprintf(name, sizeof(name), ""); - } - - result = nvmlDeviceGetDriverModel(device, &currentModel, &pendingModel); - if (result == NVML_ERROR_NOT_SUPPORTED) { - printf("GPU %u: %s\n", i, name); - printf(" Driver model query not supported (non-Windows or unsupported device).\n"); - continue; - } else if (result != NVML_SUCCESS) { - fprintf(stderr, - "nvmlDeviceGetDriverModel(%u) failed: %s\n", - i, nvmlErrorString(result)); - continue; - } - - printf("GPU %u: %s\n", i, name); - printf(" Current driver model: %s\n", driverModelToString(currentModel)); - printf(" Pending driver model: %s\n", driverModelToString(pendingModel)); - } - - nvmlShutdown(); - return EXIT_SUCCESS; -} diff --git a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c new file mode 100644 index 0000000000..2dc8f768bc --- /dev/null +++ b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 + +// Query NVML for the Windows WDDM driver model, looping over all GPUs. +// +// Example compilation command: +// cl /nologo /c wddm_driver_model_is_in_use.c /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\include" +// Needed for linking: +// /link /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\lib\x64" nvml.lib + +#include "wddm_driver_model_is_in_use.h" + +#include "nvml.h" // from NVIDIA GPU Computing Toolkit + +static int wddm_driver_model_is_in_use_impl(void) +{ + unsigned deviceCount = 0; + nvmlReturn_t result = nvmlDeviceGetCount_v2(&deviceCount); + if (result != NVML_SUCCESS) { + return -2; + } + for (unsigned i_dev = 0; i_dev < deviceCount; ++i_dev) { + nvmlDevice_t device; + result = nvmlDeviceGetHandleByIndex_v2(i_dev, &device); + if (result == NVML_SUCCESS) { + nvmlDriverModel_t currentModel = 0; + nvmlDriverModel_t pendingModel = 0; + result = nvmlDeviceGetDriverModel(device, &currentModel, &pendingModel); + if (result == NVML_SUCCESS) { + if (currentModel == NVML_DRIVER_WDDM || pendingModel == NVML_DRIVER_WDDM) { + return 1; + } + } + } + } + return 0; +} + +int wddm_driver_model_is_in_use(void) +{ + nvmlReturn_t result = nvmlInit_v2(); + if (result != NVML_SUCCESS) { + return -1; + } + int return_code = wddm_driver_model_is_in_use_impl(); + nvmlShutdown(); + return return_code; +} diff --git a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h new file mode 100644 index 0000000000..3472b90014 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +int wddm_driver_model_is_in_use(void); + +#ifdef __cplusplus +} /* extern "C" */ +#endif From 4fa00ee8967c52a2082cd6983e09b0c2fbed2954 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 21 Nov 2025 11:34:05 -0800 Subject: [PATCH 06/22] Update build_hooks.py to link in nvml.lib --- cuda_core/build_hooks.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 67428e7579..703227d14d 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -102,9 +102,16 @@ def get_sources(mod): def get_libraries(mod): if os.name == "nt" and mod == "_event": - return ["user32", "gdi32"] # for hags_status.c + # user32 / gdi32 for hags_status.c, nvml for wddm_driver_model_is_in_use.c + return ["user32", "gdi32", "nvml"] return None + def get_library_dirs(): + if os.name != "nt": + return None + # wddm_driver_model_is_in_use.c needs nvml.lib + return [os.path.join(root, "lib", "x64") for root in get_cuda_paths()] + ext_modules = tuple( Extension( f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", @@ -112,6 +119,7 @@ def get_libraries(mod): include_dirs=common_include_dirs, language="c++", libraries=get_libraries(mod), + library_dirs=get_library_dirs(), ) for mod in module_names ) From 25060c684d7a45e9c765efec704a0299df766234 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Fri, 21 Nov 2025 11:58:37 -0800 Subject: [PATCH 07/22] Call wddm_driver_model_is_in_use from test_event.py --- cuda_core/cuda/core/experimental/_event.pyx | 24 ++++++++------------- cuda_core/tests/test_event.py | 8 ++++--- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 3cd94e7ea9..4e0672aaa9 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -16,6 +16,15 @@ from cuda.core.experimental._utils.cuda_utils cimport ( cdef extern from "hags_status.h": int _hags_status_impl "hags_status"() +cpdef int hags_status(): + return _hags_status_impl() + +cdef extern from "wddm_driver_model_is_in_use.h": + int _wddm_driver_model_is_in_use_impl "wddm_driver_model_is_in_use"() + +cpdef int wddm_driver_model_is_in_use(): + return _wddm_driver_model_is_in_use_impl() + import cython from dataclasses import dataclass import multiprocessing @@ -306,18 +315,3 @@ def _reduce_event(event): return event.from_ipc_descriptor, (event.get_ipc_descriptor(),) multiprocessing.reduction.register(Event, _reduce_event) - -cpdef int hags_status(): - """Check Hardware Accelerated GPU Scheduling (HAGS) status on Windows. 
- - Returns - ------- - int - Status code indicating HAGS state: - - - -1: Not available on this platform (not compiled with MSVC on Windows) - - 0: Failure obtaining HwSchSupported/HwSchEnabled - - 1: HwSchSupported == 0 or HwSchEnabled == 0 (HAGS not fully enabled) - - 2: HwSchSupported == 1 and HwSchEnabled == 1 (HAGS fully enabled) - """ - return _hags_status_impl() diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index de0c74a6e3..3e714526cf 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -11,15 +11,17 @@ Event, EventOptions, ) -from cuda.core.experimental._event import hags_status +from cuda.core.experimental._event import hags_status, wddm_driver_model_is_in_use from helpers.latch import LatchKernel from cuda_python_test_helpers import IS_WSL def inspect_hags_status(): - stat = hags_status() - print(f"\nLOOOK {stat=!r}", flush=True) + hags = hags_status() + print(f"\nLOOOK {hags=!r}", flush=True) + wddm = wddm_driver_model_is_in_use() + print(f"\nLOOOK {wddm=!r}", flush=True) def test_event_init_disabled(): From 7e340af7497de0f2bfcb8aff3d4f7d4d22dc9a07 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 21 Nov 2025 12:12:04 -0800 Subject: [PATCH 08/22] Add ensure_hags_is_enabled_if_wddm_driver_model_is_in_use() in _event.pyx --- cuda_core/cuda/core/experimental/_event.pyx | 31 +++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 4e0672aaa9..ec44fc08aa 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -25,6 +25,35 @@ cdef extern from "wddm_driver_model_is_in_use.h": cpdef int wddm_driver_model_is_in_use(): return _wddm_driver_model_is_in_use_impl() + +def ensure_hags_is_enabled_if_wddm_driver_model_is_in_use() -> None: + """On Windows with WDDM driver model, require HAGS to be fully enabled. 
+ + If WDDM is not in use, or the platform is non-Windows, this is a no-op. + """ + import sys + + if sys.platform != "win32": + return + + wddm_state = wddm_driver_model_is_in_use() + if wddm_state != 1: + # Either not WDDM or NVML was not able to determine the driver model. + return + + hags_state = hags_status() + if hags_state == 2: + # HAGS fully enabled. + return + + raise RuntimeError( + "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled when the " + "Windows WDDM driver model is in use in order to obtain reliable CUDA event " + "timing. Please enable HAGS in the Windows graphics settings or switch to a " + "non-WDDM driver model." + ) + + import cython from dataclasses import dataclass import multiprocessing @@ -148,6 +177,8 @@ cdef class Event: def __sub__(self, other: Event): # return self - other (in milliseconds) + if not self.is_timing_disabled and not other.is_timing_disabled: + ensure_hags_is_enabled_if_wddm_driver_model_is_in_use() cdef float timing with nogil: err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) From 2eebbca5ab5fe82e521de2b8c863404381ac402d Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Fri, 21 Nov 2025 13:18:18 -0800 Subject: [PATCH 09/22] Work _xfail_if_hags_runtime_error into tests/test_event.py --- cuda_core/tests/test_event.py | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 3e714526cf..67bb9055c6 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import os +import re import time import cuda.core.experimental @@ -16,6 +17,8 @@ from cuda_python_test_helpers import IS_WSL +_HAGS_ERROR_SUBSTRING = "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled" + def inspect_hags_status(): hags = hags_status() @@ -24,6 +27,18 @@ def inspect_hags_status(): print(f"\nLOOOK {wddm=!r}", flush=True) +def _xfail_if_hags_runtime_error(exc: BaseException, expected_regex: str | None = None) -> None: + message = str(exc) + if _HAGS_ERROR_SUBSTRING in message: + pytest.xfail( + "HAGS is not fully enabled while the Windows WDDM driver model is in use; " + "event timing tests are expected to fail in this configuration." + ) + + if expected_regex is not None: + assert re.match(expected_regex, message), f"Expected regex: {expected_regex!r}\nActual message: {message!r}" + + def test_event_init_disabled(): with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."): cuda.core.experimental._event.Event() # Ensure back door is locked. 
@@ -38,7 +53,11 @@ def test_timing_success(init_cuda): time.sleep(delay_seconds) e2 = stream.record(options=options) e2.sync() - elapsed_time_ms = e2 - e1 + try: + elapsed_time_ms = e2 - e1 + except RuntimeError as exc: + _xfail_if_hags_runtime_error(exc) + raise assert isinstance(elapsed_time_ms, float) # Using a generous tolerance, to avoid flaky tests: # We only want to exercise the __sub__ method, this test is not meant @@ -115,12 +134,17 @@ def test_error_timing_recorded(): event3 = device.create_event(options=enabled) stream.sync() - with pytest.raises(RuntimeError, match="^Both Events must be recorded"): + with pytest.raises(RuntimeError) as excinfo: event2 - event1 - with pytest.raises(RuntimeError, match="^Both Events must be recorded"): + _xfail_if_hags_runtime_error(excinfo.value, r"^Both Events must be recorded") + + with pytest.raises(RuntimeError) as excinfo: event1 - event2 - with pytest.raises(RuntimeError, match="^Both Events must be recorded"): + _xfail_if_hags_runtime_error(excinfo.value, r"^Both Events must be recorded") + + with pytest.raises(RuntimeError) as excinfo: event3 - event2 + _xfail_if_hags_runtime_error(excinfo.value, r"^Both Events must be recorded") def test_error_timing_incomplete(): @@ -135,8 +159,9 @@ def test_error_timing_incomplete(): event3 = stream.record(options=enabled) # event3 will never complete because the latch has not been released - with pytest.raises(RuntimeError, match="^One or both events have not completed."): + with pytest.raises(RuntimeError) as excinfo: event3 - event1 + _xfail_if_hags_runtime_error(excinfo.value, r"^One or both events have not completed.") latch.release() event3.sync() From 3c2a1c7df3f3b09cb447986dd8f294084c834644 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 22 Nov 2025 12:55:15 -0800 Subject: [PATCH 10/22] Clean out inspect_hags_status(), add _is_hags_timing_usable() helper. 
--- cuda_core/tests/test_event.py | 89 ++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 32 deletions(-) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 67bb9055c6..deffcfa38e 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import os -import re import time import cuda.core.experimental @@ -12,7 +11,6 @@ Event, EventOptions, ) -from cuda.core.experimental._event import hags_status, wddm_driver_model_is_in_use from helpers.latch import LatchKernel from cuda_python_test_helpers import IS_WSL @@ -20,23 +18,32 @@ _HAGS_ERROR_SUBSTRING = "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled" -def inspect_hags_status(): - hags = hags_status() - print(f"\nLOOOK {hags=!r}", flush=True) - wddm = wddm_driver_model_is_in_use() - print(f"\nLOOOK {wddm=!r}", flush=True) +def _is_hags_timing_usable() -> bool: + """Probe Event.__sub__ to detect HAGS/WDDM timing issues. + Returns True if timing appears usable, False if we see the known + HAGS/WDDM RuntimeError. Any other RuntimeError is propagated. + """ + device = Device() + device.set_current() + options = EventOptions(enable_timing=True) + stream = device.create_stream() -def _xfail_if_hags_runtime_error(exc: BaseException, expected_regex: str | None = None) -> None: - message = str(exc) - if _HAGS_ERROR_SUBSTRING in message: - pytest.xfail( - "HAGS is not fully enabled while the Windows WDDM driver model is in use; " - "event timing tests are expected to fail in this configuration." 
- ) + event1 = stream.record(options=options) + event2 = stream.record(options=options) + event2.sync() - if expected_regex is not None: - assert re.match(expected_regex, message), f"Expected regex: {expected_regex!r}\nActual message: {message!r}" + try: + _ = event2 - event1 + except RuntimeError as exc: + message = str(exc) + if _HAGS_ERROR_SUBSTRING in message: + return False + raise + return True + + +_HAGS_TIMING_USABLE = _is_hags_timing_usable() def test_event_init_disabled(): @@ -44,8 +51,22 @@ def test_event_init_disabled(): cuda.core.experimental._event.Event() # Ensure back door is locked. +def test_ensure_hags_is_enabled_if_wddm_driver_model_is_in_use(): + if not _HAGS_TIMING_USABLE: + pytest.xfail( + "HAGS is not fully enabled while the Windows WDDM driver model is in use; " + "event timing tests are expected to fail in this configuration." + ) + + +@pytest.mark.skipif( + not _HAGS_TIMING_USABLE, + reason=( + "HAGS is not fully enabled while the Windows WDDM driver model is in use; " + "event timing tests are expected to fail in this configuration." + ), +) def test_timing_success(init_cuda): - inspect_hags_status() options = EventOptions(enable_timing=True) stream = Device().create_stream() delay_seconds = 0.5 @@ -53,11 +74,7 @@ def test_timing_success(init_cuda): time.sleep(delay_seconds) e2 = stream.record(options=options) e2.sync() - try: - elapsed_time_ms = e2 - e1 - except RuntimeError as exc: - _xfail_if_hags_runtime_error(exc) - raise + elapsed_time_ms = e2 - e1 assert isinstance(elapsed_time_ms, float) # Using a generous tolerance, to avoid flaky tests: # We only want to exercise the __sub__ method, this test is not meant @@ -123,6 +140,13 @@ def test_error_timing_disabled(): event2 - event1 +@pytest.mark.skipif( + not _HAGS_TIMING_USABLE, + reason=( + "HAGS is not fully enabled while the Windows WDDM driver model is in use; " + "event timing tests are expected to fail in this configuration." 
+ ), +) def test_error_timing_recorded(): device = Device() device.set_current() @@ -134,19 +158,21 @@ def test_error_timing_recorded(): event3 = device.create_event(options=enabled) stream.sync() - with pytest.raises(RuntimeError) as excinfo: + with pytest.raises(RuntimeError, match="^Both Events must be recorded"): event2 - event1 - _xfail_if_hags_runtime_error(excinfo.value, r"^Both Events must be recorded") - - with pytest.raises(RuntimeError) as excinfo: + with pytest.raises(RuntimeError, match="^Both Events must be recorded"): event1 - event2 - _xfail_if_hags_runtime_error(excinfo.value, r"^Both Events must be recorded") - - with pytest.raises(RuntimeError) as excinfo: + with pytest.raises(RuntimeError, match="^Both Events must be recorded"): event3 - event2 - _xfail_if_hags_runtime_error(excinfo.value, r"^Both Events must be recorded") +@pytest.mark.skipif( + not _HAGS_TIMING_USABLE, + reason=( + "HAGS is not fully enabled while the Windows WDDM driver model is in use; " + "event timing tests are expected to fail in this configuration." + ), +) def test_error_timing_incomplete(): device = Device() device.set_current() @@ -159,9 +185,8 @@ def test_error_timing_incomplete(): event3 = stream.record(options=enabled) # event3 will never complete because the latch has not been released - with pytest.raises(RuntimeError) as excinfo: + with pytest.raises(RuntimeError, match="^One or both events have not completed."): event3 - event1 - _xfail_if_hags_runtime_error(excinfo.value, r"^One or both events have not completed.") latch.release() event3.sync() From 066d92845fb616da2c0a63b12464e283b52d5b64 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Sat, 22 Nov 2025 13:43:07 -0800 Subject: [PATCH 11/22] Refactor as _get_wddm_hags_error() --- cuda_core/tests/test_event.py | 55 ++++++++++------------------------- 1 file changed, 16 insertions(+), 39 deletions(-) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index deffcfa38e..b094ae4e8e 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -15,14 +15,11 @@ from cuda_python_test_helpers import IS_WSL -_HAGS_ERROR_SUBSTRING = "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled" +def _get_wddm_hags_error() -> str: + """Probe Event.__sub__ to obtain WDDM/HAGS RuntimeError. -def _is_hags_timing_usable() -> bool: - """Probe Event.__sub__ to detect HAGS/WDDM timing issues. - - Returns True if timing appears usable, False if we see the known - HAGS/WDDM RuntimeError. Any other RuntimeError is propagated. + Any other RuntimeError is propagated. """ device = Device() device.set_current() @@ -36,14 +33,15 @@ def _is_hags_timing_usable() -> bool: try: _ = event2 - event1 except RuntimeError as exc: - message = str(exc) - if _HAGS_ERROR_SUBSTRING in message: - return False + msg = str(exc) + if "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled" in msg: + return msg raise - return True + return None -_HAGS_TIMING_USABLE = _is_hags_timing_usable() +_WDDM_HAGS_ERROR = _get_wddm_hags_error() +_WDDM_HAGS_PRECONDITION_MSG = "WDDM/HAGS precondition not met" def test_event_init_disabled(): @@ -52,20 +50,11 @@ def test_event_init_disabled(): def test_ensure_hags_is_enabled_if_wddm_driver_model_is_in_use(): - if not _HAGS_TIMING_USABLE: - pytest.xfail( - "HAGS is not fully enabled while the Windows WDDM driver model is in use; " - "event timing tests are expected to fail in this configuration." 
- ) - - -@pytest.mark.skipif( - not _HAGS_TIMING_USABLE, - reason=( - "HAGS is not fully enabled while the Windows WDDM driver model is in use; " - "event timing tests are expected to fail in this configuration." - ), -) + if _WDDM_HAGS_ERROR: + pytest.xfail(_WDDM_HAGS_ERROR) + + +@pytest.mark.skipif(_WDDM_HAGS_ERROR is not None, reason=_WDDM_HAGS_PRECONDITION_MSG) def test_timing_success(init_cuda): options = EventOptions(enable_timing=True) stream = Device().create_stream() @@ -140,13 +129,7 @@ def test_error_timing_disabled(): event2 - event1 -@pytest.mark.skipif( - not _HAGS_TIMING_USABLE, - reason=( - "HAGS is not fully enabled while the Windows WDDM driver model is in use; " - "event timing tests are expected to fail in this configuration." - ), -) +@pytest.mark.skipif(_WDDM_HAGS_ERROR is not None, reason=_WDDM_HAGS_PRECONDITION_MSG) def test_error_timing_recorded(): device = Device() device.set_current() @@ -166,13 +149,7 @@ def test_error_timing_recorded(): event3 - event2 -@pytest.mark.skipif( - not _HAGS_TIMING_USABLE, - reason=( - "HAGS is not fully enabled while the Windows WDDM driver model is in use; " - "event timing tests are expected to fail in this configuration." - ), -) +@pytest.mark.skipif(_WDDM_HAGS_ERROR is not None, reason=_WDDM_HAGS_PRECONDITION_MSG) def test_error_timing_incomplete(): device = Device() device.set_current() From a80f5531f213ab3eeb9908c4c4b8857071c20904 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Sat, 22 Nov 2025 13:48:39 -0800 Subject: [PATCH 12/22] Shorten name: ensure_wddm_with_hags() --- cuda_core/cuda/core/experimental/_event.pyx | 4 ++-- cuda_core/tests/test_event.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index ec44fc08aa..89b9ced29d 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -26,7 +26,7 @@ cpdef int wddm_driver_model_is_in_use(): return _wddm_driver_model_is_in_use_impl() -def ensure_hags_is_enabled_if_wddm_driver_model_is_in_use() -> None: +def ensure_wddm_with_hags() -> None: """On Windows with WDDM driver model, require HAGS to be fully enabled. If WDDM is not in use, or the platform is non-Windows, this is a no-op. @@ -178,7 +178,7 @@ cdef class Event: def __sub__(self, other: Event): # return self - other (in milliseconds) if not self.is_timing_disabled and not other.is_timing_disabled: - ensure_hags_is_enabled_if_wddm_driver_model_is_in_use() + ensure_wddm_with_hags() cdef float timing with nogil: err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index b094ae4e8e..998a87db03 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -49,7 +49,7 @@ def test_event_init_disabled(): cuda.core.experimental._event.Event() # Ensure back door is locked. -def test_ensure_hags_is_enabled_if_wddm_driver_model_is_in_use(): +def test_ensure_wddm_with_hags(): if _WDDM_HAGS_ERROR: pytest.xfail(_WDDM_HAGS_ERROR) From 35cccf39d6563068909ca8f63b950165142b5d1a Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Sat, 22 Nov 2025 13:56:32 -0800 Subject: [PATCH 13/22] Remove hags_status(), wddm_driver_model_is_in_use() Python bindings --- cuda_core/cuda/core/experimental/_event.pyx | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 89b9ced29d..c14f8df9eb 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -14,16 +14,10 @@ from cuda.core.experimental._utils.cuda_utils cimport ( ) cdef extern from "hags_status.h": - int _hags_status_impl "hags_status"() - -cpdef int hags_status(): - return _hags_status_impl() + int hags_status() cdef extern from "wddm_driver_model_is_in_use.h": - int _wddm_driver_model_is_in_use_impl "wddm_driver_model_is_in_use"() - -cpdef int wddm_driver_model_is_in_use(): - return _wddm_driver_model_is_in_use_impl() + int wddm_driver_model_is_in_use() def ensure_wddm_with_hags() -> None: From 58e157820371e151200e63b9274fa2e1c5e8fb43 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 22 Nov 2025 13:59:22 -0800 Subject: [PATCH 14/22] .gitattributes: _utils/*.h text --- .gitattributes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 3bff10eeff..9cb2b279c4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -10,4 +10,4 @@ cuda/_version.py export-subst # SCM syntax highlighting & preventing 3-way merges pixi.lock merge=binary linguist-language=YAML linguist-generated=true -cuda_core/cuda/core/experimental/_utils/hags_status.h text eol=lf +cuda_core/cuda/core/experimental/_utils/*.h text eol=lf From 8d11440cadbdc3665599feebc348de5389e18fb8 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 22 Nov 2025 14:26:55 -0800 Subject: [PATCH 15/22] Caching: Run wddm_driver_model_is_in_use() and hags_status() only once per process. 
--- cuda_core/cuda/core/experimental/_event.pyx | 39 ++++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index c14f8df9eb..c3fc35eff4 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -13,6 +13,8 @@ from cuda.core.experimental._utils.cuda_utils cimport ( HANDLE_RETURN ) +import sys + cdef extern from "hags_status.h": int hags_status() @@ -20,32 +22,51 @@ cdef extern from "wddm_driver_model_is_in_use.h": int wddm_driver_model_is_in_use() +cdef int _ensure_wddm_with_hags_state = 0 +# 0 = unknown / not checked +# 1 = OK (no restriction or HAGS fully enabled) +# 2 = misconfigured (should raise) + +_WDDM_HAGS_ERROR = ( + "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled when the " + "Windows WDDM driver model is in use in order to obtain reliable CUDA event " + "timing. Please enable HAGS in the Windows graphics settings or switch to a " + "non-WDDM driver model." +) + + def ensure_wddm_with_hags() -> None: """On Windows with WDDM driver model, require HAGS to be fully enabled. If WDDM is not in use, or the platform is non-Windows, this is a no-op. + The result of the driver/HAGS probe is cached per process. """ - import sys + global _ensure_wddm_with_hags_state + + cdef int state = _ensure_wddm_with_hags_state + if state == 1: + return + if state == 2: + raise RuntimeError(_WDDM_HAGS_ERROR) if sys.platform != "win32": + _ensure_wddm_with_hags_state = 1 return - wddm_state = wddm_driver_model_is_in_use() + cdef int wddm_state = wddm_driver_model_is_in_use() if wddm_state != 1: # Either not WDDM or NVML was not able to determine the driver model. + _ensure_wddm_with_hags_state = 1 return - hags_state = hags_status() + cdef int hags_state = hags_status() if hags_state == 2: # HAGS fully enabled. 
+ _ensure_wddm_with_hags_state = 1 return - raise RuntimeError( - "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled when the " - "Windows WDDM driver model is in use in order to obtain reliable CUDA event " - "timing. Please enable HAGS in the Windows graphics settings or switch to a " - "non-WDDM driver model." - ) + _ensure_wddm_with_hags_state = 2 + raise RuntimeError(_WDDM_HAGS_ERROR) import cython From dfd4d135c7866b159f2e0b2998ab6a541e88983c Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 22 Nov 2025 14:55:33 -0800 Subject: [PATCH 16/22] Stub wddm_driver_model_is_in_use on non-Windows --- .../_utils/wddm_driver_model_is_in_use.c | 25 +++++++++++++++---- cuda_python/README.md | 0 2 files changed, 20 insertions(+), 5 deletions(-) mode change 120000 => 100644 cuda_python/README.md diff --git a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c index 2dc8f768bc..63197cb5c3 100644 --- a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c +++ b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c @@ -1,15 +1,19 @@ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 - +// // Query NVML for the Windows WDDM driver model, looping over all GPUs. // -// Example compilation command: -// cl /nologo /c wddm_driver_model_is_in_use.c /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\include" +// On non-Windows platforms this always returns -1 and performs no NVML calls. 
+// +// Example compilation command (Windows/MSVC): +// cl /nologo /c wddm_driver_model_is_in_use.c /I"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\include" // Needed for linking: -// /link /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\lib\x64" nvml.lib - +// /link /LIBPATH:"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\lib\\x64" nvml.lib +// #include "wddm_driver_model_is_in_use.h" +#ifdef _MSC_VER + #include "nvml.h" // from NVIDIA GPU Computing Toolkit static int wddm_driver_model_is_in_use_impl(void) @@ -46,3 +50,14 @@ int wddm_driver_model_is_in_use(void) nvmlShutdown(); return return_code; } + +#else // !_MSC_VER + +int wddm_driver_model_is_in_use(void) +{ + // WDDM is a Windows-only concept; on non-Windows platforms we report -1 + // to indicate that the driver model could not be determined. + return -1; +} + +#endif // _MSC_VER diff --git a/cuda_python/README.md b/cuda_python/README.md deleted file mode 120000 index 32d46ee883..0000000000 --- a/cuda_python/README.md +++ /dev/null @@ -1 +0,0 @@ -../README.md \ No newline at end of file diff --git a/cuda_python/README.md b/cuda_python/README.md new file mode 100644 index 0000000000..32d46ee883 --- /dev/null +++ b/cuda_python/README.md @@ -0,0 +1 @@ +../README.md \ No newline at end of file From a41581056c83a62331862de489790211af229590 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 22 Nov 2025 20:42:47 -0800 Subject: [PATCH 17/22] Add cuda_nvml_dev to cuda-components in fetch_ctk/action.yml --- .github/actions/fetch_ctk/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml index 6cec965105..e5f6c18e30 100644 --- a/.github/actions/fetch_ctk/action.yml +++ b/.github/actions/fetch_ctk/action.yml @@ -17,7 +17,7 @@ inputs: description: "A list of the CTK components to install as a comma-separated list. e.g. 
'cuda_nvcc,cuda_nvrtc,cuda_cudart'" required: false type: string - default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile" + default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile,cuda_nvml_dev" cuda-path: description: "where the CTK components will be installed to, relative to $PWD" required: false From 7c2aa1ae71af84a93fe86606bc745466e0449989 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 22 Nov 2025 21:22:03 -0800 Subject: [PATCH 18/22] Exclude nvml.dll from delvewheel repair nvml.dll is not part of the CTK but is installed with the CUDA driver. Adding --exclude nvml.dll to the delvewheel repair command prevents delvewheel from searching for this DLL during wheel repair, since it will be available system-wide at runtime. --- cuda_core/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index a920005f21..5ea98fc85e 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -86,4 +86,4 @@ archs = "native" [tool.cibuildwheel.windows] archs = "AMD64" before-build = "pip install delvewheel" -repair-wheel-command = "delvewheel repair --namespace-pkg cuda -w {dest_dir} {wheel}" +repair-wheel-command = "delvewheel repair --namespace-pkg cuda --exclude nvml.dll -w {dest_dir} {wheel}" From 85a1550ee989d446d72ca34c99f45311934cc3e6 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 22 Nov 2025 22:56:48 -0800 Subject: [PATCH 19/22] Add documentation comment to wddm_driver_model_is_in_use.h Add a comment block documenting the return codes, matching the style of hags_status.h. Also converts the file from binary to text mode per .gitattributes rules. 
--- .../experimental/_utils/wddm_driver_model_is_in_use.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h index 3472b90014..15ddca24e4 100644 --- a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h +++ b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h @@ -7,6 +7,15 @@ extern "C" { #endif +/* + * wddm_driver_model_is_in_use + * + * Return codes: + * -2 : Failed to get device count from NVML + * -1 : Not available on this platform (not compiled with MSVC on Windows) or NVML initialization failed + * 0 : No WDDM driver model found (all devices use TCC or other driver models) + * 1 : WDDM driver model is in use (at least one device uses WDDM) + */ int wddm_driver_model_is_in_use(void); #ifdef __cplusplus From 63ee3f386d76b24a593ba0cdac9fd884bf79ec45 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 22 Nov 2025 23:02:09 -0800 Subject: [PATCH 20/22] Change 'event timing' to 'event timings' in WDDM HAGS error message Use plural 'timings' to better match the verb 'obtain', which pairs more naturally with concrete measurements/results rather than an abstract concept. --- cuda_core/cuda/core/experimental/_event.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index c3fc35eff4..b50f1e0173 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -30,7 +30,7 @@ cdef int _ensure_wddm_with_hags_state = 0 _WDDM_HAGS_ERROR = ( "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled when the " "Windows WDDM driver model is in use in order to obtain reliable CUDA event " - "timing. Please enable HAGS in the Windows graphics settings or switch to a " + "timings. 
Please enable HAGS in the Windows graphics settings or switch to a " "non-WDDM driver model." ) From 43d7c8ff3f9d228091baa72f289661875722ef2b Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 22 Nov 2025 23:09:18 -0800 Subject: [PATCH 21/22] Restore cuda_python/README.md as symlink Restore the symlink that was accidentally converted to a regular file, likely due to Windows/WSL2 interaction. This restores the file mode from 100644 (regular file) back to 120000 (symlink) to match main. --- cuda_python/README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 120000 cuda_python/README.md diff --git a/cuda_python/README.md b/cuda_python/README.md deleted file mode 100644 index 32d46ee883..0000000000 --- a/cuda_python/README.md +++ /dev/null @@ -1 +0,0 @@ -../README.md \ No newline at end of file diff --git a/cuda_python/README.md b/cuda_python/README.md new file mode 120000 index 0000000000..32d46ee883 --- /dev/null +++ b/cuda_python/README.md @@ -0,0 +1 @@ +../README.md \ No newline at end of file From d6d7b578d764899dd3e5555618fb9fa644851670 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 22 Nov 2025 23:16:31 -0800 Subject: [PATCH 22/22] Standardize platform detection to use sys.platform == 'win32' Replace os.name == 'nt' with sys.platform == 'win32' in: - cuda_core/build_hooks.py (2 occurrences) - cuda_core/tests/test_event.py (1 occurrence) This matches the pattern used throughout the rest of the codebase for consistency. 
--- cuda_core/build_hooks.py | 5 +++-- cuda_core/tests/test_event.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 703227d14d..ce94ab1067 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -12,6 +12,7 @@ import os import re import subprocess +import sys from Cython.Build import cythonize from setuptools import Extension @@ -101,13 +102,13 @@ def get_sources(mod): return sources def get_libraries(mod): - if os.name == "nt" and mod == "_event": + if sys.platform == "win32" and mod == "_event": # user32 / gdi32 for hags_status.c, nvml for wddm_driver_model_is_in_use.c return ["user32", "gdi32", "nvml"] return None def get_library_dirs(): - if os.name != "nt": + if sys.platform != "win32": return None # wddm_driver_model_is_in_use.c needs nvml.lib return [os.path.join(root, "lib", "x64") for root in get_cuda_paths()] diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 998a87db03..ce6b0e1cd6 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import os +import sys import time import cuda.core.experimental @@ -69,7 +69,7 @@ def test_timing_success(init_cuda): # We only want to exercise the __sub__ method, this test is not meant # to stress-test the CUDA driver or time.sleep(). delay_ms = delay_seconds * 1000 - if os.name == "nt" or IS_WSL: # noqa: SIM108 + if sys.platform == "win32" or IS_WSL: # noqa: SIM108 # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default. generous_tolerance = 100 else: