diff --git a/.gitattributes b/.gitattributes index cf17ba9d5e..9cb2b279c4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -9,3 +9,5 @@ cuda/_version.py export-subst *.png binary # SCM syntax highlighting & preventing 3-way merges pixi.lock merge=binary linguist-language=YAML linguist-generated=true + +cuda_core/cuda/core/experimental/_utils/*.h text eol=lf diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml index 6cec965105..e5f6c18e30 100644 --- a/.github/actions/fetch_ctk/action.yml +++ b/.github/actions/fetch_ctk/action.yml @@ -17,7 +17,7 @@ inputs: description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'" required: false type: string - default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile" + default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile,cuda_nvml_dev" cuda-path: description: "where the CTK components will be installed to, relative to $PWD" required: false diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index e38f5676df..ce94ab1067 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -12,6 +12,7 @@ import os import re import subprocess +import sys from Cython.Build import cythonize from setuptools import Extension @@ -84,12 +85,42 @@ def get_cuda_paths(): print("CUDA paths:", CUDA_PATH) return CUDA_PATH + common_include_dirs = [ + *(os.path.join(root, "include") for root in get_cuda_paths()), + os.path.join("cuda", "core", "experimental", "_utils"), + ] + + def get_sources(mod): + sources = [f"cuda/core/experimental/{mod}.pyx"] + if mod == "_event": + sources.extend( + [ + "cuda/core/experimental/_utils/hags_status.c", + "cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c", + ] + ) + return sources + + def get_libraries(mod): + if sys.platform == "win32" and mod == "_event": + # user32 / gdi32 
for hags_status.c, nvml for wddm_driver_model_is_in_use.c + return ["user32", "gdi32", "nvml"] + return None + + def get_library_dirs(): + if sys.platform != "win32": + return None + # wddm_driver_model_is_in_use.c needs nvml.lib + return [os.path.join(root, "lib", "x64") for root in get_cuda_paths()] + ext_modules = tuple( Extension( f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", - sources=[f"cuda/core/experimental/{mod}.pyx"], - include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()), + sources=get_sources(mod), + include_dirs=common_include_dirs, language="c++", + libraries=get_libraries(mod), + library_dirs=get_library_dirs(), ) for mod in module_names ) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 98a45d0043..b50f1e0173 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -13,6 +13,62 @@ from cuda.core.experimental._utils.cuda_utils cimport ( HANDLE_RETURN ) +import sys + +cdef extern from "hags_status.h": + int hags_status() + +cdef extern from "wddm_driver_model_is_in_use.h": + int wddm_driver_model_is_in_use() + + +cdef int _ensure_wddm_with_hags_state = 0 +# 0 = unknown / not checked +# 1 = OK (no restriction or HAGS fully enabled) +# 2 = misconfigured (should raise) + +_WDDM_HAGS_ERROR = ( + "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled when the " + "Windows WDDM driver model is in use in order to obtain reliable CUDA event " + "timings. Please enable HAGS in the Windows graphics settings or switch to a " + "non-WDDM driver model." +) + + +def ensure_wddm_with_hags() -> None: + """On Windows with WDDM driver model, require HAGS to be fully enabled. + + If WDDM is not in use, or the platform is non-Windows, this is a no-op. + The result of the driver/HAGS probe is cached per process. 
+ """ + global _ensure_wddm_with_hags_state + + cdef int state = _ensure_wddm_with_hags_state + if state == 1: + return + if state == 2: + raise RuntimeError(_WDDM_HAGS_ERROR) + + if sys.platform != "win32": + _ensure_wddm_with_hags_state = 1 + return + + cdef int wddm_state = wddm_driver_model_is_in_use() + if wddm_state != 1: + # Either not WDDM or NVML was not able to determine the driver model. + _ensure_wddm_with_hags_state = 1 + return + + cdef int hags_state = hags_status() + if hags_state == 2: + # HAGS fully enabled. + _ensure_wddm_with_hags_state = 1 + return + + _ensure_wddm_with_hags_state = 2 + raise RuntimeError(_WDDM_HAGS_ERROR) + + import cython from dataclasses import dataclass import multiprocessing @@ -136,6 +192,8 @@ cdef class Event: def __sub__(self, other: Event): # return self - other (in milliseconds) + if not self.is_timing_disabled and not other.is_timing_disabled: + ensure_wddm_with_hags() cdef float timing with nogil: err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) diff --git a/cuda_core/cuda/core/experimental/_utils/hags_status.c b/cuda_core/cuda/core/experimental/_utils/hags_status.c new file mode 100644 index 0000000000..e034dcf855 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_utils/hags_status.c @@ -0,0 +1,84 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Note, this may or may not exist, but is NOT the ground truth: +// reg query "HKLM\SYSTEM\CurrentControlSet\Control\GraphicsDrivers" /v HwSchMode +// The HwSchMode registry value is only a user override (force on/off). +// If absent, Windows uses the driver's WDDM caps defaults. +// Actual HAGS state comes from D3DKMT_WDDM_2_7_CAPS, not the registry. 
+ +// Possibly useful for experimentation: +// reg delete "HKLM\SYSTEM\CurrentControlSet\Control\GraphicsDrivers" /v HwSchMode /f + +#ifdef _MSC_VER +#include +#include +#include +#endif + +int hags_status(void) +{ +#ifdef _MSC_VER + DISPLAY_DEVICEW dd; + HDC hdc; + int i; + BOOL foundPrimary = FALSE; + NTSTATUS status; + + D3DKMT_OPENADAPTERFROMHDC openData; + D3DKMT_QUERYADAPTERINFO query; + D3DKMT_WDDM_2_7_CAPS caps; + D3DKMT_CLOSEADAPTER closeData; + + // Find the primary display device + ZeroMemory(&dd, sizeof(dd)); + dd.cb = sizeof(dd); + + for (i = 0; EnumDisplayDevicesW(NULL, i, &dd, 0); ++i) { + if (dd.StateFlags & DISPLAY_DEVICE_PRIMARY_DEVICE) { + foundPrimary = TRUE; + break; + } + } + + if (!foundPrimary) + return 0; + + hdc = CreateDCW(NULL, dd.DeviceName, NULL, NULL); + if (!hdc) + return 0; + + ZeroMemory(&openData, sizeof(openData)); + openData.hDc = hdc; + status = D3DKMTOpenAdapterFromHdc(&openData); + + DeleteDC(hdc); + + if (status != 0) + return 0; + + ZeroMemory(&caps, sizeof(caps)); + ZeroMemory(&query, sizeof(query)); + + query.hAdapter = openData.hAdapter; + query.Type = KMTQAITYPE_WDDM_2_7_CAPS; + query.pPrivateDriverData = &caps; + query.PrivateDriverDataSize = sizeof(caps); + + status = D3DKMTQueryAdapterInfo(&query); + + ZeroMemory(&closeData, sizeof(closeData)); + closeData.hAdapter = openData.hAdapter; + D3DKMTCloseAdapter(&closeData); + + if (status != 0) + return 0; + + if (!caps.HwSchSupported || !caps.HwSchEnabled) + return 1; + + return 2; +#else + return -1; +#endif +} diff --git a/cuda_core/cuda/core/experimental/_utils/hags_status.h b/cuda_core/cuda/core/experimental/_utils/hags_status.h new file mode 100644 index 0000000000..e2194ea769 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_utils/hags_status.h @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hags_status + * + * Return codes: + * -1 : Not available on this platform (not compiled with MSVC on Windows) + * 0 : Failure obtaining HwSchSupported/HwSchEnabled + * 1 : HwSchSupported == 0 or HwSchEnabled == 0 (HAGS not fully enabled) + * 2 : HwSchSupported == 1 and HwSchEnabled == 1 (HAGS fully enabled) + */ +int hags_status(void); + +#ifdef __cplusplus +} /* extern "C" */ +#endif diff --git a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c new file mode 100644 index 0000000000..63197cb5c3 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Query NVML for the Windows WDDM driver model, looping over all GPUs. +// +// On non-Windows platforms this always returns -1 and performs no NVML calls. 
+// +// Example compilation command (Windows/MSVC): +// cl /nologo /c wddm_driver_model_is_in_use.c /I"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\include" +// Needed for linking: +// /link /LIBPATH:"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\lib\\x64" nvml.lib +// +#include "wddm_driver_model_is_in_use.h" + +#ifdef _MSC_VER + +#include "nvml.h" // from NVIDIA GPU Computing Toolkit + +static int wddm_driver_model_is_in_use_impl(void) +{ + unsigned deviceCount = 0; + nvmlReturn_t result = nvmlDeviceGetCount_v2(&deviceCount); + if (result != NVML_SUCCESS) { + return -2; + } + for (unsigned i_dev = 0; i_dev < deviceCount; ++i_dev) { + nvmlDevice_t device; + result = nvmlDeviceGetHandleByIndex_v2(i_dev, &device); + if (result == NVML_SUCCESS) { + nvmlDriverModel_t currentModel = 0; + nvmlDriverModel_t pendingModel = 0; + result = nvmlDeviceGetDriverModel(device, &currentModel, &pendingModel); + if (result == NVML_SUCCESS) { + if (currentModel == NVML_DRIVER_WDDM || pendingModel == NVML_DRIVER_WDDM) { + return 1; + } + } + } + } + return 0; +} + +int wddm_driver_model_is_in_use(void) +{ + nvmlReturn_t result = nvmlInit_v2(); + if (result != NVML_SUCCESS) { + return -1; + } + int return_code = wddm_driver_model_is_in_use_impl(); + nvmlShutdown(); + return return_code; +} + +#else // !_MSC_VER + +int wddm_driver_model_is_in_use(void) +{ + // WDDM is a Windows-only concept; on non-Windows platforms we report -1 + // to indicate that the driver model could not be determined. + return -1; +} + +#endif // _MSC_VER diff --git a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h new file mode 100644 index 0000000000..15ddca24e4 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * wddm_driver_model_is_in_use + * + * Return codes: + * -2 : Failed to get device count from NVML + * -1 : Not available on this platform (not compiled with MSVC on Windows) or NVML initialization failed + * 0 : No WDDM driver model found (all devices use TCC or other driver models) + * 1 : WDDM driver model is in use (at least one device uses WDDM) + */ +int wddm_driver_model_is_in_use(void); + +#ifdef __cplusplus +} /* extern "C" */ +#endif diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index a920005f21..5ea98fc85e 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -86,4 +86,4 @@ archs = "native" [tool.cibuildwheel.windows] archs = "AMD64" before-build = "pip install delvewheel" -repair-wheel-command = "delvewheel repair --namespace-pkg cuda -w {dest_dir} {wheel}" +repair-wheel-command = "delvewheel repair --namespace-pkg cuda --exclude nvml.dll -w {dest_dir} {wheel}" diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 992a78e92e..ce6b0e1cd6 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import os +import sys import time import cuda.core.experimental @@ -16,11 +16,45 @@ from cuda_python_test_helpers import IS_WSL +def _get_wddm_hags_error() -> str: + """Probe Event.__sub__ to obtain WDDM/HAGS RuntimeError. + + Any other RuntimeError is propagated. 
+ """ + device = Device() + device.set_current() + options = EventOptions(enable_timing=True) + stream = device.create_stream() + + event1 = stream.record(options=options) + event2 = stream.record(options=options) + event2.sync() + + try: + _ = event2 - event1 + except RuntimeError as exc: + msg = str(exc) + if "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled" in msg: + return msg + raise + return None + + +_WDDM_HAGS_ERROR = _get_wddm_hags_error() +_WDDM_HAGS_PRECONDITION_MSG = "WDDM/HAGS precondition not met" + + def test_event_init_disabled(): with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."): cuda.core.experimental._event.Event() # Ensure back door is locked. +def test_ensure_wddm_with_hags(): + if _WDDM_HAGS_ERROR: + pytest.xfail(_WDDM_HAGS_ERROR) + + +@pytest.mark.skipif(_WDDM_HAGS_ERROR is not None, reason=_WDDM_HAGS_PRECONDITION_MSG) def test_timing_success(init_cuda): options = EventOptions(enable_timing=True) stream = Device().create_stream() @@ -35,7 +69,7 @@ def test_timing_success(init_cuda): # We only want to exercise the __sub__ method, this test is not meant # to stress-test the CUDA driver or time.sleep(). delay_ms = delay_seconds * 1000 - if os.name == "nt" or IS_WSL: # noqa: SIM108 + if sys.platform == "win32" or IS_WSL: # noqa: SIM108 # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default. generous_tolerance = 100 else: @@ -95,6 +129,7 @@ def test_error_timing_disabled(): event2 - event1 +@pytest.mark.skipif(_WDDM_HAGS_ERROR is not None, reason=_WDDM_HAGS_PRECONDITION_MSG) def test_error_timing_recorded(): device = Device() device.set_current() @@ -114,6 +149,7 @@ def test_error_timing_recorded(): event3 - event2 +@pytest.mark.skipif(_WDDM_HAGS_ERROR is not None, reason=_WDDM_HAGS_PRECONDITION_MSG) def test_error_timing_incomplete(): device = Device() device.set_current()