Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
a6124ef
First step adding cuda_core/cuda/core/experimental/_utils/hags_status.c
rwgk Nov 20, 2025
3dbb0fc
cuda_core/build_hooks.py: Extension get_libraries()
rwgk Nov 20, 2025
3b19ff6
pre-commit: exclude soft-linked cuda_python/README.md from end-of-fil…
rwgk Nov 20, 2025
d79fb29
Merge branch 'main' into cuda_core_hags
rwgk Nov 20, 2025
2b08492
Add drvmodel.c prototype (with main)
rwgk Nov 20, 2025
235c122
Add wddm_driver_model_is_in_use.c
rwgk Nov 21, 2025
4fa00ee
Update build_hooks.py to link in nvml.lib
rwgk Nov 21, 2025
25060c6
Call wddm_driver_model_is_in_use from test_event.py
rwgk Nov 21, 2025
7e340af
Add ensure_hags_is_enabled_if_wddm_driver_model_is_in_use() in _event…
rwgk Nov 21, 2025
2eebbca
Work _xfail_if_hags_runtime_error into tests/test_event.py
rwgk Nov 21, 2025
3c2a1c7
Clean out inspect_hags_status(), add _is_hags_timing_usable() helper.
rwgk Nov 22, 2025
066d928
Refactor as _get_wddm_hags_error()
rwgk Nov 22, 2025
a80f553
Shorten name: ensure_wddm_with_hags()
rwgk Nov 22, 2025
35cccf3
Remove hags_status(), wddm_driver_model_is_in_use() Python bindings
rwgk Nov 22, 2025
58e1578
.gitattributes: _utils/*.h text
rwgk Nov 22, 2025
8d11440
Caching: Run wddm_driver_model_is_in_use() and hags_status() only onc…
rwgk Nov 22, 2025
dfd4d13
Stub wddm_driver_model_is_in_use on non-Windows
rwgk Nov 22, 2025
a415810
Add cuda_nvml_dev to cuda-components in fetch_ctk/action.yml
rwgk Nov 23, 2025
7c2aa1a
Exclude nvml.dll from delvewheel repair
rwgk Nov 23, 2025
85a1550
Add documentation comment to wddm_driver_model_is_in_use.h
rwgk Nov 23, 2025
63ee3f3
Change 'event timing' to 'event timings' in WDDM HAGS error message
rwgk Nov 23, 2025
43d7c8f
Restore cuda_python/README.md as symlink
rwgk Nov 23, 2025
d6d7b57
Standardize platform detection to use sys.platform == 'win32'
rwgk Nov 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ cuda/_version.py export-subst
*.png binary
# SCM syntax highlighting & preventing 3-way merges
pixi.lock merge=binary linguist-language=YAML linguist-generated=true

cuda_core/cuda/core/experimental/_utils/*.h text eol=lf
2 changes: 1 addition & 1 deletion .github/actions/fetch_ctk/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ inputs:
description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'"
required: false
type: string
default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile"
default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile,cuda_nvml_dev"
cuda-path:
description: "where the CTK components will be installed to, relative to $PWD"
required: false
Expand Down
35 changes: 33 additions & 2 deletions cuda_core/build_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import os
import re
import subprocess
import sys

from Cython.Build import cythonize
from setuptools import Extension
Expand Down Expand Up @@ -84,12 +85,42 @@ def get_cuda_paths():
print("CUDA paths:", CUDA_PATH)
return CUDA_PATH

# Header search path shared by every extension: the "include" directory of
# each CUDA root, followed by the in-tree _utils directory that holds the
# C helper headers.
common_include_dirs = [os.path.join(cuda_root, "include") for cuda_root in get_cuda_paths()]
common_include_dirs.append(os.path.join("cuda", "core", "experimental", "_utils"))

def get_sources(mod):
    """Return the list of source files to compile for extension module ``mod``.

    Every module is built from its own ``.pyx`` file. The ``_event`` module
    additionally compiles the two C helpers from ``_utils``.
    """
    pyx_source = f"cuda/core/experimental/{mod}.pyx"
    if mod != "_event":
        return [pyx_source]
    return [
        pyx_source,
        "cuda/core/experimental/_utils/hags_status.c",
        "cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c",
    ]

def get_libraries(mod):
    """Return the extra link libraries for extension module ``mod``, or None.

    Only the ``_event`` module on Windows links anything extra:
    user32 / gdi32 for hags_status.c and nvml for wddm_driver_model_is_in_use.c.
    """
    needs_windows_libs = mod == "_event" and sys.platform == "win32"
    if not needs_windows_libs:
        return None
    return ["user32", "gdi32", "nvml"]

def get_library_dirs():
    """Return the library search directories for linking, or None off Windows.

    On Windows this is ``<root>/lib/x64`` for every CUDA root, which is where
    the nvml.lib needed by wddm_driver_model_is_in_use.c is looked up.
    """
    if sys.platform == "win32":
        return [os.path.join(cuda_root, "lib", "x64") for cuda_root in get_cuda_paths()]
    return None

ext_modules = tuple(
Extension(
f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}",
sources=[f"cuda/core/experimental/{mod}.pyx"],
include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()),
sources=get_sources(mod),
include_dirs=common_include_dirs,
language="c++",
libraries=get_libraries(mod),
library_dirs=get_library_dirs(),
)
for mod in module_names
)
Expand Down
58 changes: 58 additions & 0 deletions cuda_core/cuda/core/experimental/_event.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,62 @@ from cuda.core.experimental._utils.cuda_utils cimport (
HANDLE_RETURN
)

import sys

cdef extern from "hags_status.h":
int hags_status()

cdef extern from "wddm_driver_model_is_in_use.h":
int wddm_driver_model_is_in_use()


# Module-level cache of the WDDM/HAGS probe outcome. ensure_wddm_with_hags()
# reads it first and only runs the native probes while it is still 0.
cdef int _ensure_wddm_with_hags_state = 0
# 0 = unknown / not checked
# 1 = OK (no restriction or HAGS fully enabled)
# 2 = misconfigured (should raise)

# Message of the RuntimeError raised by ensure_wddm_with_hags() when the
# cached state is (or becomes) 2.
_WDDM_HAGS_ERROR = (
"Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled when the "
"Windows WDDM driver model is in use in order to obtain reliable CUDA event "
"timings. Please enable HAGS in the Windows graphics settings or switch to a "
"non-WDDM driver model."
)


def ensure_wddm_with_hags() -> None:
    """On Windows with the WDDM driver model, require HAGS to be fully enabled.

    This is a no-op when the platform is not Windows, when WDDM is not in use,
    or when either native probe could not determine the configuration. The
    error is raised only when the HAGS probe positively reports that HAGS is
    not fully enabled.

    The outcome of the driver/HAGS probe is cached per process, so the native
    calls run at most once.

    Raises:
        RuntimeError: if a WDDM driver model is in use and HAGS is reported
            as not fully enabled.
    """
    global _ensure_wddm_with_hags_state

    cdef int wddm_state
    cdef int hags_state

    # Fast path: reuse the cached verdict from an earlier call.
    if _ensure_wddm_with_hags_state == 1:
        return
    if _ensure_wddm_with_hags_state == 2:
        raise RuntimeError(_WDDM_HAGS_ERROR)

    if sys.platform != "win32":
        _ensure_wddm_with_hags_state = 1
        return

    wddm_state = wddm_driver_model_is_in_use()
    if wddm_state != 1:
        # Either not WDDM or NVML was not able to determine the driver model.
        _ensure_wddm_with_hags_state = 1
        return

    hags_state = hags_status()
    if hags_state != 1:
        # 2 means HAGS is fully enabled. 0 / -1 mean the probe itself failed or
        # is unavailable; as with an undetermined driver model above, an
        # unknown result must not be reported as "HAGS is disabled".
        _ensure_wddm_with_hags_state = 1
        return

    # hags_state == 1: HAGS is positively reported as not fully enabled.
    _ensure_wddm_with_hags_state = 2
    raise RuntimeError(_WDDM_HAGS_ERROR)


import cython
from dataclasses import dataclass
import multiprocessing
Expand Down Expand Up @@ -136,6 +192,8 @@ cdef class Event:

def __sub__(self, other: Event):
# return self - other (in milliseconds)
if not self.is_timing_disabled and not other.is_timing_disabled:
ensure_wddm_with_hags()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will throw if hardware accelerated scheduling isn't enabled, no?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Responding to this question in isolation:

Yes, it'll show this message:

    "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled when the "
    "Windows WDDM driver model is in use in order to obtain reliable CUDA event "
    "timings. Please enable HAGS in the Windows graphics settings or switch to a "
    "non-WDDM driver model."

cdef float timing
with nogil:
err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle)
Expand Down
84 changes: 84 additions & 0 deletions cuda_core/cuda/core/experimental/_utils/hags_status.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

// Note, this may or may not exist, but is NOT the ground truth:
// reg query "HKLM\SYSTEM\CurrentControlSet\Control\GraphicsDrivers" /v HwSchMode
// The HwSchMode registry value is only a user override (force on/off).
// If absent, Windows uses the driver's WDDM caps defaults.
// Actual HAGS state comes from D3DKMT_WDDM_2_7_CAPS, not the registry.

// Possibly useful for experimentation:
// reg delete "HKLM\SYSTEM\CurrentControlSet\Control\GraphicsDrivers" /v HwSchMode /f

#ifdef _MSC_VER
#include <windows.h>
#include <d3dkmthk.h>
#include <d3dkmdt.h>
#endif

/*
 * Report whether Hardware Accelerated GPU Scheduling (HAGS) is enabled for
 * the adapter that drives the primary display.
 *
 * Returns:
 *   -1  built without MSVC (_MSC_VER undefined); no query is attempted
 *    0  a lookup step failed (no primary display found, CreateDCW,
 *       D3DKMTOpenAdapterFromHdc or D3DKMTQueryAdapterInfo)
 *    1  HwSchSupported or HwSchEnabled is 0
 *    2  HwSchSupported and HwSchEnabled are both non-zero
 *
 * NOTE(review): only the adapter of the primary display is inspected; whether
 * that is the GPU actually used for CUDA work is not checked here.
 */
int hags_status(void)
{
#ifdef _MSC_VER
DISPLAY_DEVICEW dd;
HDC hdc;
int i;
BOOL foundPrimary = FALSE;
NTSTATUS status;

D3DKMT_OPENADAPTERFROMHDC openData;
D3DKMT_QUERYADAPTERINFO query;
D3DKMT_WDDM_2_7_CAPS caps;
D3DKMT_CLOSEADAPTER closeData;

// Find the primary display device
ZeroMemory(&dd, sizeof(dd));
dd.cb = sizeof(dd);

for (i = 0; EnumDisplayDevicesW(NULL, i, &dd, 0); ++i) {
if (dd.StateFlags & DISPLAY_DEVICE_PRIMARY_DEVICE) {
foundPrimary = TRUE;
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about cases where a) CUDA compute is being done on the non-primary GPU (i.e. you've got a separate GPU with no display that you use for compute), or b) laptops with the integrated GPUs as default (that are non-NVIDIA), where the discrete GPU is the NVIDIA one.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That granularity is indeed missing in this PR, because I didn't want to add the complexity to query based on the actual device active when the cuEventElapsedTime() time call is reached.

How likely are those cases? Could it cause unwanted side-effects if HAGS is enabled?

break;
}
}

if (!foundPrimary)
return 0;

// Create a device context for the primary display so that its adapter can
// be opened from the HDC.
hdc = CreateDCW(NULL, dd.DeviceName, NULL, NULL);
if (!hdc)
return 0;

ZeroMemory(&openData, sizeof(openData));
openData.hDc = hdc;
status = D3DKMTOpenAdapterFromHdc(&openData);

// The DC is only needed for the call above; release it before checking
// status so it is freed on both the success and the failure path.
DeleteDC(hdc);

if (status != 0)
return 0;

ZeroMemory(&caps, sizeof(caps));
ZeroMemory(&query, sizeof(query));

// Request the WDDM 2.7 caps, which carry the HwSchSupported / HwSchEnabled
// fields checked below.
query.hAdapter = openData.hAdapter;
query.Type = KMTQAITYPE_WDDM_2_7_CAPS;
query.pPrivateDriverData = &caps;
query.PrivateDriverDataSize = sizeof(caps);

status = D3DKMTQueryAdapterInfo(&query);

// Close the adapter before inspecting status so the handle is released
// whether or not the query succeeded.
ZeroMemory(&closeData, sizeof(closeData));
closeData.hAdapter = openData.hAdapter;
D3DKMTCloseAdapter(&closeData);

if (status != 0)
return 0;

// Both fields must be set for HAGS to count as fully enabled.
if (!caps.HwSchSupported || !caps.HwSchEnabled)
return 1;

return 2;
#else
return -1;
#endif
}
23 changes: 23 additions & 0 deletions cuda_core/cuda/core/experimental/_utils/hags_status.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#ifdef __cplusplus
extern "C" {
#endif

/*
* hags_status
*
* Return codes:
* -1 : Not available on this platform (not compiled with MSVC on Windows)
* 0 : Failure obtaining HwSchSupported/HwSchEnabled
* 1 : HwSchSupported == 0 or HwSchEnabled == 0 (HAGS not fully enabled)
* 2 : HwSchSupported == 1 and HwSchEnabled == 1 (HAGS fully enabled)
*/
int hags_status(void);

#ifdef __cplusplus
} /* extern "C" */
#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Query NVML for the Windows WDDM driver model, looping over all GPUs.
//
// On non-Windows platforms this always returns -1 and performs no NVML calls.
//
// Example compilation command (Windows/MSVC):
// cl /nologo /c wddm_driver_model_is_in_use.c /I"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\include"
// Needed for linking:
// /link /LIBPATH:"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\lib\\x64" nvml.lib
//
#include "wddm_driver_model_is_in_use.h"

#ifdef _MSC_VER

#include "nvml.h" // from NVIDIA GPU Computing Toolkit

/*
 * Scan every device NVML enumerates and report whether any of them has WDDM
 * as its current or pending driver model. The caller initializes NVML before
 * calling this and shuts it down afterwards.
 *
 * Returns -2 if the device count cannot be obtained, 1 if a WDDM device is
 * found, 0 otherwise. Devices whose handle or driver model cannot be queried
 * are skipped.
 */
static int wddm_driver_model_is_in_use_impl(void)
{
    unsigned num_devices = 0;
    if (nvmlDeviceGetCount_v2(&num_devices) != NVML_SUCCESS) {
        return -2;
    }
    for (unsigned idx = 0; idx < num_devices; ++idx) {
        nvmlDevice_t handle;
        nvmlDriverModel_t current = 0;
        nvmlDriverModel_t pending = 0;
        if (nvmlDeviceGetHandleByIndex_v2(idx, &handle) != NVML_SUCCESS) {
            continue;
        }
        if (nvmlDeviceGetDriverModel(handle, &current, &pending) != NVML_SUCCESS) {
            continue;
        }
        if (current == NVML_DRIVER_WDDM || pending == NVML_DRIVER_WDDM) {
            return 1;
        }
    }
    return 0;
}

/*
 * Initialize NVML, run the per-device WDDM scan, and shut NVML down again.
 * Returns -1 when NVML cannot be initialized; otherwise the scan result.
 */
int wddm_driver_model_is_in_use(void)
{
    int outcome = -1;
    if (nvmlInit_v2() == NVML_SUCCESS) {
        outcome = wddm_driver_model_is_in_use_impl();
        nvmlShutdown();
    }
    return outcome;
}

#else // !_MSC_VER

/*
 * Stub used when not building with MSVC: WDDM only exists on Windows, so the
 * driver model is reported as undeterminable (-1) without touching NVML.
 */
int wddm_driver_model_is_in_use(void)
{
    const int driver_model_unknown = -1;
    return driver_model_unknown;
}

#endif // _MSC_VER
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#ifdef __cplusplus
extern "C" {
#endif

/*
* wddm_driver_model_is_in_use
*
* Return codes:
* -2 : Failed to get device count from NVML
* -1 : Not available on this platform (not compiled with MSVC on Windows) or NVML initialization failed
* 0 : No WDDM driver model found (all devices use TCC or other driver models)
* 1 : WDDM driver model is in use (at least one device uses WDDM)
*/
int wddm_driver_model_is_in_use(void);

#ifdef __cplusplus
} /* extern "C" */
#endif
2 changes: 1 addition & 1 deletion cuda_core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,4 @@ archs = "native"
[tool.cibuildwheel.windows]
archs = "AMD64"
before-build = "pip install delvewheel"
repair-wheel-command = "delvewheel repair --namespace-pkg cuda -w {dest_dir} {wheel}"
repair-wheel-command = "delvewheel repair --namespace-pkg cuda --exclude nvml.dll -w {dest_dir} {wheel}"
Loading
Loading