NVIDIA · rwgk · Nov 20, 2025 · Nov 20, 2025 · Nov 20, 2025 · Nov 20, 2025
diff --git a/.gitattributes b/.gitattributes
@@ -9,3 +9,5 @@ cuda/_version.py export-subst
 *.png binary
 # SCM syntax highlighting & preventing 3-way merges
 pixi.lock merge=binary linguist-language=YAML linguist-generated=true
+
+cuda_core/cuda/core/experimental/_utils/*.h text eol=lf
diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml
@@ -17,7 +17,7 @@ inputs:
     description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'"
     required: false
     type: string
-    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile"
+    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile,cuda_nvml_dev"
   cuda-path:
     description: "where the CTK components will be installed to, relative to $PWD"
     required: false

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
@@ -12,6 +12,7 @@
 import os
 import re
 import subprocess
+import sys
 
 from Cython.Build import cythonize
 from setuptools import Extension
@@ -84,12 +85,42 @@ def get_cuda_paths():
         print("CUDA paths:", CUDA_PATH)
         return CUDA_PATH
 
+    common_include_dirs = [
+        *(os.path.join(root, "include") for root in get_cuda_paths()),  # CUDA toolkit headers (e.g. nvml.h)
+        os.path.join("cuda", "core", "experimental", "_utils"),  # hags_status.h, wddm_driver_model_is_in_use.h
+    ]
+
+    def get_sources(mod):
+        """Return the source files for extension ``mod``: its .pyx plus any C helpers."""
+        pyx_source = f"cuda/core/experimental/{mod}.pyx"
+        if mod != "_event":
+            return [pyx_source]
+        return [
+            pyx_source,
+            "cuda/core/experimental/_utils/hags_status.c",
+            "cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c",
+        ]
+
+    def get_libraries(mod):
+        """Return extra link libraries for ``mod``; only _event on Windows needs any."""
+        needs_win_libs = mod == "_event" and sys.platform == "win32"
+        # hags_status.c uses user32 / gdi32; wddm_driver_model_is_in_use.c uses nvml.
+        return ["user32", "gdi32", "nvml"] if needs_win_libs else None
+
+    def get_library_dirs():
+        """Return CUDA ``lib/x64`` dirs (needed for nvml.lib) on Windows, else ``None``."""
+        if sys.platform == "win32":
+            return [os.path.join(cuda_root, "lib", "x64") for cuda_root in get_cuda_paths()]
+        return None
+
     ext_modules = tuple(
         Extension(
             f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}",
-            sources=[f"cuda/core/experimental/{mod}.pyx"],
-            include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()),
+            sources=get_sources(mod),
+            include_dirs=common_include_dirs,
             language="c++",
+            libraries=get_libraries(mod),
+            library_dirs=get_library_dirs(),
         )
         for mod in module_names
     )

diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx
@@ -13,6 +13,62 @@ from cuda.core.experimental._utils.cuda_utils cimport (
     HANDLE_RETURN
 )
 
+import sys
+
+cdef extern from "hags_status.h":
+    int hags_status()  # -1 unavailable, 0 query failed, 1 HAGS not fully enabled, 2 HAGS fully enabled
+
+cdef extern from "wddm_driver_model_is_in_use.h":
+    int wddm_driver_model_is_in_use()  # 1 WDDM in use, 0 no WDDM found, negative if undetermined
+
+# Cached outcome of ensure_wddm_with_hags() (module-level, so one value per process):
+cdef int _ensure_wddm_with_hags_state = 0
+# 0 = unknown / not checked
+# 1 = OK (no restriction or HAGS fully enabled)
+# 2 = misconfigured (should raise)
+
+_WDDM_HAGS_ERROR = (
+    "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled when the "
+    "Windows WDDM driver model is in use in order to obtain reliable CUDA event "
+    "timings. Please enable HAGS in the Windows graphics settings or switch to a "
+    "non-WDDM driver model."
+)
+
+
+def ensure_wddm_with_hags() -> None:
+    """Raise unless HAGS is fully enabled while the Windows WDDM driver model is in use.
+
+    This is a no-op on non-Windows platforms and whenever WDDM is not detected
+    (including when the driver model could not be determined). The probe
+    result is cached per process; later calls reuse the cached outcome, so a
+    cached failure raises again on every call.
+
+    Raises:
+        RuntimeError: If WDDM is in use and HAGS is not reported as fully enabled.
+    """
+    global _ensure_wddm_with_hags_state
+
+    # Work on a local copy of the cached state (0 = not probed, 1 = OK, 2 = raise).
+    cdef int outcome = _ensure_wddm_with_hags_state
+    if outcome != 1 and outcome != 2:
+        # Nothing cached yet: run the probe, stopping at the first check that passes.
+        if sys.platform != "win32":
+            outcome = 1
+        elif wddm_driver_model_is_in_use() != 1:
+            # Either not WDDM or NVML was not able to determine the driver model.
+            outcome = 1
+        elif hags_status() == 2:
+            # HAGS fully enabled.
+            outcome = 1
+        else:
+            # Any other hags_status() result is treated as misconfigured.
+            outcome = 2
+        _ensure_wddm_with_hags_state = outcome
+
+    if outcome == 2:
+        raise RuntimeError(_WDDM_HAGS_ERROR)
+
+
 import cython
 from dataclasses import dataclass
 import multiprocessing
@@ -136,6 +192,8 @@ cdef class Event:
 
     def __sub__(self, other: Event):
         # return self - other (in milliseconds)
+        if not self.is_timing_disabled and not other.is_timing_disabled:
+            ensure_wddm_with_hags()
         cdef float timing
         with nogil:
             err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle)

diff --git a/cuda_core/cuda/core/experimental/_utils/hags_status.c b/cuda_core/cuda/core/experimental/_utils/hags_status.c
@@ -0,0 +1,84 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Note: the registry value queried below may or may not exist, and it is NOT the ground truth:
+// reg query "HKLM\SYSTEM\CurrentControlSet\Control\GraphicsDrivers" /v HwSchMode
+// The HwSchMode registry value is only a user override (force on/off).
+// If absent, Windows uses the driver's WDDM caps defaults.
+// Actual HAGS state comes from D3DKMT_WDDM_2_7_CAPS, not the registry.
+
+// Possibly useful for experimentation:
+// reg delete "HKLM\SYSTEM\CurrentControlSet\Control\GraphicsDrivers" /v HwSchMode /f
+
+#ifdef _MSC_VER
+#include <windows.h>
+#include <d3dkmthk.h>
+#include <d3dkmdt.h>
+#endif
+// Report the HAGS state of the primary display adapter; return codes are listed in hags_status.h.
+int hags_status(void)
+{
+#ifdef _MSC_VER
+    DISPLAY_DEVICEW dd;
+    HDC hdc;
+    int i;
+    BOOL foundPrimary = FALSE;
+    NTSTATUS status;
+    // D3DKMT structures used to open, query, and close the adapter.
+    D3DKMT_OPENADAPTERFROMHDC openData;
+    D3DKMT_QUERYADAPTERINFO   query;
+    D3DKMT_WDDM_2_7_CAPS      caps;
+    D3DKMT_CLOSEADAPTER       closeData;
+    // NOTE(review): only the primary display is inspected; it may not be the CUDA GPU on multi-adapter systems -- confirm.
+    // Find the primary display device; dd keeps its description after the loop.
+    ZeroMemory(&dd, sizeof(dd));
+    dd.cb = sizeof(dd);
+
+    for (i = 0; EnumDisplayDevicesW(NULL, i, &dd, 0); ++i) {
+        if (dd.StateFlags & DISPLAY_DEVICE_PRIMARY_DEVICE) {
+            foundPrimary = TRUE;
+            break;
+        }
+    }
+
+    if (!foundPrimary)
+        return 0;  // no primary display device found
+    // Create a device context for the primary display so its adapter can be opened.
+    hdc = CreateDCW(NULL, dd.DeviceName, NULL, NULL);
+    if (!hdc)
+        return 0;  // could not create the device context
+
+    ZeroMemory(&openData, sizeof(openData));
+    openData.hDc = hdc;
+    status = D3DKMTOpenAdapterFromHdc(&openData);
+    // The DC is only used to open the adapter; release it before checking the result.
+    DeleteDC(hdc);
+
+    if (status != 0)
+        return 0;  // could not open the adapter
+    // Query the adapter's WDDM 2.7 caps, which hold the HwSchSupported / HwSchEnabled fields.
+    ZeroMemory(&caps, sizeof(caps));
+    ZeroMemory(&query, sizeof(query));
+
+    query.hAdapter             = openData.hAdapter;
+    query.Type                 = KMTQAITYPE_WDDM_2_7_CAPS;
+    query.pPrivateDriverData   = &caps;
+    query.PrivateDriverDataSize = sizeof(caps);
+
+    status = D3DKMTQueryAdapterInfo(&query);
+    // Close the adapter before looking at the query result, so it is closed on every path.
+    ZeroMemory(&closeData, sizeof(closeData));
+    closeData.hAdapter = openData.hAdapter;
+    D3DKMTCloseAdapter(&closeData);
+
+    if (status != 0)
+        return 0;  // caps query failed
+
+    if (!caps.HwSchSupported || !caps.HwSchEnabled)
+        return 1;  // HAGS not fully enabled
+
+    return 2;  // HAGS supported and enabled
+#else
+    return -1;  // not compiled with MSVC: HAGS status is unavailable
+#endif
+}
diff --git a/cuda_core/cuda/core/experimental/_utils/hags_status.h b/cuda_core/cuda/core/experimental/_utils/hags_status.h
@@ -0,0 +1,23 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * hags_status
+ *
+ * Return codes:
+ *   -1 : Not available on this platform (not compiled with MSVC on Windows)
+ *    0 : Failure obtaining HwSchSupported/HwSchEnabled
+ *    1 : HwSchSupported == 0 or HwSchEnabled == 0 (HAGS not fully enabled)
+ *    2 : HwSchSupported == 1 and HwSchEnabled == 1 (HAGS fully enabled)
+ */
+int hags_status(void);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
diff --git a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c
@@ -0,0 +1,63 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Query NVML for the Windows WDDM driver model, looping over all GPUs.
+//
+// When not compiled with MSVC (e.g. on non-Windows platforms) this always returns -1 and performs no NVML calls.
+//
+// Example compilation command (Windows/MSVC):
+//     cl /nologo /c wddm_driver_model_is_in_use.c /I"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\include"
+// Needed for linking:
+//     /link /LIBPATH:"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\lib\\x64" nvml.lib
+//
+#include "wddm_driver_model_is_in_use.h"
+
+#ifdef _MSC_VER
+
+#include "nvml.h"  // from NVIDIA GPU Computing Toolkit
+
+static int wddm_driver_model_is_in_use_impl(void)
+{
+    // Returns 1 if any device reports WDDM as its current or pending driver
+    // model, 0 if none does, and -2 if the device count cannot be obtained.
+    unsigned numDevices = 0;
+    if (nvmlDeviceGetCount_v2(&numDevices) != NVML_SUCCESS) {
+        return -2;
+    }
+    for (unsigned idx = 0; idx < numDevices; ++idx) {
+        nvmlDevice_t handle;
+        nvmlDriverModel_t current = 0;
+        nvmlDriverModel_t pending = 0;
+        // Devices whose handle or driver model cannot be queried are skipped.
+        if (nvmlDeviceGetHandleByIndex_v2(idx, &handle) != NVML_SUCCESS ||
+            nvmlDeviceGetDriverModel(handle, &current, &pending) != NVML_SUCCESS) {
+            continue;
+        }
+        if (current == NVML_DRIVER_WDDM || pending == NVML_DRIVER_WDDM) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int wddm_driver_model_is_in_use(void)
+{
+    // Runs the device scan between nvmlInit_v2() and nvmlShutdown().
+    int rc = -1;  // reported when NVML cannot be initialized
+    if (nvmlInit_v2() == NVML_SUCCESS) {
+        rc = wddm_driver_model_is_in_use_impl();
+        nvmlShutdown();
+    }
+    return rc;
+}
+
+#else  // !_MSC_VER
+
+int wddm_driver_model_is_in_use(void)
+{
+    // Stub used when not compiled with MSVC: no NVML call is made, and -1
+    // tells the caller that the driver model could not be determined.
+    return -1;
+}
+
+#endif  // _MSC_VER
diff --git a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h
@@ -0,0 +1,23 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * wddm_driver_model_is_in_use
+ *
+ * Return codes:
+ *   -2 : Failed to get device count from NVML
+ *   -1 : Not available on this platform (not compiled with MSVC on Windows) or NVML initialization failed
+ *    0 : No WDDM driver model found (all devices use TCC or other driver models)
+ *    1 : WDDM driver model is in use (at least one device uses WDDM)
+ */
+int wddm_driver_model_is_in_use(void);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
@@ -86,4 +86,4 @@ archs = "native"
 [tool.cibuildwheel.windows]
 archs = "AMD64"
 before-build = "pip install delvewheel"
-repair-wheel-command = "delvewheel repair --namespace-pkg cuda -w {dest_dir} {wheel}"
+repair-wheel-command = "delvewheel repair --namespace-pkg cuda --exclude nvml.dll -w {dest_dir} {wheel}"