From a6124ef2f7c993d7df762890c911c7b902e9fb47 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 20 Nov 2025 10:58:00 -0800
Subject: [PATCH 01/22] First step adding
 cuda_core/cuda/core/experimental/_utils/hags_status.c

---
 .gitattributes                                |  2 +
 cuda_core/build_hooks.py                      | 18 +++-
 cuda_core/cuda/core/experimental/_event.pyx   | 18 ++++
 .../core/experimental/_utils/hags_status.c    | 84 +++++++++++++++++++
 .../core/experimental/_utils/hags_status.h    | 23 +++++
 cuda_core/tests/test_event.py                 |  7 ++
 6 files changed, 150 insertions(+), 2 deletions(-)
 create mode 100644 cuda_core/cuda/core/experimental/_utils/hags_status.c
 create mode 100644 cuda_core/cuda/core/experimental/_utils/hags_status.h

diff --git a/.gitattributes b/.gitattributes
index cf17ba9d5e..3bff10eeff 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -9,3 +9,5 @@ cuda/_version.py export-subst
 *.png binary
 # SCM syntax highlighting & preventing 3-way merges
 pixi.lock merge=binary linguist-language=YAML linguist-generated=true
+
+cuda_core/cuda/core/experimental/_utils/hags_status.h text eol=lf
diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
index e38f5676df..d42463bcb9 100644
--- a/cuda_core/build_hooks.py
+++ b/cuda_core/build_hooks.py
@@ -84,11 +84,25 @@ def get_cuda_paths():
         print("CUDA paths:", CUDA_PATH)
         return CUDA_PATH
 
+    common_include_dirs = [
+        # CUDA include paths (for driver/runtime headers)
+        *(os.path.join(root, "include") for root in get_cuda_paths()),
+        # Local experimental utils headers (for hags_status.h, etc.)
+        os.path.join("cuda", "core", "experimental", "_utils"),
+    ]
+
+    def get_sources(mod):
+        sources = [f"cuda/core/experimental/{mod}.pyx"]
+        # Add hags_status.c for _event module
+        if mod == "_event":
+            sources.append("cuda/core/experimental/_utils/hags_status.c")
+        return sources
+
     ext_modules = tuple(
         Extension(
             f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}",
-            sources=[f"cuda/core/experimental/{mod}.pyx"],
-            include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()),
+            sources=get_sources(mod),
+            include_dirs=common_include_dirs,
             language="c++",
         )
         for mod in module_names
diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx
index 98a45d0043..3cd94e7ea9 100644
--- a/cuda_core/cuda/core/experimental/_event.pyx
+++ b/cuda_core/cuda/core/experimental/_event.pyx
@@ -13,6 +13,9 @@ from cuda.core.experimental._utils.cuda_utils cimport (
     HANDLE_RETURN
 )
 
+cdef extern from "hags_status.h":
+    int _hags_status_impl "hags_status"()
+
 import cython
 from dataclasses import dataclass
 import multiprocessing
@@ -303,3 +306,18 @@ def _reduce_event(event):
     return event.from_ipc_descriptor, (event.get_ipc_descriptor(),)
 
 multiprocessing.reduction.register(Event, _reduce_event)
+
+cpdef int hags_status():
+    """Check Hardware Accelerated GPU Scheduling (HAGS) status on Windows.
+
+    Returns
+    -------
+    int
+        Status code indicating HAGS state:
+
+        - -1: Not available on this platform (not compiled with MSVC on Windows)
+        - 0: Failure obtaining HwSchSupported/HwSchEnabled
+        - 1: HwSchSupported == 0 or HwSchEnabled == 0 (HAGS not fully enabled)
+        - 2: HwSchSupported == 1 and HwSchEnabled == 1 (HAGS fully enabled)
+    """
+    return _hags_status_impl()
diff --git a/cuda_core/cuda/core/experimental/_utils/hags_status.c b/cuda_core/cuda/core/experimental/_utils/hags_status.c
new file mode 100644
index 0000000000..e034dcf855
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_utils/hags_status.c
@@ -0,0 +1,84 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Note, this may or may not exist, but is NOT the ground truth:
+// reg query "HKLM\SYSTEM\CurrentControlSet\Control\GraphicsDrivers" /v HwSchMode
+// The HwSchMode registry value is only a user override (force on/off).
+// If absent, Windows uses the driver's WDDM caps defaults.
+// Actual HAGS state comes from D3DKMT_WDDM_2_7_CAPS, not the registry.
+
+// Possibly useful for experimentation:
+// reg delete "HKLM\SYSTEM\CurrentControlSet\Control\GraphicsDrivers" /v HwSchMode /f
+
+#ifdef _MSC_VER
+#include <windows.h>
+#include <d3dkmthk.h>
+#include <d3dkmdt.h>
+#endif
+
+int hags_status(void)
+{
+#ifdef _MSC_VER
+    DISPLAY_DEVICEW dd;
+    HDC hdc;
+    int i;
+    BOOL foundPrimary = FALSE;
+    NTSTATUS status;
+
+    D3DKMT_OPENADAPTERFROMHDC openData;
+    D3DKMT_QUERYADAPTERINFO   query;
+    D3DKMT_WDDM_2_7_CAPS      caps;
+    D3DKMT_CLOSEADAPTER       closeData;
+
+    // Find the primary display device
+    ZeroMemory(&dd, sizeof(dd));
+    dd.cb = sizeof(dd);
+
+    for (i = 0; EnumDisplayDevicesW(NULL, i, &dd, 0); ++i) {
+        if (dd.StateFlags & DISPLAY_DEVICE_PRIMARY_DEVICE) {
+            foundPrimary = TRUE;
+            break;
+        }
+    }
+
+    if (!foundPrimary)
+        return 0;
+
+    hdc = CreateDCW(NULL, dd.DeviceName, NULL, NULL);
+    if (!hdc)
+        return 0;
+
+    ZeroMemory(&openData, sizeof(openData));
+    openData.hDc = hdc;
+    status = D3DKMTOpenAdapterFromHdc(&openData);
+
+    DeleteDC(hdc);
+
+    if (status != 0)
+        return 0;
+
+    ZeroMemory(&caps, sizeof(caps));
+    ZeroMemory(&query, sizeof(query));
+
+    query.hAdapter             = openData.hAdapter;
+    query.Type                 = KMTQAITYPE_WDDM_2_7_CAPS;
+    query.pPrivateDriverData   = &caps;
+    query.PrivateDriverDataSize = sizeof(caps);
+
+    status = D3DKMTQueryAdapterInfo(&query);
+
+    ZeroMemory(&closeData, sizeof(closeData));
+    closeData.hAdapter = openData.hAdapter;
+    D3DKMTCloseAdapter(&closeData);
+
+    if (status != 0)
+        return 0;
+
+    if (!caps.HwSchSupported || !caps.HwSchEnabled)
+        return 1;
+
+    return 2;
+#else
+    return -1;
+#endif
+}
diff --git a/cuda_core/cuda/core/experimental/_utils/hags_status.h b/cuda_core/cuda/core/experimental/_utils/hags_status.h
new file mode 100644
index 0000000000..e2194ea769
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_utils/hags_status.h
@@ -0,0 +1,23 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * hags_status
+ *
+ * Return codes:
+ *   -1 : Not available on this platform (not compiled with MSVC on Windows)
+ *    0 : Failure obtaining HwSchSupported/HwSchEnabled
+ *    1 : HwSchSupported == 0 or HwSchEnabled == 0 (HAGS not fully enabled)
+ *    2 : HwSchSupported == 1 and HwSchEnabled == 1 (HAGS fully enabled)
+ */
+int hags_status(void);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 992a78e92e..de0c74a6e3 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -11,17 +11,24 @@
     Event,
     EventOptions,
 )
+from cuda.core.experimental._event import hags_status
 from helpers.latch import LatchKernel
 
 from cuda_python_test_helpers import IS_WSL
 
 
+def inspect_hags_status():
+    stat = hags_status()
+    print(f"\nLOOOK {stat=!r}", flush=True)
+
+
 def test_event_init_disabled():
     with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."):
         cuda.core.experimental._event.Event()  # Ensure back door is locked.
 
 
 def test_timing_success(init_cuda):
+    inspect_hags_status()
     options = EventOptions(enable_timing=True)
     stream = Device().create_stream()
     delay_seconds = 0.5

From 3dbb0fcea6835512178283aa3b941d8912bfd88f Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 20 Nov 2025 12:01:55 -0800
Subject: [PATCH 02/22] cuda_core/build_hooks.py: Extension get_libraries()

---
 cuda_core/build_hooks.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
index d42463bcb9..6e856013bd 100644
--- a/cuda_core/build_hooks.py
+++ b/cuda_core/build_hooks.py
@@ -98,12 +98,18 @@ def get_sources(mod):
             sources.append("cuda/core/experimental/_utils/hags_status.c")
         return sources
 
+    def get_libraries(mod):
+        if os.name == "nt" and mod == "_event":
+            return ["user32", "gdi32"]  # for hags_status.c
+        return None
+
     ext_modules = tuple(
         Extension(
             f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}",
             sources=get_sources(mod),
             include_dirs=common_include_dirs,
             language="c++",
+            libraries=get_libraries(mod),
         )
         for mod in module_names
     )

From 3b19ff6cbe37ab05f1a73eeef32d26fd4207f299 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 20 Nov 2025 12:40:25 -0800
Subject: [PATCH 03/22] pre-commit: exclude soft-linked cuda_python/README.md
 from end-of-file-fixer

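On Linux (including WSL) the file cuda_python/README.md is a real symlink,
whereas Git for Windows checks it out as a plain file containing
"../README.md" (without a trailing LF). When pre-commit runs under WSL, the
end-of-file-fixer hook rewrites this file (it appends the missing trailing
newline), and Git Bash can no longer handle the symlink-emulation file
correctly, resulting in errors on subsequent git operations.

Note: the new pattern is added to the shared `gen_exclude` YAML anchor, so
the file is excluded from the trailing-whitespace hook as well, not only
from end-of-file-fixer.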
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 20ce44c44a..3609d58c21 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -53,7 +53,7 @@ repos:
     - id: check-yaml
     - id: debug-statements
     - id: end-of-file-fixer
-      exclude: &gen_exclude '^(?:cuda_bindings/cuda/bindings/.*\.in?|cuda_bindings/docs/source/module/.*\.rst?)$'
+      exclude: &gen_exclude '^(?:cuda_python/README\.md|cuda_bindings/cuda/bindings/.*\.in?|cuda_bindings/docs/source/module/.*\.rst?)$'
     - id: mixed-line-ending
     - id: trailing-whitespace
       exclude: *gen_exclude

From 2b08492bd23829c78d553bf2c04b057a1d1bac18 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 20 Nov 2025 14:18:50 -0800
Subject: [PATCH 04/22] Add drvmodel.c prototype (with main)

---
 .../cuda/core/experimental/_utils/drvmodel.c  | 99 +++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 cuda_core/cuda/core/experimental/_utils/drvmodel.c

diff --git a/cuda_core/cuda/core/experimental/_utils/drvmodel.c b/cuda_core/cuda/core/experimental/_utils/drvmodel.c
new file mode 100644
index 0000000000..7825157c2d
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_utils/drvmodel.c
@@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// drvmodel.c
+// Query NVML for the Windows driver model (WDDM / WDM(TCC) / MCDM) of each GPU.
+//
+// Build example (MSVC, adjust paths as needed):
+//   cl /nologo /W3 drvmodel.c /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\include" /link /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\lib\x64" nvml.lib
+//
+// On success, prints something like:
+//   GPU 0: NVIDIA RTX A6000
+//     Current driver model: WDDM
+//     Pending driver model: WDDM
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "nvml.h"  // from NVIDIA NVML package / CUDA toolkit
+
+static const char *driverModelToString(nvmlDriverModel_t m)
+{
+    switch (m) {
+    case NVML_DRIVER_WDDM:
+        return "WDDM (display device)";
+    case NVML_DRIVER_WDM:
+        return "WDM (TCC, compute device)";
+#ifdef NVML_DRIVER_MCDM
+    case NVML_DRIVER_MCDM:
+        return "MCDM (Microsoft compute device)";
+#endif
+    default:
+        return "Unknown";
+    }
+}
+
+int main(void)
+{
+    nvmlReturn_t result;
+    unsigned int deviceCount = 0;
+    unsigned int i;
+
+    result = nvmlInit_v2();
+    if (result != NVML_SUCCESS) {
+        fprintf(stderr, "nvmlInit_v2() failed: %s\n", nvmlErrorString(result));
+        return EXIT_FAILURE;
+    }
+
+    result = nvmlDeviceGetCount_v2(&deviceCount);
+    if (result != NVML_SUCCESS) {
+        fprintf(stderr, "nvmlDeviceGetCount_v2() failed: %s\n", nvmlErrorString(result));
+        nvmlShutdown();
+        return EXIT_FAILURE;
+    }
+
+    if (deviceCount == 0) {
+        printf("No NVIDIA GPUs found.\n");
+        nvmlShutdown();
+        return EXIT_SUCCESS;
+    }
+
+    for (i = 0; i < deviceCount; ++i) {
+        nvmlDevice_t device;
+        char name[NVML_DEVICE_NAME_BUFFER_SIZE] = {0};
+        nvmlDriverModel_t currentModel = 0;
+        nvmlDriverModel_t pendingModel = 0;
+
+        result = nvmlDeviceGetHandleByIndex_v2(i, &device);
+        if (result != NVML_SUCCESS) {
+            fprintf(stderr,
+                    "nvmlDeviceGetHandleByIndex_v2(%u) failed: %s\n",
+                    i, nvmlErrorString(result));
+            continue;
+        }
+
+        result = nvmlDeviceGetName(device, name, sizeof(name));
+        if (result != NVML_SUCCESS) {
+            snprintf(name, sizeof(name), "<unknown>");
+        }
+
+        result = nvmlDeviceGetDriverModel(device, &currentModel, &pendingModel);
+        if (result == NVML_ERROR_NOT_SUPPORTED) {
+            printf("GPU %u: %s\n", i, name);
+            printf("  Driver model query not supported (non-Windows or unsupported device).\n");
+            continue;
+        } else if (result != NVML_SUCCESS) {
+            fprintf(stderr,
+                    "nvmlDeviceGetDriverModel(%u) failed: %s\n",
+                    i, nvmlErrorString(result));
+            continue;
+        }
+
+        printf("GPU %u: %s\n", i, name);
+        printf("  Current driver model: %s\n", driverModelToString(currentModel));
+        printf("  Pending driver model: %s\n", driverModelToString(pendingModel));
+    }
+
+    nvmlShutdown();
+    return EXIT_SUCCESS;
+}

From 235c1222b633efaa1872f1e88d1003eb5f3f8a70 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 21 Nov 2025 11:13:35 -0800
Subject: [PATCH 05/22] Add wddm_driver_model_is_in_use.c

---
 cuda_core/build_hooks.py                      | 10 +-
 .../cuda/core/experimental/_utils/drvmodel.c  | 99 -------------------
 .../_utils/wddm_driver_model_is_in_use.c      | 48 +++++++++
 .../_utils/wddm_driver_model_is_in_use.h      | 14 +++
 4 files changed, 68 insertions(+), 103 deletions(-)
 delete mode 100644 cuda_core/cuda/core/experimental/_utils/drvmodel.c
 create mode 100644 cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c
 create mode 100644 cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
index 6e856013bd..67428e7579 100644
--- a/cuda_core/build_hooks.py
+++ b/cuda_core/build_hooks.py
@@ -85,17 +85,19 @@ def get_cuda_paths():
         return CUDA_PATH
 
     common_include_dirs = [
-        # CUDA include paths (for driver/runtime headers)
         *(os.path.join(root, "include") for root in get_cuda_paths()),
-        # Local experimental utils headers (for hags_status.h, etc.)
         os.path.join("cuda", "core", "experimental", "_utils"),
     ]
 
     def get_sources(mod):
         sources = [f"cuda/core/experimental/{mod}.pyx"]
-        # Add hags_status.c for _event module
         if mod == "_event":
-            sources.append("cuda/core/experimental/_utils/hags_status.c")
+            sources.extend(
+                [
+                    "cuda/core/experimental/_utils/hags_status.c",
+                    "cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c",
+                ]
+            )
         return sources
 
     def get_libraries(mod):
diff --git a/cuda_core/cuda/core/experimental/_utils/drvmodel.c b/cuda_core/cuda/core/experimental/_utils/drvmodel.c
deleted file mode 100644
index 7825157c2d..0000000000
--- a/cuda_core/cuda/core/experimental/_utils/drvmodel.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-// drvmodel.c
-// Query NVML for the Windows driver model (WDDM / WDM(TCC) / MCDM) of each GPU.
-//
-// Build example (MSVC, adjust paths as needed):
-//   cl /nologo /W3 drvmodel.c /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\include" /link /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\lib\x64" nvml.lib
-//
-// On success, prints something like:
-//   GPU 0: NVIDIA RTX A6000
-//     Current driver model: WDDM
-//     Pending driver model: WDDM
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "nvml.h"  // from NVIDIA NVML package / CUDA toolkit
-
-static const char *driverModelToString(nvmlDriverModel_t m)
-{
-    switch (m) {
-    case NVML_DRIVER_WDDM:
-        return "WDDM (display device)";
-    case NVML_DRIVER_WDM:
-        return "WDM (TCC, compute device)";
-#ifdef NVML_DRIVER_MCDM
-    case NVML_DRIVER_MCDM:
-        return "MCDM (Microsoft compute device)";
-#endif
-    default:
-        return "Unknown";
-    }
-}
-
-int main(void)
-{
-    nvmlReturn_t result;
-    unsigned int deviceCount = 0;
-    unsigned int i;
-
-    result = nvmlInit_v2();
-    if (result != NVML_SUCCESS) {
-        fprintf(stderr, "nvmlInit_v2() failed: %s\n", nvmlErrorString(result));
-        return EXIT_FAILURE;
-    }
-
-    result = nvmlDeviceGetCount_v2(&deviceCount);
-    if (result != NVML_SUCCESS) {
-        fprintf(stderr, "nvmlDeviceGetCount_v2() failed: %s\n", nvmlErrorString(result));
-        nvmlShutdown();
-        return EXIT_FAILURE;
-    }
-
-    if (deviceCount == 0) {
-        printf("No NVIDIA GPUs found.\n");
-        nvmlShutdown();
-        return EXIT_SUCCESS;
-    }
-
-    for (i = 0; i < deviceCount; ++i) {
-        nvmlDevice_t device;
-        char name[NVML_DEVICE_NAME_BUFFER_SIZE] = {0};
-        nvmlDriverModel_t currentModel = 0;
-        nvmlDriverModel_t pendingModel = 0;
-
-        result = nvmlDeviceGetHandleByIndex_v2(i, &device);
-        if (result != NVML_SUCCESS) {
-            fprintf(stderr,
-                    "nvmlDeviceGetHandleByIndex_v2(%u) failed: %s\n",
-                    i, nvmlErrorString(result));
-            continue;
-        }
-
-        result = nvmlDeviceGetName(device, name, sizeof(name));
-        if (result != NVML_SUCCESS) {
-            snprintf(name, sizeof(name), "<unknown>");
-        }
-
-        result = nvmlDeviceGetDriverModel(device, &currentModel, &pendingModel);
-        if (result == NVML_ERROR_NOT_SUPPORTED) {
-            printf("GPU %u: %s\n", i, name);
-            printf("  Driver model query not supported (non-Windows or unsupported device).\n");
-            continue;
-        } else if (result != NVML_SUCCESS) {
-            fprintf(stderr,
-                    "nvmlDeviceGetDriverModel(%u) failed: %s\n",
-                    i, nvmlErrorString(result));
-            continue;
-        }
-
-        printf("GPU %u: %s\n", i, name);
-        printf("  Current driver model: %s\n", driverModelToString(currentModel));
-        printf("  Pending driver model: %s\n", driverModelToString(pendingModel));
-    }
-
-    nvmlShutdown();
-    return EXIT_SUCCESS;
-}
diff --git a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c
new file mode 100644
index 0000000000..2dc8f768bc
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c
@@ -0,0 +1,48 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Query NVML for the Windows WDDM driver model, looping over all GPUs.
+//
+// Example compilation command:
+//     cl /nologo /c wddm_driver_model_is_in_use.c /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\include"
+// Needed for linking:
+//     /link /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\lib\x64" nvml.lib
+
+#include "wddm_driver_model_is_in_use.h"
+
+#include "nvml.h"  // from NVIDIA GPU Computing Toolkit
+
+static int wddm_driver_model_is_in_use_impl(void)
+{
+    unsigned deviceCount = 0;
+    nvmlReturn_t result = nvmlDeviceGetCount_v2(&deviceCount);
+    if (result != NVML_SUCCESS) {
+        return -2;
+    }
+    for (unsigned i_dev = 0; i_dev < deviceCount; ++i_dev) {
+        nvmlDevice_t device;
+        result = nvmlDeviceGetHandleByIndex_v2(i_dev, &device);
+        if (result == NVML_SUCCESS) {
+            nvmlDriverModel_t currentModel = 0;
+            nvmlDriverModel_t pendingModel = 0;
+            result = nvmlDeviceGetDriverModel(device, &currentModel, &pendingModel);
+            if (result == NVML_SUCCESS) {
+                if (currentModel == NVML_DRIVER_WDDM || pendingModel == NVML_DRIVER_WDDM) {
+                    return 1;
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+int wddm_driver_model_is_in_use(void)
+{
+    nvmlReturn_t result = nvmlInit_v2();
+    if (result != NVML_SUCCESS) {
+        return -1;
+    }
+    int return_code = wddm_driver_model_is_in_use_impl();
+    nvmlShutdown();
+    return return_code;
+}
diff --git a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h
new file mode 100644
index 0000000000..3472b90014
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int wddm_driver_model_is_in_use(void);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif

From 4fa00ee8967c52a2082cd6983e09b0c2fbed2954 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 21 Nov 2025 11:34:05 -0800
Subject: [PATCH 06/22] Update build_hooks.py to link in nvml.lib

---
 cuda_core/build_hooks.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
index 67428e7579..703227d14d 100644
--- a/cuda_core/build_hooks.py
+++ b/cuda_core/build_hooks.py
@@ -102,9 +102,16 @@ def get_sources(mod):
 
     def get_libraries(mod):
         if os.name == "nt" and mod == "_event":
-            return ["user32", "gdi32"]  # for hags_status.c
+            # user32 / gdi32 for hags_status.c, nvml for wddm_driver_model_is_in_use.c
+            return ["user32", "gdi32", "nvml"]
         return None
 
+    def get_library_dirs():
+        if os.name != "nt":
+            return None
+        # wddm_driver_model_is_in_use.c needs nvml.lib
+        return [os.path.join(root, "lib", "x64") for root in get_cuda_paths()]
+
     ext_modules = tuple(
         Extension(
             f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}",
@@ -112,6 +119,7 @@ def get_libraries(mod):
             include_dirs=common_include_dirs,
             language="c++",
             libraries=get_libraries(mod),
+            library_dirs=get_library_dirs(),
         )
         for mod in module_names
     )

From 25060c684d7a45e9c765efec704a0299df766234 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 21 Nov 2025 11:58:37 -0800
Subject: [PATCH 07/22] Call wddm_driver_model_is_in_use from test_event.py

---
 cuda_core/cuda/core/experimental/_event.pyx | 24 ++++++++-------------
 cuda_core/tests/test_event.py               |  8 ++++---
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx
index 3cd94e7ea9..4e0672aaa9 100644
--- a/cuda_core/cuda/core/experimental/_event.pyx
+++ b/cuda_core/cuda/core/experimental/_event.pyx
@@ -16,6 +16,15 @@ from cuda.core.experimental._utils.cuda_utils cimport (
 cdef extern from "hags_status.h":
     int _hags_status_impl "hags_status"()
 
+cpdef int hags_status():
+    return _hags_status_impl()
+
+cdef extern from "wddm_driver_model_is_in_use.h":
+    int _wddm_driver_model_is_in_use_impl "wddm_driver_model_is_in_use"()
+
+cpdef int wddm_driver_model_is_in_use():
+    return _wddm_driver_model_is_in_use_impl()
+
 import cython
 from dataclasses import dataclass
 import multiprocessing
@@ -306,18 +315,3 @@ def _reduce_event(event):
     return event.from_ipc_descriptor, (event.get_ipc_descriptor(),)
 
 multiprocessing.reduction.register(Event, _reduce_event)
-
-cpdef int hags_status():
-    """Check Hardware Accelerated GPU Scheduling (HAGS) status on Windows.
-
-    Returns
-    -------
-    int
-        Status code indicating HAGS state:
-
-        - -1: Not available on this platform (not compiled with MSVC on Windows)
-        - 0: Failure obtaining HwSchSupported/HwSchEnabled
-        - 1: HwSchSupported == 0 or HwSchEnabled == 0 (HAGS not fully enabled)
-        - 2: HwSchSupported == 1 and HwSchEnabled == 1 (HAGS fully enabled)
-    """
-    return _hags_status_impl()
diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index de0c74a6e3..3e714526cf 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -11,15 +11,17 @@
     Event,
     EventOptions,
 )
-from cuda.core.experimental._event import hags_status
+from cuda.core.experimental._event import hags_status, wddm_driver_model_is_in_use
 from helpers.latch import LatchKernel
 
 from cuda_python_test_helpers import IS_WSL
 
 
 def inspect_hags_status():
-    stat = hags_status()
-    print(f"\nLOOOK {stat=!r}", flush=True)
+    hags = hags_status()
+    print(f"\nLOOOK {hags=!r}", flush=True)
+    wddm = wddm_driver_model_is_in_use()
+    print(f"\nLOOOK {wddm=!r}", flush=True)
 
 
 def test_event_init_disabled():

From 7e340af7497de0f2bfcb8aff3d4f7d4d22dc9a07 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 21 Nov 2025 12:12:04 -0800
Subject: [PATCH 08/22] Add
 ensure_hags_is_enabled_if_wddm_driver_model_is_in_use() in _event.pyx

---
 cuda_core/cuda/core/experimental/_event.pyx | 31 +++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx
index 4e0672aaa9..ec44fc08aa 100644
--- a/cuda_core/cuda/core/experimental/_event.pyx
+++ b/cuda_core/cuda/core/experimental/_event.pyx
@@ -25,6 +25,35 @@ cdef extern from "wddm_driver_model_is_in_use.h":
 cpdef int wddm_driver_model_is_in_use():
     return _wddm_driver_model_is_in_use_impl()
 
+
+def ensure_hags_is_enabled_if_wddm_driver_model_is_in_use() -> None:
+    """On Windows with WDDM driver model, require HAGS to be fully enabled.
+
+    If WDDM is not in use, or the platform is non-Windows, this is a no-op.
+    """
+    import sys
+
+    if sys.platform != "win32":
+        return
+
+    wddm_state = wddm_driver_model_is_in_use()
+    if wddm_state != 1:
+        # Either not WDDM or NVML was not able to determine the driver model.
+        return
+
+    hags_state = hags_status()
+    if hags_state == 2:
+        # HAGS fully enabled.
+        return
+
+    raise RuntimeError(
+        "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled when the "
+        "Windows WDDM driver model is in use in order to obtain reliable CUDA event "
+        "timing. Please enable HAGS in the Windows graphics settings or switch to a "
+        "non-WDDM driver model."
+    )
+
+
 import cython
 from dataclasses import dataclass
 import multiprocessing
@@ -148,6 +177,8 @@ cdef class Event:
 
     def __sub__(self, other: Event):
         # return self - other (in milliseconds)
+        if not self.is_timing_disabled and not other.is_timing_disabled:
+            ensure_hags_is_enabled_if_wddm_driver_model_is_in_use()
         cdef float timing
         with nogil:
             err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle)

From 2eebbca5ab5fe82e521de2b8c863404381ac402d Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 21 Nov 2025 13:18:18 -0800
Subject: [PATCH 09/22] Work _xfail_if_hags_runtime_error into
 tests/test_event.py

---
 cuda_core/tests/test_event.py | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 3e714526cf..67bb9055c6 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
+import re
 import time
 
 import cuda.core.experimental
@@ -16,6 +17,8 @@
 
 from cuda_python_test_helpers import IS_WSL
 
+_HAGS_ERROR_SUBSTRING = "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled"
+
 
 def inspect_hags_status():
     hags = hags_status()
@@ -24,6 +27,18 @@ def inspect_hags_status():
     print(f"\nLOOOK {wddm=!r}", flush=True)
 
 
+def _xfail_if_hags_runtime_error(exc: BaseException, expected_regex: str | None = None) -> None:
+    message = str(exc)
+    if _HAGS_ERROR_SUBSTRING in message:
+        pytest.xfail(
+            "HAGS is not fully enabled while the Windows WDDM driver model is in use; "
+            "event timing tests are expected to fail in this configuration."
+        )
+
+    if expected_regex is not None:
+        assert re.match(expected_regex, message), f"Expected regex: {expected_regex!r}\nActual message: {message!r}"
+
+
 def test_event_init_disabled():
     with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."):
         cuda.core.experimental._event.Event()  # Ensure back door is locked.
@@ -38,7 +53,11 @@ def test_timing_success(init_cuda):
     time.sleep(delay_seconds)
     e2 = stream.record(options=options)
     e2.sync()
-    elapsed_time_ms = e2 - e1
+    try:
+        elapsed_time_ms = e2 - e1
+    except RuntimeError as exc:
+        _xfail_if_hags_runtime_error(exc)
+        raise
     assert isinstance(elapsed_time_ms, float)
     # Using a generous tolerance, to avoid flaky tests:
     # We only want to exercise the __sub__ method, this test is not meant
@@ -115,12 +134,17 @@ def test_error_timing_recorded():
     event3 = device.create_event(options=enabled)
 
     stream.sync()
-    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+    with pytest.raises(RuntimeError) as excinfo:
         event2 - event1
-    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+    _xfail_if_hags_runtime_error(excinfo.value, r"^Both Events must be recorded")
+
+    with pytest.raises(RuntimeError) as excinfo:
         event1 - event2
-    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+    _xfail_if_hags_runtime_error(excinfo.value, r"^Both Events must be recorded")
+
+    with pytest.raises(RuntimeError) as excinfo:
         event3 - event2
+    _xfail_if_hags_runtime_error(excinfo.value, r"^Both Events must be recorded")
 
 
 def test_error_timing_incomplete():
@@ -135,8 +159,9 @@ def test_error_timing_incomplete():
     event3 = stream.record(options=enabled)
 
     # event3 will never complete because the latch has not been released
-    with pytest.raises(RuntimeError, match="^One or both events have not completed."):
+    with pytest.raises(RuntimeError) as excinfo:
         event3 - event1
+    _xfail_if_hags_runtime_error(excinfo.value, r"^One or both events have not completed.")
 
     latch.release()
     event3.sync()

From 3c2a1c7df3f3b09cb447986dd8f294084c834644 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 22 Nov 2025 12:55:15 -0800
Subject: [PATCH 10/22] Clean out inspect_hags_status(), add
 _is_hags_timing_usable() helper.

---
 cuda_core/tests/test_event.py | 89 ++++++++++++++++++++++-------------
 1 file changed, 57 insertions(+), 32 deletions(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 67bb9055c6..deffcfa38e 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
-import re
 import time
 
 import cuda.core.experimental
@@ -12,7 +11,6 @@
     Event,
     EventOptions,
 )
-from cuda.core.experimental._event import hags_status, wddm_driver_model_is_in_use
 from helpers.latch import LatchKernel
 
 from cuda_python_test_helpers import IS_WSL
@@ -20,23 +18,32 @@
 _HAGS_ERROR_SUBSTRING = "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled"
 
 
-def inspect_hags_status():
-    hags = hags_status()
-    print(f"\nLOOOK {hags=!r}", flush=True)
-    wddm = wddm_driver_model_is_in_use()
-    print(f"\nLOOOK {wddm=!r}", flush=True)
+def _is_hags_timing_usable() -> bool:
+    """Probe Event.__sub__ to detect HAGS/WDDM timing issues.
 
+    Returns True if timing appears usable, False if we see the known
+    HAGS/WDDM RuntimeError. Any other RuntimeError is propagated.
+    """
+    device = Device()
+    device.set_current()
+    options = EventOptions(enable_timing=True)
+    stream = device.create_stream()
 
-def _xfail_if_hags_runtime_error(exc: BaseException, expected_regex: str | None = None) -> None:
-    message = str(exc)
-    if _HAGS_ERROR_SUBSTRING in message:
-        pytest.xfail(
-            "HAGS is not fully enabled while the Windows WDDM driver model is in use; "
-            "event timing tests are expected to fail in this configuration."
-        )
+    event1 = stream.record(options=options)
+    event2 = stream.record(options=options)
+    event2.sync()
 
-    if expected_regex is not None:
-        assert re.match(expected_regex, message), f"Expected regex: {expected_regex!r}\nActual message: {message!r}"
+    try:
+        _ = event2 - event1
+    except RuntimeError as exc:
+        message = str(exc)
+        if _HAGS_ERROR_SUBSTRING in message:
+            return False
+        raise
+    return True
+
+
+_HAGS_TIMING_USABLE = _is_hags_timing_usable()
 
 
 def test_event_init_disabled():
@@ -44,8 +51,22 @@ def test_event_init_disabled():
         cuda.core.experimental._event.Event()  # Ensure back door is locked.
 
 
+def test_ensure_hags_is_enabled_if_wddm_driver_model_is_in_use():
+    if not _HAGS_TIMING_USABLE:
+        pytest.xfail(
+            "HAGS is not fully enabled while the Windows WDDM driver model is in use; "
+            "event timing tests are expected to fail in this configuration."
+        )
+
+
+@pytest.mark.skipif(
+    not _HAGS_TIMING_USABLE,
+    reason=(
+        "HAGS is not fully enabled while the Windows WDDM driver model is in use; "
+        "event timing tests are expected to fail in this configuration."
+    ),
+)
 def test_timing_success(init_cuda):
-    inspect_hags_status()
     options = EventOptions(enable_timing=True)
     stream = Device().create_stream()
     delay_seconds = 0.5
@@ -53,11 +74,7 @@ def test_timing_success(init_cuda):
     time.sleep(delay_seconds)
     e2 = stream.record(options=options)
     e2.sync()
-    try:
-        elapsed_time_ms = e2 - e1
-    except RuntimeError as exc:
-        _xfail_if_hags_runtime_error(exc)
-        raise
+    elapsed_time_ms = e2 - e1
     assert isinstance(elapsed_time_ms, float)
     # Using a generous tolerance, to avoid flaky tests:
     # We only want to exercise the __sub__ method, this test is not meant
@@ -123,6 +140,13 @@ def test_error_timing_disabled():
         event2 - event1
 
 
+@pytest.mark.skipif(
+    not _HAGS_TIMING_USABLE,
+    reason=(
+        "HAGS is not fully enabled while the Windows WDDM driver model is in use; "
+        "event timing tests are expected to fail in this configuration."
+    ),
+)
 def test_error_timing_recorded():
     device = Device()
     device.set_current()
@@ -134,19 +158,21 @@ def test_error_timing_recorded():
     event3 = device.create_event(options=enabled)
 
     stream.sync()
-    with pytest.raises(RuntimeError) as excinfo:
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
         event2 - event1
-    _xfail_if_hags_runtime_error(excinfo.value, r"^Both Events must be recorded")
-
-    with pytest.raises(RuntimeError) as excinfo:
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
         event1 - event2
-    _xfail_if_hags_runtime_error(excinfo.value, r"^Both Events must be recorded")
-
-    with pytest.raises(RuntimeError) as excinfo:
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
         event3 - event2
-    _xfail_if_hags_runtime_error(excinfo.value, r"^Both Events must be recorded")
 
 
+@pytest.mark.skipif(
+    not _HAGS_TIMING_USABLE,
+    reason=(
+        "HAGS is not fully enabled while the Windows WDDM driver model is in use; "
+        "event timing tests are expected to fail in this configuration."
+    ),
+)
 def test_error_timing_incomplete():
     device = Device()
     device.set_current()
@@ -159,9 +185,8 @@ def test_error_timing_incomplete():
     event3 = stream.record(options=enabled)
 
     # event3 will never complete because the latch has not been released
-    with pytest.raises(RuntimeError) as excinfo:
+    with pytest.raises(RuntimeError, match="^One or both events have not completed."):
         event3 - event1
-    _xfail_if_hags_runtime_error(excinfo.value, r"^One or both events have not completed.")
 
     latch.release()
     event3.sync()

From 066d92845fb616da2c0a63b12464e283b52d5b64 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 22 Nov 2025 13:43:07 -0800
Subject: [PATCH 11/22] Refactor as _get_wddm_hags_error()

---
 cuda_core/tests/test_event.py | 55 ++++++++++-------------------------
 1 file changed, 16 insertions(+), 39 deletions(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index deffcfa38e..b094ae4e8e 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -15,14 +15,11 @@
 
 from cuda_python_test_helpers import IS_WSL
 
-_HAGS_ERROR_SUBSTRING = "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled"
 
+def _get_wddm_hags_error() -> str:
+    """Probe Event.__sub__ to obtain WDDM/HAGS RuntimeError.
 
-def _is_hags_timing_usable() -> bool:
-    """Probe Event.__sub__ to detect HAGS/WDDM timing issues.
-
-    Returns True if timing appears usable, False if we see the known
-    HAGS/WDDM RuntimeError. Any other RuntimeError is propagated.
+    Returns the HAGS error message, or None if no error is raised. Any other RuntimeError is propagated.
     """
     device = Device()
     device.set_current()
@@ -36,14 +33,15 @@ def _is_hags_timing_usable() -> bool:
     try:
         _ = event2 - event1
     except RuntimeError as exc:
-        message = str(exc)
-        if _HAGS_ERROR_SUBSTRING in message:
-            return False
+        msg = str(exc)
+        if "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled" in msg:
+            return msg
         raise
-    return True
+    return None
 
 
-_HAGS_TIMING_USABLE = _is_hags_timing_usable()
+_WDDM_HAGS_ERROR = _get_wddm_hags_error()
+_WDDM_HAGS_PRECONDITION_MSG = "WDDM/HAGS precondition not met"
 
 
 def test_event_init_disabled():
@@ -52,20 +50,11 @@ def test_event_init_disabled():
 
 
 def test_ensure_hags_is_enabled_if_wddm_driver_model_is_in_use():
-    if not _HAGS_TIMING_USABLE:
-        pytest.xfail(
-            "HAGS is not fully enabled while the Windows WDDM driver model is in use; "
-            "event timing tests are expected to fail in this configuration."
-        )
-
-
-@pytest.mark.skipif(
-    not _HAGS_TIMING_USABLE,
-    reason=(
-        "HAGS is not fully enabled while the Windows WDDM driver model is in use; "
-        "event timing tests are expected to fail in this configuration."
-    ),
-)
+    if _WDDM_HAGS_ERROR:
+        pytest.xfail(_WDDM_HAGS_ERROR)
+
+
+@pytest.mark.skipif(_WDDM_HAGS_ERROR is not None, reason=_WDDM_HAGS_PRECONDITION_MSG)
 def test_timing_success(init_cuda):
     options = EventOptions(enable_timing=True)
     stream = Device().create_stream()
@@ -140,13 +129,7 @@ def test_error_timing_disabled():
         event2 - event1
 
 
-@pytest.mark.skipif(
-    not _HAGS_TIMING_USABLE,
-    reason=(
-        "HAGS is not fully enabled while the Windows WDDM driver model is in use; "
-        "event timing tests are expected to fail in this configuration."
-    ),
-)
+@pytest.mark.skipif(_WDDM_HAGS_ERROR is not None, reason=_WDDM_HAGS_PRECONDITION_MSG)
 def test_error_timing_recorded():
     device = Device()
     device.set_current()
@@ -166,13 +149,7 @@ def test_error_timing_recorded():
         event3 - event2
 
 
-@pytest.mark.skipif(
-    not _HAGS_TIMING_USABLE,
-    reason=(
-        "HAGS is not fully enabled while the Windows WDDM driver model is in use; "
-        "event timing tests are expected to fail in this configuration."
-    ),
-)
+@pytest.mark.skipif(_WDDM_HAGS_ERROR is not None, reason=_WDDM_HAGS_PRECONDITION_MSG)
 def test_error_timing_incomplete():
     device = Device()
     device.set_current()

From a80f5531f213ab3eeb9908c4c4b8857071c20904 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 22 Nov 2025 13:48:39 -0800
Subject: [PATCH 12/22] Shorten name: ensure_wddm_with_hags()

---
 cuda_core/cuda/core/experimental/_event.pyx | 4 ++--
 cuda_core/tests/test_event.py               | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx
index ec44fc08aa..89b9ced29d 100644
--- a/cuda_core/cuda/core/experimental/_event.pyx
+++ b/cuda_core/cuda/core/experimental/_event.pyx
@@ -26,7 +26,7 @@ cpdef int wddm_driver_model_is_in_use():
     return _wddm_driver_model_is_in_use_impl()
 
 
-def ensure_hags_is_enabled_if_wddm_driver_model_is_in_use() -> None:
+def ensure_wddm_with_hags() -> None:
     """On Windows with WDDM driver model, require HAGS to be fully enabled.
 
     If WDDM is not in use, or the platform is non-Windows, this is a no-op.
@@ -178,7 +178,7 @@ cdef class Event:
     def __sub__(self, other: Event):
         # return self - other (in milliseconds)
         if not self.is_timing_disabled and not other.is_timing_disabled:
-            ensure_hags_is_enabled_if_wddm_driver_model_is_in_use()
+            ensure_wddm_with_hags()
         cdef float timing
         with nogil:
             err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle)
diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index b094ae4e8e..998a87db03 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -49,7 +49,7 @@ def test_event_init_disabled():
         cuda.core.experimental._event.Event()  # Ensure back door is locked.
 
 
-def test_ensure_hags_is_enabled_if_wddm_driver_model_is_in_use():
+def test_ensure_wddm_with_hags():
     if _WDDM_HAGS_ERROR:
         pytest.xfail(_WDDM_HAGS_ERROR)
 

From 35cccf39d6563068909ca8f63b950165142b5d1a Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 22 Nov 2025 13:56:32 -0800
Subject: [PATCH 13/22] Remove hags_status(), wddm_driver_model_is_in_use()
 Python bindings

---
 cuda_core/cuda/core/experimental/_event.pyx | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx
index 89b9ced29d..c14f8df9eb 100644
--- a/cuda_core/cuda/core/experimental/_event.pyx
+++ b/cuda_core/cuda/core/experimental/_event.pyx
@@ -14,16 +14,10 @@ from cuda.core.experimental._utils.cuda_utils cimport (
 )
 
 cdef extern from "hags_status.h":
-    int _hags_status_impl "hags_status"()
-
-cpdef int hags_status():
-    return _hags_status_impl()
+    int hags_status()
 
 cdef extern from "wddm_driver_model_is_in_use.h":
-    int _wddm_driver_model_is_in_use_impl "wddm_driver_model_is_in_use"()
-
-cpdef int wddm_driver_model_is_in_use():
-    return _wddm_driver_model_is_in_use_impl()
+    int wddm_driver_model_is_in_use()
 
 
 def ensure_wddm_with_hags() -> None:

From 58e157820371e151200e63b9274fa2e1c5e8fb43 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 22 Nov 2025 13:59:22 -0800
Subject: [PATCH 14/22] .gitattributes: _utils/*.h text

---
 .gitattributes | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitattributes b/.gitattributes
index 3bff10eeff..9cb2b279c4 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -10,4 +10,4 @@ cuda/_version.py export-subst
 # SCM syntax highlighting & preventing 3-way merges
 pixi.lock merge=binary linguist-language=YAML linguist-generated=true
 
-cuda_core/cuda/core/experimental/_utils/hags_status.h text eol=lf
+cuda_core/cuda/core/experimental/_utils/*.h text eol=lf

From 8d11440cadbdc3665599feebc348de5389e18fb8 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 22 Nov 2025 14:26:55 -0800
Subject: [PATCH 15/22] Caching: Run wddm_driver_model_is_in_use() and
 hags_status() only once per process.

---
 cuda_core/cuda/core/experimental/_event.pyx | 39 ++++++++++++++++-----
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx
index c14f8df9eb..c3fc35eff4 100644
--- a/cuda_core/cuda/core/experimental/_event.pyx
+++ b/cuda_core/cuda/core/experimental/_event.pyx
@@ -13,6 +13,8 @@ from cuda.core.experimental._utils.cuda_utils cimport (
     HANDLE_RETURN
 )
 
+import sys
+
 cdef extern from "hags_status.h":
     int hags_status()
 
@@ -20,32 +22,51 @@ cdef extern from "wddm_driver_model_is_in_use.h":
     int wddm_driver_model_is_in_use()
 
 
+cdef int _ensure_wddm_with_hags_state = 0
+# 0 = unknown / not checked
+# 1 = OK (no restriction or HAGS fully enabled)
+# 2 = misconfigured (should raise)
+
+_WDDM_HAGS_ERROR = (
+    "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled when the "
+    "Windows WDDM driver model is in use in order to obtain reliable CUDA event "
+    "timing. Please enable HAGS in the Windows graphics settings or switch to a "
+    "non-WDDM driver model."
+)
+
+
 def ensure_wddm_with_hags() -> None:
     """On Windows with WDDM driver model, require HAGS to be fully enabled.
 
     If WDDM is not in use, or the platform is non-Windows, this is a no-op.
+    The result of the driver/HAGS probe is cached per process.
     """
-    import sys
+    global _ensure_wddm_with_hags_state
+
+    cdef int state = _ensure_wddm_with_hags_state
+    if state == 1:
+        return
+    if state == 2:
+        raise RuntimeError(_WDDM_HAGS_ERROR)
 
     if sys.platform != "win32":
+        _ensure_wddm_with_hags_state = 1
         return
 
-    wddm_state = wddm_driver_model_is_in_use()
+    cdef int wddm_state = wddm_driver_model_is_in_use()
     if wddm_state != 1:
         # Either not WDDM or NVML was not able to determine the driver model.
+        _ensure_wddm_with_hags_state = 1
         return
 
-    hags_state = hags_status()
+    cdef int hags_state = hags_status()
     if hags_state == 2:
         # HAGS fully enabled.
+        _ensure_wddm_with_hags_state = 1
         return
 
-    raise RuntimeError(
-        "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled when the "
-        "Windows WDDM driver model is in use in order to obtain reliable CUDA event "
-        "timing. Please enable HAGS in the Windows graphics settings or switch to a "
-        "non-WDDM driver model."
-    )
+    _ensure_wddm_with_hags_state = 2
+    raise RuntimeError(_WDDM_HAGS_ERROR)
 
 
 import cython

From dfd4d135c7866b159f2e0b2998ab6a541e88983c Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 22 Nov 2025 14:55:33 -0800
Subject: [PATCH 16/22] Stub wddm_driver_model_is_in_use on non-Windows

---
 .../_utils/wddm_driver_model_is_in_use.c      | 25 +++++++++++++++----
 cuda_python/README.md                         |  0
 2 files changed, 20 insertions(+), 5 deletions(-)
 mode change 120000 => 100644 cuda_python/README.md

diff --git a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c
index 2dc8f768bc..63197cb5c3 100644
--- a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c
+++ b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.c
@@ -1,15 +1,19 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
-
+//
 // Query NVML for the Windows WDDM driver model, looping over all GPUs.
 //
-// Example compilation command:
-//     cl /nologo /c wddm_driver_model_is_in_use.c /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\include"
+// When not compiled with MSVC (e.g. on non-Windows platforms) this always returns -1 and performs no NVML calls.
+//
+// Example compilation command (Windows/MSVC):
+//     cl /nologo /c wddm_driver_model_is_in_use.c /I"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\include"
 // Needed for linking:
-//     /link /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\lib\x64" nvml.lib
-
+//     /link /LIBPATH:"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\lib\\x64" nvml.lib
+//
 #include "wddm_driver_model_is_in_use.h"
 
+#ifdef _MSC_VER
+
 #include "nvml.h"  // from NVIDIA GPU Computing Toolkit
 
 static int wddm_driver_model_is_in_use_impl(void)
@@ -46,3 +50,14 @@ int wddm_driver_model_is_in_use(void)
     nvmlShutdown();
     return return_code;
 }
+
+#else  // !_MSC_VER
+
+int wddm_driver_model_is_in_use(void)
+{
+    // WDDM is a Windows-only concept and the NVML probe is only built with MSVC;
+    // in every other build we report -1 (driver model could not be determined).
+    return -1;
+}
+
+#endif  // _MSC_VER
diff --git a/cuda_python/README.md b/cuda_python/README.md
deleted file mode 120000
index 32d46ee883..0000000000
--- a/cuda_python/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../README.md
\ No newline at end of file
diff --git a/cuda_python/README.md b/cuda_python/README.md
new file mode 100644
index 0000000000..32d46ee883
--- /dev/null
+++ b/cuda_python/README.md
@@ -0,0 +1 @@
+../README.md
\ No newline at end of file

From a41581056c83a62331862de489790211af229590 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 22 Nov 2025 20:42:47 -0800
Subject: [PATCH 17/22] Add cuda_nvml_dev to cuda-components in
 fetch_ctk/action.yml

---
 .github/actions/fetch_ctk/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml
index 6cec965105..e5f6c18e30 100644
--- a/.github/actions/fetch_ctk/action.yml
+++ b/.github/actions/fetch_ctk/action.yml
@@ -17,7 +17,7 @@ inputs:
     description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'"
     required: false
     type: string
-    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile"
+    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile,cuda_nvml_dev"
   cuda-path:
     description: "where the CTK components will be installed to, relative to $PWD"
     required: false

From 7c2aa1ae71af84a93fe86606bc745466e0449989 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 22 Nov 2025 21:22:03 -0800
Subject: [PATCH 18/22] Exclude nvml.dll from delvewheel repair

nvml.dll is not part of the CTK but is installed with the CUDA driver.
Adding --exclude nvml.dll to the delvewheel repair command prevents
delvewheel from searching for this DLL during wheel repair, since it
will be available system-wide at runtime.
---
 cuda_core/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index a920005f21..5ea98fc85e 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -86,4 +86,4 @@ archs = "native"
 [tool.cibuildwheel.windows]
 archs = "AMD64"
 before-build = "pip install delvewheel"
-repair-wheel-command = "delvewheel repair --namespace-pkg cuda -w {dest_dir} {wheel}"
+repair-wheel-command = "delvewheel repair --namespace-pkg cuda --exclude nvml.dll -w {dest_dir} {wheel}"

From 85a1550ee989d446d72ca34c99f45311934cc3e6 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 22 Nov 2025 22:56:48 -0800
Subject: [PATCH 19/22] Add documentation comment to
 wddm_driver_model_is_in_use.h

Add a comment block documenting the return codes, matching the style
of hags_status.h. Also converts the file from binary to text mode
per .gitattributes rules.
---
 .../experimental/_utils/wddm_driver_model_is_in_use.h    | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h
index 3472b90014..15ddca24e4 100644
--- a/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h
+++ b/cuda_core/cuda/core/experimental/_utils/wddm_driver_model_is_in_use.h
@@ -7,6 +7,15 @@
 extern "C" {
 #endif
 
+/*
+ * wddm_driver_model_is_in_use
+ *
+ * Return codes:
+ *   -2 : Failed to get device count from NVML
+ *   -1 : Not available on this platform (not compiled with MSVC on Windows) or NVML initialization failed
+ *    0 : No WDDM driver model found (all devices use TCC or other driver models)
+ *    1 : WDDM driver model is in use (at least one device uses WDDM)
+ */
 int wddm_driver_model_is_in_use(void);
 
 #ifdef __cplusplus

From 63ee3f386d76b24a593ba0cdac9fd884bf79ec45 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 22 Nov 2025 23:02:09 -0800
Subject: [PATCH 20/22] Change 'event timing' to 'event timings' in WDDM HAGS
 error message

Use plural 'timings' to better match the verb 'obtain', which pairs
more naturally with concrete measurements/results rather than an
abstract concept.
---
 cuda_core/cuda/core/experimental/_event.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx
index c3fc35eff4..b50f1e0173 100644
--- a/cuda_core/cuda/core/experimental/_event.pyx
+++ b/cuda_core/cuda/core/experimental/_event.pyx
@@ -30,7 +30,7 @@ cdef int _ensure_wddm_with_hags_state = 0
 _WDDM_HAGS_ERROR = (
     "Hardware Accelerated GPU Scheduling (HAGS) must be fully enabled when the "
     "Windows WDDM driver model is in use in order to obtain reliable CUDA event "
-    "timing. Please enable HAGS in the Windows graphics settings or switch to a "
+    "timings. Please enable HAGS in the Windows graphics settings or switch to a "
     "non-WDDM driver model."
 )
 

From 43d7c8ff3f9d228091baa72f289661875722ef2b Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 22 Nov 2025 23:09:18 -0800
Subject: [PATCH 21/22] Restore cuda_python/README.md as symlink

Restore the symlink that was accidentally converted to a regular file,
likely due to Windows/WSL2 interaction. This restores the file mode
from 100644 (regular file) back to 120000 (symlink) to match main.
---
 cuda_python/README.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 120000 cuda_python/README.md

diff --git a/cuda_python/README.md b/cuda_python/README.md
deleted file mode 100644
index 32d46ee883..0000000000
--- a/cuda_python/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../README.md
\ No newline at end of file
diff --git a/cuda_python/README.md b/cuda_python/README.md
new file mode 120000
index 0000000000..32d46ee883
--- /dev/null
+++ b/cuda_python/README.md
@@ -0,0 +1 @@
+../README.md
\ No newline at end of file

From d6d7b578d764899dd3e5555618fb9fa644851670 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Sat, 22 Nov 2025 23:16:31 -0800
Subject: [PATCH 22/22] Standardize platform detection to use sys.platform ==
 'win32'

Replace os.name == 'nt' with sys.platform == 'win32' in:
- cuda_core/build_hooks.py (2 occurrences)
- cuda_core/tests/test_event.py (1 occurrence)

This matches the pattern used throughout the rest of the codebase
for consistency.
---
 cuda_core/build_hooks.py      | 5 +++--
 cuda_core/tests/test_event.py | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
index 703227d14d..ce94ab1067 100644
--- a/cuda_core/build_hooks.py
+++ b/cuda_core/build_hooks.py
@@ -12,6 +12,7 @@
 import os
 import re
 import subprocess
+import sys
 
 from Cython.Build import cythonize
 from setuptools import Extension
@@ -101,13 +102,13 @@ def get_sources(mod):
         return sources
 
     def get_libraries(mod):
-        if os.name == "nt" and mod == "_event":
+        if sys.platform == "win32" and mod == "_event":
             # user32 / gdi32 for hags_status.c, nvml for wddm_driver_model_is_in_use.c
             return ["user32", "gdi32", "nvml"]
         return None
 
     def get_library_dirs():
-        if os.name != "nt":
+        if sys.platform != "win32":
             return None
         # wddm_driver_model_is_in_use.c needs nvml.lib
         return [os.path.join(root, "lib", "x64") for root in get_cuda_paths()]
diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 998a87db03..ce6b0e1cd6 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-import os
+import sys
 import time
 
 import cuda.core.experimental
@@ -69,7 +69,7 @@ def test_timing_success(init_cuda):
     # We only want to exercise the __sub__ method, this test is not meant
     # to stress-test the CUDA driver or time.sleep().
     delay_ms = delay_seconds * 1000
-    if os.name == "nt" or IS_WSL:  # noqa: SIM108
+    if sys.platform == "win32" or IS_WSL:  # noqa: SIM108
         # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
         generous_tolerance = 100
     else: