From 9b11ffae4343aacfbe538c672cd897d3ee70edb0 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Mon, 24 Nov 2025 13:54:24 -0800
Subject: [PATCH 01/13] Replace timing-based event test with deterministic
 elapsed-time check

The previous test attempted to measure a real sleep delay between two
event records, which introduced flakiness (especially on Windows/WDDM)
and tested OS/driver timing behavior rather than the __sub__ implementation
itself.

This change replaces the test with a minimal, deterministic version that:

  * records two back-to-back events on the same stream
  * synchronizes on the second event to ensure both timestamps are valid
  * asserts that cuEventElapsedTime returns a finite, non-negative float

This exercises the success path of Event.__sub__ without depending on
actual GPU/OS timing characteristics, or requiring artificial GPU work.
---
 cuda_core/tests/test_event.py | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 992a78e92e..f5bf7cb0cb 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -1,8 +1,8 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-import os
-import time
+
+import math
 
 import cuda.core.experimental
 import pytest
@@ -13,35 +13,30 @@
 )
 from helpers.latch import LatchKernel
 
-from cuda_python_test_helpers import IS_WSL
-
 
 def test_event_init_disabled():
     with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."):
         cuda.core.experimental._event.Event()  # Ensure back door is locked.
 
 
-def test_timing_success(init_cuda):
+def test_event_elapsed_time_basic(init_cuda):
     options = EventOptions(enable_timing=True)
     stream = Device().create_stream()
-    delay_seconds = 0.5
     e1 = stream.record(options=options)
-    time.sleep(delay_seconds)
     e2 = stream.record(options=options)
     e2.sync()
-    elapsed_time_ms = e2 - e1
-    assert isinstance(elapsed_time_ms, float)
-    # Using a generous tolerance, to avoid flaky tests:
-    # We only want to exercise the __sub__ method, this test is not meant
-    # to stress-test the CUDA driver or time.sleep().
-    delay_ms = delay_seconds * 1000
-    if os.name == "nt" or IS_WSL:  # noqa: SIM108
-        # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
-        generous_tolerance = 100
-    else:
-        # Most modern Linux kernels have a default timer resolution of 1 ms.
-        generous_tolerance = 20
-    assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
+    delta_ms = e2 - e1
+    assert isinstance(delta_ms, float)
+    # Sanity check: cuEventElapsedTime should always return a finite float for two completed
+    # events. This guards against unexpected driver/HW anomalies (e.g. NaN or inf) or general
+    # undefined behavior, without asserting anything about the magnitude of the measured time.
+    assert math.isfinite(delta_ms)
+    # cudaEventElapsedTime() has a documented resolution of ~0.5 microseconds. With two back-to-back
+    # event records on the same stream, their timestamps may fall into the same tick, producing
+    # an elapsed time of exactly 0.0 ms. The only reliable way to force delta_ms > 0 would be to
+    # insert real GPU work (e.g. a __nanosleep kernel), which is more complexity than this test
+    # should depend on. For correctness we only assert that the elapsed time is non-negative.
+    assert delta_ms >= 0
 
 
 def test_is_sync_busy_waited(init_cuda):

From 605f1ef4489f417f2cad330edda7b9f26c055d61 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Mon, 24 Nov 2025 15:08:44 -0800
Subject: [PATCH 02/13] cuda_core/tests/helpers/__init__.py: also use CUDA_HOME

---
 cuda_core/tests/helpers/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/tests/helpers/__init__.py b/cuda_core/tests/helpers/__init__.py
index 02cbe6d8e9..82b07dc4ce 100644
--- a/cuda_core/tests/helpers/__init__.py
+++ b/cuda_core/tests/helpers/__init__.py
@@ -5,7 +5,7 @@
 import pathlib
 import sys
 
-CUDA_PATH = os.environ.get("CUDA_PATH")
+CUDA_PATH = os.environ.get("CUDA_HOME", os.environ.get("CUDA_PATH"))
 CUDA_INCLUDE_PATH = None
 CCCL_INCLUDE_PATHS = None
 if CUDA_PATH is not None:

From 29f788287a66e4208300503ff99d6185648ba999 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Mon, 24 Nov 2025 17:39:00 -0800
Subject: [PATCH 03/13] Revert "cuda_core/tests/helpers/__init__.py: also use
 CUDA_HOME"

This reverts commit 605f1ef4489f417f2cad330edda7b9f26c055d61.
---
 cuda_core/tests/helpers/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/tests/helpers/__init__.py b/cuda_core/tests/helpers/__init__.py
index 82b07dc4ce..02cbe6d8e9 100644
--- a/cuda_core/tests/helpers/__init__.py
+++ b/cuda_core/tests/helpers/__init__.py
@@ -5,7 +5,7 @@
 import pathlib
 import sys
 
-CUDA_PATH = os.environ.get("CUDA_HOME", os.environ.get("CUDA_PATH"))
+CUDA_PATH = os.environ.get("CUDA_PATH")
 CUDA_INCLUDE_PATH = None
 CCCL_INCLUDE_PATHS = None
 if CUDA_PATH is not None:

From 7d5ee2bc2ceeebf65dd9e0b123f3f0fced3bffd8 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Mon, 24 Nov 2025 22:10:44 -0800
Subject: [PATCH 04/13] Use nanosleep kernel in test_event_elapsed_time_basic
 for deterministic timing

Replace the back-to-back event record test with a version that uses a
__nanosleep kernel between events. This ensures a guaranteed positive
elapsed time (delta_ms > 10) without depending on OS/driver timing
characteristics or requiring artificial GPU work beyond the minimal
nanosleep delay.

The kernel sleeps for 20ms (double the assertion threshold of 10ms),
providing a large safety margin above the ~0.5 microsecond resolution
of cudaEventElapsedTime, making this test deterministic and non-flaky
across platforms including Windows/WDDM.
---
 cuda_core/tests/test_event.py | 36 ++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index f5bf7cb0cb..bc9e7908d7 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -10,6 +10,10 @@
     Device,
     Event,
     EventOptions,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    launch,
 )
 from helpers.latch import LatchKernel
 
@@ -20,9 +24,28 @@ def test_event_init_disabled():
 
 
 def test_event_elapsed_time_basic(init_cuda):
+    device = Device()
     options = EventOptions(enable_timing=True)
-    stream = Device().create_stream()
+    stream = device.create_stream()
+
+    # Create a simple kernel that sleeps for 20 ms to ensure a measurable delay
+    # This guarantees delta_ms > 10 without depending on OS/driver timing characteristics
+    code = """
+    extern "C"
+    __global__ void nanosleep_kernel() {
+        __nanosleep(20000000); // 20 milliseconds
+    }
+    """
+    arch = "".join(f"{i}" for i in device.compute_capability)
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile("cubin")
+    kernel = mod.get_kernel("nanosleep_kernel")
+
     e1 = stream.record(options=options)
+    # Launch the nanosleep kernel to introduce a guaranteed delay
+    config = LaunchConfig(grid=1, block=1)
+    launch(stream, config, kernel)
     e2 = stream.record(options=options)
     e2.sync()
     delta_ms = e2 - e1
@@ -31,12 +54,11 @@ def test_event_elapsed_time_basic(init_cuda):
     # events. This guards against unexpected driver/HW anomalies (e.g. NaN or inf) or general
     # undefined behavior, without asserting anything about the magnitude of the measured time.
     assert math.isfinite(delta_ms)
-    # cudaEventElapsedTime() has a documented resolution of ~0.5 microseconds. With two back-to-back
-    # event records on the same stream, their timestamps may fall into the same tick, producing
-    # an elapsed time of exactly 0.0 ms. The only reliable way to force delta_ms > 0 would be to
-    # insert real GPU work (e.g. a __nanosleep kernel), which is more complexity than this test
-    # should depend on. For correctness we only assert that the elapsed time is non-negative.
-    assert delta_ms >= 0
+    # With the nanosleep kernel between events, we can assert a positive elapsed time.
+    # The kernel sleeps for 20 ms, so delta_ms should be at least ~10 ms.
+    # Using a 10 ms threshold provides a very large safety margin above the ~0.5 microsecond
+    # resolution of cudaEventElapsedTime, making this test deterministic and non-flaky.
+    assert delta_ms > 10
 
 
 def test_is_sync_busy_waited(init_cuda):

From 5eba5ac7de8633b57ca7881f6e0312597d49c545 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Mon, 24 Nov 2025 22:31:35 -0800
Subject: [PATCH 05/13] Fix nanosleep kernel to use clock64() loop for
 guaranteed duration

Replace single __nanosleep() call with clock64()-based loop to ensure the
kernel actually waits for the full 20ms duration. A single __nanosleep()
call doesn't guarantee the full sleep duration, which caused measured
times to be orders of magnitude less than expected (~0.2ms instead of
~20ms).

The new implementation:
- Uses clock64() to measure actual elapsed time
- Loops until 20ms worth of clock cycles have elapsed
- Uses __nanosleep(1000000) inside the loop to yield and avoid 100% CPU spin

This ensures delta_ms > 10 assertion is reliable and the test passes
deterministically.
---
 cuda_core/tests/test_event.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index bc9e7908d7..e99f19bd44 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -30,11 +30,17 @@ def test_event_elapsed_time_basic(init_cuda):
 
     # Create a simple kernel that sleeps for 20 ms to ensure a measurable delay
     # This guarantees delta_ms > 10 without depending on OS/driver timing characteristics
-    code = """
+    # Use clock64() in a loop to ensure we actually wait for the full duration
+    clock_rate_hz = device.properties.clock_rate * 1000
+    sleep_cycles = int(0.020 * clock_rate_hz)  # 20 ms in clock cycles
+    code = f"""
     extern "C"
-    __global__ void nanosleep_kernel() {
-        __nanosleep(20000000); // 20 milliseconds
-    }
+    __global__ void nanosleep_kernel() {{
+        unsigned long long start = clock64();
+        while (clock64() - start < {sleep_cycles}) {{
+            __nanosleep(1000000); // 1 ms yield to avoid 100% spin
+        }}
+    }}
     """
     arch = "".join(f"{i}" for i in device.compute_capability)
     program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
@@ -55,9 +61,10 @@ def test_event_elapsed_time_basic(init_cuda):
     # undefined behavior, without asserting anything about the magnitude of the measured time.
     assert math.isfinite(delta_ms)
     # With the nanosleep kernel between events, we can assert a positive elapsed time.
-    # The kernel sleeps for 20 ms, so delta_ms should be at least ~10 ms.
-    # Using a 10 ms threshold provides a very large safety margin above the ~0.5 microsecond
-    # resolution of cudaEventElapsedTime, making this test deterministic and non-flaky.
+    # The kernel sleeps for 20 ms using clock64(), so delta_ms should be at least ~10 ms.
+    # Using a 10 ms threshold (half the sleep duration) provides a large safety margin above
+    # the ~0.5 microsecond resolution of cudaEventElapsedTime, making this test deterministic
+    # and non-flaky.
     assert delta_ms > 10
 
 

From ad169339ce884e3655f211803e7fb7b4a93edb85 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 25 Nov 2025 14:23:10 -0800
Subject: [PATCH 06/13] clock64() return type is documented as `long long int`:

https://docs.nvidia.com/cuda/cuda-c-programming-guide/#time-function
---
 cuda_core/tests/test_event.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index e99f19bd44..ce26b21c35 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -36,7 +36,7 @@ def test_event_elapsed_time_basic(init_cuda):
     code = f"""
     extern "C"
     __global__ void nanosleep_kernel() {{
-        unsigned long long start = clock64();
+        long long int start = clock64();
         while (clock64() - start < {sleep_cycles}) {{
             __nanosleep(1000000); // 1 ms yield to avoid 100% spin
         }}

From 55d9b44c190e4b7408bf068b12d6d0fba5262572 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 25 Nov 2025 14:28:34 -0800
Subject: [PATCH 07/13] Use device.arch instead of joining
 device.compute_capability

---
 cuda_core/tests/test_event.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index ce26b21c35..412c8c2d3d 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -42,8 +42,7 @@ def test_event_elapsed_time_basic(init_cuda):
         }}
     }}
     """
-    arch = "".join(f"{i}" for i in device.compute_capability)
-    program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
     prog = Program(code, code_type="c++", options=program_options)
     mod = prog.compile("cubin")
     kernel = mod.get_kernel("nanosleep_kernel")

From 3762490416ff37305abd23876385a3034af90c83 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 25 Nov 2025 14:38:35 -0800
Subject: [PATCH 08/13] cusor-generated
 cuda_core/tests/helpers/nanosleep_kernel.py

---
 cuda_core/tests/helpers/nanosleep_kernel.py | 44 +++++++++++++++++++++
 cuda_core/tests/test_event.py               | 27 ++-----------
 2 files changed, 48 insertions(+), 23 deletions(-)
 create mode 100644 cuda_core/tests/helpers/nanosleep_kernel.py

diff --git a/cuda_core/tests/helpers/nanosleep_kernel.py b/cuda_core/tests/helpers/nanosleep_kernel.py
new file mode 100644
index 0000000000..e0cca15586
--- /dev/null
+++ b/cuda_core/tests/helpers/nanosleep_kernel.py
@@ -0,0 +1,44 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.core.experimental import (
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    launch,
+)
+
+
+class NanosleepKernel:
+    """
+    Manages a kernel that sleeps for a specified duration using clock64().
+    """
+
+    def __init__(self, device, sleep_duration_seconds=0.020):
+        """
+        Initialize the nanosleep kernel.
+
+        Args:
+            device: CUDA device to compile the kernel for
+            sleep_duration_seconds: Duration to sleep in seconds (default: 0.020 for 20ms)
+        """
+        clock_rate_hz = device.properties.clock_rate * 1000
+        sleep_cycles = int(sleep_duration_seconds * clock_rate_hz)
+        code = f"""
+        extern "C"
+        __global__ void nanosleep_kernel() {{
+            long long int start = clock64();
+            while (clock64() - start < {sleep_cycles}) {{
+                __nanosleep(1000000); // 1 ms yield to avoid 100% spin
+            }}
+        }}
+        """
+        program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+        prog = Program(code, code_type="c++", options=program_options)
+        mod = prog.compile("cubin")
+        self.kernel = mod.get_kernel("nanosleep_kernel")
+
+    def launch(self, stream):
+        """Launch the nanosleep kernel on the given stream."""
+        config = LaunchConfig(grid=1, block=1)
+        launch(stream, config, self.kernel)
diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 412c8c2d3d..9b9937dd55 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -10,12 +10,9 @@
     Device,
     Event,
     EventOptions,
-    LaunchConfig,
-    Program,
-    ProgramOptions,
-    launch,
 )
 from helpers.latch import LatchKernel
+from helpers.nanosleep_kernel import NanosleepKernel
 
 
 def test_event_init_disabled():
@@ -28,29 +25,13 @@ def test_event_elapsed_time_basic(init_cuda):
     options = EventOptions(enable_timing=True)
     stream = device.create_stream()
 
-    # Create a simple kernel that sleeps for 20 ms to ensure a measurable delay
+    # Create a nanosleep kernel that sleeps for 20 ms to ensure a measurable delay
     # This guarantees delta_ms > 10 without depending on OS/driver timing characteristics
-    # Use clock64() in a loop to ensure we actually wait for the full duration
-    clock_rate_hz = device.properties.clock_rate * 1000
-    sleep_cycles = int(0.020 * clock_rate_hz)  # 20 ms in clock cycles
-    code = f"""
-    extern "C"
-    __global__ void nanosleep_kernel() {{
-        long long int start = clock64();
-        while (clock64() - start < {sleep_cycles}) {{
-            __nanosleep(1000000); // 1 ms yield to avoid 100% spin
-        }}
-    }}
-    """
-    program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
-    prog = Program(code, code_type="c++", options=program_options)
-    mod = prog.compile("cubin")
-    kernel = mod.get_kernel("nanosleep_kernel")
+    nanosleep = NanosleepKernel(device, sleep_duration_seconds=0.020)
 
     e1 = stream.record(options=options)
     # Launch the nanosleep kernel to introduce a guaranteed delay
-    config = LaunchConfig(grid=1, block=1)
-    launch(stream, config, kernel)
+    nanosleep.launch(stream)
     e2 = stream.record(options=options)
     e2.sync()
     delta_ms = e2 - e1

From 528e77e558f82a072745c846ae8cf2c5e4d9eded Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 25 Nov 2025 14:50:06 -0800
Subject: [PATCH 09/13] Change NanosleepKernel API to sleep_duration_ms

---
 cuda_core/tests/helpers/nanosleep_kernel.py | 8 ++++----
 cuda_core/tests/test_event.py               | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cuda_core/tests/helpers/nanosleep_kernel.py b/cuda_core/tests/helpers/nanosleep_kernel.py
index e0cca15586..52238340b2 100644
--- a/cuda_core/tests/helpers/nanosleep_kernel.py
+++ b/cuda_core/tests/helpers/nanosleep_kernel.py
@@ -14,16 +14,16 @@ class NanosleepKernel:
     Manages a kernel that sleeps for a specified duration using clock64().
     """
 
-    def __init__(self, device, sleep_duration_seconds=0.020):
+    def __init__(self, device, sleep_duration_ms=20):
         """
         Initialize the nanosleep kernel.
 
         Args:
             device: CUDA device to compile the kernel for
-            sleep_duration_seconds: Duration to sleep in seconds (default: 0.020 for 20ms)
+            sleep_duration_ms: Duration to sleep in milliseconds (default: 20)
         """
-        clock_rate_hz = device.properties.clock_rate * 1000
-        sleep_cycles = int(sleep_duration_seconds * clock_rate_hz)
+        # clock_rate is in kHz
+        sleep_cycles = int(sleep_duration_ms * device.properties.clock_rate) + 1
         code = f"""
         extern "C"
         __global__ void nanosleep_kernel() {{
diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 9b9937dd55..bf57d4a49e 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -27,7 +27,7 @@ def test_event_elapsed_time_basic(init_cuda):
 
     # Create a nanosleep kernel that sleeps for 20 ms to ensure a measurable delay
     # This guarantees delta_ms > 10 without depending on OS/driver timing characteristics
-    nanosleep = NanosleepKernel(device, sleep_duration_seconds=0.020)
+    nanosleep = NanosleepKernel(device, sleep_duration_ms=20)
 
     e1 = stream.record(options=options)
     # Launch the nanosleep kernel to introduce a guaranteed delay

From f585ce04dd2a0a589b99d6460d9e9029556465a8 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 25 Nov 2025 14:56:36 -0800
Subject: [PATCH 10/13] Rename back to test_timing_success

---
 cuda_core/tests/test_event.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index bf57d4a49e..7ab8ca3254 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -20,7 +20,7 @@ def test_event_init_disabled():
         cuda.core.experimental._event.Event()  # Ensure back door is locked.
 
 
-def test_event_elapsed_time_basic(init_cuda):
+def test_timing_success(init_cuda):
     device = Device()
     options = EventOptions(enable_timing=True)
     stream = device.create_stream()

From 909f380a719fa00adf890d0c6af4285b50d83244 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 25 Nov 2025 15:00:36 -0800
Subject: [PATCH 11/13] Streamline a comment

---
 cuda_core/tests/test_event.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 7ab8ca3254..bf5a1b6575 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -30,8 +30,7 @@ def test_timing_success(init_cuda):
     nanosleep = NanosleepKernel(device, sleep_duration_ms=20)
 
     e1 = stream.record(options=options)
-    # Launch the nanosleep kernel to introduce a guaranteed delay
-    nanosleep.launch(stream)
+    nanosleep.launch(stream)  # Insert a guaranteed delay
     e2 = stream.record(options=options)
     e2.sync()
     delta_ms = e2 - e1

From 18563e864a0533ae5791676fce3be7cb03cee35b Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 25 Nov 2025 15:08:37 -0800
Subject: [PATCH 12/13] Polish comments. Make the code more similar to the
 existing code.

---
 cuda_core/tests/test_event.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index bf5a1b6575..ec35448619 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -21,30 +21,29 @@ def test_event_init_disabled():
 
 
 def test_timing_success(init_cuda):
-    device = Device()
     options = EventOptions(enable_timing=True)
+    device = Device()
     stream = device.create_stream()
 
-    # Create a nanosleep kernel that sleeps for 20 ms to ensure a measurable delay
-    # This guarantees delta_ms > 10 without depending on OS/driver timing characteristics
+    # Create a nanosleep kernel that sleeps for 20 ms to ensure a measurable delay.
+    # This guarantees elapsed_time_ms > 10 without depending on OS/driver timing characteristics.
     nanosleep = NanosleepKernel(device, sleep_duration_ms=20)
 
     e1 = stream.record(options=options)
     nanosleep.launch(stream)  # Insert a guaranteed delay
     e2 = stream.record(options=options)
     e2.sync()
-    delta_ms = e2 - e1
-    assert isinstance(delta_ms, float)
+    elapsed_time_ms = e2 - e1
+    assert isinstance(elapsed_time_ms, float)
     # Sanity check: cuEventElapsedTime should always return a finite float for two completed
     # events. This guards against unexpected driver/HW anomalies (e.g. NaN or inf) or general
     # undefined behavior, without asserting anything about the magnitude of the measured time.
-    assert math.isfinite(delta_ms)
-    # With the nanosleep kernel between events, we can assert a positive elapsed time.
-    # The kernel sleeps for 20 ms using clock64(), so delta_ms should be at least ~10 ms.
-    # Using a 10 ms threshold (half the sleep duration) provides a large safety margin above
-    # the ~0.5 microsecond resolution of cudaEventElapsedTime, making this test deterministic
-    # and non-flaky.
-    assert delta_ms > 10
+    assert math.isfinite(elapsed_time_ms)
+    # With the nanosleep kernel between events, the kernel sleeps for 20 ms using clock64(),
+    # so elapsed_time_ms should definitely be larger than 10 ms. This provides a large safety
+    # margin above the ~0.5 microsecond resolution of cudaEventElapsedTime(), which should
+    # make this test deterministic and non-flaky.
+    assert elapsed_time_ms > 10
 
 
 def test_is_sync_busy_waited(init_cuda):

From 35510e6b620729480b1f1a38a11748efb930b884 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 25 Nov 2025 21:26:43 -0800
Subject: [PATCH 13/13] Simplify nanosleep_kernel implementation.

---
 cuda_core/tests/helpers/nanosleep_kernel.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/cuda_core/tests/helpers/nanosleep_kernel.py b/cuda_core/tests/helpers/nanosleep_kernel.py
index 52238340b2..ea6ae34dcf 100644
--- a/cuda_core/tests/helpers/nanosleep_kernel.py
+++ b/cuda_core/tests/helpers/nanosleep_kernel.py
@@ -14,7 +14,7 @@ class NanosleepKernel:
     Manages a kernel that sleeps for a specified duration using clock64().
     """
 
-    def __init__(self, device, sleep_duration_ms=20):
+    def __init__(self, device, sleep_duration_ms: int = 20):
         """
         Initialize the nanosleep kernel.
 
@@ -22,14 +22,13 @@ def __init__(self, device, sleep_duration_ms=20):
             device: CUDA device to compile the kernel for
             sleep_duration_ms: Duration to sleep in milliseconds (default: 20)
         """
-        # clock_rate is in kHz
-        sleep_cycles = int(sleep_duration_ms * device.properties.clock_rate) + 1
         code = f"""
         extern "C"
         __global__ void nanosleep_kernel() {{
-            long long int start = clock64();
-            while (clock64() - start < {sleep_cycles}) {{
-                __nanosleep(1000000); // 1 ms yield to avoid 100% spin
+            // The maximum sleep duration is approximately 1 millisecond.
+            unsigned int one_ms = 1000000U;
+            for (unsigned int i = 0; i < {sleep_duration_ms}; ++i) {{
+                __nanosleep(one_ms);
             }}
         }}
         """