Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions cuda_core/tests/helpers/nanosleep_kernel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from cuda.core.experimental import (
LaunchConfig,
Program,
ProgramOptions,
launch,
)


class NanosleepKernel:
    """
    Manages a kernel that sleeps for a specified duration using clock64().
    """

    def __init__(self, device, sleep_duration_ms: int = 20):
        """
        Compile the sleep kernel for the given device.

        Args:
            device: CUDA device to compile the kernel for
            sleep_duration_ms: Duration to sleep in milliseconds (default: 20)
        """
        # A single __nanosleep() call is capped at roughly 1 ms, so the
        # kernel loops once per requested millisecond.
        source = f"""
        extern "C"
        __global__ void nanosleep_kernel() {{
            // The maximum sleep duration is approximately 1 millisecond.
            unsigned int one_ms = 1000000U;
            for (unsigned int i = 0; i < {sleep_duration_ms}; ++i) {{
                __nanosleep(one_ms);
            }}
        }}
        """
        opts = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
        module = Program(source, code_type="c++", options=opts).compile("cubin")
        self.kernel = module.get_kernel("nanosleep_kernel")

    def launch(self, stream):
        """Launch the nanosleep kernel on the given stream."""
        launch(stream, LaunchConfig(grid=1, block=1), self.kernel)
38 changes: 20 additions & 18 deletions cuda_core/tests/test_event.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import os
import time

import math

import cuda.core.experimental
import pytest
Expand All @@ -12,8 +12,7 @@
EventOptions,
)
from helpers.latch import LatchKernel

from cuda_python_test_helpers import IS_WSL
from helpers.nanosleep_kernel import NanosleepKernel


def test_event_init_disabled():
Expand All @@ -23,25 +22,28 @@ def test_event_init_disabled():

def test_timing_success(init_cuda):
    """Exercise Event.__sub__: elapsed time between two recorded events.

    NOTE(review): the pasted diff interleaved the old time.sleep()-based
    delay and tolerance logic with the new NanosleepKernel version; this is
    the reconstructed post-merge test.
    """
    options = EventOptions(enable_timing=True)
    device = Device()
    stream = device.create_stream()

    # Create a nanosleep kernel that sleeps for 20 ms to ensure a measurable delay.
    # This guarantees elapsed_time_ms > 10 without depending on OS/driver timing
    # characteristics.
    nanosleep = NanosleepKernel(device, sleep_duration_ms=20)

    e1 = stream.record(options=options)
    nanosleep.launch(stream)  # Insert a guaranteed delay
    e2 = stream.record(options=options)
    e2.sync()
    elapsed_time_ms = e2 - e1
    assert isinstance(elapsed_time_ms, float)
    # Sanity check: cuEventElapsedTime should always return a finite float for two
    # completed events. This guards against unexpected driver/HW anomalies (e.g. NaN
    # or inf) or general undefined behavior, without asserting anything about the
    # magnitude of the measured time.
    assert math.isfinite(elapsed_time_ms)
    # With the nanosleep kernel between events, the kernel sleeps for 20 ms, so
    # elapsed_time_ms should definitely be larger than 10 ms. This provides a large
    # safety margin above the ~0.5 microsecond resolution of cudaEventElapsedTime(),
    # which should make this test deterministic and non-flaky.
    assert elapsed_time_ms > 10


def test_is_sync_busy_waited(init_cuda):
Expand Down
Loading