From 9b11ffae4343aacfbe538c672cd897d3ee70edb0 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 24 Nov 2025 13:54:24 -0800 Subject: [PATCH 01/13] Replace timing-based event test with deterministic elapsed-time check The previous test attempted to measure a real sleep delay between two event records, which introduced flakiness (especially on Windows/WDDM) and tested OS/driver timing behavior rather than the __sub__ implementation itself. This change replaces the test with a minimal, deterministic version that: * records two back-to-back events on the same stream * synchronizes on the second event to ensure both timestamps are valid * asserts that cuEventElapsedTime returns a finite, non-negative float This exercises the success path of Event.__sub__ without depending on actual GPU/OS timing characteristics, or requiring artificial GPU work. --- cuda_core/tests/test_event.py | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 992a78e92e..f5bf7cb0cb 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -1,8 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import os -import time + +import math import cuda.core.experimental import pytest @@ -13,35 +13,30 @@ ) from helpers.latch import LatchKernel -from cuda_python_test_helpers import IS_WSL - def test_event_init_disabled(): with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."): cuda.core.experimental._event.Event() # Ensure back door is locked. -def test_timing_success(init_cuda): +def test_event_elapsed_time_basic(init_cuda): options = EventOptions(enable_timing=True) stream = Device().create_stream() - delay_seconds = 0.5 e1 = stream.record(options=options) - time.sleep(delay_seconds) e2 = stream.record(options=options) e2.sync() - elapsed_time_ms = e2 - e1 - assert isinstance(elapsed_time_ms, float) - # Using a generous tolerance, to avoid flaky tests: - # We only want to exercise the __sub__ method, this test is not meant - # to stress-test the CUDA driver or time.sleep(). - delay_ms = delay_seconds * 1000 - if os.name == "nt" or IS_WSL: # noqa: SIM108 - # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default. - generous_tolerance = 100 - else: - # Most modern Linux kernels have a default timer resolution of 1 ms. - generous_tolerance = 20 - assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance + delta_ms = e2 - e1 + assert isinstance(delta_ms, float) + # Sanity check: cuEventElapsedTime should always return a finite float for two completed + # events. This guards against unexpected driver/HW anomalies (e.g. NaN or inf) or general + # undefined behavior, without asserting anything about the magnitude of the measured time. + assert math.isfinite(delta_ms) + # cudaEventElapsedTime() has a documented resolution of ~0.5 microseconds. With two back-to-back + # event records on the same stream, their timestamps may fall into the same tick, producing + # an elapsed time of exactly 0.0 ms. The only reliable way to force delta_ms > 0 would be to + # insert real GPU work (e.g. a __nanosleep kernel), which is more complexity than this test + # should depend on. For correctness we only assert that the elapsed time is non-negative. + assert delta_ms >= 0 def test_is_sync_busy_waited(init_cuda): From 605f1ef4489f417f2cad330edda7b9f26c055d61 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 24 Nov 2025 15:08:44 -0800 Subject: [PATCH 02/13] cuda_core/tests/helpers/__init__.py: also use CUDA_HOME --- cuda_core/tests/helpers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/helpers/__init__.py b/cuda_core/tests/helpers/__init__.py index 02cbe6d8e9..82b07dc4ce 100644 --- a/cuda_core/tests/helpers/__init__.py +++ b/cuda_core/tests/helpers/__init__.py @@ -5,7 +5,7 @@ import pathlib import sys -CUDA_PATH = os.environ.get("CUDA_PATH") +CUDA_PATH = os.environ.get("CUDA_HOME", os.environ.get("CUDA_PATH")) CUDA_INCLUDE_PATH = None CCCL_INCLUDE_PATHS = None if CUDA_PATH is not None: From 29f788287a66e4208300503ff99d6185648ba999 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 24 Nov 2025 17:39:00 -0800 Subject: [PATCH 03/13] Revert "cuda_core/tests/helpers/__init__.py: also use CUDA_HOME" This reverts commit 605f1ef4489f417f2cad330edda7b9f26c055d61. --- cuda_core/tests/helpers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/helpers/__init__.py b/cuda_core/tests/helpers/__init__.py index 82b07dc4ce..02cbe6d8e9 100644 --- a/cuda_core/tests/helpers/__init__.py +++ b/cuda_core/tests/helpers/__init__.py @@ -5,7 +5,7 @@ import pathlib import sys -CUDA_PATH = os.environ.get("CUDA_HOME", os.environ.get("CUDA_PATH")) +CUDA_PATH = os.environ.get("CUDA_PATH") CUDA_INCLUDE_PATH = None CCCL_INCLUDE_PATHS = None if CUDA_PATH is not None: From 7d5ee2bc2ceeebf65dd9e0b123f3f0fced3bffd8 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 24 Nov 2025 22:10:44 -0800 Subject: [PATCH 04/13] Use nanosleep kernel in test_event_elapsed_time_basic for deterministic timing Replace the back-to-back event record test with a version that uses a __nanosleep kernel between events. This ensures a guaranteed positive elapsed time (delta_ms > 10) without depending on OS/driver timing characteristics or requiring artificial GPU work beyond the minimal nanosleep delay. The kernel sleeps for 20ms (double the assertion threshold of 10ms), providing a large safety margin above the ~0.5 microsecond resolution of cudaEventElapsedTime, making this test deterministic and non-flaky across platforms including Windows/WDDM. --- cuda_core/tests/test_event.py | 36 ++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index f5bf7cb0cb..bc9e7908d7 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -10,6 +10,10 @@ Device, Event, EventOptions, + LaunchConfig, + Program, + ProgramOptions, + launch, ) from helpers.latch import LatchKernel @@ -20,9 +24,28 @@ def test_event_init_disabled(): def test_event_elapsed_time_basic(init_cuda): + device = Device() options = EventOptions(enable_timing=True) - stream = Device().create_stream() + stream = device.create_stream() + + # Create a simple kernel that sleeps for 20 ms to ensure a measurable delay + # This guarantees delta_ms > 10 without depending on OS/driver timing characteristics + code = """ + extern "C" + __global__ void nanosleep_kernel() { + __nanosleep(20000000); // 20 milliseconds + } + """ + arch = "".join(f"{i}" for i in device.compute_capability) + program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}") + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile("cubin") + kernel = mod.get_kernel("nanosleep_kernel") + e1 = stream.record(options=options) + # Launch the nanosleep kernel to introduce a guaranteed delay + config = LaunchConfig(grid=1, block=1) + launch(stream, config, kernel) e2 = stream.record(options=options) e2.sync() delta_ms = e2 - e1 @@ -31,12 +54,11 @@ def test_event_elapsed_time_basic(init_cuda): # events. This guards against unexpected driver/HW anomalies (e.g. NaN or inf) or general # undefined behavior, without asserting anything about the magnitude of the measured time. assert math.isfinite(delta_ms) - # cudaEventElapsedTime() has a documented resolution of ~0.5 microseconds. With two back-to-back - # event records on the same stream, their timestamps may fall into the same tick, producing - # an elapsed time of exactly 0.0 ms. The only reliable way to force delta_ms > 0 would be to - # insert real GPU work (e.g. a __nanosleep kernel), which is more complexity than this test - # should depend on. For correctness we only assert that the elapsed time is non-negative. - assert delta_ms >= 0 + # With the nanosleep kernel between events, we can assert a positive elapsed time. + # The kernel sleeps for 20 ms, so delta_ms should be at least ~10 ms. + # Using a 10 ms threshold provides a very large safety margin above the ~0.5 microsecond + # resolution of cudaEventElapsedTime, making this test deterministic and non-flaky. + assert delta_ms > 10 def test_is_sync_busy_waited(init_cuda): From 5eba5ac7de8633b57ca7881f6e0312597d49c545 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 24 Nov 2025 22:31:35 -0800 Subject: [PATCH 05/13] Fix nanosleep kernel to use clock64() loop for guaranteed duration Replace single __nanosleep() call with clock64()-based loop to ensure the kernel actually waits for the full 20ms duration. A single __nanosleep() call doesn't guarantee the full sleep duration, which caused measured times to be orders of magnitude less than expected (~0.2ms instead of ~20ms). The new implementation: - Uses clock64() to measure actual elapsed time - Loops until 20ms worth of clock cycles have elapsed - Uses __nanosleep(1000000) inside the loop to yield and avoid 100% CPU spin This ensures delta_ms > 10 assertion is reliable and the test passes deterministically. --- cuda_core/tests/test_event.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index bc9e7908d7..e99f19bd44 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -30,11 +30,17 @@ def test_event_elapsed_time_basic(init_cuda): # Create a simple kernel that sleeps for 20 ms to ensure a measurable delay # This guarantees delta_ms > 10 without depending on OS/driver timing characteristics - code = """ + # Use clock64() in a loop to ensure we actually wait for the full duration + clock_rate_hz = device.properties.clock_rate * 1000 + sleep_cycles = int(0.020 * clock_rate_hz) # 20 ms in clock cycles + code = f""" extern "C" - __global__ void nanosleep_kernel() { - __nanosleep(20000000); // 20 milliseconds - } + __global__ void nanosleep_kernel() {{ + unsigned long long start = clock64(); + while (clock64() - start < {sleep_cycles}) {{ + __nanosleep(1000000); // 1 ms yield to avoid 100% spin + }} + }} """ arch = "".join(f"{i}" for i in device.compute_capability) program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}") @@ -55,9 +61,10 @@ def test_event_elapsed_time_basic(init_cuda): # undefined behavior, without asserting anything about the magnitude of the measured time. assert math.isfinite(delta_ms) # With the nanosleep kernel between events, we can assert a positive elapsed time. - # The kernel sleeps for 20 ms, so delta_ms should be at least ~10 ms. - # Using a 10 ms threshold provides a very large safety margin above the ~0.5 microsecond - # resolution of cudaEventElapsedTime, making this test deterministic and non-flaky. + # The kernel sleeps for 20 ms using clock64(), so delta_ms should be at least ~10 ms. + # Using a 10 ms threshold (half the sleep duration) provides a large safety margin above + # the ~0.5 microsecond resolution of cudaEventElapsedTime, making this test deterministic + # and non-flaky. assert delta_ms > 10 From ad169339ce884e3655f211803e7fb7b4a93edb85 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 25 Nov 2025 14:23:10 -0800 Subject: [PATCH 06/13] clock64() return type is documented as `long long int`: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#time-function --- cuda_core/tests/test_event.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index e99f19bd44..ce26b21c35 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -36,7 +36,7 @@ def test_event_elapsed_time_basic(init_cuda): code = f""" extern "C" __global__ void nanosleep_kernel() {{ - unsigned long long start = clock64(); + long long int start = clock64(); while (clock64() - start < {sleep_cycles}) {{ __nanosleep(1000000); // 1 ms yield to avoid 100% spin }} From 55d9b44c190e4b7408bf068b12d6d0fba5262572 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 25 Nov 2025 14:28:34 -0800 Subject: [PATCH 07/13] Use device.arch instead of joining device.compute_capability --- cuda_core/tests/test_event.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index ce26b21c35..412c8c2d3d 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -42,8 +42,7 @@ def test_event_elapsed_time_basic(init_cuda): }} }} """ - arch = "".join(f"{i}" for i in device.compute_capability) - program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}") + program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}") prog = Program(code, code_type="c++", options=program_options) mod = prog.compile("cubin") kernel = mod.get_kernel("nanosleep_kernel") From 3762490416ff37305abd23876385a3034af90c83 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 25 Nov 2025 14:38:35 -0800 Subject: [PATCH 08/13] cusor-generated cuda_core/tests/helpers/nanosleep_kernel.py --- cuda_core/tests/helpers/nanosleep_kernel.py | 44 +++++++++++++++++++++ cuda_core/tests/test_event.py | 27 ++----------- 2 files changed, 48 insertions(+), 23 deletions(-) create mode 100644 cuda_core/tests/helpers/nanosleep_kernel.py diff --git a/cuda_core/tests/helpers/nanosleep_kernel.py b/cuda_core/tests/helpers/nanosleep_kernel.py new file mode 100644 index 0000000000..e0cca15586 --- /dev/null +++ b/cuda_core/tests/helpers/nanosleep_kernel.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental import ( + LaunchConfig, + Program, + ProgramOptions, + launch, +) + + +class NanosleepKernel: + """ + Manages a kernel that sleeps for a specified duration using clock64(). + """ + + def __init__(self, device, sleep_duration_seconds=0.020): + """ + Initialize the nanosleep kernel. + + Args: + device: CUDA device to compile the kernel for + sleep_duration_seconds: Duration to sleep in seconds (default: 0.020 for 20ms) + """ + clock_rate_hz = device.properties.clock_rate * 1000 + sleep_cycles = int(sleep_duration_seconds * clock_rate_hz) + code = f""" + extern "C" + __global__ void nanosleep_kernel() {{ + long long int start = clock64(); + while (clock64() - start < {sleep_cycles}) {{ + __nanosleep(1000000); // 1 ms yield to avoid 100% spin + }} + }} + """ + program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}") + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile("cubin") + self.kernel = mod.get_kernel("nanosleep_kernel") + + def launch(self, stream): + """Launch the nanosleep kernel on the given stream.""" + config = LaunchConfig(grid=1, block=1) + launch(stream, config, self.kernel) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 412c8c2d3d..9b9937dd55 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -10,12 +10,9 @@ Device, Event, EventOptions, - LaunchConfig, - Program, - ProgramOptions, - launch, ) from helpers.latch import LatchKernel +from helpers.nanosleep_kernel import NanosleepKernel def test_event_init_disabled(): @@ -28,29 +25,13 @@ def test_event_elapsed_time_basic(init_cuda): options = EventOptions(enable_timing=True) stream = device.create_stream() - # Create a simple kernel that sleeps for 20 ms to ensure a measurable delay + # Create a nanosleep kernel that sleeps for 20 ms to ensure a measurable delay # This guarantees delta_ms > 10 without depending on OS/driver timing characteristics - # Use clock64() in a loop to ensure we actually wait for the full duration - clock_rate_hz = device.properties.clock_rate * 1000 - sleep_cycles = int(0.020 * clock_rate_hz) # 20 ms in clock cycles - code = f""" - extern "C" - __global__ void nanosleep_kernel() {{ - long long int start = clock64(); - while (clock64() - start < {sleep_cycles}) {{ - __nanosleep(1000000); // 1 ms yield to avoid 100% spin - }} - }} - """ - program_options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}") - prog = Program(code, code_type="c++", options=program_options) - mod = prog.compile("cubin") - kernel = mod.get_kernel("nanosleep_kernel") + nanosleep = NanosleepKernel(device, sleep_duration_seconds=0.020) e1 = stream.record(options=options) # Launch the nanosleep kernel to introduce a guaranteed delay - config = LaunchConfig(grid=1, block=1) - launch(stream, config, kernel) + nanosleep.launch(stream) e2 = stream.record(options=options) e2.sync() delta_ms = e2 - e1 From 528e77e558f82a072745c846ae8cf2c5e4d9eded Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 25 Nov 2025 14:50:06 -0800 Subject: [PATCH 09/13] Change NanosleepKernel API to sleep_duration_ms --- cuda_core/tests/helpers/nanosleep_kernel.py | 8 ++++---- cuda_core/tests/test_event.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cuda_core/tests/helpers/nanosleep_kernel.py b/cuda_core/tests/helpers/nanosleep_kernel.py index e0cca15586..52238340b2 100644 --- a/cuda_core/tests/helpers/nanosleep_kernel.py +++ b/cuda_core/tests/helpers/nanosleep_kernel.py @@ -14,16 +14,16 @@ class NanosleepKernel: Manages a kernel that sleeps for a specified duration using clock64(). """ - def __init__(self, device, sleep_duration_seconds=0.020): + def __init__(self, device, sleep_duration_ms=20): """ Initialize the nanosleep kernel. Args: device: CUDA device to compile the kernel for - sleep_duration_seconds: Duration to sleep in seconds (default: 0.020 for 20ms) + sleep_duration_ms: Duration to sleep in milliseconds (default: 20) """ - clock_rate_hz = device.properties.clock_rate * 1000 - sleep_cycles = int(sleep_duration_seconds * clock_rate_hz) + # clock_rate is in kHz + sleep_cycles = int(sleep_duration_ms * device.properties.clock_rate) + 1 code = f""" extern "C" __global__ void nanosleep_kernel() {{ diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 9b9937dd55..bf57d4a49e 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -27,7 +27,7 @@ def test_event_elapsed_time_basic(init_cuda): # Create a nanosleep kernel that sleeps for 20 ms to ensure a measurable delay # This guarantees delta_ms > 10 without depending on OS/driver timing characteristics - nanosleep = NanosleepKernel(device, sleep_duration_seconds=0.020) + nanosleep = NanosleepKernel(device, sleep_duration_ms=20) e1 = stream.record(options=options) # Launch the nanosleep kernel to introduce a guaranteed delay From f585ce04dd2a0a589b99d6460d9e9029556465a8 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 25 Nov 2025 14:56:36 -0800 Subject: [PATCH 10/13] Rename back to test_timing_success --- cuda_core/tests/test_event.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index bf57d4a49e..7ab8ca3254 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -20,7 +20,7 @@ def test_event_init_disabled(): cuda.core.experimental._event.Event() # Ensure back door is locked. -def test_event_elapsed_time_basic(init_cuda): +def test_timing_success(init_cuda): device = Device() options = EventOptions(enable_timing=True) stream = device.create_stream() From 909f380a719fa00adf890d0c6af4285b50d83244 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 25 Nov 2025 15:00:36 -0800 Subject: [PATCH 11/13] Streamline a comment --- cuda_core/tests/test_event.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 7ab8ca3254..bf5a1b6575 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -30,8 +30,7 @@ def test_timing_success(init_cuda): nanosleep = NanosleepKernel(device, sleep_duration_ms=20) e1 = stream.record(options=options) - # Launch the nanosleep kernel to introduce a guaranteed delay - nanosleep.launch(stream) + nanosleep.launch(stream) # Insert a guaranteed delay e2 = stream.record(options=options) e2.sync() delta_ms = e2 - e1 From 18563e864a0533ae5791676fce3be7cb03cee35b Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 25 Nov 2025 15:08:37 -0800 Subject: [PATCH 12/13] Polish comments. Make the code more similar to the existing code. --- cuda_core/tests/test_event.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index bf5a1b6575..ec35448619 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -21,30 +21,29 @@ def test_event_init_disabled(): def test_timing_success(init_cuda): - device = Device() options = EventOptions(enable_timing=True) + device = Device() stream = device.create_stream() - # Create a nanosleep kernel that sleeps for 20 ms to ensure a measurable delay - # This guarantees delta_ms > 10 without depending on OS/driver timing characteristics + # Create a nanosleep kernel that sleeps for 20 ms to ensure a measurable delay. + # This guarantees elapsed_time_ms > 10 without depending on OS/driver timing characteristics. nanosleep = NanosleepKernel(device, sleep_duration_ms=20) e1 = stream.record(options=options) nanosleep.launch(stream) # Insert a guaranteed delay e2 = stream.record(options=options) e2.sync() - delta_ms = e2 - e1 - assert isinstance(delta_ms, float) + elapsed_time_ms = e2 - e1 + assert isinstance(elapsed_time_ms, float) # Sanity check: cuEventElapsedTime should always return a finite float for two completed # events. This guards against unexpected driver/HW anomalies (e.g. NaN or inf) or general # undefined behavior, without asserting anything about the magnitude of the measured time. - assert math.isfinite(delta_ms) - # With the nanosleep kernel between events, we can assert a positive elapsed time. - # The kernel sleeps for 20 ms using clock64(), so delta_ms should be at least ~10 ms. - # Using a 10 ms threshold (half the sleep duration) provides a large safety margin above - # the ~0.5 microsecond resolution of cudaEventElapsedTime, making this test deterministic - # and non-flaky. - assert delta_ms > 10 + assert math.isfinite(elapsed_time_ms) + # With the nanosleep kernel between events, the kernel sleeps for 20 ms using clock64(), + # so elapsed_time_ms should definitely be larger than 10 ms. This provides a large safety + # margin above the ~0.5 microsecond resolution of cudaEventElapsedTime(), which should + # make this test deterministic and non-flaky. + assert elapsed_time_ms > 10 def test_is_sync_busy_waited(init_cuda): From 35510e6b620729480b1f1a38a11748efb930b884 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 25 Nov 2025 21:26:43 -0800 Subject: [PATCH 13/13] Simplify nanosleep_kernel implementation. --- cuda_core/tests/helpers/nanosleep_kernel.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/helpers/nanosleep_kernel.py b/cuda_core/tests/helpers/nanosleep_kernel.py index 52238340b2..ea6ae34dcf 100644 --- a/cuda_core/tests/helpers/nanosleep_kernel.py +++ b/cuda_core/tests/helpers/nanosleep_kernel.py @@ -14,7 +14,7 @@ class NanosleepKernel: Manages a kernel that sleeps for a specified duration using clock64(). """ - def __init__(self, device, sleep_duration_ms=20): + def __init__(self, device, sleep_duration_ms: int = 20): """ Initialize the nanosleep kernel. @@ -22,14 +22,13 @@ def __init__(self, device, sleep_duration_ms=20): device: CUDA device to compile the kernel for sleep_duration_ms: Duration to sleep in milliseconds (default: 20) """ - # clock_rate is in kHz - sleep_cycles = int(sleep_duration_ms * device.properties.clock_rate) + 1 code = f""" extern "C" __global__ void nanosleep_kernel() {{ - long long int start = clock64(); - while (clock64() - start < {sleep_cycles}) {{ - __nanosleep(1000000); // 1 ms yield to avoid 100% spin + // The maximum sleep duration is approximately 1 millisecond. + unsigned int one_ms = 1000000U; + for (unsigned int i = 0; i < {sleep_duration_ms}; ++i) {{ + __nanosleep(one_ms); }} }} """