Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions cuda_core/tests/helpers/nanosleep_kernel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from cuda.core.experimental import (
LaunchConfig,
Program,
ProgramOptions,
launch,
)


class NanosleepKernel:
    """
    Manages a kernel that sleeps for a specified duration using clock64().
    """

    def __init__(self, device, sleep_duration_ms: int = 20):
        """
        Compile the sleep kernel for the given device.

        Args:
            device: CUDA device to compile the kernel for
            sleep_duration_ms: Duration to sleep in milliseconds (default: 20)
        """
        # A single __nanosleep() call is capped at roughly 1 ms, so the
        # kernel loops once per requested millisecond.
        source = f"""
        extern "C"
        __global__ void nanosleep_kernel() {{
            // The maximum sleep duration is approximately 1 millisecond.
            unsigned int one_ms = 1000000U;
            for (unsigned int i = 0; i < {sleep_duration_ms}; ++i) {{
                __nanosleep(one_ms);
            }}
        }}
        """
        opts = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
        module = Program(source, code_type="c++", options=opts).compile("cubin")
        self.kernel = module.get_kernel("nanosleep_kernel")

    def launch(self, stream):
        """Launch the nanosleep kernel on the given stream."""
        launch(stream, LaunchConfig(grid=1, block=1), self.kernel)
38 changes: 20 additions & 18 deletions cuda_core/tests/test_event.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import os
import time

import math

import cuda.core.experimental
import pytest
Expand All @@ -12,8 +12,7 @@
EventOptions,
)
from helpers.latch import LatchKernel

from cuda_python_test_helpers import IS_WSL
from helpers.nanosleep_kernel import NanosleepKernel


def test_event_init_disabled():
Expand All @@ -23,25 +22,28 @@ def test_event_init_disabled():

def test_timing_success(init_cuda):
    """Exercise Event.__sub__: elapsed time between two recorded events.

    NOTE(review): the pasted diff interleaved the old time.sleep()-based
    delay and tolerance logic with the new NanosleepKernel version; this is
    the reconstructed post-merge test.
    """
    options = EventOptions(enable_timing=True)
    device = Device()
    stream = device.create_stream()

    # Create a nanosleep kernel that sleeps for 20 ms to ensure a measurable delay.
    # This guarantees elapsed_time_ms > 10 without depending on OS/driver timing
    # characteristics.
    nanosleep = NanosleepKernel(device, sleep_duration_ms=20)

    e1 = stream.record(options=options)
    nanosleep.launch(stream)  # Insert a guaranteed delay
    e2 = stream.record(options=options)
    e2.sync()
    elapsed_time_ms = e2 - e1
    assert isinstance(elapsed_time_ms, float)
    # Sanity check: cuEventElapsedTime should always return a finite float for two
    # completed events. This guards against unexpected driver/HW anomalies (e.g. NaN
    # or inf) or general undefined behavior, without asserting anything about the
    # magnitude of the measured time.
    assert math.isfinite(elapsed_time_ms)
    # With the nanosleep kernel between events, the kernel sleeps for 20 ms, so
    # elapsed_time_ms should definitely be larger than 10 ms. This provides a large
    # safety margin above the ~0.5 microsecond resolution of cudaEventElapsedTime(),
    # which should make this test deterministic and non-flaky.
    assert elapsed_time_ms > 10


def test_is_sync_busy_waited(init_cuda):
Expand Down
Loading