Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions tests/integration/defs/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,9 +667,11 @@ def trt_gpu_clock_lock(request):
gpu_list = get_gpu_device_list()
gpu_ids = [gpu.split()[1][:-1] for gpu in gpu_list] # Extract GPU IDs
gpu_ids_str = ",".join(gpu_ids)
enable_clock_locking = request.config.getoption("--enable-gpu-clock-lock")
gpu_clock_lock = GPUClockLock(
gpu_id=gpu_ids_str,
interval_ms=1000.0,
enable_clock_locking=enable_clock_locking,
)

yield gpu_clock_lock
Expand Down Expand Up @@ -2138,6 +2140,13 @@ def pytest_addoption(parser):
help="Path to the output XML file for periodic JUnit XML reporter. "
"Only used with --periodic-junit.",
)
parser.addoption(
"--enable-gpu-clock-lock",
action="store_true",
default=False,
help="Enable GPU clock locking during tests. "
"By default, GPU clock locking is disabled.",
)


@pytest.hookimpl(trylast=True)
Expand Down
27 changes: 23 additions & 4 deletions tests/integration/defs/perf/gpu_clock_lock.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,15 @@ def __init__(self, gpu_id, gpu_clock, mem_clock, timestamp, graphics_clk,

class GPUClockLock:

def __init__(self, gpu_id, interval_ms):
def __init__(self, gpu_id, interval_ms, enable_clock_locking=False):
"""
Sets up clock values and tears them down on every run. At the end of the session, call teardown to complete the
session and reset the GPU clocks.

Args:
gpu_id (str): GPU identifier given as comma-separated integer indices in a string (e.g. "0,1"); each entry is converted with int().
interval_ms (float): Interval duration between monitoring samples.
enable_clock_locking (bool): If True, enable GPU clock locking. Default is False.
"""
# Initialize pynvml
self._nvml_initialized = False
Expand All @@ -84,6 +85,7 @@ def __init__(self, gpu_id, interval_ms):
self._gpu_id = gpu_id
self._gpu_id_list = [int(id) for id in gpu_id.split(",")]
self._mobile_disable_clock_locking = False
self._enable_clock_locking = enable_clock_locking

# Create GPU handles, one per GPU.
try:
Expand Down Expand Up @@ -207,6 +209,10 @@ def _lock_gpu_clocks(self):
Implements fail-fast semantics: if any GPU fails to lock, all operations
are rolled back and an exception is raised.
"""
if not self._enable_clock_locking:
print_warning("Clock locking is not enabled inside TRTLLM code")
return

if self._mobile_disable_clock_locking:
print_info("Clock locking disabled for mobile/Jetson devices")
return
Expand Down Expand Up @@ -256,12 +262,20 @@ def _lock_gpu_clocks(self):
f"GPU {gpu_idx}: Locked clocks to SM={target_sm_clk}MHz, MEM={target_mem_clk}MHz"
)
except pynvml.NVMLError as e:
print_error(f"Failed to lock clocks for GPU {gpu_idx}: {e}")
# Rollback any GPUs that were successfully locked
self._rollback_locked_gpus(locked_gpus,
original_clocks_backup)
raise GPUClockLockFailFastError(
f"Failed to lock clocks for GPU {gpu_idx}: {e}")

# Only raise GPUClockLockFailFastError for non-permission errors
if isinstance(e, pynvml.NVMLError_NoPermission):
print_warning(
f"Permission denied while locking GPU {gpu_idx}, continuing: {e}"
)
else:
print_error(
f"Failed to lock clocks for GPU {gpu_idx}: {e}")
raise GPUClockLockFailFastError(
f"Failed to lock clocks for GPU {gpu_idx}: {e}")

# Phase 3: Only mark as locked if all GPUs succeeded
self._original_clocks = original_clocks_backup
Expand Down Expand Up @@ -421,6 +435,11 @@ def validate_gpu_monitoring_data(self, deviation_perc=0.07, num_entries=3):
before considering the entire dataset as invalid
"""

if not self._enable_clock_locking:
print_info(
"Skipped gpu monitoring validation (clock locking not enabled)")
return

if self._mobile_disable_clock_locking:
print_info("Skipped gpu monitoring validation for mobile board")
return
Expand Down