NVIDIA · yufeiwu-nv · Nov 28, 2025 · Nov 25, 2025 · Nov 25, 2025 · Nov 25, 2025
diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py
@@ -667,9 +667,11 @@ def trt_gpu_clock_lock(request):
     gpu_list = get_gpu_device_list()
     gpu_ids = [gpu.split()[1][:-1] for gpu in gpu_list]  # Extract GPU IDs
     gpu_ids_str = ",".join(gpu_ids)
+    enable_clock_locking = request.config.getoption("--enable-gpu-clock-lock")
     gpu_clock_lock = GPUClockLock(
         gpu_id=gpu_ids_str,
         interval_ms=1000.0,
+        enable_clock_locking=enable_clock_locking,
     )
 
     yield gpu_clock_lock
@@ -2138,6 +2140,13 @@ def pytest_addoption(parser):
         help="Path to the output XML file for periodic JUnit XML reporter. "
         "Only used with --periodic-junit.",
     )
+    parser.addoption(
+        "--enable-gpu-clock-lock",
+        action="store_true",
+        default=False,
+        help="Enable GPU clock locking during tests. "
+        "By default, GPU clock locking is disabled.",
+    )
 
 
 @pytest.hookimpl(trylast=True)

diff --git a/tests/integration/defs/perf/gpu_clock_lock.py b/tests/integration/defs/perf/gpu_clock_lock.py
@@ -67,14 +67,15 @@ def __init__(self, gpu_id, gpu_clock, mem_clock, timestamp, graphics_clk,
 
 class GPUClockLock:
 
-    def __init__(self, gpu_id, interval_ms):
+    def __init__(self, gpu_id, interval_ms, enable_clock_locking=False):
         """
         Sets up clock values and tears down every run. At the end of the session call teardown to complete session and
         reset GPU clocks.
 
         Args:
             gpu_id (str): GPU identifier, either comma-separated UUIDs or comma-separated indices in string.
             interval_ms (float): Interval duration between monitoring samples.
+            enable_clock_locking (bool): If True, enable GPU clock locking. Default is False.
         """
         # Initialize pynvml
         self._nvml_initialized = False
@@ -84,6 +85,7 @@ def __init__(self, gpu_id, interval_ms):
         self._gpu_id = gpu_id
         self._gpu_id_list = [int(id) for id in gpu_id.split(",")]
         self._mobile_disable_clock_locking = False
+        self._enable_clock_locking = enable_clock_locking
 
         # Create GPU handles, one per GPU.
         try:
@@ -207,6 +209,10 @@ def _lock_gpu_clocks(self):
         Implements fail-fast semantics: if any GPU fails to lock, all operations
         are rolled back and an exception is raised.
         """
+        if not self._enable_clock_locking:
+            print_warning("Clock locking is not enabled inside TRTLLM code")
+            return
+
         if self._mobile_disable_clock_locking:
             print_info("Clock locking disabled for mobile/Jetson devices")
             return
@@ -256,12 +262,20 @@ def _lock_gpu_clocks(self):
                         f"GPU {gpu_idx}: Locked clocks to SM={target_sm_clk}MHz, MEM={target_mem_clk}MHz"
                     )
                 except pynvml.NVMLError as e:
-                    print_error(f"Failed to lock clocks for GPU {gpu_idx}: {e}")
                     # Rollback any GPUs that were successfully locked
                     self._rollback_locked_gpus(locked_gpus,
                                                original_clocks_backup)
-                    raise GPUClockLockFailFastError(
-                        f"Failed to lock clocks for GPU {gpu_idx}: {e}")
+
+                    # Only raise GPUClockLockFailFastError for non-permission errors
+                    if isinstance(e, pynvml.NVMLError_NoPermission):
+                        print_warning(
+                            f"Permission denied while locking GPU {gpu_idx}, continuing: {e}"
+                        )
+                    else:
+                        print_error(
+                            f"Failed to lock clocks for GPU {gpu_idx}: {e}")
+                        raise GPUClockLockFailFastError(
+                            f"Failed to lock clocks for GPU {gpu_idx}: {e}")
 
             # Phase 3: Only mark as locked if all GPUs succeeded
             self._original_clocks = original_clocks_backup
@@ -421,6 +435,11 @@ def validate_gpu_monitoring_data(self, deviation_perc=0.07, num_entries=3):
         before considering the entire dataset as invalid
         """
 
+        if not self._enable_clock_locking:
+            print_info(
+                "Skipped gpu monitoring validation (clock locking not enabled)")
+            return
+
         if self._mobile_disable_clock_locking:
             print_info("Skipped gpu monitoring validation for mobile board")
             return