NVIDIA · qiaoxj07 · Nov 18, 2025 · Nov 16, 2025
@@ -6,7 +6,6 @@
 
 from tensorrt_llm import LLM as PyTorchLLM
 from tensorrt_llm._tensorrt_engine import LLM
-from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
 from tensorrt_llm.bench.benchmark.utils.processes import IterationWriter
 from tensorrt_llm.bench.build.build import get_model_config
 from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
@@ -115,6 +114,8 @@ def get_llm(runtime_config: RuntimeConfig, kwargs: dict):
             kwargs["enable_iter_perf_stats"] = True
 
     elif runtime_config.backend == "_autodeploy":
+        from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
+
         kwargs["world_size"] = kwargs.pop("tensor_parallel_size", None)
         llm_cls = AutoDeployLLM
 

diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py
@@ -16,7 +16,6 @@
 from tensorrt_llm import LLM as PyTorchLLM
 from tensorrt_llm import MultimodalEncoder
 from tensorrt_llm._tensorrt_engine import LLM
-from tensorrt_llm._torch.auto_deploy.llm import LLM as AutoDeployLLM
 from tensorrt_llm._utils import mpi_rank
 from tensorrt_llm.executor.utils import LlmLauncherEnvs
 from tensorrt_llm.inputs.multimodal import MultimodalServerConfig
@@ -162,6 +161,8 @@ def launch_server(
     if backend == 'pytorch':
         llm = PyTorchLLM(**llm_args)
     elif backend == '_autodeploy':
+        from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
+
         # AutoDeploy does not support build_config
         llm_args.pop("build_config", None)
         llm = AutoDeployLLM(**llm_args)