28 changes: 19 additions & 9 deletions tensorrt_llm/executor/base_worker.py
@@ -125,12 +125,13 @@ def _configure_affinity(self, device_id):
         Note:
             If the process already has constrained affinity, a warning is logged.
             Configuration is handled as follows:
-                TLLM_NUMA_WORKER_AFFINITY = <unset>
-                    -> affinity is auto-configured only if it is unconstrained
-                TLLM_NUMA_WORKER_AFFINITY = 1
-                    -> affinity is unconditionally auto-configured
-                TLLM_NUMA_WORKER_AFFINITY = 0 or any other value
-                    -> affinity is unconditionally _not_ auto-configured
+                TLLM_NUMA_AWARE_WORKER_AFFINITY = <unset>
+                    -> Affinity is automatically configured if it is unconstrained,
+                       and deleted if it is constrained externally by the user.
+                TLLM_NUMA_AWARE_WORKER_AFFINITY = 1
+                    -> Affinity is unconditionally auto-configured.
+                TLLM_NUMA_AWARE_WORKER_AFFINITY = 0 or any other value
+                    -> Affinity is unconditionally _not_ auto-configured.
         '''

         # Get the current affinity setting
@@ -141,22 +142,31 @@ def _configure_affinity(self, device_id):
         all_cpus = list(range(psutil.cpu_count()))

         constrained_affinity = (cpu_affinity != all_cpus)
+        numa_aware_affinity = os.environ.get("TLLM_NUMA_AWARE_WORKER_AFFINITY")

-        # If the process is affined to a constrained set of CPUs, warn the user
-        # so as to ensure that this is what is intended
+        # If affinity is constrained but the user hasn't explicitly
+        # requested NUMA-aware affinity, remove the constraints.
         if constrained_affinity:
-            logger.warning(
-                f"Worker process {pid} is affined to run on the following CPUs: "
-                f"{cpu_affinity} (subset of all logical CPUs). This may harm "
-                f"performance if set incorrectly.")
+            if numa_aware_affinity is None:
+                logger.warning(
+                    f"Worker process {pid} has constrained CPU affinity "
+                    f"but `TLLM_NUMA_AWARE_WORKER_AFFINITY` is not set. "
+                    f"Removing CPU affinity constraints.")
+                process.cpu_affinity(all_cpus)

         # If affinity is unconstrained and the user hasn't explicitly
         # prohibited it or the user has explicitly requested it, choose the
         # optimal affinity based upon the NUMA topology
-        numa_aware_affinity = os.environ.get("TLLM_NUMA_AWARE_WORKER_AFFINITY")
         if ((numa_aware_affinity is None and not constrained_affinity)
                 or (numa_aware_affinity == "1")):
             process.cpu_affinity(get_numa_aware_cpu_affinity(device_id))
             logger.info(
                 f"Worker process {pid} CPU affinity set to "
                 f"{process.cpu_affinity()} for optimal NUMA-aware scheduling.")

     def _get_comm_ranks_device_id(self):
         device_id = self.global_rank % torch.cuda.device_count()
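Reviewer note: the env-var semantics above reduce to a three-way decision. A minimal sketch of that table, for illustration only (function and return-value names are not from the PR):

```python
def resolve_affinity_action(env_value, constrained):
    """Sketch of TLLM_NUMA_AWARE_WORKER_AFFINITY handling.

    Returns "numa"  -> pin the worker to NUMA-local CPUs,
            "reset" -> drop externally imposed constraints,
            "keep"  -> leave affinity untouched.
    """
    if env_value == "1":
        return "numa"  # explicitly requested: always auto-configure
    if env_value is None:
        # Default: auto-configure when unconstrained; otherwise remove
        # the external constraint but do not NUMA-pin on top of it.
        return "reset" if constrained else "numa"
    return "keep"      # "0" or any other value: opt out entirely
```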
13 changes: 5 additions & 8 deletions tests/integration/defs/accuracy/test_disaggregated_serving.py

@@ -95,7 +95,8 @@ def launch_disaggregated_llm(
         ctx_model: str = None,
         gen_model: str = None,
         server_waiting_timeout: int = DEFAULT_SERVER_WAITING_TIMEOUT,
-        max_workers: int = 16):
+        max_workers: int = 16,
+        enable_perf=False):
     temp_dir = tempfile.TemporaryDirectory()
     disaggregated_serving_config_path = os.path.join(
         temp_dir.name, "disaggregated_serving_config.yaml")
@@ -104,9 +105,7 @@
         print(
             f"Using unified tp parameter for testing is not recommended. Please use server configs instead."
         )
-
-    enable_perf = True
-    perf_max_requests = 10000
+    perf_max_requests = 50

     def _apply_perf_flags(cfg: Optional[Dict[str, Any]]):
         if not isinstance(cfg, dict):
@@ -120,6 +119,7 @@ def _apply_perf_flags(cfg: Optional[Dict[str, Any]]):
     _apply_perf_flags(disaggregated_server_config)
     _apply_perf_flags(ctx_server_config)
     _apply_perf_flags(gen_server_config)
+
     disaggregated_server_config = revise_disaggregated_server_config_urls_with_free_ports(
         disaggregated_server_config)

@@ -366,7 +366,7 @@ def _get_perf_metrics():
         except requests.exceptions.RequestException as e:
             print(f"Error fetching {perf_url}: {e}")

-    def _show_kvcache_time(kv_cache_perf_dir, max_lines=1000):
+    def _show_kvcache_time(kv_cache_perf_dir, max_lines=100):
         print(f"kv_cache_perf_dir: {kv_cache_perf_dir}")
         for file in os.listdir(kv_cache_perf_dir):
             print(f"file: {file}")
@@ -475,9 +475,6 @@ def test_auto_dtype(self, disable_overlap_scheduler, ctx_enable_block_reuse,
             "disable_overlap_scheduler": disable_overlap_scheduler,
             "kv_cache_config": {
                 "enable_block_reuse": gen_enable_block_reuse
-            },
-            "cache_transceiver_config": {
-                "backend": "DEFAULT"
             }
         }
+        gen_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"}
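With perf collection now opt-in via `enable_perf=False` by default, a test has to request it explicitly. A hypothetical call site, assuming `launch_disaggregated_llm` is used as a context manager with the config dicts from this file (leading arguments abbreviated; only `enable_perf` is the new parameter from this PR):

```python
# Sketch: opt in to perf-metric collection for one disaggregated run.
# The *_config dicts and model_name are assumed fixtures from
# test_disaggregated_serving.py; run_accuracy_suite is a hypothetical helper.
with launch_disaggregated_llm(disaggregated_server_config,
                              ctx_server_config,
                              gen_server_config,
                              model_name,
                              enable_perf=True) as llm:
    run_accuracy_suite(llm)
```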
6 changes: 0 additions & 6 deletions tests/integration/test_lists/waives.txt

@@ -349,8 +349,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-b
 accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441)
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441)
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5651824)
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] SKIP (https://nvbugs/5651854)
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] SKIP (https://nvbugs/5651854)
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_empty_batch[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5601682)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] SKIP (https://nvbugs/5655584)
 examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
@@ -370,10 +368,7 @@ unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEP] SKIP
 unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[MNNVL] SKIP (https://nvbugs/5664904)
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-FP8-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8] SKIP (https://nvbugs/5670469)
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-NVFP4-nvfp4-quantized/Llama-3_3-Nemotron-Super-49B-v1_nvfp4_hf] SKIP (https://nvbugs/5670469)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-True] SKIP (https://nvbugs/5670480)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] SKIP (https://nvbugs/5673610)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False] SKIP (https://nvbugs/5670480)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-True-False] SKIP (https://nvbugs/5670480)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5673559)
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5673578)
 examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen2.5_14b_instruct_int4-nb:4] SKIP (https://nvbugs/5666826)
@@ -393,7 +388,6 @@ unittest/_torch/speculative/test_draft_len_schedule.py::test_correctness_across_
 test_e2e.py::test_openai_responses SKIP (https://nvbugs/5635153)
 accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438)
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_llama_context_capacity[False-False-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/5688388)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-True-True] SKIP (https://nvbugs/5670480)
 accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True] SKIP (https://nvbugs/5688721)
 unittest/_torch/speculative/test_eagle3.py::test_llama_eagle3[True-FLASHINFER-False-False-False-False-True-False-False] SKIP (https://nvbugs/5691246)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5698897)