@@ -0,0 +1,52 @@
model: llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
hostname: localhost
port: 8000
backend: pytorch

context_servers:
  num_instances: 1
  tensor_parallel_size: 4
  pipeline_parallel_size: 1
  moe_expert_parallel_size: 1
  enable_attention_dp: false
  max_num_tokens: 8192
  max_seq_len: 257000
  max_input_len: 256000
  max_batch_size: 1
  trust_remote_code: true
  enable_chunked_prefill: true
  kv_cache_config:
    enable_block_reuse: false
    free_gpu_memory_fraction: 0.3
  disable_overlap_scheduler: true
  cuda_graph_config: null
  cache_transceiver_config:
    backend: UCX
    # Intentionally small to reproduce buffer overflow bug
    max_tokens_in_buffer: 2048
  urls:
    - "localhost:8001"

generation_servers:
  num_instances: 1
  tensor_parallel_size: 4
  pipeline_parallel_size: 1
  moe_expert_parallel_size: 1
  enable_attention_dp: false
  max_num_tokens: 8192
  max_seq_len: 257000
  max_input_len: 256000
  max_batch_size: 1
  trust_remote_code: true
  enable_chunked_prefill: true
  kv_cache_config:
    enable_block_reuse: false
    free_gpu_memory_fraction: 0.3
  disable_overlap_scheduler: true
  cuda_graph_config: null
  cache_transceiver_config:
    backend: UCX
    # Intentionally small to reproduce buffer overflow bug
    max_tokens_in_buffer: 2048
  urls:
    - "localhost:8002"
138 changes: 138 additions & 0 deletions tests/integration/defs/disaggregated/test_disaggregated.py
@@ -265,6 +265,8 @@ def get_test_config(test_desc, example_dir, test_root):
        (3,
         f"{test_configs_root}/disagg_config_deepseek_v3_lite_empty_batch.yaml"
         ),
        "llama4_kv_cache_overflow":
        (8, f"{test_configs_root}/disagg_config_llama4_kv_cache_overflow.yaml"),
    }

    if test_desc not in config_map:
@@ -1685,6 +1687,104 @@ def get_config_for_benchmark(model_root, backend):
    return serve_config


def run_disaggregated_genai_perf(config_file,
                                 model_path,
                                 num_ranks,
                                 server_start_timeout=1200,
                                 input_tokens=128000,
                                 output_tokens=100,
                                 env=None,
                                 cwd=None):
    """Run disaggregated test with genai-perf for performance/stress testing."""
    cleanup_output_files()
    # Fall back to the current environment if no env dict is provided.
    run_env = (env or os.environ).copy()
    run_env["UCX_TLS"] = "^ib"

    workers_cmd = [
        'mpirun', '--allow-run-as-root', '--oversubscribe', '-n',
        str(num_ranks), 'trtllm-serve', 'disaggregated_mpi_worker', '-c',
        config_file
    ]

    server_cmd = [
        'trtllm-serve', 'disaggregated', '--server_start_timeout',
        str(server_start_timeout), '-c', config_file
    ]

    artifact_dir = os.path.join(cwd or ".", "benchmark-results")

    # Initialized so the finally block is safe even if startup fails.
    workers_proc = None
    server_proc = None
    try:
        with (open('output_workers.log', 'w') as output_workers,
              popen(workers_cmd,
                    stdout=output_workers,
                    stderr=subprocess.STDOUT,
                    env=run_env,
                    cwd=cwd) as workers_proc,
              open('output_disagg.log', 'w') as output_disagg,
              popen(server_cmd,
                    stdout=output_disagg,
                    stderr=subprocess.STDOUT,
                    env=run_env,
                    cwd=cwd) as server_proc):

            # Wait for server to be ready
            if not wait_for_server(
                    "localhost", 8000, timeout_seconds=server_start_timeout):
                raise RuntimeError(
                    f"Disaggregated server did not become ready within {server_start_timeout} seconds"
                )

            # Run genai-perf
            genai_perf_cmd = [
                'genai-perf', 'profile', '--model', model_path, '--tokenizer',
                model_path, '--endpoint-type', 'chat', '--endpoint',
                '/v1/chat/completions', '--streaming', '--url',
                'localhost:8000', '--synthetic-input-tokens-mean',
                str(input_tokens), '--synthetic-input-tokens-stddev', '0',
                '--output-tokens-mean',
                str(output_tokens), '--output-tokens-stddev', '0',
                '--extra-inputs', f'max_tokens:{output_tokens}',
                '--extra-inputs', f'min_tokens:{output_tokens}',
                '--extra-inputs', 'ignore_eos:true', '--concurrency', '1',
                '--warmup-request-count', '8', '--num-dataset-entries', '64',
                '--random-seed', '100', '--artifact-dir', artifact_dir, '--',
                '-v', '-H', 'Authorization: Bearer NOT USED', '-H',
                'Accept: text/event-stream', '-p', '200000'
            ]

            check_call(genai_perf_cmd,
                       env=env,
                       poll_procs=[workers_proc, server_proc])

    except Exception:
        # Print outputs on error
        logger.error("-------- Workers output (last 30 lines) --------")
        try:
            with open('output_workers.log', 'r') as f:
                lines = f.read().split('\n')
                for line in lines[-30:]:
                    if line.strip():
                        logger.error(line)
        except FileNotFoundError:
            pass

        logger.error("-------- Disagg server output (last 30 lines) --------")
        try:
            with open('output_disagg.log', 'r') as f:
                lines = f.read().split('\n')
                for line in lines[-30:]:
                    if line.strip():
                        logger.error(line)
        except FileNotFoundError:
            pass
        raise
    finally:
        # Processes may still be None if startup failed before they launched.
        for proc in (server_proc, workers_proc):
            if proc is not None:
                proc.terminate()
        for proc in (server_proc, workers_proc):
            if proc is not None:
                proc.wait()


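The two log-tailing blocks in the except handler above repeat the same read/filter/log logic for output_workers.log and output_disagg.log. If this pattern spreads further, it could be factored into a small helper along these lines (a sketch only; _tail_log is not an existing helper in this file):

def _tail_log(path, log, header, num_lines=30):
    """Log the last `num_lines` non-empty lines of `path`, if it exists."""
    log.error(f"-------- {header} (last {num_lines} lines) --------")
    try:
        with open(path, 'r') as f:
            for line in f.read().split('\n')[-num_lines:]:
                if line.strip():
                    log.error(line)
    except FileNotFoundError:
        pass

The except handler would then reduce to two calls such as _tail_log('output_workers.log', logger, 'Workers output') and _tail_log('output_disagg.log', logger, 'Disagg server output').
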
@pytest.mark.parametrize("benchmark_model_root", [
    'DeepSeek-V3-Lite-fp8', 'DeepSeek-V3-Lite-bf16', 'llama-v3-8b-hf',
    'llama-3.1-8b-instruct-hf-fp8'
@@ -1768,3 +1868,41 @@ def test_disaggregated_deepseek_v3_lite_bf16_empty_batch(
print(f"E2EL: {e2el} ms, TTFT: {ttft} ms")

assert e2el > 0 and ttft > 0


@pytest.mark.skip_less_device(8)
@pytest.mark.skip_less_device_memory(140000)
@pytest.mark.parametrize(
    "model_path",
    ['llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8'])
def test_llama4_long_context_kv_cache_overflow(disaggregated_test_root,
                                               disaggregated_example_root,
                                               llm_venv, model_path):
    """
    RCCA: https://nvbugspro.nvidia.com/bug/5555681
    Test to reproduce KV cache buffer overflow bug with long context.
    """
    models_root = llm_models_root()
    llama4_model_root = os.path.join(models_root, model_path)

    # Create symlink to match config file path
    src_dst_dict = {
        llama4_model_root: f"{llm_venv.get_working_directory()}/{model_path}",
    }
    for src, dst in src_dst_dict.items():
        if not os.path.islink(dst):
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            os.symlink(src, dst, target_is_directory=True)

    num_ranks, config_file = get_test_config("llama4_kv_cache_overflow",
                                             disaggregated_example_root,
                                             os.path.dirname(__file__))

    run_disaggregated_genai_perf(config_file=config_file,
                                 model_path=llama4_model_root,
                                 num_ranks=num_ranks,
                                 server_start_timeout=1200,
                                 input_tokens=128000,
                                 output_tokens=100,
                                 env=llm_venv._new_env,
                                 cwd=llm_venv.get_working_directory())
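
As written, the test relies on run_disaggregated_genai_perf raising (and dumping the last lines of both logs) when the workers or the genai-perf run fail. If a specific failure signature for the KV-cache buffer overflow were known, a follow-up assertion could scan the worker log for it, roughly along these lines (a sketch only; the substrings below are hypothetical placeholders, not messages TensorRT-LLM is known to emit, and the helper does not exist in this file):

def assert_no_kv_cache_transfer_errors(log_path='output_workers.log',
                                       patterns=('buffer overflow',
                                                 'max_tokens_in_buffer')):
    """Hypothetical check: fail if the worker log mentions transfer errors."""
    if not os.path.exists(log_path):
        return
    with open(log_path, 'r') as f:
        log_text = f.read().lower()
    for pattern in patterns:
        # `patterns` are placeholders and would need to match real log output.
        assert pattern not in log_text, (
            f"Suspected KV-cache transfer error '{pattern}' found in {log_path}")
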
1 change: 1 addition & 0 deletions tests/integration/test_lists/qa/llm_function_core.txt
@@ -782,6 +782,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_att
disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_llama4_long_context_kv_cache_overflow[llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8]
disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-True-Qwen3-8B-FP8]
disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8]