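This diff renames the `tp_num` keyword argument to `parallel_config` across the autotest benchmark suites (`test_longtext_performance.py`, `test_prefixcache_performance.py`, `test_throughput_performance.py`) and restructures the per-device configs (`config-3090.yaml`, `config-5080.yaml`) so that each model-list section nests its models under a `tp:` key.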
autotest/benchmark/test_longtext_performance.py (8 additions, 8 deletions)

```diff
@@ -5,51 +5,51 @@

 @pytest.mark.gpu_num_1
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1, kvint_list=[4, 8], is_longtext=True))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=1, kvint_list=[4, 8], is_longtext=True))
 def test_longtext_tp1(config, run_id, run_config, worker_id):
     result, msg = longtext_throughput_test(config,
                                            run_id,
                                            run_config,
-                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1),
+                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=1),
                                            worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_2
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2, kvint_list=[4, 8], is_longtext=True))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=2, kvint_list=[4, 8], is_longtext=True))
 def test_longtext_tp2(config, run_id, run_config, worker_id):
     result, msg = longtext_throughput_test(config,
                                            run_id,
                                            run_config,
-                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2),
+                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=2),
                                            worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_4
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, kvint_list=[4, 8], is_longtext=True))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=4, kvint_list=[4, 8], is_longtext=True))
 def test_longtext_tp4(config, run_id, run_config, worker_id):
     result, msg = longtext_throughput_test(config,
                                            run_id,
                                            run_config,
-                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4),
+                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=4),
                                            worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_8
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=8, kvint_list=[4, 8], is_longtext=True))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=8, kvint_list=[4, 8], is_longtext=True))
 def test_longtext_tp8(config, run_id, run_config, worker_id):
     result, msg = longtext_throughput_test(config,
                                            run_id,
                                            run_config,
-                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=8),
+                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=8),
                                            worker_id=worker_id)

     assert result, msg
```
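The change is mechanical at every call site: `tp_num=N` becomes `parallel_config=N` with the same value. The helper itself lives in the autotest utilities and is not part of this diff; as a minimal sketch, assuming pytest-xdist worker ids of the form `gw0`, `gw1`, ... and a contiguous GPU assignment per worker, it could look like this (hypothetical implementation, not the repository's code):

```python
# Hypothetical sketch of the renamed helper, not the repository's actual code.
def get_cuda_prefix_by_workerid(worker_id: str, parallel_config: int = 1) -> str:
    """Map a pytest-xdist worker id (e.g. 'gw1') to a CUDA_VISIBLE_DEVICES prefix.

    With parallel_config=2, worker gw0 would get GPUs 0,1 and gw1 GPUs 2,3,
    so concurrently running workers never share a device.
    """
    if worker_id in (None, 'master'):  # not running under pytest-xdist
        index = 0
    else:
        index = int(worker_id.replace('gw', ''))  # 'gw3' -> 3
    first = index * parallel_config
    devices = ','.join(str(first + i) for i in range(parallel_config))
    return f'CUDA_VISIBLE_DEVICES={devices}'


# Example: worker 'gw1' with parallel_config=4 -> 'CUDA_VISIBLE_DEVICES=4,5,6,7'
```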
autotest/benchmark/test_prefixcache_performance.py (8 additions, 8 deletions)

```diff
@@ -5,51 +5,51 @@

 @pytest.mark.gpu_num_1
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=1, kvint_list=[4, 8]))
 def test_prefixcache_throughput_tp1(config, run_id, run_config, worker_id):
     result, msg = prefixcache_throughput_test(config,
                                               run_id,
                                               run_config,
-                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1),
+                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=1),
                                               worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_2
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=2, kvint_list=[4, 8]))
 def test_prefixcache_throughput_tp2(config, run_id, run_config, worker_id):
     result, msg = prefixcache_throughput_test(config,
                                               run_id,
                                               run_config,
-                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2),
+                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=2),
                                               worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_4
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=4, kvint_list=[4, 8]))
 def test_prefixcache_throughput_tp4(config, run_id, run_config, worker_id):
     result, msg = prefixcache_throughput_test(config,
                                               run_id,
                                               run_config,
-                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4),
+                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=4),
                                               worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_8
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=4, kvint_list=[4, 8]))
 def test_prefixcache_throughput_tp8(config, run_id, run_config, worker_id):
     result, msg = prefixcache_throughput_test(config,
                                               run_id,
                                               run_config,
-                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=8),
+                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=8),
                                               worker_id=worker_id)

     assert result, msg
```
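One carry-over worth noting: `test_prefixcache_throughput_tp8` (and `test_throughput_tp8` in the next file) builds its model list with `parallel_config=4` while running under `gpu_num_8` and prefixing with `parallel_config=8`; the mismatch predates this change, which only renames the keyword. `get_benchmark_model_list` itself is not shown in this diff. As a minimal sketch, assuming the list is a cross product of per-GPU-count models and kv-cache quantization settings, it could take the shape below (model names and result fields are illustrative, not the repository's actual schema):

```python
from itertools import product


# Hypothetical sketch of the parametrize source; field names are assumptions.
def get_benchmark_model_list(parallel_config, kvint_list=(), is_longtext=False):
    # Which models run at which GPU count is configuration, not shown in this diff.
    models_by_gpus = {
        1: ['internlm/internlm3-8b-instruct', 'Qwen/Qwen3-8B'],
        2: ['Qwen/Qwen2.5-7B-Instruct'],
    }
    models = models_by_gpus.get(parallel_config, [])
    quant_policies = [0, *kvint_list]  # 0 = kv cache left unquantized
    return [
        dict(model=model,
             quant_policy=policy,
             parallel_config=parallel_config,
             is_longtext=is_longtext)
        for model, policy in product(models, quant_policies)
    ]


# get_benchmark_model_list(parallel_config=1, kvint_list=[4, 8]) would yield
# six run configs: two models x three quantization settings.
```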
autotest/benchmark/test_throughput_performance.py (9 additions, 9 deletions)

```diff
@@ -7,51 +7,51 @@

 @pytest.mark.gpu_num_1
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=1, kvint_list=[4, 8]))
 def test_throughput_tp1(config, run_id, run_config, worker_id):
     result, msg = throughput_test(config,
                                   run_id,
                                   run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1),
+                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=1),
                                   worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_2
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=2, kvint_list=[4, 8]))
 def test_throughput_tp2(config, run_id, run_config, worker_id):
     result, msg = throughput_test(config,
                                   run_id,
                                   run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2),
+                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=2),
                                   worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_4
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=4, kvint_list=[4, 8]))
 def test_throughput_tp4(config, run_id, run_config, worker_id):
     result, msg = throughput_test(config,
                                   run_id,
                                   run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4),
+                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=4),
                                   worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_8
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=4, kvint_list=[4, 8]))
 def test_throughput_tp8(config, run_id, run_config, worker_id):
     result, msg = throughput_test(config,
                                   run_id,
                                   run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=8),
+                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=8),
                                   worker_id=worker_id)

     assert result, msg
@@ -73,7 +73,7 @@ def test_throughput_func_tp2(config, run_id, run_config, worker_id):
     result, msg = throughput_test(config,
                                   run_id,
                                   run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2),
+                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=2),
                                   worker_id=worker_id,
                                   is_smoke=True)

```
autotest/config-3090.yaml (46 additions, 40 deletions)

```diff
@@ -9,56 +9,62 @@ tp_config:
   empty: 2

 turbomind_chat_model:
-  - meta-llama/Llama-3.2-3B-Instruct
-  - meta-llama/Llama-3.2-1B-Instruct
-  - internlm/internlm3-8b-instruct
-  - OpenGVLab/InternVL3-8B
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
-  - Qwen/Qwen3-8B
-  - Qwen/Qwen3-4B
-  - Qwen/Qwen3-1.7B
-  - Qwen/Qwen3-0.6B
-  - Qwen/Qwen2.5-7B-Instruct
+  tp:
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Llama-3.2-1B-Instruct
+    - internlm/internlm3-8b-instruct
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B
+    - Qwen/Qwen3-8B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-1.7B
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen2.5-7B-Instruct

 pytorch_chat_model:
-  - meta-llama/Llama-3.2-3B-Instruct
-  - meta-llama/Llama-3.2-1B-Instruct
-  - internlm/internlm3-8b-instruct
-  - OpenGVLab/InternVL3-8B
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
-  - Qwen/Qwen3-8B
-  - Qwen/Qwen3-4B
-  - Qwen/Qwen3-1.7B
-  - Qwen/Qwen3-0.6B
-  - Qwen/Qwen2.5-7B-Instruct
-  - Qwen/Qwen2.5-VL-3B-Instruct
-  - Qwen/Qwen2.5-VL-7B-Instruct
+  tp:
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Llama-3.2-1B-Instruct
+    - internlm/internlm3-8b-instruct
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B
+    - Qwen/Qwen3-8B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-1.7B
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen2.5-7B-Instruct
+    - Qwen/Qwen2.5-VL-3B-Instruct
+    - Qwen/Qwen2.5-VL-7B-Instruct

 turbomind_vl_model:
-  - OpenGVLab/InternVL3-8B
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
+  tp:
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B

 pytorch_vl_model:
-  - OpenGVLab/InternVL3-8B
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
-  - Qwen/Qwen2.5-VL-3B-Instruct
-  - Qwen/Qwen2.5-VL-7B-Instruct
+  tp:
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B
+    - Qwen/Qwen2.5-VL-3B-Instruct
+    - Qwen/Qwen2.5-VL-7B-Instruct

 turbomind_base_model:
-  - internlm/internlm3-8b-instruct
-  - Qwen/Qwen3-8B
+  tp:
+    - internlm/internlm3-8b-instruct
+    - Qwen/Qwen3-8B

 pytorch_base_model:
-  - internlm/internlm3-8b-instruct
-  - Qwen/Qwen3-8B
+  tp:
+    - internlm/internlm3-8b-instruct
+    - Qwen/Qwen3-8B

 turbomind_quatization:
   no_awq:
```
autotest/config-5080.yaml (32 additions, 26 deletions)

```diff
@@ -9,42 +9,48 @@ tp_config:
   empty: 2

 turbomind_chat_model:
-  - meta-llama/Llama-3.2-3B-Instruct
-  - meta-llama/Llama-3.2-1B-Instruct
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
-  - Qwen/Qwen3-4B
-  - Qwen/Qwen3-1.7B
-  - Qwen/Qwen3-0.6B
+  tp:
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Llama-3.2-1B-Instruct
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-1.7B
+    - Qwen/Qwen3-0.6B

 pytorch_chat_model:
-  - meta-llama/Llama-3.2-3B-Instruct
-  - meta-llama/Llama-3.2-1B-Instruct
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
-  - Qwen/Qwen3-4B
-  - Qwen/Qwen3-1.7B
-  - Qwen/Qwen3-0.6B
-  - Qwen/Qwen2.5-VL-3B-Instruct
+  tp:
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Llama-3.2-1B-Instruct
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-1.7B
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen2.5-VL-3B-Instruct

 turbomind_vl_model:
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
+  tp:
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B

 pytorch_vl_model:
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
-  - Qwen/Qwen2.5-VL-3B-Instruct
+  tp:
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B
+    - Qwen/Qwen2.5-VL-3B-Instruct

 turbomind_base_model:
-  - Qwen/Qwen3-4B
+  tp:
+    - Qwen/Qwen3-4B

 pytorch_base_model:
-  - Qwen/Qwen3-4B
+  tp:
+    - Qwen/Qwen3-4B

 turbomind_quatization:
   no_awq:
```
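Both `config-3090.yaml` and `config-5080.yaml` get the same restructuring: every model-list section moves its models one level down under a `tp:` key, which leaves room for other parallelism layouts to be added beside it later. A minimal read-side sketch, assuming the configs are loaded with PyYAML (only the file path and key layout come from this diff; the rest is illustrative):

```python
import yaml

# Sketch of consuming the restructured config.
with open('autotest/config-3090.yaml') as f:
    cfg = yaml.safe_load(f)

# Old layout: the section mapped directly to a list of model names.
#   chat_models = cfg['turbomind_chat_model']
# New layout: the list is nested one level down under 'tp'.
chat_models = cfg['turbomind_chat_model']['tp']
vl_models = cfg['pytorch_vl_model']['tp']
print(chat_models[0])  # e.g. 'meta-llama/Llama-3.2-3B-Instruct'
```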