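This diff renames the `tp_num` keyword argument to `parallel_config` across the autotest benchmark suites (`test_longtext_performance.py`, `test_prefixcache_performance.py`, `test_throughput_performance.py`) and restructures the per-device configs (`config-3090.yaml`, `config-5080.yaml`) so that each model-list section nests its models under a `tp:` key.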
autotest/benchmark/test_longtext_performance.py (8 additions, 8 deletions)

```diff
@@ -5,51 +5,51 @@

 @pytest.mark.gpu_num_1
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1, kvint_list=[4, 8], is_longtext=True))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=1, kvint_list=[4, 8], is_longtext=True))
 def test_longtext_tp1(config, run_id, run_config, worker_id):
     result, msg = longtext_throughput_test(config,
                                            run_id,
                                            run_config,
-                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1),
+                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=1),
                                            worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_2
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2, kvint_list=[4, 8], is_longtext=True))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=2, kvint_list=[4, 8], is_longtext=True))
 def test_longtext_tp2(config, run_id, run_config, worker_id):
     result, msg = longtext_throughput_test(config,
                                            run_id,
                                            run_config,
-                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2),
+                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=2),
                                            worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_4
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, kvint_list=[4, 8], is_longtext=True))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=4, kvint_list=[4, 8], is_longtext=True))
 def test_longtext_tp4(config, run_id, run_config, worker_id):
     result, msg = longtext_throughput_test(config,
                                            run_id,
                                            run_config,
-                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4),
+                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=4),
                                            worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_8
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=8, kvint_list=[4, 8], is_longtext=True))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=8, kvint_list=[4, 8], is_longtext=True))
 def test_longtext_tp8(config, run_id, run_config, worker_id):
     result, msg = longtext_throughput_test(config,
                                            run_id,
                                            run_config,
-                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=8),
+                                           cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=8),
                                            worker_id=worker_id)

     assert result, msg
```
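The change is mechanical at every call site: `tp_num=N` becomes `parallel_config=N` with the same value. The helper itself lives in the autotest utilities and is not part of this diff; as a minimal sketch, assuming pytest-xdist worker ids of the form `gw0`, `gw1`, ... and a contiguous GPU assignment per worker, it could look like this (hypothetical implementation, not the repository's code):

```python
# Hypothetical sketch of the renamed helper, not the repository's actual code.
def get_cuda_prefix_by_workerid(worker_id: str, parallel_config: int = 1) -> str:
    """Map a pytest-xdist worker id (e.g. 'gw1') to a CUDA_VISIBLE_DEVICES prefix.

    With parallel_config=2, worker gw0 would get GPUs 0,1 and gw1 GPUs 2,3,
    so concurrently running workers never share a device.
    """
    if worker_id in (None, 'master'):  # not running under pytest-xdist
        index = 0
    else:
        index = int(worker_id.replace('gw', ''))  # 'gw3' -> 3
    first = index * parallel_config
    devices = ','.join(str(first + i) for i in range(parallel_config))
    return f'CUDA_VISIBLE_DEVICES={devices}'


# Example: worker 'gw1' with parallel_config=4 -> 'CUDA_VISIBLE_DEVICES=4,5,6,7'
```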
autotest/benchmark/test_prefixcache_performance.py (8 additions, 8 deletions)

```diff
@@ -5,51 +5,51 @@

 @pytest.mark.gpu_num_1
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=1, kvint_list=[4, 8]))
 def test_prefixcache_throughput_tp1(config, run_id, run_config, worker_id):
     result, msg = prefixcache_throughput_test(config,
                                               run_id,
                                               run_config,
-                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1),
+                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=1),
                                               worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_2
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=2, kvint_list=[4, 8]))
 def test_prefixcache_throughput_tp2(config, run_id, run_config, worker_id):
     result, msg = prefixcache_throughput_test(config,
                                               run_id,
                                               run_config,
-                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2),
+                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=2),
                                               worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_4
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=4, kvint_list=[4, 8]))
 def test_prefixcache_throughput_tp4(config, run_id, run_config, worker_id):
     result, msg = prefixcache_throughput_test(config,
                                               run_id,
                                               run_config,
-                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4),
+                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=4),
                                               worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_8
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=4, kvint_list=[4, 8]))
 def test_prefixcache_throughput_tp8(config, run_id, run_config, worker_id):
     result, msg = prefixcache_throughput_test(config,
                                               run_id,
                                               run_config,
-                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=8),
+                                              cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=8),
                                               worker_id=worker_id)

     assert result, msg
```
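One carry-over worth noting: `test_prefixcache_throughput_tp8` (and `test_throughput_tp8` in the next file) builds its model list with `parallel_config=4` while running under `gpu_num_8` and prefixing with `parallel_config=8`; the mismatch predates this change, which only renames the keyword. `get_benchmark_model_list` itself is not shown in this diff. As a minimal sketch, assuming the list is a cross product of per-GPU-count models and kv-cache quantization settings, it could take the shape below (model names and result fields are illustrative, not the repository's actual schema):

```python
from itertools import product


# Hypothetical sketch of the parametrize source; field names are assumptions.
def get_benchmark_model_list(parallel_config, kvint_list=(), is_longtext=False):
    # Which models run at which GPU count is configuration, not shown in this diff.
    models_by_gpus = {
        1: ['internlm/internlm3-8b-instruct', 'Qwen/Qwen3-8B'],
        2: ['Qwen/Qwen2.5-7B-Instruct'],
    }
    models = models_by_gpus.get(parallel_config, [])
    quant_policies = [0, *kvint_list]  # 0 = kv cache left unquantized
    return [
        dict(model=model,
             quant_policy=policy,
             parallel_config=parallel_config,
             is_longtext=is_longtext)
        for model, policy in product(models, quant_policies)
    ]


# get_benchmark_model_list(parallel_config=1, kvint_list=[4, 8]) would yield
# six run configs: two models x three quantization settings.
```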
autotest/benchmark/test_throughput_performance.py (9 additions, 9 deletions)

```diff
@@ -7,51 +7,51 @@

 @pytest.mark.gpu_num_1
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=1, kvint_list=[4, 8]))
 def test_throughput_tp1(config, run_id, run_config, worker_id):
     result, msg = throughput_test(config,
                                   run_id,
                                   run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1),
+                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=1),
                                   worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_2
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=2, kvint_list=[4, 8]))
 def test_throughput_tp2(config, run_id, run_config, worker_id):
     result, msg = throughput_test(config,
                                   run_id,
                                   run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2),
+                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=2),
                                   worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_4
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=4, kvint_list=[4, 8]))
 def test_throughput_tp4(config, run_id, run_config, worker_id):
     result, msg = throughput_test(config,
                                   run_id,
                                   run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4),
+                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=4),
                                   worker_id=worker_id)

     assert result, msg


 @pytest.mark.gpu_num_8
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, kvint_list=[4, 8]))
+@pytest.mark.parametrize('run_config', get_benchmark_model_list(parallel_config=4, kvint_list=[4, 8]))
 def test_throughput_tp8(config, run_id, run_config, worker_id):
     result, msg = throughput_test(config,
                                   run_id,
                                   run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=8),
+                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=8),
                                   worker_id=worker_id)

     assert result, msg
@@ -73,7 +73,7 @@ def test_throughput_func_tp2(config, run_id, run_config, worker_id):
     result, msg = throughput_test(config,
                                   run_id,
                                   run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2),
+                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, parallel_config=2),
                                   worker_id=worker_id,
                                   is_smoke=True)

```
autotest/config-3090.yaml (46 additions, 40 deletions)

```diff
@@ -9,56 +9,62 @@ tp_config:
   empty: 2

 turbomind_chat_model:
-  - meta-llama/Llama-3.2-3B-Instruct
-  - meta-llama/Llama-3.2-1B-Instruct
-  - internlm/internlm3-8b-instruct
-  - OpenGVLab/InternVL3-8B
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
-  - Qwen/Qwen3-8B
-  - Qwen/Qwen3-4B
-  - Qwen/Qwen3-1.7B
-  - Qwen/Qwen3-0.6B
-  - Qwen/Qwen2.5-7B-Instruct
+  tp:
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Llama-3.2-1B-Instruct
+    - internlm/internlm3-8b-instruct
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B
+    - Qwen/Qwen3-8B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-1.7B
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen2.5-7B-Instruct

 pytorch_chat_model:
-  - meta-llama/Llama-3.2-3B-Instruct
-  - meta-llama/Llama-3.2-1B-Instruct
-  - internlm/internlm3-8b-instruct
-  - OpenGVLab/InternVL3-8B
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
-  - Qwen/Qwen3-8B
-  - Qwen/Qwen3-4B
-  - Qwen/Qwen3-1.7B
-  - Qwen/Qwen3-0.6B
-  - Qwen/Qwen2.5-7B-Instruct
-  - Qwen/Qwen2.5-VL-3B-Instruct
-  - Qwen/Qwen2.5-VL-7B-Instruct
+  tp:
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Llama-3.2-1B-Instruct
+    - internlm/internlm3-8b-instruct
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B
+    - Qwen/Qwen3-8B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-1.7B
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen2.5-7B-Instruct
+    - Qwen/Qwen2.5-VL-3B-Instruct
+    - Qwen/Qwen2.5-VL-7B-Instruct

 turbomind_vl_model:
-  - OpenGVLab/InternVL3-8B
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
+  tp:
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B

 pytorch_vl_model:
-  - OpenGVLab/InternVL3-8B
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
-  - Qwen/Qwen2.5-VL-3B-Instruct
-  - Qwen/Qwen2.5-VL-7B-Instruct
+  tp:
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B
+    - Qwen/Qwen2.5-VL-3B-Instruct
+    - Qwen/Qwen2.5-VL-7B-Instruct

 turbomind_base_model:
-  - internlm/internlm3-8b-instruct
-  - Qwen/Qwen3-8B
+  tp:
+    - internlm/internlm3-8b-instruct
+    - Qwen/Qwen3-8B

 pytorch_base_model:
-  - internlm/internlm3-8b-instruct
-  - Qwen/Qwen3-8B
+  tp:
+    - internlm/internlm3-8b-instruct
+    - Qwen/Qwen3-8B

 turbomind_quatization:
   no_awq:
```
autotest/config-5080.yaml (32 additions, 26 deletions)

```diff
@@ -9,42 +9,48 @@ tp_config:
   empty: 2

 turbomind_chat_model:
-  - meta-llama/Llama-3.2-3B-Instruct
-  - meta-llama/Llama-3.2-1B-Instruct
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
-  - Qwen/Qwen3-4B
-  - Qwen/Qwen3-1.7B
-  - Qwen/Qwen3-0.6B
+  tp:
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Llama-3.2-1B-Instruct
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-1.7B
+    - Qwen/Qwen3-0.6B

 pytorch_chat_model:
-  - meta-llama/Llama-3.2-3B-Instruct
-  - meta-llama/Llama-3.2-1B-Instruct
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
-  - Qwen/Qwen3-4B
-  - Qwen/Qwen3-1.7B
-  - Qwen/Qwen3-0.6B
-  - Qwen/Qwen2.5-VL-3B-Instruct
+  tp:
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Llama-3.2-1B-Instruct
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-1.7B
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen2.5-VL-3B-Instruct

 turbomind_vl_model:
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
+  tp:
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B

 pytorch_vl_model:
-  - OpenGVLab/InternVL3-2B-Instruct
-  - OpenGVLab/InternVL3-1B-Instruct
-  - OpenGVLab/InternVL2_5-1B
-  - Qwen/Qwen2.5-VL-3B-Instruct
+  tp:
+    - OpenGVLab/InternVL3-2B-Instruct
+    - OpenGVLab/InternVL3-1B-Instruct
+    - OpenGVLab/InternVL2_5-1B
+    - Qwen/Qwen2.5-VL-3B-Instruct

 turbomind_base_model:
-  - Qwen/Qwen3-4B
+  tp:
+    - Qwen/Qwen3-4B

 pytorch_base_model:
-  - Qwen/Qwen3-4B
+  tp:
+    - Qwen/Qwen3-4B

 turbomind_quatization:
   no_awq:
```
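Both `config-3090.yaml` and `config-5080.yaml` get the same restructuring: every model-list section moves its models one level down under a `tp:` key, which leaves room for other parallelism layouts to be added beside it later. A minimal read-side sketch, assuming the configs are loaded with PyYAML (only the file path and key layout come from this diff; the rest is illustrative):

```python
import yaml

# Sketch of consuming the restructured config.
with open('autotest/config-3090.yaml') as f:
    cfg = yaml.safe_load(f)

# Old layout: the section mapped directly to a list of model names.
#   chat_models = cfg['turbomind_chat_model']
# New layout: the list is nested one level down under 'tp'.
chat_models = cfg['turbomind_chat_model']['tp']
vl_models = cfg['pytorch_vl_model']['tp']
print(chat_models[0])  # e.g. 'meta-llama/Llama-3.2-3B-Instruct'
```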