Commit ae57738

[https://nvbugs/5547414][fix] Use cached models (#8755)
Signed-off-by: Hui Gao <huig@nvidia.com>
1 parent a2e964d commit ae57738

File tree

3 files changed (+2, -3 lines)

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 1 addition & 1 deletion

@@ -2832,7 +2832,7 @@ def test_dummy_load_format(self):
                          False,
                          True,
                          True,
-                         False,
+                         True,
                          marks=pytest.mark.skip_less_mpi_world_size(8))],
                     ids=["latency", "multi_gpus_no_cache"])
    def test_bf16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,

tests/integration/defs/triton_server/test_triton_llm.py

Lines changed: 1 addition & 1 deletion

@@ -3708,7 +3708,7 @@ def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
        f"--test-llmapi",
        'dataset',
        f"--dataset={os.path.join(llm_backend_dataset_root, 'mini_cnn_eval.json')}",
-       f"--tokenizer-dir=TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+       f"--tokenizer-dir={tiny_llama_model_root}",
    ]

    print_info("DEBUG:: run_cmd: python3 " + " ".join(run_cmd))
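The change above replaces a hard-coded HuggingFace model id with a locally cached model path, so the test no longer downloads the tokenizer from the hub. A minimal sketch of that resolve-from-cache pattern (the helper name `resolve_tokenizer_dir`, the `LLM_MODELS_ROOT` variable, and the cache layout are assumptions for illustration, not the repo's actual helpers):

```python
import os

# Hypothetical helper: prefer a locally cached model directory and fall
# back to the HuggingFace model id when no usable cache is found.
def resolve_tokenizer_dir(cache_root, model_subdir, hf_model_id):
    if cache_root:
        candidate = os.path.join(cache_root, model_subdir)
        if os.path.isdir(candidate):
            # Cached copy exists: use it and avoid any network download.
            return candidate
    # No cache configured or directory missing: fall back to the hub id.
    return hf_model_id

tokenizer_dir = resolve_tokenizer_dir(
    os.environ.get("LLM_MODELS_ROOT"),      # assumed env var for the cache root
    "TinyLlama-1.1B-Chat-v1.0",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
run_cmd_arg = f"--tokenizer-dir={tokenizer_dir}"
```

In the commit itself, the equivalent of this lookup is carried by the `tiny_llama_model_root` fixture/variable substituted into the command line.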

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 1 deletion

@@ -387,7 +387,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=4] SKIP (https://nvbugs/5582277)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2] SKIP (https://nvbugs/5582277)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4] SKIP (https://nvbugs/5582277)
-accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] SKIP (https://nvbugs/5547414)
 triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414)
 unittest/executor/test_rpc_proxy.py SKIP (https://nvbugs/5605741)
 full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696)
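The waives.txt entries above follow a `<test id> SKIP (<bug url>)` line format, and this commit removes the waive for the now-fixed `test_bf16[multi_gpus_no_cache]` case. A minimal sketch of parsing one such line (the format is inferred from the entries shown, not from documented tooling):

```python
import re

# Assumed waive-line format, inferred from the waives.txt entries above:
#   <test id> SKIP (<bug url>)
WAIVE_RE = re.compile(r"^(?P<test>\S+)\s+SKIP\s+\((?P<bug>[^)]+)\)$")

line = ("triton_server/test_triton.py::test_llava[llava] "
        "SKIP (https://nvbugs/5547414)")
m = WAIVE_RE.match(line)
test_id = m.group("test")  # the pytest node id to skip
bug_url = m.group("bug")   # the tracking bug for the waive
```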

0 commit comments