40 changes: 39 additions & 1 deletion tests/integration/defs/accuracy/test_cli_flow.py
@@ -19,7 +19,8 @@
from tensorrt_llm.quantization import QuantAlgo

from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_nvls,
skip_pre_ada, skip_pre_blackwell, skip_pre_hopper)
skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
skip_pre_hopper)
from .accuracy_core import (MMLU, CliFlowAccuracyTestHarness, CnnDailymail,
Humaneval, PassKeyRetrieval64k,
PassKeyRetrieval128k, SlimPajama6B, ZeroScrolls)
@@ -57,6 +58,7 @@ def test_weight_only(self, precision: str):
def test_int8_kv_cache(self):
self.run(kv_cache_quant_algo=QuantAlgo.INT8)

@skip_post_blackwell
@parametrize_with_ids("per_token,per_channel", [(False, False),
(True, True)])
def test_smooth_quant(self, per_token: bool, per_channel: bool):
@@ -142,6 +144,7 @@ class TestStarcoder2_15B(CliFlowAccuracyTestHarness):
MODEL_PATH = f"{llm_models_root()}/starcoder2-model"
EXAMPLE_FOLDER = "gpt"

@skip_post_blackwell
def test_smooth_quant_ootb(self):
self.run(tasks=[Humaneval(self.MODEL_NAME)],
quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL)
@@ -194,9 +197,11 @@ class TestPhi2(CliFlowAccuracyTestHarness):
MODEL_PATH = f"{llm_models_root()}/phi-2"
EXAMPLE_FOLDER = "phi"

@skip_post_blackwell
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
@pytest.mark.skip_less_device(2)
def test_tp2(self):
self.run(tp_size=2)
@@ -316,6 +321,7 @@ def test_medusa(self, cuda_graph, mocker):
extra_build_args=["--speculative_decoding_mode=medusa"],
extra_summarize_args=extra_summarize_args)

@skip_post_blackwell
@parametrize_with_ids("cuda_graph,chunked_context,typical_acceptance",
[(False, False, False), (True, False, False),
(True, True, False), (True, False, True)])
@@ -360,6 +366,7 @@ def test_beam_search(self):
extra_build_args=["--max_beam_width=5"],
extra_summarize_args=["--num_beams=5"])

@skip_post_blackwell
def test_int4_gptq(self):
self.run(
quant_algo=QuantAlgo.W4A16_GPTQ,
@@ -386,6 +393,7 @@ class TestLlama2_7B(CliFlowAccuracyTestHarness):
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
def test_smooth_quant(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)

@@ -433,21 +441,25 @@ def test_fp8_low_latency_gemm_plugin(self):
extra_build_args=["--low_latency_gemm_plugin=fp8"])

@pytest.mark.skip_less_device(2)
@skip_post_blackwell
def test_smooth_quant_ootb_tp2(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL, tp_size=2)

@pytest.mark.skip_less_device(2)
@skip_post_blackwell
def test_int4_awq_tp2(self):
self.run(quant_algo=QuantAlgo.W4A16_AWQ, tp_size=2)

@pytest.mark.skip_less_device(2)
@skip_post_blackwell
def test_int4_awq_prequantized_tp2(self, mocker):
mocker.patch.object(
self.__class__, "MODEL_PATH",
f"{llm_models_root()}/llama-models-v2/Llama-2-7B-AWQ")
self.run(quant_algo=QuantAlgo.W4A16_AWQ, tp_size=2)

@pytest.mark.skip_less_device(2)
@skip_post_blackwell
def test_int4_gptq_prequantized_tp2(self, mocker):
mocker.patch.object(
self.__class__, "MODEL_PATH",
@@ -469,16 +481,19 @@ def test_auto_dtype(self):
def test_float32(self):
self.run(dtype='float32')

@skip_post_blackwell
@pytest.mark.parametrize("precision", ["int8", "int4"])
def test_weight_only(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
self.run(quant_algo=quant_algo)

@skip_post_blackwell
@pytest.mark.parametrize("precision", ["int8", "int4"])
def test_weight_only_int8_kv_cache(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
self.run(quant_algo=quant_algo, kv_cache_quant_algo=QuantAlgo.INT8)

@skip_post_blackwell
@pytest.mark.parametrize("precision", ["int8", "int4"])
def test_weight_only_manage_weights(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
@@ -567,6 +582,7 @@ class TestLlama3_1_8B(CliFlowAccuracyTestHarness):
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
def test_smooth_quant(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)

@@ -575,12 +591,14 @@ def test_fp8(self):
self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise(self):
self.run(tasks=[CnnDailymail(self.MODEL_NAME),
MMLU(self.MODEL_NAME)],
quant_algo=QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN)

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise_meta_recipe(self):
self.run(quant_algo=QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN,
extra_acc_spec="meta_recipe",
@@ -601,6 +619,7 @@ def test_tp4(self, gemm_allreduce: bool):
extra_build_args=extra_build_args)

@skip_pre_ada
@skip_post_blackwell
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize(
"gemm_allreduce", [False, pytest.param(True, marks=skip_no_nvls)],
@@ -646,6 +665,7 @@ def test_fp8_prequantized(self, mocker):
self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

@skip_pre_ada
@skip_post_blackwell
def test_medusa_fp8_prequantized(self, mocker):
# nvidia/Llama-3.1-8B-Medusa-FP8
mocker.patch.object(self.__class__, "MODEL_PATH",
@@ -670,23 +690,29 @@ class TestLlama3_2_1B(CliFlowAccuracyTestHarness):
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
def test_smooth_quant(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)

@skip_post_blackwell
def test_smooth_quant_ootb(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL)

@skip_post_blackwell
def test_smooth_quant_ootb_manage_weights(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL,
extra_build_args=["--fast_build"])

@skip_post_blackwell
def test_int4_awq(self):
self.run(quant_algo=QuantAlgo.W4A16_AWQ)

@skip_post_blackwell
def test_int4_awq_int8_kv_cache(self):
self.run(quant_algo=QuantAlgo.W4A16_AWQ,
kv_cache_quant_algo=QuantAlgo.INT8)

@skip_post_blackwell
def test_int4_awq_manage_weights(self):
self.run(quant_algo=QuantAlgo.W4A16_AWQ,
extra_build_args=["--fast_build"])
@@ -733,10 +759,12 @@ def test_fp8_pp2(self):
pp_size=2)

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise(self):
self.run(quant_algo=QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN)

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise_meta_recipe(self):
self.run(quant_algo=QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN,
extra_acc_spec="meta_recipe",
@@ -830,6 +858,7 @@ def test_weight_only(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
self.run(quant_algo=quant_algo, extra_convert_args=["--ckpt-type=hf"])

@skip_post_blackwell
def test_smooth_quant(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN,
extra_convert_args=[
@@ -841,6 +870,7 @@ def test_smooth_quant(self):
def test_fp8(self):
self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

@skip_post_blackwell
def test_int4_awq(self):
self.run(quant_algo=QuantAlgo.W4A16_AWQ)

@@ -859,6 +889,7 @@ def test_weight_only(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
self.run(quant_algo=quant_algo, extra_convert_args=["--ckpt-type=hf"])

@skip_post_blackwell
@pytest.mark.skip_less_device_memory(50000)
def test_smooth_quant(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN,
@@ -871,6 +902,7 @@ def test_smooth_quant(self):
def test_fp8(self):
self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

@skip_post_blackwell
def test_int4_awq(self):
self.run(quant_algo=QuantAlgo.W4A16_AWQ)

@@ -887,6 +919,7 @@ def test_auto_dtype(self):
dtype='auto',
extra_convert_args=["--ckpt-type=hf"])

@skip_post_blackwell
@pytest.mark.parametrize("precision", ["int8", "int4"])
def test_weight_only(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
@@ -910,6 +943,7 @@ def test_auto_dtype(self):
def test_weight_only(self):
self.run(quant_algo=QuantAlgo.W8A16)

@skip_post_blackwell
def test_int4_gptq_prequantized(self, mocker):
mocker.patch.object(self.__class__, "MODEL_PATH",
f"{llm_models_root()}/Qwen-7B-Chat-Int4")
@@ -938,6 +972,7 @@ class TestQwen2_0_5BInstruct(CliFlowAccuracyTestHarness):
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
def test_weight_only(self):
self.run(quant_algo=QuantAlgo.W8A16)

@@ -956,9 +991,11 @@ class TestQwen2_7BInstruct(CliFlowAccuracyTestHarness):
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
def test_weight_only(self):
self.run(quant_algo=QuantAlgo.W8A16)

@skip_post_blackwell
def test_int4_awq_prequantized(self, mocker):
mocker.patch.object(self.__class__, "MODEL_PATH",
f"{llm_models_root()}/Qwen2-7B-Instruct-AWQ")
@@ -990,6 +1027,7 @@ class TestQwen2_5_1_5BInstruct(CliFlowAccuracyTestHarness):
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
def test_weight_only(self):
self.run(quant_algo=QuantAlgo.W8A16)

5 changes: 4 additions & 1 deletion tests/integration/defs/accuracy/test_llm_api.py
@@ -18,7 +18,7 @@
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

from ..conftest import llm_models_root, skip_pre_ada
from ..conftest import llm_models_root, skip_post_blackwell, skip_pre_ada
from .accuracy_core import MMLU, CnnDailymail, LlmapiAccuracyTestHarness


@@ -27,6 +27,7 @@ class TestLlama3_1_8B(LlmapiAccuracyTestHarness):
MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Meta-Llama-3.1-8B"

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise(self):
quant_config = QuantConfig(QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN)

@@ -65,6 +66,7 @@ def test_auto_dtype(self):
task.evaluate(llm,
extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)

@skip_post_blackwell
def test_weight_only(self):
quant_config = QuantConfig(QuantAlgo.W8A16)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
@@ -133,6 +135,7 @@ def test_auto_dtype(self):
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_weight_only(self):
quant_config = QuantConfig(QuantAlgo.W8A16)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
1 change: 1 addition & 0 deletions tests/integration/defs/examples/test_eagle.py
@@ -22,6 +22,7 @@
from defs.trt_test_alternative import check_call


@skip_post_blackwell
@pytest.mark.parametrize("use_dynamic_tree", [False, True],
ids=['eagle1', 'eagle2'])
@pytest.mark.parametrize("batch_size", [1, 8], ids=['bs1', 'bs8'])
4 changes: 3 additions & 1 deletion tests/integration/defs/examples/test_exaone.py
@@ -17,6 +17,7 @@
import pytest
from defs.common import (convert_weights, generate_summary_cmd, venv_check_call,
venv_mpi_check_call)
from defs.conftest import skip_post_blackwell
from defs.trt_test_alternative import check_call


@@ -26,7 +27,8 @@
@pytest.mark.parametrize("llm_exaone_model_root",
['exaone_3.0_7.8b_instruct', 'exaone_deep_2.4b'],
indirect=True)
@pytest.mark.parametrize("use_weight_only", [True, False],
@pytest.mark.parametrize("use_weight_only",
[pytest.param(True, marks=skip_post_blackwell), False],
ids=["enable_weight_only", "disable_weight_only"])
def test_llm_exaone_1gpu(data_type, exaone_example_root, llm_exaone_model_root,
llama_example_root, llm_datasets_root, llm_rouge_root,
4 changes: 3 additions & 1 deletion tests/integration/defs/examples/test_mixtral.py
@@ -23,7 +23,8 @@
venv_mpi_check_call)
from defs.conftest import (evaltool_mmlu_post_process,
evaltool_wikilingua_post_process, llm_models_root,
skip_pre_ada, skip_pre_blackwell)
skip_post_blackwell, skip_pre_ada,
skip_pre_blackwell)
from defs.trt_test_alternative import check_call
from evaltool.constants import (EVALTOOL_INFERENCE_SERVER_STARTUP_SCRIPT,
EVALTOOL_INFERENCE_SERVER_STOP_SCRIPT,
@@ -888,6 +889,7 @@ def test_llm_mixtral_1gpu_fp4_llmapi(
venv_check_call(llm_venv, mmlu_cmd)


@skip_post_blackwell
@pytest.mark.parametrize(
"model_name", ['mixtral-8x7b-v0.1-AWQ', 'Mixtral-8x7B-Instruct-v0.1'])
def test_llm_mixtral_int4_awq_1gpu_summary(llama_example_root,
4 changes: 2 additions & 2 deletions tests/integration/defs/examples/test_multimodal.py
@@ -17,7 +17,7 @@

import pytest
from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
from defs.conftest import get_device_memory, skip_pre_ada
from defs.conftest import get_device_memory, skip_post_blackwell, skip_pre_ada
from defs.trt_test_alternative import check_call


@@ -616,7 +616,7 @@ def _test_llm_multimodal_general(llm_venv,
'neva-22b',
'kosmos-2',
'video-neva',
'Phi-3-vision-128k-instruct',
pytest.param('Phi-3-vision-128k-instruct', marks=skip_post_blackwell),
'Phi-3.5-vision-instruct',
'Phi-4-multimodal-instruct',
'Llama-3.2-11B-Vision',
6 changes: 5 additions & 1 deletion tests/integration/defs/examples/test_phi.py
@@ -24,7 +24,8 @@
evaltool_mmlu_post_process,
evaltool_mtbench_post_process,
evaltool_wikilingua_post_process, get_device_memory,
skip_fp8_pre_ada, skip_pre_ada)
get_sm_version, skip_fp8_pre_ada,
skip_post_blackwell, skip_pre_ada)
from defs.trt_test_alternative import check_call
from evaltool.constants import (EVALTOOL_INFERENCE_SERVER_STARTUP_SCRIPT,
EVALTOOL_INFERENCE_SERVER_STOP_SCRIPT,
@@ -421,6 +422,8 @@ def test_llm_phi_lora_1gpu(data_type, lora_data_type, phi_example_root,
model_name = 'phi-3-lora'
if data_type == 'fp8':
skip_fp8_pre_ada(use_fp8=True)
if get_sm_version() >= 100:
pytest.skip("FP8 is not supported on post-Blackwell architectures")
model_dir = quantize_data(
llm_venv,
phi_example_root,
@@ -570,6 +573,7 @@ def test_llm_phi_quantization_1gpu(data_type, llm_phi_model_root, llm_venv,


@skip_pre_ada
@skip_post_blackwell
@pytest.mark.parametrize("llm_phi_model_root", [
"phi-2", "Phi-3-mini-128k-instruct", "Phi-3-small-128k-instruct",
"Phi-3.5-mini-instruct", "Phi-3.5-MoE-instruct", "Phi-4-mini-instruct"
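
For reference, a minimal sketch of how the skip_post_blackwell marker used throughout this PR could be defined in tests/integration/defs/conftest.py. This is an assumption inferred from the inline get_sm_version() >= 100 check added to test_llm_phi_lora_1gpu above, not the repository's actual definition.

import pytest

# Stand-in for the real get_sm_version() helper that defs/conftest.py already
# provides (this PR imports it into test_phi.py); defined here only so the
# sketch runs standalone. It returns the GPU SM version, e.g. 90 or 100.
def get_sm_version() -> int:
    return 100

# Skip the decorated test on SM 100 (Blackwell) and newer, mirroring the
# inline check used for the FP8 LoRA case in test_phi.py.
skip_post_blackwell = pytest.mark.skipif(
    get_sm_version() >= 100,
    reason="Not supported on Blackwell and later architectures")

The marker is then applied either as a plain decorator (@skip_post_blackwell) or per parameter via pytest.param(True, marks=skip_post_blackwell), as the test_exaone.py and test_multimodal.py changes above do.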