40 changes: 39 additions & 1 deletion tests/integration/defs/accuracy/test_cli_flow.py
@@ -19,7 +19,8 @@
from tensorrt_llm.quantization import QuantAlgo

from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_nvls,
skip_pre_ada, skip_pre_blackwell, skip_pre_hopper)
skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
skip_pre_hopper)
from .accuracy_core import (MMLU, CliFlowAccuracyTestHarness, CnnDailymail,
Humaneval, PassKeyRetrieval64k,
PassKeyRetrieval128k, SlimPajama6B, ZeroScrolls)
@@ -57,6 +58,7 @@ def test_weight_only(self, precision: str):
def test_int8_kv_cache(self):
self.run(kv_cache_quant_algo=QuantAlgo.INT8)

@skip_post_blackwell
@parametrize_with_ids("per_token,per_channel", [(False, False),
(True, True)])
def test_smooth_quant(self, per_token: bool, per_channel: bool):
@@ -142,6 +144,7 @@ class TestStarcoder2_15B(CliFlowAccuracyTestHarness):
MODEL_PATH = f"{llm_models_root()}/starcoder2-model"
EXAMPLE_FOLDER = "gpt"

@skip_post_blackwell
def test_smooth_quant_ootb(self):
self.run(tasks=[Humaneval(self.MODEL_NAME)],
quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL)
@@ -194,9 +197,11 @@ class TestPhi2(CliFlowAccuracyTestHarness):
MODEL_PATH = f"{llm_models_root()}/phi-2"
EXAMPLE_FOLDER = "phi"

@skip_post_blackwell
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
@pytest.mark.skip_less_device(2)
def test_tp2(self):
self.run(tp_size=2)
@@ -316,6 +321,7 @@ def test_medusa(self, cuda_graph, mocker):
extra_build_args=["--speculative_decoding_mode=medusa"],
extra_summarize_args=extra_summarize_args)

@skip_post_blackwell
@parametrize_with_ids("cuda_graph,chunked_context,typical_acceptance",
[(False, False, False), (True, False, False),
(True, True, False), (True, False, True)])
@@ -360,6 +366,7 @@ def test_beam_search(self):
extra_build_args=["--max_beam_width=5"],
extra_summarize_args=["--num_beams=5"])

@skip_post_blackwell
def test_int4_gptq(self):
self.run(
quant_algo=QuantAlgo.W4A16_GPTQ,
@@ -386,6 +393,7 @@ class TestLlama2_7B(CliFlowAccuracyTestHarness):
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
def test_smooth_quant(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)

@@ -433,21 +441,25 @@ def test_fp8_low_latency_gemm_plugin(self):
extra_build_args=["--low_latency_gemm_plugin=fp8"])

@pytest.mark.skip_less_device(2)
@skip_post_blackwell
def test_smooth_quant_ootb_tp2(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL, tp_size=2)

@pytest.mark.skip_less_device(2)
@skip_post_blackwell
def test_int4_awq_tp2(self):
self.run(quant_algo=QuantAlgo.W4A16_AWQ, tp_size=2)

@pytest.mark.skip_less_device(2)
@skip_post_blackwell
def test_int4_awq_prequantized_tp2(self, mocker):
mocker.patch.object(
self.__class__, "MODEL_PATH",
f"{llm_models_root()}/llama-models-v2/Llama-2-7B-AWQ")
self.run(quant_algo=QuantAlgo.W4A16_AWQ, tp_size=2)

@pytest.mark.skip_less_device(2)
@skip_post_blackwell
def test_int4_gptq_prequantized_tp2(self, mocker):
mocker.patch.object(
self.__class__, "MODEL_PATH",
@@ -469,16 +481,19 @@ def test_auto_dtype(self):
def test_float32(self):
self.run(dtype='float32')

@skip_post_blackwell
@pytest.mark.parametrize("precision", ["int8", "int4"])
def test_weight_only(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
self.run(quant_algo=quant_algo)

@skip_post_blackwell
@pytest.mark.parametrize("precision", ["int8", "int4"])
def test_weight_only_int8_kv_cache(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
self.run(quant_algo=quant_algo, kv_cache_quant_algo=QuantAlgo.INT8)

@skip_post_blackwell
@pytest.mark.parametrize("precision", ["int8", "int4"])
def test_weight_only_manage_weights(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
@@ -567,6 +582,7 @@ class TestLlama3_1_8B(CliFlowAccuracyTestHarness):
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
def test_smooth_quant(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)

@@ -575,12 +591,14 @@ def test_fp8(self):
self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise(self):
self.run(tasks=[CnnDailymail(self.MODEL_NAME),
MMLU(self.MODEL_NAME)],
quant_algo=QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN)

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise_meta_recipe(self):
self.run(quant_algo=QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN,
extra_acc_spec="meta_recipe",
@@ -601,6 +619,7 @@ def test_tp4(self, gemm_allreduce: bool):
extra_build_args=extra_build_args)

@skip_pre_ada
@skip_post_blackwell
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize(
"gemm_allreduce", [False, pytest.param(True, marks=skip_no_nvls)],
@@ -646,6 +665,7 @@ def test_fp8_prequantized(self, mocker):
self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

@skip_pre_ada
@skip_post_blackwell
def test_medusa_fp8_prequantized(self, mocker):
# nvidia/Llama-3.1-8B-Medusa-FP8
mocker.patch.object(self.__class__, "MODEL_PATH",
@@ -670,23 +690,29 @@ class TestLlama3_2_1B(CliFlowAccuracyTestHarness):
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
def test_smooth_quant(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)

@skip_post_blackwell
def test_smooth_quant_ootb(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL)

@skip_post_blackwell
def test_smooth_quant_ootb_manage_weights(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL,
extra_build_args=["--fast_build"])

@skip_post_blackwell
def test_int4_awq(self):
self.run(quant_algo=QuantAlgo.W4A16_AWQ)

@skip_post_blackwell
def test_int4_awq_int8_kv_cache(self):
self.run(quant_algo=QuantAlgo.W4A16_AWQ,
kv_cache_quant_algo=QuantAlgo.INT8)

@skip_post_blackwell
def test_int4_awq_manage_weights(self):
self.run(quant_algo=QuantAlgo.W4A16_AWQ,
extra_build_args=["--fast_build"])
@@ -733,10 +759,12 @@ def test_fp8_pp2(self):
pp_size=2)

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise(self):
self.run(quant_algo=QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN)

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise_meta_recipe(self):
self.run(quant_algo=QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN,
extra_acc_spec="meta_recipe",
@@ -830,6 +858,7 @@ def test_weight_only(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
self.run(quant_algo=quant_algo, extra_convert_args=["--ckpt-type=hf"])

@skip_post_blackwell
def test_smooth_quant(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN,
extra_convert_args=[
@@ -841,6 +870,7 @@ def test_smooth_quant(self):
def test_fp8(self):
self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

@skip_post_blackwell
def test_int4_awq(self):
self.run(quant_algo=QuantAlgo.W4A16_AWQ)

@@ -859,6 +889,7 @@ def test_weight_only(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
self.run(quant_algo=quant_algo, extra_convert_args=["--ckpt-type=hf"])

@skip_post_blackwell
@pytest.mark.skip_less_device_memory(50000)
def test_smooth_quant(self):
self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN,
@@ -871,6 +902,7 @@ def test_smooth_quant(self):
def test_fp8(self):
self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

@skip_post_blackwell
def test_int4_awq(self):
self.run(quant_algo=QuantAlgo.W4A16_AWQ)

@@ -887,6 +919,7 @@ def test_auto_dtype(self):
dtype='auto',
extra_convert_args=["--ckpt-type=hf"])

@skip_post_blackwell
@pytest.mark.parametrize("precision", ["int8", "int4"])
def test_weight_only(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
@@ -910,6 +943,7 @@ def test_auto_dtype(self):
def test_weight_only(self):
self.run(quant_algo=QuantAlgo.W8A16)

@skip_post_blackwell
def test_int4_gptq_prequantized(self, mocker):
mocker.patch.object(self.__class__, "MODEL_PATH",
f"{llm_models_root()}/Qwen-7B-Chat-Int4")
@@ -938,6 +972,7 @@ class TestQwen2_0_5BInstruct(CliFlowAccuracyTestHarness):
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
def test_weight_only(self):
self.run(quant_algo=QuantAlgo.W8A16)

@@ -956,9 +991,11 @@ class TestQwen2_7BInstruct(CliFlowAccuracyTestHarness):
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
def test_weight_only(self):
self.run(quant_algo=QuantAlgo.W8A16)

@skip_post_blackwell
def test_int4_awq_prequantized(self, mocker):
mocker.patch.object(self.__class__, "MODEL_PATH",
f"{llm_models_root()}/Qwen2-7B-Instruct-AWQ")
@@ -990,6 +1027,7 @@ class TestQwen2_5_1_5BInstruct(CliFlowAccuracyTestHarness):
def test_auto_dtype(self):
self.run(dtype='auto')

@skip_post_blackwell
def test_weight_only(self):
self.run(quant_algo=QuantAlgo.W8A16)

5 changes: 4 additions & 1 deletion tests/integration/defs/accuracy/test_llm_api.py
@@ -18,7 +18,7 @@
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

from ..conftest import llm_models_root, skip_pre_ada
from ..conftest import llm_models_root, skip_post_blackwell, skip_pre_ada
from .accuracy_core import MMLU, CnnDailymail, LlmapiAccuracyTestHarness


@@ -27,6 +27,7 @@ class TestLlama3_1_8B(LlmapiAccuracyTestHarness):
MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Meta-Llama-3.1-8B"

@skip_pre_ada
@skip_post_blackwell
def test_fp8_rowwise(self):
quant_config = QuantConfig(QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN)

@@ -65,6 +66,7 @@ def test_auto_dtype(self):
task.evaluate(llm,
extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)

@skip_post_blackwell
def test_weight_only(self):
quant_config = QuantConfig(QuantAlgo.W8A16)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
@@ -133,6 +135,7 @@ def test_auto_dtype(self):
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

@skip_post_blackwell
def test_weight_only(self):
quant_config = QuantConfig(QuantAlgo.W8A16)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
1 change: 1 addition & 0 deletions tests/integration/defs/examples/test_eagle.py
@@ -22,6 +22,7 @@
from defs.trt_test_alternative import check_call


@skip_post_blackwell
@pytest.mark.parametrize("use_dynamic_tree", [False, True],
ids=['eagle1', 'eagle2'])
@pytest.mark.parametrize("batch_size", [1, 8], ids=['bs1', 'bs8'])
4 changes: 3 additions & 1 deletion tests/integration/defs/examples/test_exaone.py
@@ -17,6 +17,7 @@
import pytest
from defs.common import (convert_weights, generate_summary_cmd, venv_check_call,
venv_mpi_check_call)
from defs.conftest import skip_post_blackwell
from defs.trt_test_alternative import check_call


@@ -26,7 +27,8 @@
@pytest.mark.parametrize("llm_exaone_model_root",
['exaone_3.0_7.8b_instruct', 'exaone_deep_2.4b'],
indirect=True)
@pytest.mark.parametrize("use_weight_only", [True, False],
@pytest.mark.parametrize("use_weight_only",
[pytest.param(True, marks=skip_post_blackwell), False],
ids=["enable_weight_only", "disable_weight_only"])
def test_llm_exaone_1gpu(data_type, exaone_example_root, llm_exaone_model_root,
llama_example_root, llm_datasets_root, llm_rouge_root,
4 changes: 3 additions & 1 deletion tests/integration/defs/examples/test_mixtral.py
@@ -23,7 +23,8 @@
venv_mpi_check_call)
from defs.conftest import (evaltool_mmlu_post_process,
evaltool_wikilingua_post_process, llm_models_root,
skip_pre_ada, skip_pre_blackwell)
skip_post_blackwell, skip_pre_ada,
skip_pre_blackwell)
from defs.trt_test_alternative import check_call
from evaltool.constants import (EVALTOOL_INFERENCE_SERVER_STARTUP_SCRIPT,
EVALTOOL_INFERENCE_SERVER_STOP_SCRIPT,
@@ -888,6 +889,7 @@ def test_llm_mixtral_1gpu_fp4_llmapi(
venv_check_call(llm_venv, mmlu_cmd)


@skip_post_blackwell
@pytest.mark.parametrize(
"model_name", ['mixtral-8x7b-v0.1-AWQ', 'Mixtral-8x7B-Instruct-v0.1'])
def test_llm_mixtral_int4_awq_1gpu_summary(llama_example_root,
4 changes: 2 additions & 2 deletions tests/integration/defs/examples/test_multimodal.py
@@ -17,7 +17,7 @@

import pytest
from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
from defs.conftest import get_device_memory, skip_pre_ada
from defs.conftest import get_device_memory, skip_post_blackwell, skip_pre_ada
from defs.trt_test_alternative import check_call


@@ -616,7 +616,7 @@ def _test_llm_multimodal_general(llm_venv,
'neva-22b',
'kosmos-2',
'video-neva',
'Phi-3-vision-128k-instruct',
pytest.param('Phi-3-vision-128k-instruct', marks=skip_post_blackwell),
'Phi-3.5-vision-instruct',
'Phi-4-multimodal-instruct',
'Llama-3.2-11B-Vision',
6 changes: 5 additions & 1 deletion tests/integration/defs/examples/test_phi.py
@@ -24,7 +24,8 @@
evaltool_mmlu_post_process,
evaltool_mtbench_post_process,
evaltool_wikilingua_post_process, get_device_memory,
skip_fp8_pre_ada, skip_pre_ada)
get_sm_version, skip_fp8_pre_ada,
skip_post_blackwell, skip_pre_ada)
from defs.trt_test_alternative import check_call
from evaltool.constants import (EVALTOOL_INFERENCE_SERVER_STARTUP_SCRIPT,
EVALTOOL_INFERENCE_SERVER_STOP_SCRIPT,
@@ -421,6 +422,8 @@ def test_llm_phi_lora_1gpu(data_type, lora_data_type, phi_example_root,
model_name = 'phi-3-lora'
if data_type == 'fp8':
skip_fp8_pre_ada(use_fp8=True)
if get_sm_version() >= 100:
pytest.skip("FP8 is not supported on post-Blackwell architectures")
model_dir = quantize_data(
llm_venv,
phi_example_root,
@@ -570,6 +573,7 @@ def test_llm_phi_quantization_1gpu(data_type, llm_phi_model_root, llm_venv,


@skip_pre_ada
@skip_post_blackwell
@pytest.mark.parametrize("llm_phi_model_root", [
"phi-2", "Phi-3-mini-128k-instruct", "Phi-3-small-128k-instruct",
"Phi-3.5-mini-instruct", "Phi-3.5-MoE-instruct", "Phi-4-mini-instruct"
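
For reference, a minimal sketch of how the skip_post_blackwell marker used throughout this PR could be defined in tests/integration/defs/conftest.py. This is an assumption inferred from the inline get_sm_version() >= 100 check added to test_llm_phi_lora_1gpu above, not the repository's actual definition.

import pytest

# Stand-in for the real get_sm_version() helper that defs/conftest.py already
# provides (this PR imports it into test_phi.py); defined here only so the
# sketch runs standalone. It returns the GPU SM version, e.g. 90 or 100.
def get_sm_version() -> int:
    return 100

# Skip the decorated test on SM 100 (Blackwell) and newer, mirroring the
# inline check used for the FP8 LoRA case in test_phi.py.
skip_post_blackwell = pytest.mark.skipif(
    get_sm_version() >= 100,
    reason="Not supported on Blackwell and later architectures")

The marker is then applied either as a plain decorator (@skip_post_blackwell) or per parameter via pytest.param(True, marks=skip_post_blackwell), as the test_exaone.py and test_multimodal.py changes above do.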