Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions examples/llm_ptq/hf_ptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -1107,7 +1107,10 @@ def quantize_main(
print(f"Excluding MTP layer from quantization: {pattern}")

# Use constant amax for KV quantizers when a cast format is selected.
if args.kv_cache_qformat in _KV_CAST_FORMATS:
# Recipes are authoritative for KV cache config (including use_constant_amax),
# so skip this post-hoc override when --recipe is used; rely on the YAML instead
# (see modelopt_recipes/general/ptq/*_cast_kv.yaml).
if args.recipe is None and args.kv_cache_qformat in _KV_CAST_FORMATS:
quant_cfg = copy.deepcopy(quant_cfg)
_set_kv_cache_constant_amax(quant_cfg["quant_cfg"])

Expand Down Expand Up @@ -1163,7 +1166,9 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--recipe",
help=(
"PTQ recipe YAML file or name without suffix (e.g. general/ptq/nvfp4_default-fp8_kv)."
"PTQ recipe YAML file or name without suffix (e.g. general/ptq/nvfp4_default-fp8_cast_kv, "
"general/ptq/nvfp4_default-fp8_kv, general/ptq/nvfp4_default-nvfp4_cast_kv). "
"When set, --kv_cache_qformat is ignored; the recipe fully determines KV cache config."
),
default=None,
)
Expand Down Expand Up @@ -1252,7 +1257,9 @@ def parse_args() -> argparse.Namespace:
"Specify KV cache quantization format. Default: fp8_cast. "
"Formats ending in '_cast' (fp8_cast, nvfp4_cast) set the amax to FP8 range "
"without data-driven calibration. "
"Other formats (fp8, nvfp4, etc.) use data-driven calibration."
"Other formats (fp8, nvfp4, etc.) use data-driven calibration. "
"Ignored when --recipe is given: the recipe YAML is authoritative for KV "
"cache config (use the *_cast_kv.yaml recipes for the cast variants)."
),
)
parser.add_argument(
Expand Down
70 changes: 70 additions & 0 deletions modelopt_recipes/general/ptq/fp8_default-fp8_cast_kv.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

metadata:
  recipe_type: ptq
  description: >-
    FP8 per-tensor weight and activation (W8A8), FP8 KV cache with constant amax
    (skips KV calibration; amax hardcoded to FP8 E4M3 max 448.0), max calibration.
quantize:
  algorithm: max
  quant_cfg:
    # Default-deny: disable every quantizer, then selectively configure below.
    - quantizer_name: '*'
      enable: false
    # FP8 (E4M3) per-tensor activation quantization (axis null => per-tensor).
    - quantizer_name: '*input_quantizer'
      cfg:
        num_bits: e4m3
        axis:
    # FP8 (E4M3) per-tensor weight quantization.
    - quantizer_name: '*weight_quantizer'
      cfg:
        num_bits: e4m3
        axis:
    # KV-cache quantizers: FP8 with a constant amax — no data-driven KV calibration.
    - quantizer_name: '*[kv]_bmm_quantizer'
      enable: true
      cfg:
        num_bits: e4m3
        use_constant_amax: true
    # Modules kept in full precision (gates, routers, heads, convs, etc.).
    - quantizer_name: '*block_sparse_moe.gate*'
      enable: false
    - quantizer_name: '*linear_attn.conv1d*'
      enable: false
    - quantizer_name: '*lm_head*'
      enable: false
    - quantizer_name: '*mixer.conv1d*'
      enable: false
    - quantizer_name: '*mlp.gate.*'
      enable: false
    - quantizer_name: '*mlp.shared_expert_gate.*'
      enable: false
    - quantizer_name: '*output_layer*'
      enable: false
    - quantizer_name: '*proj_out.*'
      enable: false
    - quantizer_name: '*router*'
      enable: false
    - quantizer_name: 'output.*'
      enable: false
    # Normalization / activation layers are never quantized.
    - parent_class: 'nn.BatchNorm1d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.BatchNorm2d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.BatchNorm3d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.LeakyReLU'
      quantizer_name: '*'
      enable: false
78 changes: 78 additions & 0 deletions modelopt_recipes/general/ptq/nvfp4_default-fp8_cast_kv.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

metadata:
  recipe_type: ptq
  description: >-
    NVFP4 W4A4, FP8 KV cache with constant amax (skips KV calibration; amax
    hardcoded to FP8 E4M3 max 448.0), max calibration.
quantize:
  algorithm: max
  quant_cfg:
    # Default-deny: disable every quantizer, then selectively configure below.
    - quantizer_name: '*'
      enable: false
    # NVFP4 weights: E2M1 values, 16-element blocks with dynamic E4M3 scales.
    - quantizer_name: '*weight_quantizer'
      enable: true
      cfg:
        block_sizes:
          -1: 16
          type: dynamic
          scale_bits: e4m3
        num_bits: e2m1
    # NVFP4 activations, same block layout as weights.
    - quantizer_name: '*input_quantizer'
      enable: true
      cfg:
        block_sizes:
          -1: 16
          type: dynamic
          scale_bits: e4m3
        num_bits: e2m1
    # KV-cache quantizers: FP8 with a constant amax — no data-driven KV calibration.
    - quantizer_name: '*[kv]_bmm_quantizer'
      enable: true
      cfg:
        num_bits: e4m3
        use_constant_amax: true
    # Modules kept in full precision (gates, routers, heads, convs, etc.).
    - quantizer_name: '*block_sparse_moe.gate*'
      enable: false
    - quantizer_name: '*linear_attn.conv1d*'
      enable: false
    - quantizer_name: '*lm_head*'
      enable: false
    - quantizer_name: '*mixer.conv1d*'
      enable: false
    - quantizer_name: '*mlp.gate.*'
      enable: false
    - quantizer_name: '*mlp.shared_expert_gate.*'
      enable: false
    - quantizer_name: '*output_layer*'
      enable: false
    - quantizer_name: '*proj_out.*'
      enable: false
    - quantizer_name: '*router*'
      enable: false
    - quantizer_name: 'output.*'
      enable: false
    # Normalization / activation layers are never quantized.
    - parent_class: 'nn.BatchNorm1d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.BatchNorm2d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.BatchNorm3d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.LeakyReLU'
      quantizer_name: '*'
      enable: false
84 changes: 84 additions & 0 deletions modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

metadata:
  recipe_type: ptq
  description: >-
    NVFP4 W4A4, NVFP4 KV cache with constant amax (skips KV calibration; amax
    hardcoded to FP8 E4M3 max 448.0 — the deployment kernel upcasts NVFP4 KV
    values to FP8 before attention, so the scale must land in the FP8 range),
    max calibration.
quantize:
  algorithm: max
  quant_cfg:
    # Default-deny: disable every quantizer, then selectively configure below.
    - quantizer_name: '*'
      enable: false
    # NVFP4 weights: E2M1 values, 16-element blocks with dynamic E4M3 scales.
    - quantizer_name: '*weight_quantizer'
      enable: true
      cfg:
        block_sizes:
          -1: 16
          type: dynamic
          scale_bits: e4m3
        num_bits: e2m1
    # NVFP4 activations, same block layout as weights.
    - quantizer_name: '*input_quantizer'
      enable: true
      cfg:
        block_sizes:
          -1: 16
          type: dynamic
          scale_bits: e4m3
        num_bits: e2m1
    # KV-cache quantizers: NVFP4 blocks with a constant amax (no KV calibration).
    - quantizer_name: '*[kv]_bmm_quantizer'
      enable: true
      cfg:
        block_sizes:
          -1: 16
          type: dynamic
          scale_bits: e4m3
        num_bits: e2m1
        use_constant_amax: true
    # Modules kept in full precision (gates, routers, heads, convs, etc.).
    - quantizer_name: '*block_sparse_moe.gate*'
      enable: false
    - quantizer_name: '*linear_attn.conv1d*'
      enable: false
    - quantizer_name: '*lm_head*'
      enable: false
    - quantizer_name: '*mixer.conv1d*'
      enable: false
    - quantizer_name: '*mlp.gate.*'
      enable: false
    - quantizer_name: '*mlp.shared_expert_gate.*'
      enable: false
    - quantizer_name: '*output_layer*'
      enable: false
    - quantizer_name: '*proj_out.*'
      enable: false
    - quantizer_name: '*router*'
      enable: false
    - quantizer_name: 'output.*'
      enable: false
    # Normalization / activation layers are never quantized.
    - parent_class: 'nn.BatchNorm1d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.BatchNorm2d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.BatchNorm3d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.LeakyReLU'
      quantizer_name: '*'
      enable: false
3 changes: 3 additions & 0 deletions tests/unit/recipe/test_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,10 @@ def test_load_recipe_builtin_description():

# Built-in PTQ recipe identifiers (path under modelopt_recipes/, no .yaml suffix).
# Keep in sync with the YAML files shipped in modelopt_recipes/general/ptq/;
# presumably consumed by loader tests in this module — confirm against usage.
_BUILTIN_PTQ_RECIPES = [
    "general/ptq/fp8_default-fp8_kv",
    "general/ptq/fp8_default-fp8_cast_kv",
    "general/ptq/nvfp4_default-fp8_kv",
    "general/ptq/nvfp4_default-fp8_cast_kv",
    "general/ptq/nvfp4_default-nvfp4_cast_kv",
    "general/ptq/nvfp4_mlp_only-fp8_kv",
    "general/ptq/nvfp4_omlp_only-fp8_kv",
]
Expand Down
Loading