Update nemo.export module for quantized models (#9250)
* Remove config aligner - it is no longer needed after the TRT-LLM 0.9 update

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>

* Change default export precision to bf16 (the more commonly used precision)

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>

* Specify gpt_attention_plugin

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>

---------

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>
Co-authored-by: Pablo Garay <palenq@gmail.com>
janekl and pablo-garay committed May 21, 2024
1 parent 91003a0 commit 4a6f158
Showing 4 changed files with 5 additions and 84 deletions.
```diff
@@ -31,7 +31,7 @@ export:
   decoder_type: llama # gptnext, gpt2, llama
   inference_tensor_parallel: 1 # Default using 1 TP for inference
   inference_pipeline_parallel: 1 # Default using 1 PP for inference
-  dtype: 16 # Default precision data type
+  dtype: bf16 # Default precision data type
 
 model_file: llama2-7b-fp16.nemo # Nemo file path
 model_save: llama2-7b-fp8.qnemo # Path where the quantized model will be saved
```
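The new `bf16` default matters because the export path materializes weights in this precision before the engine is built. As a rough illustration only (this helper is hypothetical, not part of the commit, and the accepted values are an assumption based on the config above and its previous default), mapping the config's `dtype` field to a torch dtype might look like:

```python
# Hypothetical helper (not from this commit): map the export config's
# "dtype" value to a torch dtype. Accepted values are assumed from the
# YAML above (bf16) and its previous default (16).
import torch

EXPORT_DTYPE_MAP = {
    "bf16": torch.bfloat16,  # new default in this commit
    "16": torch.float16,     # previous default
    "32": torch.float32,
}

def resolve_export_dtype(dtype_cfg) -> torch.dtype:
    key = str(dtype_cfg)  # YAML may parse a bare 16 as an int
    if key not in EXPORT_DTYPE_MAP:
        raise ValueError(f"Unsupported export dtype: {dtype_cfg!r}")
    return EXPORT_DTYPE_MAP[key]
```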
1 change: 0 additions & 1 deletion nemo/export/trt_llm/qnemo/__init__.py

```diff
@@ -12,5 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .align_config import align_config
 from .qnemo_to_tensorrt_llm import qnemo_to_tensorrt_llm
```
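After this change the package's only public entry point is the builder function. A quick sanity check of the import surface, a sketch based solely on the diff above:

```python
# The one name the qnemo package still re-exports:
from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm

# The removed helper is gone, so importing it should now fail:
try:
    from nemo.export.trt_llm.qnemo import align_config
except ImportError:
    print("align_config is no longer part of nemo.export.trt_llm.qnemo")
```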
46 changes: 0 additions & 46 deletions nemo/export/trt_llm/qnemo/align_config.py

This file was deleted.

40 changes: 4 additions & 36 deletions nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py

```diff
@@ -15,13 +15,10 @@
 import json
 import os
 import subprocess
-from typing import List, Optional
 
-from nemo.export.trt_llm.qnemo import align_config
-from nemo.export.trt_llm.tensorrt_llm_build import MODEL_NAME, get_engine_name
+from typing import List, Optional
 
 CONFIG_NAME = "config.json"
-CONFIG_TRTLLM_BUILD_NAME = "config_trtllm_build.json"
 
 
 def qnemo_to_tensorrt_llm(
```
```diff
@@ -34,6 +31,7 @@ def qnemo_to_tensorrt_llm(
     lora_target_modules: Optional[List[str]] = None,
 ):
     """Build TRT-LLM engine via trtllm-build CLI API in a subprocess."""
+    assert not lora_target_modules, f"LoRA is not supported for quantized checkpoints, got {lora_target_modules}"
     print(
         "Note that setting n_gpus, tensor_parallel_size and pipeline_parallel_size parameters"
         " for quantized models is possible only on export step via nemo.export.quantize module."
```
```diff
@@ -58,6 +56,8 @@
         str(max_prompt_embedding_table_size),
         "--gemm_plugin",
         model_config["dtype"],
+        "--gpt_attention_plugin",
+        model_config["dtype"],
         "--strongly_typed",
         "--use_custom_all_reduce",
         "disable",
```
```diff
@@ -75,35 +75,3 @@
 
     print("Building engine done. Full logs are:")
     print(result.stdout.decode())
-
-    # Alignment to make nemo-fw tensorrt_llm.runtime ModelConfig definition compatible with config
-    # produced by trtllm-build API. The new config is saved as "config.json" while the source build
-    # config is saved as "config_trtllm_build.json" in the engine directory for reference.
-    os.rename(os.path.join(engine_dir, CONFIG_NAME), os.path.join(engine_dir, CONFIG_TRTLLM_BUILD_NAME))
-    with open(os.path.join(engine_dir, CONFIG_TRTLLM_BUILD_NAME), "r") as f:
-        config_trtllm_build = json.load(f)
-
-    config = align_config(config_trtllm_build)
-
-    # Other parameters
-    assert lora_target_modules is None
-    config["builder_config"]["lora_target_modules"] = lora_target_modules
-
-    with open(os.path.join(engine_dir, CONFIG_NAME), "w") as f:
-        json.dump(config, f, indent=2)
-
-    # Rename for consistency with how engine is run later
-    for i in range(config["builder_config"]["world_size"]):
-        os.rename(
-            os.path.join(engine_dir, f"rank{i}.engine"),
-            os.path.join(
-                engine_dir,
-                get_engine_name(
-                    MODEL_NAME,
-                    config["builder_config"]["precision"],
-                    config["builder_config"]["tensor_parallel"],
-                    config["builder_config"]["pipeline_parallel"],
-                    i,
-                ),
-            ),
-        )
```

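Taken together, the function now delegates the entire build to the trtllm-build CLI and no longer rewrites the resulting config.json or renames the rank{i}.engine files afterwards. Below is a minimal sketch of the invocation pattern after this commit; the paths are hypothetical, and any flags not visible in the diff above (--gemm_plugin, --gpt_attention_plugin, --strongly_typed, --use_custom_all_reduce) are assumptions about the surrounding, unchanged code.

```python
# Sketch of the post-commit build flow (paths hypothetical; flags not shown
# in the diff are assumptions). Both the GEMM and GPT attention plugins are
# pinned to the dtype recorded in the quantized checkpoint's config.json.
import json
import os
import subprocess

checkpoint_dir = "/workspace/llama2-7b-fp8.qnemo"  # hypothetical qnemo checkpoint
engine_dir = "/workspace/llama2-7b-fp8_engine"     # hypothetical output directory

with open(os.path.join(checkpoint_dir, "config.json")) as f:
    model_config = json.load(f)  # carries "dtype", e.g. "bfloat16"

build_cmd = [
    "trtllm-build",
    "--checkpoint_dir", checkpoint_dir,
    "--output_dir", engine_dir,
    "--gemm_plugin", model_config["dtype"],
    "--gpt_attention_plugin", model_config["dtype"],  # added by this commit
    "--strongly_typed",
    "--use_custom_all_reduce", "disable",
]

result = subprocess.run(build_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True)
print("Building engine done. Full logs are:")
print(result.stdout.decode())
```

With TRT-LLM 0.9, trtllm-build itself writes a config.json that the runtime can consume directly, which is why the whole post-build alignment and engine-renaming block above could be deleted.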