From 2d017957d959f18389c970f4d7f1117e00f9b8d1 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Fri, 12 Apr 2024 18:07:10 -0700 Subject: [PATCH 01/35] Add trtllm checkpoint --- nemo/export/tensorrt_llm.py | 56 ++++++- nemo/export/trt_llm/nemo/convert.py | 62 +++++--- nemo/export/trt_llm/nemo/nemo_ckpt_convert.py | 26 +++- nemo/export/trt_llm/nemo_utils.py | 146 +++++++++++++++++- 4 files changed, 257 insertions(+), 33 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 473fefaea6a2..c504a8364382 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -29,7 +29,7 @@ from nemo.deploy import ITritonDeployable from nemo.export.trt_llm.model_config_trt import model_config_to_tensorrt_llm from nemo.export.trt_llm.nemo.nemo_ckpt_convert import build_tokenizer -from nemo.export.trt_llm.nemo_utils import get_tokenzier, nemo_llm_model_to_model_config, nemo_llm_to_model_config +from nemo.export.trt_llm.nemo_utils import get_tokenzier, nemo_llm_model_to_model_config, nemo_llm_to_model_config, nemo_to_trtllm from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load, load_refit from nemo.export.trt_llm.utils import is_nemo_file, unpack_nemo_ckpt @@ -229,6 +229,60 @@ def export( if load_model: self._load() + def convert_to_tllm_file( + self, + nemo_checkpoint_path: str, + model_type: str, + delete_existing_files: bool = True, + n_gpus: int = 1, + tensor_parallel_size: int = None, + pipeline_parallel_size: int = None, + max_input_token: int = 256, + max_output_token: int = 256, + max_batch_size: int = 8, + max_prompt_embedding_table_size=None, + use_inflight_batching: bool = False, + enable_context_fmha: bool = True, + paged_kv_cache: bool = False, + dtype: str = "bfloat16", + load_model: bool = True, + enable_multi_block_mode: bool = False, + use_lora_plugin: str = None, + lora_target_modules: List[str] = None, + max_lora_rank: int = 64, + use_parallel_embedding: bool = False, + save_nemo_model_config: bool = False, + ): + nemo_export_dir = self.model_dir + + if pipeline_parallel_size is None: + tensor_parallel_size = n_gpus + pipeline_parallel_size = 1 + elif tensor_parallel_size is None: + tensor_parallel_size = 1 + pipeline_parallel_size = n_gpus + + weight_dict, model_configs, self.tokenizer = nemo_to_trtllm( + in_file=nemo_checkpoint_path, + decoder_type=model_type, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + pipeline_parallel_size=pipeline_parallel_size, + use_parallel_embedding=use_parallel_embedding, + nemo_export_dir=nemo_export_dir, + save_nemo_model_config=save_nemo_model_config, + ) + + # tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") + # if os.path.exists(tokenizer_path): + # shutil.copy(tokenizer_path, self.model_dir) + # else: + # self.tokenizer.save_pretrained(os.path.join(self.model_dir, 'huggingface_tokenizer')) + + # nemo_model_config = os.path.join(nemo_export_dir, "model_config.yaml") + # if os.path.exists(nemo_model_config): + # shutil.copy(nemo_model_config, self.model_dir) + def build( self, nemo_model, diff --git a/nemo/export/trt_llm/nemo/convert.py b/nemo/export/trt_llm/nemo/convert.py index 09476da6b939..f94503481952 100644 --- a/nemo/export/trt_llm/nemo/convert.py +++ b/nemo/export/trt_llm/nemo/convert.py @@ -44,7 +44,8 @@ def save_val(val, dir, key, tp_num=None): if len(val.shape) >= 2: val = np.ascontiguousarray(np.transpose(val.reshape(val.shape[0], -1), [1, 0])) global weights_dict - weights_dict[f"model.{key}.{suffix}"] = val + weights_dict[key] = val 
+ # weights_dict[f"model.{key}.{suffix}"] = val def save_split(split_vals, dir, key, i, split_factor): @@ -178,11 +179,15 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t tp_size = config.get("tp_size", 1) int8_outputs = config.get("int8_outputs", None) multi_query_mode = config.get("multi_query_mode", False) + local_dim = config.get("local_dim", None) num_kv_heads = config.get("num_kv_heads", num_attention_heads) size_per_head = config.get("kv_channels", None) save_int8 = int8_outputs == "all" or int8_outputs == "kv_cache_only" + layer_num = key.split(".")[1] + layer_prefix = f'transformer.layers.{layer_num}' + if not isinstance(vals, list): vals = [vals] @@ -210,12 +215,22 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t or "final_layernorm.bias" in key ): # shared weights, only need to convert the weights of rank 0 - if "post_self_attn_layernorm.weight" in key: - key = key.replace("post_self_attn_layernorm.weight", "post_attention_layernorm.weight") - elif "mlp.linear_fc2.bias" in key: - key = key.replace("mlp.linear_fc2.bias", "mlp.dense_4h_to_h.bias") - elif "attention.linear_proj.bias" in key: - key = key.replace("attention.linear_proj.bias", "attention.dense.bias") + if "post_self_attn_layernorm" in key or "post_attention_layernorm" in key: + if key.endswith('weight'): + key = f'{layer_prefix}.post_layernorm.weight' + else: + key = f'{layer_prefix}.post_layernorm.bias' + elif "mlp.linear_fc2.bias" in key or "mlp.dense_4h_to_h.bias" in key: + key = f'{layer_prefix}.mlp.proj.bias' + elif "attention.linear_proj.bias" in key or "attention.dense.bias" in key: + key = f'{layer_prefix}.attention.dense.bias' + elif "final_layernorm" in key: + key = key.replace("final_layernorm", "transformer.ln_f") + elif "input_layernorm" in key: + if key.endswith('weight'): + key = f'{layer_prefix}.input_layernorm.weight' + else: + key = f'{layer_prefix}.input_layernorm.bias' if tp_rank == 0: save_val(vals[0], saved_dir, key) @@ -228,10 +243,10 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t cat_dim = 0 val = np.concatenate(vals, axis=cat_dim) split_vals = np.split(val, split_factor, axis=cat_dim) - if "attention.linear_proj.weight" in key: - key = key.replace("attention.linear_proj.weight", "attention.dense.weight") - elif "mlp.linear_fc2.weight" in key: - key = key.replace("mlp.linear_fc2.weight", "mlp.dense_4h_to_h.weight") + if "attention.linear_proj.weight" in key or "attention.dense.weight" in key: + key = f'{layer_prefix}.attention.dense.weight' + elif "mlp.linear_fc2.weight" in key or "mlp.dense_4h_to_h.weight" in key: + key = f'{layer_prefix}.mlp.proj.weight' save_split(split_vals, saved_dir, key, tp_rank, split_factor) if act_range is not None and int8_outputs == "all": base_key = key.replace(".weight", "") @@ -251,8 +266,10 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t val = np.concatenate(vals, axis=cat_dim) split_vals = np.split(val, split_factor, axis=cat_dim) - if "mlp.linear_fc1" in key: - key = key.replace("mlp.linear_fc1", "mlp.dense_h_to_4h") + if key.endswith("weight"): + key = f'{layer_prefix}.mlp.fc.weight' + else: + key = f'{layer_prefix}.mlp.fc.bias' save_split(split_vals, saved_dir, key, tp_rank, split_factor) if act_range is not None and int8_outputs == "all": base_key = key.replace(".weight", "") @@ -261,8 +278,10 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t if split_gated_activation: assert not 
save_int8 - prefix, dot, suffix = key.rpartition(".") - key = prefix + ".gate" + dot + suffix + if key.endswith("weight"): + key = f'{layer_prefix}.mlp.gate.weight' + else: + key = f'{layer_prefix}.mlp.gate.bias' gate = np.concatenate(gates, axis=cat_dim) split_vals = np.split(gate, split_factor, axis=cat_dim) @@ -279,9 +298,6 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, split_factor) elif "attention.query_key_value.bias" in key or "attention.linear_qkv.bias" in key: - if "attention.linear_qkv.bias" in key: - key = key.replace("attention.linear_qkv.bias", "attention.query_key_value.bias") - qkv_hidden_dim = vals[0].shape[0] size_per_head = qkv_hidden_dim // (num_attention_heads + 2 * num_kv_heads) q_num = num_attention_heads // num_kv_heads @@ -300,10 +316,9 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t v_split = np.split(qkv[2], split_factor, axis=0) # Concatenate Q, K, and V together - split_vals = [ - np.concatenate([q_split[i].reshape(-1), k_split[i].reshape(-1), v_split[i].reshape(-1)], axis=0) - for i in range(split_factor) - ] + split_vals = [np.concatenate([q_split[i].reshape(-1), k_split[i].reshape(-1), v_split[i].reshape(-1)], axis=0) + for i in range(split_factor)] + key = f'{layer_prefix}.attention.qkv.bias' save_split(split_vals, saved_dir, key, tp_rank, split_factor) elif "attention.query_key_value.weight" in key or "attention.linear_qkv.weight" in key: @@ -342,8 +357,7 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t for i in range(split_factor) ] - if "attention.linear_qkv.weight" in key: - key = key.replace("attention.linear_qkv.weight", "attention.query_key_value.weight") + key = f'{layer_prefix}.attention.qkv.weight' save_split(split_vals, saved_dir, key, tp_rank, split_factor) if save_int8: base_key = key.replace(".weight", "") diff --git a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py index e803d9c989d1..e58855d49914 100644 --- a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py +++ b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py @@ -27,7 +27,7 @@ import tensorstore # This is important even though not used. Otherwise zarr raises error. 
import torch import zarr -from tensorrt_llm._utils import np_bfloat16, str_dtype_to_torch, torch_to_numpy +from tensorrt_llm._utils import np_bfloat16, str_dtype_to_torch, torch_to_numpy, pad_vocab_size from tqdm import tqdm from transformers import AutoTokenizer, GPT2Tokenizer, LlamaConfig @@ -172,6 +172,7 @@ def convert_dist_checkpoint(unpacked_checkpoints_dir: UnpackedNemoCheckpointDir, multi_query_mode = nemo_model_config.get("multi_query_mode", False) num_attention_heads = nemo_model_config["num_attention_heads"] kv_channels = nemo_model_config.get("kv_channels", None) + use_parallel_embedding = args.use_parallel_embedding if num_kv_heads == 0: if multi_query_mode: num_kv_heads = 1 @@ -189,6 +190,7 @@ def convert_dist_checkpoint(unpacked_checkpoints_dir: UnpackedNemoCheckpointDir, "kv_channels": kv_channels, "use_attention_nemo_shape": True, "transpose_weights": True, + "use_parallel_embedding": use_parallel_embedding, } # split_factor: in how many parts a TP training node is split @@ -200,22 +202,34 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): if has_position_embedding: val = model[get_layer_name("position_embedding", prefix)] val = torch_to_numpy(val.to(storage_type).cpu()) - model_level_weights["model.wpe.bin"].append(val) + model_level_weights["transformer.position_embedding.weight"].append(val) if pp_idx == 0: val = model.get("state_dict", model)[get_layer_name("word_embedding", prefix)] if embedding_scaling: val = val * float(math.sqrt(hidden_size)) + vocab_size = val.shape[0] + if use_parallel_embedding: + # Pad vocab_size first + if vocab_size % inference_tp_size != 0: + vocab_size_padded = pad_vocab_size(vocab_size, inference_tp_size) + pad_width = vocab_size_padded - vocab_size + val = torch.nn.functional.pad( + val, + (0, 0, 0, pad_width), + value=0 + ) + val = torch_to_numpy(val.to(storage_type).cpu()) - model_level_weights["model.wte.bin"].append(val) + model_level_weights["transformer.vocab_embedding.weight"].append(val) if share_embeddings_and_output: val = model.get("state_dict", model)[get_layer_name("word_embedding", prefix)] val = torch_to_numpy(val.to(storage_type).cpu()) - model_level_weights["model.lm_head.weight.bin"].append(val) + model_level_weights["lm_head.weight"].append(val) if has_lm_head and pp_idx == training_pp_size - 1: val = model.get("state_dict", model)[get_layer_name("output_layer", prefix)] val = torch_to_numpy(val.to(storage_type).cpu()) - model_level_weights["model.lm_head.weight.bin"].append(val) + model_level_weights["lm_head.weight"].append(val) weights_dict = {} @@ -278,7 +292,7 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): model_level_weights[key] = np.concatenate(values, axis=0) weights_dict[key] = model_level_weights[key] - vocab_size = model_level_weights["model.wte.bin"].shape[0] + vocab_size = model_level_weights["transformer.vocab_embedding.weight"].shape[0] if nemo_model_config["tokenizer"].get("library", None) == "huggingface": tokenizer = AutoTokenizer.from_pretrained( diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index af077ea32a8e..7f757e42dee6 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -22,12 +22,14 @@ import shutil import sys import tempfile +import json from pathlib import Path from typing import Dict, List, Tuple, Union import numpy as np import tensorrt_llm from tensorrt_llm import str_dtype_to_trt +from tensorrt_llm._utils import pad_vocab_size from transformers import AutoTokenizer, LlamaConfig, 
PretrainedConfig, PreTrainedTokenizer from nemo.export.trt_llm.model_config import ( @@ -55,6 +57,7 @@ def _nemo_llm_decode( storage_type: str = "bfloat16", load_checkpoints_on_gpu: bool = False, decoder_type: str = "gptnext", + use_parallel_embedding: bool = False, save_nemo_model_config: bool = False, ) -> Tuple[Dict[str, np.ndarray], PretrainedConfig, PreTrainedTokenizer]: """Decodes the NEMO file and returns the weights dict, llm config and tokenizer.""" @@ -67,6 +70,7 @@ def _nemo_llm_decode( args.load_checkpoints_on_gpu = load_checkpoints_on_gpu args.verbose = False args.decoder_type = decoder_type + args.use_parallel_embedding = use_parallel_embedding input_path = Path(args.in_file) if not input_path.exists(): @@ -299,8 +303,8 @@ def nemo_llm_model_to_model_config( LOGGER.info( f'''Resharing: Rank {tensorrt_llm.mpi_rank()} mapping: - tp_rank {parallel_state.get_tensor_model_parallel_rank()} -> {model_config.mapping.tp_rank}, - pp_rank {parallel_state.get_pipeline_model_parallel_rank()} -> {model_config.mapping.pp_rank}, + tp_rank {parallel_state.get_tensor_model_parallel_rank()} -> {model_config.mapping.tp_rank}, + pp_rank {parallel_state.get_pipeline_model_parallel_rank()} -> {model_config.mapping.pp_rank}, tp_group {model_config.mapping.tp_group}''' ) @@ -323,3 +327,141 @@ def nemo_llm_model_to_model_config( model_config.lm_head.weight = lm_head_weight return [model_config] + +def nemo_to_trtllm( + in_file: str, + decoder_type: str, + nemo_export_dir: Union[str, Path], + dtype: str = "bfloat16", + tensor_parallel_size: int = 1, + pipeline_parallel_size: int = 1, + use_parallel_embedding: bool = False, + save_nemo_model_config: bool = False, +) -> Tuple[List[ModelConfig], PreTrainedTokenizer]: + """Converts the NEMO file and construct the `ModelConfig` before tensorrt_llm deployment.""" + dtype_str = dtype + + weights_dict, llm_model_config, tokenizer = _nemo_llm_decode( + in_file=in_file, + out_dir=nemo_export_dir, + tensor_parallelism=tensor_parallel_size, + processes=1, + storage_type=dtype_str, + use_parallel_embedding=use_parallel_embedding, + load_checkpoints_on_gpu=False, + decoder_type=decoder_type, + save_nemo_model_config=save_nemo_model_config, + ) + + world_size = tensor_parallel_size*pipeline_parallel_size + + lm_head_weight = weights_dict["lm_head.weight"] + + vocab_size = weights_dict["transformer.vocab_embedding.weight"].shape[0] + vocab_size_padded = pad_vocab_size(vocab_size, tensor_parallel_size) + + if vocab_size_padded != vocab_size: + pad_width = vocab_size_padded - vocab_size + lm_head_weight = np.pad( + lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0 + ) + + config = { + 'architecture': + 'GPTForCausalLM', + 'dtype': + dtype_str, + 'num_hidden_layers': + llm_model_config.n_layer, + 'num_attention_heads': + llm_model_config.n_head, + 'num_key_value_heads': + llm_model_config.num_kv_heads, + 'hidden_size': + llm_model_config.n_embd, + 'intermediate_size': + llm_model_config.intermediate_size, + 'norm_epsilon': + llm_model_config.layer_norm_epsilon, + 'vocab_size': + vocab_size_padded, + 'position_embedding_type': + "rope_gpt_neox" if llm_model_config.position_embedding_type == "rope" else "learned_absolute", + 'max_position_embeddings': + llm_model_config.n_positions, + 'hidden_act': + llm_model_config.activation_function, + 'use_parallel_embedding': + use_parallel_embedding, + 'embedding_sharding_dim': + 0, + 'share_embedding_table': + False, + 'quantization': { + 'quant_algo': None, + 'kv_cache_quant_algo': None, + }, + 'mapping': { + 
'world_size': world_size, + 'tp_size': tensor_parallel_size, + 'pp_size': pipeline_parallel_size, + }, + 'bias': + llm_model_config.bias, + 'apply_query_key_layer_scaling': + False, + 'rotary_pct': + llm_model_config.rotary_pct, + } + + with open(os.path.join(nemo_export_dir, 'config.json'), 'w') as f: + json.dump(config, f, indent=4) + + model_configs = [] + for i in range(world_size): + weights_dict_local = weights_dict.copy() + + mapping = tensorrt_llm.Mapping( + world_size=world_size, + rank=i, + tp_size=tensor_parallel_size, + pp_size=pipeline_parallel_size + ) + + embedding_weight = np.ascontiguousarray( + split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank) + ) if use_parallel_embedding else weights_dict["transformer.vocab_embedding.weight"] + + weights_dict_local["transformer.vocab_embedding.weight"] = embedding_weight + + weights_dict_local["lm_head.weight"] = np.ascontiguousarray( + split(lm_head_weight, mapping.tp_size, mapping.tp_rank) + ) + + # save to file for trtllm-build, move to build_trtllm API for the future + from safetensors.numpy import save_file + save_file( + weights_dict_local, os.path.join(nemo_export_dir, f'rank{i}.safetensors') + ) + + # config = PretrainedConfig( + # architecture='GPTForCausalLM', + # dtype=dtype_str, + # num_hidden_layers=llm_model_config.get('n_layer'), + # num_attention_heads=llm_model_config.get('n_head'), + # num_key_value_heads=llm_model_config.get('n_kv_head'), + # hidden_size=llm_model_config.get('n_embd'), + # intermediate_size=llm_model_config.get('n_inner'), + # norm_epsilon=llm_model_config.get('layer_norm_epsilon'), + # vocab_size=tokenizer.vocab_size, + # position_embedding_type=llm_model_config.get('position_embedding_type'), + # max_position_embeddings=llm_model_config.get('n_positions'), + # hidden_act=llm_model_config.get('activation_function'), + # use_parallel_embedding=use_parallel_embedding, + # bias=llm_model_config.get('bias'), + # rotary_pct=llm_model_config.get('rotary_pct') + # ) + # config.mapping = mapping + # model_configs.append(config) + + return weights_dict, model_configs, tokenizer From fa12e7764f7194fac54d242ca939c5e526145f23 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Fri, 12 Apr 2024 19:22:06 -0700 Subject: [PATCH 02/35] Change model config --- nemo/export/trt_llm/nemo/nemo_ckpt_convert.py | 2 +- nemo/export/trt_llm/nemo_utils.py | 24 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py index e58855d49914..1460d7e80811 100644 --- a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py +++ b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py @@ -321,7 +321,7 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): with config_path.open("w") as config_file: config.write(config_file) - return weights_dict, llm_config, tokenizer + return weights_dict, nemo_model_config, tokenizer @torch.no_grad() diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 7f757e42dee6..e13fac339fda 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -341,7 +341,7 @@ def nemo_to_trtllm( """Converts the NEMO file and construct the `ModelConfig` before tensorrt_llm deployment.""" dtype_str = dtype - weights_dict, llm_model_config, tokenizer = _nemo_llm_decode( + weights_dict, nemo_model_config, tokenizer = _nemo_llm_decode( in_file=in_file, out_dir=nemo_export_dir, 
tensor_parallelism=tensor_parallel_size, @@ -372,25 +372,25 @@ def nemo_to_trtllm( 'dtype': dtype_str, 'num_hidden_layers': - llm_model_config.n_layer, + nemo_model_config.get('num_layers'), 'num_attention_heads': - llm_model_config.n_head, + nemo_model_config.get('num_attention_heads'), 'num_key_value_heads': - llm_model_config.num_kv_heads, + nemo_model_config.get('num_query_groups'), 'hidden_size': - llm_model_config.n_embd, + nemo_model_config.get('hidden_size'), 'intermediate_size': - llm_model_config.intermediate_size, + nemo_model_config.get('ffn_hidden_size'), 'norm_epsilon': - llm_model_config.layer_norm_epsilon, + nemo_model_config.get('layernorm_epsilon'), 'vocab_size': vocab_size_padded, 'position_embedding_type': - "rope_gpt_neox" if llm_model_config.position_embedding_type == "rope" else "learned_absolute", + "rope_gpt_neox" if nemo_model_config.get('position_embedding_type') == "rope" else "learned_absolute", 'max_position_embeddings': - llm_model_config.n_positions, + nemo_model_config.get('max_position_embeddings'), 'hidden_act': - llm_model_config.activation_function, + nemo_model_config.get('activation'), 'use_parallel_embedding': use_parallel_embedding, 'embedding_sharding_dim': @@ -407,11 +407,11 @@ def nemo_to_trtllm( 'pp_size': pipeline_parallel_size, }, 'bias': - llm_model_config.bias, + nemo_model_config.get('bias'), 'apply_query_key_layer_scaling': False, 'rotary_pct': - llm_model_config.rotary_pct, + nemo_model_config.get('rotary_percentage'), } with open(os.path.join(nemo_export_dir, 'config.json'), 'w') as f: From 4c715aea51a272d6ea8c12012300d2b9fa50c23f Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Tue, 16 Apr 2024 12:58:29 -0700 Subject: [PATCH 03/35] fix no query_group --- nemo/export/trt_llm/nemo_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index e13fac339fda..68f360d463e5 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -376,7 +376,7 @@ def nemo_to_trtllm( 'num_attention_heads': nemo_model_config.get('num_attention_heads'), 'num_key_value_heads': - nemo_model_config.get('num_query_groups'), + nemo_model_config.get('num_query_groups', nemo_model_config['num_attention_heads']), 'hidden_size': nemo_model_config.get('hidden_size'), 'intermediate_size': From 6005519f62a286eaeeefdd6f74abad8ba8e8b325 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Wed, 17 Apr 2024 18:44:27 -0700 Subject: [PATCH 04/35] Using build API --- nemo/export/tensorrt_llm.py | 32 +++++++---- nemo/export/trt_llm/nemo/nemo_ckpt_convert.py | 17 ------ nemo/export/trt_llm/nemo_utils.py | 47 +++++----------- nemo/export/trt_llm/tensorrt_llm_build.py | 54 ++++++++++++++++++- 4 files changed, 87 insertions(+), 63 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index c504a8364382..a77d7a915dd1 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -29,9 +29,10 @@ from nemo.deploy import ITritonDeployable from nemo.export.trt_llm.model_config_trt import model_config_to_tensorrt_llm from nemo.export.trt_llm.nemo.nemo_ckpt_convert import build_tokenizer -from nemo.export.trt_llm.nemo_utils import get_tokenzier, nemo_llm_model_to_model_config, nemo_llm_to_model_config, nemo_to_trtllm +from nemo.export.trt_llm.nemo_utils import get_tokenzier, nemo_llm_model_to_model_config, nemo_llm_to_model_config, nemo_to_trtllm_config from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load, 
load_refit from nemo.export.trt_llm.utils import is_nemo_file, unpack_nemo_ckpt +from nemo.export.trt_llm.tensorrt_llm_build import build_and_save_engine use_deploy = True try: @@ -262,7 +263,7 @@ def convert_to_tllm_file( tensor_parallel_size = 1 pipeline_parallel_size = n_gpus - weight_dict, model_configs, self.tokenizer = nemo_to_trtllm( + weights_dicts, model_configs, self.tokenizer = nemo_to_trtllm_config( in_file=nemo_checkpoint_path, decoder_type=model_type, dtype=dtype, @@ -273,15 +274,26 @@ def convert_to_tllm_file( save_nemo_model_config=save_nemo_model_config, ) - # tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") - # if os.path.exists(tokenizer_path): - # shutil.copy(tokenizer_path, self.model_dir) - # else: - # self.tokenizer.save_pretrained(os.path.join(self.model_dir, 'huggingface_tokenizer')) + for weight_dict, model_config in zip(weights_dicts, model_configs): + build_and_save_engine( + max_input_len=max_input_token, + max_output_len=max_output_token, + max_batch_size=max_batch_size, + model_config=model_config, + model_weights=weight_dict, + model_dir=self.model_dir, + # trt_model_type=trt_model_type + ) + + tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") + if os.path.exists(tokenizer_path): + shutil.copy(tokenizer_path, self.model_dir) + else: + self.tokenizer.save_pretrained(os.path.join(self.model_dir, 'huggingface_tokenizer')) - # nemo_model_config = os.path.join(nemo_export_dir, "model_config.yaml") - # if os.path.exists(nemo_model_config): - # shutil.copy(nemo_model_config, self.model_dir) + nemo_model_config = os.path.join(nemo_export_dir, "model_config.yaml") + if os.path.exists(nemo_model_config): + shutil.copy(nemo_model_config, self.model_dir) def build( self, diff --git a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py index 1460d7e80811..83c3f84a9b46 100644 --- a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py +++ b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py @@ -292,7 +292,6 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): model_level_weights[key] = np.concatenate(values, axis=0) weights_dict[key] = model_level_weights[key] - vocab_size = model_level_weights["transformer.vocab_embedding.weight"].shape[0] if nemo_model_config["tokenizer"].get("library", None) == "huggingface": tokenizer = AutoTokenizer.from_pretrained( @@ -305,22 +304,6 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): tokenizer_config["model"] = os.path.join(out_dir, "tokenizer.model") tokenizer = build_tokenizer(tokenizer_config) - llm_config = nemo_to_llm_config( - nemo_model_config, vocab_size, tokenizer.eos_token_id, tokenizer.bos_token_id, args.decoder_type, - ) - - llm_config.is_mcore = is_mcore - - config = configparser.ConfigParser() - decoder_name_dict = {"llama": "llama", "falcon": "falcon"} - model_name = decoder_name_dict[args.decoder_type] if args.decoder_type in decoder_name_dict else "gpt" - - config[model_name] = {k: str(v) for k, v in vars(llm_config).items()} - config[model_name]["storage_dtype"] = args.storage_type - config_path = out_dir / "config.ini" - with config_path.open("w") as config_file: - config.write(config_file) - return weights_dict, nemo_model_config, tokenizer diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 68f360d463e5..6f67d96a2e75 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -328,7 +328,7 @@ def nemo_llm_model_to_model_config( return [model_config] -def 
nemo_to_trtllm( +def nemo_to_trtllm_config( in_file: str, decoder_type: str, nemo_export_dir: Union[str, Path], @@ -401,23 +401,20 @@ def nemo_to_trtllm( 'quant_algo': None, 'kv_cache_quant_algo': None, }, - 'mapping': { - 'world_size': world_size, - 'tp_size': tensor_parallel_size, - 'pp_size': pipeline_parallel_size, - }, 'bias': nemo_model_config.get('bias'), 'apply_query_key_layer_scaling': False, 'rotary_pct': nemo_model_config.get('rotary_percentage'), + 'logits_dtype': 'float32', + 'world_size': world_size, + 'tp_size': tensor_parallel_size, + 'pp_size': pipeline_parallel_size, } - with open(os.path.join(nemo_export_dir, 'config.json'), 'w') as f: - json.dump(config, f, indent=4) - model_configs = [] + weights_dicts = [] for i in range(world_size): weights_dict_local = weights_dict.copy() @@ -438,30 +435,10 @@ def nemo_to_trtllm( split(lm_head_weight, mapping.tp_size, mapping.tp_rank) ) - # save to file for trtllm-build, move to build_trtllm API for the future - from safetensors.numpy import save_file - save_file( - weights_dict_local, os.path.join(nemo_export_dir, f'rank{i}.safetensors') - ) + from tensorrt_llm.models.modeling_utils import PretrainedConfig + config = PretrainedConfig(**config) + config.mapping = mapping + model_configs.append(config) + weights_dicts.append(weights_dict_local) - # config = PretrainedConfig( - # architecture='GPTForCausalLM', - # dtype=dtype_str, - # num_hidden_layers=llm_model_config.get('n_layer'), - # num_attention_heads=llm_model_config.get('n_head'), - # num_key_value_heads=llm_model_config.get('n_kv_head'), - # hidden_size=llm_model_config.get('n_embd'), - # intermediate_size=llm_model_config.get('n_inner'), - # norm_epsilon=llm_model_config.get('layer_norm_epsilon'), - # vocab_size=tokenizer.vocab_size, - # position_embedding_type=llm_model_config.get('position_embedding_type'), - # max_position_embeddings=llm_model_config.get('n_positions'), - # hidden_act=llm_model_config.get('activation_function'), - # use_parallel_embedding=use_parallel_embedding, - # bias=llm_model_config.get('bias'), - # rotary_pct=llm_model_config.get('rotary_pct') - # ) - # config.mapping = mapping - # model_configs.append(config) - - return weights_dict, model_configs, tokenizer + return weights_dicts, model_configs, tokenizer diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 3ad27a2eb9a6..bb3f942ede66 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -25,13 +25,17 @@ import torch from tensorrt_llm import str_dtype_to_trt from tensorrt_llm._utils import np_dtype_to_trt -from tensorrt_llm.builder import Builder +from tensorrt_llm.builder import Builder, BuildConfig from tensorrt_llm.logger import logger from tensorrt_llm.models.modeling_utils import add_lora from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.commands.build import build as build_trtllm +from tensorrt_llm.plugin import PluginConfig +from tensorrt_llm.models.modeling_utils import optimize_model, preprocess_weights + MODEL_NAME = "NeMo" LOGGER = logging.getLogger("NeMo") @@ -348,3 +352,51 @@ def build( tok = time.time() t = time.strftime("%H:%M:%S", time.gmtime(tok - tik)) logger.info(f"Total time of building all {args.mapping.world_size} engines: {t}") + +def build_and_save_engine( + max_input_len=1024, + max_output_len=1024, + max_batch_size=4, + model_dir=None, + model_weights=None, 
+ model_config=None, + trt_model_type='GPTForCausalLM' +): + try: + model_cls = getattr(tensorrt_llm.models, trt_model_type) + except: + raise AttributeError(f"Could not find TRTLLM model type: {trt_model_type}!") + + str_dtype = model_config.dtype + plugin_config = PluginConfig() + plugin_config.set_gpt_attention_plugin(dtype=str_dtype) + plugin_config.set_gemm_plugin(dtype=str_dtype) + max_num_tokens = max_batch_size*max_input_len + + build_dict = { + 'max_input_len': max_input_len, + 'max_output_len': max_output_len, + 'max_batch_size': max_batch_size, + 'max_beam_width': 1, + 'max_num_tokens': max_num_tokens, + 'opt_num_tokens': None, + 'max_prompt_embedding_table_size': 0, + 'gather_context_logits': False, + 'gather_generation_logits': False, + 'strongly_typed': False, + 'builder_opt': None, + } + build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config) + + model = model_cls.from_config(model_config) + model = optimize_model( + model, + use_parallel_embedding=model_config.use_parallel_embedding, + share_embedding_table=model_config.share_embedding_table, + ) + preprocess_weights(model_weights, model_config) + model.load(model_weights) + engine = build_trtllm(model, build_config) + engine.save(model_dir) + + return engine \ No newline at end of file From 38b3b41be4c8ecda22f007e032bad45adf45cddb Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Thu, 18 Apr 2024 18:56:26 -0700 Subject: [PATCH 05/35] Change export to new API --- nemo/export/tensorrt_llm.py | 32 ++++++++++------------- nemo/export/trt_llm/decoder/__init__.py | 6 +++++ nemo/export/trt_llm/nemo_utils.py | 11 ++++---- nemo/export/trt_llm/tensorrt_llm_build.py | 6 ++--- 4 files changed, 29 insertions(+), 26 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index a77d7a915dd1..b8b33a797df8 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -112,6 +112,7 @@ def export( max_output_token: int = 256, max_batch_size: int = 8, max_prompt_embedding_table_size=None, + use_parallel_embedding: bool = False, use_inflight_batching: bool = False, enable_context_fmha: bool = True, paged_kv_cache: bool = False, @@ -188,32 +189,27 @@ def export( tmp_dir = tempfile.TemporaryDirectory() nemo_export_dir = Path(tmp_dir.name) - model_configs, self.tokenizer = nemo_llm_to_model_config( + weights_dicts, model_configs, self.tokenizer = nemo_to_trtllm_config( in_file=nemo_checkpoint_path, decoder_type=model_type, dtype=dtype, tensor_parallel_size=tensor_parallel_size, pipeline_parallel_size=pipeline_parallel_size, + use_parallel_embedding=use_parallel_embedding, nemo_export_dir=nemo_export_dir, save_nemo_model_config=save_nemo_model_config, ) - model_config_to_tensorrt_llm( - model_configs, - self.model_dir, - world_size=tensor_parallel_size * pipeline_parallel_size, - max_input_len=max_input_token, - max_output_len=max_output_token, - max_batch_size=max_batch_size, - max_prompt_embedding_table_size=max_prompt_embedding_table_size, - use_inflight_batching=use_inflight_batching, - paged_kv_cache=paged_kv_cache, - enable_context_fmha=enable_context_fmha, - enable_multi_block_mode=enable_multi_block_mode, - use_lora_plugin=use_lora_plugin, - lora_target_modules=lora_target_modules, - max_lora_rank=max_lora_rank, - ) + for weight_dict, model_config in zip(weights_dicts, model_configs): + build_and_save_engine( + max_input_len=max_input_token, + max_output_len=max_output_token, + max_batch_size=max_batch_size, + model_config=model_config, + model_weights=weight_dict, + 
model_dir=self.model_dir, + model_type=model_type + ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") if os.path.exists(tokenizer_path): @@ -251,7 +247,7 @@ def convert_to_tllm_file( use_lora_plugin: str = None, lora_target_modules: List[str] = None, max_lora_rank: int = 64, - use_parallel_embedding: bool = False, + save_nemo_model_config: bool = False, ): nemo_export_dir = self.model_dir diff --git a/nemo/export/trt_llm/decoder/__init__.py b/nemo/export/trt_llm/decoder/__init__.py index 5fe749408cb9..c0cf41ee9607 100644 --- a/nemo/export/trt_llm/decoder/__init__.py +++ b/nemo/export/trt_llm/decoder/__init__.py @@ -40,6 +40,12 @@ DECODER_GEMMA: GemmaDecoderLayerConfigBuilder, } +DECODER_MODEL_TYPE = { + DECODER_GPT2: 'GPTForCausalLM', + DECODER_LLAMA: 'LLaMAForCausalLM', + DECODER_GEMMA: 'GemmaForCausalLM', +} + def build_decoder_layer_config(layer, decoder: str, dtype=trt.float16, rank=0, tensor_parallel=1): """Builds the decoder layer config with the input torch module.""" diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 6f67d96a2e75..485f03dcd98d 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -30,7 +30,8 @@ import tensorrt_llm from tensorrt_llm import str_dtype_to_trt from tensorrt_llm._utils import pad_vocab_size -from transformers import AutoTokenizer, LlamaConfig, PretrainedConfig, PreTrainedTokenizer +from transformers import AutoTokenizer, LlamaConfig, PreTrainedTokenizer +from tensorrt_llm.models.modeling_utils import PretrainedConfig from nemo.export.trt_llm.model_config import ( LAYERNORM_DEFAULT, @@ -45,6 +46,7 @@ from nemo.export.trt_llm.nemo.nemo import UnpackedNemoCheckpointDir, unpack_nemo_ckpt from nemo.export.trt_llm.nemo.nemo_ckpt_convert import build_tokenizer, convert_dist_checkpoint, convert_nemo_model from nemo.export.trt_llm.tensor_utils import get_tensor_from_dict, get_tensor_parallel_group, split +from nemo.export.trt_llm.decoder import DECODER_MODEL_TYPE LOGGER = logging.getLogger("NeMo") @@ -337,8 +339,8 @@ def nemo_to_trtllm_config( pipeline_parallel_size: int = 1, use_parallel_embedding: bool = False, save_nemo_model_config: bool = False, -) -> Tuple[List[ModelConfig], PreTrainedTokenizer]: - """Converts the NEMO file and construct the `ModelConfig` before tensorrt_llm deployment.""" +) -> Tuple[List[Dict], List[PretrainedConfig], PreTrainedTokenizer]: + """Converts the NEMO file and construct the `PretrainedConfig` before tensorrt_llm deployment.""" dtype_str = dtype weights_dict, nemo_model_config, tokenizer = _nemo_llm_decode( @@ -368,7 +370,7 @@ def nemo_to_trtllm_config( config = { 'architecture': - 'GPTForCausalLM', + DECODER_MODEL_TYPE[decoder_type], 'dtype': dtype_str, 'num_hidden_layers': @@ -435,7 +437,6 @@ def nemo_to_trtllm_config( split(lm_head_weight, mapping.tp_size, mapping.tp_rank) ) - from tensorrt_llm.models.modeling_utils import PretrainedConfig config = PretrainedConfig(**config) config.mapping = mapping model_configs.append(config) diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index bb3f942ede66..77992dd81506 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -360,12 +360,12 @@ def build_and_save_engine( model_dir=None, model_weights=None, model_config=None, - trt_model_type='GPTForCausalLM' + model_type='gpt' ): try: - model_cls = getattr(tensorrt_llm.models, trt_model_type) + model_cls = getattr(tensorrt_llm.models, 
model_config.architecture) except: - raise AttributeError(f"Could not find TRTLLM model type: {trt_model_type}!") + raise AttributeError(f"Could not find TRTLLM model type: {model_type}!") str_dtype = model_config.dtype plugin_config = PluginConfig() From ed409f850c37b791973a596abf6209c6df56ecba Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Thu, 18 Apr 2024 21:12:38 -0700 Subject: [PATCH 06/35] Update generate API --- nemo/export/tensorrt_llm.py | 65 ----------- nemo/export/trt_llm/decoder/__init__.py | 1 + nemo/export/trt_llm/tensorrt_llm_run.py | 147 ++++++++++++++---------- 3 files changed, 89 insertions(+), 124 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index b8b33a797df8..872b37fed70c 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -226,71 +226,6 @@ def export( if load_model: self._load() - def convert_to_tllm_file( - self, - nemo_checkpoint_path: str, - model_type: str, - delete_existing_files: bool = True, - n_gpus: int = 1, - tensor_parallel_size: int = None, - pipeline_parallel_size: int = None, - max_input_token: int = 256, - max_output_token: int = 256, - max_batch_size: int = 8, - max_prompt_embedding_table_size=None, - use_inflight_batching: bool = False, - enable_context_fmha: bool = True, - paged_kv_cache: bool = False, - dtype: str = "bfloat16", - load_model: bool = True, - enable_multi_block_mode: bool = False, - use_lora_plugin: str = None, - lora_target_modules: List[str] = None, - max_lora_rank: int = 64, - - save_nemo_model_config: bool = False, - ): - nemo_export_dir = self.model_dir - - if pipeline_parallel_size is None: - tensor_parallel_size = n_gpus - pipeline_parallel_size = 1 - elif tensor_parallel_size is None: - tensor_parallel_size = 1 - pipeline_parallel_size = n_gpus - - weights_dicts, model_configs, self.tokenizer = nemo_to_trtllm_config( - in_file=nemo_checkpoint_path, - decoder_type=model_type, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - pipeline_parallel_size=pipeline_parallel_size, - use_parallel_embedding=use_parallel_embedding, - nemo_export_dir=nemo_export_dir, - save_nemo_model_config=save_nemo_model_config, - ) - - for weight_dict, model_config in zip(weights_dicts, model_configs): - build_and_save_engine( - max_input_len=max_input_token, - max_output_len=max_output_token, - max_batch_size=max_batch_size, - model_config=model_config, - model_weights=weight_dict, - model_dir=self.model_dir, - # trt_model_type=trt_model_type - ) - - tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") - if os.path.exists(tokenizer_path): - shutil.copy(tokenizer_path, self.model_dir) - else: - self.tokenizer.save_pretrained(os.path.join(self.model_dir, 'huggingface_tokenizer')) - - nemo_model_config = os.path.join(nemo_export_dir, "model_config.yaml") - if os.path.exists(nemo_model_config): - shutil.copy(nemo_model_config, self.model_dir) - def build( self, nemo_model, diff --git a/nemo/export/trt_llm/decoder/__init__.py b/nemo/export/trt_llm/decoder/__init__.py index c0cf41ee9607..84e1ecf396c7 100644 --- a/nemo/export/trt_llm/decoder/__init__.py +++ b/nemo/export/trt_llm/decoder/__init__.py @@ -42,6 +42,7 @@ DECODER_MODEL_TYPE = { DECODER_GPT2: 'GPTForCausalLM', + DECODER_GPTNEXT: 'GPTForCausalLM', DECODER_LLAMA: 'LLaMAForCausalLM', DECODER_GEMMA: 'GemmaForCausalLM', } diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 1e24f4f207a4..9c640a8c6487 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ 
b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -27,6 +27,7 @@ from tensorrt_llm.lora_manager import LoraManager from tensorrt_llm.quantization import QuantMode from tensorrt_llm.runtime import ModelConfig, SamplingConfig +from tensorrt_llm.runtime import ModelRunner from transformers import PreTrainedTokenizer from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group @@ -55,7 +56,7 @@ class TensorrtLLMHostContext: class TensorrtLLMWorkerContext: """The MPI worker side context for TRT LLM inference.""" - decoder: tensorrt_llm.runtime.GenerationSession = None + decoder: ModelRunner = None sampling_config: SamplingConfig = None lora_manager: LoraManager = None max_batch_size: int = 0 @@ -131,39 +132,46 @@ def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_b model_config, world_size, tp_size, pp_size, dtype, max_input_len, max_batch_size = _read_config(config_path) runtime_rank = tensorrt_llm.mpi_rank() - assert runtime_rank < torch.cuda.device_count(), f"Rank {runtime_rank} out of bound" - runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank, tp_size=tp_size, pp_size=pp_size) - - torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - engine_name = get_engine_name(MODEL_NAME, dtype, tp_size, pp_size, runtime_rank) - serialize_path = os.path.join(engine_dir, engine_name) - logger.info(f"Reading from serialize path {serialize_path}") - with open(serialize_path, "rb") as f: - engine_buffer = f.read() - decoder = tensorrt_llm.runtime.GenerationSession( - model_config, engine_buffer, runtime_mapping, debug_mode=False + decoder = ModelRunner.from_dir( + engine_dir=engine_dir, + lora_ckpt_source=lora_ckpt_list, + rank=runtime_rank, + debug_mode=False ) + # runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank, tp_size=tp_size, pp_size=pp_size) + + # torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + # engine_name = get_engine_name(MODEL_NAME, dtype, tp_size, pp_size, runtime_rank) + # serialize_path = os.path.join(engine_dir, engine_name) + # logger.info(f"Reading from serialize path {serialize_path}") + + # with open(serialize_path, "rb") as f: + # engine_buffer = f.read() + # decoder = tensorrt_llm.runtime.GenerationSession( + # model_config, engine_buffer, runtime_mapping, debug_mode=False + # ) + sampling_config = SamplingConfig( end_id=tokenizer.eos_token_id, pad_id=tokenizer.eos_token_id, num_beams=num_beams ) - if decoder.use_lora_plugin: - lora_manager = LoraManager() - if lora_ckpt_list is not None: - lora_manager.load_from_nemo( - model_files=lora_ckpt_list, model_config=model_config, runtime_mapping=runtime_mapping, - ) - else: - lora_manager = None + # if decoder.use_lora_plugin: + # lora_manager = LoraManager() + # if lora_ckpt_list is not None: + # lora_manager.load_from_nemo( + # model_files=lora_ckpt_list, model_config=model_config, runtime_mapping=runtime_mapping, + # ) + # else: + # lora_manager = None # Initialize the global context so it can be used during `run` API. 
global tensorrt_llm_worker_context tensorrt_llm_worker_context.decoder = decoder tensorrt_llm_worker_context.sampling_config = sampling_config - tensorrt_llm_worker_context.lora_manager = lora_manager + # tensorrt_llm_worker_context.lora_manager = lora_manager tensorrt_llm_worker_context.max_batch_size = max_batch_size tensorrt_llm_worker_context.max_input_len = max_input_len @@ -200,7 +208,6 @@ def _forward( decoder = tensorrt_llm_worker_context.decoder assert decoder is not None, "Invalid worker context, decoder is not loaded." sampling_config = tensorrt_llm_worker_context.sampling_config - lora_manager = tensorrt_llm_worker_context.lora_manager max_batch_size = tensorrt_llm_worker_context.max_batch_size max_input_len = tensorrt_llm_worker_context.max_input_len @@ -210,60 +217,82 @@ def _forward( max_length = max(input_lengths) assert max_length <= max_input_len, f"input length {max_length} exceedng max input length {max_input_len}" pad_id = sampling_config.pad_id + end_id = sampling_config.end_id + num_beams = sampling_config.num_beams - if decoder.remove_input_padding: - line_encoded = torch.concat(input_tensors).cuda() - else: - line_encoded = torch.nested.to_padded_tensor( - torch.nested.nested_tensor(input_tensors, dtype=torch.int32), pad_id - ).cuda() + # if decoder.remove_input_padding: + # line_encoded = torch.concat(input_tensors).cuda() + # else: + # line_encoded = torch.nested.to_padded_tensor( + # torch.nested.nested_tensor(input_tensors, dtype=torch.int32), pad_id + # ).cuda() - input_lengths = torch.tensor(input_lengths, dtype=torch.int32).cuda() + # input_lengths = torch.tensor(input_lengths, dtype=torch.int32).cuda() - if prompt_table is None: - ptuning_args = [] - else: - if task_vocab_size is None: - raise Exception("task_vocab_size cannot be None") + # if prompt_table is None: + # ptuning_args = [] + # else: + # if task_vocab_size is None: + # raise Exception("task_vocab_size cannot be None") - task_vocab_size = torch.tensor([task_vocab_size], dtype=torch.int32, device="cuda") - task_ids = torch.tensor(task_ids, dtype=torch.int32, device="cuda") - prompt_table = prompt_table.cuda() - ptuning_args = [prompt_table, task_ids, task_vocab_size] + # task_vocab_size = torch.tensor([task_vocab_size], dtype=torch.int32, device="cuda") + # task_ids = torch.tensor(task_ids, dtype=torch.int32, device="cuda") + # prompt_table = prompt_table.cuda() + # ptuning_args = [prompt_table, task_ids, task_vocab_size] with torch.no_grad(): - sampling_config.top_k = top_k - sampling_config.top_p = top_p - sampling_config.temperature = temperature - for key, param in sampling_kwargs.items(): - # set any additional SamplingConfig kwargs - setattr(sampling_config, key, param) - - decoder.setup( - batch_size, - max_context_length=max_length, + # sampling_config.top_k = top_k + # sampling_config.top_p = top_p + # sampling_config.temperature = temperature + # for key, param in sampling_kwargs.items(): + # # set any additional SamplingConfig kwargs + # setattr(sampling_config, key, param) + + # decoder.setup( + # batch_size, + # max_context_length=max_length, + # max_new_tokens=max_output_len, + # lora_manager=lora_manager, + # lora_uids=lora_uids, + # ) + + # outputs = decoder.decode( + # line_encoded, + # input_lengths, + # sampling_config, + # *ptuning_args, + # stop_words_list=stop_words_list, + # bad_words_list=bad_words_list, + # no_repeat_ngram_size=no_repeat_ngram_size, + # streaming=streaming, + # output_sequence_lengths=True, + # return_dict=True, + # ) + + outputs = decoder.generate( + 
input_tensors, max_new_tokens=max_output_len, - lora_manager=lora_manager, - lora_uids=lora_uids, - ) - - outputs = decoder.decode( - line_encoded, - input_lengths, - sampling_config, - *ptuning_args, + end_id=end_id, + pad_id=pad_id, + temperature=temperature, + top_k=top_k, + top_p=top_p, + num_beams=num_beams, stop_words_list=stop_words_list, bad_words_list=bad_words_list, - no_repeat_ngram_size=no_repeat_ngram_size, + lora_uids=lora_uids, + prompt_table=prompt_table, + prompt_tasks=task_ids, streaming=streaming, output_sequence_lengths=True, return_dict=True, ) + torch.cuda.synchronize() runtime_rank = tensorrt_llm.mpi_rank() if runtime_rank == 0 or multiprocessed_env: - return outputs, decoder.log_probs + return outputs else: return None From a472f017c874430a4004c98d108f48dc2a7f44fe Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Thu, 18 Apr 2024 22:35:30 -0700 Subject: [PATCH 07/35] Fix runtime config --- nemo/export/trt_llm/nemo_utils.py | 6 +++--- nemo/export/trt_llm/tensorrt_llm_run.py | 22 ++++++++++++++-------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 485f03dcd98d..0519dd81c92f 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -437,9 +437,9 @@ def nemo_to_trtllm_config( split(lm_head_weight, mapping.tp_size, mapping.tp_rank) ) - config = PretrainedConfig(**config) - config.mapping = mapping - model_configs.append(config) + model_config = PretrainedConfig(**config) + model_config.mapping = mapping + model_configs.append(model_config) weights_dicts.append(weights_dict_local) return weights_dicts, model_configs, tokenizer diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 9c640a8c6487..9ef1335a9066 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -129,7 +129,13 @@ def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_b engine_dir = Path(engine_dir) config_path = engine_dir / "config.json" - model_config, world_size, tp_size, pp_size, dtype, max_input_len, max_batch_size = _read_config(config_path) + #model_config, world_size, tp_size, pp_size, dtype, max_input_len, max_batch_size = _read_config(config_path) + + with open(config_path, "r") as f: + config = json.load(f) + + max_batch_size = config["build_config"]["max_batch_size"] + max_input_len = config["build_config"]["max_input_len"] runtime_rank = tensorrt_llm.mpi_rank() assert runtime_rank < torch.cuda.device_count(), f"Rank {runtime_rank} out of bound" @@ -312,7 +318,7 @@ def load( config_path = os.path.join(engine_dir, "config.json") with open(config_path, "r") as f: config = json.load(f) - world_size = config["builder_config"]["world_size"] + world_size = config["pretrained_config"]["mapping"]["world_size"] if world_size == 1: _load(tokenizer, engine_dir, lora_ckpt_list, num_beams) executor = None @@ -325,9 +331,9 @@ def load( for future in futures: future.result() - max_batch_size = config["builder_config"]["max_batch_size"] - max_input_len = config["builder_config"]["max_input_len"] - add_bos = config["builder_config"]["add_bos"] + max_batch_size = config["build_config"]["max_batch_size"] + max_input_len = config["build_config"]["max_input_len"] + #add_bos = config["builder_config"]["add_bos"] return TensorrtLLMHostContext( executor=executor, @@ -335,7 +341,7 @@ def load( tokenizer=tokenizer, max_batch_size=max_batch_size, max_input_len=max_input_len, - 
add_bos=add_bos, + add_bos=False, ) @@ -560,7 +566,7 @@ def generate( if no_repeat_ngram_size is not None: no_repeat_ngram_size = torch.IntTensor(no_repeat_ngram_size).to(torch.cuda.current_device()) - outputs, log_probs = forward( + outputs = forward( input_tensors=input_tensors, max_output_len=max_output_len, host_context=host_context, @@ -643,7 +649,7 @@ def generate_streaming( if no_repeat_ngram_size is not None: no_repeat_ngram_size = torch.IntTensor(no_repeat_ngram_size).to(torch.cuda.current_device()) - outputs, log_probs = forward( + outputs = forward( input_tensors=input_tensors, max_output_len=max_output_len, host_context=host_context, From a1c477dfa61033ad30067befeae13c377a791f87 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Fri, 19 Apr 2024 16:15:00 -0700 Subject: [PATCH 08/35] Fix for llama --- nemo/export/trt_llm/nemo_utils.py | 3 ++- nemo/export/trt_llm/tensorrt_llm_run.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 0519dd81c92f..3202a334e146 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -30,6 +30,7 @@ import tensorrt_llm from tensorrt_llm import str_dtype_to_trt from tensorrt_llm._utils import pad_vocab_size +from tensorrt_llm.functional import non_gated_version from transformers import AutoTokenizer, LlamaConfig, PreTrainedTokenizer from tensorrt_llm.models.modeling_utils import PretrainedConfig @@ -392,7 +393,7 @@ def nemo_to_trtllm_config( 'max_position_embeddings': nemo_model_config.get('max_position_embeddings'), 'hidden_act': - nemo_model_config.get('activation'), + non_gated_version(nemo_model_config.get('activation')), 'use_parallel_embedding': use_parallel_embedding, 'embedding_sharding_dim': diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 9ef1335a9066..d02acf85c82d 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -142,7 +142,8 @@ def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_b decoder = ModelRunner.from_dir( engine_dir=engine_dir, - lora_ckpt_source=lora_ckpt_list, + lora_dir=lora_ckpt_list, + lora_ckpt_source="nemo", rank=runtime_rank, debug_mode=False ) From b43f8488dd0aae41f2558d13c71a3c0e7ddf73b0 Mon Sep 17 00:00:00 2001 From: abharwani Date: Mon, 22 Apr 2024 10:50:39 -0700 Subject: [PATCH 09/35] Fix for ptuning --- nemo/export/tensorrt_llm.py | 8 ++++---- nemo/export/trt_llm/utils.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 872b37fed70c..31f94063c0ee 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -448,7 +448,7 @@ def get_hidden_size(self): if self.config is None: return None else: - return self.config["builder_config"]["hidden_size"] + return self.config["pretrained_config"]["hidden_size"] @property def get_triton_input(self): @@ -651,15 +651,15 @@ def _get_prompt_embedding_table( raise TypeError(prompt_embeddings_checkpoint_path + " is not a nemo file.") prompt_embeddings_table = self._get_prompt_embedding_table_ckpt(prompt_embeddings_checkpoint_path) - dtype = self.config['builder_config']['precision'] + dtype = self.config['pretrained_config']['dtype'] prompt_embeddings_table = prompt_embeddings_table.to( dtype=tensorrt_llm._utils.str_dtype_to_torch(dtype) ).cuda() - if prompt_embeddings_table.size(dim=1) != self.config["builder_config"]["hidden_size"]: + 
if prompt_embeddings_table.size(dim=1) != self.config["pretrained_config"]["hidden_size"]: raise Exception( "Hidden dimension of the model is {0} and does not match with the dimension of the prompt table.".format( - self.config["builder_config"]["hidden_size"] + self.config["pretrained_config"]["hidden_size"] ) ) diff --git a/nemo/export/trt_llm/utils.py b/nemo/export/trt_llm/utils.py index f5894644e3ba..17b1a81cb46b 100644 --- a/nemo/export/trt_llm/utils.py +++ b/nemo/export/trt_llm/utils.py @@ -22,6 +22,7 @@ import numpy as np import torch import yaml +from pathlib import Path log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s" logging.basicConfig(format=log_format) From a827421be97eb849483afff2a7da57879eee281b Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Mon, 22 Apr 2024 18:14:35 -0700 Subject: [PATCH 10/35] Fix TP issue --- nemo/export/trt_llm/nemo/convert.py | 4 ++-- nemo/export/trt_llm/nemo_utils.py | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/nemo/export/trt_llm/nemo/convert.py b/nemo/export/trt_llm/nemo/convert.py index f94503481952..946ad88fe5d1 100644 --- a/nemo/export/trt_llm/nemo/convert.py +++ b/nemo/export/trt_llm/nemo/convert.py @@ -39,12 +39,12 @@ def gpu_map_location(storage, loc): def save_val(val, dir, key, tp_num=None): - suffix = "bin" if tp_num is None else f"{tp_num}.bin" + suffix = "" if tp_num is None else f".{tp_num}.bin" # Transpose linear layer weights to the correct shape. if len(val.shape) >= 2: val = np.ascontiguousarray(np.transpose(val.reshape(val.shape[0], -1), [1, 0])) global weights_dict - weights_dict[key] = val + weights_dict[f"{key}{suffix}"] = val # weights_dict[f"model.{key}.{suffix}"] = val diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 3202a334e146..6195f128604d 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -419,7 +419,15 @@ def nemo_to_trtllm_config( model_configs = [] weights_dicts = [] for i in range(world_size): - weights_dict_local = weights_dict.copy() + weights_dict_local = {} + for k, v in weights_dict.items(): + if k.endswith(".bin"): # TP split + if k.endswith(f"{i}.bin"): + new_key = k.replace(f".{i}.bin","") + weights_dict_local[new_key] = v + else: + weights_dict_local[k] = v + mapping = tensorrt_llm.Mapping( world_size=world_size, From 2b38efb6009199f88d9f47fb6c308a8f8851cb01 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Mon, 22 Apr 2024 19:00:30 -0700 Subject: [PATCH 11/35] Change TP rank for building weight dict --- nemo/export/trt_llm/nemo_utils.py | 19 +++++++++---------- nemo/export/trt_llm/tensorrt_llm_build.py | 3 ++- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 6195f128604d..639afa3125e9 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -419,16 +419,6 @@ def nemo_to_trtllm_config( model_configs = [] weights_dicts = [] for i in range(world_size): - weights_dict_local = {} - for k, v in weights_dict.items(): - if k.endswith(".bin"): # TP split - if k.endswith(f"{i}.bin"): - new_key = k.replace(f".{i}.bin","") - weights_dict_local[new_key] = v - else: - weights_dict_local[k] = v - - mapping = tensorrt_llm.Mapping( world_size=world_size, rank=i, @@ -436,6 +426,15 @@ def nemo_to_trtllm_config( pp_size=pipeline_parallel_size ) + weights_dict_local = {} + for k, v in weights_dict.items(): + if k.endswith(".bin"): # TP split + if k.endswith(f"{mapping.tp_rank}.bin"): 
+ new_key = k.replace(f".{mapping.tp_rank}.bin","") + weights_dict_local[new_key] = v + else: + weights_dict_local[k] = v + embedding_weight = np.ascontiguousarray( split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank) ) if use_parallel_embedding else weights_dict["transformer.vocab_embedding.weight"] diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 77992dd81506..9fd48913d9b4 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -367,6 +367,7 @@ def build_and_save_engine( except: raise AttributeError(f"Could not find TRTLLM model type: {model_type}!") + logger.set_level("info") str_dtype = model_config.dtype plugin_config = PluginConfig() plugin_config.set_gpt_attention_plugin(dtype=str_dtype) @@ -399,4 +400,4 @@ def build_and_save_engine( engine = build_trtllm(model, build_config) engine.save(model_dir) - return engine \ No newline at end of file + return engine From 64dd6313327f9da86833a13e4a495c555a40bf0b Mon Sep 17 00:00:00 2001 From: abharwani Date: Tue, 23 Apr 2024 01:35:36 -0700 Subject: [PATCH 12/35] Add lora config --- nemo/export/tensorrt_llm.py | 6 +++++- nemo/export/trt_llm/tensorrt_llm_build.py | 18 +++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 31f94063c0ee..6ceaa5fcf657 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -208,7 +208,11 @@ def export( model_config=model_config, model_weights=weight_dict, model_dir=self.model_dir, - model_type=model_type + model_type=model_type, + lora_ckpt_list=self.lora_ckpt_list, + use_lora_plugin=use_lora_plugin, + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 9fd48913d9b4..b8a26b736a60 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -34,6 +34,7 @@ from tensorrt_llm.commands.build import build as build_trtllm from tensorrt_llm.plugin import PluginConfig +from tensorrt_llm.lora_manager import LoraBuildConfig from tensorrt_llm.models.modeling_utils import optimize_model, preprocess_weights MODEL_NAME = "NeMo" @@ -360,7 +361,11 @@ def build_and_save_engine( model_dir=None, model_weights=None, model_config=None, - model_type='gpt' + model_type='gpt', + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank=64, + lora_target_modules=None ): try: model_cls = getattr(tensorrt_llm.models, model_config.architecture) @@ -374,6 +379,8 @@ def build_and_save_engine( plugin_config.set_gemm_plugin(dtype=str_dtype) max_num_tokens = max_batch_size*max_input_len + + build_dict = { 'max_input_len': max_input_len, 'max_output_len': max_output_len, @@ -389,6 +396,15 @@ def build_and_save_engine( } build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config) + if use_lora_plugin is not None: + build_config.plugin_config.set_lora_plugin(use_lora_plugin) + lora_config = LoraBuildConfig( + lora_dir=lora_ckpt_list, + lora_ckpt_source='nemo', + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules) + build_config.lora_config = lora_config + model = model_cls.from_config(model_config) model = optimize_model( model, From cdb738951bc29afc8669644600de2f45b15d4c77 Mon Sep 17 00:00:00 2001 From: abharwani Date: Tue, 
23 Apr 2024 04:27:33 -0700 Subject: [PATCH 13/35] Add prompt embedding table config --- nemo/export/tensorrt_llm.py | 3 ++- nemo/export/trt_llm/tensorrt_llm_build.py | 5 +++-- nemo/export/trt_llm/tensorrt_llm_run.py | 4 +++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 6ceaa5fcf657..2785bc567efd 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -212,7 +212,8 @@ def export( lora_ckpt_list=self.lora_ckpt_list, use_lora_plugin=use_lora_plugin, max_lora_rank=max_lora_rank, - lora_target_modules=lora_target_modules + lora_target_modules=lora_target_modules, + max_prompt_embedding_table_size=max_prompt_embedding_table_size ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index b8a26b736a60..b568ad5fc766 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -365,7 +365,8 @@ def build_and_save_engine( lora_ckpt_list=None, use_lora_plugin=None, max_lora_rank=64, - lora_target_modules=None + lora_target_modules=None, + max_prompt_embedding_table_size=0 ): try: model_cls = getattr(tensorrt_llm.models, model_config.architecture) @@ -388,7 +389,7 @@ def build_and_save_engine( 'max_beam_width': 1, 'max_num_tokens': max_num_tokens, 'opt_num_tokens': None, - 'max_prompt_embedding_table_size': 0, + 'max_prompt_embedding_table_size': max_prompt_embedding_table_size, 'gather_context_logits': False, 'gather_generation_logits': False, 'strongly_typed': False, diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index d02acf85c82d..3e033cc85cb2 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -276,6 +276,8 @@ def _forward( # return_dict=True, # ) + prompt_tasks = ",".join(str(task) for task in task_ids) + outputs = decoder.generate( input_tensors, max_new_tokens=max_output_len, @@ -289,7 +291,7 @@ def _forward( bad_words_list=bad_words_list, lora_uids=lora_uids, prompt_table=prompt_table, - prompt_tasks=task_ids, + prompt_tasks=prompt_tasks, streaming=streaming, output_sequence_lengths=True, return_dict=True, From 487eb26003c99644bcb4e3a1845e15b8aae9a904 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Tue, 23 Apr 2024 15:34:01 -0700 Subject: [PATCH 14/35] Fix PP issue --- nemo/export/trt_llm/nemo_utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 639afa3125e9..91cd6272fbcc 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -418,6 +418,7 @@ def nemo_to_trtllm_config( model_configs = [] weights_dicts = [] + num_layers = nemo_model_config.get('num_layers') for i in range(world_size): mapping = tensorrt_llm.Mapping( world_size=world_size, rank=i, tp_size=tensor_parallel_size, pp_size=pipeline_parallel_size ) + layers_range = mapping.pp_layers(num_layers) weights_dict_local = {} for k, v in weights_dict.items(): if k.endswith(".bin"): # TP split if k.endswith(f"{mapping.tp_rank}.bin"): new_key = k.replace(f".{mapping.tp_rank}.bin","") weights_dict_local[new_key] = v elif "layers" in k: # PP layer_num = int(k.split(".")[2]) if layer_num in layers_range: new_key = k.replace(f"{layer_num}", f"{layer_num-layers_range[0]}") weights_dict_local[new_key] = v else:
weights_dict_local[k] = v From b80388bf6f6fd2bf786165b3f206bdd9c3664da5 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Tue, 23 Apr 2024 17:11:00 -0700 Subject: [PATCH 15/35] PP layers fix --- nemo/export/trt_llm/nemo_utils.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 91cd6272fbcc..c16f9b5894e2 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -430,17 +430,15 @@ def nemo_to_trtllm_config( weights_dict_local = {} for k, v in weights_dict.items(): - if k.endswith(".bin"): # TP split - if k.endswith(f"{mapping.tp_rank}.bin"): - new_key = k.replace(f".{mapping.tp_rank}.bin","") - weights_dict_local[new_key] = v - elif "layers" in k: # PP - layer_num = int(k.split(".")[2]) + new_key = k + if new_key.endswith(".bin"): # TP split + if new_key.endswith(f"{mapping.tp_rank}.bin"): + new_key = new_key.replace(f".{mapping.tp_rank}.bin","") + if "layers" in new_key: # PP + layer_num = int(new_key.split(".")[2]) if layer_num in layers_range: - new_key = k.replace(f"{layer_num}", f"{layer_num-layers_range[0]}") - weights_dict_local[new_key] = v - else: - weights_dict_local[k] = v + new_key = new_key.replace(f"{layer_num}", f"{layer_num-layers_range[0]}") + weights_dict_local[new_key] = v embedding_weight = np.ascontiguousarray( split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank) From fab487b821ce2998d9c0956c46a2a509474606c9 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Tue, 23 Apr 2024 17:20:29 -0700 Subject: [PATCH 16/35] Fix no prompt task ids --- nemo/export/trt_llm/tensorrt_llm_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 3e033cc85cb2..4f1f41fb4841 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -276,7 +276,7 @@ def _forward( # return_dict=True, # ) - prompt_tasks = ",".join(str(task) for task in task_ids) + prompt_tasks = None if task_ids is None else ",".join(str(task) for task in task_ids) outputs = decoder.generate( input_tensors, From 8f0f36d5ad710a403444991361f13f0d7b0c0ca2 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Tue, 23 Apr 2024 17:27:33 -0700 Subject: [PATCH 17/35] Add bos for Gemma --- nemo/export/trt_llm/tensorrt_llm_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 4f1f41fb4841..b020440baf5b 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -336,7 +336,7 @@ def load( max_batch_size = config["build_config"]["max_batch_size"] max_input_len = config["build_config"]["max_input_len"] - #add_bos = config["builder_config"]["add_bos"] + add_bos = True if config["pretrained_config"]["architecture"] == "GemmaForCausalLM" else False return TensorrtLLMHostContext( executor=executor, @@ -344,7 +344,7 @@ def load( tokenizer=tokenizer, max_batch_size=max_batch_size, max_input_len=max_input_len, - add_bos=False, + add_bos=add_bos, ) From 5d3503e1a662300c544ba290aaaf24a88021cfaf Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Tue, 23 Apr 2024 18:00:50 -0700 Subject: [PATCH 18/35] Add multi block mode --- nemo/export/tensorrt_llm.py | 3 ++- nemo/export/trt_llm/tensorrt_llm_build.py | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git 
a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 2785bc567efd..4fb1e026f60c 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -213,7 +213,8 @@ def export( use_lora_plugin=use_lora_plugin, max_lora_rank=max_lora_rank, lora_target_modules=lora_target_modules, - max_prompt_embedding_table_size=max_prompt_embedding_table_size + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + enable_multi_block_mode=enable_multi_block_mode ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index b568ad5fc766..fcce0541ce71 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -366,7 +366,8 @@ def build_and_save_engine( use_lora_plugin=None, max_lora_rank=64, lora_target_modules=None, - max_prompt_embedding_table_size=0 + max_prompt_embedding_table_size=0, + enable_multi_block_mode: bool = False ): try: model_cls = getattr(tensorrt_llm.models, model_config.architecture) @@ -378,10 +379,11 @@ def build_and_save_engine( plugin_config = PluginConfig() plugin_config.set_gpt_attention_plugin(dtype=str_dtype) plugin_config.set_gemm_plugin(dtype=str_dtype) + plugin_config.set_plugin("multi_block_mode", enable_multi_block_mode) max_num_tokens = max_batch_size*max_input_len - - + + build_dict = { 'max_input_len': max_input_len, 'max_output_len': max_output_len, @@ -405,7 +407,7 @@ def build_and_save_engine( max_lora_rank=max_lora_rank, lora_target_modules=lora_target_modules) build_config.lora_config = lora_config - + model = model_cls.from_config(model_config) model = optimize_model( model, From a8f54b0b7f4c2d713874e71db9650a7bec423eab Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Tue, 23 Apr 2024 19:03:56 -0700 Subject: [PATCH 19/35] Embedding and layernorm for PP --- nemo/export/trt_llm/nemo_utils.py | 40 +++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index c16f9b5894e2..93c977baafb1 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -419,6 +419,15 @@ def nemo_to_trtllm_config( model_configs = [] weights_dicts = [] num_layers = nemo_model_config.get('num_layers') + + pp_key = { + "transformer.vocab_embedding.weight", + "transformer.position_embedding.weight", + "lm_head.weight", + "transformer.ln_f.weight", + "transformer.ln_f.bias" + } + for i in range(world_size): mapping = tensorrt_llm.Mapping( world_size=world_size, @@ -430,6 +439,8 @@ def nemo_to_trtllm_config( weights_dict_local = {} for k, v in weights_dict.items(): + if k in pp_key: + continue new_key = k if new_key.endswith(".bin"): # TP split if new_key.endswith(f"{mapping.tp_rank}.bin"): @@ -440,15 +451,30 @@ def nemo_to_trtllm_config( new_key = new_key.replace(f"{layer_num}", f"{layer_num-layers_range[0]}") weights_dict_local[new_key] = v - embedding_weight = np.ascontiguousarray( - split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank) - ) if use_parallel_embedding else weights_dict["transformer.vocab_embedding.weight"] + if mapping.is_first_pp_rank(): + embedding_weight = np.ascontiguousarray( + split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank) + ) if use_parallel_embedding else weights_dict["transformer.vocab_embedding.weight"] - weights_dict_local["transformer.vocab_embedding.weight"] = 
embedding_weight + weights_dict_local["transformer.vocab_embedding.weight"] = embedding_weight - weights_dict_local["lm_head.weight"] = np.ascontiguousarray( - split(lm_head_weight, mapping.tp_size, mapping.tp_rank) - ) + pos_embedding_weight = weights_dict.get("transformer.position_embedding.weight") + if pos_embedding_weight is not None: + if use_parallel_embedding: + pos_embedding_weight = np.ascontiguousarray( + split(pos_embedding_weight, mapping.tp_size, mapping.tp_rank) + ) + weights_dict_local["transformer.position_embedding.weight"] = pos_embedding_weight + + if mapping.is_last_pp_rank(): + weights_dict_local["lm_head.weight"] = np.ascontiguousarray( + split(lm_head_weight, mapping.tp_size, mapping.tp_rank) + ) + weights_dict_local["transformer.ln_f.weight"] = weights_dict["transformer.ln_f.weight"] + + ln_f_bias = weights_dict.get("transformer.ln_f.bias") + if ln_f_bias is not None: + weights_dict_local["transformer.ln_f.bias"] = ln_f_bias model_config = PretrainedConfig(**config) model_config.mapping = mapping From bdf7cfcee3b3e57d6895029f9f79c15378fb548b Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Wed, 24 Apr 2024 17:09:28 -0700 Subject: [PATCH 20/35] MPI multiprocess support for multinode --- nemo/export/tensorrt_llm.py | 82 +++++++++++++------------ nemo/export/trt_llm/tensorrt_llm_run.py | 5 +- 2 files changed, 47 insertions(+), 40 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 4fb1e026f60c..c1901a634ad7 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -186,48 +186,52 @@ def export( self.model = None - tmp_dir = tempfile.TemporaryDirectory() - nemo_export_dir = Path(tmp_dir.name) - - weights_dicts, model_configs, self.tokenizer = nemo_to_trtllm_config( - in_file=nemo_checkpoint_path, - decoder_type=model_type, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - pipeline_parallel_size=pipeline_parallel_size, - use_parallel_embedding=use_parallel_embedding, - nemo_export_dir=nemo_export_dir, - save_nemo_model_config=save_nemo_model_config, - ) - - for weight_dict, model_config in zip(weights_dicts, model_configs): - build_and_save_engine( - max_input_len=max_input_token, - max_output_len=max_output_token, - max_batch_size=max_batch_size, - model_config=model_config, - model_weights=weight_dict, - model_dir=self.model_dir, - model_type=model_type, - lora_ckpt_list=self.lora_ckpt_list, - use_lora_plugin=use_lora_plugin, - max_lora_rank=max_lora_rank, - lora_target_modules=lora_target_modules, - max_prompt_embedding_table_size=max_prompt_embedding_table_size, - enable_multi_block_mode=enable_multi_block_mode + if tensorrt_llm.mpi_rank() == 0: + tmp_dir = tempfile.TemporaryDirectory() + nemo_export_dir = Path(tmp_dir.name) + + weights_dicts, model_configs, self.tokenizer = nemo_to_trtllm_config( + in_file=nemo_checkpoint_path, + decoder_type=model_type, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + pipeline_parallel_size=pipeline_parallel_size, + use_parallel_embedding=use_parallel_embedding, + nemo_export_dir=nemo_export_dir, + save_nemo_model_config=save_nemo_model_config, ) - tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") - if os.path.exists(tokenizer_path): - shutil.copy(tokenizer_path, self.model_dir) - else: - self.tokenizer.save_pretrained(os.path.join(self.model_dir, 'huggingface_tokenizer')) + for weight_dict, model_config in zip(weights_dicts, model_configs): + build_and_save_engine( + max_input_len=max_input_token, + max_output_len=max_output_token, + 
max_batch_size=max_batch_size, + model_config=model_config, + model_weights=weight_dict, + model_dir=self.model_dir, + model_type=model_type, + lora_ckpt_list=self.lora_ckpt_list, + use_lora_plugin=use_lora_plugin, + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + enable_multi_block_mode=enable_multi_block_mode + ) + + tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") + if os.path.exists(tokenizer_path): + shutil.copy(tokenizer_path, self.model_dir) + else: + self.tokenizer.save_pretrained(os.path.join(self.model_dir, 'huggingface_tokenizer')) + + nemo_model_config = os.path.join(nemo_export_dir, "model_config.yaml") + if os.path.exists(nemo_model_config): + shutil.copy(nemo_model_config, self.model_dir) - nemo_model_config = os.path.join(nemo_export_dir, "model_config.yaml") - if os.path.exists(nemo_model_config): - shutil.copy(nemo_model_config, self.model_dir) + tmp_dir.cleanup() - tmp_dir.cleanup() + if tensorrt_llm.mpi_world_size() > 1: + tensorrt_llm.mpi_barrier() if load_model: self._load() @@ -372,7 +376,7 @@ def forward( ), "Task: {0} doesn't exist in the task list.".format(task_ids[i]) input_task_ids.append(self.task_ids[task_ids[i]]) if not streaming: - if torch.distributed.is_initialized(): + if torch.distributed.is_initialized() or tensorrt_llm.mpi_world_size() > 1: multiprocessed_env = True else: multiprocessed_env = False diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index b020440baf5b..ef0bff84c770 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -138,7 +138,6 @@ def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_b max_input_len = config["build_config"]["max_input_len"] runtime_rank = tensorrt_llm.mpi_rank() - assert runtime_rank < torch.cuda.device_count(), f"Rank {runtime_rank} out of bound" decoder = ModelRunner.from_dir( engine_dir=engine_dir, @@ -325,6 +324,10 @@ def load( if world_size == 1: _load(tokenizer, engine_dir, lora_ckpt_list, num_beams) executor = None + elif tensorrt_llm.mpi_world_size() > 1: + _load(tokenizer, engine_dir, lora_ckpt_list, num_beams) + executor = None + tensorrt_llm.mpi_barrier() else: executor = MPIPoolExecutor(max_workers=world_size) futures = [] From 599520f710a7bdc62d43e7910d90ddb05445c830 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Wed, 24 Apr 2024 17:49:08 -0700 Subject: [PATCH 21/35] Only output text on first rank --- nemo/export/trt_llm/tensorrt_llm_run.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index ef0bff84c770..da490c2ae0a3 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -592,6 +592,8 @@ def generate( **sampling_kwargs, ) assert outputs is not None + if tensorrt_llm.mpi_rank() != 0: + return None output_ids = outputs['output_ids'] sequence_lengths = outputs['sequence_lengths'] From 7821ff90d6c37d7942618be2edd1b9f848a23def Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Wed, 24 Apr 2024 18:14:33 -0700 Subject: [PATCH 22/35] Change to ModelRunnerCpp --- nemo/export/trt_llm/tensorrt_llm_run.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index da490c2ae0a3..627c8e0cc31a 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ 
b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -27,7 +27,7 @@ from tensorrt_llm.lora_manager import LoraManager from tensorrt_llm.quantization import QuantMode from tensorrt_llm.runtime import ModelConfig, SamplingConfig -from tensorrt_llm.runtime import ModelRunner +from tensorrt_llm.runtime import ModelRunnerCpp from transformers import PreTrainedTokenizer from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group @@ -56,7 +56,7 @@ class TensorrtLLMHostContext: class TensorrtLLMWorkerContext: """The MPI worker side context for TRT LLM inference.""" - decoder: ModelRunner = None + decoder: ModelRunnerCpp = None sampling_config: SamplingConfig = None lora_manager: LoraManager = None max_batch_size: int = 0 @@ -136,14 +136,20 @@ def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_b max_batch_size = config["build_config"]["max_batch_size"] max_input_len = config["build_config"]["max_input_len"] + max_output_len = config["build_config"]["max_output_len"] + max_beam_width = config["build_config"]["max_beam_width"] runtime_rank = tensorrt_llm.mpi_rank() - decoder = ModelRunner.from_dir( + decoder = ModelRunnerCpp.from_dir( engine_dir=engine_dir, lora_dir=lora_ckpt_list, lora_ckpt_source="nemo", rank=runtime_rank, + max_batch_size=max_batch_size, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_beam_width=max_beam_width, debug_mode=False ) From 3ecd9a7ab822fb012e4bf7819ac91c71b606ad72 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Thu, 25 Apr 2024 10:11:44 -0700 Subject: [PATCH 23/35] Add falcon --- nemo/export/trt_llm/decoder/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/export/trt_llm/decoder/__init__.py b/nemo/export/trt_llm/decoder/__init__.py index 84e1ecf396c7..f473e3819f08 100644 --- a/nemo/export/trt_llm/decoder/__init__.py +++ b/nemo/export/trt_llm/decoder/__init__.py @@ -45,6 +45,7 @@ DECODER_GPTNEXT: 'GPTForCausalLM', DECODER_LLAMA: 'LLaMAForCausalLM', DECODER_GEMMA: 'GemmaForCausalLM', + DECODER_FALCON: 'FalconForCausalLM' } From 0ce5ae5e42ceea947457cbebcd968cca8149b594 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Thu, 25 Apr 2024 10:47:49 -0700 Subject: [PATCH 24/35] Add rotary_pct default value --- nemo/export/trt_llm/nemo_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 93c977baafb1..be99cd255ae6 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -409,7 +409,7 @@ def nemo_to_trtllm_config( 'apply_query_key_layer_scaling': False, 'rotary_pct': - nemo_model_config.get('rotary_percentage'), + nemo_model_config.get('rotary_percentage', 1.0), 'logits_dtype': 'float32', 'world_size': world_size, 'tp_size': tensor_parallel_size, From 4d576ef353a9c64f00543d136e04e4b52d3679e0 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Mon, 29 Apr 2024 13:39:54 -0700 Subject: [PATCH 25/35] Falcon fix --- nemo/export/trt_llm/nemo/convert.py | 5 +++++ nemo/export/trt_llm/nemo_utils.py | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/nemo/export/trt_llm/nemo/convert.py b/nemo/export/trt_llm/nemo/convert.py index 946ad88fe5d1..ff3239678fa6 100644 --- a/nemo/export/trt_llm/nemo/convert.py +++ b/nemo/export/trt_llm/nemo/convert.py @@ -231,6 +231,11 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t key = f'{layer_prefix}.input_layernorm.weight' else: key = f'{layer_prefix}.input_layernorm.bias' + elif "pre_mlp_layernorm" in key: + if 
key.endswith('weight'): + key = f'{layer_prefix}.mlp_layernorm.weight' + else: + key = f'{layer_prefix}.mlp_layernorm.bias' if tp_rank == 0: save_val(vals[0], saved_dir, key) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index be99cd255ae6..c2ec2b6b2814 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -420,6 +420,10 @@ def nemo_to_trtllm_config( weights_dicts = [] num_layers = nemo_model_config.get('num_layers') + if decoder_type == "falcon": + config["new_decoder_architecture"] = False if num_layers == 32 else True + config["parallel_attention"] = True + pp_key = { "transformer.vocab_embedding.weight", "transformer.position_embedding.weight", From aa28fc95fba018946697562497ef6b54e06cecc9 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Tue, 30 Apr 2024 15:54:23 -0700 Subject: [PATCH 26/35] Add MOE config --- nemo/export/trt_llm/nemo_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index c2ec2b6b2814..7824099fbfd8 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -410,6 +410,14 @@ def nemo_to_trtllm_config( False, 'rotary_pct': nemo_model_config.get('rotary_percentage', 1.0), + 'moe_num_experts': + nemo_model_config.get('num_moe_experts', 0), + 'moe_top_k': + nemo_model_config.get('moe_router_topk'), + 'moe_normalization_mode': + nemo_model_config.get('moe_renorm_mode'), + 'moe_tp_mode': + nemo_model_config.get('moe_tp_mode'), 'logits_dtype': 'float32', 'world_size': world_size, 'tp_size': tensor_parallel_size, From da84b22d9a0c07d2161197395fb9848748f87669 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Wed, 1 May 2024 10:56:33 -0700 Subject: [PATCH 27/35] Fix MOE weight dict --- nemo/export/trt_llm/nemo/convert.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/nemo/export/trt_llm/nemo/convert.py b/nemo/export/trt_llm/nemo/convert.py index ff3239678fa6..14fa7cebf59f 100644 --- a/nemo/export/trt_llm/nemo/convert.py +++ b/nemo/export/trt_llm/nemo/convert.py @@ -56,10 +56,11 @@ def save_split(split_vals, dir, key, i, split_factor): def save_expert_split(split_vals, dir, key, i, split_factor): for j, val in enumerate(split_vals): tp_num = i * split_factor + j - suffix = "bin" if tp_num is None else f"{tp_num}.bin" + suffix = "" if tp_num is None else f".{tp_num}.bin" global weights_dict - weights_dict[f"model.{key}.{suffix}"] = val + weights_dict[f"{key}{suffix}"] = val + # weights_dict[f"model.{key}.{suffix}"] = val def generate_int8(weights, act_range, is_qkv=False, multi_query_mode=False): @@ -233,9 +234,9 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t key = f'{layer_prefix}.input_layernorm.bias' elif "pre_mlp_layernorm" in key: if key.endswith('weight'): - key = f'{layer_prefix}.mlp_layernorm.weight' + key = f'{layer_prefix}.post_layernorm.weight' else: - key = f'{layer_prefix}.mlp_layernorm.bias' + key = f'{layer_prefix}.post_layernorm.bias' if tp_rank == 0: save_val(vals[0], saved_dir, key) @@ -385,8 +386,8 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t pass elif "mlp.router.weight" in key: val = np.concatenate(vals, axis=1) - split_vals = np.split(val, split_factor, axis=0) - save_split(split_vals, saved_dir, key, tp_rank, split_factor) + key = f'{layer_prefix}.mlp.router.weight' + save_val(val, saved_dir, key) elif "experts.linear_fc1.weight" in key: cat_dim = -1 val = 
np.concatenate(vals, axis=cat_dim) @@ -397,12 +398,14 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t split_w3s = np.split(w3, split_factor, axis=1) split_vals = [np.concatenate(item, axis=1) for item in zip(split_w3s, split_w1s)] + key = f'{layer_prefix}.mlp.experts_weight_1' save_expert_split(split_vals, saved_dir, key, tp_rank, split_factor) elif "experts.linear_fc2.weight" in key: cat_dim = -1 val = np.concatenate(vals, axis=cat_dim) split_vals = np.split(val, split_factor, axis=cat_dim) + key = f'{layer_prefix}.mlp.experts_weight_2' save_expert_split(split_vals, saved_dir, key, tp_rank, split_factor) else: print(f"[WARNING] {key} not handled by converter") From 30e6ecebdfa85d8b574e8bf14e4abae9fcfb4dc2 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Thu, 2 May 2024 10:35:28 -0700 Subject: [PATCH 28/35] Clean code --- nemo/export/trt_llm/nemo/convert.py | 2 - nemo/export/trt_llm/tensorrt_llm_run.py | 71 ------------------------- 2 files changed, 73 deletions(-) diff --git a/nemo/export/trt_llm/nemo/convert.py b/nemo/export/trt_llm/nemo/convert.py index 14fa7cebf59f..7848305d48a0 100644 --- a/nemo/export/trt_llm/nemo/convert.py +++ b/nemo/export/trt_llm/nemo/convert.py @@ -45,7 +45,6 @@ def save_val(val, dir, key, tp_num=None): val = np.ascontiguousarray(np.transpose(val.reshape(val.shape[0], -1), [1, 0])) global weights_dict weights_dict[f"{key}{suffix}"] = val - # weights_dict[f"model.{key}.{suffix}"] = val def save_split(split_vals, dir, key, i, split_factor): @@ -60,7 +59,6 @@ def save_expert_split(split_vals, dir, key, i, split_factor): global weights_dict weights_dict[f"{key}{suffix}"] = val - # weights_dict[f"model.{key}.{suffix}"] = val def generate_int8(weights, act_range, is_qkv=False, multi_query_mode=False): diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 627c8e0cc31a..34e18b35559d 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -153,37 +153,14 @@ def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_b debug_mode=False ) - # runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank, tp_size=tp_size, pp_size=pp_size) - - # torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - # engine_name = get_engine_name(MODEL_NAME, dtype, tp_size, pp_size, runtime_rank) - # serialize_path = os.path.join(engine_dir, engine_name) - # logger.info(f"Reading from serialize path {serialize_path}") - - # with open(serialize_path, "rb") as f: - # engine_buffer = f.read() - # decoder = tensorrt_llm.runtime.GenerationSession( - # model_config, engine_buffer, runtime_mapping, debug_mode=False - # ) - sampling_config = SamplingConfig( end_id=tokenizer.eos_token_id, pad_id=tokenizer.eos_token_id, num_beams=num_beams ) - # if decoder.use_lora_plugin: - # lora_manager = LoraManager() - # if lora_ckpt_list is not None: - # lora_manager.load_from_nemo( - # model_files=lora_ckpt_list, model_config=model_config, runtime_mapping=runtime_mapping, - # ) - # else: - # lora_manager = None - # Initialize the global context so it can be used during `run` API. 
global tensorrt_llm_worker_context tensorrt_llm_worker_context.decoder = decoder tensorrt_llm_worker_context.sampling_config = sampling_config - # tensorrt_llm_worker_context.lora_manager = lora_manager tensorrt_llm_worker_context.max_batch_size = max_batch_size tensorrt_llm_worker_context.max_input_len = max_input_len @@ -232,55 +209,7 @@ def _forward( end_id = sampling_config.end_id num_beams = sampling_config.num_beams - # if decoder.remove_input_padding: - # line_encoded = torch.concat(input_tensors).cuda() - # else: - # line_encoded = torch.nested.to_padded_tensor( - # torch.nested.nested_tensor(input_tensors, dtype=torch.int32), pad_id - # ).cuda() - - # input_lengths = torch.tensor(input_lengths, dtype=torch.int32).cuda() - - # if prompt_table is None: - # ptuning_args = [] - # else: - # if task_vocab_size is None: - # raise Exception("task_vocab_size cannot be None") - - # task_vocab_size = torch.tensor([task_vocab_size], dtype=torch.int32, device="cuda") - # task_ids = torch.tensor(task_ids, dtype=torch.int32, device="cuda") - # prompt_table = prompt_table.cuda() - # ptuning_args = [prompt_table, task_ids, task_vocab_size] - with torch.no_grad(): - # sampling_config.top_k = top_k - # sampling_config.top_p = top_p - # sampling_config.temperature = temperature - # for key, param in sampling_kwargs.items(): - # # set any additional SamplingConfig kwargs - # setattr(sampling_config, key, param) - - # decoder.setup( - # batch_size, - # max_context_length=max_length, - # max_new_tokens=max_output_len, - # lora_manager=lora_manager, - # lora_uids=lora_uids, - # ) - - # outputs = decoder.decode( - # line_encoded, - # input_lengths, - # sampling_config, - # *ptuning_args, - # stop_words_list=stop_words_list, - # bad_words_list=bad_words_list, - # no_repeat_ngram_size=no_repeat_ngram_size, - # streaming=streaming, - # output_sequence_lengths=True, - # return_dict=True, - # ) - prompt_tasks = None if task_ids is None else ",".join(str(task) for task in task_ids) outputs = decoder.generate( From 479d871d4b71096bee0df241037fefbb88c150ea Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Thu, 2 May 2024 10:47:02 -0700 Subject: [PATCH 29/35] Add rotary_base --- nemo/export/trt_llm/nemo_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 7824099fbfd8..b70efa81fe64 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -410,6 +410,8 @@ def nemo_to_trtllm_config( False, 'rotary_pct': nemo_model_config.get('rotary_percentage', 1.0), + 'rotary_base': + nemo_model_config.get('rotary_base', 10000), 'moe_num_experts': nemo_model_config.get('num_moe_experts', 0), 'moe_top_k': From 05b4cbc492589065938710ac007410a4a8a81c86 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Thu, 2 May 2024 10:48:29 -0700 Subject: [PATCH 30/35] Fix MOE config --- nemo/export/trt_llm/nemo_utils.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index b70efa81fe64..3aaf7139775b 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -33,6 +33,7 @@ from tensorrt_llm.functional import non_gated_version from transformers import AutoTokenizer, LlamaConfig, PreTrainedTokenizer from tensorrt_llm.models.modeling_utils import PretrainedConfig +from tensorrt_llm.layers import MoeConfig from nemo.export.trt_llm.model_config import ( LAYERNORM_DEFAULT, @@ -369,6 +370,10 @@ def 
nemo_to_trtllm_config( lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0 ) + hidden_act = nemo_model_config.get('activation') + hidden_act = hidden_act.split("-")[-1] if nemo_model_config.get('num_moe_experts', 0) \ + else non_gated_version(hidden_act) + config = { 'architecture': DECODER_MODEL_TYPE[decoder_type], @@ -393,7 +398,7 @@ def nemo_to_trtllm_config( 'max_position_embeddings': nemo_model_config.get('max_position_embeddings'), 'hidden_act': - non_gated_version(nemo_model_config.get('activation')), + hidden_act, 'use_parallel_embedding': use_parallel_embedding, 'embedding_sharding_dim': @@ -417,9 +422,9 @@ def nemo_to_trtllm_config( 'moe_top_k': nemo_model_config.get('moe_router_topk'), 'moe_normalization_mode': - nemo_model_config.get('moe_renorm_mode'), + nemo_model_config.get('moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE), 'moe_tp_mode': - nemo_model_config.get('moe_tp_mode'), + nemo_model_config.get('moe_tp_mode', MoeConfig.ParallelismMode.TENSOR_PARALLEL), 'logits_dtype': 'float32', 'world_size': world_size, 'tp_size': tensor_parallel_size, From d2ff752902baca142fbc5d8c73d93d0a62858634 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Thu, 2 May 2024 19:06:40 -0700 Subject: [PATCH 31/35] Fix falcon new architecture --- nemo/export/trt_llm/nemo_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 3aaf7139775b..868ee927bcde 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -467,7 +467,9 @@ def nemo_to_trtllm_config( if "layers" in new_key: # PP layer_num = int(new_key.split(".")[2]) if layer_num in layers_range: - new_key = new_key.replace(f"{layer_num}", f"{layer_num-layers_range[0]}") + new_key = new_key.replace(f"layers.{layer_num}", f"layers.{layer_num-layers_range[0]}") + if config.get("new_decoder_architecture", False) and "post_layernorm" in new_key: + new_key = new_key.replace("post_layernorm", "mlp_layernorm") weights_dict_local[new_key] = v if mapping.is_first_pp_rank(): From ad5c2faa9afdf79545382906830d677a7de1dbde Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 17:14:30 +0000 Subject: [PATCH 32/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/export/tensorrt_llm.py | 4 +- nemo/export/trt_llm/decoder/__init__.py | 2 +- nemo/export/trt_llm/nemo/convert.py | 6 +- nemo/export/trt_llm/nemo/nemo_ckpt_convert.py | 8 +- nemo/export/trt_llm/nemo_utils.py | 121 ++++++++---------- nemo/export/trt_llm/tensorrt_llm_build.py | 22 ++-- nemo/export/trt_llm/tensorrt_llm_run.py | 7 +- 7 files changed, 72 insertions(+), 98 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 964a7cd57e11..bad1cd2329dc 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -33,9 +33,9 @@ from nemo.export.trt_llm.nemo_utils import get_tokenzier, nemo_llm_model_to_model_config, nemo_to_trtllm_config from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm from nemo.export.trt_llm.qnemo.tokenizer_utils import get_nmt_tokenizer +from nemo.export.trt_llm.tensorrt_llm_build import build_and_save_engine from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load, load_refit from nemo.export.trt_llm.utils import is_nemo_file -from nemo.export.trt_llm.tensorrt_llm_build import build_and_save_engine 
use_deploy = True try: @@ -237,7 +237,7 @@ def export( max_lora_rank=max_lora_rank, lora_target_modules=lora_target_modules, max_prompt_embedding_table_size=max_prompt_embedding_table_size, - enable_multi_block_mode=enable_multi_block_mode + enable_multi_block_mode=enable_multi_block_mode, ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") diff --git a/nemo/export/trt_llm/decoder/__init__.py b/nemo/export/trt_llm/decoder/__init__.py index f473e3819f08..b5e22b5e513e 100644 --- a/nemo/export/trt_llm/decoder/__init__.py +++ b/nemo/export/trt_llm/decoder/__init__.py @@ -45,7 +45,7 @@ DECODER_GPTNEXT: 'GPTForCausalLM', DECODER_LLAMA: 'LLaMAForCausalLM', DECODER_GEMMA: 'GemmaForCausalLM', - DECODER_FALCON: 'FalconForCausalLM' + DECODER_FALCON: 'FalconForCausalLM', } diff --git a/nemo/export/trt_llm/nemo/convert.py b/nemo/export/trt_llm/nemo/convert.py index 7848305d48a0..ba6dc44beac8 100644 --- a/nemo/export/trt_llm/nemo/convert.py +++ b/nemo/export/trt_llm/nemo/convert.py @@ -320,8 +320,10 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t v_split = np.split(qkv[2], split_factor, axis=0) # Concatenate Q, K, and V together - split_vals = [np.concatenate([q_split[i].reshape(-1), k_split[i].reshape(-1), v_split[i].reshape(-1)], axis=0) - for i in range(split_factor)] + split_vals = [ + np.concatenate([q_split[i].reshape(-1), k_split[i].reshape(-1), v_split[i].reshape(-1)], axis=0) + for i in range(split_factor) + ] key = f'{layer_prefix}.attention.qkv.bias' save_split(split_vals, saved_dir, key, tp_rank, split_factor) diff --git a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py index eca77b1137ed..44133de381bd 100644 --- a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py +++ b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py @@ -27,7 +27,7 @@ import tensorstore # This is important even though not used. Otherwise zarr raises error. 
import torch import zarr -from tensorrt_llm._utils import np_bfloat16, str_dtype_to_torch, torch_to_numpy, pad_vocab_size +from tensorrt_llm._utils import np_bfloat16, pad_vocab_size, str_dtype_to_torch, torch_to_numpy from tqdm import tqdm from transformers import AutoTokenizer, GPT2Tokenizer, LlamaConfig @@ -216,11 +216,7 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): if vocab_size % inference_tp_size != 0: vocab_size_padded = pad_vocab_size(vocab_size, inference_tp_size) pad_width = vocab_size_padded - vocab_size - val = torch.nn.functional.pad( - val, - (0, 0, 0, pad_width), - value=0 - ) + val = torch.nn.functional.pad(val, (0, 0, 0, pad_width), value=0) val = torch_to_numpy(val.to(storage_type).cpu()) model_level_weights["transformer.vocab_embedding.weight"].append(val) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 38f2f1f40d6c..0dcf7d0e29fe 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -17,12 +17,12 @@ import copy import csv import datetime +import json import logging import os import shutil import sys import tempfile -import json from pathlib import Path from typing import Dict, List, Tuple, Union @@ -31,11 +31,12 @@ from tensorrt_llm import str_dtype_to_trt from tensorrt_llm._utils import pad_vocab_size from tensorrt_llm.functional import non_gated_version -from transformers import AutoTokenizer, LlamaConfig, PreTrainedTokenizer -from tensorrt_llm.models.modeling_utils import PretrainedConfig from tensorrt_llm.layers import MoeConfig +from tensorrt_llm.models.modeling_utils import PretrainedConfig +from transformers import AutoTokenizer, LlamaConfig, PreTrainedTokenizer from nemo.export.tarutils import TarPath +from nemo.export.trt_llm.decoder import DECODER_MODEL_TYPE from nemo.export.trt_llm.model_config import ( LAYERNORM_DEFAULT, LAYERNORM_RMS, @@ -49,7 +50,6 @@ from nemo.export.trt_llm.nemo.nemo import UnpackedNemoCheckpointDir from nemo.export.trt_llm.nemo.nemo_ckpt_convert import build_tokenizer, convert_dist_checkpoint, convert_nemo_model from nemo.export.trt_llm.tensor_utils import get_tensor_from_dict, get_tensor_parallel_group, split -from nemo.export.trt_llm.decoder import DECODER_MODEL_TYPE LOGGER = logging.getLogger("NeMo") @@ -329,6 +329,7 @@ def nemo_llm_model_to_model_config( return [model_config] + def nemo_to_trtllm_config( in_file: str, decoder_type: str, @@ -354,7 +355,7 @@ def nemo_to_trtllm_config( save_nemo_model_config=save_nemo_model_config, ) - world_size = tensor_parallel_size*pipeline_parallel_size + world_size = tensor_parallel_size * pipeline_parallel_size lm_head_weight = weights_dict["lm_head.weight"] @@ -363,65 +364,42 @@ def nemo_to_trtllm_config( if vocab_size_padded != vocab_size: pad_width = vocab_size_padded - vocab_size - lm_head_weight = np.pad( - lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0 - ) + lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0) hidden_act = nemo_model_config.get('activation') - hidden_act = hidden_act.split("-")[-1] if nemo_model_config.get('num_moe_experts', 0) \ - else non_gated_version(hidden_act) + hidden_act = ( + hidden_act.split("-")[-1] if nemo_model_config.get('num_moe_experts', 0) else non_gated_version(hidden_act) + ) config = { - 'architecture': - DECODER_MODEL_TYPE[decoder_type], - 'dtype': - dtype_str, - 'num_hidden_layers': - nemo_model_config.get('num_layers'), - 'num_attention_heads': - 
nemo_model_config.get('num_attention_heads'), - 'num_key_value_heads': - nemo_model_config.get('num_query_groups', nemo_model_config['num_attention_heads']), - 'hidden_size': - nemo_model_config.get('hidden_size'), - 'intermediate_size': - nemo_model_config.get('ffn_hidden_size'), - 'norm_epsilon': - nemo_model_config.get('layernorm_epsilon'), - 'vocab_size': - vocab_size_padded, - 'position_embedding_type': - "rope_gpt_neox" if nemo_model_config.get('position_embedding_type') == "rope" else "learned_absolute", - 'max_position_embeddings': - nemo_model_config.get('max_position_embeddings'), - 'hidden_act': - hidden_act, - 'use_parallel_embedding': - use_parallel_embedding, - 'embedding_sharding_dim': - 0, - 'share_embedding_table': - False, - 'quantization': { - 'quant_algo': None, - 'kv_cache_quant_algo': None, - }, - 'bias': - nemo_model_config.get('bias'), - 'apply_query_key_layer_scaling': - False, - 'rotary_pct': - nemo_model_config.get('rotary_percentage', 1.0), - 'rotary_base': - nemo_model_config.get('rotary_base', 10000), - 'moe_num_experts': - nemo_model_config.get('num_moe_experts', 0), - 'moe_top_k': - nemo_model_config.get('moe_router_topk'), - 'moe_normalization_mode': - nemo_model_config.get('moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE), - 'moe_tp_mode': - nemo_model_config.get('moe_tp_mode', MoeConfig.ParallelismMode.TENSOR_PARALLEL), + 'architecture': DECODER_MODEL_TYPE[decoder_type], + 'dtype': dtype_str, + 'num_hidden_layers': nemo_model_config.get('num_layers'), + 'num_attention_heads': nemo_model_config.get('num_attention_heads'), + 'num_key_value_heads': nemo_model_config.get('num_query_groups', nemo_model_config['num_attention_heads']), + 'hidden_size': nemo_model_config.get('hidden_size'), + 'intermediate_size': nemo_model_config.get('ffn_hidden_size'), + 'norm_epsilon': nemo_model_config.get('layernorm_epsilon'), + 'vocab_size': vocab_size_padded, + 'position_embedding_type': "rope_gpt_neox" + if nemo_model_config.get('position_embedding_type') == "rope" + else "learned_absolute", + 'max_position_embeddings': nemo_model_config.get('max_position_embeddings'), + 'hidden_act': hidden_act, + 'use_parallel_embedding': use_parallel_embedding, + 'embedding_sharding_dim': 0, + 'share_embedding_table': False, + 'quantization': {'quant_algo': None, 'kv_cache_quant_algo': None,}, + 'bias': nemo_model_config.get('bias'), + 'apply_query_key_layer_scaling': False, + 'rotary_pct': nemo_model_config.get('rotary_percentage', 1.0), + 'rotary_base': nemo_model_config.get('rotary_base', 10000), + 'moe_num_experts': nemo_model_config.get('num_moe_experts', 0), + 'moe_top_k': nemo_model_config.get('moe_router_topk'), + 'moe_normalization_mode': nemo_model_config.get( + 'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE + ), + 'moe_tp_mode': nemo_model_config.get('moe_tp_mode', MoeConfig.ParallelismMode.TENSOR_PARALLEL), 'logits_dtype': 'float32', 'world_size': world_size, 'tp_size': tensor_parallel_size, @@ -441,15 +419,12 @@ def nemo_to_trtllm_config( "transformer.position_embedding.weight", "lm_head.weight", "transformer.ln_f.weight", - "transformer.ln_f.bias" + "transformer.ln_f.bias", } for i in range(world_size): mapping = tensorrt_llm.Mapping( - world_size=world_size, - rank=i, - tp_size=tensor_parallel_size, - pp_size=pipeline_parallel_size + world_size=world_size, rank=i, tp_size=tensor_parallel_size, pp_size=pipeline_parallel_size ) layers_range = mapping.pp_layers(num_layers) @@ -458,10 +433,10 @@ def nemo_to_trtllm_config( if k in 
pp_key: continue new_key = k - if new_key.endswith(".bin"): # TP split + if new_key.endswith(".bin"): # TP split if new_key.endswith(f"{mapping.tp_rank}.bin"): - new_key = new_key.replace(f".{mapping.tp_rank}.bin","") - if "layers" in new_key: # PP + new_key = new_key.replace(f".{mapping.tp_rank}.bin", "") + if "layers" in new_key: # PP layer_num = int(new_key.split(".")[2]) if layer_num in layers_range: new_key = new_key.replace(f"layers.{layer_num}", f"layers.{layer_num-layers_range[0]}") @@ -470,9 +445,13 @@ def nemo_to_trtllm_config( weights_dict_local[new_key] = v if mapping.is_first_pp_rank(): - embedding_weight = np.ascontiguousarray( - split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank) - ) if use_parallel_embedding else weights_dict["transformer.vocab_embedding.weight"] + embedding_weight = ( + np.ascontiguousarray( + split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank) + ) + if use_parallel_embedding + else weights_dict["transformer.vocab_embedding.weight"] + ) weights_dict_local["transformer.vocab_embedding.weight"] = embedding_weight diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index fcce0541ce71..a05c392bee63 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -25,18 +25,16 @@ import torch from tensorrt_llm import str_dtype_to_trt from tensorrt_llm._utils import np_dtype_to_trt -from tensorrt_llm.builder import Builder, BuildConfig +from tensorrt_llm.builder import BuildConfig, Builder +from tensorrt_llm.commands.build import build as build_trtllm from tensorrt_llm.logger import logger -from tensorrt_llm.models.modeling_utils import add_lora +from tensorrt_llm.lora_manager import LoraBuildConfig +from tensorrt_llm.models.modeling_utils import add_lora, optimize_model, preprocess_weights from tensorrt_llm.network import net_guard +from tensorrt_llm.plugin import PluginConfig from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.commands.build import build as build_trtllm -from tensorrt_llm.plugin import PluginConfig -from tensorrt_llm.lora_manager import LoraBuildConfig -from tensorrt_llm.models.modeling_utils import optimize_model, preprocess_weights - MODEL_NAME = "NeMo" LOGGER = logging.getLogger("NeMo") @@ -354,6 +352,7 @@ def build( t = time.strftime("%H:%M:%S", time.gmtime(tok - tik)) logger.info(f"Total time of building all {args.mapping.world_size} engines: {t}") + def build_and_save_engine( max_input_len=1024, max_output_len=1024, @@ -367,7 +366,7 @@ def build_and_save_engine( max_lora_rank=64, lora_target_modules=None, max_prompt_embedding_table_size=0, - enable_multi_block_mode: bool = False + enable_multi_block_mode: bool = False, ): try: model_cls = getattr(tensorrt_llm.models, model_config.architecture) @@ -380,9 +379,7 @@ def build_and_save_engine( plugin_config.set_gpt_attention_plugin(dtype=str_dtype) plugin_config.set_gemm_plugin(dtype=str_dtype) plugin_config.set_plugin("multi_block_mode", enable_multi_block_mode) - max_num_tokens = max_batch_size*max_input_len - - + max_num_tokens = max_batch_size * max_input_len build_dict = { 'max_input_len': max_input_len, @@ -405,7 +402,8 @@ def build_and_save_engine( lora_dir=lora_ckpt_list, lora_ckpt_source='nemo', max_lora_rank=max_lora_rank, - lora_target_modules=lora_target_modules) + lora_target_modules=lora_target_modules, + ) build_config.lora_config = 
lora_config model = model_cls.from_config(model_config) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 8bccf6f7ac6a..201032b86615 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -26,8 +26,7 @@ from tensorrt_llm.logger import logger from tensorrt_llm.lora_manager import LoraManager from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import ModelConfig, SamplingConfig -from tensorrt_llm.runtime import ModelRunnerCpp +from tensorrt_llm.runtime import ModelConfig, ModelRunnerCpp, SamplingConfig from transformers import PreTrainedTokenizer from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group @@ -136,7 +135,7 @@ def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_b engine_dir = Path(engine_dir) config_path = engine_dir / "config.json" - #model_config, world_size, tp_size, pp_size, dtype, max_input_len, max_batch_size = _read_config(config_path) + # model_config, world_size, tp_size, pp_size, dtype, max_input_len, max_batch_size = _read_config(config_path) with open(config_path, "r") as f: config = json.load(f) @@ -157,7 +156,7 @@ def _load(tokenizer: PreTrainedTokenizer, engine_dir, lora_ckpt_list=None, num_b max_input_len=max_input_len, max_output_len=max_output_len, max_beam_width=max_beam_width, - debug_mode=False + debug_mode=False, ) sampling_config = SamplingConfig( From 170df0ec3c1d818f5d00c9f6dde5f9d1ae750442 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Fri, 3 May 2024 12:17:11 -0700 Subject: [PATCH 33/35] Fix Gemma 7B --- nemo/export/trt_llm/nemo_utils.py | 1 + scripts/export/export_to_trt_llm.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index 0dcf7d0e29fe..c9f51b33fce7 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -377,6 +377,7 @@ def nemo_to_trtllm_config( 'num_hidden_layers': nemo_model_config.get('num_layers'), 'num_attention_heads': nemo_model_config.get('num_attention_heads'), 'num_key_value_heads': nemo_model_config.get('num_query_groups', nemo_model_config['num_attention_heads']), + 'head_size': nemo_model_config.get('kv_channels'), 'hidden_size': nemo_model_config.get('hidden_size'), 'intermediate_size': nemo_model_config.get('ffn_hidden_size'), 'norm_epsilon': nemo_model_config.get('layernorm_epsilon'), diff --git a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py index 9798473dd880..c75d81ac2d06 100644 --- a/scripts/export/export_to_trt_llm.py +++ b/scripts/export/export_to_trt_llm.py @@ -78,7 +78,6 @@ def get_args(argv): '--use_lora_plugin', nargs='?', const=None, - default=False, choices=['float16', 'float32', 'bfloat16'], help="Activates the lora plugin which enables embedding sharing.", ) From b2413a182611af436670815520cebabfaef8aa71 Mon Sep 17 00:00:00 2001 From: Bobby Chen Date: Tue, 7 May 2024 15:43:21 -0700 Subject: [PATCH 34/35] Add rotary_scaling --- nemo/export/trt_llm/nemo/convert.py | 1 - nemo/export/trt_llm/nemo_utils.py | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/export/trt_llm/nemo/convert.py b/nemo/export/trt_llm/nemo/convert.py index ba6dc44beac8..7598b3f6825f 100644 --- a/nemo/export/trt_llm/nemo/convert.py +++ b/nemo/export/trt_llm/nemo/convert.py @@ -178,7 +178,6 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t tp_size = config.get("tp_size", 1) 
int8_outputs = config.get("int8_outputs", None) multi_query_mode = config.get("multi_query_mode", False) - local_dim = config.get("local_dim", None) num_kv_heads = config.get("num_kv_heads", num_attention_heads) size_per_head = config.get("kv_channels", None) diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index c9f51b33fce7..59f7d7f05a3e 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -17,7 +17,6 @@ import copy import csv import datetime -import json import logging import os import shutil @@ -410,10 +409,13 @@ def nemo_to_trtllm_config( model_configs = [] weights_dicts = [] num_layers = nemo_model_config.get('num_layers') + rotary_scaling = nemo_model_config.get("seq_len_interpolation_factor") if decoder_type == "falcon": config["new_decoder_architecture"] = False if num_layers == 32 else True config["parallel_attention"] = True + if rotary_scaling is not None: + config["rotary_scaling"] = {"type": "linear", "factor": float(rotary_scaling)} pp_key = { "transformer.vocab_embedding.weight", From d75d6015be083f3308edeecd320c14678396beda Mon Sep 17 00:00:00 2001 From: oyilmaz-nvidia Date: Mon, 13 May 2024 17:28:30 +0000 Subject: [PATCH 35/35] Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia --- nemo/export/tensorrt_llm.py | 13 +++++++++---- nemo/export/trt_llm/nemo_utils.py | 20 ++++++++++++++------ nemo/export/trt_llm/tensorrt_llm_build.py | 16 ++++++++++------ nemo/export/trt_llm/tensorrt_llm_run.py | 9 +++++++-- scripts/export/export_to_trt_llm.py | 11 ++++++++++- 5 files changed, 50 insertions(+), 19 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index bad1cd2329dc..af4f1b6699ee 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -286,7 +286,9 @@ def build( # Build or refit TRT-LLM engine from a nemo model. model_configs = nemo_llm_model_to_model_config( - nemo_model=nemo_model, decoder_type=model_type, nemo_model_config=nemo_model_config, + nemo_model=nemo_model, + decoder_type=model_type, + nemo_model_config=nemo_model_config, ) model_config_to_tensorrt_llm( @@ -305,7 +307,9 @@ def build( ) def refit( - self, nemo_model, nemo_model_config, + self, + nemo_model, + nemo_model_config, ): assert self.use_refit, "TRT-LLM model must be built() with refit=True" @@ -336,7 +340,6 @@ def forward( output_log_probs: bool = False, **sampling_kwargs, ): - """ Exports nemo checkpoints to TensorRT-LLM. 
@@ -672,7 +675,9 @@ def _get_prompt_embedding_table_ckpt(self, prompt_embeddings_checkpoint_path): return weights.cpu().detach() def _get_prompt_embedding_table( - self, prompt_embeddings_table=None, prompt_embeddings_checkpoint_path=None, + self, + prompt_embeddings_table=None, + prompt_embeddings_checkpoint_path=None, ): if prompt_embeddings_table is not None and prompt_embeddings_checkpoint_path is not None: LOGGER.warning( diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py index f1a54fce241b..d735cab36b00 100644 --- a/nemo/export/trt_llm/nemo_utils.py +++ b/nemo/export/trt_llm/nemo_utils.py @@ -201,7 +201,9 @@ def nemo_llm_to_model_config( def to_word_list_format( - word_dict: List[List[str]], tokenizer=None, ref_str="", + word_dict: List[List[str]], + tokenizer=None, + ref_str="", ): ''' format of word_dict @@ -257,7 +259,10 @@ def to_word_list_format( def nemo_llm_model_to_model_config( - nemo_model: str, decoder_type: str, nemo_model_config: str, dtype_str: str = "float32", + nemo_model: str, + decoder_type: str, + nemo_model_config: str, + dtype_str: str = "float32", ) -> Tuple[List[ModelConfig], PreTrainedTokenizer]: """Converts the NEMO model object and construct the `ModelConfig` before tensorrt_llm deployment.""" from megatron.core import parallel_state @@ -382,15 +387,18 @@ def nemo_to_trtllm_config( 'intermediate_size': nemo_model_config.get('ffn_hidden_size'), 'norm_epsilon': nemo_model_config.get('layernorm_epsilon'), 'vocab_size': vocab_size_padded, - 'position_embedding_type': "rope_gpt_neox" - if nemo_model_config.get('position_embedding_type') == "rope" - else "learned_absolute", + 'position_embedding_type': ( + "rope_gpt_neox" if nemo_model_config.get('position_embedding_type') == "rope" else "learned_absolute" + ), 'max_position_embeddings': nemo_model_config.get('max_position_embeddings'), 'hidden_act': hidden_act, 'use_parallel_embedding': use_parallel_embedding, 'embedding_sharding_dim': 0, 'share_embedding_table': False, - 'quantization': {'quant_algo': None, 'kv_cache_quant_algo': None,}, + 'quantization': { + 'quant_algo': None, + 'kv_cache_quant_algo': None, + }, 'bias': nemo_model_config.get('bias'), 'apply_query_key_layer_scaling': False, 'rotary_pct': nemo_model_config.get('rotary_percentage', 1.0), diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index a05c392bee63..ac8d9094ea32 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -60,11 +60,11 @@ def serialize_engine(engine, path): def refit_runtime_engine(params, cuda_engine): ''' - @brief: Inplace refit one TensorRT cuda engine using weights from the network, - user should guarantee that the engine is built with REFIT flag, and the network has the same structure with the engine. - @param engine_buffer: A serialized TensorRT engine. - @param network: Network object. - @return: A serialized TRT engine if refit successfully, None otherwise + @brief: Inplace refit one TensorRT cuda engine using weights from the network, + user should guarantee that the engine is built with REFIT flag, and the network has the same structure with the engine. + @param engine_buffer: A serialized TensorRT engine. + @param network: Network object. 
+ @return: A serialized TRT engine if refit successfully, None otherwise ''' logger.info(f'Refit runtime engine') tik = time.time() @@ -91,7 +91,11 @@ def refit_runtime_engine(params, cuda_engine): def build_rank_engine( - tensorrt_llm_gpt, builder: Builder, builder_config: tensorrt_llm.builder.BuilderConfig, engine_name, args, + tensorrt_llm_gpt, + builder: Builder, + builder_config: tensorrt_llm.builder.BuilderConfig, + engine_name, + args, ): str_dtype_to_trt(args.dtype) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 201032b86615..92fc36272f7c 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -330,7 +330,10 @@ def load_refit( # Manipulate the tensorrt_llm mapping to make it compatible with the multiprocessed env. assert tensorrt_llm.mpi_world_size() == torch.distributed.get_world_size(), "MPI world size mismatch" runtime_mapping = tensorrt_llm.Mapping( - world_size=tensorrt_llm.mpi_world_size(), rank=runtime_rank, tp_size=tensorrt_llm.mpi_world_size(), pp_size=1, + world_size=tensorrt_llm.mpi_world_size(), + rank=runtime_rank, + tp_size=tensorrt_llm.mpi_world_size(), + pp_size=1, ) engine_name = get_engine_name( @@ -361,7 +364,9 @@ def load_refit( lora_manager = LoraManager() if lora_ckpt_list is not None: lora_manager.load_from_nemo( - model_files=lora_ckpt_list, model_config=model_config, runtime_mapping=runtime_mapping, + model_files=lora_ckpt_list, + model_config=model_config, + runtime_mapping=runtime_mapping, ) else: lora_manager = None diff --git a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py index c75d81ac2d06..5e5833444f65 100644 --- a/scripts/export/export_to_trt_llm.py +++ b/scripts/export/export_to_trt_llm.py @@ -85,7 +85,16 @@ def get_args(argv): '--lora_target_modules', nargs='+', default=None, - choices=["attn_qkv", "attn_q", "attn_k", "attn_v", "attn_dense", "mlp_h_to_4h", "mlp_gate", "mlp_4h_to_h",], + choices=[ + "attn_qkv", + "attn_q", + "attn_k", + "attn_v", + "attn_dense", + "mlp_h_to_4h", + "mlp_gate", + "mlp_4h_to_h", + ], help="Add lora in which modules. Only be activated when use_lora_plugin is enabled.", ) parser.add_argument(