From 6e9e318e91f126313219aa77b27f14b3d69aa87b Mon Sep 17 00:00:00 2001 From: Kevin Xie Date: Thu, 28 Sep 2023 09:00:05 -0700 Subject: [PATCH 1/4] Update code --- .../cpp}/CMakeLists.txt | 0 {cpp/benchmarks => benchmarks/cpp}/README.md | 4 +- .../cpp}/bertBenchmark.cpp | 4 +- .../cpp}/gptSessionBenchmark.cpp | 9 +- benchmarks/{ => python}/README.md | 10 +- benchmarks/{ => python}/allowed_configs.py | 98 +- benchmarks/{ => python}/base_benchmark.py | 0 benchmarks/{ => python}/benchmark.py | 0 benchmarks/{ => python}/bert_benchmark.py | 0 benchmarks/{ => python}/gpt_benchmark.py | 46 +- benchmarks/{ => python}/mem_monitor.py | 0 cpp/CMakeLists.txt | 114 +- .../modules/find_library_create_target.cmake | 5 +- .../tensorrt_llm/batch_manager/GptManager.h | 35 +- .../batch_manager/batchScheduler.h | 79 + .../batch_manager/kvCacheManager.h | 31 +- .../tensorrt_llm/batch_manager/llmRequest.h | 117 +- .../batch_manager/trtGptModelOptionalParams.h | 68 + .../tensorrt_llm/runtime/gptJsonConfig.h | 23 +- .../tensorrt_llm/runtime/gptModelConfig.h | 46 +- cpp/include/tensorrt_llm/runtime/gptSession.h | 17 +- .../runtime/iStatefulGptDecoder.h | 1 + cpp/include/tensorrt_llm/runtime/tllmLogger.h | 7 +- .../tensorrt_llm/runtime/worldConfig.h | 59 +- cpp/tensorrt_llm/CMakeLists.txt | 15 +- .../libtensorrt_llm_batch_manager_static.a | 3 - ...sorrt_llm_batch_manager_static.pre_cxx11.a | 3 - cpp/tensorrt_llm/common/assert.h | 4 + cpp/tensorrt_llm/common/cublasMMWrapper.cpp | 360 ++- cpp/tensorrt_llm/common/cublasMMWrapper.h | 25 +- cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh | 4 +- cpp/tensorrt_llm/common/cudaDriverWrapper.cpp | 7 + cpp/tensorrt_llm/common/cudaTypeUtils.cuh | 22 +- cpp/tensorrt_llm/common/cudaUtils.h | 42 +- cpp/tensorrt_llm/common/int8Utils.cuh | 60 - cpp/tensorrt_llm/common/logger.h | 8 + cpp/tensorrt_llm/common/memoryUtils.cu | 6 +- cpp/tensorrt_llm/common/nvtxUtils.h | 44 + cpp/tensorrt_llm/common/reduceKernelUtils.cuh | 2 +- cpp/tensorrt_llm/common/stringUtils.h | 6 + cpp/tensorrt_llm/common/tensor.cpp | 18 +- cpp/tensorrt_llm/common/tensor.h | 3 +- cpp/tensorrt_llm/common/tllmException.cpp | 4 +- .../gemm/warp/mma_tensorop_dequantizer.h | 4 +- cpp/tensorrt_llm/kernels/banBadWords.cu | 3 +- cpp/tensorrt_llm/kernels/banRepeatNgram.cu | 3 +- .../kernels/beamSearchPenaltyKernels.cu | 53 +- .../kernels/beamSearchPenaltyKernels.h | 8 +- .../kernels/beamSearchTopkKernels.cu | 25 +- .../fmhaRunner.cpp | 5 + .../cutlass_kernels/cutlass_heuristic.cpp | 6 + .../fpA_intB_gemm/fpA_intB_gemm.h | 33 +- .../fpA_intB_gemm/fpA_intB_gemm_template.h | 61 +- .../cutlass_kernels/int8_gemm/int8_gemm.h | 61 +- .../int8_gemm/int8_gemm_template.h | 129 +- .../kernels/decoderMaskedMultiheadAttention.h | 2 + .../decoderMaskedMultiheadAttentionLaunch.h | 8 +- .../decoderMaskedMultiheadAttentionTemplate.h | 6 +- .../decoderMaskedMultiheadAttentionUtils.h | 138 +- cpp/tensorrt_llm/kernels/decodingKernels.cu | 76 +- cpp/tensorrt_llm/kernels/decodingKernels.h | 10 +- cpp/tensorrt_llm/kernels/gptKernels.h | 7 + .../kernels/onlineSoftmaxBeamsearchKernels.cu | 35 +- cpp/tensorrt_llm/kernels/quantization.cu | 1 - .../kernels/samplingPenaltyKernels.cu | 113 +- .../kernels/samplingPenaltyKernels.h | 11 +- .../kernels/samplingTopKKernels.cu | 46 +- .../kernels/stopCriteriaKernels.cu | 3 +- .../kernels/unfusedAttentionKernels.cu | 20 +- .../kernels/unfusedAttentionKernels.h | 4 +- .../kernels/weightOnlyBatchedGemv/common.h | 81 + .../kernels/weightOnlyBatchedGemv/kernel.h | 430 +++ .../weightOnlyBatchedGemv/kernelLauncher.cu | 
224 ++ .../kernelLauncher.h} | 18 +- .../kernels/weightOnlyBatchedGemv/utility.h | 99 + .../weightOnlyBatchedGemvBs1Int4b.cu | 98 + .../weightOnlyBatchedGemvBs1Int8b.cu | 98 + .../weightOnlyBatchedGemvBs2Int4b.cu | 97 + .../weightOnlyBatchedGemvBs2Int8b.cu | 97 + .../weightOnlyBatchedGemvBs3Int4b.cu | 98 + .../weightOnlyBatchedGemvBs3Int8b.cu | 98 + .../weightOnlyBatchedGemvBs4Int4b.cu | 97 + .../weightOnlyBatchedGemvBs4Int8b.cu | 98 + ...OnlyGroupwiseMatrixVectorMultiplication.cu | 236 -- .../weightOnlyMatrixVectorMultiplication.cu | 381 --- .../weightOnlyMatrixVectorMultiplication.h | 48 - .../layers/baseBeamSearchLayer.cu | 30 +- cpp/tensorrt_llm/layers/baseBeamSearchLayer.h | 6 +- cpp/tensorrt_llm/layers/baseSamplingLayer.cpp | 22 +- cpp/tensorrt_llm/layers/baseSamplingLayer.h | 4 +- .../layers/dynamicDecodeLayer.cpp | 25 +- .../layers/onlineBeamSearchLayer.cu | 8 +- cpp/tensorrt_llm/layers/topPSamplingLayer.cu | 1 - cpp/tensorrt_llm/plugins/CMakeLists.txt | 21 +- cpp/tensorrt_llm/plugins/api/InferPlugin.cpp | 179 -- cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp | 209 ++ cpp/tensorrt_llm/plugins/api/tllmPlugin.h | 56 + .../bertAttentionPlugin.cpp | 62 +- .../bertAttentionPlugin/bertAttentionPlugin.h | 30 +- .../plugins/common/checkMacrosPlugin.cpp | 126 +- .../plugins/common/checkMacrosPlugin.h | 276 +- .../plugins/common/gemmPluginProfiler.h | 470 +++ cpp/tensorrt_llm/plugins/common/plugin.cpp | 51 +- cpp/tensorrt_llm/plugins/common/plugin.h | 100 +- cpp/tensorrt_llm/plugins/exports.map | 9 +- .../plugins/gemmPlugin/gemmPlugin.cpp | 364 ++- .../plugins/gemmPlugin/gemmPlugin.h | 118 +- .../gptAttentionCommon/gptAttentionCommon.cpp | 106 +- .../gptAttentionCommon/gptAttentionCommon.h | 59 +- .../gptAttentionCommonImpl.h | 9 +- .../gptAttentionPlugin/gptAttentionPlugin.cpp | 126 +- .../gptAttentionPlugin/gptAttentionPlugin.h | 47 +- .../plugins/identityPlugin/identityPlugin.cpp | 30 +- .../plugins/identityPlugin/identityPlugin.h | 30 +- .../layernormPlugin/layernormPlugin.cpp | 38 +- .../plugins/layernormPlugin/layernormPlugin.h | 30 +- .../layernormQuantizationPlugin.cpp | 44 +- .../layernormQuantizationPlugin.h | 30 +- .../plugins/lookupPlugin/lookupPlugin.cpp | 40 +- .../plugins/lookupPlugin/lookupPlugin.h | 30 +- .../plugins/ncclPlugin/allgatherPlugin.cpp | 34 +- .../plugins/ncclPlugin/allgatherPlugin.h | 30 +- .../plugins/ncclPlugin/allreducePlugin.cpp | 34 +- .../plugins/ncclPlugin/allreducePlugin.h | 30 +- .../plugins/ncclPlugin/recvPlugin.cpp | 34 +- .../plugins/ncclPlugin/recvPlugin.h | 30 +- .../plugins/ncclPlugin/sendPlugin.cpp | 34 +- .../plugins/ncclPlugin/sendPlugin.h | 30 +- .../quantizePerTokenPlugin.cpp | 38 +- .../quantizePerTokenPlugin.h | 29 +- .../quantizeTensorPlugin.cpp | 40 +- .../quantizeTensorPlugin.h | 29 +- .../plugins/rmsnormPlugin/rmsnormPlugin.cpp | 36 +- .../plugins/rmsnormPlugin/rmsnormPlugin.h | 30 +- .../rmsnormQuantizationPlugin.cpp | 42 +- .../rmsnormQuantizationPlugin.h | 30 +- .../smoothQuantGemmPlugin.cpp | 222 +- .../smoothQuantGemmPlugin.h | 83 +- .../weightOnlyGroupwiseQuantMatmulPlugin.cpp | 190 +- .../weightOnlyGroupwiseQuantMatmulPlugin.h | 90 +- .../weightOnlyQuantMatmulPlugin.cpp | 193 +- .../weightOnlyQuantMatmulPlugin.h | 76 +- cpp/tensorrt_llm/runtime/CMakeLists.txt | 9 +- cpp/tensorrt_llm/runtime/bufferManager.cpp | 18 +- cpp/tensorrt_llm/runtime/gptDecoder.cpp | 35 +- cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp | 40 +- cpp/tensorrt_llm/runtime/gptJsonConfig.cpp | 29 +- cpp/tensorrt_llm/runtime/gptSession.cpp | 281 +- 
cpp/tensorrt_llm/runtime/ncclCommunicator.cpp | 135 + cpp/tensorrt_llm/runtime/ncclCommunicator.h | 44 + cpp/tensorrt_llm/runtime/runtimeBuffers.cpp | 289 +- cpp/tensorrt_llm/runtime/runtimeBuffers.h | 33 +- cpp/tensorrt_llm/runtime/runtimeKernels.cu | 131 +- .../runtime/statefulGptDecoder.cpp | 44 +- cpp/tensorrt_llm/runtime/tllmRuntime.cpp | 42 +- cpp/tensorrt_llm/runtime/tllmRuntime.h | 4 +- cpp/tensorrt_llm/runtime/torchView.h | 1 + .../runtime/utils/multiDeviceUtils.h | 53 + .../runtime/utils/sessionUtils.cpp | 38 +- cpp/tensorrt_llm/runtime/utils/sessionUtils.h | 12 +- cpp/tensorrt_llm/runtime/worldConfig.cpp | 38 +- cpp/tensorrt_llm/thop/CMakeLists.txt | 6 +- cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp | 4 +- cpp/tensorrt_llm/thop/dynamicDecodeOp.h | 4 +- cpp/tensorrt_llm/thop/fp8Op.cpp | 6 +- cpp/tensorrt_llm/thop/gatherTreeOp.cpp | 28 +- cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp | 16 +- cpp/tests/README.md | 8 + cpp/tests/resources/.gitignore | 1 + .../resources/scripts/build_gpt_engines.py | 9 - .../resources/scripts/build_gptj_engines.py | 8 +- .../resources/scripts/build_llama_engines.py | 86 + .../scripts/generate_expected_llama_output.py | 59 + cpp/tests/resources/scripts/test_cpp.py | 48 +- cpp/tests/runtime/gptDecoderBatchTest.cpp | 36 +- cpp/tests/runtime/gptDecoderTest.cpp | 5 +- cpp/tests/runtime/gptSessionTest.cpp | 122 +- cpp/tests/runtime/runtimeKernelTest.cpp | 257 +- docker/Dockerfile.multi | 26 +- docker/Makefile | 4 +- docs/Doxygen | 2658 +++++++++++++++++ docs/Makefile | 20 + docs/README.md | 40 + docs/graph-rewriting.md | 193 ++ docs/make.bat | 35 + docs/requirements.txt | 4 + .../2023-05-17-how-to-add-a-new-model.md | 0 docs/{ => source}/2023-05-19-how-to-debug.md | 0 docs/source/CONTRIBUTING.md | 69 + README.md => docs/source/README.md | 0 docs/{ => source}/architecture.md | 0 docs/source/conf.py | 88 + docs/{ => source}/gpt_attention.md | 0 docs/{ => source}/gpt_runtime.md | 7 +- docs/{ => source}/in_flight_batching.md | 0 docs/source/index.rst | 65 + docs/{ => source}/performance.md | 0 docs/{ => source}/precision.md | 0 .../python-api/tensorrt_llm.functional.rst | 11 + .../source/python-api/tensorrt_llm.layers.rst | 69 + .../source/python-api/tensorrt_llm.models.rst | 11 + .../source/python-api/tensorrt_llm.plugin.rst | 10 + .../python-api/tensorrt_llm.quantization.rst | 10 + .../python-api/tensorrt_llm.runtime.rst | 11 + examples/baichuan/build.py | 19 +- examples/baichuan/run.py | 5 +- examples/baichuan/summarize.py | 3 +- examples/bloom/README.md | 20 +- examples/bloom/build.py | 41 +- examples/bloom/summarize.py | 3 +- examples/bloom/weight.py | 25 +- examples/chatglm2-6b/build.py | 2 + examples/chatglm6b/build.py | 2 + examples/cpp_library/main.cpp | 1 - examples/cpp_library/tensorrt_llm_libutils.h | 8 + examples/falcon/README.md | 35 +- examples/falcon/build.py | 140 +- examples/falcon/requirements.txt | 1 + examples/falcon/run.py | 191 +- examples/falcon/summarize.py | 66 +- examples/falcon/weight.py | 139 +- examples/gpt/README.md | 7 +- examples/gpt/build.py | 34 +- examples/gpt/hf_gpt_convert.py | 7 + examples/gpt/run.py | 5 +- examples/gpt/summarize.py | 3 +- examples/gpt/weight.py | 18 +- examples/gptj/README.md | 30 + examples/gptj/build.py | 21 +- examples/gptj/run.py | 5 +- examples/gptj/summarize.py | 3 +- examples/gptj/weight.py | 10 +- examples/gptneox/README.md | 1 + examples/gptneox/build.py | 26 +- examples/gptneox/summarize.py | 3 +- examples/llama/README.md | 89 +- examples/llama/build.py | 108 +- examples/llama/convert.py | 4 +- 
examples/llama/hf_llama_convert.py | 25 + examples/llama/run.py | 22 +- examples/llama/summarize.py | 22 +- examples/llama/weight.py | 346 ++- examples/mpt/build.py | 16 + examples/mpt/run.py | 5 +- examples/openai_triton/CMakeLists.txt | 19 +- .../TritonFlashAttentionPlugin.cpp | 45 +- .../TritonFlashAttentionPlugin.h | 30 +- examples/openai_triton/plugin.py | 6 +- examples/openai_triton/tritonPlugins.cpp | 20 +- examples/opt/build.py | 4 +- examples/opt/summarize.py | 3 +- examples/quantization/summarize.py | 3 +- requirements-dev-windows.txt | 25 + requirements-dev.txt | 3 +- requirements-windows.txt | 20 + requirements.txt | 4 +- scripts/build_wheel.py | 80 +- setup.py | 29 +- tensorrt_llm/__init__.py | 6 +- tensorrt_llm/_common.py | 6 +- tensorrt_llm/_utils.py | 1 + tensorrt_llm/builder.py | 48 +- tensorrt_llm/functional.py | 220 +- tensorrt_llm/graph_rewriting.py | 16 +- tensorrt_llm/layers/__init__.py | 5 +- tensorrt_llm/layers/attention.py | 170 +- tensorrt_llm/layers/linear.py | 28 +- tensorrt_llm/models/baichuan/model.py | 114 +- tensorrt_llm/models/bloom/model.py | 161 +- tensorrt_llm/models/chatglm2_6b/model.py | 225 +- tensorrt_llm/models/chatglm6b/model.py | 185 +- tensorrt_llm/models/falcon/model.py | 230 +- tensorrt_llm/models/generation_mixin.py | 319 +- tensorrt_llm/models/gpt/model.py | 204 +- tensorrt_llm/models/gptj/model.py | 116 +- tensorrt_llm/models/gptneox/model.py | 175 +- tensorrt_llm/models/llama/model.py | 272 +- tensorrt_llm/models/opt/model.py | 130 +- tensorrt_llm/models/quantized/quant.py | 20 +- tensorrt_llm/network.py | 30 +- tensorrt_llm/plugin/__init__.py | 8 +- tensorrt_llm/plugin/plugin.py | 29 +- tensorrt_llm/quantization/functional.py | 20 +- tensorrt_llm/quantization/layers.py | 215 +- tensorrt_llm/runtime/generation.py | 182 +- tensorrt_llm/tools/__init__.py | 0 tensorrt_llm/tools/plugin_gen/__init__.py | 0 tensorrt_llm/tools/plugin_gen/core.py | 693 +++++ tensorrt_llm/tools/plugin_gen/plugin_gen.py | 339 +++ tensorrt_llm/tools/plugin_gen/shape_infer.py | 322 ++ .../plugin_gen/templates/CMakeLists.txt.tpl | 73 + .../plugin_gen/templates/functional.py.tpl | 70 + .../tools/plugin_gen/templates/plugin.cpp.tpl | 302 ++ .../tools/plugin_gen/templates/plugin.h.tpl | 120 + .../templates/tritonPlugins.cpp.tpl | 147 + tests/attention/test_gpt_attention.py | 196 +- tests/attention/test_gpt_attention_IFB.py | 127 +- tests/functional/test_arange.py | 91 + tests/model/test_bert.py | 3 +- tests/model/test_bloom.py | 5 +- tests/model/test_falcon.py | 8 +- tests/model/test_gpt.py | 106 +- tests/model/test_gptj.py | 31 +- tests/model/test_gptneox.py | 2 +- tests/model/test_llama.py | 18 +- tests/quantization/test_smooth_quant_gemm.py | 4 + tests/test_graph_rewriter.py | 49 +- tests/test_layer.py | 21 +- tests/test_plugins.py | 17 + tests/tools/__init__.py | 0 tests/tools/plugin_gen/__init__.py | 0 tests/tools/plugin_gen/build_engine.py | 195 ++ tests/tools/plugin_gen/kernel_config.py | 49 + tests/tools/plugin_gen/run_engine.py | 169 ++ tests/tools/plugin_gen/test_core.py | 58 + tests/tools/plugin_gen/test_plugin_gen.py | 21 + tests/tools/plugin_gen/test_shape_infer.py | 59 + 317 files changed, 15996 insertions(+), 6294 deletions(-) rename {cpp/benchmarks => benchmarks/cpp}/CMakeLists.txt (100%) rename {cpp/benchmarks => benchmarks/cpp}/README.md (91%) rename {cpp/benchmarks => benchmarks/cpp}/bertBenchmark.cpp (99%) rename {cpp/benchmarks => benchmarks/cpp}/gptSessionBenchmark.cpp (96%) rename benchmarks/{ => python}/README.md (75%) rename benchmarks/{ => 
python}/allowed_configs.py (83%) rename benchmarks/{ => python}/base_benchmark.py (100%) rename benchmarks/{ => python}/benchmark.py (100%) rename benchmarks/{ => python}/bert_benchmark.py (100%) rename benchmarks/{ => python}/gpt_benchmark.py (93%) rename benchmarks/{ => python}/mem_monitor.py (100%) create mode 100644 cpp/include/tensorrt_llm/batch_manager/batchScheduler.h create mode 100644 cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h delete mode 100644 cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.a delete mode 100644 cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.pre_cxx11.a delete mode 100644 cpp/tensorrt_llm/common/int8Utils.cuh create mode 100644 cpp/tensorrt_llm/common/nvtxUtils.h create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.cu rename cpp/tensorrt_llm/kernels/{weightOnlyGroupwiseMatrixVectorMultiplication.h => weightOnlyBatchedGemv/kernelLauncher.h} (52%) create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int4b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int8b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int4b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int8b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int4b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int8b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int4b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int8b.cu delete mode 100644 cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.cu delete mode 100644 cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.cu delete mode 100644 cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.h delete mode 100644 cpp/tensorrt_llm/plugins/api/InferPlugin.cpp create mode 100644 cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp create mode 100644 cpp/tensorrt_llm/plugins/api/tllmPlugin.h create mode 100644 cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.h mode change 100755 => 100644 cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h mode change 100755 => 100644 cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h mode change 100755 => 100644 cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h create mode 100644 cpp/tensorrt_llm/runtime/ncclCommunicator.cpp create mode 100644 cpp/tensorrt_llm/runtime/ncclCommunicator.h create mode 100644 cpp/tensorrt_llm/runtime/utils/multiDeviceUtils.h create mode 100644 cpp/tests/resources/scripts/build_llama_engines.py create mode 100644 cpp/tests/resources/scripts/generate_expected_llama_output.py create mode 100644 docs/Doxygen create mode 100644 docs/Makefile create mode 100644 docs/README.md create mode 100644 docs/graph-rewriting.md create mode 100644 docs/make.bat create mode 100644 docs/requirements.txt rename docs/{ => source}/2023-05-17-how-to-add-a-new-model.md (100%) rename docs/{ => source}/2023-05-19-how-to-debug.md (100%) create mode 100644 
docs/source/CONTRIBUTING.md rename README.md => docs/source/README.md (100%) rename docs/{ => source}/architecture.md (100%) create mode 100644 docs/source/conf.py rename docs/{ => source}/gpt_attention.md (100%) rename docs/{ => source}/gpt_runtime.md (98%) rename docs/{ => source}/in_flight_batching.md (100%) create mode 100644 docs/source/index.rst rename docs/{ => source}/performance.md (100%) rename docs/{ => source}/precision.md (100%) create mode 100644 docs/source/python-api/tensorrt_llm.functional.rst create mode 100644 docs/source/python-api/tensorrt_llm.layers.rst create mode 100644 docs/source/python-api/tensorrt_llm.models.rst create mode 100644 docs/source/python-api/tensorrt_llm.plugin.rst create mode 100644 docs/source/python-api/tensorrt_llm.quantization.rst create mode 100644 docs/source/python-api/tensorrt_llm.runtime.rst create mode 100644 requirements-dev-windows.txt create mode 100644 requirements-windows.txt create mode 100644 tensorrt_llm/tools/__init__.py create mode 100644 tensorrt_llm/tools/plugin_gen/__init__.py create mode 100644 tensorrt_llm/tools/plugin_gen/core.py create mode 100644 tensorrt_llm/tools/plugin_gen/plugin_gen.py create mode 100644 tensorrt_llm/tools/plugin_gen/shape_infer.py create mode 100644 tensorrt_llm/tools/plugin_gen/templates/CMakeLists.txt.tpl create mode 100644 tensorrt_llm/tools/plugin_gen/templates/functional.py.tpl create mode 100644 tensorrt_llm/tools/plugin_gen/templates/plugin.cpp.tpl create mode 100644 tensorrt_llm/tools/plugin_gen/templates/plugin.h.tpl create mode 100644 tensorrt_llm/tools/plugin_gen/templates/tritonPlugins.cpp.tpl create mode 100644 tests/functional/test_arange.py create mode 100644 tests/test_plugins.py create mode 100644 tests/tools/__init__.py create mode 100644 tests/tools/plugin_gen/__init__.py create mode 100644 tests/tools/plugin_gen/build_engine.py create mode 100644 tests/tools/plugin_gen/kernel_config.py create mode 100644 tests/tools/plugin_gen/run_engine.py create mode 100644 tests/tools/plugin_gen/test_core.py create mode 100644 tests/tools/plugin_gen/test_plugin_gen.py create mode 100644 tests/tools/plugin_gen/test_shape_infer.py diff --git a/cpp/benchmarks/CMakeLists.txt b/benchmarks/cpp/CMakeLists.txt similarity index 100% rename from cpp/benchmarks/CMakeLists.txt rename to benchmarks/cpp/CMakeLists.txt diff --git a/cpp/benchmarks/README.md b/benchmarks/cpp/README.md similarity index 91% rename from cpp/benchmarks/README.md rename to benchmarks/cpp/README.md index e9c996d545f..d28378726a4 100644 --- a/cpp/benchmarks/README.md +++ b/benchmarks/cpp/README.md @@ -7,7 +7,7 @@ multiple GPUs or multiple nodes with multiple GPUs. ### 1. Build TensorRT-LLM and benchmarking source code -Please follow the [`installation document`](../../README.md) to build TensorRT-LLM. +Please follow the [`installation document`](../../../README.md) to build TensorRT-LLM. After that, you can build benchmarking source code for C++ runtime ``` @@ -19,7 +19,7 @@ make -j benchmarks Before you launch C++ benchmarking, please make sure that you have already built engine(s) using TensorRT-LLM API, C++ benchmarking code cannot generate engine(s) for you. -You can reuse the engine built by benchmarking code for Python Runtime, please see that [`document`](../../benchmarks/README.md). +You can reuse the engine built by benchmarking code for Python Runtime, please see that [`document`](../python/README.md). 
For detailed usage, you can do the following ``` diff --git a/cpp/benchmarks/bertBenchmark.cpp b/benchmarks/cpp/bertBenchmark.cpp similarity index 99% rename from cpp/benchmarks/bertBenchmark.cpp rename to benchmarks/cpp/bertBenchmark.cpp index 48fb8492cf0..06f148cad87 100644 --- a/cpp/benchmarks/bertBenchmark.cpp +++ b/benchmarks/cpp/bertBenchmark.cpp @@ -15,13 +15,13 @@ * limitations under the License. */ #include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/tllmLogger.h" #include "tensorrt_llm/runtime/tllmRuntime.h" #include "tensorrt_llm/runtime/worldConfig.h" #include -#include #include #include #include @@ -228,7 +228,7 @@ int main(int argc, char* argv[]) { throw std::invalid_argument("Unexpected log level: " + logLevel); } - initLibNvInferPlugins(logger.get(), "tensorrt_llm"); + initTrtLlmPlugins(logger.get()); benchmarkBert(result["model"].as(), result["engine_dir"].as(), batchSizes, inLens, logger, result["warm_up"].as(), result["num_runs"].as(), result["duration"].as()); diff --git a/cpp/benchmarks/gptSessionBenchmark.cpp b/benchmarks/cpp/gptSessionBenchmark.cpp similarity index 96% rename from cpp/benchmarks/gptSessionBenchmark.cpp rename to benchmarks/cpp/gptSessionBenchmark.cpp index c6b08f242e9..ebe7b8de12f 100644 --- a/cpp/benchmarks/gptSessionBenchmark.cpp +++ b/benchmarks/cpp/gptSessionBenchmark.cpp @@ -15,12 +15,12 @@ * limitations under the License. */ #include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/runtime/gptJsonConfig.h" #include "tensorrt_llm/runtime/gptSession.h" #include "tensorrt_llm/runtime/tllmLogger.h" #include -#include #include #include #include @@ -41,7 +41,10 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con auto const json = GptJsonConfig::parse(dataPath / "config.json"); auto const modelConfig = json.getModelConfig(); auto const inputPacked = modelConfig.usePackedInput(); - auto const worldConfig = WorldConfig::mpi(*logger); + SizeType deviceCount{0}; + TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + auto const worldConfig + = WorldConfig::mpi(*logger, deviceCount, json.getTensorParallelism(), json.getPipelineParallelism()); auto const enginePath = dataPath / json.engineFilename(worldConfig, modelName); auto const dtype = modelConfig.getDataType(); auto const useHalf = (dtype == nvinfer1::DataType::kHALF); @@ -233,7 +236,7 @@ int main(int argc, char* argv[]) // Argument: Enable CUDA graph auto enableCudaGraph = result.count("enable_cuda_graph") > 0; - initLibNvInferPlugins(logger.get(), "tensorrt_llm"); + initTrtLlmPlugins(logger.get()); benchmarkGptSession(result["model"].as(), result["engine_dir"].as(), batchSizes, inOutLen, logger, result["warm_up"].as(), result["num_runs"].as(), result["duration"].as(), diff --git a/benchmarks/README.md b/benchmarks/python/README.md similarity index 75% rename from benchmarks/README.md rename to benchmarks/python/README.md index d7edddb2aaf..39e0743789a 100644 --- a/benchmarks/README.md +++ b/benchmarks/python/README.md @@ -5,12 +5,12 @@ multiple GPUs or multiple nodes with multiple GPUs. ## Overview -The benchmark implementation and entrypoint can be found in [`benchmarks/benchmark.py`](./benchmark.py). There are some other scripts in the directory: +The benchmark implementation and entrypoint can be found in [`benchmarks/python/benchmark.py`](./benchmark.py). 
There are some other scripts in the directory: -* [`benchmarks/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model. -* [`benchmarks/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmark. -* [`benchmarks/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like(LLaMA/OPT/GPT-J/SmoothQuant-GPT) models. -* [`benchmarks/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models. +* [`benchmarks/python/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model. +* [`benchmarks/python/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmark. +* [`benchmarks/python/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like(LLaMA/OPT/GPT-J/SmoothQuant-GPT) models. +* [`benchmarks/python/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models. ## Usage diff --git a/benchmarks/allowed_configs.py b/benchmarks/python/allowed_configs.py similarity index 83% rename from benchmarks/allowed_configs.py rename to benchmarks/python/allowed_configs.py index 51e566a8514..5a6ce1a7670 100644 --- a/benchmarks/allowed_configs.py +++ b/benchmarks/python/allowed_configs.py @@ -14,12 +14,12 @@ # limitations under the License. from typing import Literal, Optional -from pydantic import BaseModel +from pydantic import BaseModel, Extra from tensorrt_llm.functional import PositionEmbeddingType -class BuildConfig(BaseModel): +class BuildConfig(BaseModel, extra=Extra.allow): num_layers: int num_heads: int hidden_size: int @@ -28,10 +28,10 @@ class BuildConfig(BaseModel): n_positions: int max_batch_size: int max_input_len: int - num_kv_heads: int = None + num_kv_heads: Optional[int] = None max_output_len: Optional[int] = None - builder_opt: Optional[ - int] = None # TRT builder_optimization_level from 0 to 5 + # TRT builder_optimization_level from 0 to 5 + builder_opt: Optional[int] = None inter_size: Optional[int] = None rotary_dim: Optional[int] = None type_vocab_size: Optional[int] = None @@ -44,11 +44,10 @@ class BuildConfig(BaseModel): enable_context_fmha: bool = True # None means using the model family's default value defined in the ctor position_embedding_type: Optional[PositionEmbeddingType] = None - # Only when position embedding is RoPE, this value makes sense, make default value to be None, not 0 or 1 - # to prevent misuse + # Only when position embedding is RoPE, this value makes sense, make + # default value to be None, not 0 or 1 to prevent misuse rotary_pct: Optional[float] = None bias: bool = True - remove_input_padding: bool = True class ModelConfig(BaseModel): @@ -439,6 +438,89 @@ class ModelConfig(BaseModel): enable_qk_half_accum=False, enable_context_fmha=False, )), + "falcon_rw_1b": + ModelConfig(name="falcon_rw_1b", + family="falcon", + benchmark_type="gpt", + build_config=BuildConfig( + num_layers=24, + num_heads=32, + hidden_size=2048, + vocab_size=50304, + hidden_act=None, + n_positions=2048, + max_batch_size=256, + max_input_len=1024, + max_output_len=1024, + builder_opt=None, + bias=True, + use_alibi=True, + parallel_attention=False, + new_decoder_architecture=False, + )), + "falcon_7b": + ModelConfig(name="falcon_7b", + family="falcon", + benchmark_type="gpt", + build_config=BuildConfig( + num_layers=32, + num_heads=71, + num_kv_heads=1, + hidden_size=4544, + vocab_size=65024, + hidden_act=None, + n_positions=2048, + max_batch_size=128, + 
max_input_len=512, + max_output_len=200, + builder_opt=None, + bias=False, + use_alibi=False, + parallel_attention=True, + new_decoder_architecture=False, + )), + "falcon_40b": + ModelConfig(name="falcon_40b", + family="falcon", + benchmark_type="gpt", + build_config=BuildConfig( + num_layers=60, + num_heads=128, + num_kv_heads=8, + hidden_size=8192, + vocab_size=65024, + hidden_act=None, + n_positions=2048, + max_batch_size=64, + max_input_len=512, + max_output_len=200, + builder_opt=None, + bias=False, + use_alibi=False, + parallel_attention=True, + new_decoder_architecture=False, + )), + "falcon_180b": + ModelConfig(name="falcon_180b", + family="falcon", + benchmark_type="gpt", + build_config=BuildConfig( + num_layers=80, + num_heads=232, + num_kv_heads=8, + hidden_size=14848, + vocab_size=65024, + hidden_act=None, + n_positions=2048, + max_batch_size=8, + max_input_len=1024, + max_output_len=1024, + builder_opt=None, + bias=False, + use_alibi=False, + parallel_attention=True, + new_decoder_architecture=False, + )), } diff --git a/benchmarks/base_benchmark.py b/benchmarks/python/base_benchmark.py similarity index 100% rename from benchmarks/base_benchmark.py rename to benchmarks/python/base_benchmark.py diff --git a/benchmarks/benchmark.py b/benchmarks/python/benchmark.py similarity index 100% rename from benchmarks/benchmark.py rename to benchmarks/python/benchmark.py diff --git a/benchmarks/bert_benchmark.py b/benchmarks/python/bert_benchmark.py similarity index 100% rename from benchmarks/bert_benchmark.py rename to benchmarks/python/bert_benchmark.py diff --git a/benchmarks/gpt_benchmark.py b/benchmarks/python/gpt_benchmark.py similarity index 93% rename from benchmarks/gpt_benchmark.py rename to benchmarks/python/gpt_benchmark.py index f11f868272c..e66c7a180e7 100644 --- a/benchmarks/gpt_benchmark.py +++ b/benchmarks/python/gpt_benchmark.py @@ -81,22 +81,24 @@ def __init__(self, self.per_token = False self.per_channel = False - self.use_gpt_attention_plugin = False - self.use_gemm_plugin = False - self.use_layernorm_plugin = False - self.use_rmsnorm_plugin = False - self.use_lookup_plugin = False + is_plugin_mode = mode == 'plugin' + plg_dtype = dtype if is_plugin_mode else False + self.use_gpt_attention_plugin = plg_dtype + self.use_gemm_plugin = plg_dtype + self.use_layernorm_plugin = plg_dtype + # Enable RMS Norm plugin for the LLaMA family. + if is_plugin_mode and 'llama' in model_name: + self.use_rmsnorm_plugin = dtype + else: + self.use_rmsnorm_plugin = False + self.use_lookup_plugin = plg_dtype self.enable_context_fmha = True self.quant_mode = QuantMode(0) - if mode == 'plugin': - self.use_gpt_attention_plugin = dtype - self.use_gemm_plugin = dtype - self.use_layernorm_plugin = dtype - self.use_lookup_plugin = dtype - if "llama" in model_name: - self.use_rmsnorm_plugin = dtype + self.remove_input_padding = is_plugin_mode + for key, value in get_build_config(model_name).items(): setattr(self, key, value) + # Override the n_position/max_input_len/max_output_len/max_batch_size to value from cmd line if that's specified. 
if n_positions is not None: assert isinstance( @@ -122,6 +124,7 @@ def __init__(self, self.num_kv_heads = self.num_heads if kwargs.get('force_num_layer_1', False): self.num_layers = 1 + if self.use_smooth_quant: self.quant_mode = QuantMode.use_smooth_quant( self.per_token, self.per_channel) @@ -195,7 +198,7 @@ def prepare_inputs(self, config): input_lengths = torch.tensor([inlen for _ in range(batch_size)]).int().cuda() - self.decoder.setup(batch_size, inlen, outlen) + self.decoder.setup(batch_size, inlen, outlen, beam_width=self.num_beams) return (input_ids, input_lengths) def build(self): @@ -334,6 +337,21 @@ def build(self): world_size=self.world_size, tp_size=self.world_size), # TP only use_parallel_embedding=(self.model_name == 'bloom_176b')) + elif family == "falcon": + tensorrt_llm_model = tensorrt_llm.models.FalconForCausalLM( + num_layers=self.num_layers, + num_heads=self.num_heads, + num_kv_heads=self.num_kv_heads, + hidden_size=self.hidden_size, + vocab_size=self.vocab_size, + max_position_embeddings=self.n_positions, + dtype=kv_dtype, + bias=self.bias, + use_alibi=self.use_alibi, + new_decoder_architecture=self.new_decoder_architecture, + parallel_attention=self.parallel_attention, + mapping=tensorrt_llm.Mapping(world_size=self.world_size, + tp_size=self.world_size)) else: raise Exception(f'Unexpected model: {self.model_name}') @@ -429,7 +447,7 @@ def build(self): def run(self, inputs, config): batch_size, inlen, outlen = config[0], config[1], config[2] - self.decoder.setup(batch_size, inlen, outlen) + self.decoder.setup(batch_size, inlen, outlen, beam_width=self.num_beams) if self.remove_input_padding: self.decoder.decode_batch(inputs[0], self.sampling_config) else: diff --git a/benchmarks/mem_monitor.py b/benchmarks/python/mem_monitor.py similarity index 100% rename from benchmarks/mem_monitor.py rename to benchmarks/python/mem_monitor.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1b94dabce9d..58ee3e8a638 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,6 +28,14 @@ project(tensorrt_llm LANGUAGES CXX) option(BUILD_PYT "Build in PyTorch TorchScript class mode" ON) option(BUILD_TESTS "Build Google tests" ON) option(BUILD_BENCHMARKS "Build benchmarks" ON) +option(NVTX_DISABLE "Disable all NVTX features" ON) + +if(NVTX_DISABLE) + add_compile_definitions("NVTX_DISABLE") + message(STATUS "NVTX is disabled") +else() + message(STATUS "NVTX is enabled") +endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/batch_manager/CMakeLists.txt") @@ -67,15 +75,35 @@ endif() check_language(CUDA) if(CMAKE_CUDA_COMPILER) message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}") - execute_process( - COMMAND - "bash" "-c" - "${CMAKE_CUDA_COMPILER} --version | egrep -o 'V[0-9]+.[0-9]+.[0-9]+' | cut -c2-" - RESULT_VARIABLE _BASH_SUCCESS - OUTPUT_VARIABLE CMAKE_CUDA_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - if(NOT _BASH_SUCCESS EQUAL 0) - message(FATAL_ERROR "Failed to determine CUDA version") + if(NOT WIN32) # Linux + execute_process( + COMMAND + "bash" "-c" + "${CMAKE_CUDA_COMPILER} --version | egrep -o 'V[0-9]+.[0-9]+.[0-9]+' | cut -c2-" + RESULT_VARIABLE _BASH_SUCCESS + OUTPUT_VARIABLE CMAKE_CUDA_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT _BASH_SUCCESS EQUAL 0) + message(FATAL_ERROR "Failed to determine CUDA version") + endif() + + else() # Windows + execute_process( + COMMAND ${CMAKE_CUDA_COMPILER} --version + OUTPUT_VARIABLE versionString + RESULT_VARIABLE versionResult) + + if(versionResult EQUAL 0 AND versionString MATCHES + 
"V[0-9]+\\.[0-9]+\\.[0-9]+") + string(REGEX REPLACE "V" "" version ${CMAKE_MATCH_0}) + set(CMAKE_CUDA_COMPILER_VERSION "${version}") + else() + message(FATAL_ERROR "Failed to determine CUDA version") + endif() + + # Export shared libs as both `.lib` and `.dll` to avoid linking errors. + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() else() message(FATAL_ERROR "No CUDA compiler found") @@ -102,85 +130,94 @@ message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES}") enable_language(CUDA) -# TODO: FindCUDA is deprecated and should be replaced by FindCUDAToolkit -# https://cmake.org/cmake/help/latest/module/FindCUDA.html -find_package(CUDA ${CUDA_REQUIRED_VERSION} REQUIRED) - -message(STATUS "CUDA_TOOLKIT_ROOT_DIR: ${CUDA_TOOLKIT_ROOT_DIR}") +find_package(CUDAToolkit REQUIRED) find_library( CUDNN_LIB cudnn - HINTS ${CUDA_TOOLKIT_ROOT_DIR} ${CUDNN_ROOT_DIR} - PATH_SUFFIXES lib64 lib) + HINTS ${CUDNN_ROOT_DIR} ${CUDAToolkit_LIBRARY_DIR} + PATH_SUFFIXES lib64 lib lib/x64) find_library( CUBLAS_LIB cublas - HINTS ${CUDA_TOOLKIT_ROOT_DIR} + HINTS ${CUDAToolkit_LIBRARY_DIR} PATH_SUFFIXES lib64 lib lib/stubs) find_library( CUBLASLT_LIB cublasLt - HINTS ${CUDA_TOOLKIT_ROOT_DIR} + HINTS ${CUDAToolkit_LIBRARY_DIR} PATH_SUFFIXES lib64 lib lib/stubs) -find_library( - CUDART_LIB cudart - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64) find_library( CUDA_DRV_LIB cuda - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64 lib/stubs lib64/stubs) -set(CUDA_LIBRARIES ${CUDART_LIB}) + HINTS ${CUDAToolkit_LIBRARY_DIR} + PATH_SUFFIXES stubs lib lib64 lib/stubs lib64/stubs) + +set(CMAKE_CUDA_RUNTIME_LIBRARY Static) find_library(RT_LIB rt) set_ifndef(ENABLE_MULTI_DEVICE 1) if(ENABLE_MULTI_DEVICE EQUAL 1) # NCCL dependencies - set_ifndef(NCCL_LIB_DIR /usr/lib/x86_64-linux-gnu/) + set_ifndef(NCCL_LIB_DIR /usr/lib/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu/) set_ifndef(NCCL_INCLUDE_DIR /usr/include/) find_library(NCCL_LIB nccl HINTS ${NCCL_LIB_DIR}) endif() -set(3RDPARTY_DIR ../3rdparty) -include_directories(${CUDA_INCLUDE_DIRS} ${CUDNN_ROOT_DIR}/include - ${NCCL_INCLUDE_DIR} ${3RDPARTY_DIR}/cutlass/include) +get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_SOURCE_DIR} PATH) + +set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty) +include_directories( + ${CUDA_INCLUDE_DIRS} ${CUDNN_ROOT_DIR}/include ${NCCL_INCLUDE_DIR} + ${3RDPARTY_DIR}/cutlass/include ${3RDPARTY_DIR}/NVTX/include) # TRT dependencies set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR}) -set_ifndef(TRT_INCLUDE_DIR /usr/include/x86_64-linux-gnu) +set_ifndef(TRT_INCLUDE_DIR /usr/include/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu) set(TRT_LIB nvinfer) find_library_create_target(${TRT_LIB} nvinfer SHARED ${TRT_LIB_DIR}) find_library_create_target(nvuffparser nvparsers SHARED ${TRT_LIB_DIR}) -if(${CUDA_VERSION} VERSION_GREATER_EQUAL "11") +if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11") add_definitions("-DENABLE_BF16") message( STATUS - "CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.0, enable -DENABLE_BF16 flag" + "CUDAToolkit_VERSION ${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR} is greater or equal than 11.0, enable -DENABLE_BF16 flag" ) endif() -if(${CUDA_VERSION} VERSION_GREATER_EQUAL "11.8") +if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11.8") add_definitions("-DENABLE_FP8") message( STATUS - "CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.8, enable -DENABLE_FP8 flag" + "CUDAToolkit_VERSION ${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR} is greater or 
equal than 11.8, enable -DENABLE_FP8 flag" ) endif() +# MPI MPI isn't used until tensorrt_llm/CMakeLists.txt is invoked. However, if +# it's not called before "CMAKE_CXX_FLAGS" is set, it breaks on Windows for some +# reason, so we just call it here as a workaround. +find_package(MPI REQUIRED) + # C++17 set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_FLAGS - "-Wno-deprecated-declarations ${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE}" + "${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE}" ) +# Disable deprecated declarations warnings +if(NOT WIN32) + set(CMAKE_CXX_FLAGS "-Wno-deprecated-declarations ${CMAKE_CXX_FLAGS}") +else() + # /wd4996 is the Windows equivalent to turn off warnings for deprecated + # declarations + set(CMAKE_CXX_FLAGS "/wd4996 ${CMAKE_CXX_FLAGS}") +endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") -set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}) -set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDA_PATH}/include) +set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDAToolkit_INCLUDE_DIR}) message(STATUS "COMMON_HEADER_DIRS: ${COMMON_HEADER_DIRS}") if(BUILD_PYT) @@ -277,5 +314,6 @@ if(BUILD_TESTS) endif() if(BUILD_BENCHMARKS) - add_subdirectory(benchmarks) + add_subdirectory(${TRT_LLM_ROOT_DIR}/benchmarks/cpp + ${CMAKE_BINARY_DIR}/benchmarks) endif() diff --git a/cpp/cmake/modules/find_library_create_target.cmake b/cpp/cmake/modules/find_library_create_target.cmake index c315d88ca78..1af806f19d0 100644 --- a/cpp/cmake/modules/find_library_create_target.cmake +++ b/cpp/cmake/modules/find_library_create_target.cmake @@ -31,8 +31,9 @@ macro(find_library_create_target target_name lib libtype hints) find_library(${lib}_LIB_PATH ${lib}) message(STATUS "Library that was found ${${lib}_LIB_PATH}") add_library(${target_name} ${libtype} IMPORTED) - set_property(TARGET ${target_name} PROPERTY IMPORTED_LOCATION - ${${lib}_LIB_PATH}) + set_target_properties( + ${target_name} PROPERTIES IMPORTED_LOCATION ${${lib}_LIB_PATH} + IMPORTED_IMPLIB ${${lib}_LIB_PATH}) message( STATUS "==========================================================================================" diff --git a/cpp/include/tensorrt_llm/batch_manager/GptManager.h b/cpp/include/tensorrt_llm/batch_manager/GptManager.h index 3adb7a9e5e9..d626e1f7659 100644 --- a/cpp/include/tensorrt_llm/batch_manager/GptManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/GptManager.h @@ -17,8 +17,10 @@ #pragma once #include "tensorrt_llm/batch_manager/BatchManager.h" +#include "tensorrt_llm/batch_manager/batchScheduler.h" #include "tensorrt_llm/batch_manager/callbacks.h" #include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h" #include #include #include @@ -42,12 +44,13 @@ class TrtGptModel; class GptManager { public: - using RequestTable = std::map; + using SizeType = tensorrt_llm::runtime::SizeType; + using RequestList = std::list>; - GptManager(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType, int32_t mMaxSeqLen, - int32_t maxNumRequests, int32_t maxBeamWidth, GetInferenceRequestsCallback getInferenceRequestsCb, + GptManager(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType, int32_t maxBeamWidth, + batch_scheduler::SchedulerPolicy schedulerPolicy, GetInferenceRequestsCallback getInferenceRequestsCb, 
SendResponseCallback sendResponseCb, PollStopSignalCallback pollStopSignalCb = nullptr, - std::optional maxTokensInPagedKvCache = std::nullopt); + const TrtGptModelOptionalParams& optionalParams = TrtGptModelOptionalParams()); /* Wraps the user-provided callback for requests. Adds requests to request table. @@ -56,9 +59,8 @@ class GptManager /* Does the following: 1. Returns completed requests to Triton - 2. Frees KV cache and other dedicated resources - 3. Deletes entry from request_table */ - BatchManagerErrorCode_t return_completed_requests(); + 2. Deletes entry from activeRequests */ + BatchManagerErrorCode_t returnCompletedRequests(); BatchManagerErrorCode_t pollStopSignals(); @@ -69,20 +71,23 @@ class GptManager 1. Maps batch manager requests to backend request 2. Invokes one step of backend 3. Updates state of all requests */ - virtual BatchManagerErrorCode_t step(RequestTable& requestTable); + virtual BatchManagerErrorCode_t step(RequestList& activeRequests, std::set& activeRequestsIds); private: - void validateLlmRequest(LlmRequest& newReq); - static LlmRequest fillLlmRequest(std::shared_ptr newReq); + void validateLlmRequest(LlmRequest& newReq) const; + static std::shared_ptr fillLlmRequest(std::shared_ptr newReq); static std::shared_ptr> getReqInputTokens(std::shared_ptr new_req); static int32_t getMaxNewTokens(std::shared_ptr newReq); std::shared_ptr mTrtGptModel; - int32_t mMaxNumRequests; - int32_t mMaxSeqLen; - - // Table of live requests - std::map mRequestTable; + SizeType mMaxInputLen; + SizeType mMaxOutputLen; + SizeType mMaxNumSequences; + + // List of live requests + RequestList mActiveRequests; + // IDs of live requests + std::set mActiveRequestsIds; GetInferenceRequestsCallback mGetInferenceRequestsCb; SendResponseCallback mSendResponseCb; diff --git a/cpp/include/tensorrt_llm/batch_manager/batchScheduler.h b/cpp/include/tensorrt_llm/batch_manager/batchScheduler.h new file mode 100644 index 00000000000..4208c8b4c2b --- /dev/null +++ b/cpp/include/tensorrt_llm/batch_manager/batchScheduler.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/runtime/common.h" +#include + +namespace tensorrt_llm::batch_manager::batch_scheduler +{ + +enum class SchedulerPolicy +{ + MAX_UTILIZATION, + GUARANTEED_COMPLETION, +}; + +class BatchScheduler +{ +public: + using RequestTable = std::map>; + using SizeType = tensorrt_llm::runtime::SizeType; + using RequestList = std::list>; + + BatchScheduler(int32_t maxNumRequests, int32_t maxInputLen, + std::shared_ptr kvCacheManager, SchedulerPolicy schedulerPolicy) + : mMaxNumRequests(maxNumRequests) + , mMaxInputLen(maxInputLen) + , mKvCacheManager(kvCacheManager) + , mSchedulerPolicy(schedulerPolicy) + { + } + + /// @brief Takes as input a sorted list of requests and outputs a map of requests + /// to update for this current iteration + RequestTable scheduleRequests(const RequestList& requestList); + +private: + /// @brief Schedule requests using the MAX_UTILIZATION policy + RequestTable scheduleRequestsMaxUtilization(const RequestList& requestList); + + /// @brief Try reserving resources to advance this req by one step, using MAX_UTILIZATION policy + bool trySchedulingRequestMaxUtilization( + const LlmRequest& req, SizeType& numScheduledRequests, SizeType& numScheduledBlocks); + + /// @brief Schedule requests using the GUARANTEED_COMPLETION policy + RequestTable scheduleRequestsGuaranteedCompletion(const RequestList& requestList); + + /// @brief Schedule up to mMaxNumRequests requests + RequestTable scheduleMaxNumRequests(const RequestList& requestList); + + /// The maximum number of requests to schedule + int32_t mMaxNumRequests; + + /// The maximum prompt length + int32_t mMaxInputLen; + + std::shared_ptr mKvCacheManager; + + /// The scheduling policy to use + SchedulerPolicy mSchedulerPolicy; +}; + +} // namespace tensorrt_llm::batch_manager::batch_scheduler diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index cd5845851a9..c6498620720 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -16,8 +16,10 @@ #pragma once +#include "tensorrt_llm/batch_manager/llmRequest.h" #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/cudaStream.h" +#include "tensorrt_llm/runtime/gptModelConfig.h" #include "tensorrt_llm/runtime/iTensor.h" #include @@ -26,11 +28,6 @@ #include #include -namespace tensorrt_llm::runtime -{ -class GptModelConfig; -} - namespace tensorrt_llm::batch_manager::kv_cache_manager { @@ -158,7 +155,6 @@ class BlockManager return mFreeBlocks.size(); } -private: [[nodiscard]] bool hasFreeBlocks(std::size_t numRequired = 1) const { return getNumFreeBlocks() >= numRequired; @@ -203,6 +199,17 @@ class KVCacheManager return mBlockManager; } + /// @brief Function that computes the number of KV cache blocks needed to advance a request by one iteration + /// @param req The request for which we need to calculate the number of needed KV cache blocks + /// @return The number of blocks + SizeType getNeededBlocksOneStep(const LlmRequest& req) const; + + /// @brief Function that computes the number of KV cache blocks needed to advance a request to completion (i.e.
for + /// maxNewTokens) + /// @param req The request for which we need to calculate the number of needed KV cache blocks + /// @return The number of blocks + SizeType getNeededBlocksToCompletion(const LlmRequest& req) const; + [[nodiscard]] std::vector const& getMemoryPools() const { return mPools; @@ -217,15 +224,21 @@ class KVCacheManager [[nodiscard]] std::vector getBlockPointersOfSlot( SizeType batchSlotIdx, SizeType beamWidth, SizeType maxBlocksPerSeq) const; - [[nodiscard]] std::vector getBlockPointersOfBatch( + [[nodiscard]] runtime::ITensor::UniquePtr getBlockPointersOfBatch( SizeType batchSize, SizeType beamWidth, SizeType maxBlocksPerSeq) const; // Volume of [2, numKvHeads, tokensPerBlock, sizePerHead] - [[nodiscard]] static SizeType constexpr calculatePageSize(tensorrt_llm::runtime::GptModelConfig const& modelConfig); + [[nodiscard]] static SizeType constexpr calculatePageSize(tensorrt_llm::runtime::GptModelConfig const& modelConfig) + { + return 2 * modelConfig.getNbKvHeads() * modelConfig.getTokensPerBlock() * modelConfig.getSizePerHead(); + } // numLayers * 2 * numKvHeads * sizePerHead [[nodiscard]] static SizeType constexpr calculateCacheSizePerToken( - tensorrt_llm::runtime::GptModelConfig const& modelConfig); + tensorrt_llm::runtime::GptModelConfig const& modelConfig) + { + return modelConfig.getNbLayers() * 2 * modelConfig.getNbKvHeads() * modelConfig.getSizePerHead(); + } private: // Number of elements per one blocks diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 913576f1286..500561caefa 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -16,9 +16,9 @@ #pragma once -#include "tensorrt_llm/batch_manager/BatchManager.h" #include "tensorrt_llm/runtime/samplingConfig.h" +#include #include #include #include @@ -52,24 +52,135 @@ class LlmRequest , mIsStreaming(isStreaming) , mEndId(endId) , mPadId(padId) + , mBatchSlot(-1) { + mMaxSentTokenPos = mPromptLen - 1; // Scatter the input tokens to other beam mTokens = std::make_shared(mSamplingConfig.beamWidth, *input_tokens); } + /// @brief Get total number of tokens for this req (prompt + generated) + /// @param beam The beam index + /// @return The number of tokens + int32_t getNumTokens(int beam) const + { + return mTokens->at(beam).size(); + } + + /// @brief Get a token at a given position and beam index + /// @param beam The beam index + /// @param pos The position of the token relative to beginning of the prompt + /// @return The token index + int32_t getToken(int beam, int pos) const + { + return mTokens->at(beam).at(pos); + } + + /// @brief Get the tokens at a given beam index + /// @param beam The beam index + /// @return A vector of tokens for this beam index, includes the prompt + std::vector getTokens(int beam) const + { + return mTokens->at(beam); + } + + /// @brief Get the number of generated tokens + /// @return The number of generated tokens (doesn't include the prompt tokens) + int32_t getNumGeneratedTokens() const + { + return mNumGeneratedTokens; + } + + /// @brief Add new generated tokens to the vector of tokens + /// @param beamTokens A vector containing the tokens to add for each beam index + /// beamTokens is expected to be of size beamWidth + void addNewTokens(const std::vector& beamTokens) + { + assert(mSamplingConfig.beamWidth == beamTokens.size()); + for (int beam = 0; beam < beamTokens.size(); ++beam) + { + mTokens->at(beam).push_back(beamTokens[beam]); 
+ } + mNumGeneratedTokens++; + } + + /// @brief Sets the generated tokens for all beams. Erases all previous generated tokens. + /// @param generatedBeamTokens The generated tokens for all beams (vector of vector of tokens) + void setGeneratedTokens(const BeamTokens& generatedBeamTokens) + { + assert(generatedBeamTokens.size() == mSamplingConfig.beamWidth); + for (int beam = 0; beam < generatedBeamTokens.size(); ++beam) + { + auto& beamTokens = (*mTokens)[beam]; + beamTokens.resize(mPromptLen); + beamTokens.insert(beamTokens.end(), generatedBeamTokens[beam].begin(), generatedBeamTokens[beam].end()); + } + mNumGeneratedTokens = generatedBeamTokens.at(0).size(); + } + + /// @brief Pause a request by moving the generated tokens to the prompt + /// @param maxInputLen The maximum prompt len. + void pause(SizeType maxInputLen) + { + // TODO: For beamWidth > 1, we would need to support swapping to avoid + // recomputing from the start + // See https://jirasw.nvidia.com/browse/TRT-21715 + // As a temporary solution, we currently reset the tokens to the prompt + if (mSamplingConfig.beamWidth > 1) + { + for (auto& beamTokens : *mTokens) + { + beamTokens.resize(mPromptLen); + } + } + else + { + SizeType newPromptLen = std::min(maxInputLen, mPromptLen + mNumGeneratedTokens); + for (auto& beamTokens : *mTokens) + { + beamTokens.resize(newPromptLen); + } + mMaxNewTokens -= (newPromptLen - mPromptLen); + mPromptLen = newPromptLen; + } + mNumGeneratedTokens = 0; + mState = REQUEST_STATE_CONTEXT_INIT; + mBatchSlot = -1; + } + + /// @brief Get the maximum position of the tokens returned to the client. Use to ensure we don't return to client + /// duplicated token positions. + /// @return The maximum position of the tokens sent to the client + int32_t getMaxSentTokenPos() const + { + return mMaxSentTokenPos; + } + + /// @brief Sets the maximum position of the tokens returned to the client. Use to ensure we don't return to client + /// duplicated token positions. + /// @param pos The maximum position + void setMaxSentTokenPos(int32_t pos) + { + mMaxSentTokenPos = pos; + } + uint64_t mRequestId; int32_t mMaxNewTokens; // Tokens [beam_size, mPromptLen + mNumGeneratedTokens] - std::shared_ptr mTokens; runtime::SamplingConfig mSamplingConfig; int32_t mPromptLen; - int32_t mNumGeneratedTokens; LlmRequestState_t mState; bool mIsStreaming; std::optional mEndId; std::optional mPadId; + int32_t mBatchSlot; ~LlmRequest() {} + +private: + std::shared_ptr mTokens; + int32_t mNumGeneratedTokens; + int32_t mMaxSentTokenPos; }; } // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h new file mode 100644 index 00000000000..43184c935dc --- /dev/null +++ b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h @@ -0,0 +1,68 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/runtime/common.h" + +#include + +namespace tensorrt_llm::batch_manager +{ + +class TrtGptModelOptionalParams +{ +public: + using SizeType = tensorrt_llm::runtime::SizeType; + + TrtGptModelOptionalParams() + : mMaxNumSequences(std::nullopt) + , mMaxTokensInPagedKvCache(std::nullopt) + , mKvCacheFreeGpuMemFraction(std::nullopt) + { + } + + TrtGptModelOptionalParams(std::optional maxNumSequences, std::optional maxTokensInPagedKvCache, + std::optional kvCacheFreeGpuMemFraction) + : mMaxNumSequences(maxNumSequences) + , mMaxTokensInPagedKvCache(maxTokensInPagedKvCache) + , mKvCacheFreeGpuMemFraction(kvCacheFreeGpuMemFraction) + { + } + + [[nodiscard]] std::optional getMaxTokensInPagedKvCache() const + { + return mMaxTokensInPagedKvCache; + } + + [[nodiscard]] std::optional getKvCacheFreeGpuMemFraction() const + { + return mKvCacheFreeGpuMemFraction; + } + + [[nodiscard]] std::optional getMaxNumSequences() const + { + return mMaxNumSequences; + } + +private: + std::optional mMaxNumSequences; + std::optional mMaxTokensInPagedKvCache; + std::optional mKvCacheFreeGpuMemFraction; +}; + +} // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/runtime/gptJsonConfig.h b/cpp/include/tensorrt_llm/runtime/gptJsonConfig.h index efd7a590a1c..5226fd0e5b5 100644 --- a/cpp/include/tensorrt_llm/runtime/gptJsonConfig.h +++ b/cpp/include/tensorrt_llm/runtime/gptJsonConfig.h @@ -32,10 +32,12 @@ namespace tensorrt_llm::runtime class GptJsonConfig { public: - GptJsonConfig(std::string name, std::string precision, SizeType worldSize, GptModelConfig const& modelConfig) + GptJsonConfig(std::string name, std::string precision, SizeType tensorParallelism, SizeType pipelineParallelism, + GptModelConfig const& modelConfig) : mName(std::move(name)) , mPrecision(std::move(precision)) - , mWorldSize{worldSize} + , mTensorParallelism{tensorParallelism} + , mPipelineParallelism{pipelineParallelism} , mGptModelConfig(modelConfig) { } @@ -61,9 +63,19 @@ class GptJsonConfig return mPrecision; } - [[nodiscard]] SizeType const& getWorldSize() const + [[nodiscard]] SizeType constexpr getTensorParallelism() const { - return mWorldSize; + return mTensorParallelism; + } + + [[nodiscard]] SizeType constexpr getPipelineParallelism() const + { + return mPipelineParallelism; + } + + [[nodiscard]] SizeType constexpr getWorldSize() const + { + return mTensorParallelism * mPipelineParallelism; } [[nodiscard]] std::string engineFilename(WorldConfig const& worldConfig, std::string const& model) const; @@ -76,7 +88,8 @@ class GptJsonConfig private: std::string const mName; std::string const mPrecision; - SizeType const mWorldSize; + SizeType const mTensorParallelism; + SizeType const mPipelineParallelism; GptModelConfig const mGptModelConfig; }; diff --git a/cpp/include/tensorrt_llm/runtime/gptModelConfig.h b/cpp/include/tensorrt_llm/runtime/gptModelConfig.h index 863aa35c4ea..5c3892b6ae8 100644 --- a/cpp/include/tensorrt_llm/runtime/gptModelConfig.h +++ b/cpp/include/tensorrt_llm/runtime/gptModelConfig.h @@ -35,11 +35,13 @@ class GptModelConfig , mHiddenSize(hiddenSize) , mDataType(dtype) , mUseGptAttentionPlugin(false) - , mUseInflightBatching(false) , mInputPacked{false} , mPagedKvCache{false} , mTokensPerBlock{64} , mQuantMode{common::QuantMode::none()} + , mMaxBatchSize(0) + , mMaxInputLen(0) + , mMaxOutputLen(0) { } @@ -53,9 +55,10 @@ class GptModelConfig return 
(mVocabSize + worldSize - 1) / worldSize * worldSize; } - [[nodiscard]] SizeType constexpr getNbLayers() const noexcept + [[nodiscard]] SizeType constexpr getNbLayers(SizeType pipelineParallelism = 1) const { - return mNbLayers; + TLLM_CHECK(mNbLayers % pipelineParallelism == 0); + return mNbLayers / pipelineParallelism; } [[nodiscard]] SizeType constexpr getNbHeads() const noexcept @@ -138,14 +141,39 @@ class GptModelConfig mQuantMode = QuantMode; } - [[nodiscard]] bool constexpr useInflightBatching() const noexcept + [[nodiscard]] bool constexpr supportsInflightBatching() const noexcept { - return mUseInflightBatching; + return mUseGptAttentionPlugin && mInputPacked && mPagedKvCache; } - void constexpr useInflightBatching(bool useInflightBatching) noexcept + [[nodiscard]] SizeType constexpr getMaxBatchSize() const noexcept { - mUseInflightBatching = useInflightBatching; + return mMaxBatchSize; + } + + void constexpr setMaxBatchSize(SizeType maxBatchSize) noexcept + { + mMaxBatchSize = maxBatchSize; + } + + [[nodiscard]] SizeType constexpr getMaxInputLen() const noexcept + { + return mMaxInputLen; + } + + void constexpr setMaxInputLen(SizeType maxInputLen) noexcept + { + mMaxInputLen = maxInputLen; + } + + [[nodiscard]] SizeType constexpr getMaxOutputLen() const noexcept + { + return mMaxOutputLen; + } + + void constexpr setMaxOutputLen(SizeType maxOutputLen) noexcept + { + mMaxOutputLen = maxOutputLen; } private: @@ -156,11 +184,13 @@ class GptModelConfig SizeType mHiddenSize; nvinfer1::DataType mDataType; bool mUseGptAttentionPlugin; - bool mUseInflightBatching; bool mInputPacked; bool mPagedKvCache; SizeType mTokensPerBlock; common::QuantMode mQuantMode; + SizeType mMaxBatchSize; + SizeType mMaxInputLen; + SizeType mMaxOutputLen; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/gptSession.h b/cpp/include/tensorrt_llm/runtime/gptSession.h index def229827fd..dca47a446cd 100644 --- a/cpp/include/tensorrt_llm/runtime/gptSession.h +++ b/cpp/include/tensorrt_llm/runtime/gptSession.h @@ -25,13 +25,13 @@ #include "tensorrt_llm/runtime/samplingConfig.h" #include "tensorrt_llm/runtime/worldConfig.h" +#include + #include #include #include #include -#include - namespace tensorrt_llm::batch_manager::kv_cache_manager { class KVCacheManager; @@ -47,6 +47,7 @@ std::vector loadEngine(std::string const& enginePath); class TllmRuntime; class IStatefulGptDecoder; +class NcclCommunicator; class RuntimeBuffers; class GptSession @@ -109,6 +110,9 @@ class GptSession void createContexts(); void createDecoder(bool decoderPerRequest); + bool executeDecoderStep(ITensor::SharedPtr& outputIds, ITensor::SharedPtr& newTokens, SizeType decoderStep); + void finalizeOutputIds(ITensor& outputIds); + class CudaGraphExecutor { public: @@ -131,13 +135,15 @@ class GptSession return mInstance != nullptr; } + void clear(); + void prepareNextGraph(TllmRuntime const& runtime, SizeType nextContextId); + void launch(CudaStream const& stream); + + private: void create(cudaGraph_t const& graph); bool update(cudaGraph_t const& graph); void uploadToStream(CudaStream const& stream); - void launch(CudaStream const& stream); - void clear(); - private: using cudaGraphExecPtr = cudaGraphExec_t; cudaGraphExecPtr mInstance; }; @@ -146,6 +152,7 @@ class GptSession GptModelConfig const mModelConfig; WorldConfig const mWorldConfig; int mDevice{-1}; + std::shared_ptr mPipelineComm; SizeType mDecoderMaxSequenceLength{}; diff --git a/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h 
b/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h index b6f96dfa8d0..507798e5fb8 100644 --- a/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h +++ b/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h @@ -61,6 +61,7 @@ class Output // parameters for beam search TensorPtr cacheIndirection; // [batchSize, maxBeamWidth, maxSeqLen], mandatory in beam search, on gpu + TensorPtr sequenceLengths; // [batchSize, maxBeamWidth], mandatory, on gpu }; } // namespace decoder diff --git a/cpp/include/tensorrt_llm/runtime/tllmLogger.h b/cpp/include/tensorrt_llm/runtime/tllmLogger.h index 53b20a3a6c2..5ffd39b7893 100644 --- a/cpp/include/tensorrt_llm/runtime/tllmLogger.h +++ b/cpp/include/tensorrt_llm/runtime/tllmLogger.h @@ -18,9 +18,7 @@ #include -namespace tensorrt_llm -{ -namespace runtime +namespace tensorrt_llm::runtime { class TllmLogger : public nvinfer1::ILogger @@ -33,5 +31,4 @@ class TllmLogger : public nvinfer1::ILogger void setLevel(Severity level); }; -} // namespace runtime -} // namespace tensorrt_llm +} // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/worldConfig.h b/cpp/include/tensorrt_llm/runtime/worldConfig.h index 0fbf305be0b..41193f098f3 100644 --- a/cpp/include/tensorrt_llm/runtime/worldConfig.h +++ b/cpp/include/tensorrt_llm/runtime/worldConfig.h @@ -19,6 +19,8 @@ #include "tensorrt_llm/runtime/common.h" #include +#include +#include namespace tensorrt_llm::runtime { @@ -27,9 +29,10 @@ class WorldConfig public: static SizeType constexpr kDefaultGpusPerNode = 8; - constexpr explicit WorldConfig( - SizeType worldSize = 1, SizeType rank = 0, SizeType gpusPerNode = kDefaultGpusPerNode) - : mSize{worldSize} + constexpr explicit WorldConfig(SizeType tensorParallelism = 1, SizeType pipelineParallelism = 1, SizeType rank = 0, + SizeType gpusPerNode = kDefaultGpusPerNode) + : mTensorParallelism{tensorParallelism} + , mPipelineParallelism{pipelineParallelism} , mRank{rank} , mGpusPerNode{gpusPerNode} { @@ -37,7 +40,22 @@ class WorldConfig [[nodiscard]] SizeType constexpr getSize() const noexcept { - return mSize; + return mTensorParallelism * mPipelineParallelism; + } + + [[nodiscard]] SizeType constexpr getTensorParallelism() const noexcept + { + return mTensorParallelism; + } + + [[nodiscard]] SizeType constexpr getPipelineParallelism() const noexcept + { + return mPipelineParallelism; + } + + [[nodiscard]] bool constexpr isPipelineParallel() const noexcept + { + return mPipelineParallelism > 1; } [[nodiscard]] SizeType constexpr getRank() const noexcept @@ -55,12 +73,39 @@ class WorldConfig return mRank % mGpusPerNode; } - static WorldConfig mpi(nvinfer1::ILogger& logger, SizeType gpusPerNode = kDefaultGpusPerNode); + [[nodiscard]] SizeType constexpr getPipelineParallelRank() const noexcept + { + return mRank / mTensorParallelism; + } + + [[nodiscard]] SizeType constexpr getTensorParallelRank() const noexcept + { + return mRank % mTensorParallelism; + } + + [[nodiscard]] bool constexpr isFirstPipelineParallelRank() const noexcept + { + return getPipelineParallelRank() == 0; + } + + [[nodiscard]] bool constexpr isLastPipelineParallelRank() const noexcept + { + return getPipelineParallelRank() == getPipelineParallelism() - 1; + } + + [[nodiscard]] std::vector getPipelineParallelGroup() const; + + static WorldConfig mpi(nvinfer1::ILogger& logger, SizeType gpusPerNode = kDefaultGpusPerNode, + std::optional tensorParallelism = std::nullopt, + std::optional pipelineParallelism = std::nullopt); - static WorldConfig mpi(SizeType gpusPerNode = 
kDefaultGpusPerNode); + static WorldConfig mpi(SizeType gpusPerNode = kDefaultGpusPerNode, + std::optional tensorParallelism = std::nullopt, + std::optional pipelineParallelism = std::nullopt); private: - SizeType mSize; + SizeType mTensorParallelism; + SizeType mPipelineParallelism; SizeType mRank; SizeType mGpusPerNode; }; diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index 49fb5409580..2e6c1acf139 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -24,8 +24,8 @@ set(STATIC_TARGET set(API_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) find_package(MPI REQUIRED) -message(STATUS "Using MPI_INCLUDE_PATH: ${MPI_INCLUDE_PATH}") -message(STATUS "Using MPI_LIBRARIES: ${MPI_LIBRARIES}") +message(STATUS "Using MPI_CXX_INCLUDE_DIRS: ${MPI_CXX_INCLUDE_DIRS}") +message(STATUS "Using MPI_CXX_LIBRARIES: ${MPI_CXX_LIBRARIES}") include_directories(${CMAKE_CURRENT_SOURCE_DIR}/cutlass_extensions/include ${API_INCLUDE_DIR} ${MPI_INCLUDE_PATH}) @@ -66,10 +66,9 @@ endif() set(TRTLLM_LINK_LIBS ${CUBLAS_LIB} ${CUBLASLT_LIB} - ${CUDART_LIB} ${CUDNN_LIB} ${CMAKE_DL_LIBS} - ${MPI_LIBRARIES} + ${MPI_CXX_LIBRARIES} ${TRT_LIB} common_src kernels_src @@ -86,8 +85,14 @@ set_target_properties( ${SHARED_TARGET} PROPERTIES CXX_STANDARD "17" CXX_STANDARD_REQUIRED "YES" CXX_EXTENSIONS "NO") +if(NOT MSVC) # Unix-like compilers + set(ALLOW_UNDEFINED_FLAG "-Wl, --no-undefined") +else() # MSVC + set(UNDEFINED_FLAG "") +endif() + target_link_libraries(${SHARED_TARGET} PUBLIC ${TRTLLM_LINK_LIBS} - "-Wl,--no-undefined") + ${UNDEFINED_FLAG}) # ################################# STATIC LIBRARY # ############################################################################## diff --git a/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.a deleted file mode 100644 index c8105658ffa..00000000000 --- a/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.a +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5a549ddc6871f9499ee1321bd66d8d30c291af3f6320c7a1c6b9276a19bad62a -size 10941362 diff --git a/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.pre_cxx11.a deleted file mode 100644 index 85380f34812..00000000000 --- a/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5a2c0d092b5f2bfa4d57528994048b8bf00010a502e543c2cc26a41f3d788ae4 -size 10932338 diff --git a/cpp/tensorrt_llm/common/assert.h b/cpp/tensorrt_llm/common/assert.h index a54ebba1307..1c4bca699b6 100644 --- a/cpp/tensorrt_llm/common/assert.h +++ b/cpp/tensorrt_llm/common/assert.h @@ -30,7 +30,11 @@ namespace tensorrt_llm::common } // namespace tensorrt_llm::common +#if defined(_WIN32) +#define TLLM_LIKELY(x) (__assume((x) == 1), (x)) +#else #define TLLM_LIKELY(x) __builtin_expect((x), 1) +#endif #define TLLM_CHECK(val) \ do \ diff --git a/cpp/tensorrt_llm/common/cublasMMWrapper.cpp b/cpp/tensorrt_llm/common/cublasMMWrapper.cpp index d7a660f1e81..07e50a9ecbc 100644 --- a/cpp/tensorrt_llm/common/cublasMMWrapper.cpp +++ b/cpp/tensorrt_llm/common/cublasMMWrapper.cpp @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/cublasMMWrapper.h" +#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/cublasVersionCheck.h" #include @@ -70,38 +71,48 @@ void 
cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, c Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f); } +void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, + const void* A, const int lda, const void* B, const int ldb, void* C, const int ldc, + const std::optional& heuristic) +{ + if (heuristic) + { + Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f, (*heuristic).algo, + (*heuristic).state == CUBLAS_STATUS_SUCCESS && (*heuristic).workspaceSize < CUBLAS_WORKSPACE_SIZE); + } + else + { + Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f, {}, false); + } +} + void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, const void* A, const int lda, const void* B, const int ldb, void* C, const int ldc, float f_alpha, float f_beta) { - half h_alpha = (half) (f_alpha); - half h_beta = (half) (f_beta); + bool usingCublasLt = Atype_ == CUDA_R_16F; + bool isFp16ComputeType = computeType_ == CUDA_R_16F; - mu_->lock(); - // TODO: default cublas libs - int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; - bool using_cublasLt = (Atype_ == CUDA_R_16F) ? true : false; int batch_count = 1; - // fp32 use cublas as default - // fp16 use cublasLt as default - const void* alpha = is_fp16_computeType ? reinterpret_cast(&h_alpha) : reinterpret_cast(&f_alpha); - const void* beta = is_fp16_computeType ? reinterpret_cast(&h_beta) : reinterpret_cast(&f_beta); - int findAlgo = cublas_algo_map_->isExist(batch_count, m, n, k, getCublasDataType(Atype_)); cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_)); + + cublasLtMatmulAlgo_t algo; + void* workSpace = cublas_workspace_; + int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE; if (findAlgo) { if (info.stages != -1) { - using_cublasLt = true; + usingCublasLt = true; } else { - using_cublasLt = false; + usingCublasLt = false; } } - if (using_cublasLt) + if (usingCublasLt) { cublasLtMatmulDesc_t operationDesc = NULL; cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; @@ -112,7 +123,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, c cudaDataType_t computeType; #endif - if (is_fp16_computeType) + if (isFp16ComputeType) { #if (CUDART_VERSION >= 11000) computeType = CUBLAS_COMPUTE_16F; @@ -131,23 +142,6 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, c scaleType = CUDA_R_32F; } - // -------------------------------------- - // Create descriptors for the original matrices - cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda); - cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb); - cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType); -#else - cublasLtMatmulDescCreate(&operationDesc, computeType); -#endif - - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t)); - - cublasLtMatmulAlgo_t algo; - void* workSpace = cublas_workspace_; - int workspaceSize = cublas_workspace_ == NULL ? 
0 : CUBLAS_WORKSPACE_SIZE; if (findAlgo) { if (info.workspaceSize > workspaceSize) @@ -174,9 +168,103 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, c #endif } } + } + + Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, f_alpha, f_beta, algo, findAlgo); +} - cublasLtMatmul(*cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, Cdesc, - (findAlgo == 1 ? (&algo) : NULL), workSpace, workspaceSize, stream_); +void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, + const void* A, const int lda, const void* B, const int ldb, void* C, const int ldc, float f_alpha, float f_beta, + const cublasLtMatmulAlgo_t& algo, bool hasAlgo) +{ + half h_alpha = (half) (f_alpha); + half h_beta = (half) (f_beta); + + std::lock_guard lock(*mu_); + + // TODO: default cublas libs + bool usingCublasLt = Atype_ == CUDA_R_16F; + bool isFp16ComputeType = computeType_ == CUDA_R_16F; + int batch_count = 1; + // fp32 use cublas as default + // fp16 use cublasLt as default + const void* alpha = isFp16ComputeType ? reinterpret_cast(&h_alpha) : reinterpret_cast(&f_alpha); + const void* beta = isFp16ComputeType ? reinterpret_cast(&h_beta) : reinterpret_cast(&f_beta); + if (hasAlgo) + { + int32_t stages; + cublasLtMatmulAlgoConfigGetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL); + if (stages != -1) + { + usingCublasLt = true; + } + else + { + usingCublasLt = false; + } + } + + if (usingCublasLt) + { + cublasLtMatmulDesc_t operationDesc = NULL; + cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; + cudaDataType_t scaleType; +#if (CUDART_VERSION >= 11000) + cublasComputeType_t computeType; +#else + cudaDataType_t computeType; +#endif + + if (isFp16ComputeType) + { +#if (CUDART_VERSION >= 11000) + computeType = CUBLAS_COMPUTE_16F; +#else + computeType = CUDA_R_16F; +#endif + scaleType = CUDA_R_16F; + } + else + { +#if (CUDART_VERSION >= 11000) + computeType = CUBLAS_COMPUTE_32F; +#else + computeType = CUDA_R_32F; +#endif + scaleType = CUDA_R_32F; + } + // -------------------------------------- + // Create descriptors for the original matrices + cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda); + cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb); + cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc); +#if (CUDART_VERSION >= 11000) + cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType); +#else + cublasLtMatmulDescCreate(&operationDesc, computeType); +#endif + cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t)); + cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t)); + + void* workSpace = cublas_workspace_; + int workspaceSize = cublas_workspace_ == NULL ? 
0 : CUBLAS_WORKSPACE_SIZE; + if (hasAlgo) + { + cublasLtMatmulHeuristicResult_t heurResult; + // We have to check if the heruistic is correct given current shape size + cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck( + getCublasLtHandle(), operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult); + + if (algoStatus != CUBLAS_STATUS_SUCCESS || heurResult.state != CUBLAS_STATUS_SUCCESS + || heurResult.workspaceSize > CUBLAS_WORKSPACE_SIZE) + { + // Rely on runtime based heruistic + hasAlgo = false; + } + } + + check_cuda_error(cublasLtMatmul(*cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, + Cdesc, (hasAlgo ? (&algo) : NULL), workSpace, workspaceSize, stream_)); cublasLtMatmulDescDestroy(operationDesc); cublasLtMatrixLayoutDestroy(Adesc); @@ -186,12 +274,12 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, c } else { - int cublasAlgo = info.algoId; + // Go with default heruistic to choose tactic as cuBLAS does not allow to choose tactics in Ampere+ + cublasGemmAlgo_t cublasAlgo = CUBLAS_GEMM_DEFAULT; check_cuda_error(cublasGemmEx(*cublas_handle_, transa, transb, m, n, k, alpha, A, Atype_, lda, B, Btype_, ldb, beta, C, Ctype_, ldc, computeType_, static_cast(cublasAlgo))); sync_check_cuda_error(); } - mu_->unlock(); } void cublasMMWrapper::setWorkspace(void* workspace) @@ -201,27 +289,25 @@ void cublasMMWrapper::setWorkspace(void* workspace) void cublasMMWrapper::setFP32GemmConfig() { - Atype_ = CUDA_R_32F; - Btype_ = CUDA_R_32F; - Ctype_ = CUDA_R_32F; - computeType_ = CUDA_R_32F; + setGemmConfig(CUDA_R_32F, CUDA_R_32F, CUDA_R_32F, CUDA_R_32F); } void cublasMMWrapper::setFP16GemmConfig() { - Atype_ = CUDA_R_16F; - Btype_ = CUDA_R_16F; - Ctype_ = CUDA_R_16F; - computeType_ = CUDA_R_32F; + setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); } #ifdef ENABLE_BF16 void cublasMMWrapper::setBF16GemmConfig() { - Atype_ = CUDA_R_16BF; - Btype_ = CUDA_R_16BF; - Ctype_ = CUDA_R_16BF; - computeType_ = CUDA_R_32F; + setGemmConfig(CUDA_R_16BF, CUDA_R_16BF, CUDA_R_16BF, CUDA_R_32F); +} +#endif + +#ifdef ENABLE_FP8 +void cublasMMWrapper::setFP8GemmConfig(cudaDataType_t outputType) +{ + setGemmConfig(CUDA_R_8F_E4M3, CUDA_R_8F_E4M3, outputType, CUDA_R_32F); } #endif @@ -410,20 +496,150 @@ bool cublasMMWrapper::isFuseBatchGemm(const int batch_count, const int m, const } } -std::pair cublasMMWrapper::findBestAlgo(cublasLtHandle_t lightHandle, - cublasLtMatmulDesc_t computeDesc, const void* alpha, const void* A, cublasLtMatrixLayout_t Adesc, const void* B, - cublasLtMatrixLayout_t Bdesc, const void* beta, const void* C, cublasLtMatrixLayout_t Cdesc, void* D, - cublasLtMatrixLayout_t Ddesc, cudaStream_t stream) +std::vector cublasMMWrapper::getTactics(cublasOperation_t transa, + cublasOperation_t transb, const int m, const int n, const int k, const int lda, const int ldb, const int ldc) +{ + int is_fp16_computeType = computeType_ == CUDA_R_16F ? 
1 : 0; + cublasLtMatmulDesc_t operationDesc = NULL; + cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL; + cudaDataType_t scaleType; +#if (CUDART_VERSION >= 11000) + cublasComputeType_t computeType; +#else + cudaDataType_t computeType; +#endif + + if (is_fp16_computeType) + { +#if (CUDART_VERSION >= 11000) + computeType = CUBLAS_COMPUTE_16F; +#else + computeType = CUDA_R_16F; +#endif + scaleType = CUDA_R_16F; + } + else + { +#if (CUDART_VERSION >= 11000) + computeType = CUBLAS_COMPUTE_32F; +#else + computeType = CUDA_R_32F; +#endif + scaleType = CUDA_R_32F; + } + + // -------------------------------------- + // Create descriptors for the original matrices + check_cuda_error( + cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda)); + check_cuda_error( + cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb)); + check_cuda_error(cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc)); +#if (CUDART_VERSION >= 11000) + check_cuda_error(cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType)); +#else + check_cuda_error(cublasLtMatmulDescCreate(&operationDesc, computeType)); +#endif + + check_cuda_error( + cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t))); + check_cuda_error( + cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t))); + + void* workSpace = cublas_workspace_; + int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE; + + const auto heuristics = getTactics(getCublasLtHandle(), operationDesc, Adesc, Bdesc, Cdesc, Cdesc); + + check_cuda_error(cublasLtMatmulDescDestroy(operationDesc)); + check_cuda_error(cublasLtMatrixLayoutDestroy(Adesc)); + check_cuda_error(cublasLtMatrixLayoutDestroy(Bdesc)); + check_cuda_error(cublasLtMatrixLayoutDestroy(Cdesc)); + sync_check_cuda_error(); + + return heuristics; +} + +bool cublasMMWrapper::checkTactic(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, + const int k, const int lda, const int ldb, const int ldc, const cublasLtMatmulHeuristicResult_t& heuristic) const +{ + int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; + cublasLtMatmulDesc_t operationDesc = NULL; + cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL; + cudaDataType_t scaleType; +#if (CUDART_VERSION >= 11000) + cublasComputeType_t computeType; +#else + cudaDataType_t computeType; +#endif + + if (is_fp16_computeType) + { +#if (CUDART_VERSION >= 11000) + computeType = CUBLAS_COMPUTE_16F; +#else + computeType = CUDA_R_16F; +#endif + scaleType = CUDA_R_16F; + } + else + { +#if (CUDART_VERSION >= 11000) + computeType = CUBLAS_COMPUTE_32F; +#else + computeType = CUDA_R_32F; +#endif + scaleType = CUDA_R_32F; + } + + // -------------------------------------- + // Create descriptors for the original matrices + check_cuda_error( + cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda)); + check_cuda_error( + cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? 
n : k, ldb)); + check_cuda_error(cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc)); +#if (CUDART_VERSION >= 11000) + check_cuda_error(cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType)); +#else + check_cuda_error(cublasLtMatmulDescCreate(&operationDesc, computeType)); +#endif + + check_cuda_error( + cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t))); + check_cuda_error( + cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t))); + + void* workSpace = cublas_workspace_; + int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE; + + cublasLtMatmulHeuristicResult_t heurResult; + cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck( + getCublasLtHandle(), operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &heuristic.algo, &heurResult); + + if (algoStatus != CUBLAS_STATUS_SUCCESS || heurResult.state != CUBLAS_STATUS_SUCCESS + || heurResult.workspaceSize > CUBLAS_WORKSPACE_SIZE) + { + return false; + } + + check_cuda_error(cublasLtMatmulDescDestroy(operationDesc)); + check_cuda_error(cublasLtMatrixLayoutDestroy(Adesc)); + check_cuda_error(cublasLtMatrixLayoutDestroy(Bdesc)); + check_cuda_error(cublasLtMatrixLayoutDestroy(Cdesc)); + sync_check_cuda_error(); + + return true; +} + +std::vector cublasMMWrapper::getTactics(cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t computeDesc, cublasLtMatrixLayout_t Adesc, cublasLtMatrixLayout_t Bdesc, + cublasLtMatrixLayout_t Cdesc, cublasLtMatrixLayout_t Ddesc) { #if TLLM_CUBLAS_VER_LE(11, 4, 2) TLLM_CHECK_WITH_INFO(false, "CUBLAS version too low, must be > 11.4.2."); - return {false, cublasLtMatmulAlgo_t{}}; + return {}; #else - size_t returnSize; - int32_t pointer_mode; - cublasLtMatmulDescGetAttribute( - computeDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode), &returnSize); - std::vector heuristics(200); cublasLtMatmulPreference_t preference; check_cuda_error(cublasLtMatmulPreferenceCreate(&preference)); @@ -431,6 +647,10 @@ std::pair cublasMMWrapper::findBestAlgo(cublasLtHand uint64_t workspace_size = CUBLAS_WORKSPACE_SIZE; check_cuda_error(cublasLtMatmulPreferenceSetAttribute( preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); + // Restrict reduction algorithms for numerical stability and better determenism + uint32_t reduction_mask = CUBLASLT_REDUCTION_SCHEME_INPLACE; + check_cuda_error(cublasLtMatmulPreferenceSetAttribute( + preference, CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, &reduction_mask, sizeof(reduction_mask))); #if TLLM_CUBLAS_VER_LT(12, 0, 0) uint32_t pointer_mode_mask = 0; check_cuda_error(cublasLtMatmulPreferenceSetAttribute( @@ -438,10 +658,30 @@ std::pair cublasMMWrapper::findBestAlgo(cublasLtHand #endif int return_count = 0; - auto ret = cublasLtMatmulAlgoGetHeuristic(lightHandle, computeDesc, Adesc, Bdesc, Cdesc, Ddesc, preference, - heuristics.size(), heuristics.data(), &return_count); + check_cuda_error(cublasLtMatmulAlgoGetHeuristic(lightHandle, computeDesc, Adesc, Bdesc, Cdesc, Ddesc, preference, + heuristics.size(), heuristics.data(), &return_count)); heuristics.resize(return_count); + return heuristics; +#endif +} + +std::pair cublasMMWrapper::findBestAlgo(cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t computeDesc, const void* alpha, const void* A, cublasLtMatrixLayout_t Adesc, const void* B, + cublasLtMatrixLayout_t Bdesc, const void* beta, const void* C, cublasLtMatrixLayout_t Cdesc, void* D, + 
cublasLtMatrixLayout_t Ddesc, cudaStream_t stream) +{ +#if TLLM_CUBLAS_VER_LE(11, 4, 2) + TLLM_CHECK_WITH_INFO(false, "CUBLAS version too low, must be > 11.4.2."); + return {false, cublasLtMatmulAlgo_t{}}; +#else + size_t returnSize; + int32_t pointer_mode; + cublasLtMatmulDescGetAttribute( + computeDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode), &returnSize); + + const auto heuristics = getTactics(lightHandle, computeDesc, Adesc, Bdesc, Cdesc, Ddesc); + std::map> algo_results; for (const auto& heuristic : heuristics) { diff --git a/cpp/tensorrt_llm/common/cublasMMWrapper.h b/cpp/tensorrt_llm/common/cublasMMWrapper.h index d1302d46e35..1fd8b64a0b0 100644 --- a/cpp/tensorrt_llm/common/cublasMMWrapper.h +++ b/cpp/tensorrt_llm/common/cublasMMWrapper.h @@ -23,6 +23,7 @@ #include #include #include +#include #include namespace tensorrt_llm @@ -65,6 +66,16 @@ class cublasMMWrapper const void* beta, const void* C, cublasLtMatrixLayout_t Cdesc, void* D, cublasLtMatrixLayout_t Ddesc, const cublasLtMatmulAlgo_t* algo, void* workspace, size_t workspaceSizeInBytes, cudaStream_t stream); + bool checkTactic(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, + const int lda, const int ldb, const int ldc, const cublasLtMatmulHeuristicResult_t& algo) const; + + std::vector getTactics(cublasOperation_t transa, cublasOperation_t transb, + const int m, const int n, const int k, const int lda, const int ldb, const int ldc); + + std::vector getTactics(cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t computeDesc, cublasLtMatrixLayout_t Adesc, cublasLtMatrixLayout_t Bdesc, + cublasLtMatrixLayout_t Cdesc, cublasLtMatrixLayout_t Ddesc); + std::pair findBestAlgo(cublasLtHandle_t lightHandle, cublasLtMatmulDesc_t computeDesc, const void* alpha, const void* A, cublasLtMatrixLayout_t Adesc, const void* B, cublasLtMatrixLayout_t Bdesc, const void* beta, const void* C, cublasLtMatrixLayout_t Cdesc, void* D, cublasLtMatrixLayout_t Ddesc, @@ -83,9 +94,17 @@ class cublasMMWrapper void Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, const void* A, const int lda, const void* B, const int ldb, void* C, const int ldc); + void Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, const void* A, + const int lda, const void* B, const int ldb, void* C, const int ldc, + const std::optional& algo); + void Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, const void* A, const int lda, const void* B, const int ldb, void* C, const int ldc, float f_alpha, float f_beta); + void Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, const void* A, + const int lda, const void* B, const int ldb, void* C, const int ldc, float f_alpha, float f_beta, + const cublasLtMatmulAlgo_t& algo, bool hasAlgo); + void setWorkspace(void* workspace); void Int8Gemm(const int m, const int n, const int k, const int8_t* A, const int lda, const int8_t* B, const int ldb, @@ -99,6 +118,10 @@ class cublasMMWrapper #ifdef ENABLE_BF16 void setBF16GemmConfig(); #endif +#ifdef ENABLE_FP8 + void setFP8GemmConfig(cudaDataType_t outputType = CUDA_R_16F); +#endif + void setStream(cudaStream_t stream); void setGemmConfig(cudaDataType_t aType, cudaDataType_t bType, cudaDataType_t cType, cudaDataType_t computeType); @@ -131,7 +154,7 @@ class cublasMMWrapper return *(this->cublas_handle_); } - cublasLtHandle_t getCublasLtHandle() + 
cublasLtHandle_t getCublasLtHandle() const { return *(this->cublaslt_handle_); } diff --git a/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh b/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh index 47827e42a6b..13a6165dc4b 100644 --- a/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh +++ b/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh @@ -18,6 +18,7 @@ #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include +#include namespace tensorrt_llm { @@ -198,6 +199,7 @@ inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x) } #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) +#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020) inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hmul2(x, y); @@ -215,7 +217,7 @@ inline __device__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __n t.y = y; return t; } - +#endif #endif inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp b/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp index 43199d7194f..bbee01c3679 100644 --- a/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp +++ b/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp @@ -16,10 +16,17 @@ #define CUDA_LIB_NAME "cuda" +#if defined(_WIN32) +#include +#define dllOpen(name) LoadLibrary("nv" name ".dll") +#define dllClose(handle) FreeLibrary(static_cast(handle)) +#define dllGetSym(handle, name) static_cast(GetProcAddress(static_cast(handle), name)) +#else // For non-Windows platforms #include #define dllOpen(name) dlopen("lib" name ".so.1", RTLD_LAZY) #define dllClose(handle) dlclose(handle) #define dllGetSym(handle, name) dlsym(handle, name) +#endif // defined(_WIN32) #include "cudaDriverWrapper.h" #include "tensorrt_llm/common/assert.h" diff --git a/cpp/tensorrt_llm/common/cudaTypeUtils.cuh b/cpp/tensorrt_llm/common/cudaTypeUtils.cuh index 027a8fb6909..64f26b430f9 100644 --- a/cpp/tensorrt_llm/common/cudaTypeUtils.cuh +++ b/cpp/tensorrt_llm/common/cudaTypeUtils.cuh @@ -19,8 +19,12 @@ #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" +#include #include #include +#if ENABLE_BF16 +#include +#endif namespace tensorrt_llm { @@ -508,7 +512,11 @@ __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, half2>(half2 val) #endif // ENABLE BF16 template -__device__ inline T cuda_abs(T val); +__device__ inline T cuda_abs(T val) +{ + assert(false); + return {}; +} template <> __device__ inline float cuda_abs(float val) @@ -548,18 +556,6 @@ __device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) { return __habs2(val); } -#else -template <> -__device__ inline __nv_bfloat16 cuda_abs(__nv_bfloat16 val) -{ - return fabs(val); -} - -template <> -__device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) -{ - return make_bfloat162(fabs(val.x), fabs(val.y)); -} #endif #endif // ENABLE_FP16 diff --git a/cpp/tensorrt_llm/common/cudaUtils.h b/cpp/tensorrt_llm/common/cudaUtils.h index f457c1aabcd..d43a93030c8 100644 --- a/cpp/tensorrt_llm/common/cudaUtils.h +++ b/cpp/tensorrt_llm/common/cudaUtils.h @@ -20,6 +20,7 @@ #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/tllmException.h" +#include #include #include #include @@ -268,6 +269,15 @@ inline int getDeviceCount() return count; } +/// Get the memory info +/// \return The free and total amount of memory in bytes +inline std::tuple 
getDeviceMemoryInfo() +{ + size_t free, total; + check_cuda_error(cudaMemGetInfo(&free, &total)); + return {free, total}; +} + inline int getMultiProcessorCount() { int device_id; @@ -301,8 +311,15 @@ inline int divUp(int a, int n) return (a + n - 1) / n; } +template ::value>, + typename = std::enable_if_t::value>> +auto constexpr ceilDiv(T numerator, U denominator) +{ + return (numerator + denominator - 1) / denominator; +} + template -void printAbsMean(const T* buf, uint size, cudaStream_t stream, std::string name = "") +void printAbsMean(const T* buf, uint64_t size, cudaStream_t stream, std::string name = "") { if (buf == nullptr) { @@ -319,7 +336,7 @@ void printAbsMean(const T* buf, uint size, cudaStream_t stream, std::string name uint64_t zero_count = 0; float max_val = -1e10; bool find_inf = false; - for (uint i = 0; i < size; i++) + for (uint64_t i = 0; i < size; i++) { if (std::isinf((float) (h_tmp[i]))) { @@ -412,19 +429,24 @@ inline void print_element_(__nv_bfloat16 x) print_float_((float) x); } #endif -inline void print_element_(unsigned long long ull) +inline void print_element_(uint32_t ul) +{ + printf("%7" PRIu32, ul); +} + +inline void print_element_(uint64_t ull) { - printf("%7llu ", ull); + printf("%7" PRIu64, ull); } -inline void print_element_(int i) +inline void print_element_(int32_t il) { - printf("%7d ", i); + printf("%7" PRId32, il); } -inline void print_element_(size_t s) +inline void print_element_(int64_t ill) { - printf("%7ld ", s); + printf("%7" PRId64, ill); } template @@ -478,9 +500,9 @@ template void printMatrix(const half* ptr, int m, int k, int stride, bool is_dev #ifdef ENABLE_BF16 template void printMatrix(const __nv_bfloat16* ptr, int m, int k, int stride, bool is_device_ptr); #endif -template void printMatrix(const unsigned long long* ptr, int m, int k, int stride, bool is_device_ptr); +template void printMatrix(const uint32_t* ptr, int m, int k, int stride, bool is_device_ptr); +template void printMatrix(const uint64_t* ptr, int m, int k, int stride, bool is_device_ptr); template void printMatrix(const int* ptr, int m, int k, int stride, bool is_device_ptr); -template void printMatrix(const size_t* ptr, int m, int k, int stride, bool is_device_ptr); } // namespace tensorrt_llm::common diff --git a/cpp/tensorrt_llm/common/int8Utils.cuh b/cpp/tensorrt_llm/common/int8Utils.cuh deleted file mode 100644 index 3c2b01de5cd..00000000000 --- a/cpp/tensorrt_llm/common/int8Utils.cuh +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once -#include -#include -#include - -namespace tensorrt_llm -{ -namespace kernels -{ - -static inline __device__ int8_t float_to_int8_rn(float x) -{ - uint32_t dst; - asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x)); - return reinterpret_cast(dst); -} - -static inline __device__ uint32_t float4_to_char4(float x, float y, float z, float w) -{ - uint32_t dst; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 720 - uint32_t a; - asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(a) : "f"(x)); - uint32_t b; - asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(b) : "f"(y)); - uint32_t c; - asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(c) : "f"(z)); - uint32_t d; - asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(d) : "f"(w)); - - asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, 0;\n" : "=r"(dst) : "r"(d), "r"(c)); - asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, %0;\n" : "+r"(dst) : "r"(b), "r"(a)); -#else - char4 tmp; - tmp.x = x; - tmp.y = y; - tmp.z = z; - tmp.w = w; - dst = reinterpret_cast(tmp); -#endif - return dst; -} -} // namespace kernels -} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/common/logger.h b/cpp/tensorrt_llm/common/logger.h index 0f15599d339..9173e1b1ce9 100644 --- a/cpp/tensorrt_llm/common/logger.h +++ b/cpp/tensorrt_llm/common/logger.h @@ -30,6 +30,14 @@ namespace tensorrt_llm::common class Logger { +#if _WIN32 +// On Windows, the file wingdi.h is included which has +// #define ERROR 0 +// This breaks everywhere ERROR is used in the Level enum +// Alternative, untested solution to #undef: compile with NOGDI flag defined +#undef ERROR +#endif // _WIN32 + public: enum Level { diff --git a/cpp/tensorrt_llm/common/memoryUtils.cu b/cpp/tensorrt_llm/common/memoryUtils.cu index 730913153d6..f613414fc87 100644 --- a/cpp/tensorrt_llm/common/memoryUtils.cu +++ b/cpp/tensorrt_llm/common/memoryUtils.cu @@ -224,7 +224,7 @@ template void cudaAutoCpy(int* tgt, const int* src, size_t size, cudaStream_t st template void cudaAutoCpy(bool* tgt, const bool* src, size_t size, cudaStream_t stream); template void cudaAutoCpy(int8_t* tgt, const int8_t* src, size_t size, cudaStream_t stream); template void cudaAutoCpy(uint8_t* tgt, const uint8_t* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(uint* tgt, const uint* src, size_t size, cudaStream_t stream); +template void cudaAutoCpy(uint32_t* tgt, const uint32_t* src, size_t size, cudaStream_t stream); template void cudaAutoCpy(unsigned long long* tgt, const unsigned long long* src, size_t size, cudaStream_t stream); template void cudaAutoCpy(char* tgt, const char* src, size_t size, cudaStream_t stream); @@ -467,8 +467,8 @@ template void invokeCudaD2DcpyConvert(half* tgt, const int* src, const size_t si template void invokeCudaD2DcpyConvert(float* tgt, const float* src, const size_t size, cudaStream_t stream); template void invokeCudaD2DcpyConvert(half* tgt, const float* src, const size_t size, cudaStream_t stream); template void invokeCudaD2DcpyConvert(float* tgt, const half* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(uint* tgt, const int* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, const uint* src, const size_t size, cudaStream_t stream); +template void invokeCudaD2DcpyConvert(uint32_t* tgt, const int* src, const size_t size, cudaStream_t stream); +template void invokeCudaD2DcpyConvert(int* tgt, const uint32_t* src, const size_t size, cudaStream_t stream); template void 
invokeCudaD2DcpyConvert(int* tgt, const float* src, const size_t size, cudaStream_t stream); template void invokeCudaD2DcpyConvert(int* tgt, const half* src, const size_t size, cudaStream_t stream); diff --git a/cpp/tensorrt_llm/common/nvtxUtils.h b/cpp/tensorrt_llm/common/nvtxUtils.h new file mode 100644 index 00000000000..ed2065e551c --- /dev/null +++ b/cpp/tensorrt_llm/common/nvtxUtils.h @@ -0,0 +1,44 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace tensorrt_llm::common::nvtx +{ +inline nvtx3::color nextColor() +{ +#if !defined(NVTX_DISABLE) + constexpr std::array kColors{nvtx3::color{0xff00ff00}, nvtx3::color{0xff0000ff}, nvtx3::color{0xffffff00}, + nvtx3::color{0xffff00ff}, nvtx3::color{0xff00ffff}, nvtx3::color{0xffff0000}, nvtx3::color{0xffffffff}}; + constexpr auto numColors = kColors.size(); + + static thread_local int colorId = 0; + auto const color = kColors[colorId]; + colorId = colorId + 1 >= numColors ? 0 : colorId + 1; + return color; +#else + return nvtx3::color{0}; +#endif +} + +} // namespace tensorrt_llm::common::nvtx + +#define NVTX3_SCOPED_RANGE(range) ::nvtx3::scoped_range range##_range(::tensorrt_llm::common::nvtx::nextColor(), #range) diff --git a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh index 82e4e8c9b00..37501c5f601 100644 --- a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh +++ b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh @@ -72,7 +72,7 @@ __device__ inline void copy(const void* local, void* data) *out = *in; } -static const float HALF_FLT_MAX = 65504.F; +static float constexpr HALF_FLT_MAX = 65504.F; #define FINAL_MASK 0xffffffff template diff --git a/cpp/tensorrt_llm/common/stringUtils.h b/cpp/tensorrt_llm/common/stringUtils.h index 3eac937cfd8..071c2279327 100644 --- a/cpp/tensorrt_llm/common/stringUtils.h +++ b/cpp/tensorrt_llm/common/stringUtils.h @@ -48,6 +48,12 @@ std::string fmtstr(char const* format, ...); std::string fmtstr(char const* format, ...) 
__attribute__((format(printf, 1, 2))); #endif +// __PRETTY_FUNCTION__ is used for neat debugging printing but is not supported on Windows +// The alternative is __FUNCSIG__, which is similar but not identical +#if defined(_WIN32) +#define __PRETTY_FUNCTION__ __FUNCSIG__ +#endif + template inline TStream& arr2outCasted(TStream& out, T* arr, size_t size) { diff --git a/cpp/tensorrt_llm/common/tensor.cpp b/cpp/tensorrt_llm/common/tensor.cpp index 464bc275e8a..059e69e0844 100644 --- a/cpp/tensorrt_llm/common/tensor.cpp +++ b/cpp/tensorrt_llm/common/tensor.cpp @@ -23,7 +23,6 @@ #include "stdlib.h" #include #include -#include #include #include #include @@ -32,6 +31,10 @@ #include #include +#if !defined(_WIN32) +#include +#endif // !defined(_WIN32) + namespace tensorrt_llm { namespace common @@ -152,7 +155,7 @@ Tensor Tensor::loadNpy(const std::string& npy_file, const MemoryType where) parseNpyIntro(f_ptr, header_len, start_data); parseNpyHeader(f_ptr, header_len, type, shape); - const size_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + const size_t size = std::accumulate(shape.begin(), shape.end(), size_t{1}, std::multiplies()); void* data_cpu = malloc(size * Tensor::getTypeSize(type)); void* data = data_cpu; @@ -338,7 +341,7 @@ Tensor Tensor::slice(std::vector shape, size_t offset) const if (this->data != nullptr) { size_t n_elts = this->size(); - size_t n_sliced_elts = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + size_t n_sliced_elts = std::accumulate(shape.begin(), shape.end(), size_t{1}, std::multiplies()); TLLM_CHECK_WITH_INFO(n_sliced_elts + offset <= n_elts, fmtstr("The number (%ld) of elements of sliced tensor exceeds that (%ld) of the original tensor", n_sliced_elts + offset, n_elts)); @@ -418,6 +421,7 @@ std::string TensorMap::toString() TensorMap TensorMap::fromNpyFolder(const std::string& base_folder) { +#if !defined(_WIN32) DIR* dir_p = opendir(base_folder.c_str()); TLLM_CHECK_WITH_INFO(dir_p != nullptr, fmtstr("Could not open folder %s. 
", base_folder.c_str())); struct dirent* dp; @@ -460,10 +464,15 @@ TensorMap TensorMap::fromNpyFolder(const std::string& base_folder) closedir(dir_p); return ret_tensor; +#else + throw std::runtime_error("TensorMap::fromNpyFolder is not implemented on Windows."); + return {}; +#endif // !defined(_WIN32) } void TensorMap::saveNpy(const std::string& base_folder) { +#if !defined(_WIN32) mode_t mode_0755 = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; int ret = mkdir(base_folder.c_str(), mode_0755); TLLM_CHECK_WITH_INFO(ret == 0 || errno == EEXIST, fmtstr("Could not create folder %s.\n", base_folder.c_str())); @@ -472,6 +481,9 @@ void TensorMap::saveNpy(const std::string& base_folder) { item.second.saveNpy(base_folder + "/" + item.second.whereToString() + "-" + item.first + ".npy"); } +#else + throw std::runtime_error("TensorMap::saveNpy is not implemented on Windows."); +#endif // !defined(_WIN32) } } // namespace common diff --git a/cpp/tensorrt_llm/common/tensor.h b/cpp/tensorrt_llm/common/tensor.h index a0513a36761..b1dcbc626f3 100644 --- a/cpp/tensorrt_llm/common/tensor.h +++ b/cpp/tensorrt_llm/common/tensor.h @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -98,11 +97,13 @@ struct TensorDataType static constexpr DataType value = TYPE_UINT64; }; +#if !defined(_WIN32) template <> struct TensorDataType { static constexpr DataType value = TYPE_UINT64; }; +#endif // !defined(_WIN32) static_assert(sizeof(std::uint64_t) == sizeof(unsigned long long), ""); diff --git a/cpp/tensorrt_llm/common/tllmException.cpp b/cpp/tensorrt_llm/common/tllmException.cpp index 52354b6e66b..2d2831c1477 100644 --- a/cpp/tensorrt_llm/common/tllmException.cpp +++ b/cpp/tensorrt_llm/common/tllmException.cpp @@ -44,8 +44,8 @@ TllmException::TllmException(char const* file, std::size_t line, const std::stri } #else TllmException::TllmException(char const* file, std::size_t line, const std::string& msg) - : _mNbFrames{} - , runtime_error{fmtstr("%s (%s:%zu)", msg.c_str(), file, line)} + : mNbFrames{} + , std::runtime_error{fmtstr("%s (%s:%zu)", msg.c_str(), file, line)} { } #endif diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h index dead7975f6f..bdac36fd95d 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h @@ -415,12 +415,12 @@ class MmaTensorOpDequantizer mul_op; - plus plus_op; - ExpandedMmaOperandB* operand_frag_ptr = reinterpret_cast(&operand_frag); if constexpr (hasZero(QuantOp)) { + plus plus_op; + CUTLASS_PRAGMA_UNROLL for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) { diff --git a/cpp/tensorrt_llm/kernels/banBadWords.cu b/cpp/tensorrt_llm/kernels/banBadWords.cu index 82d169d9d51..45cce552dea 100644 --- a/cpp/tensorrt_llm/kernels/banBadWords.cu +++ b/cpp/tensorrt_llm/kernels/banBadWords.cu @@ -98,7 +98,8 @@ void invokeBanBadWords(T* logits, const int** output_ids_ptr, const int** parent int vocab_size_padded, const int* sequence_lengths, int max_seq_len, cudaStream_t stream) { dim3 block, grid; - block.x = min(((bad_words_len + 32 - 1) / 32) * 32, 256UL); + constexpr size_t max_blocks{256}; + block.x = min(((bad_words_len + 32 - 1) / 32) * 32, max_blocks); grid.x = (bad_words_len + block.x - 1) / block.x; 
grid.y = local_batch_size * beam_width; diff --git a/cpp/tensorrt_llm/kernels/banRepeatNgram.cu b/cpp/tensorrt_llm/kernels/banRepeatNgram.cu index 3f232c2120c..5f2c686baae 100644 --- a/cpp/tensorrt_llm/kernels/banRepeatNgram.cu +++ b/cpp/tensorrt_llm/kernels/banRepeatNgram.cu @@ -146,7 +146,8 @@ void invokeBanRepeatNgram(T* logits, const int** output_ids_buf, const bool* fin // step (current generated length, except start token) is from 1 ~ max_seq_len dim3 block, grid; - block.x = min(((step + 32 - 1) / 32) * 32, 256UL); + constexpr size_t max_blocks{256}; + block.x = min(((step + 32 - 1) / 32) * 32, max_blocks); grid.x = (step + block.x - 1) / block.x; grid.y = local_batch_size * beam_width; diff --git a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu index e06cee8c7e9..2415d10e7cf 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu @@ -91,20 +91,18 @@ __global__ void add_bias_temperature(half2* logits, const half2* bias, const int template __global__ void apply_repetition_penalty(T* logits, const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int** output_ids_ptr, const int** parent_ids_ptr, const int* input_lengths, - const int* sequence_lengths, const int max_input_length, const float repetition_penalty, int max_seq_len) + const int* sequence_lengths, const float repetition_penalty, int max_seq_len) { const int tid = threadIdx.x; const int bbid = blockIdx.x; const int batch_id = bbid / beam_width; const int beam_idx{bbid % beam_width}; - const int bbsize = batch_size * beam_width; logits += bbid * vocab_size_padded; extern __shared__ char sbuf[]; T* penalty_logits = reinterpret_cast(sbuf); // prevent misaligment when sizeof(T) = 2 int* penalty_indices = reinterpret_cast(sbuf + (sizeof(T) * max_seq_len + 31) / 32 * 32); - const int input_length = (input_lengths != nullptr) ? input_lengths[bbid] : max_input_length; const int current_step{sequence_lengths[bbid]}; if (tid == 0) { @@ -127,11 +125,6 @@ __global__ void apply_repetition_penalty(T* logits, const int batch_size, const int parent_beam = bbid % beam_width; for (int i = current_step - 2; i >= 0; --i) { - // Skip the padded tokens. - if (i >= input_length && i < max_input_length) - { - continue; - } parent_beam = parent_ids_ptr[batch_id][parent_beam * max_seq_len + i]; prev_id = output_ids_ptr[batch_id][parent_beam * max_seq_len + i]; prev_logit = logits[prev_id]; @@ -150,23 +143,20 @@ __global__ void apply_repetition_penalty(T* logits, const int batch_size, const __syncthreads(); for (int i = tid; i < current_step; i += blockDim.x) { - if (i >= input_length && i < max_input_length) - { - continue; - } logits[penalty_indices[i]] = penalty_logits[i]; } } template __global__ void apply_min_length_penalty(T* logits, const int min_length, const int* end_ids, - const int* sequence_lengths, const int max_input_length, const int beam_width, const int vocab_size_padded) + const int* sequence_lengths, const int* input_lengths, const int beam_width, const int vocab_size_padded) { int bbid = threadIdx.x + blockIdx.x * blockDim.x; // batch-beam index int bid = bbid / beam_width; // batch index - // We need +1 because sequence_lengths = max_input_length + num_gen_tokens - - // 1, which is equal to the length of k/v caches. - if (sequence_lengths[bbid] + 1 - max_input_length < min_length) + auto const input_length{input_lengths == nullptr ? 
0 : input_lengths[bbid]}; + // We need +1 because sequence_lengths = num_gen_tokens + input_length - 1, + // which is equal to the length of k/v caches. + if (sequence_lengths[bbid] + 1 - input_length < min_length) { T mask_val = (std::is_same::value) ? -HALF_FLT_MAX : -FLT_MAX; logits[bbid * vocab_size_padded + end_ids[bid]] = mask_val; @@ -175,11 +165,10 @@ __global__ void apply_min_length_penalty(T* logits, const int min_length, const template void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const int** parent_ids_ptr, - const int* input_lengths, const int* sequence_lengths, const T* bias, const int ite, const int max_input_length, - const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, - const int vocab_size_padded, const int* end_ids, const float temperature, const float repetition_penalty, - const RepetitionPenaltyType repetition_penalty_type, const int min_length, const int max_seq_len, - cudaStream_t stream) + const int* input_lengths, const int* sequence_lengths, const T* bias, const int ite, const int local_batch_size, + const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, + const float temperature, const float repetition_penalty, const RepetitionPenaltyType repetition_penalty_type, + const int min_length, const int max_seq_len, cudaStream_t stream) { if (bias != nullptr || temperature != 1.0f || vocab_size != vocab_size_padded) { @@ -210,14 +199,14 @@ void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const in { apply_repetition_penalty<<>>(logits, batch_size, beam_width, vocab_size, vocab_size_padded, output_ids_ptr, parent_ids_ptr, input_lengths, sequence_lengths, - max_input_length, repetition_penalty, max_seq_len); + repetition_penalty, max_seq_len); sync_check_cuda_error(); } else if (repetition_penalty_type == RepetitionPenaltyType::Additive) { apply_repetition_penalty<<>>(logits, batch_size, beam_width, vocab_size, vocab_size_padded, output_ids_ptr, parent_ids_ptr, input_lengths, sequence_lengths, - max_input_length, repetition_penalty, max_seq_len); + repetition_penalty, max_seq_len); sync_check_cuda_error(); } } @@ -229,21 +218,21 @@ void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const in const int block_size = min(local_batch_size * beam_width, 1024); const int grid_size = (local_batch_size * beam_width + block_size - 1) / block_size; apply_min_length_penalty<<>>( - logits, min_length, end_ids, sequence_lengths, max_input_length, beam_width, vocab_size_padded); + logits, min_length, end_ids, sequence_lengths, input_lengths, beam_width, vocab_size_padded); sync_check_cuda_error(); } template void invokeAddBiasApplyPenalties(float* logits, const int** output_ids_ptr, const int** parent_ids_ptr, - const int* input_lengths, const int* sequence_lengths, const float* bias, const int ite, const int max_input_length, - const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, - const int vocab_size_padded, const int* end_ids, const float temperature, const float repetition_penalty, - const RepetitionPenaltyType repetition_penalty_type, const int min_length, int max_seq_len, cudaStream_t stream); + const int* input_lengths, const int* sequence_lengths, const float* bias, const int ite, const int local_batch_size, + const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, + const float temperature, const float 
repetition_penalty, const RepetitionPenaltyType repetition_penalty_type, + const int min_length, int max_seq_len, cudaStream_t stream); template void invokeAddBiasApplyPenalties(half* logits, const int** output_ids_ptr, const int** parent_ids_ptr, - const int* input_lengths, const int* sequence_lengths, const half* bias, const int ite, const int max_input_length, - const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, - const int vocab_size_padded, const int* end_ids, const float temperature, const float repetition_penalty, - const RepetitionPenaltyType repetition_penalty_type, const int min_length, int max_seq_len, cudaStream_t stream); + const int* input_lengths, const int* sequence_lengths, const half* bias, const int ite, const int local_batch_size, + const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, + const float temperature, const float repetition_penalty, const RepetitionPenaltyType repetition_penalty_type, + const int min_length, int max_seq_len, cudaStream_t stream); } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h index 888a1951873..c8321f6e8d2 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h +++ b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h @@ -26,10 +26,10 @@ namespace kernels template void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const int** parent_ids_ptr, - const int* input_lengths, const int* sequence_lengths, const T* bias, const int ite, const int max_input_length, - const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, - const int vocab_size_padded, const int* end_ids, const float temperature, const float repetition_penalty, - const RepetitionPenaltyType repetition_penalty_type, const int min_length, int max_seq_len, cudaStream_t stream); + const int* input_lengths, const int* sequence_lengths, const T* bias, const int ite, const int local_batch_size, + const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, + const float temperature, const float repetition_penalty, const RepetitionPenaltyType repetition_penalty_type, + const int min_length, int max_seq_len, cudaStream_t stream); } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/beamSearchTopkKernels.cu b/cpp/tensorrt_llm/kernels/beamSearchTopkKernels.cu index 0eeb3166a28..f8ecab4a0a2 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchTopkKernels.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchTopkKernels.cu @@ -691,11 +691,11 @@ template void invokeTopkBeamSearch(void* workspace, size_t& workspace_size, floa template __global__ void tileEncoderResults(T* tiled_output, int* tiled_sequence_length, const T* output, - const int* sequence_length, const uint batch_size, const uint beam_width, const uint d_model) + const int* sequence_length, const uint32_t batch_size, const uint32_t beam_width, const uint32_t d_model) { if (blockIdx.x == 0) { - for (uint i = threadIdx.x; i < batch_size * beam_width; i += blockDim.x) + for (uint32_t i = threadIdx.x; i < batch_size * beam_width; i += blockDim.x) { tiled_sequence_length[i] = sequence_length[i / beam_width]; } @@ -704,7 +704,7 @@ __global__ void tileEncoderResults(T* tiled_output, int* tiled_sequence_length, int tgt_offset = blockIdx.x * gridDim.y * gridDim.z * d_model + 
blockIdx.y * gridDim.z * d_model + blockIdx.z * d_model; int src_offset = blockIdx.x * gridDim.z * d_model + blockIdx.z * d_model; - for (uint i = threadIdx.x; i < d_model; i += blockDim.x) + for (uint32_t i = threadIdx.x; i < d_model; i += blockDim.x) { tiled_output[i + tgt_offset] = output[i + src_offset]; } @@ -785,29 +785,16 @@ __global__ void insertUnfinishedPath(BeamHypotheses beam_hyps, const bool* finis int prev_id = beam_hyps.parent_ids_src[src_beam_idx * max_seq_len + last_token_idx]; for (int token_idx = last_token_idx - 1; token_idx >= 0; token_idx--) { - int src_offset; - // skip the padding between inputs and outputs - if (token_idx > max_seq_len) - { - src_offset = max_seq_len - beam_hyps.input_lengths[src_beam_idx]; - } - else - { - src_offset = 0; - } // output_ids_tgt need to use max_seq_len + 1 because its shape is // [bs, beam_width, max_seq_len + 1] beam_hyps.output_ids_tgt[tgt_beam_idx * max_seq_len + token_idx] - = beam_hyps.output_ids_src[bid * beam_width * max_seq_len + prev_id * max_seq_len + token_idx - + src_offset]; + = beam_hyps.output_ids_src[bid * beam_width * max_seq_len + prev_id * max_seq_len + token_idx]; if (beam_hyps.log_probs != nullptr && beam_hyps.log_probs_src != nullptr) { beam_hyps.log_probs[tgt_beam_idx * max_seq_len + token_idx] - = beam_hyps.log_probs_src[token_idx * batch_size * beam_width + bid * beam_width + prev_id - + src_offset]; + = beam_hyps.log_probs_src[token_idx * batch_size * beam_width + bid * beam_width + prev_id]; } - prev_id = beam_hyps.parent_ids_src[bid * beam_width * max_seq_len + prev_id * max_seq_len + token_idx - + src_offset]; + prev_id = beam_hyps.parent_ids_src[bid * beam_width * max_seq_len + prev_id * max_seq_len + token_idx]; } beam_hyps.sequence_lengths_tgt[tgt_beam_idx] = last_token_idx + 1; diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp index 8af213fcd4f..98caa3d5637 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp @@ -14,6 +14,11 @@ * limitations under the License. 
*/ +#define _USE_MATH_DEFINES +// Include cmath with M_LOG2E defined +#include +#undef _USE_MATH_DEFINES + #include "fmhaRunner.h" #include "fused_multihead_attention_v2.h" diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp index df6d08f64aa..9673bdec21b 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp @@ -17,11 +17,17 @@ #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" +#ifndef _WIN32 #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // #ifndef _WIN32 + #include "cutlass/gemm/gemm.h" #include "cutlass/numeric_types.h" + +#ifndef _WIN32 #pragma GCC diagnostic pop +#endif // #ifndef _WIN32 #include #include diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h index 23ae3b59206..a9b7c16de84 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h @@ -59,16 +59,23 @@ class CutlassFpAIntBGemmRunnerInterface virtual ~CutlassFpAIntBGemmRunnerInterface() {} virtual void gemm(const void* A, const void* B, const void* weight_scales, void* C, int m, int n, int k, - char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) + tkc::CutlassGemmConfig gemmConfig, char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) = 0; virtual void gemm(const void* A, const void* B, const void* weight_scales, const void* weight_zero_points, - const void* biases, void* C, int m, int n, int k, const int group_size, char* workspace_ptr, - const size_t workspace_bytes, cudaStream_t stream) + const void* biases, void* C, int m, int n, int k, const int group_size, tkc::CutlassGemmConfig gemmConfig, + char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) = 0; // Returns desired workspace size in bytes. - virtual int getWorkspaceSize(const int m, const int n, const int k) = 0; + virtual size_t getWorkspaceSize(const int m, const int n, const int k) = 0; + + virtual std::vector getConfigs() const = 0; + +protected: + static constexpr int SPLIT_K_LIMIT = 7; + static constexpr int MIN_M_TILE = 32; + static constexpr int MIN_N_TILE = 128; }; template @@ -79,11 +86,12 @@ class CutlassFpAIntBGemmRunner : public virtual CutlassFpAIntBGemmRunnerInterfac ~CutlassFpAIntBGemmRunner(); void gemm(const void* A, const void* B, const void* weight_scales, void* C, int m, int n, int k, - char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) override; + tkc::CutlassGemmConfig gemmConfig, char* workspace_ptr, const size_t workspace_bytes, + cudaStream_t stream) override; void gemm(const void* A, const void* B, const void* weight_scales, const void* weight_zero_points, - const void* biases, void* C, int m, int n, int k, const int group_size, char* workspace_ptr, - const size_t workspace_bytes, cudaStream_t stream) override; + const void* biases, void* C, int m, int n, int k, const int group_size, tkc::CutlassGemmConfig gemmConfig, + char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) override; // Disabled since the fused GEMM, activation kernels will not be used in v1. 
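Note on usage: with this reworked interface the runner no longer picks a tactic internally; the caller enumerates candidates via getConfigs(), selects one, and passes it to every gemm() call together with a workspace sized by getWorkspaceSize(). A minimal sketch of that calling pattern is below; it assumes the tkc namespace alias from the header above, and pickBestConfig() is a hypothetical caller-side helper (for example, one that benchmarks each candidate), not part of this patch.

#include <vector>

// Illustrative only: tactic selection happens outside the runner now.
template <typename Runner>
void runWeightOnlyGemm(Runner& runner, const void* A, const void* B, const void* scales, void* C,
    int m, int n, int k, char* workspace, cudaStream_t stream)
{
    // Enumerate the candidate tile configurations exposed by the runner.
    std::vector<tkc::CutlassGemmConfig> candidates = runner.getConfigs();
    // Hypothetical helper: choose a config (e.g. by timing each candidate once).
    tkc::CutlassGemmConfig chosen = pickBestConfig(candidates, m, n, k);
    // Workspace bound is independent of the chosen config.
    size_t wsBytes = runner.getWorkspaceSize(m, n, k);
    runner.gemm(A, B, scales, C, m, n, k, chosen, workspace, wsBytes, stream);
}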
@@ -92,7 +100,9 @@ class CutlassFpAIntBGemmRunner : public virtual CutlassFpAIntBGemmRunnerInterfac // stream); // Returns desired workspace size in bytes. - int getWorkspaceSize(const int m, const int n, const int k) override; + size_t getWorkspaceSize(const int m, const int n, const int k) override; + + std::vector getConfigs() const override; private: template @@ -100,14 +110,7 @@ class CutlassFpAIntBGemmRunner : public virtual CutlassFpAIntBGemmRunnerInterfac const T* biases, T* C, int m, int n, int k, const int group_size, tkc::CutlassGemmConfig gemm_config, char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream, int* occupancy = nullptr); - template - void run_gemm(const T* A, const WeightType* B, const T* weight_scales, const T* weight_zero_points, const T* biases, - T* C, int m, int n, int k, const int group_size, char* workspace_ptr, const size_t workspace_bytes, - cudaStream_t stream); - private: - static constexpr int split_k_limit = 7; - int sm_; int multi_processor_count_; }; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h index d61165c99fb..2d60cbc02b4 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h @@ -14,8 +14,10 @@ * limitations under the License. */ +#ifndef _WIN32 #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // #ifndef _WIN32 #include "cutlass/gemm/device/gemm_universal_base.h" #include "cutlass/gemm/kernel/default_gemm.h" @@ -27,7 +29,10 @@ #include "cutlass_extensions/gemm/threadblock/default_mma.h" #include "cutlass_extensions/gemm_configs.h" +#ifndef _WIN32 #pragma GCC diagnostic pop +#endif // #ifndef _WIN32 + #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" @@ -384,31 +389,6 @@ void CutlassFpAIntBGemmRunner::dispatch_to_arch -template -void CutlassFpAIntBGemmRunner::run_gemm(const T* A, const WeightType* B, - const T* weight_scales, const T* weight_zero_points, const T* biases, T* C, int m, int n, int k, - const int group_size, char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) -{ - TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); - static constexpr bool is_weight_only = !std::is_same::value; - std::vector candidate_configs = get_candidate_configs(sm_, is_weight_only, false); - std::vector occupancies(candidate_configs.size()); - - for (size_t ii = 0; ii < candidate_configs.size(); ++ii) - { - dispatch_to_arch(A, B, weight_scales, weight_zero_points, biases, C, m, n, k, group_size, - candidate_configs[ii], workspace_ptr, workspace_bytes, stream, &occupancies[ii]); - } - // Standard GEMM, so 1 "expert". We use the same function for MoE and regular FFN. - static constexpr int num_experts = 1; - tkc::CutlassGemmConfig chosen_config = estimate_best_config_from_occupancies(candidate_configs, occupancies, m, n, - k, num_experts, split_k_limit, workspace_bytes, multi_processor_count_, is_weight_only); - - dispatch_to_arch(A, B, weight_scales, weight_zero_points, biases, C, m, n, k, group_size, - chosen_config, workspace_ptr, workspace_bytes, stream); -} - // Disabled since the fused GEMM, activation kernels will not be used in v1. 
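The recurring #ifndef _WIN32 guards exist because #pragma GCC diagnostic is a GCC/Clang extension; MSVC does not recognize it and would emit unknown-pragma warnings. The generic shape of the guard, shown here in isolation with a placeholder header name, is:

#ifndef _WIN32
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // #ifndef _WIN32

#include "header_that_triggers_strict_aliasing_warnings.h" // placeholder name

#ifndef _WIN32
#pragma GCC diagnostic pop
#endif // #ifndef _WIN32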
// template @@ -447,15 +427,15 @@ void CutlassFpAIntBGemmRunner::run_gemm(con template void CutlassFpAIntBGemmRunner::gemm(const void* A, const void* B, const void* weight_scales, const void* weight_zero_points, const void* biases, void* C, int m, int n, int k, const int group_size, - char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) + tkc::CutlassGemmConfig gemmConfig, char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) { TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); if constexpr ((QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS) || (QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY)) { - run_gemm((const T*) A, (const WeightType*) B, (const T*) weight_scales, - (const T*) weight_zero_points, (const T*) biases, (T*) C, m, n, k, group_size, workspace_ptr, - workspace_bytes, stream); + dispatch_to_arch((const T*) A, (const WeightType*) B, (const T*) weight_scales, + (const T*) weight_zero_points, (const T*) biases, (T*) C, m, n, k, group_size, gemmConfig, workspace_ptr, + workspace_bytes, stream, nullptr); } else { @@ -466,14 +446,15 @@ void CutlassFpAIntBGemmRunner::gemm(const void* A, const template void CutlassFpAIntBGemmRunner::gemm(const void* A, const void* B, const void* weight_scales, - void* C, int m, int n, int k, char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) + void* C, int m, int n, int k, tkc::CutlassGemmConfig gemmConfig, char* workspace_ptr, const size_t workspace_bytes, + cudaStream_t stream) { TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); if constexpr (QuantOp == cutlass::WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY) { - run_gemm((const T*) A, (const WeightType*) B, (const T*) weight_scales, nullptr, nullptr, - (T*) C, m, n, k, k, workspace_ptr, workspace_bytes, stream); + dispatch_to_arch((const T*) A, (const WeightType*) B, (const T*) weight_scales, nullptr, + nullptr, (T*) C, m, n, k, k, gemmConfig, workspace_ptr, workspace_bytes, stream, nullptr); } else { @@ -482,14 +463,22 @@ void CutlassFpAIntBGemmRunner::gemm(const void* A, const } template -int CutlassFpAIntBGemmRunner::getWorkspaceSize(const int m, const int n, const int k) +std::vector CutlassFpAIntBGemmRunner::getConfigs() const +{ + static constexpr bool is_weight_only = !std::is_same::value; + std::vector candidateConfigs = get_candidate_configs(sm_, is_weight_only, false); + return candidateConfigs; +} + +template +size_t CutlassFpAIntBGemmRunner::getWorkspaceSize(const int m, const int n, const int k) { TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); // These are the min tile sizes for each config, which would launch the maximum number of blocks - const int max_grid_m = (m + 31) / 32; - const int max_grid_n = (n + 127) / 128; + const int max_grid_m = cutlass::ceil_div(m, MIN_M_TILE); + const int max_grid_n = cutlass::ceil_div(n, MIN_N_TILE); // We need 4 bytes per block in the worst case. We launch split_k_limit in z dim. - return max_grid_m * max_grid_n * split_k_limit * 4; + return static_cast(max_grid_m * max_grid_n * SPLIT_K_LIMIT * 4); } } // namespace cutlass_kernels diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h index b1120bc6385..256cd91b544 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h @@ -40,8 +40,6 @@ namespace cutlass_kernels Weights are assumed to be column-major. 
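As a quick illustration of the workspace bound computed above (one 4-byte slot per block in the worst case, with the grid sized by the minimum tile sizes and split-k launched in the z dimension), here is a standalone sketch using the fpA_intB constants MIN_M_TILE = 32, MIN_N_TILE = 128, SPLIT_K_LIMIT = 7; the function name is illustrative.

#include <cstddef>
#include <cstdio>

// Same arithmetic as getWorkspaceSize(): maximum grid extent times split-k slices times 4 bytes.
static size_t worstCaseWorkspaceBytes(int m, int n)
{
    constexpr int kMinMTile = 32;
    constexpr int kMinNTile = 128;
    constexpr int kSplitKLimit = 7;
    const int maxGridM = (m + kMinMTile - 1) / kMinMTile; // equivalent to cutlass::ceil_div
    const int maxGridN = (n + kMinNTile - 1) / kMinNTile;
    return static_cast<size_t>(maxGridM) * maxGridN * kSplitKLimit * 4;
}

int main()
{
    // For m = n = 4096 this yields 128 * 32 * 7 * 4 = 114688 bytes (about 112 KiB).
    std::printf("%zu\n", worstCaseWorkspaceBytes(4096, 4096));
    return 0;
}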
*/ -using perfMapType = std::unordered_map; - class CutlassInt8GemmRunnerInterface { public: @@ -50,52 +48,19 @@ class CutlassInt8GemmRunnerInterface virtual ~CutlassInt8GemmRunnerInterface() {} virtual void gemm(const int8_t* A, const int8_t* B, tk::QuantMode quantOption, const float* alphaCol, - const float* alphaRow, void* C, int m, int n, int k, char* workspacePtr, const size_t workspaceBytes, - cudaStream_t stream) - = 0; - - virtual void profileGemms(tk::QuantMode quantOption, int minM, int maxM, int n, int k, int8_t* A, int8_t* B, - void* C, float* alphaCol, float* alphaRow, char* workspace) + const float* alphaRow, void* C, int m, int n, int k, tkc::CutlassGemmConfig gemmConfig, char* workspacePtr, + const size_t workspaceBytes, cudaStream_t stream) = 0; // Returns desired workspace size in bytes. - virtual int getWorkspaceSize(const int m, const int n, const int k) = 0; - - // Returns True if tactics has already been selected - bool hasSelectedTactics() const - { - return mTacticsMap.size() > 0; - } - - void setSelectedTactics(const perfMapType& tacticsMap) - { - mTacticsMap = tacticsMap; - } - - const perfMapType& getSelectedTactics() const - { - return mTacticsMap; - } - - void setMaxM(int maxM) - { - mMaxM = maxM; - } - - int getMaxM() const - { - return mMaxM; - } + virtual size_t getWorkspaceSize(const int m, const int n, const int k) = 0; + + virtual std::vector getConfigs() const = 0; protected: static constexpr int SPLIT_K_LIMIT = 7; - static constexpr int MAX_STEP_M = 32768; static constexpr int MIN_M_TILE = 32; static constexpr int MIN_N_TILE = 64; - - int mMaxM; - - perfMapType mTacticsMap; }; template @@ -106,25 +71,19 @@ class CutlassInt8GemmRunner : public virtual CutlassInt8GemmRunnerInterface ~CutlassInt8GemmRunner(); void gemm(const int8_t* A, const int8_t* B, tk::QuantMode quantOption, const float* alphaCol, const float* alphaRow, - void* C, int m, int n, int k, char* workspacePtr, const size_t workspaceBytes, cudaStream_t stream) override; - - void profileGemms(tk::QuantMode quantOption, int minM, int maxM, int n, int k, int8_t* A, int8_t* B, void* C, - float* alphaCol, float* alphaRow, char* workspace) override; + void* C, int m, int n, int k, tkc::CutlassGemmConfig gemmConfig, char* workspacePtr, + const size_t workspaceBytes, cudaStream_t stream) override; // Returns desired workspace size in bytes. 
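With the built-in profiling removed from the int8 runner as well, timing a candidate configuration becomes the caller's job. A sketch of how one candidate could be timed with CUDA events is below; it mirrors the warmup-plus-timed-runs idea, assumes the tk/tkc namespace aliases from the headers above, leaves device buffers and error handling to the caller, and is illustrative rather than the actual profiler used by the plugins.

// Sketch: average runtime in milliseconds of one gemm config on a given stream.
template <typename Runner>
float timeConfigMs(Runner& runner, const tkc::CutlassGemmConfig& config, const int8_t* A, const int8_t* B,
    tk::QuantMode quantOption, const float* alphaCol, const float* alphaRow, void* C, int m, int n, int k,
    char* workspace, size_t workspaceBytes, cudaStream_t stream, int warmup = 3, int runs = 10)
{
    // Warm up to exclude one-time costs from the measurement.
    for (int i = 0; i < warmup; ++i)
        runner.gemm(A, B, quantOption, alphaCol, alphaRow, C, m, n, k, config, workspace, workspaceBytes, stream);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, stream);
    for (int i = 0; i < runs; ++i)
        runner.gemm(A, B, quantOption, alphaCol, alphaRow, C, m, n, k, config, workspace, workspaceBytes, stream);
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);

    float elapsedMs = 0.0f;
    cudaEventElapsedTime(&elapsedMs, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return elapsedMs / runs;
}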
- int getWorkspaceSize(const int m, const int n, const int k) override; + size_t getWorkspaceSize(const int m, const int n, const int k) override; + + std::vector getConfigs() const override; private: void dispatchToArch(const int8_t* A, const int8_t* B, tk::QuantMode quantOption, const float* alphaCol, const float* alphaRow, T* C, int m, int n, int k, tkc::CutlassGemmConfig gemmConfig, char* workspacePtr, const size_t workspaceBytes, cudaStream_t stream, int* occupancy = nullptr); - tkc::CutlassGemmConfig profileGemm(tk::QuantMode quant_option, int m, int n, int k, int8_t* A, int8_t* B, void* C, - float* alphaCol, float* alphaRow, char* workspace); - - float profileConfig(const tkc::CutlassGemmConfig& config, tk::QuantMode quantOption, int m, int n, int k, int8_t* A, - int8_t* B, void* C, float* alphaCol, float* alphaRow, char* workspace); - int mSm; int mMultiProcessorCount; }; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h index a92a3c3419d..4137c9a9faa 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h @@ -14,8 +14,10 @@ * limitations under the License. */ +#ifndef _WIN32 #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // #ifndef _WIN32 // clang-format off #include @@ -33,7 +35,9 @@ #include "cutlass_extensions/gemm/kernel/default_int8_traits.h" #include "cutlass_extensions/gemm/kernel/gemm_with_epilogue_visitor.h" +#ifndef _WIN32 #pragma GCC diagnostic pop +#endif // #ifndef _WIN32 #include "tensorrt_llm/common/allocator.h" #include "tensorrt_llm/common/cudaUtils.h" @@ -60,7 +64,6 @@ void genericInt8GemmKernelLauncher(const int8_t* A, const int8_t* B, tk::QuantMo size_t workspaceBytes, cudaStream_t stream, int* occupancy = nullptr) { TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); - // #ifdef BUILD_CUTLASS_MIXED_GEMM using ElementInput = int8_t; @@ -165,11 +168,6 @@ void genericInt8GemmKernelLauncher(const int8_t* A, const int8_t* B, tk::QuantMo = "Failed to run cutlass int8 gemm. Error: " + std::string(cutlassGetStatusString(runStatus)); throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg); } - // #else - // throw std::runtime_error( - // "[TensorRT-LLM Error][int8gemm] TensorRT-LLM was built was mixed gemm support off. 
Please rebuild with - // cmake option -DBUILD_CUTLASS_MIXED_GEMM=ON"); - // #endif } template @@ -355,136 +353,33 @@ void CutlassInt8GemmRunner::dispatchToArch(const int8_t* A, const int8_t* B, template void CutlassInt8GemmRunner::gemm(const int8_t* A, const int8_t* B, tk::QuantMode quantOption, const float* alphaCol, - const float* alphaRow, void* C, int m, int n, int k, char* workspacePtr, const size_t workspaceBytes, - cudaStream_t stream) + const float* alphaRow, void* C, int m, int n, int k, tkc::CutlassGemmConfig gemmConfig, char* workspacePtr, + const size_t workspaceBytes, cudaStream_t stream) { TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); - int mRounded = cutlass::round_up(m, MAX_STEP_M); - if (m < MAX_STEP_M) - { - mRounded = mmha::next_power_of_two(m); - } - mRounded = std::min(mMaxM, mRounded); - dispatchToArch(A, B, quantOption, alphaCol, alphaRow, reinterpret_cast(C), m, n, k, mTacticsMap[mRounded], - workspacePtr, workspaceBytes, stream); -} - -template -float CutlassInt8GemmRunner::profileConfig(const tkc::CutlassGemmConfig& config, tk::QuantMode quantOption, int m, - int n, int k, int8_t* A, int8_t* B, void* C, float* alphaCol, float* alphaRow, char* workspace) -{ - constexpr int warmup = 3; - constexpr int runs = 10; - - const auto workspaceBytes = getWorkspaceSize(m, n, k); - - cudaStream_t stream = cudaStreamDefault; - for (int i = 0; i < warmup; ++i) - { - dispatchToArch(A, B, quantOption, alphaCol, alphaRow, reinterpret_cast(C), m, n, k, config, workspace, - workspaceBytes, stream); - } - - cudaEvent_t start; - cudaEvent_t stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaDeviceSynchronize(); - cudaEventRecord(start, 0); - std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); - - for (int i = 0; i < runs; ++i) - { - dispatchToArch(A, B, quantOption, alphaCol, alphaRow, reinterpret_cast(C), m, n, k, config, workspace, - workspaceBytes, stream); - } - - cudaEventRecord(stop, 0); - - cudaEventSynchronize(stop); - - float elapsed; - cudaEventElapsedTime(&elapsed, start, stop); - - cudaEventDestroy(start); - cudaEventDestroy(stop); - - return elapsed / runs; + dispatchToArch(A, B, quantOption, alphaCol, alphaRow, reinterpret_cast(C), m, n, k, gemmConfig, workspacePtr, + workspaceBytes, stream); } template -tkc::CutlassGemmConfig CutlassInt8GemmRunner::profileGemm(tk::QuantMode quantOption, int m, int n, int k, int8_t* A, - int8_t* B, void* C, float* alphaCol, float* alphaRow, char* workspace) +std::vector CutlassInt8GemmRunner::getConfigs() const { - TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); static constexpr bool isWeightOnly = false; std::vector candidateConfigs = get_candidate_configs(mSm, isWeightOnly, mSm <= 70, /* SIMT configs */ true); /* INT8 configs */ - - float bestTime = std::numeric_limits::max(); - tkc::CutlassGemmConfig bestConfig; - bool foundOne = false; - - for (int ii = 0; ii < candidateConfigs.size(); ++ii) - { - tkc::CutlassGemmConfig candidateConfig = candidateConfigs[ii]; - float time = std::numeric_limits::max(); - try - { - time = profileConfig(candidateConfig, quantOption, m, n, k, A, B, C, alphaCol, alphaRow, workspace); - foundOne = true; - } - catch (...) - { - std::ostringstream msg; - msg << "Cannot profile configuration " << ii << " (for" - << " m=" << m << ", n=" << n << ", k=" << k << "). Skipped"; - TLLM_LOG_DEBUG(msg.str()); - } - - if (time < bestTime) - { - bestConfig = candidateConfig; - bestTime = time; - } - } - - if (!foundOne) - { - TLLM_LOG_ERROR("Have not found any valid GEMM config. 
Abort."); - } - - return bestConfig; -} - -template -void CutlassInt8GemmRunner::profileGemms(tk::QuantMode quantOption, int minM, int maxM, int n, int k, int8_t* A, - int8_t* B, void* C, float* alphaCol, float* alphaRow, char* workspace) -{ - TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); - - const int startMinMRounded = mmha::next_power_of_two(minM); - for (int m = startMinMRounded; m < maxM;) - { - mTacticsMap[m] = profileGemm(quantOption, m, n, k, A, B, C, alphaCol, alphaRow, workspace); - // Profile different Ms increasing it in powers of 2 up to MAX_STEP_M - // From there step linearly with MAX_STEP_M step - m += min(m, MAX_STEP_M); - } - // Profile the largest possible M - mTacticsMap[maxM] = profileGemm(quantOption, maxM, n, k, A, B, C, alphaCol, alphaRow, workspace); + return candidateConfigs; } template -int CutlassInt8GemmRunner::getWorkspaceSize(const int m, const int n, const int k) +size_t CutlassInt8GemmRunner::getWorkspaceSize(const int m, const int n, const int k) { TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); // These are the min tile sizes for each config, which would launch the maximum number of blocks const int maxGridM = cutlass::ceil_div(m, MIN_M_TILE); const int maxGridN = cutlass::ceil_div(m, MIN_N_TILE); // We need 4 bytes per block in the worst case. We launch SPLIT_K_LIMIT in z dim. - return maxGridM * maxGridN * SPLIT_K_LIMIT * 4; + return static_cast(maxGridM * maxGridN * SPLIT_K_LIMIT * 4); } } // namespace cutlass_kernels diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h index dcf97b7336c..de44584aef4 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h @@ -115,6 +115,8 @@ struct Multihead_attention_params_base PositionEmbeddingType position_embedding_type = PositionEmbeddingType::kLEARNED_ABSOLUTE; // The per-head latent space reserved for rotary embeddings. int rotary_embedding_dim = 0; + float rotary_embedding_base = 0.0f; + float rotary_embedding_scale = 0.0f; // The current timestep. TODO(bhsueh) Check that do we only this param in cross attention? int timestep = 0; // The current timestep of each sentences (support different timestep for different sentences) diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h index 90dacc3ea70..70bb2b395b8 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h @@ -117,10 +117,10 @@ inline size_t multi_block_grid_setup( #define MMHA_LAUNCH_CHECK(DYNAMIC_THDS_PER_BLOCK) \ std::size_t const dynamic_smem_sz{ \ mmha::smem_size_in_bytes(params, DYNAMIC_THDS_PER_BLOCK)}; \ - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&available_blocks, \ + TLLM_CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&available_blocks, \ mmha::masked_multihead_attention_kernel, \ - DYNAMIC_THDS_PER_BLOCK, dynamic_smem_sz); + DYNAMIC_THDS_PER_BLOCK, dynamic_smem_sz)); #define MMHA_KERNEL(DYNAMIC_THDS_PER_BLOCK) \ std::size_t const dynamic_smem_sz{ \ @@ -191,10 +191,10 @@ void mmha_launch_kernel_ex( // Tune block size based on batchxhead to increase occupancy. 
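The new rotary_embedding_base and rotary_embedding_scale parameters feed the coefficient computation shown further below: the angle becomes (t_step * scale) / base^(zid / rot_embed_dim), so base 10000 and scale 1.0 reproduce the previous fixed-base formula. A small host-side sketch of that per-channel coefficient and its rotation; the helper names are illustrative, not part of the patch.

#include <cmath>
#include <utility>

// Per-channel rotary coefficient: angle = (t_step * scale) / base^(zid / rot_embed_dim).
static std::pair<float, float> rotaryCoefficient(int zid, int rotEmbedDim, float base, float scale, float tStep)
{
    const float angle = (tStep * scale) / std::pow(base, zid / static_cast<float>(rotEmbedDim));
    return {std::cos(angle), std::sin(angle)}; // (cos, sin) applied as a 2x2 rotation
}

// Rotating one (x, y) channel pair, in the spirit of rotary_embedding_transform.
static std::pair<float, float> rotatePair(float x, float y, std::pair<float, float> coef)
{
    return {x * coef.first - y * coef.second, x * coef.second + y * coef.first};
}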
int num_blocks_per_sm = -1; std::size_t const smem_sz{mmha::smem_size_in_bytes(params, THDS_PER_BLOCK)}; - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm, + TLLM_CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm, mmha::masked_multihead_attention_kernel, - THDS_PER_BLOCK, smem_sz); + THDS_PER_BLOCK, smem_sz)); TLLM_CHECK_WITH_INFO( num_blocks_per_sm >= 1, "Sequence Length is too long for the MMHA kernel (not enough shared memory)."); diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h index df3059dcab5..42fe37d34cc 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h @@ -1246,7 +1246,8 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params case PositionEmbeddingType::kALIBI: break; case PositionEmbeddingType::kROPE_GPTJ: { - apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, tlength); + apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, params.rotary_embedding_base, + params.rotary_embedding_scale, tlength); break; } case PositionEmbeddingType::kROPE_GPT_NEOX: @@ -1278,7 +1279,8 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params mmha::vec_from_smem_transpose(q, q_smem_, transpose_idx, smem_pitch); mmha::vec_from_smem_transpose(k, k_smem, transpose_idx, smem_pitch); - mmha::apply_rotary_embedding(q, k, transpose_idx / tidx_factor, params.rotary_embedding_dim, tlength); + mmha::apply_rotary_embedding(q, k, transpose_idx / tidx_factor, params.rotary_embedding_dim, + params.rotary_embedding_base, params.rotary_embedding_scale, tlength); mmha::write_smem_transpose(k, k_smem, transpose_idx, smem_pitch); mmha::write_smem_transpose(q, q_smem_, transpose_idx, smem_pitch); diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h index 7253756c4a9..a514e759992 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h @@ -1508,9 +1508,10 @@ inline __device__ void zero(T& dst) //////////////////////////////////////////////////////////////////////////////////////////////////// -inline __device__ float2 rotary_embedding_coefficient(const int zid, const int rot_embed_dim, const float t_step) +inline __device__ float2 rotary_embedding_coefficient( + const int zid, const int rot_embed_dim, const float base, const float scale, const float t_step) { - const float inv_freq = t_step / pow(10000.0f, zid / (float) rot_embed_dim); + const float inv_freq = (t_step * scale) / pow(base, zid / (float) rot_embed_dim); return {cos(inv_freq), sin(inv_freq)}; } @@ -1538,38 +1539,42 @@ inline __device__ __nv_bfloat162 rotary_embedding_transform(const __nv_bfloat162 } #endif -inline __device__ void apply_rotary_embedding(float& q, int zid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding(float& q, int zid, int rot_embed_dim, float base, float scale, int t_step) { return; } -inline __device__ void apply_rotary_embedding(float& q, float& k, int zid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + float& q, float& k, int zid, int rot_embed_dim, float 
base, float scale, int t_step) { return; } -inline __device__ void apply_rotary_embedding(float2& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + float2& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (2 * tid >= rot_embed_dim) { return; } - const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step); + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); q = rotary_embedding_transform(q, coef); } -inline __device__ void apply_rotary_embedding(float2& q, float2& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + float2& q, float2& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (2 * tid >= rot_embed_dim) { return; } - const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step); + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); q = rotary_embedding_transform(q, coef); k = rotary_embedding_transform(k, coef); } -inline __device__ void apply_rotary_embedding(float4& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + float4& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (4 * tid >= rot_embed_dim) { @@ -1577,13 +1582,14 @@ inline __device__ void apply_rotary_embedding(float4& q, int tid, int rot_embed_ } Float4_& q_ = *reinterpret_cast(&q); - const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); q_.x = rotary_embedding_transform(q_.x, coef0); - const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q_.y = rotary_embedding_transform(q_.y, coef1); } -inline __device__ void apply_rotary_embedding(float4& q, float4& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + float4& q, float4& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (4 * tid >= rot_embed_dim) { @@ -1592,189 +1598,199 @@ inline __device__ void apply_rotary_embedding(float4& q, float4& k, int tid, int Float4_& q_ = *reinterpret_cast(&q); Float4_& k_ = *reinterpret_cast(&k); - const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); q_.x = rotary_embedding_transform(q_.x, coef0); k_.x = rotary_embedding_transform(k_.x, coef0); - const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q_.y = rotary_embedding_transform(q_.y, coef1); k_.y = rotary_embedding_transform(k_.y, coef1); } -inline __device__ void apply_rotary_embedding(uint32_t& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + uint32_t& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (2 * tid >= rot_embed_dim) { return; } - const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step); + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); q = rotary_embedding_transform(q, coef); } -inline __device__ void apply_rotary_embedding(uint32_t& q, uint32_t& k, int tid, int 
rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + uint32_t& q, uint32_t& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (2 * tid >= rot_embed_dim) { return; } - const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step); + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); q = rotary_embedding_transform(q, coef); k = rotary_embedding_transform(k, coef); } -inline __device__ void apply_rotary_embedding(half2& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding(half2& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { - return apply_rotary_embedding(*reinterpret_cast(&q), tid, rot_embed_dim, t_step); + return apply_rotary_embedding(*reinterpret_cast(&q), tid, rot_embed_dim, base, scale, t_step); } -inline __device__ void apply_rotary_embedding(half2& q, half2& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + half2& q, half2& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { return apply_rotary_embedding( - *reinterpret_cast(&q), *reinterpret_cast(&k), tid, rot_embed_dim, t_step); + *reinterpret_cast(&q), *reinterpret_cast(&k), tid, rot_embed_dim, base, scale, t_step); } -inline __device__ void apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (4 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); - const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); } -inline __device__ void apply_rotary_embedding(uint2& q, uint2& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + uint2& q, uint2& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (4 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); k.x = rotary_embedding_transform(k.x, coef0); - const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); k.y = rotary_embedding_transform(k.y, coef1); } -inline __device__ void apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (8 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); - const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, 
t_step); q.y = rotary_embedding_transform(q.y, coef1); - const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step); q.z = rotary_embedding_transform(q.z, coef2); - const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step); q.w = rotary_embedding_transform(q.w, coef3); } -inline __device__ void apply_rotary_embedding(uint4& q, uint4& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + uint4& q, uint4& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (8 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); k.x = rotary_embedding_transform(k.x, coef0); - const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); k.y = rotary_embedding_transform(k.y, coef1); - const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step); q.z = rotary_embedding_transform(q.z, coef2); k.z = rotary_embedding_transform(k.z, coef2); - const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step); q.w = rotary_embedding_transform(q.w, coef3); k.w = rotary_embedding_transform(k.w, coef3); } #ifdef ENABLE_BF16 -inline __device__ void apply_rotary_embedding(__nv_bfloat162& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + __nv_bfloat162& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (2 * tid >= rot_embed_dim) { return; } - const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step); + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); q = rotary_embedding_transform(q, coef); } inline __device__ void apply_rotary_embedding( - __nv_bfloat162& q, __nv_bfloat162& k, int tid, int rot_embed_dim, int t_step) + __nv_bfloat162& q, __nv_bfloat162& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (2 * tid >= rot_embed_dim) { return; } - const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step); + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); q = rotary_embedding_transform(q, coef); k = rotary_embedding_transform(k, coef); } -inline __device__ void apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + bf16_4_t& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (4 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); - const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, 
t_step); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); } -inline __device__ void apply_rotary_embedding(bf16_4_t& q, bf16_4_t& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + bf16_4_t& q, bf16_4_t& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (4 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); k.x = rotary_embedding_transform(k.x, coef0); - const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); k.y = rotary_embedding_transform(k.y, coef1); } -inline __device__ void apply_rotary_embedding(bf16_8_t& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + bf16_8_t& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (8 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); - const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); - const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step); q.z = rotary_embedding_transform(q.z, coef2); - const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step); q.w = rotary_embedding_transform(q.w, coef3); } -inline __device__ void apply_rotary_embedding(bf16_8_t& q, bf16_8_t& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + bf16_8_t& q, bf16_8_t& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (8 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); k.x = rotary_embedding_transform(k.x, coef0); - const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); k.y = rotary_embedding_transform(k.y, coef1); - const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step); q.z = rotary_embedding_transform(q.z, coef2); k.z = rotary_embedding_transform(k.z, coef2); - const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step); q.w = 
rotary_embedding_transform(q.w, coef3); k.w = rotary_embedding_transform(k.w, coef3); } @@ -2067,10 +2083,10 @@ inline __device__ void convert_from_fp8(bf16_4_t* v, const fp8_4_t u) inline __device__ void convert_from_fp8(bf16_8_t* v, const fp8_8_t u) { __nv_bfloat162* v2 = reinterpret_cast<__nv_bfloat162*>(v); - convert_from_fp8(v2, u.x); + convert_from_fp8(v2 + 0, u.x); convert_from_fp8(v2 + 1, u.y); - convert_from_fp8(v2 + 1, u.z); - convert_from_fp8(v2 + 2, u.w); + convert_from_fp8(v2 + 2, u.z); + convert_from_fp8(v2 + 3, u.w); } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.cu b/cpp/tensorrt_llm/kernels/decodingKernels.cu index 2abbbf0c48b..4d40df54b2d 100644 --- a/cpp/tensorrt_llm/kernels/decodingKernels.cu +++ b/cpp/tensorrt_llm/kernels/decodingKernels.cu @@ -30,8 +30,6 @@ namespace kernels __global__ void gatherTree(gatherTreeParam param) { - const int max_input_length = param.input_lengths == nullptr ? 0 : param.max_input_length; - for (int batchbeam_idx = blockIdx.x * blockDim.x + threadIdx.x; batchbeam_idx < param.batch_size * param.beam_width; batchbeam_idx += gridDim.x * blockDim.x) { @@ -66,43 +64,36 @@ __global__ void gatherTree(gatherTreeParam param) continue; } - const int padding_offset = param.has_padding ? max_input_length - input_len : 0; - const int initial_tgt_ix = batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len - + max_seq_len_b - 1 - padding_offset; + const int initial_tgt_ix + = batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + max_seq_len_b - 1; const int initial_parent_ix = batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + max_seq_len_b - 1; - param.beams[initial_tgt_ix] = __ldg(step_ids + initial_parent_ix); + param.output_ids[initial_tgt_ix] = __ldg(step_ids + initial_parent_ix); int parent = parent_ids == nullptr ? 0 : __ldg(parent_ids + initial_parent_ix) % param.beam_width; bool found_bad = false; for (int level = max_seq_len_b - 2; level >= 0; --level) { - if (param.has_padding && level >= input_len && level < max_input_length) - { - continue; - } - const int tgt_level{level >= max_input_length ? level - padding_offset : level}; - const int level_beam_ix - = batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + tgt_level; + const int level_beam_ix = batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + level; const int level_parent_ix = batch * param.beam_width * param.max_seq_len + parent * param.max_seq_len + level; if (parent < 0 || parent > param.beam_width) { - param.beams[level_beam_ix] = param.end_tokens[batch]; + param.output_ids[level_beam_ix] = param.end_tokens[batch]; parent = -1; found_bad = true; } else { - param.beams[level_beam_ix] = __ldg(step_ids + level_parent_ix); + param.output_ids[level_beam_ix] = __ldg(step_ids + level_parent_ix); parent = parent_ids == nullptr ? 
0 : __ldg(parent_ids + level_parent_ix) % param.beam_width; } } // set the padded part as end_token // input_len - for (int index = max_len - padding_offset; index < param.max_seq_len; ++index) + for (int index = max_len; index < param.max_seq_len; ++index) { - param.beams[batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + index] + param.output_ids[batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + index] = param.end_tokens[batch]; } @@ -113,40 +104,21 @@ __global__ void gatherTree(gatherTreeParam param) { bool finished = false; // skip the step 0 because it is often the start token - int start_step = max_input_length == 0 ? 1 : max_input_length; + int start_step = 1; for (int time = start_step; time < max_seq_len_b; ++time) { const int level_beam_ix = batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + time; if (finished) { - param.beams[level_beam_ix] = param.end_tokens[batch]; + param.output_ids[level_beam_ix] = param.end_tokens[batch]; } - else if (param.beams[level_beam_ix] == param.end_tokens[batch]) + else if (param.output_ids[level_beam_ix] == param.end_tokens[batch]) { finished = true; } } } - - // transpose on output_ids - if (param.output_ids != nullptr) - { - for (int step_idx = 0; step_idx < param.max_seq_len; step_idx++) - { - param.output_ids[batchbeam_idx * param.max_seq_len + step_idx] - = param.beams[batchbeam_idx * param.max_seq_len + step_idx]; - } - } - } - - // remove the pad length from sequence lengths - for (int batchbeam_idx = blockIdx.x * blockDim.x + threadIdx.x; batchbeam_idx < param.batch_size * param.beam_width; - batchbeam_idx += gridDim.x * blockDim.x) - { - const int input_len = param.input_lengths == nullptr ? 0 : param.input_lengths[batchbeam_idx]; - const int pad_len = max_input_length - input_len; - param.sequence_lengths[batchbeam_idx] -= pad_len; } } @@ -319,7 +291,7 @@ void invokeGatherTree(gatherTreeParam param) __global__ void finalize(int* output_ids, int* sequence_lengths, float* cum_log_probs, float* output_log_probs, const int* topk_output_ids, const int* topk_sequence_lengths, const float* scores, const float* topk_cum_log_probs, const float* topk_log_probs, const int* num_beams, const int* input_lengths, const int beam_width, - const int max_input_length, const int max_seq_len, bool do_remove_padding) + const int max_seq_len) { // output_ids: [bs, beam_width, max_seq_len] // sequence_lengths: [bs, beam_width] @@ -337,11 +309,9 @@ __global__ void finalize(int* output_ids, int* sequence_lengths, float* cum_log_ // Note that we remove the start_token (the id at first position) from topk_output_ids extern __shared__ char array[]; - int* s_rank = (int*) (array); // [beam_width] - float* s_scores = (float*) (s_rank + beam_width); // [2 * beam_width] - int* s_sequence_lengths = (int*) (s_scores + beam_width * 2); // [beam_width] - const int input_length = input_lengths[blockIdx.x * beam_width]; // input_lengths of same batch must be same - const int pad_len = do_remove_padding ? 
max_input_length - input_length : 0; + int* s_rank = (int*) (array); // [beam_width] + float* s_scores = (float*) (s_rank + beam_width); // [2 * beam_width] + int* s_sequence_lengths = (int*) (s_scores + beam_width * 2); // [beam_width] const int num_beam = num_beams[blockIdx.x]; if (threadIdx.x < num_beam) { @@ -426,8 +396,7 @@ __global__ void finalize(int* output_ids, int* sequence_lengths, float* cum_log_ if (threadIdx.x < beam_width) { - s_sequence_lengths[threadIdx.x] - = topk_sequence_lengths[blockIdx.x * beam_width * 2 + s_rank[threadIdx.x]] - pad_len; + s_sequence_lengths[threadIdx.x] = topk_sequence_lengths[blockIdx.x * beam_width * 2 + s_rank[threadIdx.x]]; sequence_lengths[blockIdx.x * beam_width + threadIdx.x] = s_sequence_lengths[threadIdx.x]; if (cum_log_probs != nullptr) @@ -443,15 +412,12 @@ __global__ void finalize(int* output_ids, int* sequence_lengths, float* cum_log_ // start from step 1 to skip the start token for (int i = threadIdx.x; i < s_sequence_lengths[beam_idx]; i += blockDim.x) { - int src_pad_offset = do_remove_padding ? ((i >= input_length) ? pad_len : 0) : 0; output_ids[blockIdx.x * beam_width * max_seq_len + beam_idx * max_seq_len + i] - = topk_output_ids[blockIdx.x * (beam_width * 2) * max_seq_len + s_rank[beam_idx] * max_seq_len - + (i + src_pad_offset)]; + = topk_output_ids[blockIdx.x * (beam_width * 2) * max_seq_len + s_rank[beam_idx] * max_seq_len + i]; if (output_log_probs != nullptr) { output_log_probs[blockIdx.x * beam_width * max_seq_len + beam_idx * max_seq_len + i] - = topk_log_probs[blockIdx.x * (beam_width * 2) * max_seq_len + s_rank[beam_idx] * max_seq_len - + (i + src_pad_offset)]; + = topk_log_probs[blockIdx.x * (beam_width * 2) * max_seq_len + s_rank[beam_idx] * max_seq_len + i]; } } } @@ -460,8 +426,7 @@ __global__ void finalize(int* output_ids, int* sequence_lengths, float* cum_log_ void invokeFinalize(int* output_ids, int* sequence_lengths, float* cum_log_probs, float* output_log_probs, const int* topk_output_ids, const int* topk_sequence_lengths, const float* scores, const float* topk_cum_log_probs, const float* topk_log_probs, const int* num_beams, const int* input_lengths, const int beam_width, - const int max_seq_len, const int batch_size, const int max_input_length, cudaStream_t stream, - bool do_remove_padding) + const int max_seq_len, const int batch_size, cudaStream_t stream) { TLLM_LOG_DEBUG("%s %s start", __FILE__, __PRETTY_FUNCTION__); dim3 block(beam_width * 2); @@ -469,8 +434,7 @@ void invokeFinalize(int* output_ids, int* sequence_lengths, float* cum_log_probs TLLM_CHECK(block.x < 1024); finalize<<>>(output_ids, sequence_lengths, cum_log_probs, output_log_probs, topk_output_ids, topk_sequence_lengths, scores, - topk_cum_log_probs, topk_log_probs, num_beams, input_lengths, beam_width, max_input_length, max_seq_len, - do_remove_padding); + topk_cum_log_probs, topk_log_probs, num_beams, input_lengths, beam_width, max_seq_len); } __global__ void initializeOutput(int* output_ids, const int* end_ids, const int max_seq_len) diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.h b/cpp/tensorrt_llm/kernels/decodingKernels.h index 9525c90d781..d942025e0aa 100644 --- a/cpp/tensorrt_llm/kernels/decodingKernels.h +++ b/cpp/tensorrt_llm/kernels/decodingKernels.h @@ -41,13 +41,10 @@ struct gatherTreeParam const int* step_ids = nullptr; // [max_seq_len, batch_size, beam_width] const int* parent_ids = nullptr; // [max_seq_len, batch_size, beam_width] const int* end_tokens = nullptr; // [batch_size], end token ids of each query - int 
max_input_length = 0; // max(input_lengths) int* output_ids = nullptr; // the buffer to put finalized ids - // True if we have virtual padding tokens to fill up to max_input_len - bool has_padding = true; cudaStream_t stream; - float* cum_log_probs = nullptr; // [batch_size, beam_width] - float length_penalty = 1.0f; // on cpu + float* cum_log_probs = nullptr; // [batch_size, beam_width] + float length_penalty = 1.0f; // on cpu }; /* @@ -58,8 +55,7 @@ void invokeGatherTree(gatherTreeParam param); void invokeFinalize(int* output_ids, int* sequence_lengths, float* cum_log_probs, float* output_log_probs, const int* topk_output_ids, const int* topk_sequence_lengths, const float* scores, const float* topk_cum_log_probs, const float* topk_log_probs, const int* num_beams, const int* input_lengths, const int beam_width, - const int max_seq_len, const int batch_size, const int max_input_length, cudaStream_t stream, - bool do_remove_padding = true); + const int max_seq_len, const int batch_size, cudaStream_t stream); void invokeInitializeOutput(int* output_ids, const int* end_ids, int batch_beam, int max_seq_len, cudaStream_t stream); diff --git a/cpp/tensorrt_llm/kernels/gptKernels.h b/cpp/tensorrt_llm/kernels/gptKernels.h index 76815203997..4130c357080 100644 --- a/cpp/tensorrt_llm/kernels/gptKernels.h +++ b/cpp/tensorrt_llm/kernels/gptKernels.h @@ -42,6 +42,13 @@ enum class PositionEmbeddingType : int8_t kALIBI = 3 }; +enum class RotaryScalingType : int8_t +{ + kNONE = 0, + kLINEAR = 1, + kDYNAMIC = 2, +}; + template struct BuildDecoderInfoParams { diff --git a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu index 508c695b1dc..6d00a43fa46 100644 --- a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu +++ b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu @@ -30,12 +30,11 @@ void topK_softMax_kernelLauncher(const T* log_probs, const T* bias, const bool* const int temp_storage_size, BeamHypotheses* beam_hyps, const int batch_size, const int beam_width, const int vocab_size, const int* end_ids, T diversity_rate, const float length_penalty, cudaStream_t stream); -#define CASE_K(K, MAX_K) \ - case K ... 
MAX_K: \ - topK_softMax_kernelLauncher(log_probs, bias, finished, sequence_lengths, cum_log_probs, \ - output_log_probs, output_ids_ptr, temp_storage, temp_storage_size, beam_hyps, batch_size, beam_width, \ - vocab_size, end_ids, diversity_rate, length_penalty, stream); \ - break; +#define CASE_K(MAX_K) \ + topK_softMax_kernelLauncher(log_probs, bias, finished, sequence_lengths, cum_log_probs, \ + output_log_probs, output_ids_ptr, temp_storage, temp_storage_size, beam_hyps, batch_size, beam_width, \ + vocab_size, end_ids, diversity_rate, length_penalty, stream); \ + break; template void invokeTopkSoftMax(const T* log_probs, const T* bias, const bool* finished, const int* sequence_lengths, @@ -44,13 +43,25 @@ void invokeTopkSoftMax(const T* log_probs, const T* bias, const bool* finished, const int vocab_size, const int* end_ids, const float diversity_rate, const float length_penalty, cudaStream_t stream) { - switch (beam_width) + int log_beam_width(0); + int recursor(beam_width - 1); + while (recursor >>= 1) + ++log_beam_width; + + switch (log_beam_width) { - CASE_K(1, 4); - CASE_K(5, 8); - CASE_K(9, 16); - CASE_K(17, 32); - CASE_K(33, 64); + // 0 < beam_width <= 4 + case 0: // 1, 2 + case 1: // 3, 4 + CASE_K(4) + case 2: // 4 < beam_width <= 8 + CASE_K(8) + case 3: // 9 < beam_width <= 16 + CASE_K(16) + case 4: // 16 < beam_width <= 32 + CASE_K(32) + case 5: // 32 < beam_width <= 64 + CASE_K(64) default: throw std::runtime_error(fmtstr("Topk kernel of beam search does not support beam_width=%d", beam_width)); } } diff --git a/cpp/tensorrt_llm/kernels/quantization.cu b/cpp/tensorrt_llm/kernels/quantization.cu index 4417f9d8d43..5214fb02c8f 100644 --- a/cpp/tensorrt_llm/kernels/quantization.cu +++ b/cpp/tensorrt_llm/kernels/quantization.cu @@ -17,7 +17,6 @@ #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/int8Utils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/quantization.h" diff --git a/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.cu b/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.cu index 7380ed0d2ec..281cb7417a2 100644 --- a/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.cu @@ -192,22 +192,16 @@ template void invokeBatchApplyTemperaturePenalty(half* logits, const half* bias, template __global__ void applyRepetitionPenalty(T* logits, const float penalty, const int* start_ids, int* output_ids, const int batch_size, const int local_batch_size, const int vocab_size, const int vocab_size_padd, - const int* input_lengths, const int max_input_len, const int step) + const int* input_lengths, const int step) { extern __shared__ float penalty_logits[]; int* penalty_indices = (int*) (penalty_logits + step); logits = logits + blockIdx.x * vocab_size_padd; - const int input_length = input_lengths != nullptr ? input_lengths[blockIdx.x] : max_input_len; + const int input_length = input_lengths != nullptr ? 
input_lengths[blockIdx.x] : 0; for (int index = threadIdx.x; index < step; index += blockDim.x) { - - if (index >= input_length && index < max_input_len) - { - continue; - } - - // output_ids shape: (input_len + output_len, batch_size) + // output_ids shape: (batch_size, input_len + output_len) int penalty_index = output_ids[index * batch_size + blockIdx.x]; if (penalty_index >= vocab_size) { @@ -241,13 +235,7 @@ __global__ void applyRepetitionPenalty(T* logits, const float penalty, const int for (int index = threadIdx.x; index < step; index += blockDim.x) { - - if (index >= input_length && index < max_input_len) - { - continue; - } - - // output_ids shape: (input_len + output_len, batch_size) + // output_ids shape: (batch_size, input_len + output_len) if (penalty_indices[index] >= vocab_size) { continue; @@ -256,54 +244,15 @@ __global__ void applyRepetitionPenalty(T* logits, const float penalty, const int } } -template -void invokeApplyRepetitionPenalty(T* logits, const float penalty, const int* start_ids, int* output_ids, - const int batch_size, const int local_batch_size, const int vocab_size, const int vocab_size_padd, - const int* input_lengths, const int max_input_len, const int step, const RepetitionPenaltyType penalty_type, - cudaStream_t stream) -{ - dim3 block(min(step, 1024)); - dim3 grid(local_batch_size); - size_t smem_size = step * (sizeof(float) + sizeof(int)); - - if (penalty_type == RepetitionPenaltyType::Additive) - { - applyRepetitionPenalty<<>>(logits, penalty, - start_ids, output_ids, batch_size, local_batch_size, vocab_size, vocab_size_padd, input_lengths, - max_input_len, step); - } - else if (penalty_type == RepetitionPenaltyType::Multiplicative) - { - applyRepetitionPenalty<<>>(logits, - penalty, start_ids, output_ids, batch_size, local_batch_size, vocab_size, vocab_size_padd, input_lengths, - max_input_len, step); - } - else if (penalty_type == RepetitionPenaltyType::None) - { - // do nothing - } -} - -template void invokeApplyRepetitionPenalty(float* logits, const float penalty, const int* start_ids, int* output_ids, - const int batch_size, const int local_batch_size, const int vocab_size, const int vocab_size_padd, - const int* input_lengths, const int max_input_len, const int step, const RepetitionPenaltyType penalty_type, - cudaStream_t stream); - -template void invokeApplyRepetitionPenalty(half* logits, const float penalty, const int* start_ids, int* output_ids, - const int batch_size, const int local_batch_size, const int vocab_size, const int vocab_size_padd, - const int* input_lengths, const int max_input_len, const int step, const RepetitionPenaltyType penalty_type, - cudaStream_t stream); - template __global__ void batchApplyRepetitionPenalty(T* logits, const float* penalties, const int** output_ids, const int* sequence_lengths, const int batch_size, const int vocab_size, const int* input_lengths, - const int max_input_length, const int max_seq_len) + const int max_seq_len) { extern __shared__ float penalty_logits[]; int* penalty_indices = (int*) (penalty_logits + max_seq_len); const int batch_idx = blockIdx.x; const float penalty = penalties[batch_idx]; - const int input_length = input_lengths != nullptr ? input_lengths[batch_idx] : max_input_length; const int current_step = sequence_lengths[batch_idx]; logits += batch_idx * vocab_size; @@ -312,12 +261,7 @@ __global__ void batchApplyRepetitionPenalty(T* logits, const float* penalties, c // A vocab id can appear multiple times but should be penalized once. 
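// The loop below is phase 1 of a two-phase scheme: for every step in [0, current_step) it looks up the
// generated token id, records it in penalty_indices and (in the elided part of this loop) the penalized
// logit in penalty_logits, both in shared memory; phase 2 further down then writes those values back into
// `logits`. Because the write-back is keyed by token id, a vocab id that occurs several times (e.g. ids
// [5, 9, 5, 7] after four steps) is still penalized exactly once. The exact penalty arithmetic sits in the
// elided middle of this kernel and depends on the penalty type (Additive vs. Multiplicative).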
for (int index = threadIdx.x; index < current_step; index += blockDim.x) { - // Skip the padding tokens in input sequences. - if (index >= input_length && index < max_input_length) - { - continue; - } - // output_ids shape: (input_len + output_len, batch_size) + // output_ids shape: (batch_size, input_len + output_len) int penalty_index = output_ids[batch_idx][blockIdx.y * max_seq_len + index]; assert(penalty_index < vocab_size); penalty_indices[index] = penalty_index; @@ -349,11 +293,6 @@ __global__ void batchApplyRepetitionPenalty(T* logits, const float* penalties, c // Phase 2. Replace a logit value by the penalized one. for (int index = threadIdx.x; index < current_step; index += blockDim.x) { - // Skip the padding tokens in input sequences. - if (index >= input_length && index < max_input_length) - { - continue; - } logits[penalty_indices[index]] = penalty_logits[index]; } } @@ -361,8 +300,7 @@ __global__ void batchApplyRepetitionPenalty(T* logits, const float* penalties, c template void invokeBatchApplyRepetitionPenalty(T* logits, const float* penalties, const int** output_ids, const int* sequence_lengths, const int batch_size, const int local_batch_size, const int vocab_size, - const int* input_lengths, const int max_input_length, RepetitionPenaltyType penalty_type, int max_seq_len, - cudaStream_t stream) + const int* input_lengths, RepetitionPenaltyType penalty_type, int max_seq_len, cudaStream_t stream) { // Inputs // logits [local_batch_size, vocab_size] : logit values. @@ -370,24 +308,20 @@ void invokeBatchApplyRepetitionPenalty(T* logits, const float* penalties, const // output_ids int**, [bs] array, each array has [1, max_seq_len] // sequence_lengths int*, [bs] // input_lengths [local_batch_size], input lengths - // (optional). - // Padding tokens at [input_length, max_input_length) of input will not - // be penalized. 
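// Call-site sketch for the updated launcher (hypothetical buffer names; every pointer is assumed to be a
// device allocation shaped as described in the comments above). The max_input_length argument is gone
// together with the padding-skip logic:
//
//   invokeBatchApplyRepetitionPenalty(d_logits, d_penalties, d_output_ids_ptrs,
//       d_sequence_lengths, batch_size, local_batch_size, vocab_size, d_input_lengths,
//       RepetitionPenaltyType::Multiplicative, max_seq_len, stream);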
+ TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); dim3 block(min(max_seq_len, 1024)); dim3 grid(batch_size); size_t smem_size = max_seq_len * (sizeof(float) + sizeof(int)); if (penalty_type == RepetitionPenaltyType::Additive) { - batchApplyRepetitionPenalty<<>>(logits, - penalties, output_ids, sequence_lengths, batch_size, vocab_size, input_lengths, max_input_length, - max_seq_len); + batchApplyRepetitionPenalty<<>>( + logits, penalties, output_ids, sequence_lengths, batch_size, vocab_size, input_lengths, max_seq_len); } else if (penalty_type == RepetitionPenaltyType::Multiplicative) { - batchApplyRepetitionPenalty - <<>>(logits, penalties, output_ids, sequence_lengths, batch_size, - vocab_size, input_lengths, max_input_length, max_seq_len); + batchApplyRepetitionPenalty<<>>( + logits, penalties, output_ids, sequence_lengths, batch_size, vocab_size, input_lengths, max_seq_len); } else if (penalty_type == RepetitionPenaltyType::None) { @@ -397,22 +331,21 @@ void invokeBatchApplyRepetitionPenalty(T* logits, const float* penalties, const template void invokeBatchApplyRepetitionPenalty(float* logits, const float* penalties, const int** output_ids, const int* sequence_lengths, const int batch_size, const int local_batch_size, const int vocab_size, - const int* input_lengths, const int max_input_length, RepetitionPenaltyType penalty_type, int max_seq_len, - cudaStream_t stream); + const int* input_lengths, RepetitionPenaltyType penalty_type, int max_seq_len, cudaStream_t stream); template void invokeBatchApplyRepetitionPenalty(half* logits, const float* penalties, const int** output_ids, const int* sequence_lengths, const int batch_size, const int local_batch_size, const int vocab_size, - const int* input_lengths, const int max_input_length, RepetitionPenaltyType penalty_type, int max_seq_len, - cudaStream_t stream); + const int* input_lengths, RepetitionPenaltyType penalty_type, int max_seq_len, cudaStream_t stream); template __global__ void batchApplyMinLengthPenalty(T* logits, const int* min_lengths, const int* end_ids, - const int* sequence_lengths, const int max_input_length, const int vocab_size_padded) + const int* sequence_lengths, const int* input_lengths, const int vocab_size_padded) { int bid = threadIdx.x + blockIdx.x * blockDim.x; // batch index - // We need +1 because sequence_lengths = max_input_length + num_gen_tokens - - // 1, which is equal to the length of k/v caches. - if (sequence_lengths[bid] + 1 - max_input_length < min_lengths[bid]) + auto const input_length{input_lengths == nullptr ? 0 : input_lengths[bid]}; + // We need +1 because sequence_lengths = num_gen_tokens + input_length - 1, which is equal to the length of k/v + // caches. + if (sequence_lengths[bid] + 1 - input_length < min_lengths[bid]) { T mask_val = (std::is_same::value) ? 
-65504.0f : -FLT_MAX; logits[bid * vocab_size_padded + end_ids[bid]] = mask_val; @@ -421,21 +354,21 @@ __global__ void batchApplyMinLengthPenalty(T* logits, const int* min_lengths, co template void invokeMinLengthPenalty(T* logits, const int* min_lengths, const int* end_ids, const int* sequnece_lengths, - const int max_input_length, const int batch_size, const int vocab_size_padded, cudaStream_t stream) + const int* input_lengths, const int batch_size, const int vocab_size_padded, cudaStream_t stream) { const int block_size = min(batch_size, 1024); const int grid_size = (batch_size + block_size - 1) / block_size; batchApplyMinLengthPenalty<<>>( - logits, min_lengths, end_ids, sequnece_lengths, max_input_length, vocab_size_padded); + logits, min_lengths, end_ids, sequnece_lengths, input_lengths, vocab_size_padded); } template void invokeMinLengthPenalty(float* logits, const int* min_lengths, const int* end_ids, - const int* sequnece_lengths, const int max_input_length, const int batch_size, const int vocab_size_padded, + const int* sequnece_lengths, const int* input_lengths, const int batch_size, const int vocab_size_padded, cudaStream_t stream); template void invokeMinLengthPenalty(half* logits, const int* min_lengths, const int* end_ids, - const int* sequnece_lengths, const int max_input_length, const int batch_size, const int vocab_size_padded, + const int* sequnece_lengths, const int* input_lengths, const int batch_size, const int vocab_size_padded, cudaStream_t stream); } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.h b/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.h index 701a1ebe8a6..0dc754ca82b 100644 --- a/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.h +++ b/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.h @@ -25,17 +25,10 @@ namespace tensorrt_llm namespace kernels { -template -void invokeApplyRepetitionPenalty(T* logits, const float penalty, const int* start_ids, int* output_ids, - const int batch_size, const int local_batch_size, const int vocab_size, const int vocab_size_padd, - const int* input_lengths, const int max_input_len, const int step, const RepetitionPenaltyType penalty_type, - cudaStream_t stream); - template void invokeBatchApplyRepetitionPenalty(T* logits, const float* penalties, const int** output_ids, const int* sequence_lengths, const int batch_size, const int local_batch_size, const int vocab_size, - const int* input_lengths, const int max_input_length, const RepetitionPenaltyType penalty_type, int max_seq_len, - cudaStream_t stream); + const int* input_lengths, const RepetitionPenaltyType penalty_type, int max_seq_len, cudaStream_t stream); template void invokeApplyTemperaturePenalty(T* logits, const T* bias, const float temperature, const int batch_size, @@ -47,7 +40,7 @@ void invokeBatchApplyTemperaturePenalty(T* logits, const T* bias, const float* t template void invokeMinLengthPenalty(T* logits, const int* min_lengths, const int* end_ids, const int* sequnece_lengths, - const int max_input_length, const int batch_size, const int vocab_size_padded, cudaStream_t stream); + const int* input_lengths, const int batch_size, const int vocab_size_padded, cudaStream_t stream); } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu index 6ba4f0a780f..d10f2cc290b 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu @@ -186,7 +186,7 @@ __global__ void 
topk_stage1(const T* __restrict log_probs, T* tmp_log_probs, int const int index = tmp_topk_buf_index + ite; topk_tmp_id_buf[index] = total.p; topk_tmp_val_buf[index] = total.u; - if (total.p >= 0 && total.p < vocab_size) + if (total.p >= 0) { tmp_log_probs[total.p] = -MAX_T_VAL; } @@ -312,16 +312,15 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf, T* t } } -#define CASE_K(K_MIN, K_MAX, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \ - case K_MIN ... K_MAX: \ - topk_stage1 \ - <<>>(log_probs, temp_log_probs, topk_tmp_id_buf, \ - topk_tmp_val_buf, finished, max_top_k, top_ks, vocab_size, end_ids, skip_decode); \ - topk_stage2_sampling \ - <<>>(topk_tmp_id_buf, \ - topk_tmp_val_buf, ids, sequence_lengths, finished, cum_log_probs, output_log_probs, max_top_k, top_ks, \ - top_p, top_ps, curandstate, end_ids, vocab_size, skip_decode); \ - break; +#define CASE_K(K_MAX, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \ + topk_stage1 \ + <<>>(log_probs, temp_log_probs, topk_tmp_id_buf, \ + topk_tmp_val_buf, finished, max_top_k, top_ks, vocab_size, end_ids, skip_decode); \ + topk_stage2_sampling \ + <<>>(topk_tmp_id_buf, \ + topk_tmp_val_buf, ids, sequence_lengths, finished, cum_log_probs, output_log_probs, max_top_k, top_ks, \ + top_p, top_ps, curandstate, end_ids, vocab_size, skip_decode); \ + break; template void invokeBatchTopKSampling(void* workspace, size_t& workspace_size, const T* log_probs, int** ids, @@ -355,12 +354,27 @@ void invokeBatchTopKSampling(void* workspace, size_t& workspace_size, const T* l int* topk_tmp_id_buf = (int*) (temp_log_probs + temp_log_probs_buf_size); T* topk_tmp_val_buf = (T*) (topk_tmp_id_buf + topk_tmp_ids_buf_size); - switch (max_top_k) + // TODO (bhsueh) need to support case top_k = [2, 17] (use different cases of max_top_k) + int log_max_top_k(0); + int recursor(max_top_k - 1); + while (recursor >>= 1) + ++log_max_top_k; + switch (log_max_top_k) { - CASE_K(1, 16, 128, 128, 8); - CASE_K(17, 32, 256, 128, 8); - CASE_K(33, 64, 256, 256, 8); - CASE_K(65, 1024, 256, 256, 8); + case 0: + case 1: + case 2: + case 3: // 0 < max_top_k <= 16 + CASE_K(16, 128, 128, 8); + case 4: // 16 < max_top_k <= 32 + CASE_K(32, 256, 128, 8); + case 5: // 32 < max_top_k <= 64 + CASE_K(64, 256, 256, 8); + case 6: + case 7: + case 8: + case 9: // 64 < max_top_k <= 1024 + CASE_K(1024, 256, 256, 8); default: throw std::domain_error(fmtstr("top-k kernel supports 1<=k<=1024 but got k=%d", max_top_k)); } } diff --git a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu index be4a2ba0601..7172251bfea 100644 --- a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu +++ b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu @@ -93,7 +93,8 @@ void invokeStopWordsCriterion(const int** output_ids, const int** parent_ids, co // Check if we have sampled a word from the stop_words list. If so, stop the // sequence. 
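// Launch-shape sketch (the numbers follow from the launch configuration just below): block.x is
// stop_words_len rounded up to a multiple of 32 (one warp) and capped at 256, and grid.x covers whatever
// is left. For example, stop_words_len = 40 gives block.x = 64 and grid.x = 1, while stop_words_len = 600
// gives block.x = 256 and grid.x = 3. grid.y spans batch_size * beam_width, i.e. one row of blocks per
// sequence.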
dim3 block, grid; - block.x = min(((stop_words_len + 32 - 1) / 32) * 32, 256UL); + constexpr size_t max_block_size{256}; + block.x = min(((stop_words_len + 32 - 1) / 32) * 32, max_block_size); grid.x = (stop_words_len + block.x - 1) / block.x; grid.y = batch_size * beam_width; diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu index 1e95938dcc6..3ef48a0c2b5 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu @@ -1252,8 +1252,8 @@ struct Vec_t<__nv_bfloat16> template __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, T* k_buf, T* v_buf, T* QKV, const T* __restrict qkv_bias, const int* seq_lens, const int* padding_offset, const int batch_size, const int seq_len, const int head_num, - const int kv_head_num, const int size_per_head, const int rotary_embedding_dim, - PositionEmbeddingType const position_embedding_type) + const int kv_head_num, const int size_per_head, const int rotary_embedding_dim, const float rotary_embedding_base, + const float rotary_embedding_scale, PositionEmbeddingType const position_embedding_type) { // This kernel add bias to QKV, which has shape [batch_size, seq_len, 3, head_num, size_per_head], and // QKV split to 3 split buffer q, k, v and transpose them to [batch_size, head_num, seq_len, size_per_head]. @@ -1351,11 +1351,10 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, T* k_buf, T* v_buf, switch (position_embedding_type) { - case PositionEmbeddingType::kLEARNED_ABSOLUTE: case PositionEmbeddingType::kROPE_GPTJ: - case PositionEmbeddingType::kALIBI: { - mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, dst_kv_seq_idx); + mmha::apply_rotary_embedding( + q, k, tidx, rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale, dst_kv_seq_idx); break; } case PositionEmbeddingType::kROPE_GPT_NEOX: @@ -1385,7 +1384,8 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, T* k_buf, T* v_buf, mmha::vec_from_smem_transpose(q, q_smem, transpose_idx, smem_pitch); mmha::vec_from_smem_transpose(k, k_smem, transpose_idx, smem_pitch); - mmha::apply_rotary_embedding(q, k, transpose_idx / tidx_factor, rotary_embedding_dim, dst_kv_seq_idx); + mmha::apply_rotary_embedding(q, k, transpose_idx / tidx_factor, rotary_embedding_dim, rotary_embedding_base, + rotary_embedding_scale, dst_kv_seq_idx); mmha::write_smem_transpose(q, q_smem, transpose_idx, smem_pitch); mmha::write_smem_transpose(k, k_smem, transpose_idx, smem_pitch); @@ -1456,12 +1456,13 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, T* k_buf, T* v_buf, #define FUSED_QKV_BIAS_ROTARY_TRANSPOSE_LAUNCH(T, ADD_BIAS, USING_CONTEXT_FMHA) \ add_fusedQKV_bias_transpose_kernel<<>>(q_buf, \ k_buf, v_buf, QKV, qkv_bias, seq_lens, padding_offset, batch_size, seq_len, head_num, kv_head_num, \ - size_per_head, rotary_embedding_dim, position_embedding_type); + size_per_head, rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale, position_embedding_type); template void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, T* v_buf, T* QKV, const T* qkv_bias, const int* seq_lens, const int* padding_offset, const int batch_size, const int seq_len, const int token_num, const int head_num, const int kv_head_num, const int size_per_head, const bool using_context_fmha, const int rotary_embedding_dim, + const float rotary_embedding_base, const float rotary_embedding_scale, const PositionEmbeddingType position_embedding_type, const 
float* scale, const int int8_mode, cudaStream_t stream) { // [bs, seq_len, 3, head, Dh] @@ -1534,8 +1535,9 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, T* v_buf, T* QKV, const template void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, T* v_buf, T* QKV, const T* qkv_bias, \ const int* seq_lens, const int* padding_offset, const int batch_size, const int seq_len, const int token_num, \ const int head_num, const int kv_head_num, const int size_per_head, const bool using_context_fmha, \ - const int rotary_embedding_dim, const PositionEmbeddingType position_embedding_type, const float* scale, \ - const int int8_mode, cudaStream_t stream) + const int rotary_embedding_dim, const float rotary_embedding_base, const float rotary_embedding_scale, \ + const PositionEmbeddingType position_embedding_type, const float* scale, const int int8_mode, \ + cudaStream_t stream) INSTANTIATE_ADDFUSEDQKVBIAS_TRANSPOSE(float); INSTANTIATE_ADDFUSEDQKVBIAS_TRANSPOSE(half); #ifdef ENABLE_BF16 diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h index 51ccc08cb34..f5ec68e3a57 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h @@ -76,6 +76,7 @@ template void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, T* v_buf, T* QKV, const T* qkv_bias, const int* seq_lens, const int* padding_offset, const int batch_size, const int seq_len, const int token_num, const int head_num, const int kv_head_num, const int size_per_head, const bool using_context_fmha, const int rotary_embedding_dim, + const float rotary_embedding_base, const float rotary_embedding_scale, PositionEmbeddingType const position_embedding_type, const float* scale, const int int8_mode, cudaStream_t stream); template @@ -91,11 +92,12 @@ template void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, T* v_buf, T* QKV, const int* seq_lens, const int* padding_offset, const int batch_size, const int seq_len, const int token_num, const int head_num, const int kv_head_num, const int size_per_head, const bool using_context_fmha, const int rotary_embedding_dim, + const float rotary_embedding_base, const float rotary_embedding_scale, PositionEmbeddingType const position_embedding_type, const float* scale, const int int8_mode, cudaStream_t stream) { invokeAddFusedQKVBiasTranspose(q_buf, k_buf, v_buf, QKV, (const T*) nullptr, seq_lens, padding_offset, batch_size, seq_len, token_num, head_num, kv_head_num, size_per_head, using_context_fmha, rotary_embedding_dim, - position_embedding_type, scale, int8_mode, stream); + rotary_embedding_base, rotary_embedding_scale, position_embedding_type, scale, int8_mode, stream); } template diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h new file mode 100644 index 00000000000..415f2d7b361 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace tensorrt_llm +{ +namespace kernels +{ +struct WeightOnlyParams +{ + const uint8_t* qweight; + const half* scales; + const half* zeros; + const half* in; + const half* bias; + half* out; + const int m; + const int n; + const int k; + const int group_size; + + WeightOnlyParams(const uint8_t* _qweight, const half* _scales, const half* _zeros, const half* _in, + const half* _bias, half* _out, const int _m, const int _n, const int _k, const int _group_size) + : qweight(_qweight) + , scales(_scales) + , zeros(_zeros) + , in(_in) + , bias(_bias) + , out(_out) + , m(_m) + , n(_n) + , k(_k) + , group_size(_group_size) + { + } +}; +enum class WeightOnlyQuantType +{ + Int4b, + Int8b +}; +enum class WeightOnlyType +{ + PerChannel, + GroupWise +}; + +struct WeightOnlyPerChannel; +template +struct WeightOnlyGroupWise; + +enum class WeightOnlyActivationType +{ + Gelu, + Relu, + Identity, + InvalidType +}; +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h new file mode 100644 index 00000000000..9a58c352b45 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +template +struct WeightLayoutDetails; + +template <> +struct WeightLayoutDetails +{ + // Every four rows of the original weights are interleaved into a row with stride of 64, so if each thread + // processes 32 elements(for int4, we can use ldg.128 to load weights), then every group of two adjacent threads + // will alternately process four different row weights + // for example + // every 256 consecutive int4 elements [256*i, 256*(i+1)-1] of row N under interleave layout, + // the first 64 are from [64*i, 64*(i+1)-1] of row 4N before interleaving, + // and the second 64 are from [64*i, 64*(i+1)-1] of row 4N+1 before interleaving, and so on. + // So if each thread loads 32 int4 elements, then the elements of each 2 adjacent threads of each 8 + // consecutive threads will come from row 4N ~ 4N+3 respectively before interleaving. + static constexpr int kElemBits = 4; + static constexpr int kInterleave = 4; + static constexpr int kStride = 64; + + // The index remapping here is to counteracts the effect of cutlass::permute_B_rows_for_mixed_gemm + // input 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ... 
31 + // weight 0 1 8 9 16 17 24 25 2 3 10 11 18 19 26 27 4 5 12 13 20 21 28 29 6 7 14 15 22 23 30 31 + static constexpr int kShuffleSize = 32; + static constexpr int kShuffleBasicTile = 2; + static constexpr int kShuffleContinous = 4; + static constexpr int kShuffleStrided = 4; + + // The rearrangement here counteracts the effect of cutlass::add_bias_and_interleave_int4s_inplace + // Input int8 data layout + // [elt_7 elt_5 elt_3 elt_1 elt_6 elt_4 elt_2 elt_0] (each elt occupies 4 bits) + // + // Converted fp16 data layout + // [elt_7 elt_6 elt_5 elt_4 elt_3 elt_2 elt_1 elt_0] (each elt occupies 16 bits) + static constexpr int kConvertCount = 8; + using Converter + = cutlass::FastInterleavedAndBiasedNumericArrayConverter; + + // Each warp completes the internal reduce and writes the [Batch * NPerBlock * Interleave] results to the + // corresponding address in shared memory + template + __device__ __forceinline__ static void sync(float* res, float (*sm)[Num * kInterleave]) + { +#pragma unroll + for (int i = 0; i < Num; ++i) + { + res[i] += __shfl_xor_sync(~0, res[i], 16); + res[i] += __shfl_xor_sync(~0, res[i], 8); + res[i] += __shfl_xor_sync(~0, res[i], 1); + } + __syncthreads(); + int warp = threadIdx.x / WarpSize, lane = threadIdx.x % WarpSize; + if (lane == 0 || lane == 2 || lane == 4 || lane == 6) + { +#pragma unroll + for (int i = 0; i < Num; ++i) + { + sm[warp][i * kInterleave + lane / 2] = res[i]; + } + } + __syncthreads(); + } +}; + +template <> +struct WeightLayoutDetails +{ + // Every two rows of the original weights are interleaved into a row with stride of 64, so if each thread + // processes 16 elements(for int8, we can use ldg.128 to load weights), then every group of four adjacent threads + // will alternately process two different row weights + // for example + // every 128 consecutive int8 elements [128*i, 128*(i+1)-1] of row N under interleave layout, + // the first 64 are from [64*i, 64*(i+1)-1] of row 2N before interleaving, + // and the last 64 are from [64*i, 64*(i+1)-1] of row 2N+1 before interleaving. + // So if each thread loads 16 int8 elements, then the elements of the first four and last four threads of each 8 + // consecutive threads will come from row 2N and row 2N+1 respectively before interleaving. 
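// Worked mapping for the int8 interleave described above (derived directly from that comment): for
// interleaved row N and flattened element index p, with kInterleave = 2 and kStride = 64,
//   chunk = p / 128, within = p % 128,
//   source row    = 2 * N + within / 64,
//   source column = chunk * 64 + within % 64.
// E.g. p = 70 maps to row 2N+1, column 6, and p = 130 maps to row 2N, column 66.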
+ static constexpr int kElemBits = 8; + static constexpr int kInterleave = 2; + static constexpr int kStride = 64; + + // The index remapping here counteracts the effect of cutlass::permute_B_rows_for_mixed_gemm + // input 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // weight 0 1 8 9 2 3 10 11 4 5 12 13 6 7 14 15 + static constexpr int kShuffleSize = 16; + static constexpr int kShuffleBasicTile = 2; + static constexpr int kShuffleContinous = 2; + static constexpr int kShuffleStrided = 4; + + // The rearrangement here counteracts the effect of cutlass::add_bias_and_interleave_int8s_inplace + // Input int8 data layout + // [elt_3 elt_1 elt_2 elt_0] (each elt occupies 8 bits) + // + // Converted fp16 data layout + // [elt_3 elt_2 elt_1 elt_0] (each elt occupies 16 bits) + static constexpr int kConvertCount = 4; + using Converter = cutlass::FastInterleavedAndBiasedNumericArrayConverter; + + // Each warp completes the internal reduce and writes the [Batch * NPerBlock * Interleave] results to the + // corresponding address in shared memory + template + __device__ __forceinline__ static void sync(float* res, float (*sm)[Num * kInterleave]) + { +#pragma unroll + for (int i = 0; i < Num; ++i) + { + res[i] += __shfl_xor_sync(~0, res[i], 16); + res[i] += __shfl_xor_sync(~0, res[i], 8); + res[i] += __shfl_xor_sync(~0, res[i], 2); + res[i] += __shfl_xor_sync(~0, res[i], 1); + } + __syncthreads(); + int warp = threadIdx.x / WarpSize, lane = threadIdx.x % WarpSize; + if (lane == 0 || lane == 4) + { +#pragma unroll + for (int i = 0; i < Num; ++i) + { + sm[warp][i * kInterleave + lane / 4] = res[i]; + } + } + __syncthreads(); + } +}; + +template +struct WeightOnlyKernelDetails +{ + using Layout = WeightLayoutDetails; + + static constexpr int kElemBits = Layout::kElemBits; + static constexpr int kInterleave = Layout::kInterleave; + static constexpr int kStride = Layout::kStride; + + static constexpr int kShuffleSize = Layout::kShuffleSize; + static constexpr int kShuffleBasicTile = Layout::kShuffleBasicTile; + static constexpr int kShuffleContinous = Layout::kShuffleContinous; + static constexpr int kShuffleStrided = Layout::kShuffleStrided; + + using Converter = typename Layout::Converter; + static constexpr int kConvertCount = Layout::kConvertCount; + + // Use ldg.128 to load data from global memory + static constexpr int kAccessSize = 128; + using AccessType = uint4; + + static constexpr int kElemsPerByte = 8 / kElemBits; + static constexpr int kElemsPerThread = kAccessSize / kElemBits; + static constexpr int kBytePerThread = kElemsPerThread / kElemsPerByte; + static constexpr int kThreadsNumPerTile = kStride / kElemsPerThread; + static constexpr int kThreadsNumPerInterleave = kThreadsNumPerTile * kInterleave; + + static constexpr int kConvertIters = kElemsPerThread / kConvertCount; + + // Each thread loads 16 (int8b) / 32 (int4b) quantized weight elements with each ldg.128, + // so more ldg.128 accesses are needed to load the same number of fp16 activation elements. 
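// Plugging the two specializations into the definitions above gives, per thread:
//   int4: kElemsPerByte = 2, kElemsPerThread = 32, kBytePerThread = 16,
//         kThreadsNumPerTile = 2, kThreadsNumPerInterleave = 8, kConvertIters = 4;
//   int8: kElemsPerByte = 1, kElemsPerThread = 16, kBytePerThread = 16,
//         kThreadsNumPerTile = 4, kThreadsNumPerInterleave = 8, kConvertIters = 4.
// Either way one ldg.128 moves 16 bytes of quantized weights, and kActivationAccessNum (defined just
// below) works out to 32 / 8 = 4 activation loads for int4 and 16 / 8 = 2 for int8.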
+ static constexpr int kActivationElemNumPerAccess = kAccessSize / (sizeof(half) * 8); + static constexpr int kActivationAccessNum = kElemsPerThread / kActivationElemNumPerAccess; +}; + +template +struct WeightOnlyProperties; + +template <> +struct WeightOnlyProperties +{ + static constexpr bool kIsFineGrained = false; + static constexpr int kGroupSize = 0; +}; + +template +struct WeightOnlyProperties> +{ + static constexpr bool kIsFineGrained = true; + static constexpr int kGroupSize = GS; +}; + +template +struct WeightOnlyScaleLoader +{ + using ElemType = half; + using Details = WeightOnlyKernelDetails; + static constexpr bool kIsFineGrained = WeightOnlyProperties::kIsFineGrained; + static constexpr int kGroupSize = WeightOnlyProperties::kGroupSize; + +private: + const ElemType* _scales; + const ElemType* _zeros; + int _stride; + int _offset; + +public: + __device__ __forceinline__ WeightOnlyScaleLoader( + const ElemType* scales, const ElemType* zeros, int initial_offset, int stride) + : _scales(scales) + , _zeros(zeros) + , _stride(stride) + { + _scales += initial_offset; + if constexpr (Zero) + { + _zeros += initial_offset; + } + // Calculate the k dimension index of the element processed by the current thread of layout before interleave + // Used to load scales and zeros in groupwise weight only quant + _offset = threadIdx.x / Details::kThreadsNumPerInterleave * Details::kStride + + (threadIdx.x % Details::kThreadsNumPerTile) * Details::kElemsPerThread; + } + + __device__ __forceinline__ void load(ElemType& scale, ElemType& zero, int nid) + { + int offset = nid * Details::kInterleave; + if constexpr (kIsFineGrained) + { + offset += _offset / kGroupSize * _stride; + } + scale = _scales[offset]; + if constexpr (Zero) + { + zero = _zeros[offset]; + } + else + { + zero = static_cast(0.f); + } + } + + __device__ __forceinline__ void advance() + { + _offset += BlockSize * Details::kElemsPerThread / Details::kInterleave; + } + + __device__ __forceinline__ int offset() + { + return _offset; + } +}; + +template class ActOp, bool Zero, bool Bias, + int NPerBlock, int Batch, int BlockSize> +__global__ void weight_only_batched_gemv(const uint8_t* qweight, const half* scales, const half* zeros, const half* in, + const half* bias, half* out, const int n, const int k) +{ + static_assert(NPerBlock == 1 || (NPerBlock % 2 == 0)); + using Details = WeightOnlyKernelDetails; + + using Converter = typename Details::Converter; + using AccType = typename Details::AccessType; + using CvtSrcType = typename Converter::source_type; + using CvtResType = typename Converter::result_type; + using ScaleLoader = WeightOnlyScaleLoader; + extern __shared__ uint8_t shmem[]; + constexpr int Interleave = Details::kInterleave; + constexpr int WarpSize = 32; + constexpr int Num = Batch * NPerBlock; + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int n_start_id = bid * NPerBlock * Interleave; + // Calculate the n-dimensional index of the data processed by the current thread in the interleave tile + const int interleave_n_id = (tid / Details::kThreadsNumPerTile) % Interleave; + + qweight += n_start_id * k / Details::kElemsPerByte; + ScaleLoader scale_loader(scales, zeros, n_start_id + interleave_n_id, n); + + float(*sm)[Num * Interleave] = reinterpret_cast(shmem); + + // In order to take advantage of hfma2, we use fp16 for accumulation within threads and fp32 for accumulation + // between threads. 
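// The NPerBlock == 1 path below shows what this comment means in practice: weights and activations are
// consumed as half2, so each __hfma2 retires two multiply-adds; the per-thread partial sums stay in the
// half `accumulator` array, and they are widened to float (`reses` further down) before the
// __shfl_xor_sync reduction in Details::Layout::sync combines them across the warp.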
+ half accumulator[Num]; + for (int i = 0; i < Num; ++i) + { + accumulator[i] = __float2half_rn(0.f); + } + + // Iteration in k dimensions + for (int local_k = tid * Details::kElemsPerThread; local_k < k * Interleave; + local_k += BlockSize * Details::kElemsPerThread) + { + half weights_f16[Details::kElemsPerThread * NPerBlock]; + half scale[NPerBlock], zero[NPerBlock]; +#pragma unroll + for (int idx = 0; idx < NPerBlock; ++idx) + { + // Load quantized weight and scales/zeros + uint8_t weights_quantized[Details::kBytePerThread]; + load(weights_quantized, + qweight + idx * Interleave * k / Details::kElemsPerByte + local_k / Details::kElemsPerByte); + scale_loader.load(scale[idx], zero[idx], idx); + half weights_vec[Details::kElemsPerThread]; +#pragma unroll + for (int i = 0; i < Details::kConvertIters; ++i) + { + // Use cutlass::FastInterleavedAndBiasedNumericArrayConverter for I2F type conversion + assign(weights_vec + i * Details::kConvertCount, + Converter::convert(*reinterpret_cast( + weights_quantized + i * Details::kConvertCount / Details::kElemsPerByte))); + } +#pragma unroll + for (int i = 0; i < Details::kShuffleContinous; ++i) + { +#pragma unroll + for (int j = 0; j < Details::kShuffleStrided; ++j) + { + // Dequantize the weights and arrange the shuffled elements back to the correct order in the + // register array + half2 v = *reinterpret_cast(weights_vec + i * Details::kShuffleBasicTile + + j * Details::kShuffleContinous * Details::kShuffleBasicTile); + v = __hfma2(v, __half2half2(scale[idx]), __half2half2(zero[idx])); + weights_f16[(i * Details::kShuffleStrided * Details::kShuffleBasicTile + + j * Details::kShuffleBasicTile + 0) + * NPerBlock + + idx] + = v.x; + weights_f16[(i * Details::kShuffleStrided * Details::kShuffleBasicTile + + j * Details::kShuffleBasicTile + 1) + * NPerBlock + + idx] + = v.y; + } + } + } +#pragma unroll + for (int b = 0; b < Batch; ++b) + { + half in_v[Details::kElemsPerThread]; +#pragma unroll + for (int idx = 0; idx < Details::kActivationAccessNum; ++idx) + { + // load activation elements + load(in_v + idx * Details::kActivationElemNumPerAccess, + in + b * k + scale_loader.offset() + idx * Details::kActivationElemNumPerAccess); + } + // Perform vector inner product and accumulate + if constexpr (NPerBlock == 1) + { + half2 v = __float2half2_rn(0.f); +#pragma unroll + for (int y = 0; y < Details::kElemsPerThread; y += 2) + { + v = __hfma2(*reinterpret_cast(weights_f16 + y), *reinterpret_cast(in_v + y), v); + } + accumulator[b] += __hadd(v.x, v.y); + } + else + { +#pragma unroll + for (int x = 0; x < NPerBlock / 2; ++x) + { +#pragma unroll + for (int y = 0; y < Details::kElemsPerThread; ++y) + { + *reinterpret_cast(accumulator + b * NPerBlock + x * 2) + = __hfma2(*reinterpret_cast(weights_f16 + y * NPerBlock + x * 2), + __half2half2(in_v[y]), *reinterpret_cast(accumulator + b * NPerBlock + x * 2)); + } + } + } + } + scale_loader.advance(); + } + float reses[Num]; +#pragma unroll + for (int i = 0; i < Num; ++i) + { + reses[i] = __half2float(accumulator[i]); + } + + // Each warp completes the internal reduce and writes the [Batch * NPerBlock * Interleave] results to the + // corresponding address in shared memory + Details::Layout::sync(reses, sm); + + // Each thread is responsible for the accumulation and store to global memory of one element + for (int i = tid; i < Num * Interleave; i += BlockSize) + { + int nid = i % (NPerBlock * Interleave); + float v = 0.f; + for (int j = 0; j < BlockSize / WarpSize; ++j) + { + v += sm[j][i]; + } + float bias_v = 
0.f; + if constexpr (Bias) + { + bias_v = __half2float(bias[n_start_id + nid]); + } + int b = i / NPerBlock / Interleave; + out[b * n + n_start_id + nid] = __float2half_rn(ActOp::apply(v + bias_v)); + } +} + +template class ActOp, bool Zero, bool Bias, + int NPerBlock, int Batch, int BlockSize> +struct WeightOnlyBatchedGemvKernelLauncher +{ + static constexpr int kInterleave = WeightLayoutDetails::kInterleave; + + static void run(const WeightOnlyParams& params, cudaStream_t stream) + { + dim3 grid(params.n / NPerBlock / kInterleave); + dim3 block(BlockSize); + int size = sizeof(float) * BlockSize / 32 * Batch * NPerBlock * kInterleave; + weight_only_batched_gemv + <<>>( + params.qweight, params.scales, params.zeros, params.in, params.bias, params.out, params.n, params.k); + } +}; +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.cu new file mode 100644 index 00000000000..f04b2d354b7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.cu @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +template class ActOp, bool Zero, bool Bias, + int N_PER_BLOCK, int BATCH, int BLOCK_SIZE> +struct WeightOnlyBatchedGemvKernelLauncher +{ + static void run(const WeightOnlyParams& params, cudaStream_t stream); +}; + +template class ActOp, int N_PER_BLOCK, + int BATCH, int BLOCK_SIZE> +void select_zero_bias(const WeightOnlyParams& params, cudaStream_t stream) +{ + if (params.zeros && params.bias) + { + WeightOnlyBatchedGemvKernelLauncher::run(params, stream); + } + else if (params.zeros && !params.bias) + { + WeightOnlyBatchedGemvKernelLauncher::run(params, stream); + } + else if (!params.zeros && params.bias) + { + WeightOnlyBatchedGemvKernelLauncher::run(params, stream); + } + else + { + WeightOnlyBatchedGemvKernelLauncher::run(params, stream); + } +} + +template +void select_activation(WeightOnlyActivationType atype, const WeightOnlyParams& params, cudaStream_t stream) +{ + switch (atype) + { + case WeightOnlyActivationType::Gelu: + { + select_zero_bias(params, stream); + break; + } + case WeightOnlyActivationType::Relu: + { + select_zero_bias(params, stream); + break; + } + case WeightOnlyActivationType::Identity: + { + select_zero_bias(params, stream); + break; + } + default: + { + throw std::runtime_error("Use unsupported activation"); + break; + } + } +} + +template +void select_quant_type( + WeightOnlyQuantType qtype, WeightOnlyActivationType atype, const WeightOnlyParams& params, cudaStream_t stream) +{ + if (qtype == WeightOnlyQuantType::Int4b) + { + select_activation( + atype, params, stream); + } + else if (qtype == WeightOnlyQuantType::Int8b) + { + select_activation( + atype, params, stream); + } + else + { + throw std::runtime_error("Unknown QuantType"); + } +} + +template +void select_groupwise_weight_only(WeightOnlyQuantType qtype, WeightOnlyType wtype, WeightOnlyActivationType atype, + const WeightOnlyParams& params, cudaStream_t stream) +{ + if (wtype == WeightOnlyType::GroupWise && params.group_size == 64) + { + select_quant_type, N_PER_BLOCK, BATCH, BLOCK_SIZE>(qtype, atype, params, stream); + } + else if (wtype == WeightOnlyType::GroupWise && params.group_size == 128) + { + select_quant_type, N_PER_BLOCK, BATCH, BLOCK_SIZE>(qtype, atype, params, stream); + } + else + { + throw std::runtime_error("Only support groupwise weight only for gs=64/128"); + } +} + +void weight_only_batched_gemv_launcher(WeightOnlyQuantType qtype, WeightOnlyType wtype, WeightOnlyActivationType atype, + const WeightOnlyParams& params, cudaStream_t stream) +{ + if (wtype == WeightOnlyType::PerChannel) + { + if (qtype == WeightOnlyQuantType::Int4b) + { + switch (params.m) + { + case 1: + { + select_activation(atype, params, stream); + break; + } + case 2: + { + select_activation(atype, params, stream); + break; + } + case 3: + { + select_activation(atype, params, stream); + break; + } + case 4: + { + select_activation(atype, params, stream); + break; + } + default: + { + throw std::runtime_error("Weight only cuda kernel only supported bs <= 4"); + break; + } + } + } + else if (qtype == WeightOnlyQuantType::Int8b) + { + switch (params.m) + { + case 1: + { + select_activation(atype, params, stream); + break; + } + case 2: + { + select_activation(atype, params, stream); + break; + } + case 3: + { + select_activation(atype, params, stream); + break; + } + case 4: + { + select_activation(atype, params, stream); + break; 
+ } + default: + { + throw std::runtime_error("Weight only cuda kernel only supported bs <= 4"); + break; + } + } + } + } + else if (wtype == WeightOnlyType::GroupWise) + { + switch (params.m) + { + case 1: + { + select_groupwise_weight_only<2, 1, 256>(qtype, wtype, atype, params, stream); + break; + } + case 2: + { + select_groupwise_weight_only<2, 2, 256>(qtype, wtype, atype, params, stream); + break; + } + case 3: + { + select_groupwise_weight_only<2, 3, 128>(qtype, wtype, atype, params, stream); + break; + } + case 4: + { + select_groupwise_weight_only<2, 4, 128>(qtype, wtype, atype, params, stream); + break; + } + default: + { + throw std::runtime_error("Weight only cuda kernel only supported bs <= 4"); + break; + } + } + } +} +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h similarity index 52% rename from cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.h rename to cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h index ed04d343658..b4b032105e7 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h @@ -1,12 +1,11 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -15,18 +14,13 @@ * limitations under the License. */ #pragma once -#include -#include -#include +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" namespace tensorrt_llm { namespace kernels { - -void groupwise_weight_only_matmul_i2f_launcher(const int32_t* qweight, const half* qscales, const half* qzeros, - const half* in, const half* bias, half* out, const int batch, const int n, const int k, const int group_size, - cudaStream_t* stream); - -} // namespace kernels +void weight_only_batched_gemv_launcher(WeightOnlyQuantType qtype, WeightOnlyType wtype, WeightOnlyActivationType atype, + const WeightOnlyParams& params, cudaStream_t stream); +} } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h new file mode 100644 index 00000000000..4decc521653 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass_extensions/interleaved_numeric_conversion.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +__forceinline__ __device__ float copysignf_pos(float a, float b) +{ + float r; + r = __int_as_float(__float_as_int(a) | (__float_as_int(b) & 0x80000000)); + return r; +} + +__inline__ __device__ float tanh_opt(float x) +{ +#if (__CUDA_ARCH__ >= 750 && CUDART_VERSION >= 11000) + float r; + asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(r) : "f"(x)); + return r; +#else + const float exp_val = -1.f * fabs(2 * x); + return copysignf_pos((1.0f - __expf(exp_val)) / (__expf(exp_val) + 1.0f), x); +#endif +} + +template +struct GeluActivation +{ + static __device__ __forceinline__ T apply(const T& val) + { + const float cdf = 0.5f * (1.0f + tanh_opt((0.7978845608028654f * (val + 0.044715f * val * val * val)))); + return val * cdf; + } +}; + +template +struct ReluActivation +{ + static __device__ __forceinline__ T apply(const T& val) + { + return val > static_cast(0.0f) ? val : static_cast(0.0f); + } +}; + +template +struct IdentityActivation +{ + static __device__ __forceinline__ T apply(const T& val) + { + return val; + } +}; + +template +__device__ __forceinline__ void load(T0* dst, T1* src, size_t offset = 0) +{ + *reinterpret_cast(dst) = *(reinterpret_cast(src) + offset); +} + +template +__device__ __forceinline__ void assign(T* dst, const AssignType& val) +{ + *reinterpret_cast(dst) = val; +} + +template +__device__ __forceinline__ void store(T0* src, T1* dst, size_t offset = 0) +{ + *(reinterpret_cast(dst) + offset) = *reinterpret_cast(src); +} +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int4b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int4b.cu new file mode 100644 index 00000000000..cb9ea68fd35 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int4b.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 1, 256>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int8b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int8b.cu new file mode 100644 index 00000000000..59270fdd753 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int8b.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 
2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 1, 256>; 
+template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 1, 256>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int4b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int4b.cu new file mode 100644 index 00000000000..1302e8dcefb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int4b.cu @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, 
false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 2, 256>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int8b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int8b.cu new file mode 100644 index 00000000000..72a515fe273 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int8b.cu @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 2, 256>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int4b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int4b.cu new file mode 100644 index 00000000000..4224bdac29a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int4b.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 
2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 3, 128>; 
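Note on the pattern used by these Bs*Int4b/Int8b translation units: each file exists to force the compiler to emit one specialization of WeightOnlyBatchedGemvKernelLauncher per configuration (activation, zero/bias handling, N-per-block, batch size, block size), so the template-heavy work is split across small files that can compile in parallel. What follows is a generic, hypothetical sketch of that explicit-instantiation pattern, with made-up names and a trivial body rather than the real launcher signature.

// Hypothetical stand-in for a launcher template, defined header-style so the
// explicit instantiations below can emit code for each configuration.
#include <cstdio>

template <bool Zero, bool Bias, int NPerBlock, int Batch, int BlockSize>
struct KernelLauncher
{
    static void run()
    {
        // A real launcher would derive grid/block dimensions from these
        // parameters and launch a CUDA kernel; here we only print them.
        std::printf("Zero=%d Bias=%d NPerBlock=%d Batch=%d BlockSize=%d\n",
            int(Zero), int(Bias), NPerBlock, Batch, BlockSize);
    }
};

// One explicit instantiation per supported configuration; spreading such
// lines across per-batch-size .cu files keeps each translation unit small.
template struct KernelLauncher<true, true, 2, 3, 128>;
template struct KernelLauncher<true, false, 2, 3, 128>;
template struct KernelLauncher<false, true, 2, 3, 128>;
template struct KernelLauncher<false, false, 2, 3, 128>;

int main()
{
    KernelLauncher<true, true, 2, 3, 128>::run();
    return 0;
}

In the real files, the equivalent of run() presumably launches a GEMV kernel tuned for the batch size and block size encoded in the instantiation, which is why the trailing integer parameters change between the Bs1/Bs2 (256-thread) and Bs3/Bs4 (128-thread) files.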
+template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 3, 128>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int8b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int8b.cu new file mode 100644 index 00000000000..032aea0cba8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int8b.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + 
GeluActivation, false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 3, 128>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int4b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int4b.cu new file mode 100644 index 00000000000..b3049c70fa5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int4b.cu @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 4, 128>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int8b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int8b.cu new file mode 100644 index 00000000000..66cebb38b0f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int8b.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 
2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 4, 128>; 
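The Int4b variants pack two quantized weights per byte, halving weight-memory traffic relative to Int8b at the cost of an extra unpack-and-dequantize step inside the kernel. Below is a minimal host-side sketch of that step with an assumed zero-point and scale; the production kernels instead operate on interleaved weights through cutlass fast converters.

#include <cstdint>
#include <cstdio>

int main()
{
    // One byte holds two 4-bit weights; the values, zero-point and scale
    // here are assumed purely for illustration.
    uint8_t const packed = 0xB4;   // high nibble = 0xB (11), low nibble = 0x4 (4)
    float const scale = 0.05f;     // per-channel dequantization scale (assumed)
    int const zeroPoint = 8;       // assumed zero-point for signed weights

    int const w0 = (packed & 0x0F) - zeroPoint; // 4 - 8  = -4
    int const w1 = (packed >> 4) - zeroPoint;   // 11 - 8 =  3

    std::printf("dequantized: %f %f\n", w0 * scale, w1 * scale);
    return 0;
}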
+template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 4, 128>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.cu b/cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.cu deleted file mode 100644 index 5349ffdd919..00000000000 --- a/cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.cu +++ /dev/null @@ -1,236 +0,0 @@ -#include "cutlass/cutlass.h" -#include "cutlass_extensions/interleaved_numeric_conversion.h" -#include "tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.h" -#include -#include -#include -#include -#include - -namespace tensorrt_llm -{ -namespace kernels -{ -template -__global__ void groupwise_weight_only_matmul_i2f(const int32_t* qweight, const half* scales, const half* zeros, - const half* in, const half* bias, half* out, const int n, const int k) -{ - static_assert(N_PER_BLOCK == 1 || (N_PER_BLOCK % 2 == 0)); - using Converter = cutlass::FastInterleavedAndBiasedNumericArrayConverter; - extern __shared__ uint8_t shmem[]; - constexpr int Interleave = 4; - constexpr int NUM = BATCH * N_PER_BLOCK; - const int tid = threadIdx.x; - const int bid = blockIdx.x; - const int n_start_id = bid * N_PER_BLOCK * Interleave; - const int interleave_n_id = (tid / 2) % Interleave; - - qweight += n_start_id * k / 8; - scales += (n_start_id + interleave_n_id); - if constexpr (Zero) - { - zeros += (n_start_id + interleave_n_id); - } - float(*sm)[NUM * Interleave] = reinterpret_cast(shmem); - - half reses[NUM]; - for (int i = 0; i < NUM; ++i) - { - reses[i] = __float2half_rn(0.f); - } - - for (int local_k = tid * 32, real_k = tid / 8 * 64 + (tid % 2) * 32; local_k < k * Interleave; - local_k += BLOCK_SIZE * 32, real_k += BLOCK_SIZE * 32 / Interleave) - { - half weights_f16[32 * N_PER_BLOCK]; - half scale[N_PER_BLOCK], zero[N_PER_BLOCK]; -#pragma unroll - for (int idx = 0; idx < N_PER_BLOCK; ++idx) - { - uint8_t weights_i4[16]; - *reinterpret_cast(weights_i4) - = *reinterpret_cast(qweight + idx * Interleave * k / 8 + local_k / 8); - - scale[idx] = scales[real_k / GROUP_SIZE * n + idx * Interleave]; - if constexpr (Zero) - { - zero[idx] = zeros[real_k / GROUP_SIZE * n + idx * Interleave]; - } - else - { - zero[idx] = __float2half_rn(0.f); - } - half weights_vec[32]; -#pragma unroll - for (int i = 0; i < 4; ++i) - { - *reinterpret_cast(weights_vec + i * 8) - = Converter::convert(*reinterpret_cast(weights_i4 + i * 4)); - } -#pragma unroll - for (int i = 0; i < 4; ++i) - { -#pragma unroll - for (int j = 0; j < 4; ++j) - { - half2 v = *reinterpret_cast(weights_vec + i * 2 + j * 8); - v = __hfma2(v, __half2half2(scale[idx]), __half2half2(zero[idx])); - weights_f16[(i * 8 + j * 2 + 0) * N_PER_BLOCK + idx] = v.x; - weights_f16[(i * 8 + j * 2 + 1) * N_PER_BLOCK + idx] = v.y; - } - } - } - -#pragma unroll - for (int b = 0; b < BATCH; ++b) - { - half in_v[32]; -#pragma unroll - for (int idx = 0; idx < 4; ++idx) - { - *reinterpret_cast(in_v + idx * 8) - = *reinterpret_cast(in + b * k + real_k + idx * 8); - } - if constexpr (N_PER_BLOCK == 1) - { - half2 v = __float2half2_rn(0.f); -#pragma unroll - for (int y = 0; y < 32; y += 2) - { - v = __hfma2(*reinterpret_cast(weights_f16 + y), *reinterpret_cast(in_v + y), v); - } - reses[b] += __hadd(v.x, v.y); - } - else - { -#pragma unroll - for (int x = 0; x < N_PER_BLOCK / 2; ++x) - { 
-#pragma unroll - for (int y = 0; y < 32; ++y) - { - *reinterpret_cast(reses + b * N_PER_BLOCK + x * 2) - = __hfma2(*reinterpret_cast(weights_f16 + y * N_PER_BLOCK + x * 2), - __half2half2(in_v[y]), *reinterpret_cast(reses + b * N_PER_BLOCK + x * 2)); - } - } - } - } - } - float reses2[NUM]; -#pragma unroll - for (int i = 0; i < NUM; ++i) - { - reses2[i] = __half2float(reses[i]); - } -#pragma unroll - for (int i = 0; i < NUM; ++i) - { - reses2[i] += __shfl_xor_sync(~0, reses2[i], 16); - reses2[i] += __shfl_xor_sync(~0, reses2[i], 8); - reses2[i] += __shfl_xor_sync(~0, reses2[i], 1); - } - __syncthreads(); - int warp = tid / 32, lane = tid % 32; - if (lane == 0 || lane == 2 || lane == 4 || lane == 6) - { -#pragma unroll - for (int i = 0; i < NUM; ++i) - { - sm[warp][i * Interleave + lane / 2] = reses2[i]; - } - } - __syncthreads(); - for (int i = tid; i < NUM * Interleave; i += BLOCK_SIZE) - { - int nid = i % (N_PER_BLOCK * Interleave); - float v = 0.f; - for (int j = 0; j < BLOCK_SIZE / 32; ++j) - { - v += sm[j][i]; - } - float bias_v; - if constexpr (Bias) - { - bias_v = __half2float(bias[n_start_id + nid]); - } - else - { - bias_v = 0.f; - } - int b = i / N_PER_BLOCK / Interleave; - out[b * n + n_start_id + nid] = __float2half_rn(v + bias_v); - } -} - -#define RUN_groupwise_weight_only_matmul_i2f_2(Zero, Bias, N_PER_BLOCK, BATCH, BLOCKSIZE) \ - { \ - dim3 grid(n / N_PER_BLOCK / 4); \ - dim3 block(BLOCKSIZE); \ - int size = sizeof(float) * BLOCKSIZE / 32 * BATCH * N_PER_BLOCK * 4; \ - if (group_size == 64) \ - { \ - groupwise_weight_only_matmul_i2f \ - <<>>(qweight, qscales, qzeros, in, bias, out, n, k); \ - } \ - else if (group_size == 128) \ - { \ - groupwise_weight_only_matmul_i2f \ - <<>>(qweight, qscales, qzeros, in, bias, out, n, k); \ - } \ - else \ - { \ - printf("Invalid group size. 
Only group size 64 and 128 supported for fine grained kernels."); \ - std::abort(); \ - } \ - break; \ - } - -#define RUN_groupwise_weight_only_matmul_i2f_1(N_PER_BLOCK, BATCH, BLOCKSIZE) \ - { \ - if (qzeros && bias) \ - { \ - RUN_groupwise_weight_only_matmul_i2f_2(true, true, N_PER_BLOCK, BATCH, BLOCKSIZE); \ - } \ - else if (qzeros && !bias) \ - { \ - RUN_groupwise_weight_only_matmul_i2f_2(true, false, N_PER_BLOCK, BATCH, BLOCKSIZE); \ - } \ - else if (!qzeros && bias) \ - { \ - RUN_groupwise_weight_only_matmul_i2f_2(false, true, N_PER_BLOCK, BATCH, BLOCKSIZE); \ - } \ - else \ - { \ - RUN_groupwise_weight_only_matmul_i2f_2(false, false, N_PER_BLOCK, BATCH, BLOCKSIZE); \ - } \ - } - -void groupwise_weight_only_matmul_i2f_launcher(const int32_t* qweight, const half* qscales, const half* qzeros, - const half* in, const half* bias, half* out, const int batch, const int n, const int k, const int group_size, - cudaStream_t* stream) -{ - switch (batch) - { - case 1: RUN_groupwise_weight_only_matmul_i2f_1(2, 1, 256); - case 2: RUN_groupwise_weight_only_matmul_i2f_1(2, 2, 256); - case 3: RUN_groupwise_weight_only_matmul_i2f_1(2, 3, 128); - case 4: RUN_groupwise_weight_only_matmul_i2f_1(2, 4, 128); - case 5: RUN_groupwise_weight_only_matmul_i2f_1(2, 5, 128); - case 6: RUN_groupwise_weight_only_matmul_i2f_1(2, 6, 256); - case 7: RUN_groupwise_weight_only_matmul_i2f_1(2, 7, 128); - case 8: RUN_groupwise_weight_only_matmul_i2f_1(2, 8, 128); - case 9: RUN_groupwise_weight_only_matmul_i2f_1(2, 9, 128); - case 10: RUN_groupwise_weight_only_matmul_i2f_1(4, 10, 128); - case 11: RUN_groupwise_weight_only_matmul_i2f_1(4, 11, 128); - case 12: RUN_groupwise_weight_only_matmul_i2f_1(2, 12, 128); - case 13: RUN_groupwise_weight_only_matmul_i2f_1(4, 13, 128); - case 14: RUN_groupwise_weight_only_matmul_i2f_1(4, 14, 128); - case 15: RUN_groupwise_weight_only_matmul_i2f_1(4, 15, 128); - case 16: RUN_groupwise_weight_only_matmul_i2f_1(4, 16, 128); - default: printf("vecquant4matmul_nk_kernel_launcher invalid batch!! batch=%d ", batch); std::abort(); - } -} - -} // namespace kernels -} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.cu b/cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.cu deleted file mode 100644 index 79c798bc0e5..00000000000 --- a/cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.cu +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "stdio.h" -#include -#include -#include - -#include "cutlass/cutlass.h" -#include "cutlass_extensions/interleaved_numeric_conversion.h" -#include "tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.h" - -namespace tensorrt_llm -{ -namespace kernels -{ - -///////////////////////////////////////////////////////////////////// -/* Fast convert from weight only int8/int4 to half */ - -template -struct FastWeightOnlyHalfConverter; - -template <> -struct FastWeightOnlyHalfConverter -{ - using Converter = cutlass::FastInterleavedAndBiasedNumericArrayConverter; - static constexpr int kHalfLength = 4; - static constexpr int kWeightOnlyLength = 4; - - __device__ static inline void convert(half halves[kHalfLength], uint8_t chars[kWeightOnlyLength], half scale) - { - *reinterpret_cast(halves) - = Converter::convert(*reinterpret_cast(chars)); -#pragma unroll - for (int i = 0; i < kHalfLength; ++i) - { - halves[i] *= scale; - } - } -}; - -template <> -struct FastWeightOnlyHalfConverter -{ - using Converter = cutlass::FastInterleavedAndBiasedNumericArrayConverter; - static constexpr int kHalfLength = 8; - static constexpr int kWeightOnlyLength = 4; - - __device__ static inline void convert(half halves[kHalfLength], uint8_t chars[kWeightOnlyLength], half scale) - { - *reinterpret_cast(halves) - = Converter::convert(*reinterpret_cast(chars)); -#pragma unroll - for (int i = 0; i < kHalfLength; ++i) - { - halves[i] *= scale; - } - } -}; - -/* Activation */ - -__forceinline__ __device__ float copysignf_pos(float a, float b) -{ - float r; - r = __int_as_float(__float_as_int(a) | (__float_as_int(b) & 0x80000000)); - return r; -} - -__inline__ __device__ float tanh_opt(float x) -{ -#if (__CUDA_ARCH__ >= 750 && CUDART_VERSION >= 11000) - float r; - asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(r) : "f"(x)); - return r; -#else - const float exp_val = -1.f * fabs(2 * x); - return copysignf_pos((1.0f - __expf(exp_val)) / (__expf(exp_val) + 1.0f), x); -#endif -} - -template -struct GeluActivation -{ - static __device__ __forceinline__ T apply(const T& val) - { - const float cdf = 0.5f * (1.0f + tanh_opt((0.7978845608028654f * (val + 0.044715f * val * val * val)))); - return val * cdf; - } -}; - -template -struct ReluActivation -{ - static __device__ __forceinline__ T apply(const T& val) - { - return val > static_cast(0.0f) ? val : static_cast(0.0f); - } -}; - -template -struct IdentityActivation -{ - static __device__ __forceinline__ T apply(const T& val) - { - return val; - } -}; - -template -__device__ __forceinline__ void load(T0* dst, T1* src, size_t offset = 0) -{ - *reinterpret_cast(dst) = *(reinterpret_cast(src) + offset); -} - -template -__device__ __forceinline__ void store(T0* src, T1* dst, size_t offset = 0) -{ - *(reinterpret_cast(dst) + offset) = *reinterpret_cast(src); -} - -template class Activation, int K = 0> -__global__ void int8_weight_only_gemv_interleave(const int8_t* weight, const half* input, const half* scale_list, - const half* bias, half* output, const int n, const int k_) -{ - using Converter = FastWeightOnlyHalfConverter; - int k = K != 0 ? 
K : k_; - uint8_t vec_weight[16]; - half vec_input[16]; - half vec_weight_f16[16]; - int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32; - int tile_id = blockIdx.x * blockDim.x / 32 + warp_id; - // Every two rows of the original weights are interleaved into a row with stride of 64, so if each thread - // processes 16 elements(for int8, we can use ldg.128 to load weights), then every group of four adjacent threads - // will alternately process two different row weights - // for example - // every 128 consecutive int8 elements [128*i, 128*(i+1)-1] of row N under interleave layout, - // the first 64 are from [64*i, 64*(i+1)-1] of row 2N before interleaving, - // and the last 64 are from [64*i, 64*(i+1)-1] of row 2N+1 before interleaving. - // So if each thread loads 16 int8 elements, then the elements of the first four and last four threads of each 8 - // consecutive threads will come from row 2N and row 2N+1 respectively before interleaving. - int row_id = tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0); - weight += tile_id * k * 2; - float v = 0.f, v_bias; - half scale = scale_list[row_id]; - if (Bias) - { - v_bias = __half2float(bias[row_id]); - } -#pragma unroll - for (int i = lane_id * 16; i < k * 2; i += 16 * 32) - { - load(vec_weight, weight + i); - load(vec_input, input + i / 128 * 64 + (i % 64)); - load(vec_input + 8, input + i / 128 * 64 + (i % 64) + 8); -#pragma unroll - for (int p = 0; p < 16; p += Converter::kHalfLength) - { - // The rearrangement here counteracts the effect of cutlass::add_bias_and_interleave_int8s_inplace - // Input int8 data layout - // [elt_3 elt_1 elt_2 elt_0] (each elt occupies 8 bits) - // - // Converted fp16 data layout - // [elt_3 elt_2 elt_1 elt_0] (each elt occupies 16 bits) - Converter::convert(vec_weight_f16 + p, vec_weight + p, scale); - } -#pragma unroll - for (int p = 0; p < 16; ++p) - { - // The index remapping here is to counteracts the effect of cutlass::permute_B_rows_for_mixed_gemm - // input 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - // weight 0 1 8 9 2 3 10 11 4 5 12 13 6 7 14 15 - v += __half2float(__hmul(vec_input[p], vec_weight_f16[4 * ((p % 8) / 2) + p % 2 + 2 * (p / 8)])); - } - } - v += __shfl_xor_sync(0xffffffff, v, 16); - v += __shfl_xor_sync(0xffffffff, v, 8); - v += __shfl_xor_sync(0xffffffff, v, 2); - v += __shfl_xor_sync(0xffffffff, v, 1); - if (lane_id == 0 || lane_id == 4) - { - if (Bias) - { - output[row_id] = __float2half_rn(Activation::apply(v + v_bias)); - } - else - { - output[row_id] = __float2half_rn(Activation::apply(v)); - } - } -} - -template class Activation, int K = 0> -__global__ void int4_weight_only_gemv_interleave(const int8_t* weight, const half* input, const half* scale_list, - const half* bias, half* output, const int n, const int k_) -{ - using Converter = FastWeightOnlyHalfConverter; - int k = K != 0 ? 
K : k_; - uint8_t vec_weight[16]; - half vec_input[32]; - half vec_weight_f16[32]; - int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32; - int tile_id = blockIdx.x * blockDim.x / 32 + warp_id; - // Every four rows of the original weights are interleaved into a row with stride of 64, so if each thread - // processes 32 elements(for int4, we can use ldg.128 to load weights), then every group of two adjacent threads - // will alternately process four different row weights - // for example - // every 256 consecutive int4 elements [256*i, 256*(i+1)-1] of row N under interleave layout, - // the first 64 are from [64*i, 64*(i+1)-1] of row 4N before interleaving, - // and the second 64 are from [64*i, 64*(i+1)-1] of row 4N+1 before interleaving, and so on. - // So if each thread loads 32 int4 elements, then the elements of each 2 adjacent threads of each 8 - // consecutive threads will come from row 4N ~ 4N+3 respectively before interleaving. - int row_id = tile_id * 4 + ((lane_id % 8) / 2); - weight += tile_id * k / 2 * 4; - float v = 0.f, v_bias; - half scale = scale_list[row_id]; - if (Bias) - { - v_bias = __half2float(bias[row_id]); - } -#pragma unroll - for (int i = lane_id * 32; i < k * 4; i += 32 * 32) - { - load(vec_weight, weight + i / 2); - load(vec_input, input + i / 256 * 64 + (i % 64)); - load(vec_input + 8, input + i / 256 * 64 + (i % 64) + 8); - load(vec_input + 16, input + i / 256 * 64 + (i % 64) + 16); - load(vec_input + 24, input + i / 256 * 64 + (i % 64) + 24); -#pragma unroll - for (int p = 0; p < 32; p += Converter::kHalfLength) - { - // The rearrangement here counteracts the effect of cutlass::add_bias_and_interleave_int4s_inplace - // Input int8 data layout - // [elt_7 elt_5 elt_3 elt_1 elt_6 elt_4 elt_2 elt_0] (each elt occupies 4 bits) - // - // Converted fp16 data layout - // [elt_7 elt_6 elt_5 elt_4 elt_3 elt_2 elt_1 elt_0] (each elt occupies 16 bits) - Converter::convert(vec_weight_f16 + p, vec_weight + p / 2, scale); - } -#pragma unroll - for (int p = 0; p < 32; ++p) - { - // The index remapping here is to counteracts the effect of cutlass::permute_B_rows_for_mixed_gemm - // input 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ... 
31 - // weight 0 1 8 9 16 17 24 25 2 3 10 11 18 19 26 27 4 5 12 13 20 21 28 29 6 7 14 15 22 23 30 31 - v += __half2float(__hmul(vec_input[p], vec_weight_f16[8 * ((p % 8) / 2) + p % 2 + 2 * (p / 8)])); - } - } - v += __shfl_xor_sync(0xffffffff, v, 16); - v += __shfl_xor_sync(0xffffffff, v, 8); - v += __shfl_xor_sync(0xffffffff, v, 1); - if (lane_id == 0 || lane_id == 2 || lane_id == 4 || lane_id == 6) - { - if (Bias) - { - output[row_id] = __float2half_rn(Activation::apply(v + v_bias)); - } - else - { - output[row_id] = __float2half_rn(Activation::apply(v)); - } - } -} - -template class Activation, int K = 0> -void weight_only_gemv_kernel_launcher(const int8_t* weight, const half* input, const half* scale_list, const half* bias, - half* output, const int k, const int n, dim3 grid, dim3 block, QuantType qtype, cudaStream_t stream) -{ - if (qtype == QuantType::PACKED_INT4_WEIGHT_ONLY) - { - grid.x /= 2; - int4_weight_only_gemv_interleave - <<>>(weight, input, scale_list, bias, output, n, k); - } - else if (qtype == QuantType::INT8_WEIGHT_ONLY) - { - int8_weight_only_gemv_interleave - <<>>(weight, input, scale_list, bias, output, n, k); - } -} - -#define INVOKE_WEIGHT_ONLY_GEMV(ActivationType, K) \ - do \ - { \ - if (bias) \ - { \ - weight_only_gemv_kernel_launcher( \ - weight, input, scale_list, bias, output, k, n, grid, block, qtype, stream); \ - } \ - else \ - { \ - weight_only_gemv_kernel_launcher( \ - weight, input, scale_list, bias, output, k, n, grid, block, qtype, stream); \ - } \ - } while (0); - -#define SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, K) \ - case K: \ - { \ - INVOKE_WEIGHT_ONLY_GEMV(ActivationType, K); \ - break; \ - } -#define INVOKE_WEIGHT_ONLY_KERNEL_FOR_SPECIFIED_SHAPE(ActivationType) \ - do \ - { \ - switch (k) \ - { \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 1536) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 2048) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 2560) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 4096) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 4608) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 5120) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 6144) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 7680) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 8192) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 10240) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 12288) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 15360) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 16384) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 20480) \ - default: \ - { \ - INVOKE_WEIGHT_ONLY_GEMV(ActivationType, 0); \ - break; \ - } \ - } \ - } while (0); -#define INVOKE_WEIGHT_ONLY_KERNEL_FOR_DIFFERENT_ACT() \ - do \ - { \ - switch (activation) \ - { \ - case ActivationType::Gelu: \ - { \ - INVOKE_WEIGHT_ONLY_KERNEL_FOR_SPECIFIED_SHAPE(GeluActivation); \ - break; \ - } \ - case ActivationType::Relu: \ - { \ - INVOKE_WEIGHT_ONLY_KERNEL_FOR_SPECIFIED_SHAPE(ReluActivation); \ - break; \ - } \ - case ActivationType::Identity: \ - { \ - INVOKE_WEIGHT_ONLY_KERNEL_FOR_SPECIFIED_SHAPE(IdentityActivation); \ - break; \ - } \ - default: \ - { \ - assert(false); \ - break; \ - } \ - } \ - } while (0); - -template <> -void weight_only_gemv_launcher(const half* input, const int8_t* weight, const half* scale_list, const half* bias, - half* output, const int k, const int n, ActivationType activation, QuantType qtype, cudaStream_t stream) -{ - dim3 block(512); - dim3 grid(n / 32); - INVOKE_WEIGHT_ONLY_KERNEL_FOR_DIFFERENT_ACT(); -} - -} // namespace kernels -} // 
namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.h b/cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.h deleted file mode 100644 index 7deb48cd70e..00000000000 --- a/cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "tensorrt_llm/common/int8Utils.cuh" -#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h" -#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" -#include -#include -#include -#include -#include -#include - -namespace tensorrt_llm -{ -namespace kernels -{ - -using cutlass_kernels::QuantType; -using cutlass_kernels::ActivationType; - -template -void weight_only_gemv_launcher(const AT* input, const WT* weight, const AT* scale_list, const AT* bias, AT* output, - const int k, const int n, ActivationType activation, QuantType qtype, cudaStream_t stream) -{ - assert(false); -} - -template <> -void weight_only_gemv_launcher(const half* input, const int8_t* weight, const half* scale_list, const half* bias, - half* output, const int k, const int n, ActivationType activation, QuantType qtype, cudaStream_t stream); -} // namespace kernels -} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu b/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu index 2c199f0eca0..61059967b2e 100644 --- a/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu +++ b/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu @@ -28,7 +28,7 @@ namespace layers __global__ void update_indir_cache_kernel(int* tgt_indir_cache, const int* src_indir_cache, const int** parent_ids, const bool* finished, const int* sequence_lengths, const int* input_lengths, int batch_dim, int local_batch_size, - int beam_width, int max_seq_len, int max_input_length) + int beam_width, int max_seq_len) { int time_step = threadIdx.x + blockIdx.x * blockDim.x; int bb_id = threadIdx.y + blockIdx.y * blockDim.y; @@ -36,34 +36,31 @@ __global__ void update_indir_cache_kernel(int* tgt_indir_cache, const int* src_i const int input_length{input_lengths == nullptr ? 0 : input_lengths[bb_id]}; const int batch_id = bb_id / beam_width; const int beam_id = bb_id % beam_width; - if (bb_id >= beam_width * local_batch_size || time_step < input_length || time_step < max_input_length - || finished[bb_id]) + if (bb_id >= beam_width * local_batch_size || time_step < input_length || finished[bb_id]) { return; } int time_step_circ = time_step % max_seq_len; // FIXME: we will remove all paddings later (@boyang) // Skip input paddings when updating the indir cache table. - const int pad_len = max_input_length - input_length; - time_step_circ = time_step_circ >= max_input_length ? 
(time_step_circ - pad_len) : time_step_circ; const int src_beam = parent_ids[batch_id][beam_id * max_seq_len + current_step]; - const uint tgt_offset = batch_id * beam_width * max_seq_len + beam_id * max_seq_len + time_step_circ; - const uint src_offset = batch_id * beam_width * max_seq_len + src_beam * max_seq_len + time_step_circ; + const uint32_t tgt_offset = batch_id * beam_width * max_seq_len + beam_id * max_seq_len + time_step_circ; + const uint32_t src_offset = batch_id * beam_width * max_seq_len + src_beam * max_seq_len + time_step_circ; tgt_indir_cache[tgt_offset] = (time_step == current_step) ? beam_id : src_indir_cache[src_offset]; } void update_indir_cache_kernelLauncher(int* tgt_indir_cache, const int* src_indir_cache, const int** parent_ids, const bool* finished, const int* sequence_lengths, const int* input_lengths, int batch_dim, int local_batch_size, - int beam_width, int max_seq_len, int max_input_length, cudaStream_t stream) + int beam_width, int max_seq_len, cudaStream_t stream) { const dim3 block(32); // Update indirections steps [input_length[bb_id], sequence_lengths[bb_id]], included const dim3 grid((max_seq_len + block.x - 1) / block.x, local_batch_size * beam_width); update_indir_cache_kernel<<>>(tgt_indir_cache, src_indir_cache, parent_ids, finished, - sequence_lengths, input_lengths, batch_dim, local_batch_size, beam_width, max_seq_len, max_input_length); + sequence_lengths, input_lengths, batch_dim, local_batch_size, beam_width, max_seq_len); } template @@ -129,16 +126,16 @@ void BaseBeamSearchLayer::forward(BeamSearchOutputParams& outputs, ForwardPar TLLM_LOG_DEBUG("%s", __PRETTY_FUNCTION__); Tensor& output_ids_ptr = outputs.output_ids_ptr; - const int batch_size{output_ids_ptr.shape[0]}; - const int beam_width{output_ids_ptr.shape[1]}; - const int max_seq_len{output_ids_ptr.shape[2]}; + const auto batch_size = static_cast(output_ids_ptr.shape[0]); + const auto beam_width = static_cast(output_ids_ptr.shape[1]); + const auto max_seq_len = static_cast(output_ids_ptr.shape[2]); allocateBuffer(batch_size, beam_width); TLLM_CHECK_WITH_INFO(params.ite == 0, "Pipeline Parallelism is not supported yet !"); const int ite{params.ite}; Tensor const& logits = params.logits; - const int local_batch_size = logits.shape[0]; + const auto local_batch_size = logits.shape[0]; const T* embedding_bias = params.embedding_bias ? 
params.embedding_bias->template getPtr() : nullptr; @@ -148,8 +145,8 @@ void BaseBeamSearchLayer::forward(BeamSearchOutputParams& outputs, ForwardPar invokeAddBiasApplyPenalties(logits.getPtr(), output_ids_ptr.template getPtr(), outputs.parent_ids_ptr.template getPtr(), input_lengths, sequence_length, embedding_bias, ite, - params.max_input_length, local_batch_size, batch_size, beam_width, vocab_size_, vocab_size_padded_, end_ids, - mTemperature, mRepetitionPenalty, mRepetitionPenaltyType, mMinLength, max_seq_len, stream_); + local_batch_size, batch_size, beam_width, vocab_size_, vocab_size_padded_, end_ids, mTemperature, + mRepetitionPenalty, mRepetitionPenaltyType, mMinLength, max_seq_len, stream_); sync_check_cuda_error(); invokeSoftMax(outputs, params); @@ -159,8 +156,7 @@ void BaseBeamSearchLayer::forward(BeamSearchOutputParams& outputs, ForwardPar update_indir_cache_kernelLauncher(outputs.tgt_cache_indirection.template getPtr(), params.src_cache_indirection.template getPtr(), outputs.parent_ids_ptr.template getPtr(), outputs.finished->template getPtr(), - sequence_length, input_lengths, batch_size, local_batch_size, beam_width, max_seq_len, - params.max_input_length, stream_); + sequence_length, input_lengths, batch_size, local_batch_size, beam_width, max_seq_len, stream_); sync_check_cuda_error(); } sync_check_cuda_error(); diff --git a/cpp/tensorrt_llm/layers/baseBeamSearchLayer.h b/cpp/tensorrt_llm/layers/baseBeamSearchLayer.h index f0fd7ede40c..ec3bec7625d 100644 --- a/cpp/tensorrt_llm/layers/baseBeamSearchLayer.h +++ b/cpp/tensorrt_llm/layers/baseBeamSearchLayer.h @@ -54,17 +54,15 @@ class BaseBeamSearchLayer : public BaseLayer class ForwardParams : public SoftmaxParams { public: - ForwardParams(int step, int ite, int max_input_length, tc::Tensor logits, tc::Tensor endIds, - tc::Tensor src_cache_indirection, int max_seq_len) + ForwardParams( + int step, int ite, tc::Tensor logits, tc::Tensor endIds, tc::Tensor src_cache_indirection, int max_seq_len) : SoftmaxParams(step, ite, std::move(logits), std::move(endIds)) - , max_input_length{max_input_length} , src_cache_indirection{std::move(src_cache_indirection)} , max_seq_len{max_seq_len} { } // mandatory parameters - int max_input_length; int max_seq_len; tc::Tensor src_cache_indirection; // [local_batch_size, beam_width, max_seq_len] diff --git a/cpp/tensorrt_llm/layers/baseSamplingLayer.cpp b/cpp/tensorrt_llm/layers/baseSamplingLayer.cpp index 1cc7ab27ca7..0f8707b5f8a 100644 --- a/cpp/tensorrt_llm/layers/baseSamplingLayer.cpp +++ b/cpp/tensorrt_llm/layers/baseSamplingLayer.cpp @@ -177,7 +177,7 @@ void BaseSamplingLayer::forward(DecodingOutputParams& outputs, ForwardParams auto const local_batch_size = params.logits.shape[0]; auto const ite = params.ite; auto const step = params.step; - auto const max_input_length = params.max_input_length; + auto* const input_lengths = params.input_lengths ? params.input_lengths->template getPtr() : nullptr; auto* logits = params.logits.template getPtr(); @@ -219,24 +219,16 @@ void BaseSamplingLayer::forward(DecodingOutputParams& outputs, ForwardParams = params.input_lengths ? 
params.input_lengths->template getPtr() : nullptr; invokeBatchApplyRepetitionPenalty(logits, repetition_penalty_buf_ + ite * local_batch_size, outputs.output_ids_ptr.template getPtr(), outputs.sequence_length->getPtr(), - batch_size, local_batch_size, vocab_size_padded_, input_lengths, max_input_length, - repetition_penalty_type_, params.max_seq_len, stream_); + batch_size, local_batch_size, vocab_size_padded_, input_lengths, repetition_penalty_type_, + params.max_seq_len, stream_); sync_check_cuda_error(); } } - const int num_generated_tokens = step - max_input_length; - const auto min_lengths = std::begin(mMinLengths) + ite * local_batch_size; - const bool invoke_min_length_penalty = std::any_of( - min_lengths, min_lengths + local_batch_size, [&](int min_length) { return min_length > num_generated_tokens; }); - if (invoke_min_length_penalty) - { - auto* end_ids = params.end_ids.template getPtr(); - invokeMinLengthPenalty(logits, min_lengths_buf_ + ite * local_batch_size, end_ids, - outputs.sequence_length->getPtr(), max_input_length, local_batch_size, vocab_size_padded_, - stream_); - sync_check_cuda_error(); - } + auto* end_ids = params.end_ids.template getPtr(); + invokeMinLengthPenalty(logits, min_lengths_buf_ + ite * local_batch_size, end_ids, + outputs.sequence_length->getPtr(), input_lengths, local_batch_size, vocab_size_padded_, stream_); + sync_check_cuda_error(); #undef ALL_OF runSampling(outputs, params); diff --git a/cpp/tensorrt_llm/layers/baseSamplingLayer.h b/cpp/tensorrt_llm/layers/baseSamplingLayer.h index fa35a261ec0..3ac1f699717 100644 --- a/cpp/tensorrt_llm/layers/baseSamplingLayer.h +++ b/cpp/tensorrt_llm/layers/baseSamplingLayer.h @@ -54,15 +54,13 @@ class BaseSamplingLayer : public BaseLayer class ForwardParams : public DecodingParams { public: - ForwardParams(int step, int ite, int max_input_length, tc::Tensor logits, tc::Tensor end_ids, int max_seq_len) + ForwardParams(int step, int ite, tc::Tensor logits, tc::Tensor end_ids, int max_seq_len) : DecodingParams{step, ite, std::move(logits), std::move(end_ids)} - , max_input_length{max_input_length} , max_seq_len{max_seq_len} { } // mandatory parameters - int max_input_length; int max_seq_len; // optional parameters diff --git a/cpp/tensorrt_llm/layers/dynamicDecodeLayer.cpp b/cpp/tensorrt_llm/layers/dynamicDecodeLayer.cpp index d597d1510b3..7849debf008 100644 --- a/cpp/tensorrt_llm/layers/dynamicDecodeLayer.cpp +++ b/cpp/tensorrt_llm/layers/dynamicDecodeLayer.cpp @@ -267,8 +267,7 @@ void DynamicDecodeLayer::forward(OutputParams& outputs, ForwardParams const& } // common inputs - auto const max_input_length = params.max_input_length; - auto const& end_id = params.end_ids; + auto const& end_ids = params.end_ids; // dynamic decode GPT if (beam_width > 1) @@ -286,7 +285,7 @@ void DynamicDecodeLayer::forward(OutputParams& outputs, ForwardParams const& const size_t dynamic_decode_batch_size = has_diff_runtime_args_ ? 
1 : local_batch_size; const int dynamic_decode_total_iteration = local_batch_size / dynamic_decode_batch_size; - for (uint dynamic_ite = ite * dynamic_decode_total_iteration; + for (uint32_t dynamic_ite = ite * dynamic_decode_total_iteration; dynamic_ite < (ite + 1) * dynamic_decode_total_iteration; ++dynamic_ite) { const int dynamic_id_offset = dynamic_ite * dynamic_decode_batch_size * beam_width; @@ -295,9 +294,9 @@ void DynamicDecodeLayer::forward(OutputParams& outputs, ForwardParams const& auto const logits_offset = logits.slice( {dynamic_decode_batch_size, logits.shape[1], logits.shape[2]}, dynamic_decode_vocab_size_units_offset); auto const end_id_offset - = end_id.slice({dynamic_decode_batch_size}, dynamic_ite * dynamic_decode_batch_size); - typename BaseBeamSearchLayer::ForwardParams dynamic_decode_input_tensors{ - step, ite, max_input_length, logits_offset, end_id_offset, *params.src_cache_indirection, max_seq_len}; + = end_ids.slice({dynamic_decode_batch_size}, dynamic_ite * dynamic_decode_batch_size); + typename BaseBeamSearchLayer::ForwardParams dynamic_decode_input_tensors{step, ite, logits_offset, + end_id_offset, *params.src_cache_indirection, static_cast(max_seq_len)}; dynamic_decode_input_tensors.embedding_bias = params.embedding_bias; @@ -337,9 +336,9 @@ void DynamicDecodeLayer::forward(OutputParams& outputs, ForwardParams const& Tensor const logits_slice{ logits.slice({local_batch_size, beam_width, logits.shape[2]}, local_batch_offset * logits.shape[2])}; - Tensor const end_id_slice{end_id.slice({local_batch_size}, ite * local_batch_size)}; + Tensor const end_id_slice{end_ids.slice({local_batch_size}, ite * local_batch_size)}; typename BaseSamplingLayer::ForwardParams decode_input_tensors{ - step, ite, max_input_length, logits_slice, end_id_slice, max_seq_len}; + step, ite, logits_slice, end_id_slice, static_cast(max_seq_len)}; decode_input_tensors.embedding_bias = params.embedding_bias; @@ -368,11 +367,13 @@ void DynamicDecodeLayer::forward(OutputParams& outputs, ForwardParams const& } if (outputs.output_log_probs) { + auto const generationStep = step - params.max_input_length; + TLLM_CHECK(generationStep >= 0); Tensor& output_log_probs = outputs.output_log_probs.value(); - size_t step_offset = (step - max_input_length) * batch_size * beam_width; - decode_outputs.output_log_probs = output_log_probs.slice( - {output_log_probs.shape[0] - (step - max_input_length), local_batch_size * beam_width}, - step_offset + local_batch_offset); + size_t step_offset = generationStep * batch_size * beam_width; + decode_outputs.output_log_probs + = output_log_probs.slice({output_log_probs.shape[0] - generationStep, local_batch_size * beam_width}, + step_offset + local_batch_offset); } // Run topk / topp decode layers. 
diff --git a/cpp/tensorrt_llm/layers/onlineBeamSearchLayer.cu b/cpp/tensorrt_llm/layers/onlineBeamSearchLayer.cu index 340662a24c1..1b1dda10fd1 100644 --- a/cpp/tensorrt_llm/layers/onlineBeamSearchLayer.cu +++ b/cpp/tensorrt_llm/layers/onlineBeamSearchLayer.cu @@ -98,12 +98,12 @@ void OnlineBeamSearchLayer::invokeSoftMax(BeamSearchOutputParams& outputs, So { TLLM_LOG_DEBUG("%s", __PRETTY_FUNCTION__); Tensor const& output_ids_ptr = outputs.output_ids_ptr; - const int batch_size{output_ids_ptr.shape[0]}; - const int beam_width{output_ids_ptr.shape[1]}; - const int max_seq_len{output_ids_ptr.shape[2]}; + const auto batch_size = static_cast(output_ids_ptr.shape[0]); + const auto beam_width = static_cast(output_ids_ptr.shape[1]); + const auto max_seq_len = static_cast(output_ids_ptr.shape[2]); const int ite{params.ite}; Tensor const& logits{params.logits}; - const int local_batch_size = logits.shape[0]; + const auto local_batch_size = logits.shape[0]; BeamHypotheses beamHypotheses; auto* const end_ids = params.end_ids.template getPtr(); diff --git a/cpp/tensorrt_llm/layers/topPSamplingLayer.cu b/cpp/tensorrt_llm/layers/topPSamplingLayer.cu index 4cd9831bb23..f6bb8c8f8ae 100644 --- a/cpp/tensorrt_llm/layers/topPSamplingLayer.cu +++ b/cpp/tensorrt_llm/layers/topPSamplingLayer.cu @@ -232,7 +232,6 @@ void TopPSamplingLayer::runSampling(DecodingOutputParams& outputs, DecodingPa auto const batch_size = outputs.output_ids_ptr.shape[0]; auto const local_batch_size = params.logits.shape[0]; auto const ite = params.ite; - auto const step = params.step; // in case of skip any, the logit value is already copied and processed. auto* logits = !skip_any_ ? params.logits.template getPtr() : runtime_logits_buf_; diff --git a/cpp/tensorrt_llm/plugins/CMakeLists.txt b/cpp/tensorrt_llm/plugins/CMakeLists.txt index 0793d339f67..0a6583c851d 100755 --- a/cpp/tensorrt_llm/plugins/CMakeLists.txt +++ b/cpp/tensorrt_llm/plugins/CMakeLists.txt @@ -60,11 +60,15 @@ add_subdirectory(common) # Set gencodes list(APPEND PLUGIN_SOURCES "${PLUGIN_CU_SOURCES}") -list(APPEND PLUGIN_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/api/InferPlugin.cpp") +list(APPEND PLUGIN_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/api/tllmPlugin.cpp") # ################################# SHARED LIBRARY # ############################################################################## +if(MSVC) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS 1) +endif() + add_library(${PLUGIN_SHARED_TARGET} SHARED ${PLUGIN_SOURCES}) target_include_directories( @@ -85,12 +89,14 @@ set_target_properties( LIBRARY_OUTPUT_DIRECTORY "${TRT_OUT_DIR}" RUNTIME_OUTPUT_DIRECTORY "${TRT_OUT_DIR}") -set_target_properties( - ${PLUGIN_SHARED_TARGET} - PROPERTIES - LINK_FLAGS - "-Wl,--exclude-libs,ALL -Wl,--version-script=${PLUGIN_EXPORT_MAP} -Wl,--no-undefined" -) +if(NOT MSVC) + set_target_properties( + ${PLUGIN_SHARED_TARGET} + PROPERTIES + LINK_FLAGS + "-Wl,--exclude-libs,ALL -Wl,--version-script=${PLUGIN_EXPORT_MAP} ${UNDEFINED_FLAG}" + ) +endif() set_target_properties( ${PLUGIN_SHARED_TARGET} PROPERTIES VERSION ${TRT_VERSION} SOVERSION @@ -102,7 +108,6 @@ target_link_libraries( ${PLUGIN_SHARED_TARGET} ${CUBLAS_LIB} ${CUBLASLT_LIB} - ${CUDART_LIB} ${CUDNN_LIB} nvinfer ${CUDA_DRV_LIB} diff --git a/cpp/tensorrt_llm/plugins/api/InferPlugin.cpp b/cpp/tensorrt_llm/plugins/api/InferPlugin.cpp deleted file mode 100644 index c14452ba1b4..00000000000 --- a/cpp/tensorrt_llm/plugins/api/InferPlugin.cpp +++ /dev/null @@ -1,179 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "NvInfer.h" -#include "NvInferPlugin.h" -#include "tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h" -#include "tensorrt_llm/plugins/common/checkMacrosPlugin.h" -#include "tensorrt_llm/plugins/common/plugin.h" -#include "tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h" -#include "tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h" -#include "tensorrt_llm/plugins/identityPlugin/identityPlugin.h" -#include "tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h" -#include "tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h" -#include "tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h" -#if ENABLE_MULTI_DEVICE -#include "tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h" -#include "tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h" -#include "tensorrt_llm/plugins/ncclPlugin/recvPlugin.h" -#include "tensorrt_llm/plugins/ncclPlugin/sendPlugin.h" -#endif // ENABLE_MULTI_DEVICE -#include "tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h" -#include "tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h" -#include "tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h" -#include "tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h" -#include "tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h" -#include "tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h" -#include "tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h" -#include -#include -#include -#include -#include -#include -#include -using namespace nvinfer1; -using namespace nvinfer1::plugin; - -namespace nvinfer1 -{ - -namespace -{ - -// This singleton ensures that each plugin is only registered once for a given -// namespace and type, and attempts of duplicate registration are ignored. 
-class PluginCreatorRegistry -{ -public: - static PluginCreatorRegistry& getInstance() - { - static PluginCreatorRegistry instance; - return instance; - } - - template - void addPluginCreator(void* logger, const char* libNamespace) - { - // Make accesses to the plugin creator registry thread safe - std::lock_guard lock(mRegistryLock); - - std::string errorMsg; - std::string verboseMsg; - - std::unique_ptr pluginCreator{new CreatorType{}}; - pluginCreator->setPluginNamespace(libNamespace); - - nvinfer1::ILogger* trtLogger = static_cast(logger); - std::string pluginType = std::string{pluginCreator->getPluginNamespace()} - + "::" + std::string{pluginCreator->getPluginName()} + " version " - + std::string{pluginCreator->getPluginVersion()}; - - if (mRegistryList.find(pluginType) == mRegistryList.end()) - { - bool status = getPluginRegistry()->registerCreator(*pluginCreator, libNamespace); - if (status) - { - mRegistry.push(std::move(pluginCreator)); - mRegistryList.insert(pluginType); - verboseMsg = "Registered plugin creator - " + pluginType; - } - else - { - errorMsg = "Could not register plugin creator - " + pluginType; - } - } - else - { - verboseMsg = "Plugin creator already registered - " + pluginType; - } - - if (trtLogger) - { - if (!errorMsg.empty()) - { - trtLogger->log(ILogger::Severity::kERROR, errorMsg.c_str()); - } - - if (!verboseMsg.empty()) - { - trtLogger->log(ILogger::Severity::kVERBOSE, verboseMsg.c_str()); - } - } - } - - ~PluginCreatorRegistry() - { - std::lock_guard lock(mRegistryLock); - - // Release pluginCreators in LIFO order of registration. - while (!mRegistry.empty()) - { - mRegistry.pop(); - } - mRegistryList.clear(); - } - -private: - PluginCreatorRegistry() {} - - std::mutex mRegistryLock; - std::stack> mRegistry; - std::unordered_set mRegistryList; - -public: - PluginCreatorRegistry(PluginCreatorRegistry const&) = delete; - void operator=(PluginCreatorRegistry const&) = delete; -}; - -template -void initializePlugin(void* logger, const char* libNamespace) -{ - PluginCreatorRegistry::getInstance().addPluginCreator(logger, libNamespace); -} - -} // namespace -} // namespace nvinfer1 - -// New Plugin APIs - -extern "C" -{ - bool initLibNvInferPlugins(void* logger, const char* libNamespace) - { - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); -#if ENABLE_MULTI_DEVICE - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); -#endif // ENABLE_MULTI_DEVICE - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - return true; - } -} // extern "C" diff --git a/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp b/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp new file mode 100644 index 00000000000..aa0fde60211 --- /dev/null +++ b/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp @@ -0,0 +1,209 @@ +/* + * 
SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "tllmPlugin.h" + +#include "tensorrt_llm/common/stringUtils.h" +#include "tensorrt_llm/runtime/tllmLogger.h" + +#include "tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h" +#include "tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h" +#include "tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h" +#include "tensorrt_llm/plugins/identityPlugin/identityPlugin.h" +#include "tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h" +#include "tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h" +#include "tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h" +#if ENABLE_MULTI_DEVICE +#include "tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h" +#include "tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h" +#include "tensorrt_llm/plugins/ncclPlugin/recvPlugin.h" +#include "tensorrt_llm/plugins/ncclPlugin/sendPlugin.h" +#endif // ENABLE_MULTI_DEVICE +#include "tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h" +#include "tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h" +#include "tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h" +#include "tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h" +#include "tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h" +#include "tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h" +#include "tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h" + +#include +#include + +#include + +namespace tc = tensorrt_llm::common; + +namespace +{ + +nvinfer1::IPluginCreator* creatorPtr(nvinfer1::IPluginCreator& creator) +{ + return &creator; +} + +auto tllmLogger = tensorrt_llm::runtime::TllmLogger(); + +nvinfer1::ILogger* gLogger{&tllmLogger}; + +class GlobalLoggerFinder : public nvinfer1::ILoggerFinder +{ +public: + nvinfer1::ILogger* findLogger() override + { + return gLogger; + } +}; + +GlobalLoggerFinder gGlobalLoggerFinder{}; + +#if !defined(_MSC_VER) +__attribute__((constructor)) +#endif +void initOnLoad() +{ + auto constexpr kLoadPlugins = "TRT_LLM_LOAD_PLUGINS"; + auto const loadPlugins = std::getenv(kLoadPlugins); + if (loadPlugins && loadPlugins[0] == '1') + { + initTrtLlmPlugins(gLogger); + } +} + +bool pluginsInitialized = false; + +} // namespace + +// New Plugin APIs + +extern "C" +{ + bool initTrtLlmPlugins(void* logger, const char* libNamespace) + { + if (pluginsInitialized) + return true; + + if (logger) + { + gLogger = static_cast(logger); + } + setLoggerFinder(&gGlobalLoggerFinder); + + auto registry = getPluginRegistry(); + std::int32_t nbCreators; + auto creators = getPluginCreators(nbCreators); + + for (std::int32_t i = 0; i < nbCreators; ++i) + { + auto const creator = creators[i]; + creator->setPluginNamespace(libNamespace); + 
registry->registerCreator(*creator, libNamespace); + if (gLogger) + { + auto const msg = tc::fmtstr("Registered plugin creator %s version %s in namespace %s", + creator->getPluginName(), creator->getPluginVersion(), libNamespace); + gLogger->log(nvinfer1::ILogger::Severity::kVERBOSE, msg.c_str()); + } + } + + pluginsInitialized = true; + return true; + } + + [[maybe_unused]] void setLoggerFinder([[maybe_unused]] nvinfer1::ILoggerFinder* finder) + { + tensorrt_llm::plugins::api::LoggerFinder::getInstance().setLoggerFinder(finder); + } + + [[maybe_unused]] nvinfer1::IPluginCreator* const* getPluginCreators(std::int32_t& nbCreators) + { + static tensorrt_llm::plugins::IdentityPluginCreator identityPluginCreator; + static tensorrt_llm::plugins::BertAttentionPluginCreator bertAttentionPluginCreator; + static tensorrt_llm::plugins::GPTAttentionPluginCreator gptAttentionPluginCreator; + static tensorrt_llm::plugins::GemmPluginCreator gemmPluginCreator; +#if ENABLE_MULTI_DEVICE + static tensorrt_llm::plugins::SendPluginCreator sendPluginCreator; + static tensorrt_llm::plugins::RecvPluginCreator recvPluginCreator; + static tensorrt_llm::plugins::AllreducePluginCreator allreducePluginCreator; + static tensorrt_llm::plugins::AllgatherPluginCreator allgatherPluginCreator; +#endif // ENABLE_MULTI_DEVICE + static tensorrt_llm::plugins::LayernormPluginCreator layernormPluginCreator; + static tensorrt_llm::plugins::RmsnormPluginCreator rmsnormPluginCreator; + static tensorrt_llm::plugins::SmoothQuantGemmPluginCreator smoothQuantGemmPluginCreator; + static tensorrt_llm::plugins::LayernormQuantizationPluginCreator layernormQuantizationPluginCreator; + static tensorrt_llm::plugins::QuantizePerTokenPluginCreator quantizePerTokenPluginCreator; + static tensorrt_llm::plugins::QuantizeTensorPluginCreator quantizeTensorPluginCreator; + static tensorrt_llm::plugins::RmsnormQuantizationPluginCreator rmsnormQuantizationPluginCreator; + static tensorrt_llm::plugins::WeightOnlyGroupwiseQuantMatmulPluginCreator + weightOnlyGroupwiseQuantMatmulPluginCreator; + static tensorrt_llm::plugins::WeightOnlyQuantMatmulPluginCreator weightOnlyQuantMatmulPluginCreator; + static tensorrt_llm::plugins::LookupPluginCreator lookupPluginCreator; + + static std::array pluginCreators + = { creatorPtr(identityPluginCreator), + creatorPtr(bertAttentionPluginCreator), + creatorPtr(gptAttentionPluginCreator), + creatorPtr(gemmPluginCreator), +#if ENABLE_MULTI_DEVICE + creatorPtr(sendPluginCreator), + creatorPtr(recvPluginCreator), + creatorPtr(allreducePluginCreator), + creatorPtr(allgatherPluginCreator), +#endif // ENABLE_MULTI_DEVICE + creatorPtr(layernormPluginCreator), + creatorPtr(rmsnormPluginCreator), + creatorPtr(smoothQuantGemmPluginCreator), + creatorPtr(layernormQuantizationPluginCreator), + creatorPtr(quantizePerTokenPluginCreator), + creatorPtr(quantizeTensorPluginCreator), + creatorPtr(rmsnormQuantizationPluginCreator), + creatorPtr(weightOnlyGroupwiseQuantMatmulPluginCreator), + creatorPtr(weightOnlyQuantMatmulPluginCreator), + creatorPtr(lookupPluginCreator), + }; + nbCreators = pluginCreators.size(); + return pluginCreators.data(); + } + +} // extern "C" + +namespace tensorrt_llm::plugins::api +{ +LoggerFinder& tensorrt_llm::plugins::api::LoggerFinder::getInstance() noexcept +{ + static LoggerFinder instance; + return instance; +} + +void LoggerFinder::setLoggerFinder(nvinfer1::ILoggerFinder* finder) +{ + std::lock_guard lk(mMutex); + if (mLoggerFinder == nullptr && finder != nullptr) + { + mLoggerFinder = finder; + } +} + 
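// Illustrative usage sketch, not part of the patch: a client links this library and
// registers the TRT-LLM creators once before building or deserializing an engine.
// initTrtLlmPlugins() and getPluginCreators() are declared in tllmPlugin.h (included
// above); the helper name below is invented for the example.
inline bool registerTrtLlmPluginsForExample(nvinfer1::ILogger* logger)
{
    // Registers every creator returned by getPluginCreators() under the default
    // "tensorrt_llm" namespace and installs the logger used for subsequent messages.
    return initTrtLlmPlugins(logger);
}
// When TensorRT dlopen()s this library instead, exporting TRT_LLM_LOAD_PLUGINS=1 makes
// the initOnLoad() hook above perform the same registration with the default logger.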
+nvinfer1::ILogger* LoggerFinder::findLogger() +{ + std::lock_guard lk(mMutex); + if (mLoggerFinder != nullptr) + { + return mLoggerFinder->findLogger(); + } + return nullptr; +} +} // namespace tensorrt_llm::plugins::api diff --git a/cpp/tensorrt_llm/plugins/api/tllmPlugin.h b/cpp/tensorrt_llm/plugins/api/tllmPlugin.h new file mode 100644 index 00000000000..bfc034674c6 --- /dev/null +++ b/cpp/tensorrt_llm/plugins/api/tllmPlugin.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace tensorrt_llm::plugins::api +{ + +auto constexpr kDefaultNamespace = "tensorrt_llm"; + +class LoggerFinder : public nvinfer1::ILoggerFinder +{ +public: + //! Set the logger finder. + void setLoggerFinder(nvinfer1::ILoggerFinder* finder); + + //! Get the logger. + nvinfer1::ILogger* findLogger() override; + + static LoggerFinder& getInstance() noexcept; + +private: + LoggerFinder() = default; + + nvinfer1::ILoggerFinder* mLoggerFinder{nullptr}; + std::mutex mMutex; +}; + +} // namespace tensorrt_llm::plugins::api + +extern "C" +{ + // This function is used for explicitly registering the TRT-LLM plugins and the default logger. + bool initTrtLlmPlugins(void* logger, const char* libNamespace = tensorrt_llm::plugins::api::kDefaultNamespace); + + // The functions below are used by TensorRT to when loading a shared plugin library with automatic registering. + // see https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#generating-plugin-library + TENSORRTAPI [[maybe_unused]] void setLoggerFinder([[maybe_unused]] nvinfer1::ILoggerFinder* finder); + TENSORRTAPI [[maybe_unused]] nvinfer1::IPluginCreator* const* getPluginCreators(int32_t& nbCreators); +} diff --git a/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.cpp b/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.cpp index 830000b0477..1ccdef46e48 100644 --- a/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.cpp @@ -14,22 +14,22 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h" +#include "bertAttentionPlugin.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h" #include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/kernels/unfusedAttentionKernels.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; -using namespace tensorrt_llm::common; -using nvinfer1::plugin::BertAttentionPluginCreator; -using nvinfer1::plugin::BertAttentionPlugin; -using nvinfer1::plugin::nextWorkspacePtr; +namespace tc = tensorrt_llm::common; + +using tensorrt_llm::plugins::BertAttentionPluginCreator; +using tensorrt_llm::plugins::BertAttentionPlugin; static const char* BERT_ATTENTION_PLUGIN_VERSION{"1"}; static const char* BERT_ATTENTION_PLUGIN_NAME{"BertAttention"}; PluginFieldCollection BertAttentionPluginCreator::mFC{}; -std::vector BertAttentionPluginCreator::mPluginAttributes; +std::vector BertAttentionPluginCreator::mPluginAttributes; BertAttentionPlugin::BertAttentionPlugin(int num_heads, int head_size, float q_scaling, bool qk_half_accum, ContextFMHAType context_fmha_type, nvinfer1::DataType type) @@ -56,7 +56,7 @@ BertAttentionPlugin::BertAttentionPlugin(const void* data, size_t length) read(d, mEnableContextFMHA); read(d, mFMHAForceFP32Acc); read(d, mType); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -71,7 +71,7 @@ nvinfer1::IPluginV2DynamicExt* BertAttentionPlugin::clone() const noexcept nvinfer1::DimsExprs BertAttentionPlugin::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept { - PLUGIN_ASSERT(outputIndex == 0); + TLLM_CHECK(outputIndex == 0); auto ret = inputs[0]; ret.d[2] = exprBuilder.constant(ret.d[2]->getConstantValue() / 3); return ret; @@ -139,7 +139,7 @@ size_t BertAttentionPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc* i workspaces[8] = qk_buf_float_size; workspaces[9] = padding_offset_size; - return plugin::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); + return tensorrt_llm::plugins::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); } template @@ -168,7 +168,7 @@ int BertAttentionPlugin::enqueueImpl(const nvinfer1::PluginTensorDesc* inputDesc T* context_buf_ = (T*) (outputs[0]); auto cublasHandle = mCublasWrapper->getCublasHandle(); - PLUGIN_CUBLASASSERT(cublasSetStream(cublasHandle, stream)); + TLLM_CUDA_CHECK(cublasSetStream(cublasHandle, stream)); mCublasWrapper->setStream(stream); mCublasWrapper->setWorkspace(workspace); if (inputDesc[0].type == DataType::kHALF) @@ -225,9 +225,9 @@ int BertAttentionPlugin::enqueueImpl(const nvinfer1::PluginTensorDesc* inputDesc // Padding offset = nullptr here (remove padding is not supported). 
invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, v_buf_2_, const_cast(attention_input), input_lengths, nullptr, request_batch_size, request_seq_len, batch_size * input_seq_len, mNumHeads, mNumHeads, mHeadSize, - mEnableContextFMHA, 0, PositionEmbeddingType::kLEARNED_ABSOLUTE, (float*) nullptr, 0, stream); + mEnableContextFMHA, 0, 0.0f, 0.0f, PositionEmbeddingType::kLEARNED_ABSOLUTE, (float*) nullptr, 0, stream); - const cudaDataType_t gemm_data_type = CudaDataType::value; + const auto gemm_data_type = tc::CudaDataType::value; const int attention_seq_len_1 = request_seq_len; // q length const int attention_seq_len_2 = request_seq_len; // kv length const T qk_scale = static_cast(1.0f / (sqrtf(mHeadSize * 1.0f) * q_scaling)); @@ -338,7 +338,7 @@ int BertAttentionPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, nvinfer1::DataType BertAttentionPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(index == 0); return inputTypes[0]; } @@ -363,10 +363,10 @@ int BertAttentionPlugin::initialize() noexcept { auto cublasHandle = getCublasHandle(); auto cublasLtHandle = getCublasLtHandle(); - mCublasAlgoMap = new cublasAlgoMap(GEMM_CONFIG); + mCublasAlgoMap = new tc::cublasAlgoMap(GEMM_CONFIG); mCublasWrapperMutex = new std::mutex(); mCublasWrapper - = new cublasMMWrapper(cublasHandle, cublasLtHandle, nullptr, mCublasAlgoMap, mCublasWrapperMutex, nullptr); + = new tc::cublasMMWrapper(cublasHandle, cublasLtHandle, nullptr, mCublasAlgoMap, mCublasWrapperMutex, nullptr); if (mEnableContextFMHA) { mFMHARunner = new FusedMHARunnerV2(DATA_TYPE_FP16, mNumHeads, mHeadSize, mQScaling); @@ -415,16 +415,6 @@ void BertAttentionPlugin::serialize(void* buffer) const noexcept void BertAttentionPlugin::terminate() noexcept {} -void BertAttentionPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* BertAttentionPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// BertAttentionPluginCreator::BertAttentionPluginCreator() @@ -470,32 +460,32 @@ IPluginV2* BertAttentionPluginCreator::createPlugin(const char* name, const Plug const char* attrName = fields[i].name; if (!strcmp(attrName, "num_heads")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); num_heads = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "head_size")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); head_size = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "q_scaling")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kFLOAT32); q_scaling = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "enable_qk_half_accum")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT8); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT8); qk_half_accum = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "context_fmha_type")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT8); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT8); context_fmha_type = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); 
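            // Illustrative field set for this creator (values invented for the example): a caller
            // building the network would pass a PluginFieldCollection along the lines of
            //   num_heads            = 16    (PluginFieldType::kINT32)
            //   head_size            = 64    (PluginFieldType::kINT32)
            //   q_scaling            = 1.0f  (PluginFieldType::kFLOAT32)
            //   enable_qk_half_accum = 0     (PluginFieldType::kINT8)
            //   context_fmha_type    = 0     (PluginFieldType::kINT8)
            //   type_id              = 1     (PluginFieldType::kINT32, the integer value of the
            //                                 nvinfer1::DataType to run in, e.g. kHALF)
            // and createPlugin() maps each field onto the constructor arguments parsed here.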
type = static_cast(*(static_cast(fields[i].data))); } } @@ -529,13 +519,3 @@ IPluginV2* BertAttentionPluginCreator::deserializePlugin( } return nullptr; } - -void BertAttentionPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* BertAttentionPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h b/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h index 4b68f93e3b7..d8c04d9221c 100644 --- a/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h +++ b/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef TRT_BERT_ATTENTION_PLUGIN_H -#define TRT_BERT_ATTENTION_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h" @@ -27,12 +26,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class BertAttentionPlugin : public IPluginV2DynamicExt +class BertAttentionPlugin : public BasePlugin { public: BertAttentionPlugin() = delete; @@ -74,12 +71,9 @@ class BertAttentionPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; int mNumHeads; int mHeadSize; @@ -101,7 +95,7 @@ class BertAttentionPlugin : public IPluginV2DynamicExt tensorrt_llm::common::cublasMMWrapper* mCublasWrapper; }; -class BertAttentionPluginCreator : public IPluginCreator +class BertAttentionPluginCreator : public BaseCreator { public: BertAttentionPluginCreator(); @@ -117,17 +111,9 @@ class BertAttentionPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_BERT_ATTENTION_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.cpp b/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.cpp index be087319cb5..029c638feb8 100644 --- a/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.cpp @@ -15,135 +15,21 @@ * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/common/checkMacrosPlugin.h" -#include -#include -#include +#include "checkMacrosPlugin.h" -namespace nvinfer1 -{ -namespace plugin -{ - -namespace -{ +#include "tensorrt_llm/common/logger.h" -// This will be populated by the logger supplied by the user to initLibNvInferPlugins() -ILogger* gLogger{}; - -template -int LogStream::Buf::sync() +namespace tensorrt_llm::plugins { - std::string s = str(); - while (!s.empty() && s.back() == '\n') - { - s.pop_back(); - } - if (gLogger != nullptr) - { - gLogger->log(kSeverity, s.c_str()); - } - str(""); - return 0; -} - -// These use gLogger, and therefore require initLibNvInferPlugins() to be called with a logger -// (otherwise, it will not log) -LogStream gLogError; -LogStream gLogWarning; -LogStream gLogInfo; -LogStream gLogVerbose; -} // namespace - -// break-pointable -void throwCudaError(const char* file, const char* function, int line, int status, const char* msg) -{ - CudaError error(file, function, line, status, msg); - error.log(gLogError); - throw error; -} - -// break-pointable -void throwCublasError(const char* file, const char* function, int line, int status, const char* msg) -{ - if (msg == nullptr) - { - auto s_ = static_cast(status); - switch (s_) - { - case CUBLAS_STATUS_SUCCESS: msg = "CUBLAS_STATUS_SUCCESS"; break; - case CUBLAS_STATUS_NOT_INITIALIZED: msg = "CUBLAS_STATUS_NOT_INITIALIZED"; break; - case CUBLAS_STATUS_ALLOC_FAILED: msg = "CUBLAS_STATUS_ALLOC_FAILED"; break; - case CUBLAS_STATUS_INVALID_VALUE: msg = "CUBLAS_STATUS_INVALID_VALUE"; break; - case CUBLAS_STATUS_ARCH_MISMATCH: msg = "CUBLAS_STATUS_ARCH_MISMATCH"; break; - case CUBLAS_STATUS_MAPPING_ERROR: msg = "CUBLAS_STATUS_MAPPING_ERROR"; break; - case CUBLAS_STATUS_EXECUTION_FAILED: msg = "CUBLAS_STATUS_EXECUTION_FAILED"; break; - case CUBLAS_STATUS_INTERNAL_ERROR: msg = "CUBLAS_STATUS_INTERNAL_ERROR"; break; - case CUBLAS_STATUS_NOT_SUPPORTED: msg = "CUBLAS_STATUS_NOT_SUPPORTED"; break; - case CUBLAS_STATUS_LICENSE_ERROR: msg = "CUBLAS_STATUS_LICENSE_ERROR"; break; - } - } - CublasError error(file, function, line, status, msg); - error.log(gLogError); - throw error; -} - -// break-pointable -void throwCudnnError(const char* file, const char* function, int line, int status, const char* msg) -{ - CudnnError error(file, function, line, status, msg); - error.log(gLogError); - throw error; -} - -// break-pointable -void throwPluginError(char const* file, char const* function, int line, int status, char const* msg) -{ - PluginError error(file, function, line, status, msg); - reportValidationFailure(msg, file, line); - throw error; -} void caughtError(const std::exception& e) { - gLogError << e.what() << std::endl; + TLLM_LOG_EXCEPTION(e); } void logError(const char* msg, const char* file, const char* fn, int line) { - gLogError << "Parameter check failed at: " << file << "::" << fn << "::" << line; - gLogError << ", condition: " << msg << std::endl; -} - -void reportValidationFailure(char const* msg, char const* file, int line) -{ - std::ostringstream stream; - stream << "Validation failed: " << msg << std::endl << file << ':' << line << std::endl; - getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); + TLLM_LOG_ERROR("Parameter check failed at: %s::%s::%d, condition: %s", file, fn, line, msg); } -// break-pointable -void reportAssertion(const char* msg, const char* file, int line) -{ - std::ostringstream stream; - stream << "Assertion failed: " << msg << std::endl - << file << ':' << line << std::endl 
- << "Aborting..." << std::endl; - getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); - PLUGIN_CUASSERT(cudaDeviceReset()); - abort(); -} - -void TRTException::log(std::ostream& logStream) const -{ - logStream << file << " (" << line << ") - " << name << " Error in " << function << ": " << status; - if (message != nullptr) - { - logStream << " (" << message << ")"; - } - logStream << std::endl; -} - -} // namespace plugin - -} // namespace nvinfer1 +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.h b/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.h index 1117389ce91..2280306f7b1 100644 --- a/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.h +++ b/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.h @@ -14,282 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef CHECK_MACROS_PLUGIN_H -#define CHECK_MACROS_PLUGIN_H +#pragma once -#include "NvInfer.h" -#include -#include +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" -#ifdef _MSC_VER -#define FN_NAME __FUNCTION__ -#else -#define FN_NAME __func__ -#endif - -namespace nvinfer1 -{ -namespace plugin -{ - -namespace -{ -template -class LogStream : public std::ostream -{ - class Buf : public std::stringbuf - { - public: - int sync() override; - }; - - Buf buffer; - std::mutex mLogStreamMutex; - -public: - std::mutex& getMutex() - { - return mLogStreamMutex; - } - - LogStream() - : std::ostream(&buffer){}; -}; - -// Use mutex to protect multi-stream write to buffer -template -LogStream& operator<<(LogStream& stream, T const& msg) -{ - std::lock_guard guard(stream.getMutex()); - auto& os = static_cast(stream); - os << msg; - return stream; -} - -// Special handling static numbers -template -inline LogStream& operator<<(LogStream& stream, int32_t num) +namespace tensorrt_llm::plugins { - std::lock_guard guard(stream.getMutex()); - auto& os = static_cast(stream); - os << num; - return stream; -} -// Special handling std::endl -template -inline LogStream& operator<<(LogStream& stream, std::ostream& (*f)(std::ostream&) ) -{ - std::lock_guard guard(stream.getMutex()); - auto& os = static_cast(stream); - os << f; - return stream; -} - -} // namespace - -void reportValidationFailure(char const* msg, char const* file, int line); -[[noreturn]] void reportAssertion(const char* msg, const char* file, int line); void logError(const char* msg, const char* file, const char* fn, int line); -[[noreturn]] void throwCudaError( - const char* file, const char* function, int line, int status, const char* msg = nullptr); -[[noreturn]] void throwCudnnError( - const char* file, const char* function, int line, int status, const char* msg = nullptr); -[[noreturn]] void throwCublasError( - const char* file, const char* function, int line, int status, const char* msg = nullptr); -[[noreturn]] void throwPluginError( - char const* file, char const* function, int line, int status, char const* msg = nullptr); - void caughtError(const std::exception& e); -class TRTException : public std::exception -{ -public: - TRTException(const char* fl, const char* fn, int ln, int st, const char* msg, const char* nm) - : file(fl) - , function(fn) - , line(ln) - , status(st) - , message(msg) - , name(nm) - { - } - - virtual void log(std::ostream& logStream) const; - - void setMessage(const char* msg) - { - message = msg; - } - -protected: - const char* file{nullptr}; - const char* function{nullptr}; 
- int line{0}; - int status{0}; - const char* message{nullptr}; - const char* name{nullptr}; -}; - -class CudaError : public TRTException -{ -public: - CudaError(const char* fl, const char* fn, int ln, int stat, const char* msg = nullptr) - : TRTException(fl, fn, ln, stat, msg, "Cuda") - { - } -}; - -class CudnnError : public TRTException -{ -public: - CudnnError(const char* fl, const char* fn, int ln, int stat, const char* msg = nullptr) - : TRTException(fl, fn, ln, stat, msg, "Cudnn") - { - } -}; - -class CublasError : public TRTException -{ -public: - CublasError(const char* fl, const char* fn, int ln, int stat, const char* msg = nullptr) - : TRTException(fl, fn, ln, stat, msg, "cuBLAS") - { - } -}; - -class PluginError : public TRTException -{ -public: - PluginError(char const* fl, char const* fn, int ln, int stat, char const* msg = nullptr) - : TRTException(fl, fn, ln, stat, msg, "Plugin") - { - } -}; -} // namespace plugin - -} // namespace nvinfer1 - -#define PLUGIN_API_CHECK(condition) \ - { \ - if ((condition) == false) \ - { \ - nvinfer1::plugin::logError(#condition, __FILE__, FN_NAME, __LINE__); \ - return; \ - } \ - } - -#define PLUGIN_API_CHECK_RETVAL(condition, retval) \ - { \ - if ((condition) == false) \ - { \ - nvinfer1::plugin::logError(#condition, __FILE__, FN_NAME, __LINE__); \ - return retval; \ - } \ - } - -#define PLUGIN_API_CHECK_ENUM_RANGE(Type, val) PLUGIN_API_CHECK(int(val) >= 0 && int(val) < EnumMax()) -#define PLUGIN_API_CHECK_ENUM_RANGE_RETVAL(Type, val, retval) \ - PLUGIN_API_CHECK_RETVAL(int(val) >= 0 && int(val) < EnumMax(), retval) - -#define PLUGIN_CHECK_CUDA(call) \ - do \ - { \ - cudaError_t status = call; \ - if (status != cudaSuccess) \ - { \ - return status; \ - } \ - } while (0) - -#define PLUGIN_CHECK_CUDNN(call) \ - do \ - { \ - cudnnStatus_t status = call; \ - if (status != CUDNN_STATUS_SUCCESS) \ - { \ - return status; \ - } \ - } while (0) - -#define PLUGIN_CUBLASASSERT(status_) \ - { \ - auto s_ = status_; \ - if (s_ != CUBLAS_STATUS_SUCCESS) \ - { \ - nvinfer1::plugin::throwCublasError(__FILE__, FN_NAME, __LINE__, s_); \ - } \ - } - -#define PLUGIN_CUDNNASSERT(status_) \ - { \ - auto s_ = status_; \ - if (s_ != CUDNN_STATUS_SUCCESS) \ - { \ - const char* msg = cudnnGetErrorString(s_); \ - nvinfer1::plugin::throwCudnnError(__FILE__, FN_NAME, __LINE__, s_, msg); \ - } \ - } - -#define PLUGIN_CUASSERT(status_) \ - { \ - auto s_ = status_; \ - if (s_ != cudaSuccess) \ - { \ - const char* msg = cudaGetErrorString(s_); \ - nvinfer1::plugin::throwCudaError(__FILE__, FN_NAME, __LINE__, s_, msg); \ - } \ - } - -#define GET_MACRO(_1, _2, NAME, ...) NAME -#define PLUGIN_VALIDATE(...) GET_MACRO(__VA_ARGS__, PLUGIN_VALIDATE_MSG, PLUGIN_VALIDATE_DEFAULT, )(__VA_ARGS__) - -// Logs failed condition and throws a PluginError. -// PLUGIN_ASSERT will eventually perform this function, at which point PLUGIN_VALIDATE -// will be removed. -#define PLUGIN_VALIDATE_DEFAULT(condition) \ - { \ - if (!(condition)) \ - { \ - nvinfer1::plugin::throwPluginError(__FILE__, FN_NAME, __LINE__, 0, #condition); \ - } \ - } - -#define PLUGIN_VALIDATE_MSG(condition, msg) \ - { \ - if (!(condition)) \ - { \ - nvinfer1::plugin::throwPluginError(__FILE__, FN_NAME, __LINE__, 0, msg); \ - } \ - } - -// Logs failed assertion and aborts. -// Aborting is undesirable and will be phased-out from the plugin module, at which point -// PLUGIN_ASSERT will perform the same function as PLUGIN_VALIDATE. 
-#define PLUGIN_ASSERT(assertion) \ - { \ - if (!(assertion)) \ - { \ - nvinfer1::plugin::reportAssertion(#assertion, __FILE__, __LINE__); \ - } \ - } - -#define PLUGIN_FAIL(msg) \ - { \ - nvinfer1::plugin::reportAssertion(msg, __FILE__, __LINE__); \ - } - -#define PLUGIN_ERROR(msg) \ - { \ - nvinfer1::plugin::throwPluginError(__FILE__, FN_NAME, __LINE__, 0, msg); \ - } - -#define PLUGIN_CUERROR(status_) \ - { \ - auto s_ = status_; \ - if (s_ != 0) \ - nvinfer1::plugin::logError(#status_ " failure.", __FILE__, FN_NAME, __LINE__); \ - } - -#endif /*CHECK_MACROS_PLUGIN_H*/ +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.h b/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.h new file mode 100644 index 00000000000..2a975211004 --- /dev/null +++ b/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.h @@ -0,0 +1,470 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/plugins/common/plugin.h" + +namespace tensorrt_llm::plugins +{ + +struct GemmDims +{ + int32_t minM; + int32_t maxM; + int32_t n; + int32_t k; + + GemmDims() + : minM(-1) + , maxM(-1) + , n(-1) + , k(-1) + { + } + + GemmDims(int32_t minM_, int32_t maxM_, int32_t n_, int32_t k_) + : minM(minM_) + , maxM(maxM_) + , n(n_) + , k(k_) + { + } + + bool isInitialized() const + { + return minM >= 0 && maxM >= 0 && n >= 0 && k >= 0; + } +}; + +// Unique ID of GEMM +// In our case GEMM is uniqly identified by N and K +class GemmIdCore +{ +public: + int n; + int k; + nvinfer1::DataType dtype; + + GemmIdCore(int n_, int k_, const nvinfer1::DataType& dtype_) + : n(n_) + , k(k_) + , dtype(dtype_) + { + } + + GemmIdCore() + : n(-1) + , k(-1) + , dtype(nvinfer1::DataType::kFLOAT) // dtype does not matter here + { + } + + bool operator==(const GemmIdCore& id) const + { + return n == id.n && k == id.k && dtype == id.dtype; + } + + friend std::ostream& operator<<(std::ostream& out, const GemmIdCore& id) + { + out << "(N;K)=(" << id.n << ";" << id.k << "),"; + out << " type=" << static_cast(id.dtype); + return out; + } +}; + +// Hash of GemmId +struct GemmIdCoreHash +{ + std::size_t operator()(const GemmIdCore& id) const + { + auto h1 = std::hash{}(id.n); + auto h2 = std::hash{}(id.k); + auto h3 = std::hash{}(static_cast(id.dtype)); + return h1 ^ h2 ^ h3; + } +}; + +template +class GemmPluginProfiler +{ +public: + static constexpr int MAX_PROFILE_M = 8192; + + // Map for single GEMM for different Ms (GEMM dimension) to the best config for particular M + using MProfileMap = std::unordered_map>; + using MProfileMapPtr = std::shared_ptr; + + // requires exclusive ownership to write to *this + using reader_lock = std::unique_lock; 
+ // requires shared ownership to read from other + using writer_lock = std::shared_lock; + + // Struct of contining map if GEMMs to the best profiles for different Ms + struct MNKProfileMap + { + // Mutex guarding map + std::shared_timed_mutex mutex; + // Map from GEMM Id to profile for particular GEMM + std::unordered_map profileMap; + + bool existsMProfileMap(const GemmIdType& id) + { + const auto iter = profileMap.find(id); + return iter != profileMap.end(); + } + + void createMProfileMap(const GemmIdType& id) + { + profileMap[id] = std::make_shared(); + } + + MProfileMapPtr getMProfileMap(const GemmIdType& id) + { + const auto iter = profileMap.find(id); + if (iter == profileMap.end()) + { + std::ostringstream msg; + msg << "Cannot find ID (" << id << ") in the profile map. Abort."; + TLLM_LOG_ERROR(msg.str()); + } + return iter->second; + } + }; + + using MNKProfileMapPtr = std::shared_ptr; + + GemmPluginProfiler() + { + mMNKProfileMap = std::make_shared(); + + // set SKIP_GEMM_PLUGIN_PROFILINGS=1 to avoid tactics profilings + const auto skip = std::getenv("SKIP_GEMM_PLUGIN_PROFILINGS"); + mSkip = (skip != NULL && std::stoi(skip)); + if (mSkip) + { + TLLM_LOG_DEBUG( + "SKIP_GEMM_PLUGIN_PROFILINGS is set. Skipping GEMM plugin profilings. It could result in runtime error " + "if default tactic is not defined."); + } + } + + void serialize(char* buffer, const GemmIdType& gemmId) const + { + auto mProfileMap = mMNKProfileMap->getMProfileMap(gemmId); + + // Save number of profiles for given GEMM ID + write(buffer, static_cast(mProfileMap->size())); + for (const auto& pair : *mProfileMap) + { + // Save pair of M to the best GEMM config + write(buffer, pair); + } + } + + void deserialize(const char*& data, GemmDims& dims, const GemmIdType& gemmId) + { + // NOTE(nkorobov): this mutex is not needed since each thread owns its own map, but will put here for + // consistency + writer_lock lock(mMNKProfileMap->mutex); + + mDims = dims; + + // GemmId gemmId(dims.n, dims.k); + if (!mMNKProfileMap->existsMProfileMap(gemmId)) + { + // Create GEMM with GEMM ID if it does not exist + mMNKProfileMap->createMProfileMap(gemmId); + } + // Populate map with profiles of GEMM ID + auto profileMap = mMNKProfileMap->getMProfileMap(gemmId); + int selectedMapSize; + read(data, selectedMapSize); + for (int ii = 0; ii < selectedMapSize; ++ii) + { + std::pair> config; + read(data, config); + profileMap->insert(config); + } + } + + size_t getSerializationSize(const GemmIdType& gemmId) const + { + reader_lock lock(mMNKProfileMap->mutex); + return sizeof(int) + // size of the tactics map + mMNKProfileMap->getMProfileMap(gemmId)->size() + * sizeof(std::pair>); // size of the tactics map + } + + void profileTactics(const std::vector& tactics, const RunnerPtr& runner, const nvinfer1::DataType& type, + const GemmDims& dims, const GemmIdType& gemmId) + { + writer_lock lock(mMNKProfileMap->mutex); + + if (!dims.isInitialized()) + { + return; + } + + mRunner = runner; + mType = type; + + const int maxM = std::min(nextPowerOfTwo(dims.maxM), MAX_PROFILE_M); + computeTmpSize(maxM, dims.n, dims.k); + + if (!mMNKProfileMap->existsMProfileMap(gemmId)) + { + // Create map for GEMM ID + mMNKProfileMap->createMProfileMap(gemmId); + } + + if (mSkip) + { + return; + } + + auto mProfileMap = mMNKProfileMap->getMProfileMap(gemmId); + + auto profileTactics = [&tactics, &mProfileMap, this](int m, int n, int k) + { + if (mProfileMap->count(m) == 0) + { + // Profile different tactics for particular m and insert best config to the map + 
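            // Worked example of the M bucketing (numbers invented): with dims.minM = 1 and
            // dims.maxM = 300, maxM above becomes std::min(nextPowerOfTwo(300), MAX_PROFILE_M) = 512,
            // so the loop below profiles m = 1, 2, 4, ..., 256 and finally maxM = 512 itself.
            // getBestConfig() later rounds the runtime m the same way, e.g. m = 300 reads the entry
            // stored for the 512 bucket, so every M the plugin can see maps to a profiled config.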
mProfileMap->insert({m, this->profileTacticsForProblem(m, n, k, tactics)}); + } + }; + + // Allocate tmp data to run GEMMs + allocateTmpData(); + const int startMinMRounded = nextPowerOfTwo(dims.minM); + for (int m = startMinMRounded; m < maxM; m *= 2) + { + profileTactics(m, dims.n, dims.k); + } + + profileTactics(maxM, dims.n, dims.k); + // Free tmp data + freeTmpData(); + } + + void setSelectionTactics(const MNKProfileMapPtr& map) + { + mMNKProfileMap = map; + } + + void setTmpWorkspaceSizeInBytes(size_t bytes) + { + mTmpWorkspaceSizeInBytes = bytes; + } + + std::optional getBestConfig(int m, const GemmIdType& gemmId) const + { + reader_lock lock(mMNKProfileMap->mutex); + + if (mSkip) + { + return std::nullopt; + } + + const int mRounded = std::min(nextPowerOfTwo(m), MAX_PROFILE_M); + return mMNKProfileMap->getMProfileMap(gemmId)->at(mRounded); + } + +protected: + virtual void runTactic(int m, int n, int k, const Config& tactic, char* workspace, const cudaStream_t& stream) = 0; + + virtual void computeTmpSize(int maxM, int n, int k) = 0; + + virtual bool checkTactic(int m, int n, int k, const Config& tactic) const + { + return true; + } + +private: + void allocateTmpData() + { + TLLM_CHECK_WITH_INFO(mTmpWorkspaceSizeInBytes > 0, "tmpWorkspaceSizeInBytes must be larger than 0"); + const auto status = cudaMalloc(&mWorkspaceTmp, mTmpWorkspaceSizeInBytes); + TLLM_CHECK_WITH_INFO(status == cudaSuccess, "Can't allocate tmp workspace for GEMM tactics profiling."); + } + + void freeTmpData() + { + const auto status = cudaFree(mWorkspaceTmp); + TLLM_CHECK_WITH_INFO(status == cudaSuccess, "Can't free tmp workspace for GEMM tactics profiling."); + } + + std::optional profileTacticsForProblem(int m, int n, int k, const std::vector& tactics) + { + TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); + + float bestTime = std::numeric_limits::max(); + Config bestConfig; + bool foundOne = false; + + // Iterate over all tactics for given M, N and K + for (int ii = 0; ii < tactics.size(); ++ii) + { + const Config& candidateConfig = tactics[ii]; + float time = std::numeric_limits::max(); + try + { + if (!checkTactic(m, n, k, candidateConfig)) + { + continue; + } + // Profile particualar tactic for given M, N and K + time = profileTacticForProblem(m, n, k, candidateConfig); + foundOne = true; + } + catch (const std::exception& e) + { + std::ostringstream msg; + msg << "Cannot profile configuration " << ii << " (for" + << " m=" << m << ", n=" << n << ", k=" << k << "). Skipped"; + TLLM_LOG_WARNING(msg.str()); + continue; + } + + // Choose the fastest tactic + if (time < bestTime) + { + bestConfig = candidateConfig; + bestTime = time; + } + } + + if (!foundOne) + { + std::ostringstream msg; + msg << "Have not found any valid GEMM config for shape (" + << "m=" << m << ", n=" << n << ", k=" << k << "). 
Will try to use default or fail at runtime"; + TLLM_LOG_WARNING(msg.str()); + return std::nullopt; + } + return {bestConfig}; + } + + float profileTacticForProblem(int m, int n, int k, const Config& tactic) + { + constexpr int warmup = 5; + constexpr int runs = 10; + + cudaStream_t stream = cudaStreamDefault; + // Warmup the execution + for (int i = 0; i < warmup; ++i) + { + runTactic(m, n, k, tactic, mWorkspaceTmp, stream); + } + + cudaEvent_t start; + cudaEvent_t stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaDeviceSynchronize(); + cudaEventRecord(start, 0); + + // Profile GEMM + for (int i = 0; i < runs; ++i) + { + runTactic(m, n, k, tactic, mWorkspaceTmp, stream); + } + + cudaEventRecord(stop, 0); + + cudaEventSynchronize(stop); + + float elapsed; + cudaEventElapsedTime(&elapsed, start, stop); + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return elapsed / runs; + } + + int nextPowerOfTwo(int v) const + { + --v; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return ++v; + } + +protected: + RunnerPtr mRunner{nullptr}; + + nvinfer1::DataType mType{}; + +private: + MNKProfileMapPtr mMNKProfileMap{}; + + size_t mTmpWorkspaceSizeInBytes{0}; + + char* mWorkspaceTmp{nullptr}; + + GemmDims mDims{}; + + bool mSkip{false}; +}; + +template +class GemmPluginProfilerManager +{ +public: + using MNKProfileMap = typename GemmPluginProfilerType::MNKProfileMap; + using MNKProfileMapPtr = typename GemmPluginProfilerType::MNKProfileMapPtr; + using GemmPluginProfilerPtr = std::shared_ptr; + + GemmPluginProfilerManager() + { + mMNKProfileMap = std::make_shared(); + } + + GemmPluginProfilerPtr createGemmPluginProfiler(bool inference) + { + auto profiler = std::make_shared(); + // If the profiler is created during the engine build, + // mMNKProfileMap is shared between different profilers to minimize the time spent on the profiling + // and do not repeat profiling for the GEMMs of the same shape. 
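        // Usage sketch (member name invented for the example): a GEMM plugin creator would
        // typically own a single
        //   GemmPluginProfilerManager<CublasLtGemmPluginProfiler> profilerManager;
        // and call profilerManager.createGemmPluginProfiler(/*inference=*/false) at engine build
        // time, so all plugin instances share this mMNKProfileMap and each (N, K, dtype) GemmId
        // is profiled only once. At deserialization it would call
        // createGemmPluginProfiler(/*inference=*/true) and the profiler is instead filled from
        // the serialized tactics through deserialize().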
+ if (!inference) + { + profiler->setSelectionTactics(mMNKProfileMap); + } + return profiler; + } + +private: + MNKProfileMapPtr mMNKProfileMap{}; +}; + +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/common/plugin.cpp b/cpp/tensorrt_llm/plugins/common/plugin.cpp index a2a9c325332..64b5ef85a62 100644 --- a/cpp/tensorrt_llm/plugins/common/plugin.cpp +++ b/cpp/tensorrt_llm/plugins/common/plugin.cpp @@ -24,7 +24,11 @@ #include #include -#define CUDA_MEM_ALIGN 128 +#ifdef _MSC_VER +#define FN_NAME __FUNCTION__ +#else +#define FN_NAME __func__ +#endif #if ENABLE_MULTI_DEVICE std::unordered_map* getDtypeMap() @@ -51,10 +55,10 @@ inline CUcontext getCurrentCudaCtx() CUresult err = cuCtxGetCurrent(&ctx); if (err == CUDA_ERROR_NOT_INITIALIZED || ctx == nullptr) { - PLUGIN_CUASSERT(cudaFree(nullptr)); + TLLM_CUDA_CHECK(cudaFree(nullptr)); err = cuCtxGetCurrent(&ctx); } - PLUGIN_ASSERT(err == CUDA_SUCCESS); + TLLM_CHECK(err == CUDA_SUCCESS); return ctx; } @@ -131,12 +135,12 @@ std::shared_ptr getCublasHandle() []() -> auto { auto handle = std::unique_ptr(new cublasHandle_t); - PLUGIN_CUBLASASSERT(cublasCreate(handle.get())); + TLLM_CUDA_CHECK(cublasCreate(handle.get())); return handle; }, [](cublasHandle_t* handle) { - PLUGIN_CUBLASASSERT(cublasDestroy(*handle)); + TLLM_CUDA_CHECK(cublasDestroy(*handle)); delete handle; }); return creator(); @@ -148,19 +152,19 @@ std::shared_ptr getCublasLtHandle() []() -> auto { auto handle = std::unique_ptr(new cublasLtHandle_t); - PLUGIN_CUBLASASSERT(cublasLtCreate(handle.get())); + TLLM_CUDA_CHECK(cublasLtCreate(handle.get())); return handle; }, [](cublasLtHandle_t* handle) { - PLUGIN_CUBLASASSERT(cublasLtDestroy(*handle)); + TLLM_CUDA_CHECK(cublasLtDestroy(*handle)); delete handle; }); return creator(); } // ALIGNPTR -int8_t* nvinfer1::plugin::alignPtr(int8_t* ptr, uintptr_t to) +int8_t* tensorrt_llm::plugins::alignPtr(int8_t* ptr, uintptr_t to) { uintptr_t addr = (uintptr_t) ptr; if (addr % to) @@ -171,32 +175,45 @@ int8_t* nvinfer1::plugin::alignPtr(int8_t* ptr, uintptr_t to) } // NEXTWORKSPACEPTR -int8_t* nvinfer1::plugin::nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize) +int8_t* tensorrt_llm::plugins::nextWorkspacePtrCommon( + int8_t* ptr, uintptr_t previousWorkspaceSize, const uintptr_t alignment) { uintptr_t addr = (uintptr_t) ptr; addr += previousWorkspaceSize; - return alignPtr((int8_t*) addr, CUDA_MEM_ALIGN); + return alignPtr((int8_t*) addr, alignment); } -int8_t* nvinfer1::plugin::nextWorkspacePtr(int8_t* const base, uintptr_t& offset, const uintptr_t size) +int8_t* tensorrt_llm::plugins::nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize) +{ + return nextWorkspacePtrCommon(ptr, previousWorkspaceSize, kCudaMemAlign); +} + +int8_t* tensorrt_llm::plugins::nextWorkspacePtr( + int8_t* const base, uintptr_t& offset, const uintptr_t size, const uintptr_t alignment) { uintptr_t curr_offset = offset; - uintptr_t next_offset = curr_offset + ((size + CUDA_MEM_ALIGN - 1) / CUDA_MEM_ALIGN) * CUDA_MEM_ALIGN; + uintptr_t next_offset = curr_offset + ((size + alignment - 1) / alignment) * alignment; int8_t* newptr = size == 0 ? 
nullptr : base + curr_offset; offset = next_offset; return newptr; } +int8_t* tensorrt_llm::plugins::nextWorkspacePtrWithAlignment( + int8_t* ptr, uintptr_t previousWorkspaceSize, const uintptr_t alignment) +{ + return nextWorkspacePtrCommon(ptr, previousWorkspaceSize, alignment); +} + // CALCULATE TOTAL WORKSPACE SIZE -size_t nvinfer1::plugin::calculateTotalWorkspaceSize(size_t* workspaces, int count) +size_t tensorrt_llm::plugins::calculateTotalWorkspaceSize(size_t* workspaces, int count, const uintptr_t alignment) { size_t total = 0; for (int i = 0; i < count; i++) { total += workspaces[i]; - if (workspaces[i] % CUDA_MEM_ALIGN) + if (workspaces[i] % alignment) { - total += CUDA_MEM_ALIGN - (workspaces[i] % CUDA_MEM_ALIGN); + total += alignment - (workspaces[i] % alignment); } } return total; @@ -219,7 +236,7 @@ PluginFieldParser::~PluginFieldParser() { std::stringstream ss; ss << "unused plugin field with name: " << name; - nvinfer1::plugin::logError(ss.str().c_str(), __FILE__, FN_NAME, __LINE__); + tensorrt_llm::plugins::logError(ss.str().c_str(), __FILE__, FN_NAME, __LINE__); } } } @@ -253,7 +270,7 @@ std::optional PluginFieldParser::getScalar(std::string_view const& name) } auto& record = mMap.at(name); auto const& f = mFields[record.index]; - PLUGIN_ASSERT(toFieldType() == f.type && f.length == 1); + TLLM_CHECK(toFieldType() == f.type && f.length == 1); record.retrieved = true; return std::optional{*static_cast(f.data)}; } diff --git a/cpp/tensorrt_llm/plugins/common/plugin.h b/cpp/tensorrt_llm/plugins/common/plugin.h index 54234a736dd..9f070722d8a 100644 --- a/cpp/tensorrt_llm/plugins/common/plugin.h +++ b/cpp/tensorrt_llm/plugins/common/plugin.h @@ -14,10 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_PLUGIN_H -#define TRT_PLUGIN_H -#include "NvInferPlugin.h" + +#pragma once + +#include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/plugins/common/checkMacrosPlugin.h" + +#include + #include #include #include @@ -34,38 +38,27 @@ #include #include -typedef enum -{ - STATUS_SUCCESS = 0, - STATUS_FAILURE = 1, - STATUS_BAD_PARAM = 2, - STATUS_NOT_SUPPORTED = 3, - STATUS_NOT_INITIALIZED = 4 -} pluginStatus_t; - -namespace nvinfer1 -{ - -namespace pluginInternal +namespace tensorrt_llm::plugins { -class BasePlugin : public IPluginV2 +class BasePlugin : public nvinfer1::IPluginV2DynamicExt { -protected: +public: void setPluginNamespace(const char* libNamespace) noexcept override { mNamespace = libNamespace; } - const char* getPluginNamespace() const noexcept override + [[nodiscard]] char const* getPluginNamespace() const noexcept override { return mNamespace.c_str(); } - std::string mNamespace; +protected: + std::string mNamespace{api::kDefaultNamespace}; }; -class BaseCreator : public IPluginCreator +class BaseCreator : public nvinfer1::IPluginCreator { public: void setPluginNamespace(const char* libNamespace) noexcept override @@ -73,20 +66,15 @@ class BaseCreator : public IPluginCreator mNamespace = libNamespace; } - const char* getPluginNamespace() const noexcept override + [[nodiscard]] char const* getPluginNamespace() const noexcept override { return mNamespace.c_str(); } protected: - std::string mNamespace; + std::string mNamespace{api::kDefaultNamespace}; }; -} // namespace pluginInternal - -namespace plugin -{ - // Write values into buffer template void write(char*& buffer, const T& val) @@ -103,59 +91,41 @@ void read(const char*& buffer, T& val) buffer += sizeof(T); } -inline int32_t getTrtSMVersionDec(int32_t smVersion) -{ - // Treat SM89 as SM86 temporarily. - return (smVersion == 89) ? 
86 : smVersion; -} - -inline int32_t getTrtSMVersionDec(int32_t majorVersion, int32_t minorVersion) -{ - return getTrtSMVersionDec(majorVersion * 10 + minorVersion); -} - -inline int32_t elementSize(DataType type) noexcept +inline cudaDataType_t trtToCublasDtype(nvinfer1::DataType type) { switch (type) { - case DataType::kFLOAT: return 4; - case DataType::kHALF: return 2; - case DataType::kINT8: return 1; - case DataType::kINT32: return 4; - case DataType::kBOOL: return 1; - case DataType::kUINT8: return 1; - case DataType::kFP8: return 1; + case nvinfer1::DataType::kFLOAT: return CUDA_R_32F; + case nvinfer1::DataType::kHALF: return CUDA_R_16F; #if defined(NV_TENSORRT_MAJOR) && NV_TENSORRT_MAJOR >= 9 - case DataType::kBF16: return 2; - case DataType::kINT64: return 8; + case nvinfer1::DataType::kBF16: return CUDA_R_16BF; #endif + default: TLLM_THROW("Not supported data type for cuBLAS"); } - PLUGIN_FAIL("unreachable code path"); } +std::uintptr_t constexpr kCudaMemAlign = 128; + int8_t* alignPtr(int8_t* ptr, uintptr_t to); -int8_t* nextWorkspacePtr(int8_t* const base, uintptr_t& offset, const uintptr_t size); +int8_t* nextWorkspacePtrCommon(int8_t* ptr, uintptr_t previousWorkspaceSize, const uintptr_t alignment); + +int8_t* nextWorkspacePtr( + int8_t* const base, uintptr_t& offset, const uintptr_t size, const uintptr_t alignment = kCudaMemAlign); int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize); -size_t calculateTotalWorkspaceSize(size_t* workspaces, int count); +int8_t* nextWorkspacePtrWithAlignment(int8_t* ptr, uintptr_t previousWorkspaceSize, const uintptr_t alignment); -} // namespace plugin -} // namespace nvinfer1 +size_t calculateTotalWorkspaceSize(size_t* workspaces, int count, const uintptr_t alignment = kCudaMemAlign); + +} // namespace tensorrt_llm::plugins inline bool isBuilding() { - std::string const& key = "IS_BUILDING"; - char* val = getenv(key.c_str()); - if (val == nullptr || std::string(val) != "1") - { - return false; - } - else - { - return true; - } + auto constexpr key = "IS_BUILDING"; + auto const val = getenv(key); + return val != nullptr && std::string(val) == "1"; } #define MPICHECK(cmd) \ @@ -309,5 +279,3 @@ class PluginFieldParser std::unordered_map mMap; }; - -#endif // TRT_PLUGIN_H diff --git a/cpp/tensorrt_llm/plugins/exports.map b/cpp/tensorrt_llm/plugins/exports.map index b0b1d3c5018..9d34b296f08 100644 --- a/cpp/tensorrt_llm/plugins/exports.map +++ b/cpp/tensorrt_llm/plugins/exports.map @@ -18,15 +18,16 @@ /* Hides all symbols except those specified in the global section */ { global: - getInferLibVersion; - getPluginRegistry; - initLibNvInferPlugins; + initTrtLlmPlugins; + setLoggerFinder; + getPluginCreators; extern "C++" { nvinfer1::IPluginCreator::*; nvinfer1::IPluginV2Ext::*; nvinfer1::IPluginV2IOExt::*; nvinfer1::PluginRegistrar*; - nvinfer1::plugin::*; + tensorrt_llm::plugins::api::*; + tensorrt_llm::plugins::*; }; local: *; }; diff --git a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp index c2c3d5c2308..1af7ce946f6 100644 --- a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp @@ -14,41 +14,192 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h" +#include "gemmPlugin.h" using namespace nvinfer1; using namespace tensorrt_llm::common; -using nvinfer1::plugin::GemmPluginCreator; -using nvinfer1::plugin::GemmPlugin; +using tensorrt_llm::plugins::GemmPluginCreator; +using tensorrt_llm::plugins::GemmPlugin; +using tensorrt_llm::plugins::CublasLtGemmPluginProfiler; +using tensorrt_llm::plugins::CublasGemmWrapperPtr; +using tensorrt_llm::plugins::read; +using tensorrt_llm::plugins::write; static const char* GEMM_PLUGIN_VERSION{"1"}; static const char* GEMM_PLUGIN_NAME{"Gemm"}; PluginFieldCollection GemmPluginCreator::mFC{}; -std::vector GemmPluginCreator::mPluginAttributes; +std::vector GemmPluginCreator::mPluginAttributes; -GemmPlugin::GemmPlugin(int transA, int transB, nvinfer1::DataType type) +void getProblemParams(cublasOperation_t& transa, cublasOperation_t& transb, int& m, int& n, int& k, int& lda, int& ldb, + int& ldc, bool transA, bool transB, int M, int N, int K) +{ + transa = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + transb = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + m = N; + n = M; + k = K; + lda = transB ? K : N; + ldb = transA ? M : K; + ldc = N; +} + +void runGemm(const int M, const int N, const int K, const bool transA, const bool transB, const nvinfer1::DataType type, + const CublasGemmWrapperPtr& cublasWrapperPtr, const void* act, const void* weight, void* output, + const std::optional& heuristic, void* workspace, cudaStream_t stream) +{ + auto cublasHandle = cublasWrapperPtr->getCublasHandle(); + TLLM_CUDA_CHECK(cublasSetStream(cublasHandle, stream)); + cublasWrapperPtr->setStream(stream); + cublasWrapperPtr->setWorkspace(workspace); + cublasOperation_t transa, transb; + int m, n, k; + int lda, ldb, ldc; + getProblemParams(transa, transb, m, n, k, lda, ldb, ldc, transA, transB, M, N, K); + cublasWrapperPtr->Gemm(transa, transb, m, n, k, weight, lda, act, ldb, output, ldc, heuristic); +} + +void CublasLtGemmPluginProfiler::runTactic( + int m, int n, int k, const CublasLtGemmPluginProfiler::Config& tactic, char* workspace, const cudaStream_t& stream) +{ + size_t dataSize = sizeof(half); + if (mType == DataType::kFLOAT) + { + dataSize = sizeof(float); + } + + void* actPtr = reinterpret_cast(workspace); + void* weightPtr = reinterpret_cast( + nextWorkspacePtrWithAlignment(reinterpret_cast(actPtr), m * k * dataSize, ALIGNMENT)); + void* outputPtr = reinterpret_cast( + nextWorkspacePtrWithAlignment(reinterpret_cast(weightPtr), n * k * dataSize, ALIGNMENT)); + char* workspacePtr = reinterpret_cast( + nextWorkspacePtrWithAlignment(reinterpret_cast(outputPtr), m * n * dataSize, ALIGNMENT)); + runGemm(m, n, k, mTransA, mTransB, mType, mRunner, actPtr, weightPtr, outputPtr, {tactic}, workspacePtr, stream); +} + +bool CublasLtGemmPluginProfiler::checkTactic(int m, int n, int k, const Config& tactic) const +{ + cublasOperation_t transa, transb; + int M, N, K; + int lda, ldb, ldc; + getProblemParams(transa, transb, m, n, k, lda, ldb, ldc, mTransA, mTransB, n, m, k); + + return mRunner->checkTactic(transa, transb, m, n, k, lda, ldb, ldc, tactic); +} + +void CublasLtGemmPluginProfiler::computeTmpSize(int maxM, int n, int k) +{ + size_t dataSize = sizeof(half); + if (mType == DataType::kFLOAT) + { + dataSize = sizeof(float); + } + + std::vector workspaces = { + maxM * k * dataSize, // A + n * k * dataSize, // B + maxM * n * dataSize, // C + CUBLAS_WORKSPACE_SIZE // workspace + }; + size_t bytes = calculateTotalWorkspaceSize(workspaces.data(), workspaces.size(), ALIGNMENT); + 
setTmpWorkspaceSizeInBytes(bytes); +} + +GemmPlugin::GemmPlugin( + int transA, int transB, nvinfer1::DataType type, bool useFp8, const GemmPlugin::PluginProfilerPtr& pluginProfiler) : mTransA(transA) , mTransB(transB) , mType(type) + , mUseFp8(useFp8) + , mPluginProfiler(pluginProfiler) { + init(); } // Parameterized constructor -GemmPlugin::GemmPlugin(const void* data, size_t length) +GemmPlugin::GemmPlugin(const void* data, size_t length, const GemmPlugin::PluginProfilerPtr& pluginProfiler) + : mPluginProfiler(pluginProfiler) { const char *d = reinterpret_cast(data), *a = d; read(d, mTransA); read(d, mTransB); read(d, mType); - PLUGIN_ASSERT(d == a + length); + read(d, mUseFp8); + read(d, mDims); + + init(); + + mPluginProfiler->deserialize(d, mDims, mGemmId); + + TLLM_CHECK(d == a + length); +} + +void GemmPlugin::init() +{ + auto cublasHandle = getCublasHandle(); + auto cublasLtHandle = getCublasLtHandle(); + mCublasAlgoMap = std::make_shared(GEMM_CONFIG); + mCublasWrapperMutex = std::make_shared(); + mCublasWrapper = std::make_shared( + cublasHandle, cublasLtHandle, nullptr, mCublasAlgoMap.get(), mCublasWrapperMutex.get(), nullptr); + + mPluginProfiler->setTranspose(mTransA, mTransB); + + mGemmId = GemmIdCublas(GemmIdCore(mDims.n, mDims.k, mType), mTransA, mTransB); +} + +void GemmPlugin::setGemmConfig() +{ + if (mType == DataType::kHALF) + { + mCublasWrapper->setFP16GemmConfig(); + } + else if (mType == DataType::kFLOAT) + { + mCublasWrapper->setFP32GemmConfig(); + } +#ifdef ENABLE_BF16 + else if (mType == DataType::kBF16) + { + mCublasWrapper->setBF16GemmConfig(); + } +#endif + +#ifdef ENABLE_FP8 + if (mUseFp8) + { + mCublasWrapper->setFP8GemmConfig(trtToCublasDtype(mType)); + } +#endif +} + +void GemmPlugin::configGemm() +{ + if (!mDims.isInitialized()) + { + return; + } + + setGemmConfig(); + + std::vector totalHeruistics; + for (int mCur = mDims.minM; mCur < mDims.maxM; mCur *= 2) + { + cublasOperation_t transa, transb; + int m, n, k; + int lda, ldb, ldc; + getProblemParams(transa, transb, m, n, k, lda, ldb, ldc, mTransA, mTransB, mCur, mDims.n, mDims.k); + const auto heruistics = mCublasWrapper->getTactics(transa, transb, m, n, k, lda, ldb, ldc); + + totalHeruistics.insert(totalHeruistics.end(), heruistics.begin(), heruistics.end()); + } + mPluginProfiler->profileTactics(totalHeruistics, mCublasWrapper, mType, mDims, mGemmId); } // IPluginV2DynamicExt Methods nvinfer1::IPluginV2DynamicExt* GemmPlugin::clone() const noexcept { auto* plugin = new GemmPlugin(*this); - plugin->setPluginNamespace(mNamespace.c_str()); - plugin->initialize(); return plugin; } @@ -57,8 +208,8 @@ nvinfer1::DimsExprs GemmPlugin::getOutputDimensions( { try { - PLUGIN_ASSERT(nbInputs == 2); - PLUGIN_ASSERT(outputIndex == 0); + TLLM_CHECK(nbInputs == 2); + TLLM_CHECK(outputIndex == 0); const int nbDimsA = inputs[0].nbDims; const int nbDimsB = inputs[1].nbDims; DimsExprs ret; @@ -107,9 +258,62 @@ bool GemmPlugin::supportsFormatCombination( return (inOut[pos].type == mType) && (inOut[pos].format == TensorFormat::kLINEAR); } +int32_t computeMDimension(bool transA, const int32_t nbDims, const int32_t* dims) +{ + int32_t M = 1; + if (transA) + { + for (int i = nbDims - 1; i > 0; --i) + { + M *= dims[i]; + } + } + else + { + for (int i = 0; i < nbDims - 1; ++i) + { + M *= dims[i]; + } + } + return M; +} + +int32_t computeNDimension(bool transB, const int32_t nbDims, const int32_t* dims) +{ + int32_t N = 1; + if (transB) + { + for (int i = 0; i < nbDims - 1; ++i) + { + N *= dims[i]; + } + } + else + { + for (int i 
= nbDims - 1; i > 0; --i) + { + N *= dims[i]; + } + } + return N; +} + void GemmPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept { + const int nbDimsA = in[0].max.nbDims; + const int nbDimsB = in[1].max.nbDims; + + const auto minM = computeMDimension(mTransA, nbDimsA, in[0].min.d); + const auto maxM = computeMDimension(mTransA, nbDimsA, in[0].max.d); + const auto N = computeNDimension(mTransB, nbDimsB, in[1].max.d); + const auto K = mTransA ? in[0].max.d[0] : in[0].max.d[nbDimsA - 1]; + + if (!mDims.isInitialized()) + { + mDims = {minM, maxM, N, K}; + } + mGemmId.gemmIdCore = {N, K, mType}; } size_t GemmPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, @@ -127,68 +331,19 @@ int GemmPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinf // outputs // mat [M, N] - auto cublasHandle = mCublasWrapper->getCublasHandle(); - PLUGIN_CUBLASASSERT(cublasSetStream(cublasHandle, stream)); - mCublasWrapper->setStream(stream); - mCublasWrapper->setWorkspace(workspace); - if (mType == DataType::kHALF) - { - mCublasWrapper->setFP16GemmConfig(); - } - else if (mType == DataType::kFLOAT) - { - mCublasWrapper->setFP32GemmConfig(); - } -#ifdef ENABLE_BF16 - else if (mType == DataType::kBF16) - { - mCublasWrapper->setBF16GemmConfig(); - } -#endif + setGemmConfig(); const int nbDimsA = inputDesc[0].dims.nbDims; - int M = 1, N = 1; - const int K = mTransA ? inputDesc[0].dims.d[0] : inputDesc[0].dims.d[nbDimsA - 1]; - if (mTransA) - { - for (int i = nbDimsA - 1; i > 0; --i) - { - M *= inputDesc[0].dims.d[i]; - } - } - else - { - for (int i = 0; i < nbDimsA - 1; ++i) - { - M *= inputDesc[0].dims.d[i]; - } - } const int nbDimsB = inputDesc[1].dims.nbDims; - if (mTransB) - { - for (int i = 0; i < nbDimsB - 1; ++i) - { - N *= inputDesc[1].dims.d[i]; - } - } - else - { - for (int i = nbDimsB - 1; i > 0; --i) - { - N *= inputDesc[1].dims.d[i]; - } - } - - cublasOperation_t transa = mTransB ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t transb = mTransA ? CUBLAS_OP_T : CUBLAS_OP_N; - const int m = N; - const int n = M; - const int k = K; - const auto lda = mTransB ? K : N; - const auto ldb = mTransA ? M : K; - const auto ldc = N; - mCublasWrapper->Gemm(transa, transb, m, n, k, inputs[1], lda, inputs[0], ldb, outputs[0], ldc); + const auto M = computeMDimension(mTransA, nbDimsA, inputDesc[0].dims.d); + const auto N = computeNDimension(mTransB, nbDimsB, inputDesc[1].dims.d); + const int K = mTransA ? 
inputDesc[0].dims.d[0] : inputDesc[0].dims.d[nbDimsA - 1]; + // FIXME(nkorobov): enable best config selection + // const auto& bestTactic = mPluginProfiler->getBestConfig(M, mGemmId); + const std::optional bestTactic = {}; + runGemm(M, N, K, mTransA, mTransB, mType, mCublasWrapper, inputs[0], inputs[1], outputs[0], bestTactic, workspace, + stream); return 0; } @@ -196,7 +351,7 @@ int GemmPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinf nvinfer1::DataType GemmPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(index == 0); return inputTypes[0]; } @@ -219,30 +374,16 @@ int GemmPlugin::getNbOutputs() const noexcept int GemmPlugin::initialize() noexcept { - auto cublasHandle = getCublasHandle(); - auto cublasLtHandle = getCublasLtHandle(); - mCublasAlgoMap = new cublasAlgoMap(GEMM_CONFIG); - mCublasWrapperMutex = new std::mutex(); - mCublasWrapper - = new cublasMMWrapper(cublasHandle, cublasLtHandle, nullptr, mCublasAlgoMap, mCublasWrapperMutex, nullptr); + configGemm(); return 0; } -void GemmPlugin::destroy() noexcept -{ - delete mCublasAlgoMap; - delete mCublasWrapperMutex; - delete mCublasWrapper; - - mCublasAlgoMap = nullptr; - mCublasWrapperMutex = nullptr; - mCublasWrapper = nullptr; - delete this; -} +void GemmPlugin::destroy() noexcept {} size_t GemmPlugin::getSerializationSize() const noexcept { - return sizeof(mTransA) + sizeof(mTransB) + sizeof(mType); + return sizeof(mTransA) + sizeof(mTransB) + sizeof(mType) + sizeof(mDims) + sizeof(mUseFp8) + + mPluginProfiler->getSerializationSize(mGemmId); // selected tactics container size } void GemmPlugin::serialize(void* buffer) const noexcept @@ -251,21 +392,15 @@ void GemmPlugin::serialize(void* buffer) const noexcept write(d, mTransA); write(d, mTransB); write(d, mType); + write(d, mUseFp8); + write(d, mDims); + mPluginProfiler->serialize(d, mGemmId); + assert(d == a + getSerializationSize()); } void GemmPlugin::terminate() noexcept {} -void GemmPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* GemmPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// GemmPluginCreator::GemmPluginCreator() @@ -275,6 +410,7 @@ GemmPluginCreator::GemmPluginCreator() mPluginAttributes.emplace_back(PluginField("transA", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("transB", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("use_fp8", nullptr, PluginFieldType::kINT32, 0)); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } @@ -299,29 +435,38 @@ IPluginV2* GemmPluginCreator::createPlugin(const char* name, const PluginFieldCo const PluginField* fields = fc->fields; int transA, transB; nvinfer1::DataType type; + int useFp8; // Read configurations from each fields for (int i = 0; i < fc->nbFields; ++i) { const char* attrName = fields[i].name; if (!strcmp(attrName, "transa")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); transA = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "transb")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); transB = 
static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } + else if (!strcmp(attrName, "use_fp8")) + { + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); + useFp8 = static_cast(*(static_cast(fields[i].data))); + } } try { - auto* obj = new GemmPlugin(transA, transB, type); + // GemmPluginCreator is unique and shared for an engine generation + // Create plugin profiler with shared tactics map + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ false); + auto* obj = new GemmPlugin(transA, transB, type, useFp8, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -338,7 +483,10 @@ IPluginV2* GemmPluginCreator::deserializePlugin(const char* name, const void* se // call GemmPlugin::destroy() try { - auto* obj = new GemmPlugin(serialData, serialLength); + // GemmPluginCreator is unique and shared for an engine generation + // Create plugin profiler with shared tactics map + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ true); + auto* obj = new GemmPlugin(serialData, serialLength, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -348,13 +496,3 @@ IPluginV2* GemmPluginCreator::deserializePlugin(const char* name, const void* se } return nullptr; } - -void GemmPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* GemmPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h index 6651f383ad9..8775388b147 100644 --- a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h +++ b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h @@ -16,27 +16,98 @@ */ #ifndef TRT_GEMM_PLUGIN_H #define TRT_GEMM_PLUGIN_H -#include "NvInferPlugin.h" #include "tensorrt_llm/common/cublasMMWrapper.h" +#include "tensorrt_llm/plugins/common/gemmPluginProfiler.h" #include "tensorrt_llm/plugins/common/plugin.h" #include #include #include #include -namespace nvinfer1 +namespace tensorrt_llm::plugins { -namespace plugin + +using CublasGemmWrapper = tensorrt_llm::common::cublasMMWrapper; +using CublasGemmWrapperPtr = std::shared_ptr; + +class GemmIdCublas { +public: + GemmIdCore gemmIdCore{}; + bool transA{}; + bool transB{}; + + GemmIdCublas(const GemmIdCore& gemmIdCore_, bool transA_, bool transB_) + : gemmIdCore(gemmIdCore_) + , transA(transA_) + , transB(transB_) + { + } + + GemmIdCublas() {} + + bool operator==(const GemmIdCublas& id) const + { + return gemmIdCore == id.gemmIdCore && transA == id.transA && transB == id.transB; + } + + friend std::ostream& operator<<(std::ostream& out, const GemmIdCublas& id) + { + out << "Core ID = {" << id.gemmIdCore << "}"; + out << " transA=" << id.transA; + out << " transB=" << id.transB; + return out; + } +}; + +// Hash of GemmIdCublas +struct GemmIdCublasHash +{ + std::size_t operator()(const GemmIdCublas& id) const + { + auto h1 = GemmIdCoreHash()(id.gemmIdCore); + auto h2 = std::hash{}(id.transA); + auto h3 = std::hash{}(id.transB); + return h1 ^ h2 ^ h3; + } +}; -class GemmPlugin : public IPluginV2DynamicExt +class CublasLtGemmPluginProfiler + : public GemmPluginProfiler { public: + using Config = cublasLtMatmulHeuristicResult_t; + + void 
setTranspose(bool transposeA, bool transposeB) + { + mTransA = transposeA; + mTransB = transposeB; + } + +protected: + void runTactic(int m, int n, int k, const Config& tactic, char* workspace, const cudaStream_t& stream) override; + + void computeTmpSize(int maxM, int n, int k) override; + + bool checkTactic(int m, int n, int k, const Config& tactic) const override; + +private: + bool mTransA; + bool mTransB; + + static constexpr size_t ALIGNMENT = 256; +}; + +class GemmPlugin : public BasePlugin +{ +public: + using PluginProfilerPtr = std::shared_ptr; + GemmPlugin() = delete; - GemmPlugin(int transA, int transB, nvinfer1::DataType type); + GemmPlugin(int transA, int transB, nvinfer1::DataType type, bool useFp8, const PluginProfilerPtr& profiler); - GemmPlugin(const void* data, size_t length); + GemmPlugin(const void* data, size_t length, const PluginProfilerPtr& profiler); ~GemmPlugin() override = default; @@ -66,23 +137,31 @@ class GemmPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; + +private: + void init(); + void configGemm(); + void setGemmConfig(); private: const std::string mLayerName; - std::string mNamespace; int mTransA; int mTransB; nvinfer1::DataType mType; - tensorrt_llm::common::cublasAlgoMap* mCublasAlgoMap; - std::mutex* mCublasWrapperMutex; - tensorrt_llm::common::cublasMMWrapper* mCublasWrapper; + std::shared_ptr mCublasAlgoMap; + std::shared_ptr mCublasWrapperMutex; + CublasGemmWrapperPtr mCublasWrapper; + + GemmDims mDims{}; + GemmIdCublas mGemmId{}; + bool mUseFp8{false}; + + PluginProfilerPtr mPluginProfiler; }; -class GemmPluginCreator : public IPluginCreator +class GemmPluginCreator : public BaseCreator { public: GemmPluginCreator(); @@ -98,17 +177,12 @@ class GemmPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + GemmPluginProfilerManager gemmPluginProfileManager; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 +} // namespace tensorrt_llm::plugins #endif // TRT_GEMM_PLUGIN_H diff --git a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp index 851d725bf98..c1ec28864d8 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp +++ b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp @@ -14,12 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
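The GemmIdCublas/GemmIdCublasHash pair introduced in gemmPlugin.h above exists so that per-shape GEMM tactics can be cached in an unordered map keyed by (n, k, dtype, transA, transB). The sketch below shows the same pattern (operator== plus an XOR-combined std::hash) on a simplified key; TacticKey and TacticKeyHash are illustrative names, not types from the patch.

#include <cstddef>
#include <cstdio>
#include <functional>
#include <unordered_map>

struct TacticKey
{
    int n{};
    int k{};
    bool transA{};
    bool transB{};

    bool operator==(TacticKey const& o) const
    {
        return n == o.n && k == o.k && transA == o.transA && transB == o.transB;
    }
};

struct TacticKeyHash
{
    std::size_t operator()(TacticKey const& id) const
    {
        // Same simple XOR combine as GemmIdCublasHash above.
        auto h1 = std::hash<int>{}(id.n);
        auto h2 = std::hash<int>{}(id.k);
        auto h3 = std::hash<bool>{}(id.transA);
        auto h4 = std::hash<bool>{}(id.transB);
        return h1 ^ h2 ^ h3 ^ h4;
    }
};

int main()
{
    std::unordered_map<TacticKey, int, TacticKeyHash> bestTacticPerShape;
    bestTacticPerShape[{4096, 4096, false, true}] = 7; // pretend tactic index
    std::printf("tactics cached: %zu\n", bestTacticPerShape.size());
    return 0;
}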
*/ -#include "tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h" -#include "checkMacrosPlugin.h" +#include "gptAttentionCommon.h" #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h" #include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/kernels/unfusedAttentionKernels.h" +#include "tensorrt_llm/plugins/common/checkMacrosPlugin.h" +#include "tensorrt_llm/runtime/iBuffer.h" #include #include #include @@ -27,10 +28,9 @@ using namespace nvinfer1; using namespace tensorrt_llm::kernels; -using namespace tensorrt_llm::common; -using nvinfer1::plugin::GPTAttentionPluginCreatorCommon; -using nvinfer1::plugin::GPTAttentionPluginCommon; -using nvinfer1::plugin::nextWorkspacePtr; +namespace tc = tensorrt_llm::common; +using tensorrt_llm::plugins::GPTAttentionPluginCreatorCommon; +using tensorrt_llm::plugins::GPTAttentionPluginCommon; template struct KVCacheBufferDataType @@ -78,6 +78,8 @@ struct FusedQKVMaskedAttentionDispatchParams int kv_head_num; int size_per_head; int rotary_embedding_dim; + float rotary_embedding_base; + float rotary_embedding_scale; PositionEmbeddingType position_embedding_type; int max_seq_len; const int* input_lengths; @@ -90,7 +92,7 @@ struct FusedQKVMaskedAttentionDispatchParams const T* ia3_value_weights; const float* qkv_scale_out; const float* attention_out_scale; - QuantMode quant_option; + tc::QuantMode quant_option; bool multi_block_mode; int max_seq_len_tile; T* partial_out; @@ -99,7 +101,7 @@ struct FusedQKVMaskedAttentionDispatchParams int* block_counter; const float* kv_scale_orig_quant; const float* kv_scale_quant_orig; - QuantMode kv_cache_quant_mode; + tc::QuantMode kv_cache_quant_mode; int multi_processor_count; KVCacheBuffer kv_block_array; }; @@ -157,6 +159,8 @@ void fusedQKV_masked_attention_dispatch( params.num_kv_heads = input_params.kv_head_num; params.hidden_size_per_head = input_params.size_per_head; params.rotary_embedding_dim = input_params.rotary_embedding_dim; + params.rotary_embedding_base = input_params.rotary_embedding_base; + params.rotary_embedding_scale = input_params.rotary_embedding_scale; params.position_embedding_type = input_params.position_embedding_type; // Note: keep norm factor (sqrt(K_dim)) when adopting megatron T5 structure (may adjust) params.inv_sqrt_dh = 1.F / (sqrtf((float) params.hidden_size_per_head) * input_params.q_scaling); @@ -214,7 +218,8 @@ template void fusedQKV_masked_attention_dispatch( GPTAttentionPluginCommon::GPTAttentionPluginCommon(int num_heads, int num_kv_heads, int unidirectional, float q_scaling, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. 
Use 0 for non-RoPE - int tp_size, int tp_rank, // for ALiBi + float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, + float rotary_embedding_scale, int rotary_embedding_max_positions, int tp_size, int tp_rank, // for ALiBi tensorrt_llm::kernels::ContextFMHAType context_fmha_type, bool multi_block_mode, int kv_cache_quant_mode, bool remove_input_padding, tensorrt_llm::kernels::AttentionMaskType mask_type, bool paged_kv_cache, nvinfer1::DataType type, int32_t max_context_length, bool qkv_bias_enabled) @@ -224,6 +229,10 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(int num_heads, int num_kv_hea , mUnidirectional(unidirectional) , mQScaling(q_scaling) , mRotaryEmbeddingDim(rotary_embedding_dim) + , mRotaryEmbeddingBase(rotary_embedding_base) + , mRotaryEmbeddingScaleType(rotary_embedding_scale_type) + , mRotaryEmbeddingScale(rotary_embedding_scale) + , mRotaryEmbeddingMaxPositions(rotary_embedding_max_positions) , mPositionEmbeddingType(position_embedding_type) , mEnableContextFMHA(context_fmha_type != ContextFMHAType::DISABLED) , mFMHAForceFP32Acc(context_fmha_type == ContextFMHAType::ENABLED_WITH_FP32_ACC || type == DataType::kBF16) @@ -239,8 +248,8 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(int num_heads, int num_kv_hea , mQKVBiasEnabled(qkv_bias_enabled) { mEnableContextFMHA = mEnableContextFMHA && (mType == DataType::kHALF || mType == DataType::kBF16); - PLUGIN_ASSERT(isRoPE() == (rotary_embedding_dim != 0)); - TLLM_CHECK_WITH_INFO((getSMVersion() >= 80) || (mType != DataType::kBF16), + TLLM_CHECK(isRoPE() == (rotary_embedding_dim != 0)); + TLLM_CHECK_WITH_INFO((tc::getSMVersion() >= 80) || (mType != DataType::kBF16), "Unsupported data type, pre SM 80 GPUs do not support bfloat16"); } @@ -266,6 +275,10 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(const void* data, size_t leng read(d, mQScaling); read(d, mPositionEmbeddingType); read(d, mRotaryEmbeddingDim); + read(d, mRotaryEmbeddingBase); + read(d, mRotaryEmbeddingScaleType); + read(d, mRotaryEmbeddingScale); + read(d, mRotaryEmbeddingMaxPositions); read(d, mTpSize); read(d, mTpRank); read(d, mEnableContextFMHA); @@ -279,10 +292,10 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(const void* data, size_t leng read(d, mMaxContextLength); read(d, mQKVBiasEnabled); - mKVCacheQuantMode = QuantMode(kvCacheQuantMode); + mKVCacheQuantMode = tc::QuantMode(kvCacheQuantMode); - PLUGIN_ASSERT(d == a + length); - TLLM_CHECK_WITH_INFO((getSMVersion() >= 80) || (mType != DataType::kBF16), + TLLM_CHECK(d == a + length); + TLLM_CHECK_WITH_INFO((tc::getSMVersion() >= 80) || (mType != DataType::kBF16), "Unsupported data type, pre SM 80 GPUs do not support bfloat16"); } @@ -293,7 +306,7 @@ size_t GPTAttentionPluginCommon::getWorkspaceSizeForContext( const int local_hidden_units_qo = mNumHeads * getHeadSize(); const int local_hidden_units_kv = mNumKVHeads * getHeadSize(); - size_t const size = elementSize(type); + auto const size = tensorrt_llm::runtime::BufferDataType(type).getSize(); size_t context_workspace_size = 0; @@ -322,7 +335,7 @@ size_t GPTAttentionPluginCommon::getWorkspaceSizeForContext( workspaces[7] = qkv_buf_2_size; workspaces[8] = qk_buf_float_size; workspaces[9] = padding_offset_size; - context_workspace_size = plugin::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); + context_workspace_size = tensorrt_llm::plugins::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); return context_workspace_size; } @@ -331,7 +344,7 @@ size_t 
GPTAttentionPluginCommon::getWorkspaceSizeForGeneration(DataType type, in const int local_hidden_units_qo = mNumHeads * getHeadSize(); const int local_hidden_units_kv = mNumKVHeads * getHeadSize(); - size_t const size = elementSize(type); + auto const size = tensorrt_llm::runtime::BufferDataType(type).getSize(); size_t context_workspace_size = 0; size_t generation_workspace_size = 0; @@ -350,7 +363,7 @@ size_t GPTAttentionPluginCommon::getWorkspaceSizeForGeneration(DataType type, in workspaces[1] = partial_sum_size; workspaces[2] = partial_max_size; workspaces[3] = block_counter_size; - generation_workspace_size = plugin::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); + generation_workspace_size = tensorrt_llm::plugins::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); return generation_workspace_size; } @@ -402,7 +415,7 @@ int GPTAttentionPluginCommon::enqueueContext(const EnqueueContextParams(params.key_value_cache); } - const QuantMode quant_option = QuantMode::fromDescription(); + const auto quant_option = tc::QuantMode::fromDescription(); const float* qkv_scale_out = nullptr; const float* attention_out_scale = nullptr; @@ -421,7 +434,7 @@ int GPTAttentionPluginCommon::enqueueContext(const EnqueueContextParamsgetCublasHandle(); - PLUGIN_CUBLASASSERT(cublasSetStream(cublasHandle, stream)); + TLLM_CUDA_CHECK(cublasSetStream(cublasHandle, stream)); mCublasWrapper->setStream(stream); mCublasWrapper->setWorkspace(params.workspace); if constexpr (std::is_same_v) @@ -490,11 +503,15 @@ int GPTAttentionPluginCommon::enqueueContext(const EnqueueContextParams(v_buf_2_) - reinterpret_cast(k_buf_2_) + v_buf_2_size, stream); + float rotary_base, rotary_scale; + const int32_t kv_seq_len = params.input_seq_length; + update_rotary_params(kv_seq_len, rotary_base, rotary_scale); invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, v_buf_2_, const_cast(params.attention_input), const_cast(params.qkv_bias), params.context_lengths, mRemovePadding ? 
padding_offset : nullptr, request_batch_size, request_seq_length, params.num_tokens, mNumHeads, mNumKVHeads, getHeadSize(), - mEnableContextFMHA, mRotaryEmbeddingDim, position_embedding_type, (float*) nullptr, 0, stream); + mEnableContextFMHA, mRotaryEmbeddingDim, rotary_base, rotary_scale, position_embedding_type, (float*) nullptr, + 0, stream); sync_check_cuda_error(); @@ -506,7 +523,7 @@ int GPTAttentionPluginCommon::enqueueContext(const EnqueueContextParams::value; + const cudaDataType_t gemm_data_type = tc::CudaDataType::value; const int attention_seq_len_1 = request_seq_length; // q length const int attention_seq_len_2 = request_seq_length; // kv length const T qk_scale = static_cast(1.0f / (sqrtf(getHeadSize() * 1.0f) * q_scaling)); @@ -734,7 +751,7 @@ int GPTAttentionPluginCommon::enqueueGeneration( const bool* finished = nullptr; const bool has_ia3 = false; - const QuantMode quant_option = QuantMode::fromDescription(); + const auto quant_option = tc::QuantMode::fromDescription(); const float* qkv_scale_out = nullptr; const float* attention_out_scale = nullptr; @@ -761,7 +778,7 @@ int GPTAttentionPluginCommon::enqueueGeneration( int* block_counter = reinterpret_cast(nextWorkspacePtr(workspace_byte_ptr, offset, block_counter_size)); if (mMultiBlockMode) { - PLUGIN_CUASSERT(cudaMemsetAsync(block_counter, 0, block_counter_size, stream)); + TLLM_CUDA_CHECK(cudaMemsetAsync(block_counter, 0, block_counter_size, stream)); } KVCacheBuffer kv_cache_buffer; @@ -824,6 +841,8 @@ int GPTAttentionPluginCommon::enqueueGeneration( dispatch_params.kv_scale_quant_orig = params.kv_scale_quant_orig; dispatch_params.kv_block_array = kv_cache_buffer; dispatch_params.multi_processor_count = mMultiProcessorCount; + const int32_t kv_seq_len = step; + update_rotary_params(kv_seq_len, dispatch_params.rotary_embedding_base, dispatch_params.rotary_embedding_scale); fusedQKV_masked_attention_dispatch(dispatch_params, stream); sync_check_cuda_error(); return 0; @@ -856,10 +875,10 @@ int GPTAttentionPluginCommon::initialize() noexcept auto cublasHandle = getCublasHandle(); auto cublasLtHandle = getCublasLtHandle(); - mCublasAlgoMap = new cublasAlgoMap(GEMM_CONFIG); + mCublasAlgoMap = new tc::cublasAlgoMap(GEMM_CONFIG); mCublasWrapperMutex = new std::mutex(); mCublasWrapper - = new cublasMMWrapper(cublasHandle, cublasLtHandle, nullptr, mCublasAlgoMap, mCublasWrapperMutex, nullptr); + = new tc::cublasMMWrapper(cublasHandle, cublasLtHandle, nullptr, mCublasAlgoMap, mCublasWrapperMutex, nullptr); if (mEnableContextFMHA) { // Pre-checked during constructing. 
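getProblemParams/runGemm in gemmPlugin.cpp, and the cuBLAS calls in the attention path above, rely on the usual trick of running a row-major problem on column-major cuBLAS by swapping the operands. A minimal standalone illustration, assuming plain cublasSgemm and FP32 data purely for readability (the plugin itself goes through cublasMMWrapper):

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main()
{
    int const M = 2, N = 3, K = 4;
    std::vector<float> act(M * K, 1.0f);    // row-major [M, K]
    std::vector<float> weight(K * N, 1.0f); // row-major [K, N]
    std::vector<float> out(M * N, 0.0f);    // row-major [M, N]

    float *dA = nullptr, *dB = nullptr, *dC = nullptr;
    cudaMalloc((void**) &dA, act.size() * sizeof(float));
    cudaMalloc((void**) &dB, weight.size() * sizeof(float));
    cudaMalloc((void**) &dC, out.size() * sizeof(float));
    cudaMemcpy(dA, act.data(), act.size() * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, weight.data(), weight.size() * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    // Same mapping as getProblemParams with transA = transB = false:
    // m = N, n = M, k = K, lda = N, ldb = K, ldc = N, weight passed as the first
    // operand and the activation as the second, so column-major cuBLAS computes
    // out^T = weight^T x act^T, i.e. the row-major product act x weight.
    float const alpha = 1.0f, beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, dB, N, dA, K, &beta, dC, N);

    cudaMemcpy(out.data(), dC, out.size() * sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("out[0][0] = %f (expected %d)\n", out[0], K);

    cublasDestroy(handle);
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
    return 0;
}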
@@ -906,9 +925,10 @@ void GPTAttentionPluginCommon::destroy() noexcept size_t GPTAttentionPluginCommon::getCommonSerializationSize() noexcept { return sizeof(mNumHeads) + sizeof(mNumKVHeads) + sizeof(mHeadSize) + sizeof(mUnidirectional) + sizeof(mQScaling) - + sizeof(mPositionEmbeddingType) + sizeof(mRotaryEmbeddingDim) + sizeof(mTpSize) + sizeof(mTpRank) - + sizeof(mEnableContextFMHA) + sizeof(mFMHAForceFP32Acc) + sizeof(mMultiBlockMode) - + sizeof(unsigned int) // mKVCacheQuantMode + + sizeof(mPositionEmbeddingType) + sizeof(mRotaryEmbeddingDim) + sizeof(mRotaryEmbeddingBase) + + sizeof(mRotaryEmbeddingScaleType) + sizeof(mRotaryEmbeddingScale) + sizeof(mRotaryEmbeddingMaxPositions) + + sizeof(mTpSize) + sizeof(mTpRank) + sizeof(mEnableContextFMHA) + sizeof(mFMHAForceFP32Acc) + + sizeof(mMultiBlockMode) + sizeof(unsigned int) // mKVCacheQuantMode + sizeof(mRemovePadding) + sizeof(mMaskType) + sizeof(mPagedKVCache) + sizeof(mType) + sizeof(mMaxContextLength) + sizeof(mQKVBiasEnabled); } @@ -923,6 +943,10 @@ void GPTAttentionPluginCommon::serializeCommon(void* buffer) const noexcept write(d, mQScaling); write(d, mPositionEmbeddingType); write(d, mRotaryEmbeddingDim); + write(d, mRotaryEmbeddingBase); + write(d, mRotaryEmbeddingScaleType); + write(d, mRotaryEmbeddingScale); + write(d, mRotaryEmbeddingMaxPositions); write(d, mTpSize); write(d, mTpRank); write(d, mEnableContextFMHA); @@ -943,16 +967,6 @@ void GPTAttentionPluginCommon::terminate() noexcept // Do nothing, destroy will always be called, so release the resources there. } -void GPTAttentionPluginCommon::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* GPTAttentionPluginCommon::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// GPTAttentionPluginCreatorCommon::GPTAttentionPluginCreatorCommon() @@ -965,6 +979,10 @@ GPTAttentionPluginCreatorCommon::GPTAttentionPluginCreatorCommon() mPluginAttributes.emplace_back(PluginField("q_scaling", nullptr, PluginFieldType::kFLOAT32, 1.0)); mPluginAttributes.emplace_back(PluginField("position_embedding_type", nullptr, PluginFieldType::kINT8, 0)); mPluginAttributes.emplace_back(PluginField("rotary_embedding_dim", nullptr, PluginFieldType::kINT32, 0)); + mPluginAttributes.emplace_back(PluginField("rotary_embedding_base", nullptr, PluginFieldType::kFLOAT32, 0)); + mPluginAttributes.emplace_back(PluginField("rotary_embedding_scale_type", nullptr, PluginFieldType::kINT8, 0)); + mPluginAttributes.emplace_back(PluginField("rotary_embedding_scale", nullptr, PluginFieldType::kFLOAT32, 0)); + mPluginAttributes.emplace_back(PluginField("rotary_embedding_max_positions", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("tp_size", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("tp_rank", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("context_fmha_type", nullptr, PluginFieldType::kINT8, 0)); @@ -984,13 +1002,3 @@ const PluginFieldCollection* GPTAttentionPluginCreatorCommon::getFieldNames() no { return &mFC; } - -void GPTAttentionPluginCreatorCommon::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* GPTAttentionPluginCreatorCommon::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h index 
de4f63ef608..0de39079344 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h +++ b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef TRT_GPT_ATTENTION_COMMON_H -#define TRT_GPT_ATTENTION_COMMON_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h" @@ -28,12 +27,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class GPTAttentionPluginCommon : public IPluginV2DynamicExt +class GPTAttentionPluginCommon : public BasePlugin { public: GPTAttentionPluginCommon() = delete; @@ -41,7 +38,8 @@ class GPTAttentionPluginCommon : public IPluginV2DynamicExt GPTAttentionPluginCommon(int num_heads, int num_kv_heads, int unidirectional, float q_scaling, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. Use 0 for non-RoPE - int tp_size, int tp_rank, // for ALiBi + float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, + float rotary_embedding_scale, int rotary_embedding_max_positions, int tp_size, int tp_rank, // for ALiBi tensorrt_llm::kernels::ContextFMHAType context_fmha_type, bool multi_block_mode, int kv_cache_quant_mode, bool remove_input_padding, tensorrt_llm::kernels::AttentionMaskType mask_type, bool paged_kv_cache, nvinfer1::DataType type, int32_t max_context_length, bool qkv_bias_enabled); @@ -71,15 +69,13 @@ class GPTAttentionPluginCommon : public IPluginV2DynamicExt static size_t getCommonSerializationSize() noexcept; void serializeCommon(void* buffer) const noexcept; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; const int getHeadSize(bool checkInit = true) const; protected: int getMaxSeqLenTile(int elemSize) const; - size_t getWorkspaceSizeForContext(DataType type, int32_t nbReq, int32_t max_input_length) const noexcept; + size_t getWorkspaceSizeForContext(nvinfer1::DataType type, int32_t nbReq, int32_t max_input_length) const noexcept; // total_num_seq is the sum of beam_width for multiple requests - size_t getWorkspaceSizeForGeneration(DataType type, int32_t total_num_seq) const noexcept; + size_t getWorkspaceSizeForGeneration(nvinfer1::DataType type, int32_t total_num_seq) const noexcept; template struct EnqueueContextParams @@ -142,9 +138,26 @@ class GPTAttentionPluginCommon : public IPluginV2DynamicExt || mPositionEmbeddingType == tensorrt_llm::kernels::PositionEmbeddingType::kROPE_GPT_NEOX; } + inline void update_rotary_params(int32_t kv_seq_len, float& base, float& scale) + { + base = mRotaryEmbeddingBase; + scale = 1.0f / mRotaryEmbeddingScale; // do the division here so that we can avoid it in the kernel + if (mPositionEmbeddingType == tensorrt_llm::kernels::PositionEmbeddingType::kROPE_GPT_NEOX + && mRotaryEmbeddingScaleType == tensorrt_llm::kernels::RotaryScalingType::kDYNAMIC) + { + if (kv_seq_len > mRotaryEmbeddingMaxPositions) + { + const float b + = (mRotaryEmbeddingScale * kv_seq_len / mRotaryEmbeddingMaxPositions) - (mRotaryEmbeddingScale - 1); + const float p = static_cast(mRotaryEmbeddingDim) / (mRotaryEmbeddingDim - 2); + base = mRotaryEmbeddingBase * pow(b, p); + } + scale = 1.0f; // scale factor is 
already used in updated base + } + } + protected: const std::string mLayerName; - std::string mNamespace; int mNumHeads; int mNumKVHeads; @@ -152,6 +165,10 @@ class GPTAttentionPluginCommon : public IPluginV2DynamicExt int mUnidirectional; float mQScaling; int mRotaryEmbeddingDim; + float mRotaryEmbeddingBase; + tensorrt_llm::kernels::RotaryScalingType mRotaryEmbeddingScaleType; + float mRotaryEmbeddingScale; + int mRotaryEmbeddingMaxPositions; tensorrt_llm::kernels::PositionEmbeddingType mPositionEmbeddingType; bool mRemovePadding = false; tensorrt_llm::kernels::AttentionMaskType mMaskType; @@ -178,7 +195,7 @@ class GPTAttentionPluginCommon : public IPluginV2DynamicExt tensorrt_llm::common::cublasMMWrapper* mCublasWrapper; }; -class GPTAttentionPluginCreatorCommon : public IPluginCreator +class GPTAttentionPluginCreatorCommon : public BaseCreator { public: GPTAttentionPluginCreatorCommon(); @@ -188,17 +205,9 @@ class GPTAttentionPluginCreatorCommon : public IPluginCreator template T* deserializePluginImpl(const char* name, const void* serialData, size_t serialLength) noexcept; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - protected: - std::vector mPluginAttributes; - PluginFieldCollection mFC{}; - std::string mNamespace; + std::vector mPluginAttributes; + nvinfer1::PluginFieldCollection mFC{}; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_GPT_ATTENTION_COMMON_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommonImpl.h b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommonImpl.h index 5dfb12dc206..7d9b3b941db 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommonImpl.h +++ b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommonImpl.h @@ -15,12 +15,11 @@ * limitations under the License. */ -#ifndef TRT_GPT_ATTENTION_COMMON_IMPL_H -#define TRT_GPT_ATTENTION_COMMON_IMPL_H +#pragma once #include "gptAttentionCommon.h" -namespace nvinfer1::plugin +namespace tensorrt_llm::plugins { template T* GPTAttentionPluginCommon::cloneImpl() const noexcept @@ -52,6 +51,4 @@ T* GPTAttentionPluginCreatorCommon::deserializePluginImpl( } return nullptr; } -} // namespace nvinfer1::plugin - -#endif // TRT_GPT_ATTENTION_COMMON_IMPL_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp index a5db1cfed9e..4aad4e1f443 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp @@ -14,14 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
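update_rotary_params in gptAttentionCommon.h above folds dynamic NTK-style scaling into the RoPE base whenever the KV sequence length exceeds rotary_embedding_max_positions, and otherwise hands the kernel the reciprocal of the user scale. The standalone restatement below mirrors that branch so the formula is easier to check; RotaryParams and updateRotary are simplified stand-ins for the plugin members, not code from the patch.

#include <cmath>
#include <cstdio>

struct RotaryParams
{
    int dim;
    float base;
    float scale; // user-provided scaling factor (mRotaryEmbeddingScale)
    int maxPositions;
    bool dynamicNeoxScaling; // kROPE_GPT_NEOX position embedding + kDYNAMIC scaling in the plugin
};

static void updateRotary(RotaryParams const& p, int kvSeqLen, float& base, float& scale)
{
    base = p.base;
    scale = 1.0f / p.scale; // division hoisted out of the kernel, as in the patch
    if (p.dynamicNeoxScaling)
    {
        if (kvSeqLen > p.maxPositions)
        {
            float const b = (p.scale * kvSeqLen / p.maxPositions) - (p.scale - 1.0f);
            float const power = static_cast<float>(p.dim) / (p.dim - 2);
            base = p.base * std::pow(b, power);
        }
        scale = 1.0f; // the scaling is folded into the updated base
    }
}

int main()
{
    RotaryParams p{128, 10000.0f, 2.0f, 2048, true};
    float base = 0.0f, scale = 0.0f;
    updateRotary(p, 4096, base, scale);
    std::printf("effective base = %.1f, scale = %.3f\n", base, scale);
    return 0;
}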
*/ -#include "tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h" -#include "checkMacrosPlugin.h" -#include "gptAttentionCommon.h" -#include "gptAttentionCommon/gptAttentionCommonImpl.h" -#include "plugin.h" +#include "gptAttentionPlugin.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h" #include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/kernels/unfusedAttentionKernels.h" +#include "tensorrt_llm/plugins/common/checkMacrosPlugin.h" +#include "tensorrt_llm/plugins/common/plugin.h" +#include "tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h" +#include "tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommonImpl.h" #include #include #include @@ -29,9 +29,8 @@ using namespace nvinfer1; using namespace tensorrt_llm::kernels; -using namespace tensorrt_llm::common; -using nvinfer1::plugin::GPTAttentionPluginCreator; -using nvinfer1::plugin::GPTAttentionPlugin; +using tensorrt_llm::plugins::GPTAttentionPluginCreator; +using tensorrt_llm::plugins::GPTAttentionPlugin; static const char* GPT_ATTENTION_PLUGIN_VERSION{"1"}; static const char* GPT_ATTENTION_PLUGIN_NAME{"GPTAttention"}; @@ -39,27 +38,21 @@ static const char* GPT_ATTENTION_PLUGIN_NAME{"GPTAttention"}; GPTAttentionPlugin::GPTAttentionPlugin(int num_heads, int num_kv_heads, int unidirectional, float q_scaling, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. 0 for non-RoPE - int tp_size, int tp_rank, // for ALiBi + float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, + float rotary_embedding_scale, int rotary_embedding_max_positions, int tp_size, int tp_rank, // for ALiBi tensorrt_llm::kernels::ContextFMHAType context_fmha_type, bool multi_block_mode, int kv_cache_quant_mode, bool remove_input_padding, tensorrt_llm::kernels::AttentionMaskType mask_type, bool paged_kv_cache, - nvinfer1::DataType type, bool in_flight_batching, int32_t max_context_length, bool qkv_bias_enabled) + nvinfer1::DataType type, int32_t max_context_length, bool qkv_bias_enabled) : GPTAttentionPluginCommon(num_heads, num_kv_heads, unidirectional, q_scaling, position_embedding_type, - rotary_embedding_dim, tp_size, tp_rank, context_fmha_type, multi_block_mode, kv_cache_quant_mode, + rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale_type, rotary_embedding_scale, + rotary_embedding_max_positions, tp_size, tp_rank, context_fmha_type, multi_block_mode, kv_cache_quant_mode, remove_input_padding, mask_type, paged_kv_cache, type, max_context_length, qkv_bias_enabled) - , mInFlightBatching(in_flight_batching) { - TLLM_CHECK(!mInFlightBatching || mRemovePadding); } GPTAttentionPlugin::GPTAttentionPlugin(const void* data, size_t length) - : GPTAttentionPluginCommon(data, GPTAttentionPluginCommon::getCommonSerializationSize()) + : GPTAttentionPluginCommon(data, length) { - const char *d = reinterpret_cast(data), *a = d; - d += GPTAttentionPluginCommon::getCommonSerializationSize(); - - read(d, mInFlightBatching); - TLLM_CHECK(d == a + length); - TLLM_CHECK(!mInFlightBatching || mRemovePadding); } // IPluginV2DynamicExt Methods @@ -157,49 +150,56 @@ int GPTAttentionPlugin::enqueueImpl(const nvinfer1::PluginTensorDesc* inputDesc, cudaStream_t stream) { int32_t const nbSeq = inputDesc[getContextLengthsIdx()].dims.d[0]; - if (!mInFlightBatching) - { - enqueueSome(0, nbSeq, 0, inputDesc, outputDesc, inputs, outputs, workspace, stream); - return 0; - } - // In-flight batching code path int32_t const 
beam_width = inputDesc[getCacheIndirIdx()].dims.d[1]; RequestType const* reqTypes = static_cast(inputs[getRequestTypesIdx()]); int32_t nbContextRequests = 0; int32_t contextTokenIdxEnd = 0; // count context requests - for (int32_t i = 0; i < nbSeq; i++) + for (int32_t seqIdx = 0; seqIdx < nbSeq; seqIdx++) { - if (reqTypes[i] != RequestType::kCONTEXT) + if (reqTypes[seqIdx] != RequestType::kCONTEXT) { break; } ++nbContextRequests; - contextTokenIdxEnd += (mRemovePadding ? getInputLength(inputs, i) : inputDesc[getInputTensorIdx()].dims.d[1]); + contextTokenIdxEnd += mRemovePadding ? static_cast(inputs[getHostContextLengthsIdx()])[seqIdx] + : inputDesc[getInputTensorIdx()].dims.d[1]; } - for (int32_t i = nbContextRequests; i < nbSeq; i++) + for (int32_t seqIdx = nbContextRequests; seqIdx < nbSeq; seqIdx++) + { + TLLM_CHECK(reqTypes[seqIdx] == RequestType::kGENERATION); + } + + // mixed requests require mRemovePadding and mPagedKVCache + if (nbContextRequests != 0 && nbContextRequests != nbSeq) { - TLLM_CHECK(reqTypes[i] == RequestType::kGENERATION); + TLLM_CHECK(mRemovePadding && mPagedKVCache); } if (nbContextRequests > 0) { - enqueueSome( - 0, nbContextRequests, 0, inputDesc, outputDesc, inputs, outputs, workspace, stream); + auto seqIdxBeg = 0; + auto tokenIdxBeg = 0; + auto localNbTokens = contextTokenIdxEnd; + enqueueSome(seqIdxBeg, nbContextRequests, tokenIdxBeg, localNbTokens, inputDesc, outputDesc, + inputs, outputs, workspace, stream); } - if (nbSeq - nbContextRequests > 0) + if (auto nbGenerationSeq = nbSeq - nbContextRequests; nbGenerationSeq > 0) { - enqueueSome(nbContextRequests, nbSeq - nbContextRequests, contextTokenIdxEnd, inputDesc, - outputDesc, inputs, outputs, workspace, stream); + auto seqIdxBeg = nbContextRequests; + auto tokenIdxBeg = contextTokenIdxEnd; + auto localNbTokens = nbGenerationSeq; + enqueueSome(seqIdxBeg, nbGenerationSeq, tokenIdxBeg, localNbTokens, inputDesc, outputDesc, + inputs, outputs, workspace, stream); } return 0; } template -int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32_t tokenIdxBeg, +int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32_t tokenIdxBeg, int32_t localNbTokens, const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) { @@ -217,12 +217,6 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 auto const reqTypeInBatchPtr = static_cast(inputs[getRequestTypesIdx()]) + seqIdxBeg; bool const is_context = (reqTypeInBatchPtr[0] == RequestType::kCONTEXT); - TLLM_CHECK(std::all_of(reqTypeInBatchPtr, reqTypeInBatchPtr + localNbSeq, - [is_context](RequestType reqType) - { - TLLM_CHECK(reqType == RequestType::kCONTEXT || reqType == RequestType::kGENERATION); - return is_context == (reqType == RequestType::kCONTEXT); - })); const int* context_lengths = reinterpret_cast(inputs[getContextLengthsIdx()]) + seqIdxBeg; // Note we still need context length during generation for MMHA optimziation. 
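enqueueImpl above assumes the batch is ordered as context requests followed by generation requests, and derives the token offset where the generation inputs start from the host context lengths (packed, remove-padding layout). A hedged standalone sketch of that partitioning logic, with plain vectors standing in for the plugin's input tensors:

#include <cstdint>
#include <cstdio>
#include <vector>

enum class RequestType : int32_t
{
    kCONTEXT = 0,
    kGENERATION = 1
};

int main()
{
    std::vector<RequestType> reqTypes{RequestType::kCONTEXT, RequestType::kCONTEXT,
        RequestType::kGENERATION, RequestType::kGENERATION, RequestType::kGENERATION};
    std::vector<int32_t> hostContextLengths{7, 5, 1, 1, 1};

    // Count the leading context requests and sum their lengths, exactly like the
    // loop in enqueueImpl.
    int32_t nbContextRequests = 0;
    int32_t contextTokenIdxEnd = 0;
    for (std::size_t seqIdx = 0; seqIdx < reqTypes.size(); ++seqIdx)
    {
        if (reqTypes[seqIdx] != RequestType::kCONTEXT)
        {
            break;
        }
        ++nbContextRequests;
        contextTokenIdxEnd += hostContextLengths[seqIdx];
    }
    int const nbGenerationSeq = static_cast<int>(reqTypes.size()) - nbContextRequests;

    // Context requests would be dispatched over tokens [0, contextTokenIdxEnd);
    // generation requests start at token contextTokenIdxEnd, one token per sequence.
    std::printf("context seqs: %d (tokens 0..%d), generation seqs: %d\n", nbContextRequests,
        contextTokenIdxEnd, nbGenerationSeq);
    return 0;
}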
@@ -235,7 +229,7 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 auto const host_context_lengths = static_cast(inputs[getHostContextLengthsIdx()]) + seqIdxBeg; return *std::max_element(host_context_lengths, host_context_lengths + localNbSeq); }(); - PLUGIN_ASSERT(max_context_len <= mMaxContextLength); + TLLM_CHECK(max_context_len <= mMaxContextLength); const float* kv_scale_orig_quant = nullptr; const float* kv_scale_quant_orig = nullptr; @@ -276,29 +270,10 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 if (is_context) // context stage { const int batch_size = localNbSeq; - const int request_batch_size = batch_size; - const int request_seq_len = max_context_len; - // num of total tokens (without paddings when remove paddings). - int num_tokens = 0; - if (!mRemovePadding) - { - num_tokens = request_batch_size * request_seq_len; - } - else if (mInFlightBatching) - { - auto const host_context_lengths - = static_cast(inputs[getHostContextLengthsIdx()]) + seqIdxBeg; - num_tokens = std::accumulate(host_context_lengths, host_context_lengths + localNbSeq, 0); - } - else - { - num_tokens = inputDesc[getInputTensorIdx()].dims.d[1]; - } - enqueueContext( EnqueueContextParams{attention_input, qkv_bias, max_context_len, maxSeqLen, context_lengths, kv_scale_orig_quant, kv_scale_quant_orig, alibi_slopes, context_buf_, key_value_cache, - block_pointers, batch_size, num_tokens, tokens_per_block, max_blocks_per_sequence, workspace}, + block_pointers, batch_size, localNbTokens, tokens_per_block, max_blocks_per_sequence, workspace}, stream); } else // generation stage; input_seq_len == 1 @@ -387,16 +362,12 @@ int GPTAttentionPlugin::getNbOutputs() const noexcept size_t GPTAttentionPlugin::getSerializationSize() const noexcept { - return GPTAttentionPluginCommon::getCommonSerializationSize() + sizeof(mInFlightBatching); + return GPTAttentionPluginCommon::getCommonSerializationSize(); } void GPTAttentionPlugin::serialize(void* buffer) const noexcept { - char *d = static_cast(buffer), *a = d; GPTAttentionPluginCommon::serializeCommon(buffer); - d += GPTAttentionPluginCommon::getCommonSerializationSize(); - write(d, mInFlightBatching); - PLUGIN_ASSERT(d == a + getSerializationSize()); } /////////////// @@ -435,7 +406,10 @@ IPluginV2* GPTAttentionPluginCreator::createPlugin(const char* name, const Plugi p.getScalar("num_kv_heads").value(), p.getScalar("unidirectional").value(), p.getScalar("q_scaling").value(), static_cast(p.getScalar("position_embedding_type").value()), - p.getScalar("rotary_embedding_dim").value(), + p.getScalar("rotary_embedding_dim").value(), p.getScalar("rotary_embedding_base").value(), + static_cast(p.getScalar("rotary_embedding_scale_type").value()), + p.getScalar("rotary_embedding_scale").value(), + p.getScalar("rotary_embedding_max_positions").value(), static_cast(p.getScalar("tp_size").value()), static_cast(p.getScalar("tp_rank").value()), static_cast(p.getScalar("context_fmha_type").value()), @@ -445,7 +419,7 @@ IPluginV2* GPTAttentionPluginCreator::createPlugin(const char* name, const Plugi static_cast(p.getScalar("mask_type").value()), static_cast(p.getScalar("paged_kv_cache").value()), static_cast(p.getScalar("type_id").value()), - p.getScalar("in_flight_batching").value(), p.getScalar("max_context_length").value(), + p.getScalar("max_context_length").value(), static_cast(p.getScalar("qkv_bias_enabled").value())); obj->setPluginNamespace(mNamespace.c_str()); return obj; @@ -474,13 +448,3 @@ IPluginV2* 
GPTAttentionPluginCreator::deserializePlugin( } return nullptr; } - -void GPTAttentionPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* GPTAttentionPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h index bb27d8b6974..101ef1111c7 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h +++ b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef TRT_GPT_ATTENTION_PLUGIN_H -#define TRT_GPT_ATTENTION_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "checkMacrosPlugin.h" #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/quantization.h" @@ -31,9 +30,7 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { // batch_size = num_ctx_requests + num_gen_requests * beam_width // num_ctx_requests = number of context requests (single sequence per request). @@ -70,10 +67,11 @@ class GPTAttentionPlugin : public GPTAttentionPluginCommon GPTAttentionPlugin(int num_heads, int num_kv_heads, int unidirectional, float q_scaling, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. 0 for non-RoPE - int tp_size, int tp_rank, // for ALiBi + float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, + float rotary_embedding_scale, int rotary_embedding_max_positions, int tp_size, int tp_rank, // for ALiBi tensorrt_llm::kernels::ContextFMHAType context_fmha_type, bool multi_block_mode, int kv_cache_quant_mode, bool remove_input_padding, tensorrt_llm::kernels::AttentionMaskType mask_type, bool paged_kv_cache, - nvinfer1::DataType type, bool in_flight_batching, int32_t max_context_length, bool qkv_bias_enabled); + nvinfer1::DataType type, int32_t max_context_length, bool qkv_bias_enabled); GPTAttentionPlugin(const void* data, size_t length); @@ -120,16 +118,12 @@ class GPTAttentionPlugin : public GPTAttentionPluginCommon enum class RequestType : int32_t { kCONTEXT = 0, - kGENERATION = 1, - kNONE = 2 + kGENERATION = 1 }; -private: - bool mInFlightBatching = false; - private: template - int enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32_t tokenIdxBeg, + int enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32_t tokenIdxBeg, int32_t localNbTokens, const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream); @@ -192,28 +186,16 @@ class GPTAttentionPlugin : public GPTAttentionPluginCommon IndexType getHostContextLengthsIdx() const { - PLUGIN_ASSERT(mRemovePadding); + TLLM_CHECK(mRemovePadding); return (mKVCacheQuantMode.hasKvCacheQuant() ? 9 : 7) + (mPagedKVCache ? 1 : 0) + (isALiBi() ? 1 : 0); } IndexType getQKVBiasTensorIdx() const { - PLUGIN_ASSERT(mQKVBiasEnabled); + TLLM_CHECK(mQKVBiasEnabled); return (mKVCacheQuantMode.hasInt8KvCache() ? 9 : 7) + (mPagedKVCache ? 1 : 0) + (isALiBi() ? 1 : 0) + (mRemovePadding ? 
1 : 0); } - - int32_t getInputLength(const void* const* inputs, int32_t seqIdx) const - { - auto const reqType = static_cast(inputs[getRequestTypesIdx()])[seqIdx]; - switch (reqType) - { - case RequestType::kCONTEXT: return static_cast(inputs[getHostContextLengthsIdx()])[seqIdx]; - case RequestType::kGENERATION: return 1; - case RequestType::kNONE: return 0; - } - PLUGIN_ASSERT(!"Unexpected request type"); - } }; class GPTAttentionPluginCreator : public GPTAttentionPluginCreatorCommon @@ -231,13 +213,6 @@ class GPTAttentionPluginCreator : public GPTAttentionPluginCreatorCommon nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_GPT_ATTENTION_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.cpp b/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.cpp index 36f39a085b7..74ebe14e406 100644 --- a/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.cpp @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "tensorrt_llm/plugins/identityPlugin/identityPlugin.h" +#include "identityPlugin.h" using namespace nvinfer1; -using nvinfer1::plugin::IdentityPluginCreator; -using nvinfer1::plugin::IdentityPlugin; +using tensorrt_llm::plugins::IdentityPluginCreator; +using tensorrt_llm::plugins::IdentityPlugin; static const char* IDENTITY_PLUGIN_VERSION{"1"}; static const char* IDENTITY_PLUGIN_NAME{"Identity"}; PluginFieldCollection IdentityPluginCreator::mFC{}; -std::vector IdentityPluginCreator::mPluginAttributes; +std::vector IdentityPluginCreator::mPluginAttributes; IdentityPlugin::IdentityPlugin() {} @@ -31,7 +31,7 @@ IdentityPlugin::IdentityPlugin() {} IdentityPlugin::IdentityPlugin(const void* data, size_t length) { const char *d = reinterpret_cast(data), *a = d; - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -152,16 +152,6 @@ void IdentityPlugin::destroy() noexcept delete this; } -void IdentityPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* IdentityPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// IdentityPluginCreator::IdentityPluginCreator() @@ -219,13 +209,3 @@ IPluginV2* IdentityPluginCreator::deserializePlugin( } return nullptr; } - -void IdentityPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* IdentityPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.h b/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.h index ca0e7465576..79d9e4104d4 100644 --- a/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.h +++ b/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
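The index helpers above (getHostContextLengthsIdx, getQKVBiasTensorIdx) compute input-tensor positions by adding one slot for each optional input that is enabled. The sketch below condenses that arithmetic; InputLayout and its single kvCacheQuant flag are simplifications of the plugin's distinct quant-mode checks, shown only to make the offset scheme explicit.

#include <cstdio>

struct InputLayout
{
    bool kvCacheQuant;   // KV-cache scale tensors present
    bool pagedKvCache;   // block-pointer tensor present
    bool alibi;          // ALiBi slopes tensor present
    bool removePadding;  // host_context_lengths tensor present

    int hostContextLengthsIdx() const
    {
        return (kvCacheQuant ? 9 : 7) + (pagedKvCache ? 1 : 0) + (alibi ? 1 : 0);
    }

    int qkvBiasIdx() const
    {
        return (kvCacheQuant ? 9 : 7) + (pagedKvCache ? 1 : 0) + (alibi ? 1 : 0) + (removePadding ? 1 : 0);
    }
};

int main()
{
    InputLayout layout{true, true, false, true};
    std::printf("host_context_lengths at input %d, qkv_bias at input %d\n",
        layout.hostContextLengthsIdx(), layout.qkvBiasIdx());
    return 0;
}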
*/ -#ifndef TRT_IDENTITY_PLUGIN_H -#define TRT_IDENTITY_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class IdentityPlugin : public IPluginV2DynamicExt +class IdentityPlugin : public BasePlugin { public: IdentityPlugin(); @@ -64,15 +61,12 @@ class IdentityPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; }; -class IdentityPluginCreator : public IPluginCreator +class IdentityPluginCreator : public BaseCreator { public: IdentityPluginCreator(); @@ -88,17 +82,9 @@ class IdentityPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_IDENTITY_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.cpp b/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.cpp index c67f83cc363..98797a3fb84 100644 --- a/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.cpp @@ -15,20 +15,20 @@ * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h" +#include "layernormPlugin.h" #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/kernels/layernormKernels.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -using nvinfer1::plugin::LayernormPluginCreator; -using nvinfer1::plugin::LayernormPlugin; +using tensorrt_llm::plugins::LayernormPluginCreator; +using tensorrt_llm::plugins::LayernormPlugin; static const char* LAYERNORM_PLUGIN_VERSION{"1"}; static const char* LAYERNORM_PLUGIN_NAME{"Layernorm"}; PluginFieldCollection LayernormPluginCreator::mFC{}; -std::vector LayernormPluginCreator::mPluginAttributes; +std::vector LayernormPluginCreator::mPluginAttributes; LayernormPlugin::LayernormPlugin(float eps, bool useDiffOfSquares, nvinfer1::DataType type) : mEps(eps) @@ -46,7 +46,7 @@ LayernormPlugin::LayernormPlugin(const void* data, size_t length) read(d, mEps); read(d, mUseDiffOfSquares); read(d, mType); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); TLLM_CHECK_WITH_INFO((getSMVersion() >= 80) || (mType != DataType::kBF16), "Unsupported data type"); } @@ -67,7 +67,7 @@ nvinfer1::DimsExprs LayernormPlugin::getOutputDimensions( bool LayernormPlugin::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept { - PLUGIN_ASSERT(0 <= pos && pos < 5); + TLLM_CHECK(0 <= pos && pos < 5); return (inOut[pos].type == mType) && (inOut[pos].format == TensorFormat::kLINEAR); } @@ -181,16 +181,6 @@ void LayernormPlugin::destroy() noexcept delete this; } -void LayernormPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* LayernormPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// LayernormPluginCreator::LayernormPluginCreator() @@ -231,17 +221,17 @@ IPluginV2* LayernormPluginCreator::createPlugin(const char* name, const PluginFi const char* attrName = fields[i].name; if (!strcmp(attrName, "eps")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kFLOAT32); eps = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "use_diff_of_squares")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); useDiffOfSquares = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } @@ -275,13 +265,3 @@ IPluginV2* LayernormPluginCreator::deserializePlugin( } return nullptr; } - -void LayernormPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* LayernormPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h b/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h index 5f3fe6cab18..ef7a1b114e1 100644 --- a/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h +++ b/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_LAYERNORM_PLUGIN_H -#define TRT_LAYERNORM_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class LayernormPlugin : public IPluginV2DynamicExt +class LayernormPlugin : public BasePlugin { public: LayernormPlugin(float eps, bool useDiffOfSquares, nvinfer1::DataType type); @@ -64,8 +61,6 @@ class LayernormPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: float mEps; @@ -73,10 +68,9 @@ class LayernormPlugin : public IPluginV2DynamicExt nvinfer1::DataType mType; const std::string mLayerName; - std::string mNamespace; }; -class LayernormPluginCreator : public IPluginCreator +class LayernormPluginCreator : public BaseCreator { public: LayernormPluginCreator(); @@ -92,17 +86,9 @@ class LayernormPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_LAYERNORM_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.cpp b/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.cpp index ec80a967e2c..e0545de42e9 100644 --- a/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.cpp @@ -14,19 +14,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h" +#include "layernormQuantizationPlugin.h" #include "tensorrt_llm/kernels/layernormKernels.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -using nvinfer1::plugin::LayernormQuantizationPluginCreator; -using nvinfer1::plugin::LayernormQuantizationPlugin; +using tensorrt_llm::plugins::LayernormQuantizationPluginCreator; +using tensorrt_llm::plugins::LayernormQuantizationPlugin; static const char* LAYERNORM_QUANTIZATION_PLUGIN_VERSION{"1"}; static const char* LAYERNORM_QUANTIZATION_PLUGIN_NAME{"LayernormQuantization"}; PluginFieldCollection LayernormQuantizationPluginCreator::mFC{}; -std::vector LayernormQuantizationPluginCreator::mPluginAttributes; +std::vector LayernormQuantizationPluginCreator::mPluginAttributes; LayernormQuantizationPlugin::LayernormQuantizationPlugin( float eps, bool useDiffOfSquares, bool dynamicActivationScaling, nvinfer1::DataType type) @@ -45,7 +45,7 @@ LayernormQuantizationPlugin::LayernormQuantizationPlugin(const void* data, size_ read(d, mUseDiffOfSquares); read(d, mDynActScaling); read(d, mType); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -68,7 +68,7 @@ nvinfer1::DimsExprs LayernormQuantizationPlugin::getOutputDimensions( // Dynamic scaling output if enabled try { - PLUGIN_ASSERT(outputIndex == 1); + TLLM_CHECK(outputIndex == 1); DimsExprs ret; ret.nbDims = inputs[0].nbDims; for (int di = 0; di < ret.nbDims - 1; ++di) @@ -89,8 +89,8 @@ bool LayernormQuantizationPlugin::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept { const int totalPoses = 6 + static_cast(mDynActScaling); - PLUGIN_ASSERT(0 <= pos && pos < totalPoses); - PLUGIN_ASSERT(nbInputs == 4); + TLLM_CHECK(0 <= pos && pos < totalPoses); + TLLM_CHECK(nbInputs == 4); if (pos < nbInputs) { switch (pos) @@ -224,16 +224,6 @@ void LayernormQuantizationPlugin::destroy() noexcept delete this; } -void LayernormQuantizationPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* LayernormQuantizationPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// LayernormQuantizationPluginCreator::LayernormQuantizationPluginCreator() @@ -276,22 +266,22 @@ IPluginV2* LayernormQuantizationPluginCreator::createPlugin(const char* name, co const char* attrName = fields[i].name; if (!strcmp(attrName, "eps")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kFLOAT32); eps = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "dyn_act_scaling")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); dynamicActivationScaling = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "use_diff_of_squares")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); useDiffOfSquares = static_cast(*(static_cast(fields[i].data))); } } @@ -325,13 +315,3 @@ IPluginV2* LayernormQuantizationPluginCreator::deserializePlugin( } return nullptr; } - 
-void LayernormQuantizationPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* LayernormQuantizationPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h b/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h old mode 100755 new mode 100644 index b0720d4b224..5d4361d01c4 --- a/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h +++ b/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef TRT_LAYERNORM_QUANTIZATION_PLUGIN_H -#define TRT_LAYERNORM_QUANTIZATION_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class LayernormQuantizationPlugin : public IPluginV2DynamicExt +class LayernormQuantizationPlugin : public BasePlugin { public: LayernormQuantizationPlugin( @@ -65,8 +62,6 @@ class LayernormQuantizationPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: float mEps; @@ -75,10 +70,9 @@ class LayernormQuantizationPlugin : public IPluginV2DynamicExt nvinfer1::DataType mType; const std::string mLayerName; - std::string mNamespace; }; -class LayernormQuantizationPluginCreator : public IPluginCreator +class LayernormQuantizationPluginCreator : public BaseCreator { public: LayernormQuantizationPluginCreator(); @@ -94,17 +88,9 @@ class LayernormQuantizationPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_LAYERNORM_QUANTIZATION_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.cpp b/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.cpp index 5907621951c..9174ebc7f27 100644 --- a/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.cpp @@ -17,20 +17,20 @@ #include +#include "lookupPlugin.h" #include "tensorrt_llm/kernels/lookupKernels.h" #include "tensorrt_llm/plugins/common/plugin.h" -#include "tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -using nvinfer1::plugin::LookupPluginCreator; -using nvinfer1::plugin::LookupPlugin; +using tensorrt_llm::plugins::LookupPluginCreator; +using tensorrt_llm::plugins::LookupPlugin; static const char* LOOKUP_PLUGIN_VERSION{"1"}; static const char* LOOKUP_PLUGIN_NAME{"Lookup"}; 
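The deserialization constructors in this patch (LookupPlugin below, and the plugins above) all follow the same cursor pattern: a read helper copies one field out of the serialized blob and advances the pointer, and the final TLLM_CHECK(d == a + length) asserts that exactly length bytes were consumed. A self-contained sketch of that pattern follows; readValue/writeValue are illustrative stand-ins for the real read/write helpers declared in the common plugin header.

// Illustrative stand-ins for the read/write cursor helpers used by the plugin
// (de)serialization code in this patch; the real ones come from
// tensorrt_llm/plugins/common/plugin.h.
#include <cassert>
#include <cstddef>
#include <cstring>

template <typename T>
void readValue(const char*& d, T& value)
{
    std::memcpy(&value, d, sizeof(T)); // copy one field out of the blob
    d += sizeof(T);                    // advance the cursor
}

template <typename T>
void writeValue(char*& d, const T& value)
{
    std::memcpy(d, &value, sizeof(T));
    d += sizeof(T);
}

struct ExamplePluginState
{
    int mRank{};
    float mEps{};

    std::size_t serializedSize() const { return sizeof(mRank) + sizeof(mEps); }

    void deserialize(const void* data, std::size_t length)
    {
        const char* d = reinterpret_cast<const char*>(data);
        const char* a = d;
        readValue(d, mRank);
        readValue(d, mEps);
        // Same invariant the plugins enforce with TLLM_CHECK(d == a + length).
        assert(d == a + length);
    }
};

The serialize() side mirrors this with the write helper, so getSerializationSize() and the final length check keep the two directions in sync.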
PluginFieldCollection LookupPluginCreator::mFC{}; -std::vector LookupPluginCreator::mPluginAttributes; +std::vector LookupPluginCreator::mPluginAttributes; LookupPlugin::LookupPlugin(nvinfer1::DataType type, int rank) : mType(type) @@ -44,7 +44,7 @@ LookupPlugin::LookupPlugin(const void* data, size_t length) const char *d = reinterpret_cast(data), *a = d; read(d, mType); read(d, mRank); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -61,8 +61,8 @@ nvinfer1::DimsExprs LookupPlugin::getOutputDimensions( { try { - PLUGIN_ASSERT(nbInputs == 2); - PLUGIN_ASSERT(outputIndex == 0); + TLLM_CHECK(nbInputs == 2); + TLLM_CHECK(outputIndex == 0); DimsExprs ret; const int nbDimsInput = inputs[0].nbDims; const int nbDimsWeight = inputs[1].nbDims; @@ -157,7 +157,7 @@ int LookupPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvi nvinfer1::DataType LookupPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(index == 0); return inputTypes[1]; } @@ -204,16 +204,6 @@ void LookupPlugin::serialize(void* buffer) const noexcept void LookupPlugin::terminate() noexcept {} -void LookupPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* LookupPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// LookupPluginCreator::LookupPluginCreator() @@ -252,12 +242,12 @@ IPluginV2* LookupPluginCreator::createPlugin(const char* name, const PluginField const char* attrName = fields[i].name; if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "rank")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); rank = static_cast(*(static_cast(fields[i].data))); } } @@ -291,13 +281,3 @@ IPluginV2* LookupPluginCreator::deserializePlugin( } return nullptr; } - -void LookupPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* LookupPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h b/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h index 681cf0057af..035264715aa 100644 --- a/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h +++ b/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h @@ -14,21 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_LOOKUP_PLUGIN_H -#define TRT_LOOKUP_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class LookupPlugin : public IPluginV2DynamicExt +class LookupPlugin : public BasePlugin { public: LookupPlugin() = delete; @@ -65,18 +62,15 @@ class LookupPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; nvinfer1::DataType mType; int mRank; }; -class LookupPluginCreator : public IPluginCreator +class LookupPluginCreator : public BaseCreator { public: LookupPluginCreator(); @@ -92,17 +86,9 @@ class LookupPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_LOOKUP_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp index fac13e9d6d0..8a1bca755bf 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h" +#include "allgatherPlugin.h" using namespace nvinfer1; -using nvinfer1::plugin::AllgatherPluginCreator; -using nvinfer1::plugin::AllgatherPlugin; +using tensorrt_llm::plugins::AllgatherPluginCreator; +using tensorrt_llm::plugins::AllgatherPlugin; static const char* ALLGATHER_PLUGIN_VERSION{"1"}; static const char* ALLGATHER_PLUGIN_NAME{"AllGather"}; PluginFieldCollection AllgatherPluginCreator::mFC{}; -std::vector AllgatherPluginCreator::mPluginAttributes; +std::vector AllgatherPluginCreator::mPluginAttributes; AllgatherPlugin::AllgatherPlugin(std::set group, nvinfer1::DataType type) : mGroup(group) @@ -43,7 +43,7 @@ AllgatherPlugin::AllgatherPlugin(const void* data, size_t length) read(d, groupItem); mGroup.insert(groupItem); } - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -200,16 +200,6 @@ void AllgatherPlugin::destroy() noexcept delete this; } -void AllgatherPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* AllgatherPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// AllgatherPluginCreator::AllgatherPluginCreator() @@ -248,7 +238,7 @@ IPluginV2* AllgatherPluginCreator::createPlugin(const char* name, const PluginFi const char* attrName = fields[i].name; if (!strcmp(attrName, "group")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); const auto* r = static_cast(fields[i].data); for (int j = 0; j < fields[i].length; ++j) { @@ -258,7 +248,7 @@ IPluginV2* AllgatherPluginCreator::createPlugin(const char* name, const PluginFi } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } @@ -293,13 +283,3 @@ IPluginV2* AllgatherPluginCreator::deserializePlugin( } return nullptr; } - -void AllgatherPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* AllgatherPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h index 5366a44382a..923f4a2cdb0 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_ALLGATHER_PLUGIN_H -#define TRT_ALLGATHER_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -25,12 +24,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class AllgatherPlugin : public IPluginV2DynamicExt +class AllgatherPlugin : public BasePlugin { public: AllgatherPlugin(std::set group, nvinfer1::DataType type); @@ -65,17 +62,14 @@ class AllgatherPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; std::set mGroup; nvinfer1::DataType mType; }; -class AllgatherPluginCreator : public IPluginCreator +class AllgatherPluginCreator : public BaseCreator { public: AllgatherPluginCreator(); @@ -91,17 +85,9 @@ class AllgatherPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_ALLGATHER_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp index d754e0591b3..546848161b2 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h" +#include "allreducePlugin.h" using namespace nvinfer1; -using nvinfer1::plugin::AllreducePluginCreator; -using nvinfer1::plugin::AllreducePlugin; +using tensorrt_llm::plugins::AllreducePluginCreator; +using tensorrt_llm::plugins::AllreducePlugin; static const char* ALLREDUCE_PLUGIN_VERSION{"1"}; static const char* ALLREDUCE_PLUGIN_NAME{"AllReduce"}; PluginFieldCollection AllreducePluginCreator::mFC{}; -std::vector AllreducePluginCreator::mPluginAttributes; +std::vector AllreducePluginCreator::mPluginAttributes; AllreducePlugin::AllreducePlugin(std::set group, nvinfer1::DataType type) : mGroup(group) @@ -43,7 +43,7 @@ AllreducePlugin::AllreducePlugin(const void* data, size_t length) read(d, groupItem); mGroup.insert(groupItem); } - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -196,16 +196,6 @@ void AllreducePlugin::destroy() noexcept delete this; } -void AllreducePlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* AllreducePlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// AllreducePluginCreator::AllreducePluginCreator() @@ -244,7 +234,7 @@ IPluginV2* AllreducePluginCreator::createPlugin(const char* name, const PluginFi const char* attrName = fields[i].name; if (!strcmp(attrName, "group")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); const auto* r = static_cast(fields[i].data); for (int j = 0; j < fields[i].length; ++j) { @@ -254,7 +244,7 @@ IPluginV2* AllreducePluginCreator::createPlugin(const char* name, const PluginFi } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } @@ -289,13 +279,3 @@ IPluginV2* AllreducePluginCreator::deserializePlugin( } return nullptr; } - -void AllreducePluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* AllreducePluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h index 87bc1495588..a5b6e798604 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_ALLREDUCE_PLUGIN_H -#define TRT_ALLREDUCE_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -25,12 +24,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class AllreducePlugin : public IPluginV2DynamicExt +class AllreducePlugin : public BasePlugin { public: AllreducePlugin(std::set group, nvinfer1::DataType type); @@ -65,17 +62,14 @@ class AllreducePlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; std::set mGroup; nvinfer1::DataType mType; }; -class AllreducePluginCreator : public IPluginCreator +class AllreducePluginCreator : public BaseCreator { public: AllreducePluginCreator(); @@ -91,17 +85,9 @@ class AllreducePluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_ALLREDUCE_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.cpp index f553cbdfe9a..5a2f8036db4 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.cpp @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/ncclPlugin/recvPlugin.h" +#include "recvPlugin.h" using namespace nvinfer1; -using nvinfer1::plugin::RecvPluginCreator; -using nvinfer1::plugin::RecvPlugin; +using tensorrt_llm::plugins::RecvPluginCreator; +using tensorrt_llm::plugins::RecvPlugin; static const char* RECV_PLUGIN_VERSION{"1"}; static const char* RECV_PLUGIN_NAME{"Recv"}; PluginFieldCollection RecvPluginCreator::mFC{}; -std::vector RecvPluginCreator::mPluginAttributes; +std::vector RecvPluginCreator::mPluginAttributes; RecvPlugin::RecvPlugin(int srcRank, nvinfer1::DataType type) : mSrcRank(srcRank) @@ -37,7 +37,7 @@ RecvPlugin::RecvPlugin(const void* data, size_t length) const char *d = reinterpret_cast(data), *a = d; read(d, mType); read(d, mSrcRank); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -158,16 +158,6 @@ void RecvPlugin::destroy() noexcept delete this; } -void RecvPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* RecvPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// RecvPluginCreator::RecvPluginCreator() @@ -207,12 +197,12 @@ IPluginV2* RecvPluginCreator::createPlugin(const char* name, const PluginFieldCo const char* attrName = fields[i].name; if (!strcmp(attrName, "src_rank")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); srcRank = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } @@ -246,13 +236,3 @@ IPluginV2* RecvPluginCreator::deserializePlugin(const char* name, const void* se } return nullptr; } - -void RecvPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* RecvPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.h b/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.h index 37be01ffdee..ac0da643dca 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.h +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_RECV_PLUGIN_H -#define TRT_RECV_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class RecvPlugin : public IPluginV2DynamicExt +class RecvPlugin : public BasePlugin { public: RecvPlugin(int srcRank, nvinfer1::DataType type); @@ -64,17 +61,14 @@ class RecvPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: - std::string mNamespace; ncclComm_t mComm; // TODO: (kaiyu) Remove this int mSrcRank; nvinfer1::DataType mType; }; -class RecvPluginCreator : public IPluginCreator +class RecvPluginCreator : public BaseCreator { public: RecvPluginCreator(); @@ -90,17 +84,9 @@ class RecvPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_RECV_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.cpp index 31af9540344..ab74f500277 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.cpp @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/ncclPlugin/sendPlugin.h" +#include "sendPlugin.h" using namespace nvinfer1; -using nvinfer1::plugin::SendPluginCreator; -using nvinfer1::plugin::SendPlugin; +using tensorrt_llm::plugins::SendPluginCreator; +using tensorrt_llm::plugins::SendPlugin; static const char* SEND_PLUGIN_VERSION{"1"}; static const char* SEND_PLUGIN_NAME{"Send"}; PluginFieldCollection SendPluginCreator::mFC{}; -std::vector SendPluginCreator::mPluginAttributes; +std::vector SendPluginCreator::mPluginAttributes; SendPlugin::SendPlugin(int tgtRank, nvinfer1::DataType type) : mTgtRank(tgtRank) @@ -37,7 +37,7 @@ SendPlugin::SendPlugin(const void* data, size_t length) const char *d = reinterpret_cast(data), *a = d; read(d, mType); read(d, mTgtRank); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -159,16 +159,6 @@ void SendPlugin::destroy() noexcept delete this; } -void SendPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* SendPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// SendPluginCreator::SendPluginCreator() @@ -208,12 +198,12 @@ IPluginV2* SendPluginCreator::createPlugin(const char* name, const PluginFieldCo const char* attrName = fields[i].name; if (!strcmp(attrName, "tgt_rank")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); tgtRank = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } @@ -247,13 +237,3 @@ IPluginV2* SendPluginCreator::deserializePlugin(const char* name, const void* se } return nullptr; } - -void SendPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* SendPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.h b/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.h index fc46643a5f2..70d3c049be2 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.h +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_SEND_PLUGIN_H -#define TRT_SEND_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class SendPlugin : public IPluginV2DynamicExt +class SendPlugin : public BasePlugin { public: SendPlugin(int tgtRank, nvinfer1::DataType type); @@ -64,17 +61,14 @@ class SendPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: - std::string mNamespace; ncclComm_t mComm; // TODO: (kaiyu) Remove this int mTgtRank; nvinfer1::DataType mType; }; -class SendPluginCreator : public IPluginCreator +class SendPluginCreator : public BaseCreator { public: SendPluginCreator(); @@ -90,17 +84,9 @@ class SendPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_SEND_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.cpp b/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.cpp index 443e8494a6f..7bf06144551 100644 --- a/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.cpp @@ -14,18 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h" +#include "quantizePerTokenPlugin.h" #include "tensorrt_llm/kernels/quantization.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; -using nvinfer1::plugin::QuantizePerTokenPluginCreator; -using nvinfer1::plugin::QuantizePerTokenPlugin; +using tensorrt_llm::plugins::QuantizePerTokenPluginCreator; +using tensorrt_llm::plugins::QuantizePerTokenPlugin; static const char* QUANTIZE_PER_TOKEN_PLUGIN_VERSION{"1"}; static const char* QUANTIZE_PER_TOKEN_PLUGIN_NAME{"QuantizePerToken"}; PluginFieldCollection QuantizePerTokenPluginCreator::mFC{}; -std::vector QuantizePerTokenPluginCreator::mPluginAttributes; +std::vector QuantizePerTokenPluginCreator::mPluginAttributes; QuantizePerTokenPlugin::QuantizePerTokenPlugin() {} @@ -33,7 +33,7 @@ QuantizePerTokenPlugin::QuantizePerTokenPlugin() {} QuantizePerTokenPlugin::QuantizePerTokenPlugin(const void* data, size_t length) { const char *d = reinterpret_cast(data), *a = d; - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -49,8 +49,8 @@ nvinfer1::DimsExprs QuantizePerTokenPlugin::getOutputDimensions( { try { - PLUGIN_ASSERT(nbInputs == 1); - PLUGIN_ASSERT(outputIndex < 2); + TLLM_CHECK(nbInputs == 1); + TLLM_CHECK(outputIndex < 2); if (outputIndex == 0) { // Quantized input @@ -142,8 +142,8 @@ int QuantizePerTokenPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, nvinfer1::DataType QuantizePerTokenPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(nbInputs == 1); - PLUGIN_ASSERT(index < 2); + TLLM_CHECK(nbInputs == 1); + TLLM_CHECK(index < 2); return index == 0 ? nvinfer1::DataType::kINT8 : nvinfer1::DataType::kFLOAT; } @@ -188,16 +188,6 @@ void QuantizePerTokenPlugin::destroy() noexcept delete this; } -void QuantizePerTokenPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* QuantizePerTokenPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// QuantizePerTokenPluginCreator::QuantizePerTokenPluginCreator() @@ -255,13 +245,3 @@ IPluginV2* QuantizePerTokenPluginCreator::deserializePlugin( } return nullptr; } - -void QuantizePerTokenPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* QuantizePerTokenPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h b/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h old mode 100755 new mode 100644 index 641042a3c07..c10f0bc773f --- a/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h +++ b/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h @@ -14,10 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_QUANTIZE_PER_TOKEN_PLUGIN_H -#define TRT_QUANTIZE_PER_TOKEN_PLUGIN_H +#pragma once -#include "NvInferPlugin.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/plugins/common/plugin.h" #include @@ -26,12 +24,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class QuantizePerTokenPlugin : public IPluginV2DynamicExt +class QuantizePerTokenPlugin : public BasePlugin { public: QuantizePerTokenPlugin(); @@ -66,15 +62,12 @@ class QuantizePerTokenPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; }; -class QuantizePerTokenPluginCreator : public IPluginCreator +class QuantizePerTokenPluginCreator : public BaseCreator { public: QuantizePerTokenPluginCreator(); @@ -90,17 +83,9 @@ class QuantizePerTokenPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_QUANTIZE_PER_TOKEN_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp b/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp index 13463fd49b5..27217c59e6c 100644 --- a/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp @@ -14,18 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h" +#include "quantizeTensorPlugin.h" #include "tensorrt_llm/kernels/quantization.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; -using nvinfer1::plugin::QuantizeTensorPluginCreator; -using nvinfer1::plugin::QuantizeTensorPlugin; +using tensorrt_llm::plugins::QuantizeTensorPluginCreator; +using tensorrt_llm::plugins::QuantizeTensorPlugin; static const char* QUANTIZE_TENSOR_PLUGIN_VERSION{"1"}; static const char* QUANTIZE_TENSOR_PLUGIN_NAME{"QuantizeTensor"}; PluginFieldCollection QuantizeTensorPluginCreator::mFC{}; -std::vector QuantizeTensorPluginCreator::mPluginAttributes; +std::vector QuantizeTensorPluginCreator::mPluginAttributes; QuantizeTensorPlugin::QuantizeTensorPlugin() {} @@ -33,7 +33,7 @@ QuantizeTensorPlugin::QuantizeTensorPlugin() {} QuantizeTensorPlugin::QuantizeTensorPlugin(const void* data, size_t length) { const char *d = reinterpret_cast(data), *a = d; - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -47,8 +47,8 @@ nvinfer1::DimsExprs QuantizeTensorPlugin::getOutputDimensions( { try { - PLUGIN_ASSERT(nbInputs == 2); - PLUGIN_ASSERT(outputIndex < 1); + TLLM_CHECK(nbInputs == 2); + TLLM_CHECK(outputIndex < 1); // Quantized input return inputs[0]; } @@ -76,7 +76,7 @@ bool QuantizeTensorPlugin::supportsFormatCombination( return inOut[pos].type == nvinfer1::DataType::kINT8 && inOut[pos].format == TensorFormat::kLINEAR; default: // Never should be here - PLUGIN_ASSERT(false); + TLLM_CHECK(false); return false; } } @@ -126,8 +126,8 @@ int QuantizeTensorPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, nvinfer1::DataType QuantizeTensorPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(nbInputs == 2); - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(nbInputs == 2); + TLLM_CHECK(index == 0); return nvinfer1::DataType::kINT8; } @@ -175,16 +175,6 @@ void QuantizeTensorPlugin::destroy() noexcept delete this; } -void QuantizeTensorPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* QuantizeTensorPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// QuantizeTensorPluginCreator::QuantizeTensorPluginCreator() @@ -242,13 +232,3 @@ IPluginV2* QuantizeTensorPluginCreator::deserializePlugin( } return nullptr; } - -void QuantizeTensorPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* QuantizeTensorPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h b/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h old mode 100755 new mode 100644 index d0369f22f30..ec1d33785a4 --- a/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h +++ b/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h @@ -14,10 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_QUANTIZE_TENSOR_PLUGIN_H -#define TRT_QUANTIZE_TENSOR_PLUGIN_H +#pragma once -#include "NvInferPlugin.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/plugins/common/plugin.h" #include @@ -26,12 +24,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class QuantizeTensorPlugin : public IPluginV2DynamicExt +class QuantizeTensorPlugin : public BasePlugin { public: QuantizeTensorPlugin(); @@ -66,16 +62,13 @@ class QuantizeTensorPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; cudaDeviceProp mProp; }; -class QuantizeTensorPluginCreator : public IPluginCreator +class QuantizeTensorPluginCreator : public BaseCreator { public: QuantizeTensorPluginCreator(); @@ -91,17 +84,9 @@ class QuantizeTensorPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_QUANTIZE_TENSOR_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.cpp b/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.cpp index 3c1d590fadb..f2afe4157bf 100644 --- a/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.cpp @@ -14,20 +14,20 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h" +#include "rmsnormPlugin/rmsnormPlugin.h" #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/kernels/rmsnormKernels.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -using nvinfer1::plugin::RmsnormPluginCreator; -using nvinfer1::plugin::RmsnormPlugin; +using tensorrt_llm::plugins::RmsnormPluginCreator; +using tensorrt_llm::plugins::RmsnormPlugin; static const char* RMSNORM_PLUGIN_VERSION{"1"}; static const char* RMSNORM_PLUGIN_NAME{"Rmsnorm"}; PluginFieldCollection RmsnormPluginCreator::mFC{}; -std::vector RmsnormPluginCreator::mPluginAttributes; +std::vector RmsnormPluginCreator::mPluginAttributes; RmsnormPlugin::RmsnormPlugin(float eps, nvinfer1::DataType type) : mEps(eps) @@ -43,7 +43,7 @@ RmsnormPlugin::RmsnormPlugin(const void* data, size_t length) const char *d = reinterpret_cast(data), *a = d; read(d, mEps); read(d, mType); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); TLLM_CHECK_WITH_INFO((getSMVersion() >= 80) || (mType != DataType::kBF16), "Unsupported data type"); } @@ -64,7 +64,7 @@ nvinfer1::DimsExprs RmsnormPlugin::getOutputDimensions( bool RmsnormPlugin::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept { - PLUGIN_ASSERT(0 <= pos && pos < 5); + TLLM_CHECK(0 <= pos && pos < 5); return (inOut[pos].type == mType) && (inOut[pos].format == TensorFormat::kLINEAR); } @@ -173,16 +173,6 @@ void RmsnormPlugin::destroy() noexcept delete this; } -void RmsnormPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* RmsnormPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// RmsnormPluginCreator::RmsnormPluginCreator() @@ -221,12 +211,12 @@ IPluginV2* RmsnormPluginCreator::createPlugin(const char* name, const PluginFiel const char* attrName = fields[i].name; if (!strcmp(attrName, "eps")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kFLOAT32); eps = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } @@ -260,13 +250,3 @@ IPluginV2* RmsnormPluginCreator::deserializePlugin( } return nullptr; } - -void RmsnormPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* RmsnormPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h b/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h index ca2f6bb5c35..130886127ec 100644 --- a/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h +++ b/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_RMSNORM_PLUGIN_H -#define TRT_RMSNORM_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class RmsnormPlugin : public IPluginV2DynamicExt +class RmsnormPlugin : public BasePlugin { public: RmsnormPlugin(float eps, nvinfer1::DataType type); @@ -64,18 +61,15 @@ class RmsnormPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: float mEps; nvinfer1::DataType mType; const std::string mLayerName; - std::string mNamespace; }; -class RmsnormPluginCreator : public IPluginCreator +class RmsnormPluginCreator : public BaseCreator { public: RmsnormPluginCreator(); @@ -91,17 +85,9 @@ class RmsnormPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_RMSNORM_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.cpp b/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.cpp index b59d9b167ba..c7466d73819 100644 --- a/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.cpp @@ -14,19 +14,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
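The *Quantization plugins' getOutputDimensions (LayernormQuantizationPlugin above, RmsnormQuantizationPlugin below) report the optional dynamic-scaling output as the input activation shape with the last dimension reduced to a single per-token scale. A small sketch of that shape computation; the helper name is illustrative, and collapsing the last dimension to a constant 1 is an assumption consistent with per-token scaling rather than a quote of the actual implementation.

// Illustrative helper mirroring the dynamic-activation-scaling output shape
// computed by the quantization plugins' getOutputDimensions.
#include <NvInferPlugin.h>

static nvinfer1::DimsExprs perTokenScaleDims(
    const nvinfer1::DimsExprs& activation, nvinfer1::IExprBuilder& exprBuilder)
{
    nvinfer1::DimsExprs ret;
    ret.nbDims = activation.nbDims;
    for (int di = 0; di < ret.nbDims - 1; ++di)
    {
        ret.d[di] = activation.d[di]; // keep batch/sequence dimensions
    }
    ret.d[ret.nbDims - 1] = exprBuilder.constant(1); // one scale per token
    return ret;
}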
*/ -#include "tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h" +#include "rmsnormQuantizationPlugin.h" #include "tensorrt_llm/kernels/rmsnormKernels.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -using nvinfer1::plugin::RmsnormQuantizationPluginCreator; -using nvinfer1::plugin::RmsnormQuantizationPlugin; +using tensorrt_llm::plugins::RmsnormQuantizationPluginCreator; +using tensorrt_llm::plugins::RmsnormQuantizationPlugin; static const char* RMSNORM_QUANTIZATION_PLUGIN_VERSION{"1"}; static const char* RMSNORM_QUANTIZATION_PLUGIN_NAME{"RmsnormQuantization"}; PluginFieldCollection RmsnormQuantizationPluginCreator::mFC{}; -std::vector RmsnormQuantizationPluginCreator::mPluginAttributes; +std::vector RmsnormQuantizationPluginCreator::mPluginAttributes; RmsnormQuantizationPlugin::RmsnormQuantizationPlugin(float eps, bool dynamicActivationScaling, nvinfer1::DataType type) : mEps(eps) @@ -42,7 +42,7 @@ RmsnormQuantizationPlugin::RmsnormQuantizationPlugin(const void* data, size_t le read(d, mEps); read(d, mDynActScaling); read(d, mType); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -65,7 +65,7 @@ nvinfer1::DimsExprs RmsnormQuantizationPlugin::getOutputDimensions( // Dynamic scaling output if enabled try { - PLUGIN_ASSERT(outputIndex == 1); + TLLM_CHECK(outputIndex == 1); DimsExprs ret; ret.nbDims = inputs[0].nbDims; for (int di = 0; di < ret.nbDims - 1; ++di) @@ -86,8 +86,8 @@ bool RmsnormQuantizationPlugin::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept { const int totalPoses = 6 + static_cast(mDynActScaling); - PLUGIN_ASSERT(0 <= pos && pos < totalPoses); - PLUGIN_ASSERT(nbInputs == 4); + TLLM_CHECK(0 <= pos && pos < totalPoses); + TLLM_CHECK(nbInputs == 4); if (pos < nbInputs) { switch (pos) @@ -218,16 +218,6 @@ void RmsnormQuantizationPlugin::destroy() noexcept delete this; } -void RmsnormQuantizationPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* RmsnormQuantizationPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// RmsnormQuantizationPluginCreator::RmsnormQuantizationPluginCreator() @@ -268,17 +258,17 @@ IPluginV2* RmsnormQuantizationPluginCreator::createPlugin(const char* name, cons const char* attrName = fields[i].name; if (!strcmp(attrName, "eps")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kFLOAT32); eps = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "dyn_act_scaling")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); dynamicActivationScaling = static_cast(*(static_cast(fields[i].data))); } } @@ -312,13 +302,3 @@ IPluginV2* RmsnormQuantizationPluginCreator::deserializePlugin( } return nullptr; } - -void RmsnormQuantizationPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* RmsnormQuantizationPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git 
a/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h b/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h index aa08956dec7..761b86cc172 100644 --- a/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h +++ b/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef TRT_RMSNORM_QUANTIZATION_PLUGIN_H -#define TRT_RMSNORM_QUANTIZATION_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class RmsnormQuantizationPlugin : public IPluginV2DynamicExt +class RmsnormQuantizationPlugin : public BasePlugin { public: RmsnormQuantizationPlugin(float eps, bool dynamicActivationScaling, nvinfer1::DataType type); @@ -64,8 +61,6 @@ class RmsnormQuantizationPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: float mEps; @@ -73,10 +68,9 @@ class RmsnormQuantizationPlugin : public IPluginV2DynamicExt nvinfer1::DataType mType; const std::string mLayerName; - std::string mNamespace; }; -class RmsnormQuantizationPluginCreator : public IPluginCreator +class RmsnormQuantizationPluginCreator : public BaseCreator { public: RmsnormQuantizationPluginCreator(); @@ -92,17 +86,9 @@ class RmsnormQuantizationPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_RMSNORM_QUANTIZATION_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp b/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp index b684aa2707e..b0731cd63bb 100644 --- a/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp @@ -14,28 +14,68 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h" +#include "smoothQuantGemmPlugin.h" #include using namespace nvinfer1; using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels::cutlass_kernels; -using nvinfer1::plugin::SmoothQuantGemmPluginCreator; -using nvinfer1::plugin::SmoothQuantGemmPlugin; +using tensorrt_llm::plugins::SmoothQuantGemmPluginCreator; +using tensorrt_llm::plugins::SmoothQuantGemmPlugin; +using tensorrt_llm::plugins::SmoothQuantGemmPluginProfiler; +using tensorrt_llm::plugins::read; +using tensorrt_llm::plugins::write; static const char* SQ_GEMM_PLUGIN_VERSION{"1"}; static const char* SQ_GEMM_PLUGIN_NAME{"SmoothQuantGemm"}; PluginFieldCollection SmoothQuantGemmPluginCreator::mFC{}; -std::vector SmoothQuantGemmPluginCreator::mPluginAttributes; +std::vector SmoothQuantGemmPluginCreator::mPluginAttributes; -SmoothQuantGemmPlugin::SmoothQuantGemmPlugin(QuantMode quantMode, nvinfer1::DataType type) +void SmoothQuantGemmPluginProfiler::runTactic(int m, int n, int k, const SmoothQuantGemmPluginProfiler::Config& tactic, + char* workspace, const cudaStream_t& stream) +{ + int8_t* aTmp = reinterpret_cast(workspace); + int8_t* bTmp = nextWorkspacePtr(aTmp, m * k * sizeof(int8_t)); + void* cTmp = reinterpret_cast(nextWorkspacePtr(bTmp, n * k * sizeof(int8_t))); + float* alphaRowTmp = reinterpret_cast( + nextWorkspacePtr(reinterpret_cast(cTmp), m * n * (mType == nvinfer1::DataType::kFLOAT ? 4 : 2))); + float* alphaColTmp + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(alphaRowTmp), m * sizeof(float))); + char* workspaceTmp + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(alphaColTmp), n * sizeof(float))); + + const int wsSize = mRunner->getWorkspaceSize(m, n, k); + + mRunner->gemm( + aTmp, bTmp, mQuantMode, alphaColTmp, alphaRowTmp, cTmp, m, n, k, tactic, workspaceTmp, wsSize, stream); +} + +void SmoothQuantGemmPluginProfiler::computeTmpSize(int maxM, int n, int k) +{ + std::vector workspaces = { + maxM * k * sizeof(int8_t), // A + n * k * sizeof(int8_t), // B + maxM * n * (mType == nvinfer1::DataType::kFLOAT ? 
4u : 2u), // C + maxM * sizeof(float), // alphaRow + n * sizeof(float), // alphaCol + mRunner->getWorkspaceSize(maxM, n, k) // workspace + }; + size_t bytes = calculateTotalWorkspaceSize(workspaces.data(), workspaces.size()); + setTmpWorkspaceSizeInBytes(bytes); +} + +SmoothQuantGemmPlugin::SmoothQuantGemmPlugin( + QuantMode quantMode, nvinfer1::DataType type, const SmoothQuantGemmPlugin::PluginProfilerPtr& pluginProfiler) : mQuantMode(quantMode) + , mPluginProfiler(pluginProfiler) { init(type); } // Parameterized constructor -SmoothQuantGemmPlugin::SmoothQuantGemmPlugin(const void* data, size_t length) +SmoothQuantGemmPlugin::SmoothQuantGemmPlugin( + const void* data, size_t length, const SmoothQuantGemmPlugin::PluginProfilerPtr& pluginProfiler) + : mPluginProfiler(pluginProfiler) { const char *d = reinterpret_cast(data), *a = d; bool perChannelScaling = false, perTokenScaling = false; @@ -43,29 +83,15 @@ SmoothQuantGemmPlugin::SmoothQuantGemmPlugin(const void* data, size_t length) unsigned int quantMode; read(d, quantMode); read(d, type); - read(d, mMinM); - read(d, mMaxM); - read(d, mN); - read(d, mK); - int selectedMapSize; - read(d, selectedMapSize); - perfMapType selectedTacticsMap; - for (int ii = 0; ii < selectedMapSize; ++ii) - { - std::pair config; - read(d, config); - selectedTacticsMap.insert(config); - } + read(d, mDims); + mQuantMode = QuantMode(quantMode); + init(type); - m_sqGemmRunner->setSelectedTactics(selectedTacticsMap); - m_sqGemmRunner->setMaxM(mMaxM); - PLUGIN_ASSERT(d == a + length); -} -void SmoothQuantGemmPlugin::setSelectedTactics(const perfMapType& selectedTacticsMap) -{ - m_sqGemmRunner->setSelectedTactics(selectedTacticsMap); + mPluginProfiler->deserialize(d, mDims, mGemmId); + + TLLM_CHECK(d == a + length); } void SmoothQuantGemmPlugin::init(nvinfer1::DataType type) @@ -86,36 +112,30 @@ void SmoothQuantGemmPlugin::init(nvinfer1::DataType type) else { // TODO (nkorobov): add bf16 support - PLUGIN_ASSERT(false); + TLLM_THROW("Support for bf16 is missing"); } + + mPluginProfiler->setQuantMode(mQuantMode); + + mGemmId = GemmIdCore(mDims.n, mDims.k, mType); } // IPluginV2DynamicExt Methods nvinfer1::IPluginV2DynamicExt* SmoothQuantGemmPlugin::clone() const noexcept { - auto* plugin = new SmoothQuantGemmPlugin(mQuantMode, mType); - plugin->setPluginNamespace(mNamespace.c_str()); - plugin->setProblemSize(mMinM, mMaxM, mN, mK); - plugin->setSelectedTactics(m_sqGemmRunner->getSelectedTactics()); - plugin->setMaxM(m_sqGemmRunner->getMaxM()); + auto* plugin = new SmoothQuantGemmPlugin(*this); return plugin; } -void SmoothQuantGemmPlugin::setMaxM(int maxM) -{ - mMaxM = maxM; - m_sqGemmRunner->setMaxM(maxM); -} - nvinfer1::DimsExprs SmoothQuantGemmPlugin::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept { try { - PLUGIN_ASSERT(nbInputs == 4); - PLUGIN_ASSERT(outputIndex == 0); + TLLM_CHECK(nbInputs == 4); + TLLM_CHECK(outputIndex == 0); const int nbDimsA = inputs[0].nbDims; - PLUGIN_ASSERT(nbDimsA >= 2); + TLLM_CHECK(nbDimsA >= 2); DimsExprs ret; ret.nbDims = nbDimsA; for (int ii = 0; ii < nbDimsA - 1; ++ii) @@ -167,8 +187,8 @@ bool SmoothQuantGemmPlugin::supportsFormatCombination( void SmoothQuantGemmPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept { - mMinM = std::accumulate(in[0].min.d, in[0].min.d + in[0].min.nbDims - 1, 1, std::multiplies()); - mMaxM = std::accumulate(in[0].max.d, 
in[0].max.d + in[0].max.nbDims - 1, 1, std::multiplies()); + const auto minM = std::accumulate(in[0].min.d, in[0].min.d + in[0].min.nbDims - 1, 1, std::multiplies()); + const auto maxM = std::accumulate(in[0].max.d, in[0].max.d + in[0].max.nbDims - 1, 1, std::multiplies()); const int maxK = in[0].max.d[in[0].max.nbDims - 1]; const int maxN = in[1].max.d[0]; @@ -178,10 +198,13 @@ void SmoothQuantGemmPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorD TLLM_CHECK_WITH_INFO(minN == maxN, "Variable out channels is not allowed"); TLLM_CHECK_WITH_INFO(minK == maxK, "Variable in channels is not allowed"); - mK = maxK; - mN = maxN; + if (!mDims.isInitialized()) + { + mDims = {minM, maxM, maxN, maxK}; + } + mGemmId = {maxN, maxK, mType}; - m_workspaceMaxSize = m_sqGemmRunner->getWorkspaceSize(mMaxM, maxN, maxK); + m_workspaceMaxSize = m_sqGemmRunner->getWorkspaceSize(maxM, maxN, maxK); } size_t SmoothQuantGemmPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, @@ -210,9 +233,11 @@ int SmoothQuantGemmPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const int k = inputDesc[0].dims.d[inputDesc[0].dims.nbDims - 1]; const int wsSize = m_sqGemmRunner->getWorkspaceSize(m, n, k); + const auto& bestTactic = mPluginProfiler->getBestConfig(m, mGemmId); + TLLM_CHECK_WITH_INFO(bestTactic, "No valid SQ GEMM tactic"); m_sqGemmRunner->gemm(reinterpret_cast(inputs[0]), reinterpret_cast(inputs[1]), mQuantMode, reinterpret_cast(inputs[3]), reinterpret_cast(inputs[2]), - reinterpret_cast(outputs[0]), m, n, k, reinterpret_cast(workspace), wsSize, stream); + reinterpret_cast(outputs[0]), m, n, k, *bestTactic, reinterpret_cast(workspace), wsSize, stream); return 0; } @@ -221,7 +246,7 @@ int SmoothQuantGemmPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, nvinfer1::DataType SmoothQuantGemmPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(index == 0); return mType; } @@ -252,14 +277,10 @@ void SmoothQuantGemmPlugin::terminate() noexcept {} size_t SmoothQuantGemmPlugin::getSerializationSize() const noexcept { - const auto& selectedTactics = m_sqGemmRunner->getSelectedTactics(); - return sizeof(unsigned int) + // QuantMode - sizeof(nvinfer1::DataType) + // dtype - 4 * sizeof(int) + // Problem sizes (minM, maxM, N, K) - sizeof(int) + // selected tactics constainer num of elems - selectedTactics.size() - * sizeof( - std::pair); // selected tactics container size + return sizeof(unsigned int) + // QuantMode + sizeof(nvinfer1::DataType) + // dtype + sizeof(mDims) + // Dimensions + mPluginProfiler->getSerializationSize(mGemmId); // selected tactics container size } void SmoothQuantGemmPlugin::serialize(void* buffer) const noexcept @@ -267,16 +288,9 @@ void SmoothQuantGemmPlugin::serialize(void* buffer) const noexcept char *d = static_cast(buffer), *a = d; write(d, mQuantMode.value()); write(d, mType); - write(d, mMinM); - write(d, m_sqGemmRunner->getMaxM()); - write(d, mN); - write(d, mK); - const auto& selectedTacticsMap = m_sqGemmRunner->getSelectedTactics(); - write(d, static_cast(selectedTacticsMap.size())); - for (const auto& pair : selectedTacticsMap) - { - write(d, pair); - } + write(d, mDims); + + mPluginProfiler->serialize(d, mGemmId); assert(d == a + getSerializationSize()); } @@ -286,58 +300,9 @@ void SmoothQuantGemmPlugin::destroy() noexcept delete this; } -void SmoothQuantGemmPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = 
libNamespace; -} - -const char* SmoothQuantGemmPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - -void SmoothQuantGemmPlugin::setProblemSize(int minM, int maxM, int n, int k) -{ - mMinM = minM; - mMaxM = maxM; - mN = n; - mK = k; -} - -void SmoothQuantGemmPlugin::allocateTmpData() -{ - cudaMalloc(&mATmp, mMaxM * mK * sizeof(int8_t)); - cudaMalloc(&mBTmp, mN * mK * sizeof(int8_t)); - cudaMalloc(&mCTmp, mMaxM * mN * (mType == nvinfer1::DataType::kFLOAT ? 4 : 2)); - cudaMalloc(&mAlphaRowTmp, mMaxM * sizeof(float)); - cudaMalloc(&mAlphaColTmp, mN * sizeof(float)); - cudaMalloc(&mWorkspaceTmp, m_sqGemmRunner->getWorkspaceSize(mMaxM, mN, mK)); -} - -void SmoothQuantGemmPlugin::freeTmpData() -{ - cudaFree(mATmp); - cudaFree(mBTmp); - cudaFree(mCTmp); - cudaFree(mAlphaRowTmp); - cudaFree(mAlphaColTmp); - cudaFree(mWorkspaceTmp); -} - void SmoothQuantGemmPlugin::configGemm() { - if (mMaxM == -1 || mMinM == -1 || mN == -1 || mK == -1) - { - return; - } - if (!m_sqGemmRunner->hasSelectedTactics()) - { - allocateTmpData(); - m_sqGemmRunner->profileGemms( - mQuantMode, mMinM, mMaxM, mN, mK, mATmp, mBTmp, mCTmp, mAlphaColTmp, mAlphaRowTmp, mWorkspaceTmp); - m_sqGemmRunner->setMaxM(mMaxM); - freeTmpData(); - } + mPluginProfiler->profileTactics(m_sqGemmRunner->getConfigs(), m_sqGemmRunner, mType, mDims, mGemmId); } /////////////// @@ -379,24 +344,27 @@ IPluginV2* SmoothQuantGemmPluginCreator::createPlugin(const char* name, const Pl const char* attrName = fields[i].name; if (!strcmp(attrName, "has_per_channel_scaling")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); perChannelScaling = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "has_per_token_scaling")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); perTokenScaling = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } try { + // SmoothQuantGemmPluginCreator is unique and shared for an engine generation + // Create plugin profiler with shared tactics map + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ false); QuantMode quantMode = QuantMode::fromDescription(true, true, perTokenScaling, perChannelScaling); - auto* obj = new SmoothQuantGemmPlugin(quantMode, type); + auto* obj = new SmoothQuantGemmPlugin(quantMode, type, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -414,7 +382,9 @@ IPluginV2* SmoothQuantGemmPluginCreator::deserializePlugin( // call SmoothQuantGemmPlugin::destroy() try { - auto* obj = new SmoothQuantGemmPlugin(serialData, serialLength); + // Create plugin profiler with private tactics map which is read from the serialized engine + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ true); + auto* obj = new SmoothQuantGemmPlugin(serialData, serialLength, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -424,13 +394,3 @@ IPluginV2* SmoothQuantGemmPluginCreator::deserializePlugin( } return nullptr; } - -void SmoothQuantGemmPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* SmoothQuantGemmPluginCreator::getPluginNamespace() const 
noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h b/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h index 1011098e3d5..d0a7fba78aa 100644 --- a/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h +++ b/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h @@ -14,12 +14,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef TRT_SMOOTH_QUANT_GEMM_PLUGIN_H -#define TRT_SMOOTH_QUANT_GEMM_PLUGIN_H +#pragma once -#include "NvInferPlugin.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h" +#include "tensorrt_llm/plugins/common/gemmPluginProfiler.h" #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -27,21 +26,43 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { using perfMapType = std::unordered_map; +using SqGemmRunnerPtr = std::shared_ptr; + +class SmoothQuantGemmPluginProfiler : public GemmPluginProfiler +{ +public: + using Config = tensorrt_llm::cutlass_extensions::CutlassGemmConfig; + + void setQuantMode(const tensorrt_llm::common::QuantMode& quantMode) + { + mQuantMode = quantMode; + } + +protected: + void runTactic(int m, int n, int k, const Config& tactic, char* workspace, const cudaStream_t& stream) override; + + void computeTmpSize(int maxM, int n, int k) override; + +private: + tensorrt_llm::common::QuantMode mQuantMode; +}; -class SmoothQuantGemmPlugin : public IPluginV2DynamicExt +class SmoothQuantGemmPlugin : public BasePlugin { public: + using PluginProfilerPtr = std::shared_ptr; + SmoothQuantGemmPlugin() = delete; - SmoothQuantGemmPlugin(tensorrt_llm::common::QuantMode quantMode, nvinfer1::DataType type); + SmoothQuantGemmPlugin( + tensorrt_llm::common::QuantMode quantMode, nvinfer1::DataType type, const PluginProfilerPtr& pluginProfiler); - SmoothQuantGemmPlugin(const void* data, size_t length); + SmoothQuantGemmPlugin(const void* data, size_t length, const PluginProfilerPtr& pluginProfiler); ~SmoothQuantGemmPlugin() override = default; @@ -71,43 +92,28 @@ class SmoothQuantGemmPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: void init(nvinfer1::DataType type); - void setProblemSize(int minM, int maxM, int n, int k); void configGemm(); - void setSelectedTactics(const perfMapType& selected_tactics_map); - void setMaxM(int maxM); - - void allocateTmpData(); - void freeTmpData(); private: const std::string mLayerName; - std::string mNamespace; - std::shared_ptr m_sqGemmRunner; + SqGemmRunnerPtr m_sqGemmRunner; tensorrt_llm::common::QuantMode mQuantMode; int m_workspaceMaxSize; - int mMaxM{-1}; - int mMinM{-1}; - int mN{-1}; - int mK{-1}; - - int8_t* mATmp{nullptr}; - int8_t* mBTmp{nullptr}; - void* mCTmp{nullptr}; - float* mAlphaRowTmp{nullptr}; - float* mAlphaColTmp{nullptr}; - char* mWorkspaceTmp{nullptr}; + + GemmDims mDims{}; + GemmIdCore mGemmId{}; + + PluginProfilerPtr mPluginProfiler; nvinfer1::DataType mType; }; -class SmoothQuantGemmPluginCreator : public IPluginCreator +class SmoothQuantGemmPluginCreator : public BaseCreator { public: SmoothQuantGemmPluginCreator(); @@ -123,17 
+129,10 @@ class SmoothQuantGemmPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + GemmPluginProfilerManager gemmPluginProfileManager; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_SMOOTH_QUANT_GEMM_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp b/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp index ae03dbd0334..a01458f8282 100644 --- a/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp @@ -14,27 +14,89 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h" +#include "weightOnlyGroupwiseQuantMatmulPlugin.h" using namespace nvinfer1; using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels::cutlass_kernels; -using nvinfer1::plugin::WeightOnlyGroupwiseQuantMatmulPluginCreator; -using nvinfer1::plugin::WeightOnlyGroupwiseQuantMatmulPlugin; +using tensorrt_llm::plugins::WeightOnlyGroupwiseQuantMatmulPluginCreator; +using tensorrt_llm::plugins::WeightOnlyGroupwiseQuantMatmulPlugin; +using tensorrt_llm::plugins::WeightOnlyGroupwiseQuantGemmPluginProfiler; + +// Flags for indicating whether the corresponding inputs are applied in mQuantAlgo +// mQuantAlgo = pre_quant_scale * PRE_SCALE_QUANT + zero * ZER0 + bias * BIAS +// Here pre_quant_scale, zero and bias are boolean type +static constexpr int BIAS = int(1) << 0; +static constexpr int ZER0 = int(1) << 1; +static constexpr int PRE_SCALE_QUANT = int(1) << 2; +using tensorrt_llm::plugins::read; +using tensorrt_llm::plugins::write; static const char* WOQ_GROUPWISE_MATMUL_PLUGIN_VERSION{"1"}; static const char* WOQ_GROUPWISE_MATMUL_PLUGIN_NAME{"WeightOnlyGroupwiseQuantMatmul"}; PluginFieldCollection WeightOnlyGroupwiseQuantMatmulPluginCreator::mFC{}; -std::vector WeightOnlyGroupwiseQuantMatmulPluginCreator::mPluginAttributes; +std::vector WeightOnlyGroupwiseQuantMatmulPluginCreator::mPluginAttributes; -WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin( - nvinfer1::DataType type, int quant_algo, int group_size) +void WeightOnlyGroupwiseQuantGemmPluginProfiler::runTactic(int m, int n, int k, + const WeightOnlyGroupwiseQuantGemmPluginProfiler::Config& tactic, char* workspace, const cudaStream_t& stream) +{ + const int originalN = n * 8; + half* actPtr = reinterpret_cast(workspace); + cutlass::uint4b_t* weightPtr = reinterpret_cast( + nextWorkspacePtr(reinterpret_cast(actPtr), m * k * sizeof(half))); + half* inputScalesPtr + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(weightPtr), n * k * sizeof(float))); + half* zerosPtr = reinterpret_cast( + nextWorkspacePtr(reinterpret_cast(inputScalesPtr), k * originalN * sizeof(half) / mGroupSize)); + half* biasesPtr = 
reinterpret_cast( + nextWorkspacePtr(reinterpret_cast(zerosPtr), k * originalN * sizeof(half) / mGroupSize)); + half* outputPtr = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(biasesPtr), m * sizeof(half))); + char* workspacePtr + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(outputPtr), m * originalN * sizeof(half))); + + if ((mQuantAlgo & ZER0) == 0) + { + zerosPtr = nullptr; + } + + if ((mQuantAlgo & BIAS) == 0) + { + biasesPtr = nullptr; + } + + const int wsSize = mRunner->getWorkspaceSize(m, n, k); + + mRunner->gemm(actPtr, weightPtr, inputScalesPtr, zerosPtr, biasesPtr, outputPtr, m, originalN, k, mGroupSize, + tactic, workspacePtr, wsSize, stream); +} + +void WeightOnlyGroupwiseQuantGemmPluginProfiler::computeTmpSize(int maxM, int n, int k) +{ + const int originalN = n * 8; + std::vector workspaces = { + maxM * k * sizeof(half), // A + k * n * sizeof(float), // B + k * originalN * sizeof(half) / mGroupSize, // scales + k * originalN * sizeof(half) / mGroupSize, // zeros + maxM * sizeof(half), // biases + maxM * originalN * sizeof(half), // C + mRunner->getWorkspaceSize(maxM, n, k) // workspace + }; + size_t bytes = calculateTotalWorkspaceSize(workspaces.data(), workspaces.size()); + setTmpWorkspaceSizeInBytes(bytes); +} + +WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin(nvinfer1::DataType type, int quant_algo, + int group_size, const WeightOnlyGroupwiseQuantMatmulPlugin::PluginProfilerPtr& pluginProfiler) + : mPluginProfiler(pluginProfiler) { init(type, quant_algo, group_size); } // Parameterized constructor -WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin(const void* data, size_t length) +WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin( + const void* data, size_t length, const WeightOnlyGroupwiseQuantMatmulPlugin::PluginProfilerPtr& pluginProfiler) + : mPluginProfiler(pluginProfiler) { const char *d = reinterpret_cast(data), *a = d; nvinfer1::DataType type; @@ -43,8 +105,13 @@ WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin(const read(d, type); read(d, quant_algo); read(d, group_size); + read(d, mDims); + init(type, quant_algo, group_size); - PLUGIN_ASSERT(d == a + length); + + mPluginProfiler->deserialize(d, mDims, mGemmId); + + TLLM_CHECK(d == a + length); } void WeightOnlyGroupwiseQuantMatmulPlugin::init(nvinfer1::DataType type, int quant_algo, int group_size) @@ -79,18 +146,28 @@ void WeightOnlyGroupwiseQuantMatmulPlugin::init(nvinfer1::DataType type, int qua } else { - PLUGIN_ASSERT(false); + TLLM_THROW("Unsupported data type"); } + + mPluginProfiler->setQuantAlgo(mQuantAlgo); + mPluginProfiler->setGroupSize(mGroupSize); + + mGemmId = GemmIdCore(mDims.n, mDims.k, mType); } // IPluginV2DynamicExt Methods nvinfer1::IPluginV2DynamicExt* WeightOnlyGroupwiseQuantMatmulPlugin::clone() const noexcept { - auto* plugin = new WeightOnlyGroupwiseQuantMatmulPlugin(mType, mQuantAlgo, mGroupSize); - plugin->setPluginNamespace(mNamespace.c_str()); + auto* plugin = new WeightOnlyGroupwiseQuantMatmulPlugin(*this); return plugin; } +void WeightOnlyGroupwiseQuantMatmulPlugin::configGemm() +{ + mPluginProfiler->profileTactics( + m_weightOnlyGroupwiseGemmRunner->getConfigs(), m_weightOnlyGroupwiseGemmRunner, mType, mDims, mGemmId); +} + nvinfer1::DimsExprs WeightOnlyGroupwiseQuantMatmulPlugin::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept { @@ -105,12 +182,12 @@ nvinfer1::DimsExprs 
WeightOnlyGroupwiseQuantMatmulPlugin::getOutputDimensions( try { - PLUGIN_ASSERT(nbInputs == mBiasesInputIdx + 1); - PLUGIN_ASSERT(outputIndex == 0); + TLLM_CHECK(nbInputs == mBiasesInputIdx + 1); + TLLM_CHECK(outputIndex == 0); const int nbDimsA = inputs[0].nbDims; const int nbDimsB = inputs[mWeightInputIdx].nbDims; - PLUGIN_ASSERT(nbDimsA >= 2); - PLUGIN_ASSERT(nbDimsB == 2); + TLLM_CHECK(nbDimsA >= 2); + TLLM_CHECK(nbDimsB == 2); DimsExprs ret; ret.nbDims = nbDimsA; for (int ii = 0; ii < nbDimsA - 1; ++ii) @@ -157,14 +234,22 @@ bool WeightOnlyGroupwiseQuantMatmulPlugin::supportsFormatCombination( void WeightOnlyGroupwiseQuantMatmulPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept { - int maxM = 1; - for (int ii = 0; ii < in[0].max.nbDims - 1; ++ii) - { - maxM *= in[0].max.d[ii]; - } + const auto minM = std::accumulate(in[0].min.d, in[0].min.d + in[0].min.nbDims - 1, 1, std::multiplies()); + const auto maxM = std::accumulate(in[0].max.d, in[0].max.d + in[0].max.nbDims - 1, 1, std::multiplies()); + const int maxK = in[0].max.d[in[0].max.nbDims - 1]; // int32 packed int4 elements const int maxN = in[mWeightInputIdx].max.d[1] * 8; + + const auto K = maxK; + const auto N = maxN / 8; + + if (!mDims.isInitialized()) + { + mDims = {minM, maxM, N, K}; + } + mGemmId = {N, K, mType}; + int smoothedActSize = maxM * maxK * (in[0].desc.type == nvinfer1::DataType::kFLOAT ? 4 : 2); m_workspaceMaxSize = smoothedActSize + m_weightOnlyGroupwiseGemmRunner->getWorkspaceSize(maxM, maxN, maxK); } @@ -212,13 +297,17 @@ int WeightOnlyGroupwiseQuantMatmulPlugin::enqueue(const nvinfer1::PluginTensorDe if (mType == nvinfer1::DataType::kHALF) { - if (m < SMALL_M_FAST_PATH) + if (m < SMALL_M_FAST_PATH && mSM >= 75) { // Use CUDA kernels for small batch size - tensorrt_llm::kernels::groupwise_weight_only_matmul_i2f_launcher( - reinterpret_cast(inputs[mWeightInputIdx]), + // The CUDA kernel is designed for ColumnMajorTileInterleave weight layout used in fpAIntB cutlass kernel + // when sm >= 75 and the preprocessing of cutlass on sm70 does not interleave the weights. 
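The small-batch branch here (and its counterpart in the per-channel plugin further down) chooses between the hand-written batched GEMV kernel and the profiled CUTLASS tactic. A minimal sketch of that decision follows, assuming the same semantics as the plugin's SMALL_M_FAST_PATH member and cached getSMVersion() result; the helper name and the threshold argument are illustrative and not part of the patch, and the actual WeightOnlyParams pack and launcher call continue immediately below.

// Sketch only: mirrors the "m < SMALL_M_FAST_PATH && mSM >= 75" test used by the
// weight-only plugins in this patch. smallMFastPath stands in for the plugin's
// constant and smVersion for the cached getSMVersion() result.
#include <cstdio>

static bool useBatchedGemvFastPath(int m, int smVersion, int smallMFastPath)
{
    // The batched GEMV kernel assumes the ColumnMajorTileInterleave weight layout
    // produced by the CUTLASS preprocessing on SM >= 75, so older architectures
    // always fall back to the CUTLASS GEMM with the profiled tactic.
    return m < smallMFastPath && smVersion >= 75;
}

int main()
{
    std::printf("m=1,sm80 -> %d\n", useBatchedGemvFastPath(1, 80, 5));  // fast path
    std::printf("m=64,sm80 -> %d\n", useBatchedGemvFastPath(64, 80, 5)); // CUTLASS
    std::printf("m=1,sm70 -> %d\n", useBatchedGemvFastPath(1, 70, 5));  // CUTLASS
    return 0;
}

Keeping the predicate in one place makes it clear that SM 70 always takes the CUTLASS path, which is exactly what the comment above states.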
+ tensorrt_llm::kernels::WeightOnlyParams params{reinterpret_cast(inputs[mWeightInputIdx]), reinterpret_cast(inputs[mScalesInputIdx]), zeros_ptr, act_ptr, biases_ptr, - reinterpret_cast(outputs[0]), m, n * 8, k, mGroupSize, &stream); + reinterpret_cast(outputs[0]), m, n * 8, k, mGroupSize}; + tensorrt_llm::kernels::weight_only_batched_gemv_launcher(tensorrt_llm::kernels::WeightOnlyQuantType::Int4b, + tensorrt_llm::kernels::WeightOnlyType::GroupWise, + tensorrt_llm::kernels::WeightOnlyActivationType::Identity, params, stream); } else { @@ -227,10 +316,12 @@ int WeightOnlyGroupwiseQuantMatmulPlugin::enqueue(const nvinfer1::PluginTensorDe int32_t* weight_ptr = const_cast(reinterpret_cast(inputs[mWeightInputIdx])); + const auto& bestTactic = mPluginProfiler->getBestConfig(m, mGemmId); + TLLM_CHECK_WITH_INFO(bestTactic, "No valid SQ GEMM tactic"); m_weightOnlyGroupwiseGemmRunner->gemm(act_ptr, reinterpret_cast(weight_ptr), reinterpret_cast(inputs[mScalesInputIdx]), zeros_ptr, biases_ptr, - reinterpret_cast(outputs[0]), m, n * 8, k, mGroupSize, - reinterpret_cast(workspace + m * k * sizeof(half)), ws_bytes, stream); + reinterpret_cast(outputs[0]), m, n * 8, k, mGroupSize, *bestTactic, + reinterpret_cast(workspace) + m * k * sizeof(half), ws_bytes, stream); } } else @@ -245,7 +336,7 @@ int WeightOnlyGroupwiseQuantMatmulPlugin::enqueue(const nvinfer1::PluginTensorDe nvinfer1::DataType WeightOnlyGroupwiseQuantMatmulPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(index == 0); return mType; } @@ -268,6 +359,7 @@ int WeightOnlyGroupwiseQuantMatmulPlugin::getNbOutputs() const noexcept int WeightOnlyGroupwiseQuantMatmulPlugin::initialize() noexcept { + configGemm(); return 0; } @@ -275,7 +367,11 @@ void WeightOnlyGroupwiseQuantMatmulPlugin::terminate() noexcept {} size_t WeightOnlyGroupwiseQuantMatmulPlugin::getSerializationSize() const noexcept { - return 2 * sizeof(int) + sizeof(nvinfer1::DataType); + return sizeof(int) + // mQuantAlgo + sizeof(int) + // mGroupSize + sizeof(nvinfer1::DataType) + // mType + sizeof(mDims) + // Dimensions + mPluginProfiler->getSerializationSize(mGemmId); // selected tactics container size } void WeightOnlyGroupwiseQuantMatmulPlugin::serialize(void* buffer) const noexcept @@ -284,6 +380,9 @@ void WeightOnlyGroupwiseQuantMatmulPlugin::serialize(void* buffer) const noexcep write(d, mType); write(d, mQuantAlgo); write(d, mGroupSize); + write(d, mDims); + + mPluginProfiler->serialize(d, mGemmId); assert(d == a + getSerializationSize()); } @@ -293,16 +392,6 @@ void WeightOnlyGroupwiseQuantMatmulPlugin::destroy() noexcept delete this; } -void WeightOnlyGroupwiseQuantMatmulPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* WeightOnlyGroupwiseQuantMatmulPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// WeightOnlyGroupwiseQuantMatmulPluginCreator::WeightOnlyGroupwiseQuantMatmulPluginCreator() @@ -344,23 +433,26 @@ IPluginV2* WeightOnlyGroupwiseQuantMatmulPluginCreator::createPlugin( const char* attrName = fields[i].name; if (!strcmp(attrName, "quant_algo")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); QuantAlgo = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "group_size")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == 
PluginFieldType::kINT32); GroupSize = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } try { - auto* obj = new WeightOnlyGroupwiseQuantMatmulPlugin(type, QuantAlgo, GroupSize); + // WeightOnlyGroupwiseQuantMatmulPluginCreator is unique and shared for an engine generation + // Create plugin profiler with shared tactics map + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ false); + auto* obj = new WeightOnlyGroupwiseQuantMatmulPlugin(type, QuantAlgo, GroupSize, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -378,7 +470,9 @@ IPluginV2* WeightOnlyGroupwiseQuantMatmulPluginCreator::deserializePlugin( // call weightOnlyGroupwiseQuantMatmulPlugin::destroy() try { - auto* obj = new WeightOnlyGroupwiseQuantMatmulPlugin(serialData, serialLength); + // Create plugin profiler with private tactics map which is read from the serialized engine + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ true); + auto* obj = new WeightOnlyGroupwiseQuantMatmulPlugin(serialData, serialLength, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -388,13 +482,3 @@ IPluginV2* WeightOnlyGroupwiseQuantMatmulPluginCreator::deserializePlugin( } return nullptr; } - -void WeightOnlyGroupwiseQuantMatmulPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* WeightOnlyGroupwiseQuantMatmulPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h b/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h index 131e93cae86..52cf6adfa67 100644 --- a/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h +++ b/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h @@ -14,16 +14,17 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_WEIGHT_ONLY_GROUPWISE_QUANT_MATMUL_PLUGIN_H -#define TRT_WEIGHT_ONLY_GROUPWISE_QUANT_MATMUL_PLUGIN_H +#pragma once -#include "NvInferPlugin.h" -#include "cutlass/numeric_types.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" #include "tensorrt_llm/kernels/preQuantScaleKernel.h" -#include "tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.h" +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h" +#include "tensorrt_llm/plugins/common/gemmPluginProfiler.h" #include "tensorrt_llm/plugins/common/plugin.h" + +#include + #include #include #include @@ -34,19 +35,50 @@ // breaking dependencies #include "cutlass/integer_subbyte.h" -namespace nvinfer1 +namespace tensorrt_llm::plugins { -namespace plugin + +using WeightOnlyGemmRunner = tensorrt_llm::kernels::cutlass_kernels::CutlassFpAIntBGemmRunnerInterface; +using WeightOnlyGemmRunnerPtr = std::shared_ptr; + +class WeightOnlyGroupwiseQuantGemmPluginProfiler + : public GemmPluginProfiler { +public: + using Config = tensorrt_llm::cutlass_extensions::CutlassGemmConfig; + + void setQuantAlgo(int quantAlgo) + { + mQuantAlgo = quantAlgo; + } + + void setGroupSize(int groupSize) + { + mGroupSize = groupSize; + } + +protected: + void runTactic(int m, int n, int k, const Config& tactic, char* workspace, const cudaStream_t& stream) override; + + void computeTmpSize(int maxM, int n, int k) override; + +private: + int mQuantAlgo; + int mGroupSize; +}; -class WeightOnlyGroupwiseQuantMatmulPlugin : public IPluginV2DynamicExt +class WeightOnlyGroupwiseQuantMatmulPlugin : public BasePlugin { public: + using PluginProfilerPtr = std::shared_ptr; + WeightOnlyGroupwiseQuantMatmulPlugin() = delete; - WeightOnlyGroupwiseQuantMatmulPlugin(nvinfer1::DataType type, int quant_algo, int group_size); + WeightOnlyGroupwiseQuantMatmulPlugin( + nvinfer1::DataType type, int quant_algo, int group_size, const PluginProfilerPtr& profiler); - WeightOnlyGroupwiseQuantMatmulPlugin(const void* data, size_t length); + WeightOnlyGroupwiseQuantMatmulPlugin(const void* data, size_t length, const PluginProfilerPtr& profiler); ~WeightOnlyGroupwiseQuantMatmulPlugin() override = default; @@ -76,21 +108,20 @@ class WeightOnlyGroupwiseQuantMatmulPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: // group_size: 64, 128 void init(nvinfer1::DataType type, int quant_algo, int group_size); + void configGemm(); + private: const std::string mLayerName; - std::string mNamespace; - std::shared_ptr - m_weightOnlyGroupwiseGemmRunner; + WeightOnlyGemmRunnerPtr m_weightOnlyGroupwiseGemmRunner; int m_workspaceMaxSize; nvinfer1::DataType mType; + int mSM = tensorrt_llm::common::getSMVersion(); // When M is smaller than this value, we trigger a fast path // I.e. a tailored kernel instead of cutlass. 
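The profiler subclasses declared in this header implement the two hooks exercised in the .cpp files above: computeTmpSize() reports one scratch allocation large enough for every fake operand, and runTactic() carves that allocation into per-operand pointers in the same order. The following self-contained sketch shows that partitioning scheme; alignUp, nextSubBuffer and the 16-byte alignment are illustrative stand-ins for the nextWorkspacePtr / calculateTotalWorkspaceSize helpers from the common plugin code, whose exact alignment is not shown in this patch.

#include <cstddef>
#include <cstdint>
#include <vector>

constexpr std::uintptr_t kAlign = 16; // assumed alignment for the sketch

static std::uintptr_t alignUp(std::uintptr_t x)
{
    return (x + kAlign - 1) & ~(kAlign - 1);
}

// computeTmpSize-style pass: sum the per-operand byte counts once, so a single
// scratch buffer can hold every fake operand used while timing a tactic.
static std::size_t totalWorkspaceBytes(const std::vector<std::size_t>& sizes)
{
    std::size_t total = 0;
    for (auto s : sizes)
        total += alignUp(s);
    return total;
}

// runTactic-style pass: carve the scratch buffer into sub-buffers in the same
// order as the size list (activations, weights, scales, ..., GEMM workspace).
static char* nextSubBuffer(char*& cursor, std::size_t bytes)
{
    char* here = cursor;
    cursor += alignUp(bytes);
    return here;
}

int main()
{
    std::vector<std::size_t> sizes = {256 * 1024, 1024 * 1024, 256 * 4096, 256 * 4, 1024 * 4};
    std::vector<char> scratch(totalWorkspaceBytes(sizes));
    char* cursor = scratch.data();
    char* a = nextSubBuffer(cursor, sizes[0]);
    char* b = nextSubBuffer(cursor, sizes[1]);
    (void) a;
    (void) b;
    return 0;
}

The per-operand size lists match the ones built in the runTactic / computeTmpSize implementations earlier in this patch, so a tactic can be timed without touching the plugin's real input tensors.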
@@ -98,13 +129,6 @@ class WeightOnlyGroupwiseQuantMatmulPlugin : public IPluginV2DynamicExt int mQuantAlgo; - // Flags for indicating whether the corresponding inputs are applied in mQuantAlgo - // mQuantAlgo = pre_quant_scale * PRE_SCALE_QUANT + zero * ZER0 + bias * BIAS - // Here pre_quant_scale, zero and bias are boolean type - static constexpr int BIAS = int(1) << 0; - static constexpr int ZER0 = int(1) << 1; - static constexpr int PRE_SCALE_QUANT = int(1) << 2; - int mGroupSize; int mPreQuantScaleInputIdx; @@ -112,9 +136,14 @@ class WeightOnlyGroupwiseQuantMatmulPlugin : public IPluginV2DynamicExt int mScalesInputIdx; int mZerosInputIdx; int mBiasesInputIdx; + + GemmDims mDims{}; + GemmIdCore mGemmId{}; + + PluginProfilerPtr mPluginProfiler; }; -class WeightOnlyGroupwiseQuantMatmulPluginCreator : public IPluginCreator +class WeightOnlyGroupwiseQuantMatmulPluginCreator : public BaseCreator { public: WeightOnlyGroupwiseQuantMatmulPluginCreator(); @@ -130,17 +159,10 @@ class WeightOnlyGroupwiseQuantMatmulPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + GemmPluginProfilerManager gemmPluginProfileManager; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_WEIGHT_ONLY_GROUPWISE_QUANT_MATMUL_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.cpp b/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.cpp index 6ce0f163b8a..f612ca7160f 100644 --- a/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.cpp @@ -14,34 +14,87 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h" +#include "weightOnlyQuantMatmulPlugin.h" using namespace nvinfer1; using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels::cutlass_kernels; -using nvinfer1::plugin::WeightOnlyQuantMatmulPluginCreator; -using nvinfer1::plugin::WeightOnlyQuantMatmulPlugin; +using tensorrt_llm::plugins::WeightOnlyQuantMatmulPluginCreator; +using tensorrt_llm::plugins::WeightOnlyQuantMatmulPlugin; +using tensorrt_llm::plugins::WeightOnlyQuantGemmPluginProfiler; +using tensorrt_llm::plugins::read; +using tensorrt_llm::plugins::write; static const char* WOQ_MATMUL_PLUGIN_VERSION{"1"}; static const char* WOQ_MATMUL_PLUGIN_NAME{"WeightOnlyQuantMatmul"}; PluginFieldCollection WeightOnlyQuantMatmulPluginCreator::mFC{}; -std::vector WeightOnlyQuantMatmulPluginCreator::mPluginAttributes; +std::vector WeightOnlyQuantMatmulPluginCreator::mPluginAttributes; -WeightOnlyQuantMatmulPlugin::WeightOnlyQuantMatmulPlugin(nvinfer1::DataType type, int weightTypeId) +void WeightOnlyQuantGemmPluginProfiler::runTactic(int m, int n, int k, + const WeightOnlyQuantGemmPluginProfiler::Config& tactic, char* workspace, const cudaStream_t& stream) +{ + const int originalN = n * (mWeightTypeId == 1 ? 
4 : 8); + half* actPtr = reinterpret_cast(workspace); + int8_t* weightPtr + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(actPtr), m * k * sizeof(half))); + half* scalesPtr = reinterpret_cast( + nextWorkspacePtr(reinterpret_cast(weightPtr), originalN * k * sizeof(int8_t))); + half* outputPtr + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(scalesPtr), originalN * sizeof(half))); + char* workspacePtr + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(outputPtr), m * originalN * sizeof(half))); + + const int wsSize = mRunner->getWorkspaceSize(m, n, k); + + if (mWeightTypeId == 1) + { + mRunner->gemm(actPtr, weightPtr, scalesPtr, outputPtr, m, originalN, k, tactic, workspacePtr, wsSize, stream); + } + else + { + mRunner->gemm(actPtr, reinterpret_cast(weightPtr), scalesPtr, outputPtr, m, originalN, k, + tactic, workspacePtr, wsSize, stream); + } +} + +void WeightOnlyQuantGemmPluginProfiler::computeTmpSize(int maxM, int n, int k) +{ + const int originalN = n * (mWeightTypeId == 1 ? 4 : 8); + std::vector workspaces = { + maxM * k * sizeof(half), // A + originalN * k * sizeof(int8_t), // B + originalN * sizeof(half), // scales + maxM * originalN * sizeof(half), // C + mRunner->getWorkspaceSize(maxM, n, k) // workspace + }; + size_t bytes = calculateTotalWorkspaceSize(workspaces.data(), workspaces.size()); + setTmpWorkspaceSizeInBytes(bytes); +} + +WeightOnlyQuantMatmulPlugin::WeightOnlyQuantMatmulPlugin( + nvinfer1::DataType type, int weightTypeId, const WeightOnlyQuantMatmulPlugin::PluginProfilerPtr& pluginProfiler) + : mPluginProfiler(pluginProfiler) { init(type, weightTypeId); } // Parameterized constructor -WeightOnlyQuantMatmulPlugin::WeightOnlyQuantMatmulPlugin(const void* data, size_t length) +WeightOnlyQuantMatmulPlugin::WeightOnlyQuantMatmulPlugin( + const void* data, size_t length, const WeightOnlyQuantMatmulPlugin::PluginProfilerPtr& pluginProfiler) + : mPluginProfiler(pluginProfiler) { const char *d = reinterpret_cast(data), *a = d; nvinfer1::DataType type; int weightTypeId = 0; read(d, type); read(d, weightTypeId); + read(d, mDims); + init(type, weightTypeId); - PLUGIN_ASSERT(d == a + length); + + mPluginProfiler->deserialize(d, mDims, mGemmId); + + TLLM_CHECK(d == a + length); } void WeightOnlyQuantMatmulPlugin::init(nvinfer1::DataType type, int weightTypeId) @@ -60,18 +113,27 @@ void WeightOnlyQuantMatmulPlugin::init(nvinfer1::DataType type, int weightTypeId } else { - PLUGIN_ASSERT(false); + TLLM_CHECK(false); } + + mPluginProfiler->setWeightTypeId(mWeightTypeId); + + mGemmId = GemmIdCore(mDims.n, mDims.k, mType); } // IPluginV2DynamicExt Methods nvinfer1::IPluginV2DynamicExt* WeightOnlyQuantMatmulPlugin::clone() const noexcept { - auto* plugin = new WeightOnlyQuantMatmulPlugin(mType, mWeightTypeId); - plugin->setPluginNamespace(mNamespace.c_str()); + auto* plugin = new WeightOnlyQuantMatmulPlugin(*this); return plugin; } +void WeightOnlyQuantMatmulPlugin::configGemm() +{ + mPluginProfiler->profileTactics( + m_weightOnlyGemmRunner->getConfigs(), m_weightOnlyGemmRunner, mType, mDims, mGemmId); +} + nvinfer1::DimsExprs WeightOnlyQuantMatmulPlugin::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept { @@ -80,12 +142,12 @@ nvinfer1::DimsExprs WeightOnlyQuantMatmulPlugin::getOutputDimensions( try { - PLUGIN_ASSERT(nbInputs == 3); - PLUGIN_ASSERT(outputIndex == 0); + TLLM_CHECK(nbInputs == 3); + TLLM_CHECK(outputIndex == 0); const int nbDimsA = inputs[0].nbDims; const int nbDimsB = 
inputs[1].nbDims; - PLUGIN_ASSERT(nbDimsA >= 2); - PLUGIN_ASSERT(nbDimsB == 2); + TLLM_CHECK(nbDimsA >= 2); + TLLM_CHECK(nbDimsB == 2); DimsExprs ret; ret.nbDims = nbDimsA; for (int ii = 0; ii < nbDimsA - 1; ++ii) @@ -145,13 +207,22 @@ bool WeightOnlyQuantMatmulPlugin::supportsFormatCombination( void WeightOnlyQuantMatmulPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept { - int maxM = 1; - for (int ii = 0; ii < in[0].max.nbDims - 1; ++ii) - { - maxM *= in[0].max.d[ii]; - } + const auto minM = std::accumulate(in[0].min.d, in[0].min.d + in[0].min.nbDims - 1, 1, std::multiplies()); + const auto maxM = std::accumulate(in[0].max.d, in[0].max.d + in[0].max.nbDims - 1, 1, std::multiplies()); + const int maxK = in[0].max.d[in[0].max.nbDims - 1]; const int maxN = in[1].max.d[1] * (mWeightTypeId == 1 ? 4 : 8); + + const auto K = maxK; + const auto N = maxN / (mWeightTypeId == 1 ? 4 : 8); + + if (!mDims.isInitialized()) + { + mDims = {minM, maxM, N, K}; + } + + mGemmId = {N, K, mType}; + m_workspaceMaxSize = m_weightOnlyGemmRunner->getWorkspaceSize(maxM, maxN, maxK); } @@ -181,38 +252,50 @@ int WeightOnlyQuantMatmulPlugin::enqueue(const nvinfer1::PluginTensorDesc* input const int k = inputDesc[0].dims.d[inputDesc[0].dims.nbDims - 1]; const int ws_size = m_weightOnlyGemmRunner->getWorkspaceSize(m, n, k); + const auto& bestTactic = mPluginProfiler->getBestConfig(m, mGemmId); + TLLM_CHECK_WITH_INFO(bestTactic, "No valid SQ GEMM tactic"); if (mType == nvinfer1::DataType::kHALF && mWeightTypeId == 1) { - if (m == 1) + if (m < SMALL_M_FAST_PATH && mSM >= 75) { - const half* bias = nullptr; - tensorrt_llm::kernels::weight_only_gemv_launcher(reinterpret_cast(inputs[0]), - reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), bias, - reinterpret_cast(outputs[0]), k, n * 4, tensorrt_llm::kernels::ActivationType::Identity, - tensorrt_llm::kernels::QuantType::INT8_WEIGHT_ONLY, stream); + // Use CUDA kernels for small batch size + // The CUDA kernel is designed for ColumnMajorTileInterleave weight layout used in fpAIntB cutlass kernel + // when sm >= 75 and the preprocessing of cutlass on sm70 does not interleave the weights. 
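Throughout these plugins the weight tensor's trailing dimension counts packed 32-bit words rather than logical columns, so the code recovers the real output width with a 4x multiplier for int8 weights (weightTypeId == 1) and 8x for int4 weights (weightTypeId == 2), as in the expression n * (mWeightTypeId == 1 ? 4 : 8) above; the per-word reading follows the "int32 packed int4 elements" comment and is stated here as an assumption. A small sketch of that bookkeeping follows; the parameter pack and GEMV launch for this branch continue just below.

// Sketch of the packed-weight dimension bookkeeping used by the weight-only
// plugins: packedN is the stored trailing dimension of the weight tensor,
// and the returned value is the logical GEMM output width.
#include <cassert>

static int unpackedN(int packedN, int weightTypeId)
{
    assert(weightTypeId == 1 || weightTypeId == 2); // 1 = int8, 2 = int4 weights
    return packedN * (weightTypeId == 1 ? 4 : 8);
}

int main()
{
    // e.g. 1024 packed words expand to 4096 int8 or 8192 int4 output columns.
    return (unpackedN(1024, 1) == 4096 && unpackedN(1024, 2) == 8192) ? 0 : 1;
}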
+ tensorrt_llm::kernels::WeightOnlyParams params{reinterpret_cast(inputs[1]), + reinterpret_cast(inputs[2]), nullptr, reinterpret_cast(inputs[0]), nullptr, + reinterpret_cast(outputs[0]), m, n * 4, k, 0}; + tensorrt_llm::kernels::weight_only_batched_gemv_launcher(tensorrt_llm::kernels::WeightOnlyQuantType::Int8b, + tensorrt_llm::kernels::WeightOnlyType::PerChannel, + tensorrt_llm::kernels::WeightOnlyActivationType::Identity, params, stream); } else { m_weightOnlyGemmRunner->gemm(reinterpret_cast(inputs[0]), reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), - reinterpret_cast(outputs[0]), m, n * 4, k, reinterpret_cast(workspace), ws_size, stream); + reinterpret_cast(outputs[0]), m, n * 4, k, *bestTactic, reinterpret_cast(workspace), + ws_size, stream); } } else if (mType == nvinfer1::DataType::kHALF && mWeightTypeId == 2) { - if (m == 1) + if (m < SMALL_M_FAST_PATH && mSM >= 75) { - const half* bias = nullptr; - tensorrt_llm::kernels::weight_only_gemv_launcher(reinterpret_cast(inputs[0]), - reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), bias, - reinterpret_cast(outputs[0]), k, n * 8, tensorrt_llm::kernels::ActivationType::Identity, - tensorrt_llm::kernels::QuantType::PACKED_INT4_WEIGHT_ONLY, stream); + // Use CUDA kernels for small batch size + // The CUDA kernel is designed for ColumnMajorTileInterleave weight layout used in fpAIntB cutlass kernel + // when sm >= 75 and the preprocessing of cutlass on sm70 does not interleave the weights. + tensorrt_llm::kernels::WeightOnlyParams params{reinterpret_cast(inputs[1]), + reinterpret_cast(inputs[2]), nullptr, reinterpret_cast(inputs[0]), nullptr, + reinterpret_cast(outputs[0]), m, n * 8, k, 0}; + tensorrt_llm::kernels::weight_only_batched_gemv_launcher(tensorrt_llm::kernels::WeightOnlyQuantType::Int4b, + tensorrt_llm::kernels::WeightOnlyType::PerChannel, + tensorrt_llm::kernels::WeightOnlyActivationType::Identity, params, stream); } else { m_weightOnlyGemmRunner->gemm(reinterpret_cast(inputs[0]), reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), - reinterpret_cast(outputs[0]), m, n * 8, k, reinterpret_cast(workspace), ws_size, stream); + reinterpret_cast(outputs[0]), m, n * 8, k, *bestTactic, reinterpret_cast(workspace), + ws_size, stream); } } else @@ -227,7 +310,7 @@ int WeightOnlyQuantMatmulPlugin::enqueue(const nvinfer1::PluginTensorDesc* input nvinfer1::DataType WeightOnlyQuantMatmulPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(index == 0); return mType; } @@ -250,6 +333,7 @@ int WeightOnlyQuantMatmulPlugin::getNbOutputs() const noexcept int WeightOnlyQuantMatmulPlugin::initialize() noexcept { + configGemm(); return 0; } @@ -257,7 +341,10 @@ void WeightOnlyQuantMatmulPlugin::terminate() noexcept {} size_t WeightOnlyQuantMatmulPlugin::getSerializationSize() const noexcept { - return sizeof(int) + sizeof(nvinfer1::DataType); + return sizeof(int) + // mWeightTypeId + sizeof(nvinfer1::DataType) + // mType + sizeof(mDims) + // Dimensions + mPluginProfiler->getSerializationSize(mGemmId); // selected tactics container size } void WeightOnlyQuantMatmulPlugin::serialize(void* buffer) const noexcept @@ -265,6 +352,9 @@ void WeightOnlyQuantMatmulPlugin::serialize(void* buffer) const noexcept char *d = static_cast(buffer), *a = d; write(d, mType); write(d, mWeightTypeId); + write(d, mDims); + + mPluginProfiler->serialize(d, mGemmId); assert(d == a + getSerializationSize()); } @@ -274,16 +364,6 @@ void 
WeightOnlyQuantMatmulPlugin::destroy() noexcept delete this; } -void WeightOnlyQuantMatmulPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* WeightOnlyQuantMatmulPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// WeightOnlyQuantMatmulPluginCreator::WeightOnlyQuantMatmulPluginCreator() @@ -322,18 +402,21 @@ IPluginV2* WeightOnlyQuantMatmulPluginCreator::createPlugin(const char* name, co const char* attrName = fields[i].name; if (!strcmp(attrName, "weight_type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); weightTypeId = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } try { - auto* obj = new WeightOnlyQuantMatmulPlugin(type, weightTypeId); + // WeightOnlyGroupwiseQuantMatmulPluginCreator is unique and shared for an engine generation + // Create plugin profiler with shared tactics map + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ false); + auto* obj = new WeightOnlyQuantMatmulPlugin(type, weightTypeId, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -351,7 +434,9 @@ IPluginV2* WeightOnlyQuantMatmulPluginCreator::deserializePlugin( // call WeightOnlyQuantMatmulPlugin::destroy() try { - auto* obj = new WeightOnlyQuantMatmulPlugin(serialData, serialLength); + // Create plugin profiler with private tactics map which is read from the serialized engine + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ true); + auto* obj = new WeightOnlyQuantMatmulPlugin(serialData, serialLength, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -361,13 +446,3 @@ IPluginV2* WeightOnlyQuantMatmulPluginCreator::deserializePlugin( } return nullptr; } - -void WeightOnlyQuantMatmulPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* WeightOnlyQuantMatmulPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h b/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h index c5cb6ed0be9..2cddeca382a 100644 --- a/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h +++ b/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_WEIGHT_ONLY_QUANT_MATMUL_PLUGIN_H -#define TRT_WEIGHT_ONLY_QUANT_MATMUL_PLUGIN_H +#pragma once -#include "NvInferPlugin.h" -#include "cutlass/numeric_types.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" -#include "tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.h" +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h" +#include "tensorrt_llm/plugins/common/gemmPluginProfiler.h" #include "tensorrt_llm/plugins/common/plugin.h" + #include +#include #include #include #include @@ -33,21 +33,43 @@ // breaking dependencies #include "cutlass/integer_subbyte.h" -namespace nvinfer1 +namespace tensorrt_llm::plugins { -namespace plugin + +using WeightOnlyGemmRunner = tensorrt_llm::kernels::cutlass_kernels::CutlassFpAIntBGemmRunnerInterface; +using WeightOnlyGemmRunnerPtr = std::shared_ptr; + +class WeightOnlyQuantGemmPluginProfiler : public GemmPluginProfiler { +public: + using Config = tensorrt_llm::cutlass_extensions::CutlassGemmConfig; + + void setWeightTypeId(int weightId) + { + mWeightTypeId = weightId; + } + +protected: + void runTactic(int m, int n, int k, const Config& tactic, char* workspace, const cudaStream_t& stream) override; + + void computeTmpSize(int maxM, int n, int k) override; + +private: + int mWeightTypeId; +}; -class WeightOnlyQuantMatmulPlugin : public IPluginV2DynamicExt +class WeightOnlyQuantMatmulPlugin : public BasePlugin { public: + using PluginProfilerPtr = std::shared_ptr; WeightOnlyQuantMatmulPlugin() = delete; // int8 weight only : weightTypeId = 1; // int4 weight only : weightTypeId = 2; - WeightOnlyQuantMatmulPlugin(nvinfer1::DataType type, int weightTypeId); + WeightOnlyQuantMatmulPlugin(nvinfer1::DataType type, int weightTypeId, const PluginProfilerPtr& profiler); - WeightOnlyQuantMatmulPlugin(const void* data, size_t length); + WeightOnlyQuantMatmulPlugin(const void* data, size_t length, const PluginProfilerPtr& profiler); ~WeightOnlyQuantMatmulPlugin() override = default; @@ -77,25 +99,34 @@ class WeightOnlyQuantMatmulPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: // int8 weight only : weightTypeId = 1; // int4 weight only : weightTypeId = 2; void init(nvinfer1::DataType type, int weightTypeId); + void configGemm(); + private: const std::string mLayerName; - std::string mNamespace; - std::shared_ptr m_weightOnlyGemmRunner; + WeightOnlyGemmRunnerPtr m_weightOnlyGemmRunner; int m_workspaceMaxSize; nvinfer1::DataType mType; int mWeightTypeId; + int mSM = tensorrt_llm::common::getSMVersion(); + + // When M is smaller than this value, we trigger a fast path + // I.e. a tailored kernel instead of cutlass. 
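After this patch the smooth-quant and weight-only plugins all serialize the same way: the fixed-size fields first (data type, quant or weight-type fields, the GemmDims problem bounds), then the profiler's tactic map appended via mPluginProfiler->serialize(d, mGemmId), with getSerializationSize() summing exactly the same terms so the d == a + length checks in the deserializing constructors hold. Below is a byte-layout sketch under the assumption that read/write are plain memcpy-style helpers; FakeDims and the field helpers are illustrative stand-ins, since the real GemmDims and read/write live in the common plugin headers not shown here.

#include <cstring>

struct FakeDims { int minM, maxM, n, k; }; // stand-in for GemmDims

template <typename T>
void writeField(char*& dst, const T& value)
{
    std::memcpy(dst, &value, sizeof(T));
    dst += sizeof(T);
}

template <typename T>
void readField(const char*& src, T& value)
{
    std::memcpy(&value, src, sizeof(T));
    src += sizeof(T);
}

int main()
{
    char buffer[64] = {};
    char* d = buffer;
    int dataType = 0;     // stands in for nvinfer1::DataType
    int weightTypeId = 2; // int4 weights
    FakeDims dims{1, 256, 4096, 4096};
    writeField(d, dataType);
    writeField(d, weightTypeId);
    writeField(d, dims);
    // ...the profiler would append its tactic map for this GemmId here.

    const char* s = buffer;
    int rType = -1, rWeightTypeId = -1;
    FakeDims rDims{};
    readField(s, rType);
    readField(s, rWeightTypeId);
    readField(s, rDims);
    return (rType == dataType && rWeightTypeId == weightTypeId && rDims.maxM == dims.maxM) ? 0 : 1;
}

The fast-path threshold referred to by the comment above is declared immediately below, alongside the GemmDims, GemmIdCore and profiler members that this layout serializes.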
+ static constexpr int SMALL_M_FAST_PATH = 5; + + GemmDims mDims{}; + GemmIdCore mGemmId{}; + + PluginProfilerPtr mPluginProfiler; }; -class WeightOnlyQuantMatmulPluginCreator : public IPluginCreator +class WeightOnlyQuantMatmulPluginCreator : public BaseCreator { public: WeightOnlyQuantMatmulPluginCreator(); @@ -111,17 +142,10 @@ class WeightOnlyQuantMatmulPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + GemmPluginProfilerManager gemmPluginProfileManager; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_WEIGHT_ONLY_QUANT_MATMUL_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/runtime/CMakeLists.txt b/cpp/tensorrt_llm/runtime/CMakeLists.txt index ecf09ca7980..4e5a6ec9373 100644 --- a/cpp/tensorrt_llm/runtime/CMakeLists.txt +++ b/cpp/tensorrt_llm/runtime/CMakeLists.txt @@ -25,6 +25,7 @@ set(SRCS iBuffer.cpp iTensor.cpp memoryCounters.cpp + ncclCommunicator.cpp runtimeBuffers.cpp runtimeKernels.cu statefulGptDecoder.cpp @@ -34,6 +35,7 @@ set(SRCS include_directories(${API_INCLUDE_DIR}/tensorrt_llm/runtime) +add_compile_options(-Wall) add_library(runtime_src OBJECT ${SRCS}) set_property(TARGET runtime_src PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET runtime_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) @@ -43,4 +45,9 @@ target_include_directories(runtime_src PRIVATE ${MPI_INCLUDE_PATH}) set(JSON_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/json) add_subdirectory(${JSON_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/json) -target_link_libraries(runtime_src PUBLIC nlohmann_json::nlohmann_json) +if(ENABLE_MULTI_DEVICE EQUAL 1) + target_link_libraries(runtime_src PUBLIC nlohmann_json::nlohmann_json + ${NCCL_LIB}) +else() + target_link_libraries(runtime_src PUBLIC nlohmann_json::nlohmann_json) +endif() diff --git a/cpp/tensorrt_llm/runtime/bufferManager.cpp b/cpp/tensorrt_llm/runtime/bufferManager.cpp index 796673b91fa..857efd4ea9f 100644 --- a/cpp/tensorrt_llm/runtime/bufferManager.cpp +++ b/cpp/tensorrt_llm/runtime/bufferManager.cpp @@ -86,7 +86,14 @@ void BufferManager::copy(void const* src, IBuffer& dst) const { if (dst.getSizeInBytes() > 0) { - TLLM_CUDA_CHECK(cudaMemcpyAsync(dst.data(), src, dst.getSizeInBytes(), cudaMemcpyDefault, mStream->get())); + if (IBuffer::memoryType(src) != MemoryType::kGPU && dst.getMemoryType() != MemoryType::kGPU) + { + std::memcpy(dst.data(), src, dst.getSizeInBytes()); + } + else + { + TLLM_CUDA_CHECK(cudaMemcpyAsync(dst.data(), src, dst.getSizeInBytes(), cudaMemcpyDefault, mStream->get())); + } } } @@ -94,7 +101,14 @@ void BufferManager::copy(IBuffer const& src, void* dst) const { if (src.getSizeInBytes() > 0) { - TLLM_CUDA_CHECK(cudaMemcpyAsync(dst, src.data(), src.getSizeInBytes(), cudaMemcpyDefault, mStream->get())); + if (IBuffer::memoryType(dst) != MemoryType::kGPU && src.getMemoryType() != MemoryType::kGPU) + { + std::memcpy(dst, src.data(), src.getSizeInBytes()); + } + else + { + TLLM_CUDA_CHECK(cudaMemcpyAsync(dst, src.data(), src.getSizeInBytes(), cudaMemcpyDefault, mStream->get())); + } } } diff --git a/cpp/tensorrt_llm/runtime/gptDecoder.cpp 
b/cpp/tensorrt_llm/runtime/gptDecoder.cpp index fd4af84d886..6e4ba484c7a 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoder.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoder.cpp @@ -131,6 +131,7 @@ template typename tl::DynamicDecodeLayer::OutputParams prepareOutputs( DecodingOutput& output, DecodingInput::TensorPtr const& inputLengths) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); typename tl::DynamicDecodeLayer::OutputParams outputParams(tcc::toTllmTensor(*output.ids)); outputParams.newTokens = tcc::toTllmTensor(*output.newTokens); @@ -271,6 +272,7 @@ template class GptDecoder; void IGptDecoder::gatherTree(ITensor& finalOutputIds, DecodingOutput const& decodingOutput, DecodingInput const& decodingInput, BufferManager const& manager) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const& finalOutputIdsShape = finalOutputIds.getShape(); auto const& decodingOutputIdsShape = decodingOutput.ids->getShape(); auto const batchSize = finalOutputIdsShape.d[0]; @@ -322,39 +324,12 @@ void IGptDecoder::gatherTree(ITensor& finalOutputIds, DecodingOutput const& deco nullptr, // output_logs beamHypotheses.output_ids_tgt, beamHypotheses.sequence_lengths_tgt, beamHypotheses.normed_scores, beamHypotheses.cum_log_probs, beamHypotheses.log_probs, beamHypotheses.num_beams, - beamHypotheses.input_lengths, beamWidth, maxSeqLength, batchSize, decodingInput.maxLength, stream.get()); + beamHypotheses.input_lengths, beamWidth, maxSeqLength, batchSize, stream.get()); sync_check_cuda_error(); } else { - auto workspace = manager.gpu(batchSize * beamWidth * maxSeqLength, nvinfer1::DataType::kINT32); - manager.setZero(*workspace); - - // For sampling, it is equivalent to all parent ids are 0. - tensorrt_llm::kernels::gatherTreeParam param; - param.beams = bufferCast(*workspace); - // Remove prompt length if possible - param.sequence_lengths = bufferCast(*decodingOutput.lengths); - // add sequence_length 1 here because the sequence_length of time step t is t - 1 - param.max_sequence_length_final_step = 1; - // response input lengths (used to slice the ids during postprocessing), used in interactive generation - // This feature is not supported yet, setting it to nullptr temporarily. 
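This file, like most files touched by the patch, gains paired TLLM_LOG_DEBUG("%s start") and ("%s stop") calls at function entry and exit. A small RAII tracer could emit the same pair automatically; a self-contained sketch using printf in place of the project logger:

#include <cstdio>

// RAII tracer that prints the same "start"/"stop" pair as the explicit
// TLLM_LOG_DEBUG calls added throughout this patch.
struct ScopedTrace
{
    explicit ScopedTrace(char const* fn)
        : mFn{fn}
    {
        std::printf("%s start\n", mFn);
    }

    ~ScopedTrace()
    {
        std::printf("%s stop\n", mFn); // emitted on every exit path, including early returns
    }

    char const* mFn;
};

// Usage: void f() { ScopedTrace trace{__PRETTY_FUNCTION__}; /* body */ }

The manual calls used in the patch do not cover early returns, which is one reason such pairs are often wrapped in a scope guard like the one above.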
- param.response_input_lengths = nullptr; - param.max_seq_len = maxSeqLength; - param.batch_size = batchSize; - param.beam_width = beamWidth; - param.step_ids = bufferCast(*decodingOutput.ids); - param.parent_ids = nullptr; - param.end_tokens = bufferCast(*decodingInput.endIds); - param.max_input_length = decodingInput.maxLength; - param.input_lengths = bufferCast(*decodingInput.lengths); - // decoder output has padding - param.has_padding = true; - - param.output_ids = bufferCast(finalOutputIds); - param.stream = stream.get(); - param.cum_log_probs = bufferCast(*decodingOutput.cumLogProbs); - param.length_penalty = 1.0f; - tensorrt_llm::kernels::invokeGatherTree(param); + manager.copy(*decodingOutput.ids, finalOutputIds); + sync_check_cuda_error(); } } diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp index 16331503a9a..cfba542dd22 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp @@ -19,6 +19,7 @@ #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/runtimeKernels.h" +#include #include using namespace tensorrt_llm::runtime; @@ -29,6 +30,7 @@ namespace { SamplingConfig extractSamplingConfig(SamplingConfig const& batchSamplingConfig, SizeType batchIdx) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); SamplingConfig samplingConfig{batchSamplingConfig.beamWidth}; auto extractOptional = [&batchIdx](auto& single, auto const& batch) @@ -59,6 +61,7 @@ SamplingConfig extractSamplingConfig(SamplingConfig const& batchSamplingConfig, samplingConfig.beamSearchDiversityRate = batchSamplingConfig.beamSearchDiversityRate; samplingConfig.lengthPenalty = batchSamplingConfig.lengthPenalty; + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return samplingConfig; } @@ -73,6 +76,7 @@ GptDecoderBatch::GptDecoderBatch( , mEventStart(tc::CreateEvent()) , mEventStop(tc::CreateEvent()) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto constexpr nvTokenIdType = TRTDataType::value; auto constexpr nvSizeType = TRTDataType::value; auto constexpr nvFloatType = TRTDataType::value; @@ -97,11 +101,13 @@ GptDecoderBatch::GptDecoderBatch( dOutput->lengths = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); dOutput->cumLogProbs = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); dOutput->beamHypotheses.empty(mBufferManager); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptDecoderBatch::setup( SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); TLLM_CHECK(maxBatchSize > 0); TLLM_CHECK(maxBeamWidth > 0); TLLM_CHECK(maxSequenceLength > 0); @@ -128,19 +134,19 @@ void GptDecoderBatch::setup( dOutput.newTokens->reshape(maxBatchSizeXmaxBeamWidth); mBufferManager.setZero(*dOutput.newTokens); dOutput.parentIds->reshape(jointOutputIdsShape); + dOutput.lengths->reshape(maxBatchSizeXmaxBeamWidth); + mBufferManager.setZero(*dOutput.lengths); dOutput.finished->reshape(maxBatchSizeXmaxBeamWidth); mBufferManager.setZero(*dOutput.finished); mBufferManager.setZero(*dOutput.finishedSum); - dOutput.lengths->reshape(maxBatchSizeXmaxBeamWidth); - mBufferManager.setZero(*dOutput.lengths); - dOutput.cumLogProbs->reshape(maxBatchSizeXmaxBeamWidth); - mBufferManager.setZero(*dOutput.cumLogProbs); // use batchSize many entries instead of the usual 1 dOutput.finishedSum->reshape(maxBatchSizeShape); mBufferManager.setZero(*dOutput.finishedSum); if (maxBeamWidth > 1) { + 
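The extractSamplingConfig helper above carves one request's sampling parameters out of the batched SamplingConfig. A rough host-side sketch of that extraction pattern, assuming the batched fields are std::optional vectors holding either one shared value or one value per request:

#include <optional>
#include <vector>

// Per-request extraction (simplified): a batched optional parameter is either
// absent, a single value shared by all requests, or one value per request.
template <typename T>
std::optional<std::vector<T>> extractForRequest(std::optional<std::vector<T>> const& batch, size_t batchIdx)
{
    if (!batch)
        return std::nullopt;                    // parameter not set at all
    if (batch->size() == 1)
        return std::vector<T>{batch->front()};  // shared value
    return std::vector<T>{batch->at(batchIdx)}; // per-request value
}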
dOutput.cumLogProbs->reshape(maxBatchSizeXmaxBeamWidth); + mBufferManager.setZero(*dOutput.cumLogProbs); dOutput.beamHypotheses.reshape(maxBatchSize, maxBeamWidth, mMaxSequenceLength); } else @@ -171,6 +177,7 @@ void GptDecoderBatch::setup( mMaxNewTokens[i] = 0; mBeamWidths[i] = 0; } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptDecoderBatch::newRequest( @@ -234,13 +241,13 @@ void GptDecoderBatch::newRequest( manager.setZero(*dOutput->finishedSum); dOutput->lengths = ITensor::slice(dJointOutput.lengths, batchIdx, localBatchSize); kernels::invokeFill(*dOutput->lengths, inputLength, *stream); - dOutput->cumLogProbs = ITensor::slice(dJointOutput.cumLogProbs, batchIdx, localBatchSize); - manager.setZero(*IBuffer::slice(dOutput->cumLogProbs, 0, 1)); dOutput->newTokens = ITensor::slice(dJointOutput.newTokens, batchIdx, localBatchSize); manager.setZero(*dOutput->newTokens); if (beamWidth > 1) { + dOutput->cumLogProbs = ITensor::slice(dJointOutput.cumLogProbs, batchIdx, localBatchSize); + manager.setZero(*IBuffer::slice(dOutput->cumLogProbs, 0, 1)); kernels::invokeFill( *IBuffer::slice(dOutput->cumLogProbs, 1, beamWidth - 1), DecodingOutput::kNegativeInfinity, *stream); @@ -263,6 +270,7 @@ void GptDecoderBatch::newRequest( auto outputIdsView = ITensor::view(outputIds, ITensor::makeShape({beamWidth, mMaxSequenceLength})); kernels::invokeFill(*outputIdsView, endId, *stream); kernels::tileTensor(*outputIdsView, *inputIdsView, beamWidth, *stream); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptDecoderBatch::forward(decoder_batch::Output& output, decoder_batch::Input const& input) @@ -284,6 +292,10 @@ void GptDecoderBatch::forward(decoder_batch::Output& output, decoder_batch::Inpu TLLM_CHECK(!srcCacheIndirection || srcCacheIndirection->getDataType() == TRTDataType::value); TLLM_CHECK(!tgtCacheIndirection || tgtCacheIndirection->getDataType() == TRTDataType::value); + // TODO(bhsueh) should remove this reshape and set shape to [batch_size, beam_width] outside + TensorPtr sequenceLengths = ITensor::view(output.sequenceLengths); + sequenceLengths->reshape(ITensor::makeShape({mActualBatchSize, maxBeamWidth})); + TLLM_CHECK(sequenceLengths); auto constexpr singleRequest = 1; mStream->record(mEventStart.get()); @@ -308,6 +320,8 @@ void GptDecoderBatch::forward(decoder_batch::Output& output, decoder_batch::Inpu dOutput.cacheIndirection = ITensor::view(tgtView, ITensor::makeShape({singleRequest, mBeamWidths[i], tgtView->getShape().d[2]})); } + auto sequenceLengthsView = std::shared_ptr(ITensor::slice(sequenceLengths, i, singleRequest)); + dOutput.lengths = ITensor::view(sequenceLengthsView, ITensor::makeShape({singleRequest, mBeamWidths[i]})); auto& decoder = *mDecoders[i]; decoder.forwardAsync(dOutput, dInput); @@ -321,6 +335,10 @@ void GptDecoderBatch::forward(decoder_batch::Output& output, decoder_batch::Inpu manager.copy(*dOutput.ids, *jointOutputIdsView); + auto jointSequenceLengthsView = ITensor::slice(mJointDecodingOutput->lengths, i, singleRequest); + jointSequenceLengthsView->reshape(ITensor::makeShape({1, mBeamWidths[i]})); + manager.copy(*dOutput.lengths, *jointSequenceLengthsView); + if (mBeamWidths[i] > 1) { auto jointOutputParentIdsView = ITensor::slice(mJointDecodingOutput->parentIds, i, singleRequest); @@ -347,11 +365,13 @@ void GptDecoderBatch::forward(decoder_batch::Output& output, decoder_batch::Inpu // This condition requires the synchronization above || *bufferCast(*dOutput.finishedSum) == static_cast(dOutput.finished->getSize()); } + TLLM_LOG_DEBUG("%s stop", 
__PRETTY_FUNCTION__); } // TODO (rkobus) call this at the end of forward if mFinished[i] changes from false to true? void GptDecoderBatch::postProcessRequest(SizeType batchIdx) const { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& stream = mStreams[batchIdx]; auto manager = BufferManager{stream}; @@ -368,10 +388,12 @@ void GptDecoderBatch::postProcessRequest(SizeType batchIdx) const auto& event = mEvents[batchIdx]; stream->record(event.get()); mStream->wait(event.get()); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptDecoderBatch::newBatch(GenerationInput const& inputs, SamplingConfig const& samplingConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); // split batch into single requests auto const& inputLengths = inputs.lengths; mActualBatchSize = inputLengths->getShape().d[0]; @@ -409,28 +431,34 @@ void GptDecoderBatch::newBatch(GenerationInput const& inputs, SamplingConfig con request.stopWordsList = inputs.stopWordsList; newRequest(batchIdx, request, extractSamplingConfig(samplingConfig, batchIdx)); } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } bool GptDecoderBatch::forward(decoder::Output& output, decoder::Input const& input) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); decoder_batch::Input batchInput{input.logits}; batchInput.cacheIndirection = input.cacheIndirection; decoder_batch::Output batchOutput; batchOutput.cacheIndirection = output.cacheIndirection; + batchOutput.sequenceLengths = output.sequenceLengths; forward(batchOutput, batchInput); auto finished = getFinished(); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return std::all_of(finished.begin(), finished.end(), [](bool x) { return x; }); } IStatefulGptDecoder::TensorPtr GptDecoderBatch::getFinalOutputIds() const { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); for (SizeType batchIdx = 0; batchIdx < mActualBatchSize; ++batchIdx) { postProcessRequest(batchIdx); } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return ITensor::slice(getOutputIds(), 0, mActualBatchSize); } diff --git a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp index f7d12881487..ae5ffee9f48 100644 --- a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp +++ b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp @@ -55,9 +55,10 @@ GptJsonConfig parseJson(InputType&& i) auto const& builderConfig = json.at("builder_config"); auto const name = builderConfig.at("name").template get(); auto const precision = builderConfig.at("precision").template get(); - auto const worldSize = builderConfig.at("tensor_parallel").template get(); - auto const numHeads = builderConfig.at("num_heads").template get() / worldSize; - auto const hiddenSize = builderConfig.at("hidden_size").template get() / worldSize; + auto const tensorParallelism = builderConfig.at("tensor_parallel").template get(); + auto const pipelineParallelism = parseJsonFieldOr(builderConfig, "pipeline_parallel", 1); + auto const numHeads = builderConfig.at("num_heads").template get() / tensorParallelism; + auto const hiddenSize = builderConfig.at("hidden_size").template get() / tensorParallelism; auto const vocabSize = builderConfig.at("vocab_size").template get(); auto const numLayers = builderConfig.at("num_layers").template get(); @@ -74,33 +75,41 @@ GptJsonConfig parseJson(InputType&& i) auto const pagedKvCache = parseJsonFieldOr(builderConfig, "paged_kv_cache", false); auto const tokensPerBlock = parseJsonFieldOr(builderConfig, "tokens_per_block", 0); auto const quantMode = tc::QuantMode(parseJsonFieldOr(builderConfig, 
"quant_mode", tc::QuantMode::none().value())); - auto const numKvHeads = parseJsonFieldOr(builderConfig, "num_kv_heads", numHeads * worldSize) / worldSize; + auto const numKvHeads + = parseJsonFieldOr(builderConfig, "num_kv_heads", numHeads * tensorParallelism) / tensorParallelism; + auto const maxBatchSize = parseJsonFieldOr(builderConfig, "max_batch_size", 0); + auto const maxInputLen = parseJsonFieldOr(builderConfig, "max_input_len", 0); + auto const maxOutputLen = parseJsonFieldOr(builderConfig, "max_output_len", 0); auto const& pluginConfig = json.at("plugin_config"); auto const& gptAttentionPlugin = pluginConfig.at("gpt_attention_plugin"); auto const useGptAttentionPlugin = !gptAttentionPlugin.is_boolean() || gptAttentionPlugin.template get(); auto const removeInputPadding = pluginConfig.at("remove_input_padding").template get(); - auto const inflightBatching = pluginConfig.at("in_flight_batching").template get(); auto modelConfig = GptModelConfig{vocabSize, numLayers, numHeads, hiddenSize, dataType}; modelConfig.useGptAttentionPlugin(useGptAttentionPlugin); modelConfig.usePackedInput(removeInputPadding); modelConfig.usePagedKvCache(pagedKvCache); - modelConfig.useInflightBatching(inflightBatching); modelConfig.setTokensPerBlock(tokensPerBlock); modelConfig.setQuantMode(quantMode); modelConfig.setNbKvHeads(numKvHeads); - return GptJsonConfig{name, precision, worldSize, modelConfig}; + modelConfig.setMaxBatchSize(maxBatchSize); + modelConfig.setMaxInputLen(maxInputLen); + modelConfig.setMaxOutputLen(maxOutputLen); + + return GptJsonConfig{name, precision, tensorParallelism, pipelineParallelism, modelConfig}; } } // namespace std::string GptJsonConfig::engineFilename(WorldConfig const& worldConfig, std::string const& model) const - { - TLLM_CHECK_WITH_INFO(getWorldSize() == worldConfig.getSize(), "world size mismatch"); - return model + "_" + getPrecision() + "_tp" + std::to_string(worldConfig.getSize()) + "_rank" + TLLM_CHECK_WITH_INFO(getTensorParallelism() == worldConfig.getTensorParallelism(), "tensor parallelism mismatch"); + TLLM_CHECK_WITH_INFO( + getPipelineParallelism() == worldConfig.getPipelineParallelism(), "pipeline parallelism mismatch"); + auto pp = worldConfig.isPipelineParallel() ? 
"_pp" + std::to_string(worldConfig.getPipelineParallelism()) : ""; + return model + "_" + getPrecision() + "_tp" + std::to_string(worldConfig.getTensorParallelism()) + pp + "_rank" + std::to_string(worldConfig.getRank()) + ".engine"; } diff --git a/cpp/tensorrt_llm/runtime/gptSession.cpp b/cpp/tensorrt_llm/runtime/gptSession.cpp index 90006df265f..a10c83daa35 100644 --- a/cpp/tensorrt_llm/runtime/gptSession.cpp +++ b/cpp/tensorrt_llm/runtime/gptSession.cpp @@ -24,6 +24,7 @@ #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/decodingKernels.h" #include "tensorrt_llm/runtime/gptDecoderBatch.h" +#include "tensorrt_llm/runtime/ncclCommunicator.h" #include "tensorrt_llm/runtime/runtimeBuffers.h" #include "tensorrt_llm/runtime/runtimeKernels.h" #include "tensorrt_llm/runtime/statefulGptDecoder.h" @@ -50,32 +51,52 @@ GptSession::GptSession(GptModelConfig const& modelConfig, WorldConfig const& wor , mBuffers{std::make_shared()} , mCudaGraphInstances{} { - TLLM_CHECK_WITH_INFO(mRuntime->getNbProfiles() == 1, "GPT only expects one optimization profile"); createContexts(); - mBuffers->create(*mRuntime, mModelConfig); + mBuffers->create(*mRuntime, mModelConfig, mWorldConfig); + + if (mWorldConfig.isPipelineParallel()) + { + mPipelineComm = NcclCommunicator::createPipelineComm(mWorldConfig, *mLogger); + } + // TODO compare expected and runtime tensor names? } -nvinfer1::ILogger& tensorrt_llm::runtime::GptSession::getLogger() const +nvinfer1::ILogger& GptSession::getLogger() const { return *mLogger; } -BufferManager& tensorrt_llm::runtime::GptSession::getBufferManager() const +BufferManager& GptSession::getBufferManager() const { return mRuntime->getBufferManager(); } void GptSession::createContexts() { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); mRuntime->clearContexts(); + auto numProfiles = mRuntime->getNbProfiles(); + TLLM_CHECK_WITH_INFO( + numProfiles == 1 || numProfiles == 2, "GPT only expects one optimization profile or two optimization profiles"); // Instantiate two contexts for flip-flopping - mRuntime->addContext(0); - mRuntime->addContext(0); + if (numProfiles == 1) + { + mRuntime->addContext(0); + mRuntime->addContext(0); + } + else + { + mRuntime->addContext(1); + mRuntime->addContext(1); + mRuntime->addContext(0); + } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptSession::createDecoder(bool decoderPerRequest) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const vocabSize = mModelConfig.getVocabSize(); auto const vocabSizePadded = mModelConfig.getVocabSizePadded(mWorldConfig.getSize()); auto const& stream = mRuntime->getStreamPtr(); @@ -84,6 +105,7 @@ void GptSession::createDecoder(bool decoderPerRequest) mDecoder = std::make_shared(vocabSize, vocabSizePadded, stream); else mDecoder = std::make_shared(vocabSize, vocabSizePadded, stream); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptSession::setup(SizeType const batchSize, SizeType const beamWidth, SizeType const maxSequenceLength, @@ -116,14 +138,17 @@ void GptSession::setup(SizeType const batchSize, SizeType const beamWidth, SizeT tokensPerBlock, maxNumBlocks, batchSize, kvDtype, mRuntime->getStreamPtr()); } - auto const logitsType = utils::getTensorDataType(mRuntime->getEngine(), "logits"); - - createDecoder(decoderPerRequest); - mDecoder->setup(batchSize, beamWidth, maxSequenceLength, logitsType); + if (mWorldConfig.isLastPipelineParallelRank()) + { + auto const logitsType = mRuntime->getEngine().getTensorDataType("logits"); + createDecoder(decoderPerRequest); + 
mDecoder->setup(batchSize, beamWidth, maxSequenceLength, logitsType); + } // reshape does not care about maxInputLength or maxNewTokens auto const generationConfig = RuntimeBuffers::GenerationConfig{batchSize, beamWidth, 0, 0, maxSequenceLength}; - mBuffers->reshape(generationConfig, mModelConfig, mWorldConfig.getSize()); + mBuffers->reshape(generationConfig, mModelConfig, mWorldConfig); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptSession::generate( @@ -151,20 +176,11 @@ void GptSession::generate( auto const beamWidth = generationConfig.beamWidth; auto const maxInputLength = generationConfig.maxInputLength; auto const maxNewTokens = generationConfig.maxNewTokens; - auto const maxSeqLength = generationConfig.maxSeqLength; - auto finalSeqLength = maxSeqLength; TLLM_CHECK_WITH_INFO(buffers.allocated, "Buffers not allocated, please call setup first!"); - buffers.reshape(generationConfig, mModelConfig, mWorldConfig.getSize()); + buffers.reshape(generationConfig, mModelConfig, mWorldConfig); - if (mModelConfig.usePackedInput()) - { - buffers.inputOffsets->reshape(ITensor::makeShape({batchSize + 1})); - manager.setZero(*buffers.inputOffsets); - kernels::invokeInclusiveSum( - *ITensor::slice(buffers.inputOffsets, 1), *buffers.contextLengthsDevice, manager, stream); - } if (mModelConfig.usePagedKvCache()) { auto const contextLengthsHost = bufferCast(*buffers.contextLengthsHost); @@ -174,23 +190,39 @@ void GptSession::generate( } } - mDecoder->newBatch(inputs, samplingConfig); - RuntimeBuffers::TensorMap inputBuffers[2]; RuntimeBuffers::TensorMap outputBuffers[2]; auto& onTokenGenerated = outputs.onTokenGenerated; + outputs.ids->reshape(ITensor::makeShape({batchSize, beamWidth, mDecoderMaxSequenceLength})); + ITensor::SharedPtr newTokens; + if (mWorldConfig.isLastPipelineParallelRank()) + { + mDecoder->newBatch(inputs, samplingConfig); + newTokens = mDecoder->getNewTokens(); + } + else if (mWorldConfig.isFirstPipelineParallelRank()) + { + newTokens = manager.gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kINT32); + } for (SizeType step = 0; step < maxNewTokens; ++step) { auto const contextId = step % 2; + bool enqueueSuccessful = false; if (step == 0) { + SizeType contextIdForContextPhase = 0; + if (mRuntime->getNbProfiles() == 2) + { + contextIdForContextPhase = 2; + } buffers.prepareContextStep( - inputs.ids, inputs.padId, manager, *mKvCacheManager, generationConfig, mModelConfig); - buffers.getRuntimeBuffers( - inputBuffers[contextId], outputBuffers[contextId], step, inputs.ids, *mKvCacheManager, mModelConfig); - mRuntime->setInputTensors(contextId, inputBuffers[contextId]); - mRuntime->setOutputTensors(contextId, outputBuffers[contextId]); + inputs.ids, inputs.padId, manager, *mKvCacheManager, generationConfig, mModelConfig, mWorldConfig); + buffers.getRuntimeBuffers(inputBuffers[contextId], outputBuffers[contextId], step, inputs.ids, + *mKvCacheManager, mModelConfig, mWorldConfig); + mRuntime->setInputTensors(contextIdForContextPhase, inputBuffers[contextId]); + mRuntime->setOutputTensors(contextIdForContextPhase, outputBuffers[contextId]); + if (isCudaGraphMode()) { for (auto& instance : mCudaGraphInstances) @@ -198,16 +230,19 @@ void GptSession::generate( instance.clear(); } } - } - bool enqueueSuccessful = false; - if (isCudaGraphMode() && mCudaGraphInstances[contextId].hasInstance()) - { - mCudaGraphInstances[contextId].launch(stream); - enqueueSuccessful = true; + enqueueSuccessful = mRuntime->executeContext(contextIdForContextPhase); } else { - 
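Taken together, createContexts() and the step-0 branch in generate() above define a simple mapping from decoding step to execution context: contexts 0 and 1 flip-flop across generation steps, and an engine built with two optimization profiles gets a third context (index 2) that serves only the context phase. A sketch of that selection logic, with an illustrative helper name:

#include <cassert>

// Context selection implied by the code above: profile 0 covers the context
// phase, profile 1 the generation phase when two profiles are present.
int selectContextId(int step, int numProfiles)
{
    assert(numProfiles == 1 || numProfiles == 2);
    if (step == 0 && numProfiles == 2)
        return 2;    // dedicated context bound to the context-phase profile
    return step % 2; // alternate between the two generation contexts
}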
enqueueSuccessful = mRuntime->executeContext(contextId); + if (isCudaGraphMode() && mCudaGraphInstances[contextId].hasInstance()) + { + mCudaGraphInstances[contextId].launch(stream); + enqueueSuccessful = true; + } + else + { + enqueueSuccessful = mRuntime->executeContext(contextId); + } } TLLM_CHECK_WITH_INFO(enqueueSuccessful, "Executing TRT engine failed!"); @@ -215,50 +250,25 @@ void GptSession::generate( if (step == 0) { - buffers.postContextStep(manager, generationConfig, mModelConfig); + buffers.postContextStep(manager, generationConfig, mModelConfig, mWorldConfig); } std::swap(buffers.cacheIndirectionDecoderInput, buffers.cacheIndirectionDecoderOutput); - decoder::Input decodingInput{buffers.logits}; - decoder::Output decodingOutput{}; - decodingInput.cacheIndirection = buffers.cacheIndirectionDecoderInput; - decodingOutput.cacheIndirection = buffers.cacheIndirectionDecoderOutput; - if (step < maxNewTokens - 1) { auto const nextStep = step + 1; auto const nextContextId = nextStep % 2; auto nextInputIds = buffers.prepareNextStep( - step, mDecoder->getNewTokens(), manager, *mKvCacheManager, generationConfig, mModelConfig); + step, newTokens, manager, *mKvCacheManager, generationConfig, mModelConfig, mWorldConfig); buffers.getRuntimeBuffers(inputBuffers[nextContextId], outputBuffers[nextContextId], nextStep, nextInputIds, - *mKvCacheManager, mModelConfig); + *mKvCacheManager, mModelConfig, mWorldConfig); mRuntime->setInputTensors(nextContextId, inputBuffers[nextContextId]); mRuntime->setOutputTensors(nextContextId, outputBuffers[nextContextId]); if (isCudaGraphMode()) { - // capture cuda graph - cudaGraph_t next_graph; - TLLM_CUDA_CHECK(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal)); - mRuntime->executeContext(nextContextId); - TLLM_CUDA_CHECK(cudaStreamEndCapture(stream.get(), &next_graph)); - - if (mCudaGraphInstances[nextContextId].hasInstance()) - { - if (mCudaGraphInstances[nextContextId].update(next_graph)) - { - mCudaGraphInstances[nextContextId].clear(); - mCudaGraphInstances[nextContextId].create(next_graph); - } - } - else - { - mCudaGraphInstances[nextContextId].create(next_graph); - } - - TLLM_CUDA_CHECK(cudaGraphDestroy(next_graph)); - mCudaGraphInstances[nextContextId].uploadToStream(stream); + mCudaGraphInstances[nextContextId].prepareNextGraph(*mRuntime, nextContextId); } } @@ -267,17 +277,21 @@ void GptSession::generate( // FIXME(nkorobov): this synchronize is important to get logits right // manager.getStream().synchronize(); - auto const shouldStop = mDecoder->forward(decodingOutput, decodingInput); + auto shouldStop = executeDecoderStep(outputs.ids, newTokens, maxInputLength + step); - if (onTokenGenerated) + if (mWorldConfig.isFirstPipelineParallelRank()) { - // TODO(rkobus) use getNewTokens(), remove step from Callback? - onTokenGenerated(mDecoder->getOutputIds(), step, shouldStop || step == maxNewTokens - 1); + if (onTokenGenerated) + { + // TODO(rkobus) use getNewTokens(), remove step from Callback? + ITensor::SharedPtr outputIds + = mWorldConfig.isPipelineParallel() ? 
outputs.ids : mDecoder->getOutputIds(); + onTokenGenerated(outputIds, step, shouldStop || step == maxNewTokens - 1); + } } if (shouldStop) { - finalSeqLength = maxInputLength + step + 1; mLogger->log(nvinfer1::ILogger::Severity::kVERBOSE, "GPT decoding finished early"); break; } @@ -291,38 +305,161 @@ void GptSession::generate( } } - outputs.ids->reshape(ITensor::makeShape({batchSize, beamWidth, finalSeqLength})); - manager.copy(*mDecoder->getFinalOutputIds(), *outputs.ids); + finalizeOutputIds(*outputs.ids); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + +bool GptSession::executeDecoderStep(ITensor::SharedPtr& outputIds, ITensor::SharedPtr& newTokens, SizeType decoderStep) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto& stream = mRuntime->getStream(); + auto& buffers = *mBuffers; + + auto shouldStopPtr = bufferCast(*buffers.shouldStop); + auto& shouldStop = *shouldStopPtr; + shouldStop = false; + if (mWorldConfig.isLastPipelineParallelRank()) + { + decoder::Input decodingInput{buffers.logits}; + decoder::Output decodingOutput{}; + decodingInput.cacheIndirection = buffers.cacheIndirectionDecoderInput; + decodingOutput.cacheIndirection = buffers.cacheIndirectionDecoderOutput; + decodingOutput.sequenceLengths = buffers.sequenceLengths; + + shouldStop = mDecoder->forward(decodingOutput, decodingInput); + } + + if (mWorldConfig.isPipelineParallel()) + { + if (mWorldConfig.isLastPipelineParallelRank()) + { + for (auto peer = 0; peer < mWorldConfig.getPipelineParallelism() - 1; ++peer) + { + mPipelineComm->send(shouldStopPtr, 1, peer, stream, *mLogger); + } + mPipelineComm->send(bufferCast(*newTokens), newTokens->getSize(), 0, stream, *mLogger); + } + else + { + auto const peer = mWorldConfig.getPipelineParallelism() - 1; + mPipelineComm->receive(shouldStopPtr, 1, peer, stream, *mLogger); + + if (mWorldConfig.isFirstPipelineParallelRank()) + { + mPipelineComm->receive( + bufferCast(*newTokens), newTokens->getSize(), peer, stream, *mLogger); + + auto const& newTokensShape = newTokens->getShape(); + auto newTokensView + = ITensor::view(outputIds, ITensor::makeShape({1, newTokensShape.d[0] * newTokensShape.d[1]})); + auto const& outputIdsShape = outputIds->getShape(); + auto outputIdsView = ITensor::view( + outputIds, ITensor::makeShape({outputIdsShape.d[0] * outputIdsShape.d[1], outputIdsShape.d[2]})); + kernels::invokeTransposeWithOutputOffset(*outputIdsView, *newTokensView, decoderStep, stream); + } + } + } + sync_check_cuda_error(); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + return shouldStop; +} + +void GptSession::finalizeOutputIds(ITensor& outputIds) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto& manager = mRuntime->getBufferManager(); + auto& stream = mRuntime->getStream(); + + ITensor::SharedPtr finalOutputIds; + if (mWorldConfig.isLastPipelineParallelRank()) + { + finalOutputIds = mDecoder->getFinalOutputIds(); + if (mWorldConfig.isPipelineParallel()) + { + mPipelineComm->send( + bufferCast(*finalOutputIds), finalOutputIds->getSize(), 0, stream, *mLogger); + } + } + if (mWorldConfig.isFirstPipelineParallelRank()) + { + if (mWorldConfig.isPipelineParallel()) + { + auto const peer = mWorldConfig.getPipelineParallelism() - 1; + mPipelineComm->receive(bufferCast(outputIds), outputIds.getSize(), peer, stream, *mLogger); + } + else + { + manager.copy(*finalOutputIds, outputIds); + } + } sync_check_cuda_error(); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptSession::CudaGraphExecutor::create(cudaGraph_t const& graph) { + 
TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); assert(mInstance == nullptr); TLLM_CUDA_CHECK(cudaGraphInstantiate(&mInstance, graph, nullptr, nullptr, 0)); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptSession::CudaGraphExecutor::uploadToStream(CudaStream const& stream) { - assert(mInstance.hasInstance()); + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + assert(hasInstance()); TLLM_CUDA_CHECK(cudaGraphUpload(mInstance, stream.get())); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptSession::CudaGraphExecutor::launch(CudaStream const& stream) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); TLLM_CUDA_CHECK(cudaGraphLaunch(mInstance, stream.get())); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } bool GptSession::CudaGraphExecutor::update(cudaGraph_t const& graph) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); return cudaGraphExecUpdate(mInstance, graph, nullptr) != cudaSuccess; } void GptSession::CudaGraphExecutor::clear() { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); if (mInstance != nullptr) { TLLM_CUDA_CHECK(cudaGraphExecDestroy(mInstance)); mInstance = nullptr; } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + +void GptSession::CudaGraphExecutor::prepareNextGraph(TllmRuntime const& runtime, SizeType nextContextId) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto& stream = runtime.getStream(); + + cudaGraph_t nextGraph; + TLLM_CUDA_CHECK(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal)); + runtime.executeContext(nextContextId); + TLLM_CUDA_CHECK(cudaStreamEndCapture(stream.get(), &nextGraph)); + + if (hasInstance()) + { + if (update(nextGraph)) + { + clear(); + create(nextGraph); + } + } + else + { + create(nextGraph); + } + + TLLM_CUDA_CHECK(cudaGraphDestroy(nextGraph)); + uploadToStream(stream); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } diff --git a/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp new file mode 100644 index 00000000000..e338c0a46cf --- /dev/null +++ b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/runtime/ncclCommunicator.h" + +#include "tensorrt_llm/runtime/utils/multiDeviceUtils.h" + +#include +#include + +#if ENABLE_MULTI_DEVICE +#include +#endif // ENABLE_MULTI_DEVICE + +using namespace tensorrt_llm::runtime; + +namespace +{ +#if ENABLE_MULTI_DEVICE +//! \brief For converting a C++ data type to a Nccl data type. 
+template +struct NcclDataType +{ +}; + +template <> +struct NcclDataType +{ + static constexpr auto value = ncclDataType_t::ncclHalf; +}; + +template <> +struct NcclDataType +{ + static constexpr auto value = ncclDataType_t::ncclFloat; +}; + +template <> +struct NcclDataType +{ + static constexpr auto value = ncclDataType_t::ncclUint8; +}; + +template <> +struct NcclDataType +{ + static constexpr auto value = ncclDataType_t::ncclInt32; +}; +#endif // ENABLE_MULTI_DEVICE +} // namespace + +template +void NcclCommunicator::send( + T* sendbuff, size_t count, int peer, CudaStream const& stream, nvinfer1::ILogger& logger) const +{ +#if ENABLE_MULTI_DEVICE + auto datatype = NcclDataType::value; + TLLM_NCCL_CHECK(ncclSend(sendbuff, count, datatype, peer, mComm, stream.get()), logger); +#else + TLLM_THROW("Multi device support is disabled."); +#endif // ENABLE_MULTI_DEVICE +} + +template void NcclCommunicator::send(std::uint8_t*, size_t, int, CudaStream const&, nvinfer1::ILogger&) const; +template void NcclCommunicator::send(std::int32_t*, size_t, int, CudaStream const&, nvinfer1::ILogger&) const; + +template +void NcclCommunicator::receive( + T* sendbuff, size_t count, int peer, CudaStream const& stream, nvinfer1::ILogger& logger) const +{ +#if ENABLE_MULTI_DEVICE + auto datatype = NcclDataType::value; + TLLM_NCCL_CHECK(ncclRecv(sendbuff, count, datatype, peer, mComm, stream.get()), logger); +#else + TLLM_THROW("Multi device support is disabled."); +#endif // ENABLE_MULTI_DEVICE +} + +template void NcclCommunicator::receive(std::uint8_t*, size_t, int, CudaStream const&, nvinfer1::ILogger&) const; +template void NcclCommunicator::receive(std::int32_t*, size_t, int, CudaStream const&, nvinfer1::ILogger&) const; + +std::shared_ptr NcclCommunicator::createPipelineComm( + WorldConfig const& worldConfig, nvinfer1::ILogger& logger) +{ +#if ENABLE_MULTI_DEVICE + auto ppGroup = worldConfig.getPipelineParallelGroup(); + + int myRank = worldConfig.getRank(); + int groupRank = 0; + for (auto it = ppGroup.begin(); it != ppGroup.end(); ++it) + { + if (*it == myRank) + { + break; + } + ++groupRank; + } + + ncclUniqueId id; + if (myRank == ppGroup.front()) + { + ncclGetUniqueId(&id); + for (auto it = std::next(std::begin(ppGroup), 1); it != ppGroup.end(); ++it) + { + TLLM_MPI_CHECK(MPI_Send(&id, sizeof(id), MPI_BYTE, *it, 0, MPI_COMM_WORLD), logger); + } + } + else + { + MPI_Status status; + TLLM_MPI_CHECK(MPI_Recv(&id, sizeof(id), MPI_BYTE, ppGroup.front(), 0, MPI_COMM_WORLD, &status), logger); + } + + auto pipelineComm = std::make_shared(); + TLLM_NCCL_CHECK(ncclCommInitRank(&pipelineComm->mComm, ppGroup.size(), id, groupRank), logger); + + return pipelineComm; +#else + TLLM_THROW("Multi device support is disabled."); + return nullptr; +#endif // ENABLE_MULTI_DEVICE +} diff --git a/cpp/tensorrt_llm/runtime/ncclCommunicator.h b/cpp/tensorrt_llm/runtime/ncclCommunicator.h new file mode 100644 index 00000000000..1843cd24a4f --- /dev/null +++ b/cpp/tensorrt_llm/runtime/ncclCommunicator.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/runtime/cudaStream.h" +#include "tensorrt_llm/runtime/worldConfig.h" + +struct ncclComm; +typedef struct ncclComm* ncclComm_t; + +namespace tensorrt_llm::runtime +{ + +class NcclCommunicator +{ +public: + template + void send(T* sendbuff, size_t count, int peer, CudaStream const& stream, nvinfer1::ILogger& logger) const; + + template + void receive(T* sendbuff, size_t count, int peer, CudaStream const& stream, nvinfer1::ILogger& logger) const; + + static std::shared_ptr createPipelineComm( + WorldConfig const& worldConfig, nvinfer1::ILogger& logger); + +private: + ncclComm_t mComm; +}; + +} // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp b/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp index 0875eb59032..7252a633544 100644 --- a/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp +++ b/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp @@ -19,6 +19,9 @@ #include "tensorrt_llm/runtime/runtimeBuffers.h" +#include +#include + #include "tensorrt_llm/batch_manager/kvCacheManager.h" #include "tensorrt_llm/runtime/runtimeKernels.h" #include "tensorrt_llm/runtime/tllmRuntime.h" @@ -30,20 +33,22 @@ RuntimeBuffers::GenerationConfig RuntimeBuffers::GenerationConfig::fromInput(ITe ITensor::SharedPtr const& inputLengthsHost, bool const inputPacked, SizeType const beamWidth, SizeType const maxSequenceLength, std::optional const& maxNewTokensOpt, BufferManager& manager) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const batchSize = static_cast(inputLengthsHost->getSize()); auto const* inputLengthsPtr = bufferCast(*inputLengthsHost); - auto const maxInputLength = *std::max_element(inputLengthsPtr, inputLengthsPtr + batchSize); + SizeType const maxInputLength = *std::max_element(inputLengthsPtr, inputLengthsPtr + batchSize); + auto const& inputShape = inputIds->getShape(); if (inputPacked) { auto const inputLengthSum = std::reduce(inputLengthsPtr, inputLengthsPtr + batchSize); - TLLM_CHECK_WITH_INFO(inputIds->getShape().d[0] == 1 && inputIds->getShape().d[1] == inputLengthSum, + TLLM_CHECK_WITH_INFO(inputShape.d[0] == 1 && inputShape.d[1] == inputLengthSum, "Packed input must have shape [1, ]."); } else { - TLLM_CHECK_WITH_INFO(inputIds->getShape().d[0] == batchSize && inputIds->getShape().d[1] == maxInputLength, + TLLM_CHECK_WITH_INFO(inputShape.d[0] == batchSize && inputShape.d[1] == maxInputLength, "Padded input must have shape [batch size, max input length]"); } @@ -52,55 +57,74 @@ RuntimeBuffers::GenerationConfig RuntimeBuffers::GenerationConfig::fromInput(ITe "Max input length is equal to or larger that maxSequenceLength given in setup. 
No new tokens can be " "generated."); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return GenerationConfig{batchSize, beamWidth, maxInputLength, maxNewTokens, maxSequenceLength}; } void RuntimeBuffers::clear() { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + contextLengthsHost = nullptr; + contextLengthsDevice = nullptr; + logits = nullptr; sequenceLengths = nullptr; pastKeyValueLengths = nullptr; attentionMask = nullptr; positionIds = nullptr; lastTokenIds = nullptr; + requestTypes = nullptr; presentKeysVals.clear(); presentKeysValsAlt.clear(); + kvCacheBlockPointers = nullptr; - contextLengthsHost = nullptr; - requestTypes = nullptr; + cacheIndirectionDecoderInput = nullptr; + cacheIndirectionDecoderOutput = nullptr; + + hiddenStates = nullptr; allocated = false; + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::create(TllmRuntime& runtime, GptModelConfig const& modelConfig) +void RuntimeBuffers::create(TllmRuntime& runtime, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& manager = runtime.getBufferManager(); + auto& engine = runtime.getEngine(); - auto const logitsType = utils::getTensorDataType(runtime.getEngine(), "logits"); - logits = manager.emptyTensor(MemoryType::kGPU, logitsType); + if (worldConfig.isLastPipelineParallelRank()) + { + auto const logitsType = engine.getTensorDataType("logits"); + logits = manager.emptyTensor(MemoryType::kGPU, logitsType); + } contextLengthsHost = manager.emptyTensor(MemoryType::kPINNED, nvinfer1::DataType::kINT32); - inputOffsets = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); + sequenceLengths = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); + lastTokenIds = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); + + auto const localNbLayers = modelConfig.getNbLayers(worldConfig.getPipelineParallelism()); + auto const firstLayerId = worldConfig.getPipelineParallelRank() * localNbLayers; presentKeysVals - = utils::createBufferVector(runtime, modelConfig.getNbLayers(), "present_key_value_", MemoryType::kGPU); + = utils::createBufferVector(runtime, firstLayerId, localNbLayers, "present_key_value_", MemoryType::kGPU); if (modelConfig.useGptAttentionPlugin()) { - sequenceLengths = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); pastKeyValueLengths = manager.emptyTensor(MemoryType::kCPU, nvinfer1::DataType::kINT32); } else { presentKeysValsAlt - = utils::createBufferVector(runtime, modelConfig.getNbLayers(), "present_key_value_", MemoryType::kGPU); + = utils::createBufferVector(runtime, firstLayerId, localNbLayers, "present_key_value_", MemoryType::kGPU); } if (modelConfig.usePagedKvCache()) { - kvCacheBlockPointers = utils::createBufferVector( - runtime, modelConfig.getNbLayers(), "kv_cache_block_pointers_", MemoryType::kGPU); + auto const kvCacheBlockPointersType + = engine.getTensorDataType(("kv_cache_block_pointers_" + std::to_string(firstLayerId)).c_str()); + kvCacheBlockPointers = manager.emptyTensor(MemoryType::kGPU, kvCacheBlockPointersType); } if (modelConfig.useGptAttentionPlugin()) @@ -110,31 +134,51 @@ void RuntimeBuffers::create(TllmRuntime& runtime, GptModelConfig const& modelCon cacheIndirectionDecoderInput = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); cacheIndirectionDecoderOutput = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); + + shouldStop = BufferManager::pinned(ITensor::makeShape({1}), 
nvinfer1::DataType::kUINT8); + + if (worldConfig.isPipelineParallel()) + { + hiddenStates = manager.emptyTensor(MemoryType::kGPU, modelConfig.getDataType()); + } + + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void RuntimeBuffers::reshape( - GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, SizeType worldSize) + GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto const batchSize = generationConfig.batchSize; auto const beamWidth = generationConfig.beamWidth; + auto const maxInputLength = generationConfig.maxInputLength; auto const maxSeqLength = generationConfig.maxSeqLength; - auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldSize); - // logits are tiled to {batchSize, beamWidth, vocabSizePadded} after context step of engine - logits->reshape(ITensor::makeShape({batchSize, 1, vocabSizePadded})); + if (worldConfig.isLastPipelineParallelRank()) + { + auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize()); + // logits are tiled to {batchSize, beamWidth, vocabSizePadded} after context step of engine + logits->reshape(ITensor::makeShape({batchSize, 1, vocabSizePadded})); + } + + sequenceLengths->reshape(ITensor::makeShape({batchSize})); + lastTokenIds->reshape(ITensor::makeShape({batchSize})); auto kvCacheShape = ITensor::makeShape({batchSize, 2, modelConfig.getNbKvHeads(), maxSeqLength, modelConfig.getSizePerHead()}); if (modelConfig.usePagedKvCache()) { + auto const localNbLayers = modelConfig.getNbLayers(worldConfig.getPipelineParallelism()); auto const tokensPerBlock = modelConfig.getTokensPerBlock(); auto const maxBlocksPerSeq = (maxSeqLength + tokensPerBlock - 1) / tokensPerBlock; // reserve batchSize * beamWidth and resize to batchSize - auto cacheBlockPointersShape = ITensor::makeShape({batchSize * beamWidth, 2, maxBlocksPerSeq * 2}); - utils::reshapeBufferVector(kvCacheBlockPointers, cacheBlockPointersShape); - cacheBlockPointersShape.d[0] = batchSize; - utils::reshapeBufferVector(kvCacheBlockPointers, cacheBlockPointersShape); + auto cacheBlockPointersShape + = ITensor::makeShape({localNbLayers, batchSize * beamWidth, 2, maxBlocksPerSeq * 2}); + kvCacheBlockPointers->reshape(cacheBlockPointersShape); + cacheBlockPointersShape.d[1] = batchSize; + kvCacheBlockPointers->reshape(cacheBlockPointersShape); } else { @@ -143,7 +187,6 @@ void RuntimeBuffers::reshape( if (modelConfig.useGptAttentionPlugin()) { - sequenceLengths->reshape(ITensor::makeShape({batchSize})); pastKeyValueLengths->reshape(ITensor::makeShape({batchSize})); requestTypes->reshape(ITensor::makeShape({batchSize})); } @@ -156,27 +199,40 @@ void RuntimeBuffers::reshape( cacheIndirectionDecoderInput->reshape(cacheIndirShape); cacheIndirectionDecoderOutput->reshape(cacheIndirShape); + if (worldConfig.isPipelineParallel()) + { + // reserve max size + auto const maxNumTokens = std::max(batchSize * beamWidth, batchSize * maxInputLength); + auto const hiddenSize = modelConfig.getHiddenSize() * worldConfig.getTensorParallelism(); + auto const hiddenStatesShape = ITensor::makeShape({1, maxNumTokens, hiddenSize}); + hiddenStates->reshape(hiddenStatesShape); + } + allocated = true; + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::tile( - BufferManager& manager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig) +void RuntimeBuffers::tile(BufferManager& manager, GenerationConfig const& 
generationConfig, + GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { - auto const batchSize = generationConfig.batchSize; + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const beamWidth = generationConfig.beamWidth; TLLM_CHECK_WITH_INFO(beamWidth > 1, "Tiling is only necessary for beam search."); - // logits needs beamWidth in second dimension - auto logitsShape = logits->getShape(); - logitsShape.d[1] *= beamWidth; - utils::tileBufferReplace(logits, beamWidth, manager); - logits->reshape(logitsShape); + if (worldConfig.isLastPipelineParallelRank()) + { + // logits needs beamWidth in second dimension + auto logitsShape = logits->getShape(); + logitsShape.d[1] *= beamWidth; + utils::tileBufferReplace(logits, beamWidth, manager); + logits->reshape(logitsShape); + } utils::tileBufferReplace(contextLengthsDevice, beamWidth, manager); + utils::tileBufferReplace(sequenceLengths, beamWidth, manager); if (modelConfig.useGptAttentionPlugin()) { - utils::tileBufferReplace(sequenceLengths, beamWidth, manager); utils::tileCpuBufferReplace(contextLengthsHost, beamWidth, manager); utils::tileCpuBufferReplace(pastKeyValueLengths, beamWidth, manager); } @@ -192,14 +248,15 @@ void RuntimeBuffers::tile( for (auto& buffer : presentKeysValsAlt) utils::tileBufferReplace(buffer, beamWidth, manager); } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::postContextStep( - BufferManager& manager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig) +void RuntimeBuffers::postContextStep(BufferManager& manager, GenerationConfig const& generationConfig, + GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const batchSize = generationConfig.batchSize; auto const beamWidth = generationConfig.beamWidth; - auto const maxSeqLength = generationConfig.maxSeqLength; if (modelConfig.useGptAttentionPlugin()) { @@ -210,7 +267,7 @@ void RuntimeBuffers::postContextStep( if (beamWidth > 1) { - tile(manager, generationConfig, modelConfig); + tile(manager, generationConfig, modelConfig, worldConfig); } // no need to copy data in lastTokenIds because it is overwritten in prepareNextStep @@ -218,21 +275,23 @@ void RuntimeBuffers::postContextStep( if (modelConfig.useGptAttentionPlugin() && modelConfig.usePagedKvCache()) { - auto const& pointersShape = kvCacheBlockPointers[0]->getShape(); - auto const maxBlocksPerSeq = pointersShape.d[pointersShape.nbDims - 1] / 2; - auto cacheBlockPointersShape = ITensor::makeShape({batchSize * beamWidth, 2, maxBlocksPerSeq * 2}); - utils::reshapeBufferVector(kvCacheBlockPointers, cacheBlockPointersShape); + auto cacheBlockPointersShape = kvCacheBlockPointers->getShape(); + cacheBlockPointersShape.d[1] = batchSize * beamWidth; + kvCacheBlockPointers->reshape(cacheBlockPointersShape); } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType const padId, BufferManager& manager, - KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig) + KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& stream = manager.getStream(); SizeType const batchSize = generationConfig.batchSize; - SizeType const beamWidth = generationConfig.beamWidth; SizeType const maxInputLength = 
generationConfig.maxInputLength; - SizeType const maxSeqLength = generationConfig.maxSeqLength; + + manager.copy(*contextLengthsDevice, *sequenceLengths); if (modelConfig.useGptAttentionPlugin()) { @@ -246,24 +305,25 @@ void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType c std::fill_n(RequestTypesPtr, batchSize, 0); } - if (modelConfig.usePackedInput()) + auto const inputSize = inputIds->getSize(); + auto const& inputShape = inputIds->getShape(); + + auto const contextLengthsHostPtr = bufferCast(*contextLengthsHost); + std::vector positionIdsVec(inputSize); + auto begin = std::begin(positionIdsVec); + for (SizeType i = 0; i < batchSize; ++i) { - auto const inputOffsetsHost = manager.copyFrom(*inputOffsets, MemoryType::kCPU); - auto const* inputOffsetsPtr = bufferCast(*inputOffsetsHost); - - std::vector positionIdsVec(inputIds->getShape().d[1]); - for (SizeType i = 0; i < batchSize; ++i) - std::iota(std::begin(positionIdsVec) + inputOffsetsPtr[i], - std::begin(positionIdsVec) + inputOffsetsPtr[i + 1], 0); - positionIds = manager.copyFrom(positionIdsVec, inputIds->getShape(), MemoryType::kGPU); + auto end = begin + (modelConfig.usePackedInput() ? contextLengthsHostPtr[i] : maxInputLength); + std::iota(begin, end, 0); + begin = end; } - else + positionIds = manager.copyFrom(positionIdsVec, inputShape, MemoryType::kGPU); + + if (worldConfig.isPipelineParallel()) { - std::vector positionIdsVec(inputIds->getSize()); - for (SizeType i = 0; i < batchSize; ++i) - std::iota(std::begin(positionIdsVec) + i * maxInputLength, - std::begin(positionIdsVec) + (i + 1) * maxInputLength, 0); - positionIds = manager.copyFrom(positionIdsVec, inputIds->getShape(), MemoryType::kGPU); + auto const hiddenSize = hiddenStates->getShape().d[2]; + auto const hiddenStatesShape = ITensor::makeShape({inputShape.d[0], inputShape.d[1], hiddenSize}); + hiddenStates->reshape(hiddenStatesShape); } } else @@ -285,62 +345,55 @@ void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType c positionIds = manager.copyFrom(positionIdsVec, attentionMask->getShape(), MemoryType::kGPU); } - if (modelConfig.useGptAttentionPlugin()) - { - manager.copy(*contextLengthsDevice, *sequenceLengths); - } - if (modelConfig.useGptAttentionPlugin() && modelConfig.usePagedKvCache()) { auto constexpr contextBeamWidth = 1; - auto const& pointersShape = kvCacheBlockPointers[0]->getShape(); + auto const& pointersShape = kvCacheBlockPointers->getShape(); auto const maxBlocksPerSeq = pointersShape.d[pointersShape.nbDims - 1] / 2; auto const& blockPointersBatch = kvCacheManager.getBlockPointersOfBatch(batchSize, contextBeamWidth, maxBlocksPerSeq); - for (auto layer = 0; layer < modelConfig.getNbLayers(); ++layer) - { - TLLM_CHECK(blockPointersBatch[layer]->getSizeInBytes() == kvCacheBlockPointers[layer]->getSizeInBytes()); - auto pointersPtr = bufferCast(*blockPointersBatch[layer]); - auto pointersPtr32 = reinterpret_cast(pointersPtr); - manager.copy(pointersPtr32, *kvCacheBlockPointers[layer]); - } + TLLM_CHECK(blockPointersBatch->getSizeInBytes() == kvCacheBlockPointers->getSizeInBytes()); + auto pointersPtr = bufferCast(*blockPointersBatch); + auto pointersPtr32 = reinterpret_cast(pointersPtr); + manager.copy(pointersPtr32, *kvCacheBlockPointers); } if (modelConfig.usePackedInput()) { - lastTokenIds = manager.copyFrom(*ITensor::slice(inputOffsets, 1), MemoryType::kGPU); + kernels::invokeInclusiveSum(*lastTokenIds, *contextLengthsDevice, manager, stream); } else { - lastTokenIds = 
manager.copyFrom(*contextLengthsDevice, MemoryType::kGPU); + manager.copy(*contextLengthsDevice, *lastTokenIds); } manager.setZero(*cacheIndirectionDecoderInput); manager.setZero(*cacheIndirectionDecoderOutput); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); }; RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, TensorPtr const& outputIds, BufferManager& manager, KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig) + GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& stream = manager.getStream(); SizeType const batchSize = generationConfig.batchSize; SizeType const beamWidth = generationConfig.beamWidth; - SizeType const maxSeqLength = generationConfig.maxSeqLength; - nvinfer1::Dims nextInputIdsShape; + nvinfer1::Dims inputShape; if (modelConfig.usePackedInput()) { - // squeeze first dim and batch in last dim - nextInputIdsShape = ITensor::makeShape({1, batchSize * beamWidth}); + // batch in last dim + inputShape = ITensor::makeShape({1, batchSize * beamWidth}); } else { - // squeeze first dim - nextInputIdsShape = ITensor::makeShape({batchSize * beamWidth, 1}); + // batch in first dim + inputShape = ITensor::makeShape({batchSize * beamWidth, 1}); } - auto nextInputIds = ITensor::view(outputIds, nextInputIdsShape); + auto nextInputIds = ITensor::view(outputIds, inputShape); if (modelConfig.useGptAttentionPlugin()) { @@ -354,18 +407,16 @@ RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, T pastKeyValueLengthsPtr[i] = contextLengthsHostPtr[i * srcStride] + step; } - // The sequence_lengths = context_lengths + step for generation stage. - kernels::invokeAdd(*sequenceLengths, 1, stream); - - positionIds->reshape(contextLengthsDevice->getShape()); + positionIds->reshape(inputShape); manager.copy(*contextLengthsDevice, *positionIds); kernels::invokeAdd(*positionIds, step, stream); - auto const size = static_cast(positionIds->getSize()); - if (modelConfig.usePackedInput()) - positionIds->reshape(ITensor::makeShape({1, size})); - else - positionIds->reshape(ITensor::makeShape({size, 1})); + if (worldConfig.isPipelineParallel()) + { + auto const hiddenSize = hiddenStates->getShape().d[2]; + auto const hiddenStatesShape = ITensor::makeShape({inputShape.d[0], inputShape.d[1], hiddenSize}); + hiddenStates->reshape(hiddenStatesShape); + } } else { @@ -405,16 +456,13 @@ RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, T { kvCacheManager.addToken(batchIdx); } - auto const& pointersShape = kvCacheBlockPointers[0]->getShape(); + auto const& pointersShape = kvCacheBlockPointers->getShape(); auto const maxBlocksPerSeq = pointersShape.d[pointersShape.nbDims - 1] / 2; auto const& blockPointersBatch = kvCacheManager.getBlockPointersOfBatch(batchSize, beamWidth, maxBlocksPerSeq); - for (auto layer = 0; layer < modelConfig.getNbLayers(); ++layer) - { - TLLM_CHECK(blockPointersBatch[layer]->getSizeInBytes() == kvCacheBlockPointers[layer]->getSizeInBytes()); - auto pointersPtr = bufferCast(*blockPointersBatch[layer]); - auto pointersPtr32 = reinterpret_cast(pointersPtr); - manager.copy(pointersPtr32, *kvCacheBlockPointers[layer]); - } + TLLM_CHECK(blockPointersBatch->getSizeInBytes() == kvCacheBlockPointers->getSizeInBytes()); + auto pointersPtr = bufferCast(*blockPointersBatch); + auto pointersPtr32 = reinterpret_cast(pointersPtr); + manager.copy(pointersPtr32, *kvCacheBlockPointers); } 
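The context-step code above derives last_token_ids from the context lengths (an inclusive sum for packed input, a plain copy otherwise), and the generation-step code just below fills ones and then runs an inclusive sum when the input is packed. A host-side toy of the same arithmetic; the patch performs it on the GPU with invokeInclusiveSum and invokeFill:

#include <numeric>
#include <vector>

// For packed input the values are running offsets into the token buffer;
// for padded input they are per-row indices.
std::vector<int> lastTokenIdsContextStep(std::vector<int> const& contextLengths, bool packedInput)
{
    std::vector<int> ids(contextLengths);
    if (packedInput)
        std::inclusive_scan(ids.begin(), ids.end(), ids.begin());
    return ids;
}

std::vector<int> lastTokenIdsGenerationStep(int batchSize, int beamWidth, bool packedInput)
{
    std::vector<int> ids(static_cast<size_t>(batchSize) * beamWidth, 1); // one new token per sequence
    if (packedInput)
        std::inclusive_scan(ids.begin(), ids.end(), ids.begin());        // 1, 2, ..., batchSize * beamWidth
    return ids;
}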
kernels::invokeFill(*lastTokenIds, 1, stream); @@ -423,23 +471,44 @@ RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, T kernels::invokeInclusiveSum(*lastTokenIds, *lastTokenIds, manager, stream); } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return nextInputIds; }; void RuntimeBuffers::getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, SizeType const step, - TensorPtr const& inputIds, KvCacheManager& kvCacheManager, GptModelConfig const& modelConfig) const + TensorPtr const& inputIds, KvCacheManager& kvCacheManager, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig) const { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); inputBuffers.clear(); outputBuffers.clear(); - outputBuffers.insert_or_assign("logits", ITensor::view(logits)); // feed a view to TensorRT runtime + if (worldConfig.isLastPipelineParallelRank()) + { + // feed a view to TensorRT runtime so reshaping does not change logits buffer + outputBuffers.insert_or_assign("logits", ITensor::view(logits)); + } + else + { + outputBuffers.insert_or_assign("hidden_states_output", hiddenStates); + } - inputBuffers.insert_or_assign("input_ids", inputIds); + if (worldConfig.isFirstPipelineParallelRank()) + { + inputBuffers.insert_or_assign("input_ids", inputIds); + } + else + { + inputBuffers.insert_or_assign("hidden_states_input", hiddenStates); + } inputBuffers.insert_or_assign("context_lengths", contextLengthsDevice); inputBuffers.insert_or_assign("last_token_ids", lastTokenIds); inputBuffers.insert_or_assign("position_ids", positionIds); + auto const localNbLayers = modelConfig.getNbLayers(worldConfig.getPipelineParallelism()); + auto const firstLayerId = worldConfig.getPipelineParallelRank() * localNbLayers; + if (modelConfig.useGptAttentionPlugin()) { inputBuffers.insert_or_assign("cache_indirection", cacheIndirectionDecoderOutput); @@ -453,14 +522,15 @@ void RuntimeBuffers::getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outpu } if (modelConfig.usePagedKvCache()) { - utils::insertTensorVector(inputBuffers, "past_key_value_", kvCacheManager.getMemoryPools()); - utils::insertTensorVector(outputBuffers, "present_key_value_", kvCacheManager.getMemoryPools()); - utils::insertTensorVector(inputBuffers, "kv_cache_block_pointers_", kvCacheBlockPointers); + utils::insertTensorVector(inputBuffers, "past_key_value_", kvCacheManager.getMemoryPools(), firstLayerId); + utils::insertTensorVector( + outputBuffers, "present_key_value_", kvCacheManager.getMemoryPools(), firstLayerId); + utils::insertTensorSlices(inputBuffers, "kv_cache_block_pointers_", kvCacheBlockPointers, firstLayerId); } else { - utils::insertTensorVector(inputBuffers, "past_key_value_", presentKeysVals); - utils::insertTensorVector(outputBuffers, "present_key_value_", presentKeysVals); + utils::insertTensorVector(inputBuffers, "past_key_value_", presentKeysVals, firstLayerId); + utils::insertTensorVector(outputBuffers, "present_key_value_", presentKeysVals, firstLayerId); } } else @@ -468,14 +538,14 @@ void RuntimeBuffers::getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outpu inputBuffers.insert_or_assign("attention_mask", attentionMask); inputBuffers.insert_or_assign("cache_indirection", cacheIndirectionDecoderOutput); utils::insertTensorVector( - outputBuffers, "present_key_value_", (step % 2) ? presentKeysValsAlt : presentKeysVals); + outputBuffers, "present_key_value_", (step % 2) ? 
presentKeysValsAlt : presentKeysVals, firstLayerId); if (step == 0) { auto kvCacheShape = presentKeysValsAlt.at(0)->getShape(); kvCacheShape.d[3] = 0; - for (SizeType i = 0; i < modelConfig.getNbLayers(); ++i) + for (SizeType i = firstLayerId; i < firstLayerId + localNbLayers; ++i) { std::string name = "past_key_value_" + std::to_string(i); TensorPtr tmp = ITensor::view(presentKeysValsAlt[i], kvCacheShape); @@ -485,7 +555,8 @@ void RuntimeBuffers::getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outpu else { utils::insertTensorVector( - inputBuffers, "past_key_value_", (step % 2) ? presentKeysVals : presentKeysValsAlt); + inputBuffers, "past_key_value_", (step % 2) ? presentKeysVals : presentKeysValsAlt, firstLayerId); } } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } diff --git a/cpp/tensorrt_llm/runtime/runtimeBuffers.h b/cpp/tensorrt_llm/runtime/runtimeBuffers.h index 16981c6c655..049b815dca5 100644 --- a/cpp/tensorrt_llm/runtime/runtimeBuffers.h +++ b/cpp/tensorrt_llm/runtime/runtimeBuffers.h @@ -19,6 +19,7 @@ #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/gptModelConfig.h" #include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/worldConfig.h" namespace tensorrt_llm::batch_manager::kv_cache_manager { @@ -40,7 +41,7 @@ class RuntimeBuffers // general TensorPtr contextLengthsHost; TensorPtr contextLengthsDevice; - TensorPtr inputOffsets; + TensorPtr inputOffsets; // helper for packed input // engine TensorPtr logits; @@ -49,16 +50,22 @@ class RuntimeBuffers TensorPtr attentionMask; // without attention plugin TensorPtr positionIds; TensorPtr lastTokenIds; - TensorPtr requestTypes; // with attention plugin and inflight batching. Host tensor + TensorPtr requestTypes; // with attention plugin. 
Host tensor std::vector presentKeysVals; std::vector presentKeysValsAlt; // without attention plugin - std::vector kvCacheBlockPointers; + TensorPtr kvCacheBlockPointers; // [numLayers, batchSize * beamWidth, 2, maxBlocksPerSeq * 2] // beam search (shared between engine and decoder) TensorPtr cacheIndirectionDecoderInput; TensorPtr cacheIndirectionDecoderOutput; + // decoder + TensorPtr shouldStop; + + // pipeline parallelism + TensorPtr hiddenStates; + bool allocated{false}; public: @@ -91,24 +98,28 @@ class RuntimeBuffers public: void clear(); - void create(TllmRuntime& runtime, GptModelConfig const& modelConfig); + void create(TllmRuntime& runtime, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - void reshape(GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, SizeType worldSize); + void reshape( + GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - void postContextStep( - BufferManager& manager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig); + void postContextStep(BufferManager& manager, GenerationConfig const& generationConfig, + GptModelConfig const& modelConfig, WorldConfig const& worldConfig); void prepareContextStep(TensorPtr const& inputIds, TokenIdType padId, BufferManager& manager, - KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig); + KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig); TensorPtr prepareNextStep(SizeType step, TensorPtr const& outputIds, BufferManager& manager, - KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig); + KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig); void getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, SizeType step, TensorPtr const& inputIds, - KvCacheManager& kvCacheManager, GptModelConfig const& modelConfig) const; + KvCacheManager& kvCacheManager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) const; private: // Some tensors are properly tiled, some are just reshaped. 
- void tile(BufferManager& manager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig); + void tile(BufferManager& manager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig); }; } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/runtimeKernels.cu b/cpp/tensorrt_llm/runtime/runtimeKernels.cu index a70ad011b1f..892385e8749 100644 --- a/cpp/tensorrt_llm/runtime/runtimeKernels.cu +++ b/cpp/tensorrt_llm/runtime/runtimeKernels.cu @@ -35,9 +35,10 @@ namespace template __global__ void fill(T* data, std::size_t size, T const value) { - auto const idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const tidx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const stride = static_cast(blockDim.x) * gridDim.x; - if (idx < size) + for (auto idx = tidx; idx < size; idx += stride) { data[idx] = value; } @@ -49,14 +50,17 @@ void invokeFill(IBuffer& buffer, T const value, CudaStream const& stream) { auto data = bufferCast(buffer); auto const size = buffer.getSize(); - dim3 const blockSize(256); - dim3 const gridSize((size + blockSize.x - 1) / blockSize.x); + dim3 const blockSize{256}; + std::size_t const gridx{tc::ceilDiv(size, blockSize.x)}; + std::size_t const gridMax{std::numeric_limits::max()}; + dim3 const gridSize{static_cast(std::min(gridx, gridMax))}; fill<<>>(data, size, value); } // template instantiation -template void invokeFill(IBuffer&, SizeType, CudaStream const&); +template void invokeFill(IBuffer&, std::int32_t, CudaStream const&); +template void invokeFill(IBuffer&, std::int8_t, CudaStream const&); template void invokeFill(IBuffer&, float, CudaStream const&); namespace @@ -64,9 +68,10 @@ namespace template __global__ void add(T* data, std::size_t size, T const value) { - auto const idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const tidx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const stride = static_cast(blockDim.x) * gridDim.x; - if (idx < size) + for (auto idx = tidx; idx < size; idx += stride) { data[idx] += value; } @@ -78,13 +83,17 @@ void invokeAdd(IBuffer& buffer, T const value, CudaStream const& stream) { auto data = bufferCast(buffer); auto const size = buffer.getSize(); - dim3 const blockSize(256); - dim3 const gridSize((size + blockSize.x - 1) / blockSize.x); + dim3 const blockSize{256}; + std::size_t const gridx{tc::ceilDiv(size, blockSize.x)}; + std::size_t const gridMax{std::numeric_limits::max()}; + dim3 const gridSize{static_cast(std::min(gridx, gridMax))}; add<<>>(data, size, value); } -template void invokeAdd(IBuffer&, SizeType, CudaStream const&); +template void invokeAdd(IBuffer&, std::int32_t, CudaStream const&); +template void invokeAdd(IBuffer&, std::int8_t, CudaStream const&); +template void invokeAdd(IBuffer&, float, CudaStream const&); namespace { @@ -572,19 +581,21 @@ void invokeCopyPackedInputToOutput(ITensor& outputIds, ITensor const& inputIds, namespace { template -__global__ void scatterTensor(T* output, T const* input, SizeType const batchSize, SizeType const inputRowSize, - SizeType const outputRowSize, SizeType const beamWidth) +__global__ void scatterTensor(T* output, T const* input, std::uint32_t const batchSize, + std::uint32_t const inputRowSize, std::size_t const outputRowSize, std::uint32_t const beamWidth) { - SizeType const tidx = blockIdx.x * blockDim.x + threadIdx.x; - SizeType const tidy = blockIdx.y * blockDim.y + threadIdx.y; + auto const tidx = static_cast(blockIdx.x) * 
blockDim.x + threadIdx.x; + auto const tidy = static_cast(blockIdx.y) * blockDim.y + threadIdx.y; + auto const stridex = static_cast(blockDim.x) * gridDim.x; + auto const stridey = static_cast(blockDim.y) * gridDim.y; - for (SizeType batchIdx = tidy; batchIdx < batchSize; batchIdx += blockDim.y * gridDim.y) + for (auto batchIdx = tidy; batchIdx < batchSize; batchIdx += stridey) { - for (SizeType columnIdx = tidx; columnIdx < inputRowSize; columnIdx += blockDim.x * gridDim.x) + for (auto columnIdx = tidx; columnIdx < inputRowSize; columnIdx += stridex) { auto const inputIdx = batchIdx * inputRowSize + columnIdx; auto const value = input[inputIdx]; - SizeType constexpr beamIdx = 0; + std::size_t constexpr beamIdx{0}; auto const outputIdx = (batchIdx * beamWidth + beamIdx) * outputRowSize + columnIdx; output[outputIdx] = value; } @@ -592,19 +603,21 @@ __global__ void scatterTensor(T* output, T const* input, SizeType const batchSiz } template -__global__ void tileTensor(T* output, T const* input, SizeType const batchSize, SizeType const inputRowSize, - SizeType const outputRowSize, SizeType const beamWidth) +__global__ void tileTensor(T* output, T const* input, std::uint32_t const batchSize, std::size_t const inputRowSize, + std::size_t const outputRowSize, std::uint32_t const beamWidth) { - SizeType const tidx = blockIdx.x * blockDim.x + threadIdx.x; - SizeType const tidy = blockIdx.y * blockDim.y + threadIdx.y; + auto const tidx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const tidy = static_cast(blockIdx.y) * blockDim.y + threadIdx.y; + auto const stridex = static_cast(blockDim.x) * gridDim.x; + auto const stridey = static_cast(blockDim.y) * gridDim.y; - for (SizeType batchIdx = tidy; batchIdx < batchSize; batchIdx += blockDim.y * gridDim.y) + for (auto batchIdx = tidy; batchIdx < batchSize; batchIdx += stridey) { - for (SizeType columnIdx = tidx; columnIdx < inputRowSize; columnIdx += blockDim.x * gridDim.x) + for (auto columnIdx = tidx; columnIdx < inputRowSize; columnIdx += stridex) { auto const inputIdx = batchIdx * inputRowSize + columnIdx; auto const value = input[inputIdx]; - for (SizeType beamIdx = 0; beamIdx < beamWidth; ++beamIdx) + for (std::size_t beamIdx = 0; beamIdx < beamWidth; ++beamIdx) { auto const outputIdx = (batchIdx * beamWidth + beamIdx) * outputRowSize + columnIdx; output[outputIdx] = value; @@ -615,18 +628,20 @@ __global__ void tileTensor(T* output, T const* input, SizeType const batchSize, template __global__ void tileTensorInPlace( - T* inputOutput, SizeType const batchSize, SizeType const inputOutputRowSize, SizeType const beamWidth) + T* inputOutput, std::uint32_t const batchSize, std::size_t const inputOutputRowSize, std::uint32_t const beamWidth) { - SizeType const tidx = blockIdx.x * blockDim.x + threadIdx.x; - SizeType const tidy = blockIdx.y * blockDim.y + threadIdx.y; + auto const tidx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const tidy = static_cast(blockIdx.y) * blockDim.y + threadIdx.y; + auto const stridex = static_cast(blockDim.x) * gridDim.x; + auto const stridey = static_cast(blockDim.y) * gridDim.y; - for (SizeType batchIdx = tidy; batchIdx < batchSize; batchIdx += blockDim.y * gridDim.y) + for (auto batchIdx = tidy; batchIdx < batchSize; batchIdx += stridey) { - for (SizeType columnIdx = tidx; columnIdx < inputOutputRowSize; columnIdx += blockDim.x * gridDim.x) + for (auto columnIdx = tidx; columnIdx < inputOutputRowSize; columnIdx += stridex) { auto const inputIdx = (batchIdx * beamWidth + 0) * 
inputOutputRowSize + columnIdx; auto const value = inputOutput[inputIdx]; - for (SizeType beamIdx = 1; beamIdx < beamWidth; ++beamIdx) + for (std::size_t beamIdx = 1; beamIdx < beamWidth; ++beamIdx) { auto const outputIdx = (batchIdx * beamWidth + beamIdx) * inputOutputRowSize + columnIdx; inputOutput[outputIdx] = value; @@ -641,22 +656,24 @@ template void invokeScatterTensor(ITensor& output, ITensor const& input, SizeType beamWidth, CudaStream const& stream) { auto const& inputShape = input.getShape(); - auto const nbInputRows = inputShape.d[0]; - auto const inputRowSize = static_cast(input.getSize()) / nbInputRows; + auto const nbInputRows = static_cast(inputShape.d[0]); + auto const inputRowSize = input.getSize() / static_cast(nbInputRows); auto const& outputShape = output.getShape(); - auto const nbOutputRows = outputShape.d[0]; - auto const outputRowSize = static_cast(output.getSize()) / nbOutputRows; + auto const nbOutputRows = static_cast(outputShape.d[0]); + auto const outputRowSize = output.getSize() / static_cast(nbOutputRows); TLLM_CHECK_WITH_INFO(nbOutputRows == beamWidth * nbInputRows, common::fmtstr( "nbOutputRows (%d) must be beamWidth (%d) times nbInputRows (%d)", nbOutputRows, beamWidth, nbInputRows)); TLLM_CHECK_WITH_INFO(outputRowSize >= inputRowSize, - common::fmtstr("output row size (%d) must be at least input row size (%d)", outputRowSize, inputRowSize)); + common::fmtstr("output row size (%ld) must be at least input row size (%ld)", outputRowSize, inputRowSize)); - dim3 const blockSize(256, 1); - dim3 const gridSize((inputRowSize + blockSize.x - 1) / blockSize.x, nbInputRows); - scatterTensor<<>>( - bufferCast(output), bufferCast(input), nbInputRows, inputRowSize, outputRowSize, beamWidth); + dim3 const blockSize{256, 1}; + std::size_t const gridx{tc::ceilDiv(inputRowSize, blockSize.x)}; + std::size_t const gridMax{std::numeric_limits::max()}; + dim3 const gridSize{static_cast(std::min(gridx, gridMax)), nbInputRows}; + scatterTensor<<>>(bufferCast(output), bufferCast(input), + nbInputRows, inputRowSize, outputRowSize, static_cast(beamWidth)); } void scatterTensor(ITensor& output, ITensor const& input, SizeType beamWidth, CudaStream const& stream) @@ -676,22 +693,24 @@ template void invokeTileTensor(ITensor& output, ITensor const& input, SizeType const beamWidth, CudaStream const& stream) { auto const& inputShape = input.getShape(); - auto const nbInputRows = inputShape.d[0]; - auto const inputRowSize = static_cast(input.getSize()) / nbInputRows; + auto const nbInputRows = static_cast(inputShape.d[0]); + auto const inputRowSize = input.getSize() / static_cast(nbInputRows); auto const& outputShape = output.getShape(); - auto const nbOutputRows = outputShape.d[0]; - auto const outputRowSize = static_cast(output.getSize()) / nbOutputRows; + auto const nbOutputRows = static_cast(outputShape.d[0]); + auto const outputRowSize = output.getSize() / static_cast(nbOutputRows); TLLM_CHECK_WITH_INFO(nbOutputRows == beamWidth * nbInputRows, common::fmtstr( "nbOutputRows (%d) must be beamWidth (%d) times nbInputRows (%d)", nbOutputRows, beamWidth, nbInputRows)); TLLM_CHECK_WITH_INFO(outputRowSize >= inputRowSize, - common::fmtstr("output row size (%d) must be at least input row size (%d)", outputRowSize, inputRowSize)); + common::fmtstr("output row size (%ld) must be at least input row size (%ld)", outputRowSize, inputRowSize)); - dim3 const blockSize(256, 1); - dim3 const gridSize((inputRowSize + blockSize.x - 1) / blockSize.x, nbInputRows); - tileTensor<<>>( - bufferCast(output), 
bufferCast(input), nbInputRows, inputRowSize, outputRowSize, beamWidth); + dim3 const blockSize{256, 1}; + std::size_t const gridx{tc::ceilDiv(inputRowSize, blockSize.x)}; + std::size_t const gridMax{std::numeric_limits::max()}; + dim3 const gridSize{static_cast(std::min(gridx, gridMax)), nbInputRows}; + tileTensor<<>>(bufferCast(output), bufferCast(input), nbInputRows, + inputRowSize, outputRowSize, static_cast(beamWidth)); } void tileTensor(ITensor& output, ITensor const& input, SizeType beamWidth, CudaStream const& stream) @@ -711,14 +730,16 @@ template void invokeTileTensorInPlace(ITensor& inputOutput, SizeType const beamWidth, CudaStream const& stream) { auto const& inputOutputShape = inputOutput.getShape(); - auto const nbOutputRows = inputOutputShape.d[0]; - auto const nbInputRows = nbOutputRows / beamWidth; - auto const inputOutputRowSize = static_cast(inputOutput.getSize()) / nbOutputRows; - - dim3 const blockSize(256, 1); - dim3 const gridSize((inputOutputRowSize + blockSize.x - 1) / blockSize.x, nbInputRows); + auto const nbOutputRows = static_cast(inputOutputShape.d[0]); + auto const nbInputRows = nbOutputRows / static_cast(beamWidth); + auto const inputOutputRowSize = inputOutput.getSize() / static_cast(nbOutputRows); + + dim3 const blockSize{256, 1}; + std::size_t const gridx{tc::ceilDiv(inputOutputRowSize, blockSize.x)}; + std::size_t const gridMax{std::numeric_limits::max()}; + dim3 const gridSize{static_cast(std::min(gridx, gridMax)), nbInputRows}; tileTensorInPlace<<>>( - bufferCast(inputOutput), nbInputRows, inputOutputRowSize, beamWidth); + bufferCast(inputOutput), nbInputRows, inputOutputRowSize, static_cast(beamWidth)); } void tileTensorInplace(ITensor& tensor, SizeType beamWidth, CudaStream const& stream) diff --git a/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp b/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp index 0a35c93fbaf..dcdbe62b8ec 100644 --- a/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp +++ b/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp @@ -16,12 +16,16 @@ #include "tensorrt_llm/runtime/statefulGptDecoder.h" +#include +#include + #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/runtime/runtimeKernels.h" namespace tc = tensorrt_llm::common; using namespace tensorrt_llm::runtime; + using TensorPtr = ITensor::SharedPtr; StatefulGptDecoder::StatefulGptDecoder(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream) @@ -30,6 +34,7 @@ StatefulGptDecoder::StatefulGptDecoder(std::size_t vocabSize, std::size_t vocabS , mStream{std::move(stream)} , mBufferManager{mStream} { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto constexpr nvTokenIdType = TRTDataType::value; auto constexpr nvSizeType = TRTDataType::value; auto constexpr nvFloatType = TRTDataType::value; @@ -53,18 +58,22 @@ StatefulGptDecoder::StatefulGptDecoder(std::size_t vocabSize, std::size_t vocabS dOutput->lengths = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); dOutput->cumLogProbs = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); dOutput->beamHypotheses.empty(mBufferManager); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void StatefulGptDecoder::setup( SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); mDecoder = IGptDecoder::create(dtype, mVocabSize, mVocabSizePadded, mStream); reshapeBuffers(maxBatchSize, maxBeamWidth, maxSequenceLength); + TLLM_LOG_DEBUG("%s stop", 
__PRETTY_FUNCTION__); } void StatefulGptDecoder::reshapeBuffers(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); TLLM_CHECK(batchSize > 0); TLLM_CHECK(beamWidth > 0); TLLM_CHECK(maxSequenceLength > 0); @@ -93,13 +102,11 @@ void StatefulGptDecoder::reshapeBuffers(SizeType batchSize, SizeType beamWidth, dOutput.finished->reshape(batchSizeXbeamWidth); mBufferManager.setZero(*dOutput.finished); mBufferManager.setZero(*dOutput.finishedSum); - dOutput.lengths->reshape(batchSizeXbeamWidth); - mBufferManager.setZero(*dOutput.lengths); - dOutput.cumLogProbs->reshape(batchSizeXbeamWidth); - mBufferManager.setZero(*dOutput.cumLogProbs); if (beamWidth > 1) { + dOutput.cumLogProbs->reshape(batchSizeXbeamWidth); + mBufferManager.setZero(*dOutput.cumLogProbs); dOutput.beamHypotheses.reshape(batchSize, beamWidth, mMaxSequenceLength); } else @@ -111,6 +118,7 @@ void StatefulGptDecoder::reshapeBuffers(SizeType batchSize, SizeType beamWidth, mNbSteps = 0; mFinished.clear(); mFinished.resize(batchSize, true); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } namespace @@ -119,6 +127,7 @@ void initOutputIds(TensorPtr const& outputIds, TensorPtr const& inputIds, Tensor TensorPtr const& inputOffsets, SizeType const padId, SizeType const endId, SizeType const maxInputLength, bool const inputPacked, CudaStream const& stream) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); kernels::invokeFill(*outputIds, endId, stream); if (inputPacked) @@ -129,11 +138,13 @@ void initOutputIds(TensorPtr const& outputIds, TensorPtr const& inputIds, Tensor { kernels::invokeCopyInputToOutput(*outputIds, *inputIds, *inputLengths, padId, stream); } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } } // namespace void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig const& samplingConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& manager = mBufferManager; auto& stream = mStream; @@ -155,7 +166,7 @@ void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig auto const& inputIds = inputs.ids; auto const inputLengthsHost = manager.copyFrom(*inputLengths, MemoryType::kCPU); auto const* inputLengthsData = bufferCast(*inputLengthsHost); - auto const maxInputLength = *std::max_element(inputLengthsData, inputLengthsData + inputLengths->getSize()); + SizeType const maxInputLength = *std::max_element(inputLengthsData, inputLengthsData + inputLengths->getSize()); TensorPtr inputOffsets = manager.emptyTensor(MemoryType::kGPU, TRTDataType::value); if (inputs.packed) @@ -191,17 +202,17 @@ void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig manager.setZero(*dOutput.newTokens); manager.setZero(*dOutput.finished); manager.setZero(*dOutput.finishedSum); - kernels::invokeFill(*dOutput.lengths, maxInputLength, *stream); - std::vector cumLogProbsHost(batchSize * beamWidth, DecodingOutput::kNegativeInfinity); - // Set the entries for the first beam to 0 - for (SizeType i = 0; i < batchSize; ++i) - { - cumLogProbsHost[tc::flat_index2(i, 0, beamWidth)] = 0; - } - manager.copy(cumLogProbsHost.data(), *dOutput.cumLogProbs); if (beamWidth > 1) { + std::vector cumLogProbsHost(batchSize * beamWidth, DecodingOutput::kNegativeInfinity); + // Set the entries for the first beam to 0 + for (SizeType i = 0; i < batchSize; ++i) + { + cumLogProbsHost[tc::flat_index2(i, 0, beamWidth)] = 0; + } + manager.copy(cumLogProbsHost.data(), *dOutput.cumLogProbs); + // kernels::invokeFill(*dOutput.cumLogProbs, 
DecodingOutput::kNegativeInfinity, *stream); // for (SizeType batchIdx = 0; batchIdx < batchSize; ++batchIdx) // { @@ -225,10 +236,12 @@ void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig mNbSteps = 0; mFinished.clear(); mFinished.resize(batchSize, false); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } bool StatefulGptDecoder::forward(decoder::Output& output, decoder::Input const& input) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& logits = input.logits; auto const& logitsShape = logits->getShape(); @@ -245,6 +258,7 @@ bool StatefulGptDecoder::forward(decoder::Output& output, decoder::Input const& "Specify both srcCacheIndirection and tgtCacheIndirection or neither."); TLLM_CHECK(!srcCacheIndirection || srcCacheIndirection->getDataType() == TRTDataType::value); TLLM_CHECK(!tgtCacheIndirection || tgtCacheIndirection->getDataType() == TRTDataType::value); + auto& sequenceLengths = output.sequenceLengths; auto& stream = mStream; auto& dInput = *mDecodingInput; @@ -255,6 +269,7 @@ bool StatefulGptDecoder::forward(decoder::Output& output, decoder::Input const& dInput.cacheIndirection = srcCacheIndirection; dOutput.cacheIndirection = tgtCacheIndirection; } + dOutput.lengths = sequenceLengths; auto& decoder = *mDecoder; decoder.forwardAsync(dOutput, dInput); @@ -269,15 +284,18 @@ bool StatefulGptDecoder::forward(decoder::Output& output, decoder::Input const& || *bufferCast(*dOutput.finishedSum) == static_cast(dOutput.finished->getSize()); std::fill(mFinished.begin(), mFinished.end(), finished); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return finished; } IStatefulGptDecoder::TensorPtr StatefulGptDecoder::getFinalOutputIds() const { // TODO (rkobus) can we do this inplace? + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& outputIds = mDecodingOutput->ids; auto finalOutputIds = mBufferManager.gpu(outputIds->getShape(), outputIds->getDataType()); IGptDecoder::gatherTree(*finalOutputIds, *mDecodingOutput, *mDecodingInput, mBufferManager); mBufferManager.copy(*finalOutputIds, *outputIds); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return outputIds; } diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp index de5e2b8b561..9b7c51284de 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp @@ -15,6 +15,7 @@ */ #include "tllmRuntime.h" #include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/nvtxUtils.h" #include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/tensor.h" #include "tllmBuffers.h" @@ -99,20 +100,24 @@ void TllmRuntime::clearContexts() mContexts.clear(); } -bool TllmRuntime::executeContext(SizeType contextIndex) +bool TllmRuntime::executeContext(SizeType contextIndex) const { + NVTX3_FUNC_RANGE(); auto& context = getContext(contextIndex); return context.enqueueV3(mStream->get()); } void TllmRuntime::setInputTensors(SizeType contextIndex, TensorMap const& tensorMap) { + NVTX3_FUNC_RANGE(); + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& context = getContext(contextIndex); for (std::int32_t i = 0; i < mEngine->getNbIOTensors(); ++i) { auto const name = mEngine->getIOTensorName(i); if (mEngine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kINPUT) { + NVTX3_SCOPED_RANGE(input_tensor); auto pos = tensorMap.find(name); if (pos == tensorMap.end()) { @@ -126,14 +131,14 @@ void TllmRuntime::setInputTensors(SizeType contextIndex, TensorMap const& tensor auto const shapeProvided = 
tensor->getShape(); TLLM_CHECK_WITH_INFO(shapeExpected.nbDims == shapeProvided.nbDims, tc::fmtstr("%s: expected %d dims, provided %d dims", name, shapeExpected.nbDims, shapeProvided.nbDims)); - for (SizeType i = 0; i < shapeExpected.nbDims; ++i) + for (SizeType j = 0; j < shapeExpected.nbDims; ++j) { - auto const dimExpected = shapeExpected.d[i]; - auto const dimProvided = shapeProvided.d[i]; + auto const dimExpected = shapeExpected.d[j]; + auto const dimProvided = shapeProvided.d[j]; if (dimExpected >= 0 && dimExpected != dimProvided) { TLLM_LOG_WARNING( - "%s: expected dim[%d] = %d, provided dim[%d] = %d", name, i, dimExpected, i, dimProvided); + "%s: expected dim[%d] = %d, provided dim[%d] = %d", name, j, dimExpected, j, dimProvided); } } TLLM_CHECK_WITH_INFO(context.setInputShape(name, shapeProvided), name); @@ -155,30 +160,37 @@ void TllmRuntime::setInputTensors(SizeType contextIndex, TensorMap const& tensor } } - char const* missing; - auto const nbMissing = context.inferShapes(1, &missing); - if (nbMissing > 0) { - TLLM_THROW("Input shape not specified: %s", missing); + NVTX3_SCOPED_RANGE(infer_shapes); + char const* missing; + auto const nbMissing = context.inferShapes(1, &missing); + if (nbMissing > 0) + { + TLLM_THROW("Input shape not specified: %s", missing); + } + else if (nbMissing < 0) + { + TLLM_THROW("Invalid input shape"); + } } - else if (nbMissing < 0) + { - TLLM_THROW("Invalid input shape"); + NVTX3_SCOPED_RANGE(final_checks); + TLLM_CHECK_WITH_INFO(context.allInputDimensionsSpecified(), "Input dimensions not specified"); + TLLM_CHECK_WITH_INFO(context.allInputShapesSpecified(), "Input shapes not specified"); } - - TLLM_CHECK_WITH_INFO(context.allInputDimensionsSpecified(), "Input dimensions not specified"); - TLLM_CHECK_WITH_INFO(context.allInputShapesSpecified(), "Input shapes not specified"); } void TllmRuntime::setOutputTensors(SizeType contextIndex, TensorMap& tensorMap) { - + NVTX3_FUNC_RANGE(); auto& context = getContext(contextIndex); for (std::int32_t i = 0; i < mEngine->getNbIOTensors(); ++i) { auto const name = mEngine->getIOTensorName(i); if (mEngine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kOUTPUT) { + NVTX3_SCOPED_RANGE(output_tensor); auto const dims = context.getTensorShape(name); auto const type = mEngine->getTensorDataType(name); auto pos = tensorMap.find(name); diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.h b/cpp/tensorrt_llm/runtime/tllmRuntime.h index c6de6edb1a4..0a1e438445f 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.h +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.h @@ -50,7 +50,7 @@ class TllmRuntime return static_cast(mContexts.size()); } - nvinfer1::IExecutionContext& getContext(SizeType contextIndex) + nvinfer1::IExecutionContext& getContext(SizeType contextIndex) const { return *mContexts.at(contextIndex); } @@ -68,7 +68,7 @@ class TllmRuntime void setOutputTensors(SizeType contextIndex, TensorMap& tensorMap); - bool executeContext(SizeType contextIndex); + bool executeContext(SizeType contextIndex) const; CudaStream const& getStream() const; diff --git a/cpp/tensorrt_llm/runtime/torchView.h b/cpp/tensorrt_llm/runtime/torchView.h index f10555774cb..93d6cdbf491 100644 --- a/cpp/tensorrt_llm/runtime/torchView.h +++ b/cpp/tensorrt_llm/runtime/torchView.h @@ -18,6 +18,7 @@ #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/torchUtils.h" #include #include diff --git a/cpp/tensorrt_llm/runtime/utils/multiDeviceUtils.h b/cpp/tensorrt_llm/runtime/utils/multiDeviceUtils.h 
new file mode 100644 index 00000000000..1acd2e53091 --- /dev/null +++ b/cpp/tensorrt_llm/runtime/utils/multiDeviceUtils.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/common/stringUtils.h" + +#include + +#if ENABLE_MULTI_DEVICE +#include +#endif // ENABLE_MULTI_DEVICE + +#define TLLM_MPI_CHECK(cmd, logger) \ + do \ + { \ + auto e = cmd; \ + if (e != MPI_SUCCESS) \ + { \ + logger.log(nvinfer1::ILogger::Severity::kERROR, \ + tensorrt_llm::common::fmtstr("Failed: MPI error %s:%d '%d'", __FILE__, __LINE__, e).c_str()); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#if ENABLE_MULTI_DEVICE +#define TLLM_NCCL_CHECK(cmd, logger) \ + do \ + { \ + ncclResult_t r = cmd; \ + if (r != ncclSuccess) \ + { \ + logger.log(nvinfer1::ILogger::Severity::kERROR, \ + tensorrt_llm::common::fmtstr( \ + "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(r)) \ + .c_str()); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#endif // ENABLE_MULTI_DEVICE diff --git a/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp b/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp index 984ed057745..c7884b1cfc8 100644 --- a/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp +++ b/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp @@ -50,29 +50,18 @@ std::vector loadEngine(std::string const& enginePath) return engineBlob; } -void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec) -{ - for (std::size_t i = 0; i < vec.size(); ++i) - map.insert_or_assign(key + std::to_string(i), vec[i]); -} - -nvinfer1::DataType getTensorDataType(nvinfer1::ICudaEngine const& engine, std::string const& name) -{ - return engine.getTensorDataType(name.c_str()); -} - -std::vector createBufferVector( - TllmRuntime const& runtime, SizeType const numBuffers, std::string const& prefix, MemoryType memType) +std::vector createBufferVector(TllmRuntime const& runtime, SizeType const indexOffset, + SizeType const numBuffers, std::string const& prefix, MemoryType memType) { auto const& manager = runtime.getBufferManager(); auto const& engine = runtime.getEngine(); std::vector vector; - for (SizeType i = 0; i < numBuffers; ++i) + for (SizeType i = indexOffset; i < indexOffset + numBuffers; ++i) { std::string name{prefix + std::to_string(i)}; - auto type = getTensorDataType(engine, name); + auto type = engine.getTensorDataType(name.c_str()); vector.emplace_back(manager.emptyTensor(memType, type)); } return vector; @@ -86,6 +75,25 @@ void reshapeBufferVector(std::vector& vector, nvinfer1::Dims } } +void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec, + SizeType const indexOffset) +{ + for (std::size_t i = 0; i < vec.size(); ++i) + map.insert_or_assign(key + std::to_string(indexOffset + i), vec[i]); +} + +void insertTensorSlices( + StringPtrMap& map, std::string const& key, ITensor::SharedPtr const& tensor, SizeType const indexOffset) +{ + 
auto const numSlices = tensor->getShape().d[0]; + for (SizeType i = 0; i < numSlices; ++i) + { + ITensor::SharedPtr slice = ITensor::slice(tensor, i, 1); + slice->squeeze(0); + map.insert_or_assign(key + std::to_string(indexOffset + i), slice); + } +} + void setRawPointers(ITensor& pointers, ITensor::SharedPtr const& input, int32_t pointersSlot, int32_t inputSlot) { auto const pointersLength = static_cast(pointers.getSizeInBytes() / sizeof(void**)); diff --git a/cpp/tensorrt_llm/runtime/utils/sessionUtils.h b/cpp/tensorrt_llm/runtime/utils/sessionUtils.h index 48b42c322b1..538d56edd22 100644 --- a/cpp/tensorrt_llm/runtime/utils/sessionUtils.h +++ b/cpp/tensorrt_llm/runtime/utils/sessionUtils.h @@ -37,14 +37,16 @@ int initDevice(WorldConfig const& worldConfig); std::vector loadEngine(std::string const& enginePath); -void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec); +std::vector createBufferVector(TllmRuntime const& runtime, SizeType indexOffset, + SizeType numBuffers, std::string const& prefix, MemoryType memType); -nvinfer1::DataType getTensorDataType(nvinfer1::ICudaEngine const& engine, std::string const& name); +void reshapeBufferVector(std::vector& vector, nvinfer1::Dims const& shape); -std::vector createBufferVector( - TllmRuntime const& runtime, SizeType const numBuffers, std::string const& prefix, MemoryType memType); +void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec, + SizeType indexOffset); -void reshapeBufferVector(std::vector& vector, nvinfer1::Dims const& shape); +void insertTensorSlices( + StringPtrMap& map, std::string const& key, ITensor::SharedPtr const& tensor, SizeType indexOffset); void setRawPointers(ITensor& pointers, ITensor::SharedPtr const& input, int32_t pointersSlot, int32_t inputSlot); diff --git a/cpp/tensorrt_llm/runtime/worldConfig.cpp b/cpp/tensorrt_llm/runtime/worldConfig.cpp index e0523195c80..0ca547e9381 100644 --- a/cpp/tensorrt_llm/runtime/worldConfig.cpp +++ b/cpp/tensorrt_llm/runtime/worldConfig.cpp @@ -19,6 +19,7 @@ #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/runtime/tllmLogger.h" +#include "tensorrt_llm/runtime/utils/multiDeviceUtils.h" #include #include @@ -26,18 +27,6 @@ using namespace tensorrt_llm::runtime; namespace tc = tensorrt_llm::common; -#define TLLM_MPI_CHECK(cmd, logger) \ - do \ - { \ - auto e = cmd; \ - if (e != MPI_SUCCESS) \ - { \ - logger.log(nvinfer1::ILogger::Severity::kERROR, \ - tc::fmtstr("Failed: MPI error %s:%d '%d'", __FILE__, __LINE__, e).c_str()); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - namespace { @@ -67,7 +56,8 @@ void initMpi(nvinfer1::ILogger& logger, int threadMode = MPI_THREAD_FUNNELED) } // namespace -WorldConfig WorldConfig::mpi(nvinfer1::ILogger& logger, SizeType gpusPerNode) +WorldConfig WorldConfig::mpi(nvinfer1::ILogger& logger, SizeType gpusPerNode, std::optional tensorParallelism, + std::optional pipelineParallelism) { initMpi(logger); @@ -75,11 +65,27 @@ WorldConfig WorldConfig::mpi(nvinfer1::ILogger& logger, SizeType gpusPerNode) TLLM_MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &mpiSize), logger); TLLM_MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank), logger); logger.log(nvinfer1::ILogger::Severity::kINFO, tc::fmtstr("MPI size: %d, rank: %d", mpiSize, mpiRank).c_str()); - return WorldConfig{mpiSize, mpiRank, gpusPerNode}; + + auto pp = pipelineParallelism.value_or(1); + auto tp = tensorParallelism.value_or(mpiSize / pp); + TLLM_CHECK(mpiSize == tp * pp); + 
return WorldConfig{tp, pp, mpiRank, gpusPerNode}; } -WorldConfig WorldConfig::mpi(SizeType gpusPerNode) +WorldConfig WorldConfig::mpi( + SizeType gpusPerNode, std::optional tensorParallelism, std::optional pipelineParallelism) { TllmLogger logger{}; - return mpi(logger, gpusPerNode); + return mpi(logger, gpusPerNode, tensorParallelism, pipelineParallelism); +} + +std::vector WorldConfig::getPipelineParallelGroup() const +{ + std::vector group; + auto const groupIdx = getTensorParallelRank(); + for (SizeType i = 0; i < getPipelineParallelism(); ++i) + { + group.push_back(groupIdx + i * getTensorParallelism()); + } + return group; } diff --git a/cpp/tensorrt_llm/thop/CMakeLists.txt b/cpp/tensorrt_llm/thop/CMakeLists.txt index 2bbb7f72ba2..065b9b40617 100644 --- a/cpp/tensorrt_llm/thop/CMakeLists.txt +++ b/cpp/tensorrt_llm/thop/CMakeLists.txt @@ -15,12 +15,12 @@ add_library(th_utils STATIC thUtils.cu torchAllocator.cpp) set_property(TARGET th_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET th_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(th_utils PUBLIC ${TORCH_LIBRARIES} -lcublas -lcudart - -lcurand) +target_link_libraries(th_utils PUBLIC ${TORCH_LIBRARIES} ${CUBLAS_LIB} + ${CURAND_LIB}) add_library(th_common SHARED dynamicDecodeOp.cpp weightOnlyQuantOp.cpp gatherTreeOp.cpp fp8Op.cpp) set_property(TARGET th_common PROPERTY POSITION_INDEPENDENT_CODE ON) target_link_libraries( th_common PRIVATE ${TORCH_LIBRARIES} th_utils ${Python3_LIBRARIES} - ${STATIC_TARGET} "-Wl,--no-undefined") + ${STATIC_TARGET} ${UNDEFINED_FLAG}) diff --git a/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp b/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp index e6e35180005..75d168368a9 100644 --- a/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp +++ b/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp @@ -139,7 +139,7 @@ void FtDynamicDecode::setup(size_t batch_size, size_t beam_width, th::optiona template void FtDynamicDecode::forward(th::Tensor& logits, // (batch_size, beam_width, hidden_size) - int step, int max_input_length, uint ite, int local_batch_size, th::Tensor end_id, + int step, int max_input_length, uint64_t ite, int local_batch_size, th::Tensor end_id, th::optional embedding_bias_opt, th::optional input_lengths_opt, th::optional sequence_limit_length_opt, th::optional stop_words_list_opt, th::optional bad_words_list_opt, th::optional no_repeat_ngram_size_opt, @@ -342,7 +342,7 @@ th::Tensor DynamicDecodeOp::forward(th::Tensor logits, int64_t step, int64_t max dynamic_decode_->forward( // Inputs - logits, static_cast(step), static_cast(max_input_length), static_cast(ite), + logits, static_cast(step), static_cast(max_input_length), static_cast(ite), static_cast(local_batch_size), end_id, embedding_bias_opt, input_lengths_opt, sequence_limit_length_opt, stop_words_list_opt, bad_words_list_opt, no_repeat_ngram_size_opt, src_cache_indirection_opt, // Outputs diff --git a/cpp/tensorrt_llm/thop/dynamicDecodeOp.h b/cpp/tensorrt_llm/thop/dynamicDecodeOp.h index 29cc97ee31c..d451ffadbb0 100644 --- a/cpp/tensorrt_llm/thop/dynamicDecodeOp.h +++ b/cpp/tensorrt_llm/thop/dynamicDecodeOp.h @@ -38,7 +38,7 @@ class IFtDynamicDecode = 0; virtual void forward(th::Tensor& logits, // (batch_size, beam_width, hidden_size) - int step, int max_input_length, uint ite, int local_batch_size, th::Tensor end_id, + int step, int max_input_length, uint64_t ite, int local_batch_size, th::Tensor end_id, th::optional embedding_bias_opt, th::optional input_lengths_opt, th::optional sequence_limit_length_opt, th::optional 
stop_words_list_opt, th::optional bad_words_list_opt, th::optional no_repeat_ngram_size_opt, @@ -76,7 +76,7 @@ class FtDynamicDecode : public IFtDynamicDecode th::optional top_p_reset_ids_opt) override; void forward(th::Tensor& logits, // (batch_size, beam_width, hidden_size) - int step, int max_input_length, uint ite, int local_batch_size, th::Tensor end_id, + int step, int max_input_length, uint64_t ite, int local_batch_size, th::Tensor end_id, th::optional embedding_bias_opt, th::optional input_lengths_opt, th::optional sequence_limit_length_opt, th::optional stop_words_list_opt, th::optional bad_words_list_opt, th::optional no_repeat_ngram_size_opt, diff --git a/cpp/tensorrt_llm/thop/fp8Op.cpp b/cpp/tensorrt_llm/thop/fp8Op.cpp index 011cf92f026..231046a4df6 100644 --- a/cpp/tensorrt_llm/thop/fp8Op.cpp +++ b/cpp/tensorrt_llm/thop/fp8Op.cpp @@ -39,10 +39,10 @@ std::vector e4m3_quantize_helper(Tensor input, QuantizeMode quantize_mod TORCH_CHECK(_st == torch::kFloat32 || _st == torch::kFloat16 || _st == torch::kBFloat16, "Invalid datatype. input must be FP16 or BF16 or FP32"); - std::vector quantized_input_shape; + std::vector quantized_input_shape; for (int i = 0; i < input.dim(); i++) quantized_input_shape.push_back(input.size(i)); - std::vector scale_shape; + std::vector scale_shape; if (quantize_mode == QuantizeMode::PER_TOKEN) { for (int i = 0; i < input.dim() - 1; i++) @@ -113,7 +113,7 @@ Tensor e4m3_dequantize_helper(Tensor input, Tensor scales, QuantizeMode quantize TORCH_CHECK(input.scalar_type() == torch::kInt8, "Invalid datatype. input must be Int8 (Fp8)"); - std::vector dequantized_input_shape; + std::vector dequantized_input_shape; for (int i = 0; i < input.dim(); i++) dequantized_input_shape.push_back(input.size(i)); TORCH_CHECK(scales.dim() == input.dim()); diff --git a/cpp/tensorrt_llm/thop/gatherTreeOp.cpp b/cpp/tensorrt_llm/thop/gatherTreeOp.cpp index 0058528af9c..f71d6df7213 100644 --- a/cpp/tensorrt_llm/thop/gatherTreeOp.cpp +++ b/cpp/tensorrt_llm/thop/gatherTreeOp.cpp @@ -32,13 +32,13 @@ th::Tensor gatherTree(th::Tensor& sequence_lengths, th::Tensor& output_ids, th:: th::optional beam_hyps_log_probs, th::optional beam_hyps_min_normed_scores, th::optional beam_hyps_num_beams, th::optional beam_hyps_is_done, th::optional finished, th::Tensor& length_penalty, int64_t batch_size, int64_t beam_width, - int64_t max_input_length, int64_t max_seq_len, bool use_beam_hyps) + int64_t max_seq_len, bool use_beam_hyps) { + auto stream = at::cuda::getCurrentCUDAStream().stream(); + th::Tensor final_output_ids = torch::zeros( + {batch_size, beam_width, max_seq_len}, torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); if (use_beam_hyps && beam_width > 1) { - auto stream = at::cuda::getCurrentCUDAStream().stream(); - th::Tensor final_output_ids = torch::zeros({batch_size, beam_width, max_seq_len}, - torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); tl::kernels::invokeInitializeOutput(get_ptr(final_output_ids), get_ptr(end_ids), batch_size * beam_width, max_seq_len, stream); @@ -69,19 +69,14 @@ th::Tensor gatherTree(th::Tensor& sequence_lengths, th::Tensor& output_ids, th:: nullptr, // output_logs beamHypotheses.output_ids_tgt, beamHypotheses.sequence_lengths_tgt, beamHypotheses.normed_scores, beamHypotheses.cum_log_probs, beamHypotheses.log_probs, beamHypotheses.num_beams, - get_ptr(tiled_input_lengths), beam_width, max_seq_len, batch_size, max_input_length, stream); + get_ptr(tiled_input_lengths), beam_width, max_seq_len, batch_size, stream); 
sync_check_cuda_error(); - - return final_output_ids; } - else + else if (!use_beam_hyps && beam_width > 1) { th::Tensor workspace = torch::zeros(batch_size * beam_width * max_seq_len * sizeof(int32_t), torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); - th::Tensor final_output_ids = torch::zeros({batch_size, beam_width, max_seq_len}, - torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); - // For sampling, it is equivalent to all parent ids are 0. tl::kernels::gatherTreeParam param; param.beams = get_ptr(workspace); @@ -98,10 +93,9 @@ th::Tensor gatherTree(th::Tensor& sequence_lengths, th::Tensor& output_ids, th:: param.step_ids = get_ptr(output_ids); param.parent_ids = beam_width == 1 ? nullptr : get_ptr(parent_ids); param.end_tokens = get_ptr(end_ids); - param.max_input_length = max_input_length; param.input_lengths = get_ptr(tiled_input_lengths); - param.stream = at::cuda::getCurrentCUDAStream().stream(); + param.stream = stream; param.output_ids = get_ptr(final_output_ids); param.cum_log_probs = cum_log_probs_opt.has_value() ? get_ptr(cum_log_probs_opt.value()) : nullptr; param.length_penalty = get_val(length_penalty, 0); @@ -109,8 +103,14 @@ th::Tensor gatherTree(th::Tensor& sequence_lengths, th::Tensor& output_ids, th:: // NOTE: need to remove all prompt virtual tokens tl::kernels::invokeGatherTree(param); sync_check_cuda_error(); - return final_output_ids; } + else + { + cudaMemcpyAsync(get_ptr(final_output_ids), get_ptr(output_ids), + sizeof(int) * batch_size * beam_width * max_seq_len, cudaMemcpyDeviceToDevice, stream); + sync_check_cuda_error(); + } + return final_output_ids; } } // namespace torch_ext diff --git a/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp b/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp index fc9a4daf347..79e8751c8ab 100644 --- a/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp +++ b/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp @@ -164,17 +164,17 @@ std::vector symmetric_quantize_helper( const size_t input_mat_size = num_rows * num_cols; const size_t quantized_mat_size = num_rows * bytes_per_out_col; - std::vector quantized_weight_shape; - std::vector scale_shape; + std::vector quantized_weight_shape; + std::vector scale_shape; if (weight.dim() == 2) { - quantized_weight_shape = {long(num_rows), long(bytes_per_out_col)}; - scale_shape = {long(num_cols)}; + quantized_weight_shape = {int64_t(num_rows), int64_t(bytes_per_out_col)}; + scale_shape = {int64_t(num_cols)}; } else if (weight.dim() == 3) { - quantized_weight_shape = {long(num_experts), long(num_rows), long(bytes_per_out_col)}; - scale_shape = {long(num_experts), long(num_cols)}; + quantized_weight_shape = {int64_t(num_experts), int64_t(num_rows), int64_t(bytes_per_out_col)}; + scale_shape = {int64_t(num_experts), int64_t(num_cols)}; } else { @@ -273,7 +273,7 @@ Tensor unpack_int4_packed_tensor_to_int8(Tensor weight) TORCH_CHECK(weight.numel() != 0, "weight should not be empty tensor"); TORCH_CHECK(weight.dtype() == torch::kInt8, "Weight must be a packed int8 tensor"); - std::vector int8_tensor_size(weight.dim()); + std::vector int8_tensor_size(weight.dim()); for (int i = 0; i < weight.dim(); ++i) { int8_tensor_size[i] = weight.size(i); @@ -307,7 +307,7 @@ Tensor pack_int8_tensor_to_packed_int4(Tensor weight) TORCH_CHECK(weight.numel() != 0, "weight should not be empty tensor"); TORCH_CHECK(weight.dtype() == torch::kInt8, "Weight must be a int8 tensor"); - std::vector packed_tensor_size(weight.dim()); + std::vector packed_tensor_size(weight.dim()); for (int i = 0; i < 
weight.dim(); ++i) { packed_tensor_size[i] = weight.size(i); diff --git a/cpp/tests/README.md b/cpp/tests/README.md index 98506379e71..5c2ee9a84a7 100644 --- a/cpp/tests/README.md +++ b/cpp/tests/README.md @@ -26,6 +26,12 @@ Single tests can be executed from `CPP_BUILD_DIR/tests`, e.g. ### Build engines +To avoid discrepancies between the reference data and the test data, set `SKIP_GEMM_PLUGIN_PROFILINGS=1` to disable GEMM tactic profiling in the GEMM plugins. + +```bash +export SKIP_GEMM_PLUGIN_PROFILINGS=1 +``` + [Scripts](resources/scripts) are provided that download the GPT2 and GPT-J models from Huggingface and convert them to TensorRT engines. The weights and built engines are stored under [cpp/tests/resources/models](resources/models). To build the engines from the top-level directory: @@ -33,6 +39,7 @@ To build the engines from the top-level directory: ```bash PYTHONPATH=examples/gpt python3 cpp/tests/resources/scripts/build_gpt_engines.py PYTHONPATH=examples/gptj python3 cpp/tests/resources/scripts/build_gptj_engines.py +PYTHONPATH=examples/llama python3 cpp/tests/resources/scripts/build_llama_engines.py ``` ### Generate expected output @@ -42,6 +49,7 @@ End-to-end tests read inputs and expected outputs from Numpy files located at [c ```bash PYTHONPATH=examples/gpt python3 cpp/tests/resources/scripts/generate_expected_gpt_output.py PYTHONPATH=examples/gptj python3 cpp/tests/resources/scripts/generate_expected_gptj_output.py +PYTHONPATH=examples/llama python3 cpp/tests/resources/scripts/generate_expected_llama_output.py ``` ### Run test diff --git a/cpp/tests/resources/.gitignore b/cpp/tests/resources/.gitignore index 949c93855ed..d864e5f6cfc 100644 --- a/cpp/tests/resources/.gitignore +++ b/cpp/tests/resources/.gitignore @@ -1,5 +1,6 @@ models/gpt2 models/gpt-j-6b +models/llama-7b-hf models/c-model models/rt_engine /models/v2 diff --git a/cpp/tests/resources/scripts/build_gpt_engines.py b/cpp/tests/resources/scripts/build_gpt_engines.py index 8ed877a6a11..2ac8747d971 100755 --- a/cpp/tests/resources/scripts/build_gpt_engines.py +++ b/cpp/tests/resources/scripts/build_gpt_engines.py @@ -131,15 +131,6 @@ def build_engines(model_cache: _tp.Optional[str] = None, world_size: int = 1): engine_dir / 'fp16-plugin-packed-paged' / tp_dir, world_size, '--dtype=float16', '--use_gpt_attention_plugin=float16', '--remove_input_padding', '--paged_kv_cache') - # build_engine(fp16_weight_dir_x_gpu, - # engine_dir / 'fp16-inflight-batching-plugin' / tp_dir, - # world_size, '--dtype=float16', '--use_inflight_batching', - # '--use_gpt_attention_plugin=float16', '--remove_input_padding') - build_engine(fp16_weight_dir_x_gpu, - engine_dir / 'fp16-inflight-batching-plugin-paged' / tp_dir, - world_size, '--dtype=float16', '--use_inflight_batching', - '--use_gpt_attention_plugin=float16', '--remove_input_padding', - '--paged_kv_cache') print("Done.") diff --git a/cpp/tests/resources/scripts/build_gptj_engines.py b/cpp/tests/resources/scripts/build_gptj_engines.py index 1913065aa43..f6002699480 100755 --- a/cpp/tests/resources/scripts/build_gptj_engines.py +++ b/cpp/tests/resources/scripts/build_gptj_engines.py @@ -106,12 +106,10 @@ def build_engines(model_cache: _tp.Optional[str] = None, only_fp8=False): '--use_gpt_attention_plugin=float16', '--remove_input_padding') - print("\nBuilding fp16-inflight-batching-plugin-paged engine") - build_engine(hf_dir, - engine_dir / 'fp16-inflight-batching-plugin-paged/1-gpu', + print("\nBuilding fp16-plugin-packed-paged engine") + build_engine(hf_dir, engine_dir / 
'fp16-plugin-packed-paged/1-gpu', '--use_gpt_attention_plugin=float16', - '--use_inflight_batching', '--remove_input_padding', - '--paged_kv_cache') + '--use_inflight_batching') print("Done.") diff --git a/cpp/tests/resources/scripts/build_llama_engines.py b/cpp/tests/resources/scripts/build_llama_engines.py new file mode 100644 index 00000000000..30d0eee6330 --- /dev/null +++ b/cpp/tests/resources/scripts/build_llama_engines.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse as _arg +import pathlib as _pl +import subprocess as _sp +import sys as _sys +import typing as _tp + + +def run_command(command: _tp.Sequence[str], *, cwd=None, **kwargs) -> None: + print(f"Running: cd %s && %s" % + (str(cwd or _pl.Path.cwd()), " ".join(command))) + _sp.check_call(command, cwd=cwd, **kwargs) + + +def build_engine(weigth_dir: _pl.Path, engine_dir: _pl.Path, *args): + build_args = [_sys.executable, "examples/llama/build.py"] + ( + ['--model_dir', str(weigth_dir)] if weigth_dir else []) + [ + '--output_dir', + str(engine_dir), + '--dtype=float16', + '--use_gpt_attention_plugin=float16', + '--use_gemm_plugin=float16', + '--max_batch_size=32', + '--max_input_len=40', + '--max_output_len=20', + '--max_beam_width=2', + '--log_level=error', + ] + list(args) + run_command(build_args) + + +def build_engines(model_cache: str): + resources_dir = _pl.Path(__file__).parent.resolve().parent + models_dir = resources_dir / 'models' + model_name = 'llama-7b-hf' + + if model_cache: + print("Copy model from model_cache") + model_cache_dir = _pl.Path(model_cache) / 'llama-models' / model_name + assert (model_cache_dir.is_dir()) + + run_command(["rsync", "-av", str(model_cache_dir), "."], cwd=models_dir) + + hf_dir = models_dir / model_name + assert hf_dir.is_dir() + + engine_dir = models_dir / 'rt_engine' / model_name + + tp_size = 1 + pp_size = 1 + print(f"\nBuilding fp16 tp{tp_size} pp{pp_size} engine") + build_engine(hf_dir, engine_dir / 'fp16-plugin/1-gpu') + + tp_size = 2 + pp_size = 2 + world_size = tp_size * pp_size + print(f"\nBuilding fp16 tp{tp_size} pp{pp_size} engine") + build_engine(hf_dir, engine_dir / f'fp16-plugin/{world_size}-gpu', + f'--world_size={world_size}', f'--tp_size={tp_size}', + f'--pp_size={pp_size}') + + print("Done.") + + +if __name__ == "__main__": + parser = _arg.ArgumentParser() + parser.add_argument("--model_cache", + type=str, + help="Directory where models are stored") + + build_engines(**vars(parser.parse_args())) diff --git a/cpp/tests/resources/scripts/generate_expected_llama_output.py b/cpp/tests/resources/scripts/generate_expected_llama_output.py new file mode 100644 index 00000000000..f263352163e --- /dev/null +++ b/cpp/tests/resources/scripts/generate_expected_llama_output.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path + +import run + + +def generate_output(engine: str, + num_beams: int, + output_name: str, + max_output_len: int = 8): + + model = 'llama-7b-hf' + resources_dir = Path(__file__).parent.resolve().parent + models_dir = resources_dir / 'models' + hf_dir = models_dir / model + engine_dir = models_dir / 'rt_engine' / model / engine / '1-gpu/' + + data_dir = resources_dir / 'data' + input_file = data_dir / 'input_tokens.npy' + model_data_dir = data_dir / model + if num_beams <= 1: + output_dir = model_data_dir / 'sampling' + else: + output_dir = model_data_dir / ('beam_search_' + str(num_beams)) + + run.generate(engine_dir=str(engine_dir), + tokenizer_dir=str(hf_dir), + input_file=str(input_file), + output_npy=str(output_dir / (output_name + '.npy')), + output_csv=str(output_dir / (output_name + '.csv')), + max_output_len=max_output_len, + num_beams=num_beams) + + +def generate_outputs(num_beams): + print('Generating Llama FP16 outputs') + generate_output(engine='fp16-plugin', + num_beams=num_beams, + output_name='output_tokens_fp16_plugin') + + +if __name__ == '__main__': + generate_outputs(num_beams=1) + generate_outputs(num_beams=2) diff --git a/cpp/tests/resources/scripts/test_cpp.py b/cpp/tests/resources/scripts/test_cpp.py index 1f7e3d01460..9ef98e88138 100755 --- a/cpp/tests/resources/scripts/test_cpp.py +++ b/cpp/tests/resources/scripts/test_cpp.py @@ -49,6 +49,7 @@ def run_tests(cuda_architectures: _tp.Optional[str] = None, dist_dir: _tp.Optional[str] = None, model_cache: _tp.Optional[str] = None, skip_gptj=False, + skip_llama=False, only_fp8=False, trt_root: _tp.Optional[str] = None) -> None: root_dir = find_root_dir() @@ -92,7 +93,10 @@ def run_command(command: _tp.Sequence[str], model_cache = ["--model_cache", model_cache] if model_cache else [] only_fp8_arg = ["--only_fp8"] if only_fp8 else [] - gpt_env = {**_os.environ, "PYTHONPATH": "examples/gpt"} + gpt_env = { + **_os.environ, "PYTHONPATH": "examples/gpt", + "SKIP_GEMM_PLUGIN_PROFILINGS": "1" + } build_gpt_engines = [python_exe, str(scripts_dir / "build_gpt_engines.py") ] + model_cache @@ -110,7 +114,10 @@ def run_command(command: _tp.Sequence[str], ] + model_cache + only_fp8_arg run_command(build_gptj_engines) - gptj_env = {**_os.environ, "PYTHONPATH": "examples/gptj"} + gptj_env = { + **_os.environ, "PYTHONPATH": "examples/gptj", + "SKIP_GEMM_PLUGIN_PROFILINGS": "1" + } generate_expected_gptj_output = [ python_exe, str(scripts_dir / "generate_expected_gptj_output.py") @@ -119,22 +126,46 @@ def run_command(command: _tp.Sequence[str], else: _log.info("Skipping GPT-J tests") + if not skip_llama: + build_llama_engines = [ + python_exe, str(scripts_dir / "build_llama_engines.py") + ] + model_cache + run_command(build_llama_engines) + + llama_env = { + **_os.environ, "PYTHONPATH": "examples/llama", + "SKIP_GEMM_PLUGIN_PROFILINGS": "1" + } + generate_expected_llama_output = [ + 
python_exe, + str(scripts_dir / "generate_expected_llama_output.py") + ] + run_command(generate_expected_llama_output, env=llama_env) + else: + _log.info("Skipping Llama tests") + build_dir = build_dir if build_dir.is_absolute() else root_dir / build_dir make_google_tests = ["make", "-j", "google-tests"] run_command(make_google_tests, cwd=build_dir) + cpp_env = {**_os.environ, "SKIP_GEMM_PLUGIN_PROFILINGS": "1"} ctest = ["ctest", "--output-on-failure", "--output-junit", "report.xml"] + excluded_tests = [] if skip_gptj: - ctest.extend(["-E", ".*Gptj.*"]) + excluded_tests.append(".*Gptj.*") + if skip_llama: + excluded_tests.append(".*Llama.*") if only_fp8: ctest.extend(["-R", ".*FP8.*"]) else: - ctest.extend(["-E", ".*FP8.*"]) - run_command(ctest, cwd=build_dir) + excluded_tests.append(".*FP8.*") + if excluded_tests: + ctest.extend(["-E", "|".join(excluded_tests)]) + run_command(ctest, cwd=build_dir, env=cpp_env) make_benchmarks = ["make", "-j", "benchmarks"] - run_command(make_benchmarks, cwd=build_dir) + run_command(make_benchmarks, cwd=build_dir, env=cpp_env) benchmark = [ str(build_dir / "benchmarks" / "gptSessionBenchmark"), "--model", "gpt", @@ -142,7 +173,7 @@ def run_command(command: _tp.Sequence[str], "../tests/resources/models/rt_engine/gpt2/fp16-plugin/1-gpu", "--batch_size", "8", "--input_output_len", "10,20", "--duration", "10" ] - run_command(benchmark, cwd=build_dir) + run_command(benchmark, cwd=build_dir, env=cpp_env) if __name__ == "__main__": @@ -165,6 +196,9 @@ def run_command(command: _tp.Sequence[str], parser.add_argument("--skip_gptj", action="store_true", help="Skip the tests for GPT-J") + parser.add_argument("--skip_llama", + action="store_true", + help="Skip the tests for Llama") parser.add_argument( "--only_fp8", action="store_true", diff --git a/cpp/tests/runtime/gptDecoderBatchTest.cpp b/cpp/tests/runtime/gptDecoderBatchTest.cpp index 02fcbec1570..f0e90ba9f54 100644 --- a/cpp/tests/runtime/gptDecoderBatchTest.cpp +++ b/cpp/tests/runtime/gptDecoderBatchTest.cpp @@ -92,9 +92,11 @@ void verifyResults(BufferManager& manager, GptDecoderBatch const& decoder, void testDecoder(nvinfer1::DataType const dtype, std::vector const& samplingConfigs, int maxBeamWidth) { - SizeType constexpr worldSize{1}; + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + SizeType constexpr tensorParallelism{1}; + SizeType constexpr pipelineParallelism{1}; SizeType constexpr localRank{0}; - WorldConfig constexpr worldConfig{worldSize, localRank}; + WorldConfig constexpr worldConfig{tensorParallelism, pipelineParallelism, localRank}; SizeType constexpr vocabSize{51200}; SizeType constexpr nbLayers{2}; @@ -122,6 +124,14 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector con decoder.setup(batchSize, maxBeamWidth, maxSeqLength, modelConfig.getDataType()); std::vector const inputLengths{4, 5, 6, 7}; + std::vector tiledInputLengths; + for (int batch_id = 0; batch_id < inputLengths.size(); batch_id++) + { + for (int beam_id = 0; beam_id < maxBeamWidth; beam_id++) + { + tiledInputLengths.push_back(inputLengths.at(batch_id)); + } + } // set up inputs auto logits = std::shared_ptr( @@ -147,6 +157,10 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector con manager.setZero(*tgtCacheIndirection); outputs.cacheIndirection = tgtCacheIndirection; } + auto sequenceLengths + = std::shared_ptr(manager.gpu(ITensor::makeShape({batchSize * maxBeamWidth}), TRTDataType::value)); + manager.copy(tiledInputLengths.data(), *sequenceLengths); + outputs.sequenceLengths = sequenceLengths; auto
constexpr tokenId = 1; std::vector inputIds; @@ -198,9 +212,11 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector con void testDecoderWavefront( nvinfer1::DataType const dtype, std::vector const& samplingConfigs, int maxBeamWidth) { - SizeType constexpr worldSize{1}; + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + SizeType constexpr tensorParallelism{1}; + SizeType constexpr pipelineParallelism{1}; SizeType constexpr localRank{0}; - WorldConfig constexpr worldConfig{worldSize, localRank}; + WorldConfig constexpr worldConfig{tensorParallelism, pipelineParallelism, localRank}; SizeType constexpr vocabSize{51200}; SizeType constexpr nbLayers{2}; @@ -228,6 +244,14 @@ void testDecoderWavefront( decoder.setup(batchSize, maxBeamWidth, maxSeqLength, modelConfig.getDataType()); std::vector const inputLengths{4, 5, 6, 7}; + std::vector tiledInputLengths; + for (int batch_id = 0; batch_id < inputLengths.size(); batch_id++) + { + for (int beam_id = 0; beam_id < maxBeamWidth; beam_id++) + { + tiledInputLengths.push_back(inputLengths.at(batch_id)); + } + } // set up inputs auto logits = std::shared_ptr( @@ -253,6 +277,10 @@ void testDecoderWavefront( manager.setZero(*tgtCacheIndirection); outputs.cacheIndirection = tgtCacheIndirection; } + auto sequenceLengths + = std::shared_ptr(manager.gpu(ITensor::makeShape({batchSize * maxBeamWidth}), TRTDataType::value)); + manager.copy(tiledInputLengths.data(), *sequenceLengths); + outputs.sequenceLengths = sequenceLengths; auto const& nbSteps = decoder.getNbSteps(); EXPECT_EQ(nbSteps.size(), batchSize); diff --git a/cpp/tests/runtime/gptDecoderTest.cpp b/cpp/tests/runtime/gptDecoderTest.cpp index 3aeb3c95349..f91a882dd5d 100644 --- a/cpp/tests/runtime/gptDecoderTest.cpp +++ b/cpp/tests/runtime/gptDecoderTest.cpp @@ -31,9 +31,10 @@ namespace void testDecoder(nvinfer1::DataType const dtype, SamplingConfig const& samplingConfig) { - SizeType constexpr worldSize{1}; + SizeType constexpr tensorParallelism{1}; + SizeType constexpr pipelineParallelism{1}; SizeType constexpr localRank{0}; - WorldConfig constexpr worldConfig{worldSize, localRank}; + WorldConfig constexpr worldConfig{tensorParallelism, pipelineParallelism, localRank}; SizeType constexpr vocabSize{51200}; SizeType constexpr nbLayers{2}; diff --git a/cpp/tests/runtime/gptSessionTest.cpp b/cpp/tests/runtime/gptSessionTest.cpp index 24b36a60153..5ba7be6cf03 100644 --- a/cpp/tests/runtime/gptSessionTest.cpp +++ b/cpp/tests/runtime/gptSessionTest.cpp @@ -21,6 +21,7 @@ #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/tensor.h" +#include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/runtime/gptJsonConfig.h" #include "tensorrt_llm/runtime/gptSession.h" #include "tensorrt_llm/runtime/tllmLogger.h" @@ -28,8 +29,6 @@ #include #include -#include - using namespace tensorrt_llm::runtime; namespace tc = tensorrt_llm::common; @@ -43,6 +42,7 @@ auto const DATA_PATH = TEST_RESOURCE_PATH / "data"; auto const GPT_MODEL_DIR = "gpt2"; auto const GPTJ_MODEL_DIR = "gpt-j-6b"; +auto const LLAMA_MODEL_DIR = "llama-7b-hf"; // Engines need to be generated using cpp/tests/resources/scripts/build_gpt_engines.py. 
auto const FP32_GPT_DIR = "fp32-default"; @@ -51,9 +51,6 @@ auto const FP16_GPT_DIR = "fp16-default"; auto const FP16_GPT_ATTENTION_DIR = "fp16-plugin"; auto const FP16_GPT_ATTENTION_PACKED_DIR = FP16_GPT_ATTENTION_DIR + std::string("-packed"); auto const FP16_GPT_ATTENTION_PACKED_PAGED_DIR = FP16_GPT_ATTENTION_PACKED_DIR + std::string("-paged"); -auto const FP16_GPT_ATTENTION_INFLIGHT_BATCHING_DIR = "fp16-inflight-batching-plugin"; -auto const FP16_GPT_ATTENTION_INFLIGHT_BATCHING_PAGED_DIR - = FP16_GPT_ATTENTION_INFLIGHT_BATCHING_DIR + std::string("-paged"); // Expected outputs need to be generated using cpp/tests/resources/scripts/generate_expected_gpt_output.py. auto const FP32_RESULT_FILE = "output_tokens_fp32.npy"; @@ -62,6 +59,18 @@ auto const FP16_RESULT_FILE = "output_tokens_fp16.npy"; auto const FP16_PLUGIN_RESULT_FILE = "output_tokens_fp16_plugin.npy"; auto const FP16_PLUGIN_PACKED_RESULT_FILE = "output_tokens_fp16_plugin_packed.npy"; +struct ModelIds +{ + int endId; + int padId; +}; + +struct ModelParams +{ + char const* baseDir; + ModelIds ids; +}; + class ModelSpec { public: @@ -70,7 +79,6 @@ class ModelSpec , mResultsFile{std::move(resultsFile)} , mDataType{dtype} , mUseGptAttentionPlugin{false} - , mUseInflightBatching{false} , mUsePackedInput{false} , mUsePagedKvCache{false} , mDecoderPerRequest{false} @@ -83,12 +91,6 @@ class ModelSpec return *this; } - ModelSpec& useInflightBatching() - { - mUseInflightBatching = true; - return *this; - } - ModelSpec& usePackedInput() { mUsePackedInput = true; @@ -111,7 +113,6 @@ class ModelSpec std::string mResultsFile; nvinfer1::DataType mDataType; bool mUseGptAttentionPlugin; - bool mUseInflightBatching; bool mUsePackedInput; bool mUsePagedKvCache; bool mDecoderPerRequest; @@ -130,7 +131,7 @@ class SessionTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type- mLogger = std::make_shared(); - initLibNvInferPlugins(mLogger.get(), "tensorrt_llm"); + initTrtLlmPlugins(mLogger.get()); } void TearDown() override {} @@ -149,10 +150,9 @@ void verifyModelConfig(GptModelConfig const& modelConfig, ModelSpec const& model ASSERT_EQ(modelSpec.mDataType, modelConfig.getDataType()); } -template -void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeType beamWidth, +void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds const modelIds, SizeType beamWidth, std::initializer_list const& batchSizes, std::string const& resultsFile, - std::shared_ptr const& logger, bool replicateFirstInput = false, bool cudaGraphMode = false) + std::shared_ptr const& logger, bool cudaGraphMode = false) { ASSERT_TRUE(fs::exists(DATA_PATH)); auto givenInput = tc::Tensor::loadNpy(DATA_PATH / "input_tokens.npy", tc::MEMORY_CPU); @@ -172,7 +172,8 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT auto const decoderPerRequest = modelSpec.mDecoderPerRequest; auto const worldConfig = WorldConfig::mpi(*logger); - auto const enginePath = modelPath / json.engineFilename(worldConfig); + auto enginePath = modelPath / json.engineFilename(worldConfig); + ASSERT_TRUE(fs::exists(enginePath)); auto const maxInputLength = static_cast(givenInput.shape[1]); auto const maxSeqLength = static_cast(expectedOutput.shape[1]); @@ -185,6 +186,9 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT samplingConfig.topK = std::vector{0}; samplingConfig.topP = std::vector{0.0f}; + auto const padId = modelIds.padId; + auto const endId = modelIds.endId; + std::vector 
givenInputLengths(nbGivenInputs); for (SizeType i = 0; i < nbGivenInputs; ++i) { @@ -210,7 +214,7 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT std::vector inputLenghtsHost(batchSize); for (SizeType i = 0; i < batchSize; ++i) { - const int inputIdx = replicateFirstInput ? 0 : i % nbGivenInputs; + const int inputIdx = i % nbGivenInputs; inputLenghtsHost[i] = givenInputLengths[inputIdx]; } auto inputLenghts = bufferManager.copyFrom(inputLenghtsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); @@ -226,7 +230,7 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT std::vector inputsHost(totalInputSize); for (SizeType i = 0; i < batchSize; ++i) { - auto const seqBegin = givenInputData + (replicateFirstInput ? 0 : (i % nbGivenInputs) * maxInputLength); + auto const seqBegin = givenInputData + (i % nbGivenInputs) * maxInputLength; std::copy(seqBegin, seqBegin + inputLenghtsHost[i], inputsHost.begin() + inputOffsetsHost[i]); } inputIds = bufferManager.copyFrom(inputsHost, ITensor::makeShape({1, totalInputSize}), MemoryType::kGPU); @@ -236,7 +240,7 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT std::vector inputsHost(batchSize * maxInputLength, padId); for (SizeType i = 0; i < batchSize; ++i) { - auto const seqBegin = givenInputData + (replicateFirstInput ? 0 : (i % nbGivenInputs) * maxInputLength); + auto const seqBegin = givenInputData + (i % nbGivenInputs) * maxInputLength; std::copy(seqBegin, seqBegin + inputLenghtsHost[i], inputsHost.begin() + i * maxInputLength); } inputIds @@ -282,9 +286,8 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT for (auto i = 0; i < maxSeqLength; ++i) { auto const outputIndex = tc::flat_index3(b, beam, i, beamWidth, maxSeqLength); - const int expectedBatch = replicateFirstInput ? 0 : b; auto const expectIndex - = tc::flat_index2((expectedBatch % nbGivenInputs * beamWidth + beam), i, maxSeqLength); + = tc::flat_index2((b % nbGivenInputs * beamWidth + beam), i, maxSeqLength); EXPECT_EQ(output[outputIndex], expectedOutputData[expectIndex]) << " b: " << b << " beam: " << beam << " i: " << i; anyMismatch |= (output[outputIndex] != expectedOutputData[expectIndex]); @@ -304,7 +307,7 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT auto constexpr kBatchSizes = {1, 8}; -using ParamType = std::tuple; +using ParamType = std::tuple; std::string generateTestName(const testing::TestParamInfo& info) { @@ -314,8 +317,6 @@ std::string generateTestName(const testing::TestParamInfo& info) name.append(beamWidth == 1 ? 
"Sampling" : "BeamWidth" + std::to_string(beamWidth)); if (modelSpec.mUseGptAttentionPlugin) name.append("GptAttentionPlugin"); - if (modelSpec.mUseInflightBatching) - name.append("WithInflightBatching"); if (modelSpec.mUsePackedInput) name.append("Packed"); if (modelSpec.mUsePagedKvCache) @@ -334,7 +335,9 @@ class ParamTest : public SessionTest, public ::testing::WithParamInterface(GetParam()); + auto const modelParams = std::get<0>(GetParam()); + auto const modelDir = modelParams.baseDir; + auto const modelIds = modelParams.ids; auto const modelSpec = std::get<1>(GetParam()); auto const modelPath{ENGINGE_PATH / modelDir / modelSpec.mModelPath / "1-gpu"}; SizeType const beamWidth{std::get<2>(GetParam())}; @@ -345,15 +348,13 @@ TEST_P(ParamTest, Test) if (!modelSpec.mUseGptAttentionPlugin && beamWidth > 1) GTEST_SKIP(); - auto const replicateFirstInput = false; auto const cudaGraphMode = std::get<3>(GetParam()); - testGptSession( - modelPath, modelSpec, beamWidth, kBatchSizes, resultsFile, mLogger, replicateFirstInput, cudaGraphMode); + testGptSession(modelPath, modelSpec, modelIds, beamWidth, kBatchSizes, resultsFile, mLogger, cudaGraphMode); } INSTANTIATE_TEST_SUITE_P(GptSessionTest, ParamTest, - testing::Combine(testing::Values(GPT_MODEL_DIR), + testing::Combine(testing::Values(ModelParams{GPT_MODEL_DIR, {50256, 50256}}), testing::Values( // single decoder ModelSpec{FP32_GPT_DIR, FP32_RESULT_FILE, nvinfer1::DataType::kFLOAT}, @@ -369,17 +370,6 @@ INSTANTIATE_TEST_SUITE_P(GptSessionTest, ParamTest, .useGptAttentionPlugin() .usePackedInput() .usePagedKvCache(), - // ModelSpec{ - // FP16_GPT_ATTENTION_INFLIGHT_BATCHING_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, nvinfer1::DataType::kHALF} - // .useGptAttentionPlugin() - // .useInflightBatching() - // .usePackedInput(), - ModelSpec{FP16_GPT_ATTENTION_INFLIGHT_BATCHING_PAGED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, - nvinfer1::DataType::kHALF} - .useGptAttentionPlugin() - .useInflightBatching() - .usePackedInput() - .usePagedKvCache(), // decoderBatch ModelSpec{FP32_GPT_DIR, FP32_RESULT_FILE, nvinfer1::DataType::kFLOAT}.useDecoderPerRequest(), ModelSpec{FP32_GPT_ATTENTION_DIR, FP32_PLUGIN_RESULT_FILE, nvinfer1::DataType::kFLOAT} @@ -397,19 +387,6 @@ INSTANTIATE_TEST_SUITE_P(GptSessionTest, ParamTest, .useGptAttentionPlugin() .usePackedInput() .usePagedKvCache() - .useDecoderPerRequest(), - // ModelSpec{ - // FP16_GPT_ATTENTION_INFLIGHT_BATCHING_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, nvinfer1::DataType::kHALF} - // .useGptAttentionPlugin() - // .useInflightBatching() - // .usePackedInput() - // .useDecoderPerRequest(), - ModelSpec{FP16_GPT_ATTENTION_INFLIGHT_BATCHING_PAGED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, - nvinfer1::DataType::kHALF} - .useGptAttentionPlugin() - .useInflightBatching() - .usePackedInput() - .usePagedKvCache() .useDecoderPerRequest() ), @@ -417,7 +394,7 @@ INSTANTIATE_TEST_SUITE_P(GptSessionTest, ParamTest, generateTestName); INSTANTIATE_TEST_SUITE_P(GptjSessionTest, ParamTest, - testing::Combine(testing::Values(GPTJ_MODEL_DIR), + testing::Combine(testing::Values(ModelParams{GPTJ_MODEL_DIR, {50256, 50256}}), testing::Values( // single decoder ModelSpec{FP16_GPT_ATTENTION_DIR, FP16_PLUGIN_RESULT_FILE, nvinfer1::DataType::kHALF} @@ -425,8 +402,7 @@ INSTANTIATE_TEST_SUITE_P(GptjSessionTest, ParamTest, ModelSpec{FP16_GPT_ATTENTION_PACKED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, nvinfer1::DataType::kHALF} .useGptAttentionPlugin() .usePackedInput(), - ModelSpec{FP16_GPT_ATTENTION_INFLIGHT_BATCHING_PAGED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, - 
nvinfer1::DataType::kHALF} + ModelSpec{FP16_GPT_ATTENTION_PACKED_PAGED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, nvinfer1::DataType::kHALF} .useGptAttentionPlugin() .usePackedInput() .usePagedKvCache(), @@ -438,8 +414,7 @@ INSTANTIATE_TEST_SUITE_P(GptjSessionTest, ParamTest, .useGptAttentionPlugin() .usePackedInput() .useDecoderPerRequest(), - ModelSpec{FP16_GPT_ATTENTION_INFLIGHT_BATCHING_PAGED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, - nvinfer1::DataType::kHALF} + ModelSpec{FP16_GPT_ATTENTION_PACKED_PAGED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, nvinfer1::DataType::kHALF} .useGptAttentionPlugin() .usePackedInput() .usePagedKvCache() @@ -449,11 +424,26 @@ INSTANTIATE_TEST_SUITE_P(GptjSessionTest, ParamTest, testing::Values(1, 2), testing::Values(false)), generateTestName); -class LlamaSessionTest : public SessionTest +INSTANTIATE_TEST_SUITE_P(LlamaSessionTest, ParamTest, + testing::Combine(testing::Values(ModelParams{LLAMA_MODEL_DIR, {2, 2}}), + testing::Values( + // single decoder + ModelSpec{FP16_GPT_ATTENTION_DIR, FP16_PLUGIN_RESULT_FILE, nvinfer1::DataType::kHALF} + .useGptAttentionPlugin(), + // decoderBatch + ModelSpec{FP16_GPT_ATTENTION_DIR, FP16_PLUGIN_RESULT_FILE, nvinfer1::DataType::kHALF} + .useGptAttentionPlugin() + .useDecoderPerRequest() + + ), + testing::Values(1, 2), testing::Values(false)), + generateTestName); + +class LlamaSessionOnDemandTest : public SessionTest { }; -TEST_F(LlamaSessionTest, SamplingFP16WithAttentionPlugin) +TEST_F(LlamaSessionOnDemandTest, SamplingFP16WithAttentionPlugin) { GTEST_SKIP() << "Run only on demand"; auto const modelDir = "llama_7bf"; @@ -465,11 +455,12 @@ TEST_F(LlamaSessionTest, SamplingFP16WithAttentionPlugin) auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); + auto const modelIds = ModelIds{2, 2}; - testGptSession<2, 2>(modelPath, modelSpec, beamWidth, batchSizes, resultsFile, mLogger); + testGptSession(modelPath, modelSpec, modelIds, beamWidth, batchSizes, resultsFile, mLogger); } -TEST_F(LlamaSessionTest, SamplingFP16AttentionPluginDecoderBatch) +TEST_F(LlamaSessionOnDemandTest, SamplingFP16AttentionPluginDecoderBatch) { GTEST_SKIP() << "Run only on demand"; auto const modelDir = "llamav2"; @@ -480,6 +471,7 @@ TEST_F(LlamaSessionTest, SamplingFP16AttentionPluginDecoderBatch) auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin().usePackedInput().useDecoderPerRequest(); + auto const modelIds = ModelIds{2, 2}; - testGptSession<2, 2>(modelPath, modelSpec, beamWidth, batchSizes, resultsFile, mLogger); + testGptSession(modelPath, modelSpec, modelIds, beamWidth, batchSizes, resultsFile, mLogger); } diff --git a/cpp/tests/runtime/runtimeKernelTest.cpp b/cpp/tests/runtime/runtimeKernelTest.cpp index 3af4f9cfb47..e503e912e22 100644 --- a/cpp/tests/runtime/runtimeKernelTest.cpp +++ b/cpp/tests/runtime/runtimeKernelTest.cpp @@ -54,80 +54,87 @@ class RuntimeKernelTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro BufferManager::CudaStreamPtr mStream; }; -TEST_F(RuntimeKernelTest, FillInt32) +namespace { - SizeType constexpr value{3}; - SizeType constexpr size{123}; - auto buffer = mManager->gpu(size, nvinfer1::DataType::kINT32); - kernels::invokeFill(*buffer, value, *mStream); - auto bufferHost = mManager->copyFrom(*buffer, MemoryType::kCPU); - auto bufferPtr = bufferCast(*bufferHost); - std::vector expected(buffer->getSize(), value); +template +void testFill(IBuffer& buffer, BufferManager& manager, CudaStream&
stream) +{ + T constexpr value{3}; + kernels::invokeFill(buffer, value, stream); + auto bufferHost = manager.copyFrom(buffer, MemoryType::kCPU); + auto bufferPtr = bufferCast(*bufferHost); + auto constexpr expected = value; auto anyMismatch = false; - for (std::size_t i = 0; i < buffer->getSize(); ++i) + for (std::size_t i = 0; i < buffer.getSize(); ++i) { - EXPECT_EQ(bufferPtr[i], expected[i]) << "Error at index " << i; - anyMismatch |= bufferPtr[i] != expected[i]; + EXPECT_EQ(bufferPtr[i], expected) << "Error at index " << i; + anyMismatch |= bufferPtr[i] != expected; } - buffer.release(); ASSERT_FALSE(anyMismatch); +} +} // namespace - auto tensor = mManager->gpu(ITensor::makeShape({size, size}), nvinfer1::DataType::kINT32); - kernels::invokeFill(*tensor, value, *mStream); - auto tensorHost = mManager->copyFrom(*tensor, MemoryType::kCPU); - auto tensorPtr = bufferCast(*tensorHost); - expected.clear(); - expected.resize(tensor->getSize(), value); +TEST_F(RuntimeKernelTest, FillBufferInt8) +{ + for (auto size : {123llu, 1025llu, 1llu << 32}) + { + auto buffer = mManager->gpu(size, nvinfer1::DataType::kINT8); + testFill(*buffer, *mManager, *mStream); + buffer.release(); + } +} - anyMismatch = false; - for (std::size_t i = 0; i < tensor->getSize(); ++i) +TEST_F(RuntimeKernelTest, FillTensorInt8) +{ + for (auto size : {123, 1025, std::numeric_limits::max()}) { - EXPECT_EQ(tensorPtr[i], expected[i]) << "Error at index " << i; - anyMismatch |= tensorPtr[i] != expected[i]; + auto tensor = mManager->gpu(ITensor::makeShape({size, 2}), nvinfer1::DataType::kINT8); + testFill(*tensor, *mManager, *mStream); + tensor.release(); } - tensor.release(); - ASSERT_FALSE(anyMismatch); } -TEST_F(RuntimeKernelTest, AddInt32) +namespace +{ +void testAdd(IBuffer& buffer, BufferManager& manager, CudaStream& stream) { SizeType constexpr value{3}; - SizeType constexpr size{123}; - auto buffer = mManager->gpu(size, nvinfer1::DataType::kINT32); - mManager->setZero(*buffer); - kernels::invokeAdd(*buffer, value, *mStream); - kernels::invokeAdd(*buffer, value, *mStream); - auto bufferHost = mManager->copyFrom(*buffer, MemoryType::kCPU); + manager.setZero(buffer); + kernels::invokeAdd(buffer, value, stream); + kernels::invokeAdd(buffer, value, stream); + auto bufferHost = manager.copyFrom(buffer, MemoryType::kCPU); auto bufferPtr = bufferCast(*bufferHost); - std::vector expected(buffer->getSize(), 2 * value); + auto constexpr expected = 2 * value; auto anyMismatch = false; - for (std::size_t i = 0; i < buffer->getSize(); ++i) + for (std::size_t i = 0; i < buffer.getSize(); ++i) { - EXPECT_EQ(bufferPtr[i], expected[i]) << "Error at index " << i; - anyMismatch |= bufferPtr[i] != expected[i]; + EXPECT_EQ(bufferPtr[i], expected) << "Error at index " << i; + anyMismatch |= bufferPtr[i] != expected; } - buffer.release(); ASSERT_FALSE(anyMismatch); +} +} // namespace - auto tensor = mManager->gpu(ITensor::makeShape({size, size}), nvinfer1::DataType::kINT32); - mManager->setZero(*tensor); - kernels::invokeAdd(*tensor, value, *mStream); - kernels::invokeAdd(*tensor, value, *mStream); - auto tensorHost = mManager->copyFrom(*tensor, MemoryType::kCPU); - auto tensorPtr = bufferCast(*tensorHost); - expected.clear(); - expected.resize(tensor->getSize(), 2 * value); - - anyMismatch = false; - for (std::size_t i = 0; i < tensor->getSize(); ++i) +TEST_F(RuntimeKernelTest, AddBufferInt32) +{ + for (auto size : {123, 1025}) { - EXPECT_EQ(tensorPtr[i], expected[i]) << "Error at index " << i; - anyMismatch |= tensorPtr[i] != expected[i]; + 
auto buffer = mManager->gpu(size, nvinfer1::DataType::kINT32); + testAdd(*buffer, *mManager, *mStream); + buffer.release(); + } +} + +TEST_F(RuntimeKernelTest, AddTensorInt32) +{ + for (auto size : {123, 1025}) + { + auto tensor = mManager->gpu(ITensor::makeShape({size, size}), nvinfer1::DataType::kINT32); + testAdd(*tensor, *mManager, *mStream); + tensor.release(); } - tensor.release(); - ASSERT_FALSE(anyMismatch); } TEST_F(RuntimeKernelTest, Transpose) @@ -623,6 +630,35 @@ TEST_F(RuntimeKernelTest, ScatterHalf) } } +namespace +{ +template +void verifyTiling(std::vector const& input, ITensor const& outputTensor, BufferManager& manager) +{ + auto outputHost = manager.copyFrom(outputTensor, MemoryType::kCPU); + auto outputPtr = bufferCast(*outputHost); + + auto const& shape = outputTensor.getShape(); + auto batchSize = static_cast(shape.d[0]); + auto beamWidth = static_cast(shape.d[1]); + auto inputLength = outputTensor.getSize() / batchSize / beamWidth; + + for (std::size_t b = 0; b < batchSize; ++b) + { + for (std::size_t beam = 0; beam < beamWidth; ++beam) + { + for (std::size_t i = 0; i < inputLength; ++i) + { + auto const inputIdx = tc::flat_index2(b, i, inputLength); + auto const outputIdx = tc::flat_index3(b, beam, i, beamWidth, inputLength); + EXPECT_EQ(outputPtr[outputIdx], input[inputIdx]) + << "Error at index (" << b << ',' << beam << ',' << i << ')'; + } + } + } +} +} // namespace + TEST_F(RuntimeKernelTest, TileInt32) { SizeType const beamWidth{3}; @@ -637,22 +673,9 @@ TEST_F(RuntimeKernelTest, TileInt32) auto outputTensor = mManager->gpu(outputShape, nvinfer1::DataType::kINT32); kernels::tileTensor(*outputTensor, *inputTensor, beamWidth, *mStream); - auto outputHost = mManager->copyFrom(*outputTensor, MemoryType::kCPU); - auto outputPtr = bufferCast(*outputHost); - for (SizeType b = 0; b < batchSize; ++b) - { - for (SizeType beam = 0; beam < beamWidth; ++beam) - { - for (SizeType i = 0; i < inputLength; ++i) - { - auto const inputIdx = tc::flat_index2(b, i, inputLength); - auto const outputIdx = tc::flat_index3(b, beam, i, beamWidth, inputLength); - EXPECT_EQ(outputPtr[outputIdx], input[inputIdx]) - << "Error at index (" << b << ',' << beam << ',' << i << ')'; - } - } - } + outputTensor->reshape(ITensor::makeShape({batchSize, beamWidth, inputLength})); + verifyTiling(input, *outputTensor, *mManager); } TEST_F(RuntimeKernelTest, TileHalf) @@ -670,22 +693,9 @@ TEST_F(RuntimeKernelTest, TileHalf) auto outputTensor = mManager->gpu(outputShape, nvinfer1::DataType::kHALF); kernels::tileTensor(*outputTensor, *inputTensor, beamWidth, *mStream); - auto outputHost = mManager->copyFrom(*outputTensor, MemoryType::kCPU); - auto outputPtr = bufferCast(*outputHost); - for (SizeType b = 0; b < batchSize; ++b) - { - for (SizeType beam = 0; beam < beamWidth; ++beam) - { - for (SizeType i = 0; i < inputLength; ++i) - { - auto const inputIdx = tc::flat_index2(b, i, inputLength); - auto const outputIdx = tc::flat_index3(b, beam, i, beamWidth, inputLength); - EXPECT_EQ(outputPtr[outputIdx], input[inputIdx]) - << "Error at index (" << b << ',' << beam << ',' << i << ')'; - } - } - } + outputTensor->reshape(ITensor::makeShape({batchSize, beamWidth, inputLength})); + verifyTiling(input, *outputTensor, *mManager); } TEST_F(RuntimeKernelTest, TileInplaceInt32) @@ -703,22 +713,9 @@ TEST_F(RuntimeKernelTest, TileInplaceInt32) kernels::scatterTensor(*outputTensor, *inputTensor, beamWidth, *mStream); kernels::tileTensorInplace(*outputTensor, beamWidth, *mStream); - auto outputHost = 
mManager->copyFrom(*outputTensor, MemoryType::kCPU); - auto outputPtr = bufferCast(*outputHost); - for (SizeType b = 0; b < batchSize; ++b) - { - for (SizeType beam = 0; beam < beamWidth; ++beam) - { - for (SizeType i = 0; i < inputLength; ++i) - { - auto const inputIdx = tc::flat_index2(b, i, inputLength); - auto const outputIdx = tc::flat_index3(b, beam, i, beamWidth, inputLength); - EXPECT_EQ(outputPtr[outputIdx], input[inputIdx]) - << "Error at index (" << b << ',' << beam << ',' << i << ')'; - } - } - } + outputTensor->reshape(ITensor::makeShape({batchSize, beamWidth, inputLength})); + verifyTiling(input, *outputTensor, *mManager); } TEST_F(RuntimeKernelTest, TileInplaceHalf) @@ -737,20 +734,62 @@ TEST_F(RuntimeKernelTest, TileInplaceHalf) kernels::scatterTensor(*outputTensor, *inputTensor, beamWidth, *mStream); kernels::tileTensorInplace(*outputTensor, beamWidth, *mStream); - auto outputHost = mManager->copyFrom(*outputTensor, MemoryType::kCPU); - auto outputPtr = bufferCast(*outputHost); - for (SizeType b = 0; b < batchSize; ++b) + outputTensor->reshape(ITensor::makeShape({batchSize, beamWidth, inputLength})); + verifyTiling(input, *outputTensor, *mManager); +} + +TEST_F(RuntimeKernelTest, TileInt8Large) +{ + std::int8_t constexpr value{3}; + SizeType constexpr batchSize{1}; + SizeType constexpr beamWidth{2}; + + SizeType const d2{2}; + auto const d3 = std::numeric_limits::max(); + auto const inputShape = ITensor::makeShape({batchSize, d2, d3}); + auto const outputShape = ITensor::makeShape({batchSize * beamWidth, d2, d3}); + + auto inputTensor = mManager->gpu(inputShape, nvinfer1::DataType::kINT8); + kernels::invokeFill(*inputTensor, value, *mStream); + mStream->synchronize(); + + auto outputTensor = mManager->gpu(outputShape, nvinfer1::DataType::kINT8); + kernels::tileTensor(*outputTensor, *inputTensor, beamWidth, *mStream); + mStream->synchronize(); + + auto bufferHost = mManager->copyFrom(*outputTensor, MemoryType::kCPU); + auto bufferPtr = bufferCast(*bufferHost); + auto constexpr expected = value; + for (std::size_t i = 0; i < bufferHost->getSize(); ++i) { - for (SizeType beam = 0; beam < beamWidth; ++beam) - { - for (SizeType i = 0; i < inputLength; ++i) - { - auto const inputIdx = tc::flat_index2(b, i, inputLength); - auto const outputIdx = tc::flat_index3(b, beam, i, beamWidth, inputLength); - EXPECT_EQ(outputPtr[outputIdx], input[inputIdx]) - << "Error at index (" << b << ',' << beam << ',' << i << ')'; - } - } + EXPECT_EQ(bufferPtr[i], expected) << "Error at index " << i; + } +} + +TEST_F(RuntimeKernelTest, TileInplaceInt8Large) +{ + std::int8_t constexpr value{3}; + SizeType constexpr batchSize{1}; + SizeType constexpr beamWidth{2}; + + SizeType const d2{2}; + auto const d3 = std::numeric_limits::max(); + auto const inputShape = ITensor::makeShape({batchSize, d2, d3}); + auto const outputShape = ITensor::makeShape({batchSize * beamWidth, d2, d3}); + + auto inputTensor = mManager->gpu(inputShape, nvinfer1::DataType::kINT8); + kernels::invokeFill(*inputTensor, value, *mStream); + + auto outputTensor = mManager->gpu(outputShape, nvinfer1::DataType::kINT8); + kernels::scatterTensor(*outputTensor, *inputTensor, beamWidth, *mStream); + kernels::tileTensorInplace(*outputTensor, beamWidth, *mStream); + + auto bufferHost = mManager->copyFrom(*outputTensor, MemoryType::kCPU); + auto bufferPtr = bufferCast(*bufferHost); + auto constexpr expected = value; + for (std::size_t i = 0; i < bufferHost->getSize(); ++i) + { + EXPECT_EQ(bufferPtr[i], expected) << "Error at index " << i; } } 
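For reference, the runtimeKernelTest.cpp changes above factor the fill/add/tile checks into shared helpers, and the property that verifyTiling asserts is simply that kernels::tileTensor (and tileTensorInplace) replicate every batch entry beamWidth times. A minimal NumPy sketch of that tiling semantics, illustrative only and not part of the patch (tile_tensor is a hypothetical name; only numpy is assumed):

    import numpy as np

    def tile_tensor(x: np.ndarray, beam_width: int) -> np.ndarray:
        # Repeat each batch entry beam_width times along the batch axis,
        # [batch, ...] -> [batch * beam_width, ...], then view the result as
        # [batch, beam_width, ...] the way the tests reshape before verification.
        tiled = np.repeat(x, beam_width, axis=0)
        return tiled.reshape(x.shape[0], beam_width, *x.shape[1:])

    x = np.arange(8, dtype=np.int32).reshape(2, 4)  # batchSize=2, inputLength=4
    out = tile_tensor(x, beam_width=3)
    # Every beam of a batch entry is a copy of that entry's input row,
    # which is exactly what verifyTiling checks element by element.
    assert all(np.array_equal(out[b, beam], x[b]) for b in range(2) for beam in range(3))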
diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 87d19f07540..e2ae69f7ad7 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -1,6 +1,6 @@ # Multi-stage Dockerfile ARG BASE_IMAGE=nvcr.io/nvidia/pytorch -ARG BASE_TAG=23.07-py3 +ARG BASE_TAG=23.08-py3 FROM ${BASE_IMAGE}:${BASE_TAG} as base @@ -24,15 +24,26 @@ RUN --mount=type=cache,target=/root/.cache \ pip uninstall -y tensorrt # Download & install internal TRT release -ARG TENSOR_RT_VERSION="9.0.1.4" +ARG TENSOR_RT_VERSION="9.1.0.1" ARG CUDA_VERSION="12.2" -ARG RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/secure/9.0.1/tars/TensorRT-${TENSOR_RT_VERSION}.Linux.x86_64-gnu.cuda-${CUDA_VERSION}.tar.gz +ARG RELEASE_URL_TRT +ARG TARGETARCH + RUN --mount=type=cache,target=/root/.cache \ - wget --no-verbose ${RELEASE_URL_TRT} -P /workspace && \ - tar -xf /workspace/TensorRT-${TENSOR_RT_VERSION}.Linux.x86_64-gnu.cuda-${CUDA_VERSION}.tar.gz -C /usr/local/ && \ + if [ -z "$RELEASE_URL_TRT"];then \ + ARCH=${TARGETARCH} && \ + if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi && \ + if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi && \ + if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi &&\ + if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04"; else OS1="Linux" && OS2="Linux";fi &&\ + RELEASE_URL_TRT=http://cuda-repo.nvidia.com/release-candidates/Libraries/TensorRT/v9.1/${TENSOR_RT_VERSION}-b6aa91dc/${CUDA_VERSION}-r535/${OS1}-${DIR_NAME}/tar/TensorRT-${TENSOR_RT_VERSION}.${OS2}.${ARCH}-gnu.cuda-${CUDA_VERSION}.tar.gz;\ + fi &&\ + wget --no-verbose ${RELEASE_URL_TRT} -O /workspace/TensorRT.tar && \ + tar -xf TensorRT.tar -C /usr/local/ && \ mv /usr/local/TensorRT-${TENSOR_RT_VERSION} /usr/local/tensorrt && \ - pip install /usr/local/tensorrt/python/tensorrt-9.0.1*cp310-none-linux_x86_64.whl && \ - rm -rf /workspace/TensorRT-${TENSOR_RT_VERSION}.Linux.x86_64-gnu.cuda-${CUDA_VERSION}.tar.gz + pip install /usr/local/tensorrt/python/tensorrt-*-cp310-*.whl && \ + rm -rf /workspace/TensorRT.tar + ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH} # Install latest Polygraphy @@ -59,6 +70,7 @@ FROM devel as release WORKDIR /app/tensorrt_llm COPY --from=wheel /src/tensorrt_llm/build/tensorrt_llm*.whl . +COPY --from=wheel /src/tensorrt_llm/cpp/include/ include/ RUN pip install tensorrt_llm*.whl && \ rm tensorrt_llm*.whl COPY README.md ./ diff --git a/docker/Makefile b/docker/Makefile index f4aa231251b..3e4153f81d3 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -42,7 +42,7 @@ endef %_build: @echo "Building docker image: $(IMAGE_WITH_TAG)" - docker build $(DOCKER_BUILD_OPTS) $(DOCKER_BUILD_ARGS) \ + DOCKER_BUILDKIT=1 docker build $(DOCKER_BUILD_OPTS) $(DOCKER_BUILD_ARGS) \ --progress $(DOCKER_PROGRESS) \ $(if $(BASE_IMAGE), --build-arg BASE_IMAGE=$(BASE_IMAGE)) \ $(if $(BASE_TAG), --build-arg BASE_TAG=$(BASE_TAG)) \ @@ -77,7 +77,7 @@ endif --workdir $(CODE_DIR) \ --hostname $(shell hostname)-$* \ --name $(CONTAINER_NAME)-$*-$(USER_NAME) \ - --tmpfs /tmp \ + --tmpfs /tmp:exec \ $(IMAGE_WITH_TAG)$(IMAGE_TAG_SUFFIX) $(RUN_CMD) devel_%: STAGE = devel diff --git a/docs/Doxygen b/docs/Doxygen new file mode 100644 index 00000000000..617416c16bf --- /dev/null +++ b/docs/Doxygen @@ -0,0 +1,2658 @@ +# Doxyfile 1.9.1 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. 
+# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "TensorRT-LLM" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = cpp_docs + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. 
+# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# By default Python docstrings are displayed as preformatted text and doxygen's +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the +# doxygen's special commands can be used and the contents of the docstring +# documentation blocks is shown as doxygen documentation. +# The default value is: YES. + +PYTHON_DOCSTRING = YES + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. 
+# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. +# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) + +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files). For instance to make doxygen treat .inc files +# as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. 
+# +# Note see also the list of default file extension mappings. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See https://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. 
+ +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use +# during processing. When set to 0 doxygen will based this on the number of +# cores available in the system. You can set it explicitly to a value larger +# than 0 to get more control over the balance between CPU load and processing +# speed. At this moment only the input processing can be done using multiple +# threads. Since this is still an experimental feature the default is set to 1, +# which efficively disables parallel processing. Please report any issues you +# encounter. Generating dot graphs in parallel is controlled by the +# DOT_NUM_THREADS setting. +# Minimum value: 0, maximum value: 32, default value: 1. 
+ +NUM_PROC_THREADS = 1 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# declarations. 
If set to NO, these declarations will be included in the +# documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. 
Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. 
+# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. 
If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# Possible values are: NO, YES and FAIL_ON_WARNINGS. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = ../cpp/include/tensorrt_llm/runtime + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. 
+# +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), +# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl, +# *.ucf, *.qsf and *.ice. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f18 \ + *.f \ + *.for \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf \ + *.ice + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). 
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+#   <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+ +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# entity all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: +# http://clang.llvm.org/) for more accurate parsing at the cost of reduced +# performance. This can be particularly helpful with template rich C++ code for +# which doxygen's built-in parser lacks the necessary type information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled and the CLANG_ADD_INC_PATHS tag is set to +# YES then doxygen will add the directory of each input to the include path. +# The default value is: YES. + +CLANG_ADD_INC_PATHS = YES + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. 
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the directory containing a file called compile_commands.json. This +# file is the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the +# options used when the source files were built. This is equivalent to +# specifying the -p option to a clang tool, such as clang-check. These options +# will then be passed to the parser. Any options specified with CLANG_OPTIONS +# will be added as well. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. + +CLANG_DATABASE_PATH = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. 
Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8. The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP = NO
+
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consist of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+ +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: +# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the main .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. 
+# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. 
When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg +# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see +# https://inkscape.org) to generate formulas as SVG images instead of PNGs for +# the HTML output. These images will generally look nicer at scaled resolutions. +# Possible values are: png (the default) and svg (looks nicer but requires the +# pdf2svg or inkscape tool). +# The default value is: png. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_FORMULA_FORMAT = png + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands +# to create new LaTeX commands to be used in formulas as building blocks. See +# the section "Including formulas" for details. + +FORMULA_MACROFILE = + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side JavaScript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdn.jsdelivr.net/npm/mathjax@2 + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. 
For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/