From 6e9e318e91f126313219aa77b27f14b3d69aa87b Mon Sep 17 00:00:00 2001 From: Kevin Xie Date: Thu, 28 Sep 2023 09:00:05 -0700 Subject: [PATCH 1/4] Update code --- .../cpp}/CMakeLists.txt | 0 {cpp/benchmarks => benchmarks/cpp}/README.md | 4 +- .../cpp}/bertBenchmark.cpp | 4 +- .../cpp}/gptSessionBenchmark.cpp | 9 +- benchmarks/{ => python}/README.md | 10 +- benchmarks/{ => python}/allowed_configs.py | 98 +- benchmarks/{ => python}/base_benchmark.py | 0 benchmarks/{ => python}/benchmark.py | 0 benchmarks/{ => python}/bert_benchmark.py | 0 benchmarks/{ => python}/gpt_benchmark.py | 46 +- benchmarks/{ => python}/mem_monitor.py | 0 cpp/CMakeLists.txt | 114 +- .../modules/find_library_create_target.cmake | 5 +- .../tensorrt_llm/batch_manager/GptManager.h | 35 +- .../batch_manager/batchScheduler.h | 79 + .../batch_manager/kvCacheManager.h | 31 +- .../tensorrt_llm/batch_manager/llmRequest.h | 117 +- .../batch_manager/trtGptModelOptionalParams.h | 68 + .../tensorrt_llm/runtime/gptJsonConfig.h | 23 +- .../tensorrt_llm/runtime/gptModelConfig.h | 46 +- cpp/include/tensorrt_llm/runtime/gptSession.h | 17 +- .../runtime/iStatefulGptDecoder.h | 1 + cpp/include/tensorrt_llm/runtime/tllmLogger.h | 7 +- .../tensorrt_llm/runtime/worldConfig.h | 59 +- cpp/tensorrt_llm/CMakeLists.txt | 15 +- .../libtensorrt_llm_batch_manager_static.a | 3 - ...sorrt_llm_batch_manager_static.pre_cxx11.a | 3 - cpp/tensorrt_llm/common/assert.h | 4 + cpp/tensorrt_llm/common/cublasMMWrapper.cpp | 360 ++- cpp/tensorrt_llm/common/cublasMMWrapper.h | 25 +- cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh | 4 +- cpp/tensorrt_llm/common/cudaDriverWrapper.cpp | 7 + cpp/tensorrt_llm/common/cudaTypeUtils.cuh | 22 +- cpp/tensorrt_llm/common/cudaUtils.h | 42 +- cpp/tensorrt_llm/common/int8Utils.cuh | 60 - cpp/tensorrt_llm/common/logger.h | 8 + cpp/tensorrt_llm/common/memoryUtils.cu | 6 +- cpp/tensorrt_llm/common/nvtxUtils.h | 44 + cpp/tensorrt_llm/common/reduceKernelUtils.cuh | 2 +- cpp/tensorrt_llm/common/stringUtils.h | 6 + cpp/tensorrt_llm/common/tensor.cpp | 18 +- cpp/tensorrt_llm/common/tensor.h | 3 +- cpp/tensorrt_llm/common/tllmException.cpp | 4 +- .../gemm/warp/mma_tensorop_dequantizer.h | 4 +- cpp/tensorrt_llm/kernels/banBadWords.cu | 3 +- cpp/tensorrt_llm/kernels/banRepeatNgram.cu | 3 +- .../kernels/beamSearchPenaltyKernels.cu | 53 +- .../kernels/beamSearchPenaltyKernels.h | 8 +- .../kernels/beamSearchTopkKernels.cu | 25 +- .../fmhaRunner.cpp | 5 + .../cutlass_kernels/cutlass_heuristic.cpp | 6 + .../fpA_intB_gemm/fpA_intB_gemm.h | 33 +- .../fpA_intB_gemm/fpA_intB_gemm_template.h | 61 +- .../cutlass_kernels/int8_gemm/int8_gemm.h | 61 +- .../int8_gemm/int8_gemm_template.h | 129 +- .../kernels/decoderMaskedMultiheadAttention.h | 2 + .../decoderMaskedMultiheadAttentionLaunch.h | 8 +- .../decoderMaskedMultiheadAttentionTemplate.h | 6 +- .../decoderMaskedMultiheadAttentionUtils.h | 138 +- cpp/tensorrt_llm/kernels/decodingKernels.cu | 76 +- cpp/tensorrt_llm/kernels/decodingKernels.h | 10 +- cpp/tensorrt_llm/kernels/gptKernels.h | 7 + .../kernels/onlineSoftmaxBeamsearchKernels.cu | 35 +- cpp/tensorrt_llm/kernels/quantization.cu | 1 - .../kernels/samplingPenaltyKernels.cu | 113 +- .../kernels/samplingPenaltyKernels.h | 11 +- .../kernels/samplingTopKKernels.cu | 46 +- .../kernels/stopCriteriaKernels.cu | 3 +- .../kernels/unfusedAttentionKernels.cu | 20 +- .../kernels/unfusedAttentionKernels.h | 4 +- .../kernels/weightOnlyBatchedGemv/common.h | 81 + .../kernels/weightOnlyBatchedGemv/kernel.h | 430 +++ .../weightOnlyBatchedGemv/kernelLauncher.cu | 
224 ++ .../kernelLauncher.h} | 18 +- .../kernels/weightOnlyBatchedGemv/utility.h | 99 + .../weightOnlyBatchedGemvBs1Int4b.cu | 98 + .../weightOnlyBatchedGemvBs1Int8b.cu | 98 + .../weightOnlyBatchedGemvBs2Int4b.cu | 97 + .../weightOnlyBatchedGemvBs2Int8b.cu | 97 + .../weightOnlyBatchedGemvBs3Int4b.cu | 98 + .../weightOnlyBatchedGemvBs3Int8b.cu | 98 + .../weightOnlyBatchedGemvBs4Int4b.cu | 97 + .../weightOnlyBatchedGemvBs4Int8b.cu | 98 + ...OnlyGroupwiseMatrixVectorMultiplication.cu | 236 -- .../weightOnlyMatrixVectorMultiplication.cu | 381 --- .../weightOnlyMatrixVectorMultiplication.h | 48 - .../layers/baseBeamSearchLayer.cu | 30 +- cpp/tensorrt_llm/layers/baseBeamSearchLayer.h | 6 +- cpp/tensorrt_llm/layers/baseSamplingLayer.cpp | 22 +- cpp/tensorrt_llm/layers/baseSamplingLayer.h | 4 +- .../layers/dynamicDecodeLayer.cpp | 25 +- .../layers/onlineBeamSearchLayer.cu | 8 +- cpp/tensorrt_llm/layers/topPSamplingLayer.cu | 1 - cpp/tensorrt_llm/plugins/CMakeLists.txt | 21 +- cpp/tensorrt_llm/plugins/api/InferPlugin.cpp | 179 -- cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp | 209 ++ cpp/tensorrt_llm/plugins/api/tllmPlugin.h | 56 + .../bertAttentionPlugin.cpp | 62 +- .../bertAttentionPlugin/bertAttentionPlugin.h | 30 +- .../plugins/common/checkMacrosPlugin.cpp | 126 +- .../plugins/common/checkMacrosPlugin.h | 276 +- .../plugins/common/gemmPluginProfiler.h | 470 +++ cpp/tensorrt_llm/plugins/common/plugin.cpp | 51 +- cpp/tensorrt_llm/plugins/common/plugin.h | 100 +- cpp/tensorrt_llm/plugins/exports.map | 9 +- .../plugins/gemmPlugin/gemmPlugin.cpp | 364 ++- .../plugins/gemmPlugin/gemmPlugin.h | 118 +- .../gptAttentionCommon/gptAttentionCommon.cpp | 106 +- .../gptAttentionCommon/gptAttentionCommon.h | 59 +- .../gptAttentionCommonImpl.h | 9 +- .../gptAttentionPlugin/gptAttentionPlugin.cpp | 126 +- .../gptAttentionPlugin/gptAttentionPlugin.h | 47 +- .../plugins/identityPlugin/identityPlugin.cpp | 30 +- .../plugins/identityPlugin/identityPlugin.h | 30 +- .../layernormPlugin/layernormPlugin.cpp | 38 +- .../plugins/layernormPlugin/layernormPlugin.h | 30 +- .../layernormQuantizationPlugin.cpp | 44 +- .../layernormQuantizationPlugin.h | 30 +- .../plugins/lookupPlugin/lookupPlugin.cpp | 40 +- .../plugins/lookupPlugin/lookupPlugin.h | 30 +- .../plugins/ncclPlugin/allgatherPlugin.cpp | 34 +- .../plugins/ncclPlugin/allgatherPlugin.h | 30 +- .../plugins/ncclPlugin/allreducePlugin.cpp | 34 +- .../plugins/ncclPlugin/allreducePlugin.h | 30 +- .../plugins/ncclPlugin/recvPlugin.cpp | 34 +- .../plugins/ncclPlugin/recvPlugin.h | 30 +- .../plugins/ncclPlugin/sendPlugin.cpp | 34 +- .../plugins/ncclPlugin/sendPlugin.h | 30 +- .../quantizePerTokenPlugin.cpp | 38 +- .../quantizePerTokenPlugin.h | 29 +- .../quantizeTensorPlugin.cpp | 40 +- .../quantizeTensorPlugin.h | 29 +- .../plugins/rmsnormPlugin/rmsnormPlugin.cpp | 36 +- .../plugins/rmsnormPlugin/rmsnormPlugin.h | 30 +- .../rmsnormQuantizationPlugin.cpp | 42 +- .../rmsnormQuantizationPlugin.h | 30 +- .../smoothQuantGemmPlugin.cpp | 222 +- .../smoothQuantGemmPlugin.h | 83 +- .../weightOnlyGroupwiseQuantMatmulPlugin.cpp | 190 +- .../weightOnlyGroupwiseQuantMatmulPlugin.h | 90 +- .../weightOnlyQuantMatmulPlugin.cpp | 193 +- .../weightOnlyQuantMatmulPlugin.h | 76 +- cpp/tensorrt_llm/runtime/CMakeLists.txt | 9 +- cpp/tensorrt_llm/runtime/bufferManager.cpp | 18 +- cpp/tensorrt_llm/runtime/gptDecoder.cpp | 35 +- cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp | 40 +- cpp/tensorrt_llm/runtime/gptJsonConfig.cpp | 29 +- cpp/tensorrt_llm/runtime/gptSession.cpp | 281 +- 
cpp/tensorrt_llm/runtime/ncclCommunicator.cpp | 135 + cpp/tensorrt_llm/runtime/ncclCommunicator.h | 44 + cpp/tensorrt_llm/runtime/runtimeBuffers.cpp | 289 +- cpp/tensorrt_llm/runtime/runtimeBuffers.h | 33 +- cpp/tensorrt_llm/runtime/runtimeKernels.cu | 131 +- .../runtime/statefulGptDecoder.cpp | 44 +- cpp/tensorrt_llm/runtime/tllmRuntime.cpp | 42 +- cpp/tensorrt_llm/runtime/tllmRuntime.h | 4 +- cpp/tensorrt_llm/runtime/torchView.h | 1 + .../runtime/utils/multiDeviceUtils.h | 53 + .../runtime/utils/sessionUtils.cpp | 38 +- cpp/tensorrt_llm/runtime/utils/sessionUtils.h | 12 +- cpp/tensorrt_llm/runtime/worldConfig.cpp | 38 +- cpp/tensorrt_llm/thop/CMakeLists.txt | 6 +- cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp | 4 +- cpp/tensorrt_llm/thop/dynamicDecodeOp.h | 4 +- cpp/tensorrt_llm/thop/fp8Op.cpp | 6 +- cpp/tensorrt_llm/thop/gatherTreeOp.cpp | 28 +- cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp | 16 +- cpp/tests/README.md | 8 + cpp/tests/resources/.gitignore | 1 + .../resources/scripts/build_gpt_engines.py | 9 - .../resources/scripts/build_gptj_engines.py | 8 +- .../resources/scripts/build_llama_engines.py | 86 + .../scripts/generate_expected_llama_output.py | 59 + cpp/tests/resources/scripts/test_cpp.py | 48 +- cpp/tests/runtime/gptDecoderBatchTest.cpp | 36 +- cpp/tests/runtime/gptDecoderTest.cpp | 5 +- cpp/tests/runtime/gptSessionTest.cpp | 122 +- cpp/tests/runtime/runtimeKernelTest.cpp | 257 +- docker/Dockerfile.multi | 26 +- docker/Makefile | 4 +- docs/Doxygen | 2658 +++++++++++++++++ docs/Makefile | 20 + docs/README.md | 40 + docs/graph-rewriting.md | 193 ++ docs/make.bat | 35 + docs/requirements.txt | 4 + .../2023-05-17-how-to-add-a-new-model.md | 0 docs/{ => source}/2023-05-19-how-to-debug.md | 0 docs/source/CONTRIBUTING.md | 69 + README.md => docs/source/README.md | 0 docs/{ => source}/architecture.md | 0 docs/source/conf.py | 88 + docs/{ => source}/gpt_attention.md | 0 docs/{ => source}/gpt_runtime.md | 7 +- docs/{ => source}/in_flight_batching.md | 0 docs/source/index.rst | 65 + docs/{ => source}/performance.md | 0 docs/{ => source}/precision.md | 0 .../python-api/tensorrt_llm.functional.rst | 11 + .../source/python-api/tensorrt_llm.layers.rst | 69 + .../source/python-api/tensorrt_llm.models.rst | 11 + .../source/python-api/tensorrt_llm.plugin.rst | 10 + .../python-api/tensorrt_llm.quantization.rst | 10 + .../python-api/tensorrt_llm.runtime.rst | 11 + examples/baichuan/build.py | 19 +- examples/baichuan/run.py | 5 +- examples/baichuan/summarize.py | 3 +- examples/bloom/README.md | 20 +- examples/bloom/build.py | 41 +- examples/bloom/summarize.py | 3 +- examples/bloom/weight.py | 25 +- examples/chatglm2-6b/build.py | 2 + examples/chatglm6b/build.py | 2 + examples/cpp_library/main.cpp | 1 - examples/cpp_library/tensorrt_llm_libutils.h | 8 + examples/falcon/README.md | 35 +- examples/falcon/build.py | 140 +- examples/falcon/requirements.txt | 1 + examples/falcon/run.py | 191 +- examples/falcon/summarize.py | 66 +- examples/falcon/weight.py | 139 +- examples/gpt/README.md | 7 +- examples/gpt/build.py | 34 +- examples/gpt/hf_gpt_convert.py | 7 + examples/gpt/run.py | 5 +- examples/gpt/summarize.py | 3 +- examples/gpt/weight.py | 18 +- examples/gptj/README.md | 30 + examples/gptj/build.py | 21 +- examples/gptj/run.py | 5 +- examples/gptj/summarize.py | 3 +- examples/gptj/weight.py | 10 +- examples/gptneox/README.md | 1 + examples/gptneox/build.py | 26 +- examples/gptneox/summarize.py | 3 +- examples/llama/README.md | 89 +- examples/llama/build.py | 108 +- examples/llama/convert.py | 4 +- 
examples/llama/hf_llama_convert.py | 25 + examples/llama/run.py | 22 +- examples/llama/summarize.py | 22 +- examples/llama/weight.py | 346 ++- examples/mpt/build.py | 16 + examples/mpt/run.py | 5 +- examples/openai_triton/CMakeLists.txt | 19 +- .../TritonFlashAttentionPlugin.cpp | 45 +- .../TritonFlashAttentionPlugin.h | 30 +- examples/openai_triton/plugin.py | 6 +- examples/openai_triton/tritonPlugins.cpp | 20 +- examples/opt/build.py | 4 +- examples/opt/summarize.py | 3 +- examples/quantization/summarize.py | 3 +- requirements-dev-windows.txt | 25 + requirements-dev.txt | 3 +- requirements-windows.txt | 20 + requirements.txt | 4 +- scripts/build_wheel.py | 80 +- setup.py | 29 +- tensorrt_llm/__init__.py | 6 +- tensorrt_llm/_common.py | 6 +- tensorrt_llm/_utils.py | 1 + tensorrt_llm/builder.py | 48 +- tensorrt_llm/functional.py | 220 +- tensorrt_llm/graph_rewriting.py | 16 +- tensorrt_llm/layers/__init__.py | 5 +- tensorrt_llm/layers/attention.py | 170 +- tensorrt_llm/layers/linear.py | 28 +- tensorrt_llm/models/baichuan/model.py | 114 +- tensorrt_llm/models/bloom/model.py | 161 +- tensorrt_llm/models/chatglm2_6b/model.py | 225 +- tensorrt_llm/models/chatglm6b/model.py | 185 +- tensorrt_llm/models/falcon/model.py | 230 +- tensorrt_llm/models/generation_mixin.py | 319 +- tensorrt_llm/models/gpt/model.py | 204 +- tensorrt_llm/models/gptj/model.py | 116 +- tensorrt_llm/models/gptneox/model.py | 175 +- tensorrt_llm/models/llama/model.py | 272 +- tensorrt_llm/models/opt/model.py | 130 +- tensorrt_llm/models/quantized/quant.py | 20 +- tensorrt_llm/network.py | 30 +- tensorrt_llm/plugin/__init__.py | 8 +- tensorrt_llm/plugin/plugin.py | 29 +- tensorrt_llm/quantization/functional.py | 20 +- tensorrt_llm/quantization/layers.py | 215 +- tensorrt_llm/runtime/generation.py | 182 +- tensorrt_llm/tools/__init__.py | 0 tensorrt_llm/tools/plugin_gen/__init__.py | 0 tensorrt_llm/tools/plugin_gen/core.py | 693 +++++ tensorrt_llm/tools/plugin_gen/plugin_gen.py | 339 +++ tensorrt_llm/tools/plugin_gen/shape_infer.py | 322 ++ .../plugin_gen/templates/CMakeLists.txt.tpl | 73 + .../plugin_gen/templates/functional.py.tpl | 70 + .../tools/plugin_gen/templates/plugin.cpp.tpl | 302 ++ .../tools/plugin_gen/templates/plugin.h.tpl | 120 + .../templates/tritonPlugins.cpp.tpl | 147 + tests/attention/test_gpt_attention.py | 196 +- tests/attention/test_gpt_attention_IFB.py | 127 +- tests/functional/test_arange.py | 91 + tests/model/test_bert.py | 3 +- tests/model/test_bloom.py | 5 +- tests/model/test_falcon.py | 8 +- tests/model/test_gpt.py | 106 +- tests/model/test_gptj.py | 31 +- tests/model/test_gptneox.py | 2 +- tests/model/test_llama.py | 18 +- tests/quantization/test_smooth_quant_gemm.py | 4 + tests/test_graph_rewriter.py | 49 +- tests/test_layer.py | 21 +- tests/test_plugins.py | 17 + tests/tools/__init__.py | 0 tests/tools/plugin_gen/__init__.py | 0 tests/tools/plugin_gen/build_engine.py | 195 ++ tests/tools/plugin_gen/kernel_config.py | 49 + tests/tools/plugin_gen/run_engine.py | 169 ++ tests/tools/plugin_gen/test_core.py | 58 + tests/tools/plugin_gen/test_plugin_gen.py | 21 + tests/tools/plugin_gen/test_shape_infer.py | 59 + 317 files changed, 15996 insertions(+), 6294 deletions(-) rename {cpp/benchmarks => benchmarks/cpp}/CMakeLists.txt (100%) rename {cpp/benchmarks => benchmarks/cpp}/README.md (91%) rename {cpp/benchmarks => benchmarks/cpp}/bertBenchmark.cpp (99%) rename {cpp/benchmarks => benchmarks/cpp}/gptSessionBenchmark.cpp (96%) rename benchmarks/{ => python}/README.md (75%) rename benchmarks/{ => 
python}/allowed_configs.py (83%) rename benchmarks/{ => python}/base_benchmark.py (100%) rename benchmarks/{ => python}/benchmark.py (100%) rename benchmarks/{ => python}/bert_benchmark.py (100%) rename benchmarks/{ => python}/gpt_benchmark.py (93%) rename benchmarks/{ => python}/mem_monitor.py (100%) create mode 100644 cpp/include/tensorrt_llm/batch_manager/batchScheduler.h create mode 100644 cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h delete mode 100644 cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.a delete mode 100644 cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.pre_cxx11.a delete mode 100644 cpp/tensorrt_llm/common/int8Utils.cuh create mode 100644 cpp/tensorrt_llm/common/nvtxUtils.h create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.cu rename cpp/tensorrt_llm/kernels/{weightOnlyGroupwiseMatrixVectorMultiplication.h => weightOnlyBatchedGemv/kernelLauncher.h} (52%) create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int4b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int8b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int4b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int8b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int4b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int8b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int4b.cu create mode 100644 cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int8b.cu delete mode 100644 cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.cu delete mode 100644 cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.cu delete mode 100644 cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.h delete mode 100644 cpp/tensorrt_llm/plugins/api/InferPlugin.cpp create mode 100644 cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp create mode 100644 cpp/tensorrt_llm/plugins/api/tllmPlugin.h create mode 100644 cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.h mode change 100755 => 100644 cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h mode change 100755 => 100644 cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h mode change 100755 => 100644 cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h create mode 100644 cpp/tensorrt_llm/runtime/ncclCommunicator.cpp create mode 100644 cpp/tensorrt_llm/runtime/ncclCommunicator.h create mode 100644 cpp/tensorrt_llm/runtime/utils/multiDeviceUtils.h create mode 100644 cpp/tests/resources/scripts/build_llama_engines.py create mode 100644 cpp/tests/resources/scripts/generate_expected_llama_output.py create mode 100644 docs/Doxygen create mode 100644 docs/Makefile create mode 100644 docs/README.md create mode 100644 docs/graph-rewriting.md create mode 100644 docs/make.bat create mode 100644 docs/requirements.txt rename docs/{ => source}/2023-05-17-how-to-add-a-new-model.md (100%) rename docs/{ => source}/2023-05-19-how-to-debug.md (100%) create mode 100644 
docs/source/CONTRIBUTING.md rename README.md => docs/source/README.md (100%) rename docs/{ => source}/architecture.md (100%) create mode 100644 docs/source/conf.py rename docs/{ => source}/gpt_attention.md (100%) rename docs/{ => source}/gpt_runtime.md (98%) rename docs/{ => source}/in_flight_batching.md (100%) create mode 100644 docs/source/index.rst rename docs/{ => source}/performance.md (100%) rename docs/{ => source}/precision.md (100%) create mode 100644 docs/source/python-api/tensorrt_llm.functional.rst create mode 100644 docs/source/python-api/tensorrt_llm.layers.rst create mode 100644 docs/source/python-api/tensorrt_llm.models.rst create mode 100644 docs/source/python-api/tensorrt_llm.plugin.rst create mode 100644 docs/source/python-api/tensorrt_llm.quantization.rst create mode 100644 docs/source/python-api/tensorrt_llm.runtime.rst create mode 100644 requirements-dev-windows.txt create mode 100644 requirements-windows.txt create mode 100644 tensorrt_llm/tools/__init__.py create mode 100644 tensorrt_llm/tools/plugin_gen/__init__.py create mode 100644 tensorrt_llm/tools/plugin_gen/core.py create mode 100644 tensorrt_llm/tools/plugin_gen/plugin_gen.py create mode 100644 tensorrt_llm/tools/plugin_gen/shape_infer.py create mode 100644 tensorrt_llm/tools/plugin_gen/templates/CMakeLists.txt.tpl create mode 100644 tensorrt_llm/tools/plugin_gen/templates/functional.py.tpl create mode 100644 tensorrt_llm/tools/plugin_gen/templates/plugin.cpp.tpl create mode 100644 tensorrt_llm/tools/plugin_gen/templates/plugin.h.tpl create mode 100644 tensorrt_llm/tools/plugin_gen/templates/tritonPlugins.cpp.tpl create mode 100644 tests/functional/test_arange.py create mode 100644 tests/test_plugins.py create mode 100644 tests/tools/__init__.py create mode 100644 tests/tools/plugin_gen/__init__.py create mode 100644 tests/tools/plugin_gen/build_engine.py create mode 100644 tests/tools/plugin_gen/kernel_config.py create mode 100644 tests/tools/plugin_gen/run_engine.py create mode 100644 tests/tools/plugin_gen/test_core.py create mode 100644 tests/tools/plugin_gen/test_plugin_gen.py create mode 100644 tests/tools/plugin_gen/test_shape_infer.py diff --git a/cpp/benchmarks/CMakeLists.txt b/benchmarks/cpp/CMakeLists.txt similarity index 100% rename from cpp/benchmarks/CMakeLists.txt rename to benchmarks/cpp/CMakeLists.txt diff --git a/cpp/benchmarks/README.md b/benchmarks/cpp/README.md similarity index 91% rename from cpp/benchmarks/README.md rename to benchmarks/cpp/README.md index e9c996d545f..d28378726a4 100644 --- a/cpp/benchmarks/README.md +++ b/benchmarks/cpp/README.md @@ -7,7 +7,7 @@ multiple GPUs or multiple nodes with multiple GPUs. ### 1. Build TensorRT-LLM and benchmarking source code -Please follow the [`installation document`](../../README.md) to build TensorRT-LLM. +Please follow the [`installation document`](../../../README.md) to build TensorRT-LLM. After that, you can build benchmarking source code for C++ runtime ``` @@ -19,7 +19,7 @@ make -j benchmarks Before you launch C++ benchmarking, please make sure that you have already built engine(s) using TensorRT-LLM API, C++ benchmarking code cannot generate engine(s) for you. -You can reuse the engine built by benchmarking code for Python Runtime, please see that [`document`](../../benchmarks/README.md). +You can reuse the engine built by benchmarking code for Python Runtime, please see that [`document`](../python/README.md). 
For detailed usage, you can do the following ``` diff --git a/cpp/benchmarks/bertBenchmark.cpp b/benchmarks/cpp/bertBenchmark.cpp similarity index 99% rename from cpp/benchmarks/bertBenchmark.cpp rename to benchmarks/cpp/bertBenchmark.cpp index 48fb8492cf0..06f148cad87 100644 --- a/cpp/benchmarks/bertBenchmark.cpp +++ b/benchmarks/cpp/bertBenchmark.cpp @@ -15,13 +15,13 @@ * limitations under the License. */ #include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/tllmLogger.h" #include "tensorrt_llm/runtime/tllmRuntime.h" #include "tensorrt_llm/runtime/worldConfig.h" #include -#include #include #include #include @@ -228,7 +228,7 @@ int main(int argc, char* argv[]) { throw std::invalid_argument("Unexpected log level: " + logLevel); } - initLibNvInferPlugins(logger.get(), "tensorrt_llm"); + initTrtLlmPlugins(logger.get()); benchmarkBert(result["model"].as(), result["engine_dir"].as(), batchSizes, inLens, logger, result["warm_up"].as(), result["num_runs"].as(), result["duration"].as()); diff --git a/cpp/benchmarks/gptSessionBenchmark.cpp b/benchmarks/cpp/gptSessionBenchmark.cpp similarity index 96% rename from cpp/benchmarks/gptSessionBenchmark.cpp rename to benchmarks/cpp/gptSessionBenchmark.cpp index c6b08f242e9..ebe7b8de12f 100644 --- a/cpp/benchmarks/gptSessionBenchmark.cpp +++ b/benchmarks/cpp/gptSessionBenchmark.cpp @@ -15,12 +15,12 @@ * limitations under the License. */ #include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/runtime/gptJsonConfig.h" #include "tensorrt_llm/runtime/gptSession.h" #include "tensorrt_llm/runtime/tllmLogger.h" #include -#include #include #include #include @@ -41,7 +41,10 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con auto const json = GptJsonConfig::parse(dataPath / "config.json"); auto const modelConfig = json.getModelConfig(); auto const inputPacked = modelConfig.usePackedInput(); - auto const worldConfig = WorldConfig::mpi(*logger); + SizeType deviceCount{0}; + TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + auto const worldConfig + = WorldConfig::mpi(*logger, deviceCount, json.getTensorParallelism(), json.getPipelineParallelism()); auto const enginePath = dataPath / json.engineFilename(worldConfig, modelName); auto const dtype = modelConfig.getDataType(); auto const useHalf = (dtype == nvinfer1::DataType::kHALF); @@ -233,7 +236,7 @@ int main(int argc, char* argv[]) // Argument: Enable CUDA graph auto enableCudaGraph = result.count("enable_cuda_graph") > 0; - initLibNvInferPlugins(logger.get(), "tensorrt_llm"); + initTrtLlmPlugins(logger.get()); benchmarkGptSession(result["model"].as(), result["engine_dir"].as(), batchSizes, inOutLen, logger, result["warm_up"].as(), result["num_runs"].as(), result["duration"].as(), diff --git a/benchmarks/README.md b/benchmarks/python/README.md similarity index 75% rename from benchmarks/README.md rename to benchmarks/python/README.md index d7edddb2aaf..39e0743789a 100644 --- a/benchmarks/README.md +++ b/benchmarks/python/README.md @@ -5,12 +5,12 @@ multiple GPUs or multiple nodes with multiple GPUs. ## Overview -The benchmark implementation and entrypoint can be found in [`benchmarks/benchmark.py`](./benchmark.py). There are some other scripts in the directory: +The benchmark implementation and entrypoint can be found in [`benchmarks/python/benchmark.py`](./benchmark.py). 
There are some other scripts in the directory: -* [`benchmarks/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model. -* [`benchmarks/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmark. -* [`benchmarks/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like(LLaMA/OPT/GPT-J/SmoothQuant-GPT) models. -* [`benchmarks/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models. +* [`benchmarks/python/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model. +* [`benchmarks/python/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmark. +* [`benchmarks/python/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like(LLaMA/OPT/GPT-J/SmoothQuant-GPT) models. +* [`benchmarks/python/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models. ## Usage diff --git a/benchmarks/allowed_configs.py b/benchmarks/python/allowed_configs.py similarity index 83% rename from benchmarks/allowed_configs.py rename to benchmarks/python/allowed_configs.py index 51e566a8514..5a6ce1a7670 100644 --- a/benchmarks/allowed_configs.py +++ b/benchmarks/python/allowed_configs.py @@ -14,12 +14,12 @@ # limitations under the License. from typing import Literal, Optional -from pydantic import BaseModel +from pydantic import BaseModel, Extra from tensorrt_llm.functional import PositionEmbeddingType -class BuildConfig(BaseModel): +class BuildConfig(BaseModel, extra=Extra.allow): num_layers: int num_heads: int hidden_size: int @@ -28,10 +28,10 @@ class BuildConfig(BaseModel): n_positions: int max_batch_size: int max_input_len: int - num_kv_heads: int = None + num_kv_heads: Optional[int] = None max_output_len: Optional[int] = None - builder_opt: Optional[ - int] = None # TRT builder_optimization_level from 0 to 5 + # TRT builder_optimization_level from 0 to 5 + builder_opt: Optional[int] = None inter_size: Optional[int] = None rotary_dim: Optional[int] = None type_vocab_size: Optional[int] = None @@ -44,11 +44,10 @@ class BuildConfig(BaseModel): enable_context_fmha: bool = True # None means using the model family's default value defined in the ctor position_embedding_type: Optional[PositionEmbeddingType] = None - # Only when position embedding is RoPE, this value makes sense, make default value to be None, not 0 or 1 - # to prevent misuse + # Only when position embedding is RoPE, this value makes sense, make + # default value to be None, not 0 or 1 to prevent misuse rotary_pct: Optional[float] = None bias: bool = True - remove_input_padding: bool = True class ModelConfig(BaseModel): @@ -439,6 +438,89 @@ class ModelConfig(BaseModel): enable_qk_half_accum=False, enable_context_fmha=False, )), + "falcon_rw_1b": + ModelConfig(name="falcon_rw_1b", + family="falcon", + benchmark_type="gpt", + build_config=BuildConfig( + num_layers=24, + num_heads=32, + hidden_size=2048, + vocab_size=50304, + hidden_act=None, + n_positions=2048, + max_batch_size=256, + max_input_len=1024, + max_output_len=1024, + builder_opt=None, + bias=True, + use_alibi=True, + parallel_attention=False, + new_decoder_architecture=False, + )), + "falcon_7b": + ModelConfig(name="falcon_7b", + family="falcon", + benchmark_type="gpt", + build_config=BuildConfig( + num_layers=32, + num_heads=71, + num_kv_heads=1, + hidden_size=4544, + vocab_size=65024, + hidden_act=None, + n_positions=2048, + max_batch_size=128, + 
max_input_len=512, + max_output_len=200, + builder_opt=None, + bias=False, + use_alibi=False, + parallel_attention=True, + new_decoder_architecture=False, + )), + "falcon_40b": + ModelConfig(name="falcon_40b", + family="falcon", + benchmark_type="gpt", + build_config=BuildConfig( + num_layers=60, + num_heads=128, + num_kv_heads=8, + hidden_size=8192, + vocab_size=65024, + hidden_act=None, + n_positions=2048, + max_batch_size=64, + max_input_len=512, + max_output_len=200, + builder_opt=None, + bias=False, + use_alibi=False, + parallel_attention=True, + new_decoder_architecture=False, + )), + "falcon_180b": + ModelConfig(name="falcon_180b", + family="falcon", + benchmark_type="gpt", + build_config=BuildConfig( + num_layers=80, + num_heads=232, + num_kv_heads=8, + hidden_size=14848, + vocab_size=65024, + hidden_act=None, + n_positions=2048, + max_batch_size=8, + max_input_len=1024, + max_output_len=1024, + builder_opt=None, + bias=False, + use_alibi=False, + parallel_attention=True, + new_decoder_architecture=False, + )), } diff --git a/benchmarks/base_benchmark.py b/benchmarks/python/base_benchmark.py similarity index 100% rename from benchmarks/base_benchmark.py rename to benchmarks/python/base_benchmark.py diff --git a/benchmarks/benchmark.py b/benchmarks/python/benchmark.py similarity index 100% rename from benchmarks/benchmark.py rename to benchmarks/python/benchmark.py diff --git a/benchmarks/bert_benchmark.py b/benchmarks/python/bert_benchmark.py similarity index 100% rename from benchmarks/bert_benchmark.py rename to benchmarks/python/bert_benchmark.py diff --git a/benchmarks/gpt_benchmark.py b/benchmarks/python/gpt_benchmark.py similarity index 93% rename from benchmarks/gpt_benchmark.py rename to benchmarks/python/gpt_benchmark.py index f11f868272c..e66c7a180e7 100644 --- a/benchmarks/gpt_benchmark.py +++ b/benchmarks/python/gpt_benchmark.py @@ -81,22 +81,24 @@ def __init__(self, self.per_token = False self.per_channel = False - self.use_gpt_attention_plugin = False - self.use_gemm_plugin = False - self.use_layernorm_plugin = False - self.use_rmsnorm_plugin = False - self.use_lookup_plugin = False + is_plugin_mode = mode == 'plugin' + plg_dtype = dtype if is_plugin_mode else False + self.use_gpt_attention_plugin = plg_dtype + self.use_gemm_plugin = plg_dtype + self.use_layernorm_plugin = plg_dtype + # Enable RMS Norm plugin for the LLaMA family. + if is_plugin_mode and 'llama' in model_name: + self.use_rmsnorm_plugin = dtype + else: + self.use_rmsnorm_plugin = False + self.use_lookup_plugin = plg_dtype self.enable_context_fmha = True self.quant_mode = QuantMode(0) - if mode == 'plugin': - self.use_gpt_attention_plugin = dtype - self.use_gemm_plugin = dtype - self.use_layernorm_plugin = dtype - self.use_lookup_plugin = dtype - if "llama" in model_name: - self.use_rmsnorm_plugin = dtype + self.remove_input_padding = is_plugin_mode + for key, value in get_build_config(model_name).items(): setattr(self, key, value) + # Override the n_position/max_input_len/max_output_len/max_batch_size to value from cmd line if that's specified. 
if n_positions is not None: assert isinstance( @@ -122,6 +124,7 @@ def __init__(self, self.num_kv_heads = self.num_heads if kwargs.get('force_num_layer_1', False): self.num_layers = 1 + if self.use_smooth_quant: self.quant_mode = QuantMode.use_smooth_quant( self.per_token, self.per_channel) @@ -195,7 +198,7 @@ def prepare_inputs(self, config): input_lengths = torch.tensor([inlen for _ in range(batch_size)]).int().cuda() - self.decoder.setup(batch_size, inlen, outlen) + self.decoder.setup(batch_size, inlen, outlen, beam_width=self.num_beams) return (input_ids, input_lengths) def build(self): @@ -334,6 +337,21 @@ def build(self): world_size=self.world_size, tp_size=self.world_size), # TP only use_parallel_embedding=(self.model_name == 'bloom_176b')) + elif family == "falcon": + tensorrt_llm_model = tensorrt_llm.models.FalconForCausalLM( + num_layers=self.num_layers, + num_heads=self.num_heads, + num_kv_heads=self.num_kv_heads, + hidden_size=self.hidden_size, + vocab_size=self.vocab_size, + max_position_embeddings=self.n_positions, + dtype=kv_dtype, + bias=self.bias, + use_alibi=self.use_alibi, + new_decoder_architecture=self.new_decoder_architecture, + parallel_attention=self.parallel_attention, + mapping=tensorrt_llm.Mapping(world_size=self.world_size, + tp_size=self.world_size)) else: raise Exception(f'Unexpected model: {self.model_name}') @@ -429,7 +447,7 @@ def build(self): def run(self, inputs, config): batch_size, inlen, outlen = config[0], config[1], config[2] - self.decoder.setup(batch_size, inlen, outlen) + self.decoder.setup(batch_size, inlen, outlen, beam_width=self.num_beams) if self.remove_input_padding: self.decoder.decode_batch(inputs[0], self.sampling_config) else: diff --git a/benchmarks/mem_monitor.py b/benchmarks/python/mem_monitor.py similarity index 100% rename from benchmarks/mem_monitor.py rename to benchmarks/python/mem_monitor.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1b94dabce9d..58ee3e8a638 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,6 +28,14 @@ project(tensorrt_llm LANGUAGES CXX) option(BUILD_PYT "Build in PyTorch TorchScript class mode" ON) option(BUILD_TESTS "Build Google tests" ON) option(BUILD_BENCHMARKS "Build benchmarks" ON) +option(NVTX_DISABLE "Disable all NVTX features" ON) + +if(NVTX_DISABLE) + add_compile_definitions("NVTX_DISABLE") + message(STATUS "NVTX is disabled") +else() + message(STATUS "NVTX is enabled") +endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/batch_manager/CMakeLists.txt") @@ -67,15 +75,35 @@ endif() check_language(CUDA) if(CMAKE_CUDA_COMPILER) message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}") - execute_process( - COMMAND - "bash" "-c" - "${CMAKE_CUDA_COMPILER} --version | egrep -o 'V[0-9]+.[0-9]+.[0-9]+' | cut -c2-" - RESULT_VARIABLE _BASH_SUCCESS - OUTPUT_VARIABLE CMAKE_CUDA_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - if(NOT _BASH_SUCCESS EQUAL 0) - message(FATAL_ERROR "Failed to determine CUDA version") + if(NOT WIN32) # Linux + execute_process( + COMMAND + "bash" "-c" + "${CMAKE_CUDA_COMPILER} --version | egrep -o 'V[0-9]+.[0-9]+.[0-9]+' | cut -c2-" + RESULT_VARIABLE _BASH_SUCCESS + OUTPUT_VARIABLE CMAKE_CUDA_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT _BASH_SUCCESS EQUAL 0) + message(FATAL_ERROR "Failed to determine CUDA version") + endif() + + else() # Windows + execute_process( + COMMAND ${CMAKE_CUDA_COMPILER} --version + OUTPUT_VARIABLE versionString + RESULT_VARIABLE versionResult) + + if(versionResult EQUAL 0 AND versionString MATCHES + 
"V[0-9]+\\.[0-9]+\\.[0-9]+") + string(REGEX REPLACE "V" "" version ${CMAKE_MATCH_0}) + set(CMAKE_CUDA_COMPILER_VERSION "${version}") + else() + message(FATAL_ERROR "Failed to determine CUDA version") + endif() + + # Export shared libs as both `.lib` and `.dll` to avoid linking errors. + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() else() message(FATAL_ERROR "No CUDA compiler found") @@ -102,85 +130,94 @@ message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES}") enable_language(CUDA) -# TODO: FindCUDA is deprecated and should be replaced by FindCUDAToolkit -# https://cmake.org/cmake/help/latest/module/FindCUDA.html -find_package(CUDA ${CUDA_REQUIRED_VERSION} REQUIRED) - -message(STATUS "CUDA_TOOLKIT_ROOT_DIR: ${CUDA_TOOLKIT_ROOT_DIR}") +find_package(CUDAToolkit REQUIRED) find_library( CUDNN_LIB cudnn - HINTS ${CUDA_TOOLKIT_ROOT_DIR} ${CUDNN_ROOT_DIR} - PATH_SUFFIXES lib64 lib) + HINTS ${CUDNN_ROOT_DIR} ${CUDAToolkit_LIBRARY_DIR} + PATH_SUFFIXES lib64 lib lib/x64) find_library( CUBLAS_LIB cublas - HINTS ${CUDA_TOOLKIT_ROOT_DIR} + HINTS ${CUDAToolkit_LIBRARY_DIR} PATH_SUFFIXES lib64 lib lib/stubs) find_library( CUBLASLT_LIB cublasLt - HINTS ${CUDA_TOOLKIT_ROOT_DIR} + HINTS ${CUDAToolkit_LIBRARY_DIR} PATH_SUFFIXES lib64 lib lib/stubs) -find_library( - CUDART_LIB cudart - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64) find_library( CUDA_DRV_LIB cuda - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64 lib/stubs lib64/stubs) -set(CUDA_LIBRARIES ${CUDART_LIB}) + HINTS ${CUDAToolkit_LIBRARY_DIR} + PATH_SUFFIXES stubs lib lib64 lib/stubs lib64/stubs) + +set(CMAKE_CUDA_RUNTIME_LIBRARY Static) find_library(RT_LIB rt) set_ifndef(ENABLE_MULTI_DEVICE 1) if(ENABLE_MULTI_DEVICE EQUAL 1) # NCCL dependencies - set_ifndef(NCCL_LIB_DIR /usr/lib/x86_64-linux-gnu/) + set_ifndef(NCCL_LIB_DIR /usr/lib/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu/) set_ifndef(NCCL_INCLUDE_DIR /usr/include/) find_library(NCCL_LIB nccl HINTS ${NCCL_LIB_DIR}) endif() -set(3RDPARTY_DIR ../3rdparty) -include_directories(${CUDA_INCLUDE_DIRS} ${CUDNN_ROOT_DIR}/include - ${NCCL_INCLUDE_DIR} ${3RDPARTY_DIR}/cutlass/include) +get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_SOURCE_DIR} PATH) + +set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty) +include_directories( + ${CUDA_INCLUDE_DIRS} ${CUDNN_ROOT_DIR}/include ${NCCL_INCLUDE_DIR} + ${3RDPARTY_DIR}/cutlass/include ${3RDPARTY_DIR}/NVTX/include) # TRT dependencies set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR}) -set_ifndef(TRT_INCLUDE_DIR /usr/include/x86_64-linux-gnu) +set_ifndef(TRT_INCLUDE_DIR /usr/include/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu) set(TRT_LIB nvinfer) find_library_create_target(${TRT_LIB} nvinfer SHARED ${TRT_LIB_DIR}) find_library_create_target(nvuffparser nvparsers SHARED ${TRT_LIB_DIR}) -if(${CUDA_VERSION} VERSION_GREATER_EQUAL "11") +if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11") add_definitions("-DENABLE_BF16") message( STATUS - "CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.0, enable -DENABLE_BF16 flag" + "CUDAToolkit_VERSION ${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR} is greater or equal than 11.0, enable -DENABLE_BF16 flag" ) endif() -if(${CUDA_VERSION} VERSION_GREATER_EQUAL "11.8") +if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11.8") add_definitions("-DENABLE_FP8") message( STATUS - "CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.8, enable -DENABLE_FP8 flag" + "CUDAToolkit_VERSION ${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR} is greater or 
equal than 11.8, enable -DENABLE_FP8 flag" ) endif() +# MPI MPI isn't used until tensorrt_llm/CMakeLists.txt is invoked. However, if +# it's not called before "CMAKE_CXX_FLAGS" is set, it breaks on Windows for some +# reason, so we just call it here as a workaround. +find_package(MPI REQUIRED) + # C++17 set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_FLAGS - "-Wno-deprecated-declarations ${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE}" + "${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE}" ) +# Disable deprecated declarations warnings +if(NOT WIN32) + set(CMAKE_CXX_FLAGS "-Wno-deprecated-declarations ${CMAKE_CXX_FLAGS}") +else() + # /wd4996 is the Windows equivalent to turn off warnings for deprecated + # declarations + set(CMAKE_CXX_FLAGS "/wd4996 ${CMAKE_CXX_FLAGS}") +endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") -set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}) -set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDA_PATH}/include) +set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDAToolkit_INCLUDE_DIR}) message(STATUS "COMMON_HEADER_DIRS: ${COMMON_HEADER_DIRS}") if(BUILD_PYT) @@ -277,5 +314,6 @@ if(BUILD_TESTS) endif() if(BUILD_BENCHMARKS) - add_subdirectory(benchmarks) + add_subdirectory(${TRT_LLM_ROOT_DIR}/benchmarks/cpp + ${CMAKE_BINARY_DIR}/benchmarks) endif() diff --git a/cpp/cmake/modules/find_library_create_target.cmake b/cpp/cmake/modules/find_library_create_target.cmake index c315d88ca78..1af806f19d0 100644 --- a/cpp/cmake/modules/find_library_create_target.cmake +++ b/cpp/cmake/modules/find_library_create_target.cmake @@ -31,8 +31,9 @@ macro(find_library_create_target target_name lib libtype hints) find_library(${lib}_LIB_PATH ${lib}) message(STATUS "Library that was found ${${lib}_LIB_PATH}") add_library(${target_name} ${libtype} IMPORTED) - set_property(TARGET ${target_name} PROPERTY IMPORTED_LOCATION - ${${lib}_LIB_PATH}) + set_target_properties( + ${target_name} PROPERTIES IMPORTED_LOCATION ${${lib}_LIB_PATH} + IMPORTED_IMPLIB ${${lib}_LIB_PATH}) message( STATUS "==========================================================================================" diff --git a/cpp/include/tensorrt_llm/batch_manager/GptManager.h b/cpp/include/tensorrt_llm/batch_manager/GptManager.h index 3adb7a9e5e9..d626e1f7659 100644 --- a/cpp/include/tensorrt_llm/batch_manager/GptManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/GptManager.h @@ -17,8 +17,10 @@ #pragma once #include "tensorrt_llm/batch_manager/BatchManager.h" +#include "tensorrt_llm/batch_manager/batchScheduler.h" #include "tensorrt_llm/batch_manager/callbacks.h" #include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h" #include #include #include @@ -42,12 +44,13 @@ class TrtGptModel; class GptManager { public: - using RequestTable = std::map; + using SizeType = tensorrt_llm::runtime::SizeType; + using RequestList = std::list>; - GptManager(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType, int32_t mMaxSeqLen, - int32_t maxNumRequests, int32_t maxBeamWidth, GetInferenceRequestsCallback getInferenceRequestsCb, + GptManager(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType, int32_t maxBeamWidth, + batch_scheduler::SchedulerPolicy schedulerPolicy, GetInferenceRequestsCallback getInferenceRequestsCb, 
SendResponseCallback sendResponseCb, PollStopSignalCallback pollStopSignalCb = nullptr, - std::optional maxTokensInPagedKvCache = std::nullopt); + const TrtGptModelOptionalParams& optionalParams = TrtGptModelOptionalParams()); /* Wraps the user-provided callback for requests. Adds requests to request table. @@ -56,9 +59,8 @@ class GptManager /* Does the following: 1. Returns completed requests to Triton - 2. Frees KV cache and other dedicated resources - 3. Deletes entry from request_table */ - BatchManagerErrorCode_t return_completed_requests(); + 2. Deletes entry from activeRequests */ + BatchManagerErrorCode_t returnCompletedRequests(); BatchManagerErrorCode_t pollStopSignals(); @@ -69,20 +71,23 @@ class GptManager 1. Maps batch manager requests to backend request 2. Invokes one step of backend 3. Updates state of all requests */ - virtual BatchManagerErrorCode_t step(RequestTable& requestTable); + virtual BatchManagerErrorCode_t step(RequestList& activeRequests, std::set& activeRequestsIds); private: - void validateLlmRequest(LlmRequest& newReq); - static LlmRequest fillLlmRequest(std::shared_ptr newReq); + void validateLlmRequest(LlmRequest& newReq) const; + static std::shared_ptr fillLlmRequest(std::shared_ptr newReq); static std::shared_ptr> getReqInputTokens(std::shared_ptr new_req); static int32_t getMaxNewTokens(std::shared_ptr newReq); std::shared_ptr mTrtGptModel; - int32_t mMaxNumRequests; - int32_t mMaxSeqLen; - - // Table of live requests - std::map mRequestTable; + SizeType mMaxInputLen; + SizeType mMaxOutputLen; + SizeType mMaxNumSequences; + + // List of live requests + RequestList mActiveRequests; + // IDs of live requests + std::set mActiveRequestsIds; GetInferenceRequestsCallback mGetInferenceRequestsCb; SendResponseCallback mSendResponseCb; diff --git a/cpp/include/tensorrt_llm/batch_manager/batchScheduler.h b/cpp/include/tensorrt_llm/batch_manager/batchScheduler.h new file mode 100644 index 00000000000..4208c8b4c2b --- /dev/null +++ b/cpp/include/tensorrt_llm/batch_manager/batchScheduler.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/runtime/common.h" +#include + +namespace tensorrt_llm::batch_manager::batch_scheduler +{ + +enum class SchedulerPolicy +{ + MAX_UTILIZATION, + GUARANTEED_COMPLETION, +}; + +class BatchScheduler +{ +public: + using RequestTable = std::map>; + using SizeType = tensorrt_llm::runtime::SizeType; + using RequestList = std::list>; + + BatchScheduler(int32_t maxNumRequests, int32_t maxInputLen, + std::shared_ptr kvCacheManager, SchedulerPolicy schedulerPolicy) + : mMaxNumRequests(maxNumRequests) + , mMaxInputLen(maxInputLen) + , mKvCacheManager(kvCacheManager) + , mSchedulerPolicy(schedulerPolicy) + { + } + + /// @brief Takes as input a sorted list of requests and outputs a map of requests + /// to update for this current iteration + RequestTable scheduleRequests(const RequestList& requestList); + +private: + /// @brief Schedule requests using the MAX_UTILIZATION policy + RequestTable scheduleRequestsMaxUtilization(const RequestList& requestList); + + /// @brief Try reserving resources to advance this req by one step, using MAX_UTILIZATION policy + bool trySchedulingRequestMaxUtilization( + const LlmRequest& req, SizeType& numScheduledRequests, SizeType& numScheduledBlocks); + + /// @brief Schedule requests using the GUARANTEED_COMPLETION policy + RequestTable scheduleRequestsGuaranteedCompletion(const RequestList& requestList); + + /// @brief Schedule up to mMaxNumRequests requests + RequestTable scheduleMaxNumRequests(const RequestList& requestList); + + /// The maximum number of requests to schedule + int32_t mMaxNumRequests; + + /// The maximum prompt length + int32_t mMaxInputLen; + + std::shared_ptr mKvCacheManager; + + /// The scheduling policy to use + SchedulerPolicy mSchedulerPolicy; +}; + +} // namespace tensorrt_llm::batch_manager::batch_scheduler diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index cd5845851a9..c6498620720 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -16,8 +16,10 @@ #pragma once +#include "tensorrt_llm/batch_manager/llmRequest.h" #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/cudaStream.h" +#include "tensorrt_llm/runtime/gptModelConfig.h" #include "tensorrt_llm/runtime/iTensor.h" #include @@ -26,11 +28,6 @@ #include #include -namespace tensorrt_llm::runtime -{ -class GptModelConfig; -} - namespace tensorrt_llm::batch_manager::kv_cache_manager { @@ -158,7 +155,6 @@ class BlockManager return mFreeBlocks.size(); } -private: [[nodiscard]] bool hasFreeBlocks(std::size_t numRequired = 1) const { return getNumFreeBlocks() >= numRequired; @@ -203,6 +199,17 @@ class KVCacheManager return mBlockManager; } + /// @brief Function that computes the number of KV cache blocks needed to advance a request by one iteration + /// @param req The request for which we need to calculate the number of needed KV cache blocks + /// @return The number of blocks + SizeType getNeededBlocksOneStep(const LlmRequest& req) const; + + /// @brief Function that computes the number of KV cache blocks needed to advance a request to completion (i.e.
for + /// maxNewTokens) + /// @param req The request for which we need to calculate the number of needed KV cache blocks + /// @return The number of blocks + SizeType getNeededBlocksToCompletion(const LlmRequest& req) const; + [[nodiscard]] std::vector const& getMemoryPools() const { return mPools; @@ -217,15 +224,21 @@ class KVCacheManager [[nodiscard]] std::vector getBlockPointersOfSlot( SizeType batchSlotIdx, SizeType beamWidth, SizeType maxBlocksPerSeq) const; - [[nodiscard]] std::vector getBlockPointersOfBatch( + [[nodiscard]] runtime::ITensor::UniquePtr getBlockPointersOfBatch( SizeType batchSize, SizeType beamWidth, SizeType maxBlocksPerSeq) const; // Volume of [2, numKvHeads, tokensPerBlock, sizePerHead] - [[nodiscard]] static SizeType constexpr calculatePageSize(tensorrt_llm::runtime::GptModelConfig const& modelConfig); + [[nodiscard]] static SizeType constexpr calculatePageSize(tensorrt_llm::runtime::GptModelConfig const& modelConfig) + { + return 2 * modelConfig.getNbKvHeads() * modelConfig.getTokensPerBlock() * modelConfig.getSizePerHead(); + } // numLayers * 2 * numKvHeads * sizePerHead [[nodiscard]] static SizeType constexpr calculateCacheSizePerToken( - tensorrt_llm::runtime::GptModelConfig const& modelConfig); + tensorrt_llm::runtime::GptModelConfig const& modelConfig) + { + return modelConfig.getNbLayers() * 2 * modelConfig.getNbKvHeads() * modelConfig.getSizePerHead(); + } private: // Number of elements per one blocks diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 913576f1286..500561caefa 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -16,9 +16,9 @@ #pragma once -#include "tensorrt_llm/batch_manager/BatchManager.h" #include "tensorrt_llm/runtime/samplingConfig.h" +#include #include #include #include @@ -52,24 +52,135 @@ class LlmRequest , mIsStreaming(isStreaming) , mEndId(endId) , mPadId(padId) + , mBatchSlot(-1) { + mMaxSentTokenPos = mPromptLen - 1; // Scatter the input tokens to other beam mTokens = std::make_shared(mSamplingConfig.beamWidth, *input_tokens); } + /// @brief Get total number of tokens for this req (prompt + generated) + /// @param beam The beam index + /// @return The number of tokens + int32_t getNumTokens(int beam) const + { + return mTokens->at(beam).size(); + } + + /// @brief Get a token at a given position and beam index + /// @param beam The beam index + /// @param pos The position of the token relative to beginning of the prompt + /// @return The token index + int32_t getToken(int beam, int pos) const + { + return mTokens->at(beam).at(pos); + } + + /// @brief Get the tokens at a given beam index + /// @param beam The beam index + /// @return A vector of tokens for this beam index, includes the prompt + std::vector getTokens(int beam) const + { + return mTokens->at(beam); + } + + /// @brief Get the number of generated tokens + /// @return The number of generated tokens (doesn't include the prompt tokens) + int32_t getNumGeneratedTokens() const + { + return mNumGeneratedTokens; + } + + /// @brief Add new generated tokens to the vector of tokens + /// @param beamTokens A vector containing the tokens to add for each beam index + /// beamTokens is expected to be of size beamWidth + void addNewTokens(const std::vector& beamTokens) + { + assert(mSamplingConfig.beamWidth == beamTokens.size()); + for (int beam = 0; beam < beamTokens.size(); ++beam) + { + mTokens->at(beam).push_back(beamTokens[beam]); 
+ } + mNumGeneratedTokens++; + } + + /// @brief Sets the generated tokens for all beams. Erases all previous generated tokens. + /// @param generatedBeamTokens The generated tokens for all beams (vector of vector of tokens) + void setGeneratedTokens(const BeamTokens& generatedBeamTokens) + { + assert(generatedBeamTokens.size() == mSamplingConfig.beamWidth); + for (int beam = 0; beam < generatedBeamTokens.size(); ++beam) + { + auto& beamTokens = (*mTokens)[beam]; + beamTokens.resize(mPromptLen); + beamTokens.insert(beamTokens.end(), generatedBeamTokens[beam].begin(), generatedBeamTokens[beam].end()); + } + mNumGeneratedTokens = generatedBeamTokens.at(0).size(); + } + + /// @brief Pause a request by moving the generated tokens to the prompt + /// @param maxInputLen The maximum prompt len. + void pause(SizeType maxInputLen) + { + // TODO: For beamWidth > 1, we would need to support swapping to avoid + // recomputing from the start + // See https://jirasw.nvidia.com/browse/TRT-21715 + // As a temporary solution, we currently reset the tokens to the prompt + if (mSamplingConfig.beamWidth > 1) + { + for (auto& beamTokens : *mTokens) + { + beamTokens.resize(mPromptLen); + } + } + else + { + SizeType newPromptLen = std::min(maxInputLen, mPromptLen + mNumGeneratedTokens); + for (auto& beamTokens : *mTokens) + { + beamTokens.resize(newPromptLen); + } + mMaxNewTokens -= (newPromptLen - mPromptLen); + mPromptLen = newPromptLen; + } + mNumGeneratedTokens = 0; + mState = REQUEST_STATE_CONTEXT_INIT; + mBatchSlot = -1; + } + + /// @brief Get the maximum position of the tokens returned to the client. Use to ensure we don't return to client + /// duplicated token positions. + /// @return The maximum position of the tokens sent to the client + int32_t getMaxSentTokenPos() const + { + return mMaxSentTokenPos; + } + + /// @brief Sets the maximum position of the tokens returned to the client. Use to ensure we don't return to client + /// duplicated token positions. + /// @param pos The maximum position + void setMaxSentTokenPos(int32_t pos) + { + mMaxSentTokenPos = pos; + } + uint64_t mRequestId; int32_t mMaxNewTokens; // Tokens [beam_size, mPromptLen + mNumGeneratedTokens] - std::shared_ptr mTokens; runtime::SamplingConfig mSamplingConfig; int32_t mPromptLen; - int32_t mNumGeneratedTokens; LlmRequestState_t mState; bool mIsStreaming; std::optional mEndId; std::optional mPadId; + int32_t mBatchSlot; ~LlmRequest() {} + +private: + std::shared_ptr mTokens; + int32_t mNumGeneratedTokens; + int32_t mMaxSentTokenPos; }; } // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h new file mode 100644 index 00000000000..43184c935dc --- /dev/null +++ b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h @@ -0,0 +1,68 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/runtime/common.h" + +#include + +namespace tensorrt_llm::batch_manager +{ + +class TrtGptModelOptionalParams +{ +public: + using SizeType = tensorrt_llm::runtime::SizeType; + + TrtGptModelOptionalParams() + : mMaxNumSequences(std::nullopt) + , mMaxTokensInPagedKvCache(std::nullopt) + , mKvCacheFreeGpuMemFraction(std::nullopt) + { + } + + TrtGptModelOptionalParams(std::optional maxNumSequences, std::optional maxTokensInPagedKvCache, + std::optional kvCacheFreeGpuMemFraction) + : mMaxNumSequences(maxNumSequences) + , mMaxTokensInPagedKvCache(maxTokensInPagedKvCache) + , mKvCacheFreeGpuMemFraction(kvCacheFreeGpuMemFraction) + { + } + + [[nodiscard]] std::optional getMaxTokensInPagedKvCache() const + { + return mMaxTokensInPagedKvCache; + } + + [[nodiscard]] std::optional getKvCacheFreeGpuMemFraction() const + { + return mKvCacheFreeGpuMemFraction; + } + + [[nodiscard]] std::optional getMaxNumSequences() const + { + return mMaxNumSequences; + } + +private: + std::optional mMaxNumSequences; + std::optional mMaxTokensInPagedKvCache; + std::optional mKvCacheFreeGpuMemFraction; +}; + +} // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/runtime/gptJsonConfig.h b/cpp/include/tensorrt_llm/runtime/gptJsonConfig.h index efd7a590a1c..5226fd0e5b5 100644 --- a/cpp/include/tensorrt_llm/runtime/gptJsonConfig.h +++ b/cpp/include/tensorrt_llm/runtime/gptJsonConfig.h @@ -32,10 +32,12 @@ namespace tensorrt_llm::runtime class GptJsonConfig { public: - GptJsonConfig(std::string name, std::string precision, SizeType worldSize, GptModelConfig const& modelConfig) + GptJsonConfig(std::string name, std::string precision, SizeType tensorParallelism, SizeType pipelineParallelism, + GptModelConfig const& modelConfig) : mName(std::move(name)) , mPrecision(std::move(precision)) - , mWorldSize{worldSize} + , mTensorParallelism{tensorParallelism} + , mPipelineParallelism{pipelineParallelism} , mGptModelConfig(modelConfig) { } @@ -61,9 +63,19 @@ class GptJsonConfig return mPrecision; } - [[nodiscard]] SizeType const& getWorldSize() const + [[nodiscard]] SizeType constexpr getTensorParallelism() const { - return mWorldSize; + return mTensorParallelism; + } + + [[nodiscard]] SizeType constexpr getPipelineParallelism() const + { + return mPipelineParallelism; + } + + [[nodiscard]] SizeType constexpr getWorldSize() const + { + return mTensorParallelism * mPipelineParallelism; } [[nodiscard]] std::string engineFilename(WorldConfig const& worldConfig, std::string const& model) const; @@ -76,7 +88,8 @@ class GptJsonConfig private: std::string const mName; std::string const mPrecision; - SizeType const mWorldSize; + SizeType const mTensorParallelism; + SizeType const mPipelineParallelism; GptModelConfig const mGptModelConfig; }; diff --git a/cpp/include/tensorrt_llm/runtime/gptModelConfig.h b/cpp/include/tensorrt_llm/runtime/gptModelConfig.h index 863aa35c4ea..5c3892b6ae8 100644 --- a/cpp/include/tensorrt_llm/runtime/gptModelConfig.h +++ b/cpp/include/tensorrt_llm/runtime/gptModelConfig.h @@ -35,11 +35,13 @@ class GptModelConfig , mHiddenSize(hiddenSize) , mDataType(dtype) , mUseGptAttentionPlugin(false) - , mUseInflightBatching(false) , mInputPacked{false} , mPagedKvCache{false} , mTokensPerBlock{64} , mQuantMode{common::QuantMode::none()} + , mMaxBatchSize(0) + , mMaxInputLen(0) + , mMaxOutputLen(0) { } @@ -53,9 +55,10 @@ class GptModelConfig return 
(mVocabSize + worldSize - 1) / worldSize * worldSize; } - [[nodiscard]] SizeType constexpr getNbLayers() const noexcept + [[nodiscard]] SizeType constexpr getNbLayers(SizeType pipelineParallelism = 1) const { - return mNbLayers; + TLLM_CHECK(mNbLayers % pipelineParallelism == 0); + return mNbLayers / pipelineParallelism; } [[nodiscard]] SizeType constexpr getNbHeads() const noexcept @@ -138,14 +141,39 @@ class GptModelConfig mQuantMode = QuantMode; } - [[nodiscard]] bool constexpr useInflightBatching() const noexcept + [[nodiscard]] bool constexpr supportsInflightBatching() const noexcept { - return mUseInflightBatching; + return mUseGptAttentionPlugin && mInputPacked && mPagedKvCache; } - void constexpr useInflightBatching(bool useInflightBatching) noexcept + [[nodiscard]] SizeType constexpr getMaxBatchSize() const noexcept { - mUseInflightBatching = useInflightBatching; + return mMaxBatchSize; + } + + void constexpr setMaxBatchSize(SizeType maxBatchSize) noexcept + { + mMaxBatchSize = maxBatchSize; + } + + [[nodiscard]] SizeType constexpr getMaxInputLen() const noexcept + { + return mMaxInputLen; + } + + void constexpr setMaxInputLen(SizeType maxInputLen) noexcept + { + mMaxInputLen = maxInputLen; + } + + [[nodiscard]] SizeType constexpr getMaxOutputLen() const noexcept + { + return mMaxOutputLen; + } + + void constexpr setMaxOutputLen(SizeType maxOutputLen) noexcept + { + mMaxOutputLen = maxOutputLen; } private: @@ -156,11 +184,13 @@ class GptModelConfig SizeType mHiddenSize; nvinfer1::DataType mDataType; bool mUseGptAttentionPlugin; - bool mUseInflightBatching; bool mInputPacked; bool mPagedKvCache; SizeType mTokensPerBlock; common::QuantMode mQuantMode; + SizeType mMaxBatchSize; + SizeType mMaxInputLen; + SizeType mMaxOutputLen; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/gptSession.h b/cpp/include/tensorrt_llm/runtime/gptSession.h index def229827fd..dca47a446cd 100644 --- a/cpp/include/tensorrt_llm/runtime/gptSession.h +++ b/cpp/include/tensorrt_llm/runtime/gptSession.h @@ -25,13 +25,13 @@ #include "tensorrt_llm/runtime/samplingConfig.h" #include "tensorrt_llm/runtime/worldConfig.h" +#include + #include #include #include #include -#include - namespace tensorrt_llm::batch_manager::kv_cache_manager { class KVCacheManager; @@ -47,6 +47,7 @@ std::vector loadEngine(std::string const& enginePath); class TllmRuntime; class IStatefulGptDecoder; +class NcclCommunicator; class RuntimeBuffers; class GptSession @@ -109,6 +110,9 @@ class GptSession void createContexts(); void createDecoder(bool decoderPerRequest); + bool executeDecoderStep(ITensor::SharedPtr& outputIds, ITensor::SharedPtr& newTokens, SizeType decoderStep); + void finalizeOutputIds(ITensor& outputIds); + class CudaGraphExecutor { public: @@ -131,13 +135,15 @@ class GptSession return mInstance != nullptr; } + void clear(); + void prepareNextGraph(TllmRuntime const& runtime, SizeType nextContextId); + void launch(CudaStream const& stream); + + private: void create(cudaGraph_t const& graph); bool update(cudaGraph_t const& graph); void uploadToStream(CudaStream const& stream); - void launch(CudaStream const& stream); - void clear(); - private: using cudaGraphExecPtr = cudaGraphExec_t; cudaGraphExecPtr mInstance; }; @@ -146,6 +152,7 @@ class GptSession GptModelConfig const mModelConfig; WorldConfig const mWorldConfig; int mDevice{-1}; + std::shared_ptr mPipelineComm; SizeType mDecoderMaxSequenceLength{}; diff --git a/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h 
b/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h index b6f96dfa8d0..507798e5fb8 100644 --- a/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h +++ b/cpp/include/tensorrt_llm/runtime/iStatefulGptDecoder.h @@ -61,6 +61,7 @@ class Output // parameters for beam search TensorPtr cacheIndirection; // [batchSize, maxBeamWidth, maxSeqLen], mandatory in beam search, on gpu + TensorPtr sequenceLengths; // [batchSize, maxBeamWidth], mandatory, on gpu }; } // namespace decoder diff --git a/cpp/include/tensorrt_llm/runtime/tllmLogger.h b/cpp/include/tensorrt_llm/runtime/tllmLogger.h index 53b20a3a6c2..5ffd39b7893 100644 --- a/cpp/include/tensorrt_llm/runtime/tllmLogger.h +++ b/cpp/include/tensorrt_llm/runtime/tllmLogger.h @@ -18,9 +18,7 @@ #include -namespace tensorrt_llm -{ -namespace runtime +namespace tensorrt_llm::runtime { class TllmLogger : public nvinfer1::ILogger @@ -33,5 +31,4 @@ class TllmLogger : public nvinfer1::ILogger void setLevel(Severity level); }; -} // namespace runtime -} // namespace tensorrt_llm +} // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/worldConfig.h b/cpp/include/tensorrt_llm/runtime/worldConfig.h index 0fbf305be0b..41193f098f3 100644 --- a/cpp/include/tensorrt_llm/runtime/worldConfig.h +++ b/cpp/include/tensorrt_llm/runtime/worldConfig.h @@ -19,6 +19,8 @@ #include "tensorrt_llm/runtime/common.h" #include +#include +#include namespace tensorrt_llm::runtime { @@ -27,9 +29,10 @@ class WorldConfig public: static SizeType constexpr kDefaultGpusPerNode = 8; - constexpr explicit WorldConfig( - SizeType worldSize = 1, SizeType rank = 0, SizeType gpusPerNode = kDefaultGpusPerNode) - : mSize{worldSize} + constexpr explicit WorldConfig(SizeType tensorParallelism = 1, SizeType pipelineParallelism = 1, SizeType rank = 0, + SizeType gpusPerNode = kDefaultGpusPerNode) + : mTensorParallelism{tensorParallelism} + , mPipelineParallelism{pipelineParallelism} , mRank{rank} , mGpusPerNode{gpusPerNode} { @@ -37,7 +40,22 @@ class WorldConfig [[nodiscard]] SizeType constexpr getSize() const noexcept { - return mSize; + return mTensorParallelism * mPipelineParallelism; + } + + [[nodiscard]] SizeType constexpr getTensorParallelism() const noexcept + { + return mTensorParallelism; + } + + [[nodiscard]] SizeType constexpr getPipelineParallelism() const noexcept + { + return mPipelineParallelism; + } + + [[nodiscard]] bool constexpr isPipelineParallel() const noexcept + { + return mPipelineParallelism > 1; } [[nodiscard]] SizeType constexpr getRank() const noexcept @@ -55,12 +73,39 @@ class WorldConfig return mRank % mGpusPerNode; } - static WorldConfig mpi(nvinfer1::ILogger& logger, SizeType gpusPerNode = kDefaultGpusPerNode); + [[nodiscard]] SizeType constexpr getPipelineParallelRank() const noexcept + { + return mRank / mTensorParallelism; + } + + [[nodiscard]] SizeType constexpr getTensorParallelRank() const noexcept + { + return mRank % mTensorParallelism; + } + + [[nodiscard]] bool constexpr isFirstPipelineParallelRank() const noexcept + { + return getPipelineParallelRank() == 0; + } + + [[nodiscard]] bool constexpr isLastPipelineParallelRank() const noexcept + { + return getPipelineParallelRank() == getPipelineParallelism() - 1; + } + + [[nodiscard]] std::vector getPipelineParallelGroup() const; + + static WorldConfig mpi(nvinfer1::ILogger& logger, SizeType gpusPerNode = kDefaultGpusPerNode, + std::optional tensorParallelism = std::nullopt, + std::optional pipelineParallelism = std::nullopt); - static WorldConfig mpi(SizeType gpusPerNode = 
kDefaultGpusPerNode); + static WorldConfig mpi(SizeType gpusPerNode = kDefaultGpusPerNode, + std::optional tensorParallelism = std::nullopt, + std::optional pipelineParallelism = std::nullopt); private: - SizeType mSize; + SizeType mTensorParallelism; + SizeType mPipelineParallelism; SizeType mRank; SizeType mGpusPerNode; }; diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index 49fb5409580..2e6c1acf139 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -24,8 +24,8 @@ set(STATIC_TARGET set(API_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) find_package(MPI REQUIRED) -message(STATUS "Using MPI_INCLUDE_PATH: ${MPI_INCLUDE_PATH}") -message(STATUS "Using MPI_LIBRARIES: ${MPI_LIBRARIES}") +message(STATUS "Using MPI_CXX_INCLUDE_DIRS: ${MPI_CXX_INCLUDE_DIRS}") +message(STATUS "Using MPI_CXX_LIBRARIES: ${MPI_CXX_LIBRARIES}") include_directories(${CMAKE_CURRENT_SOURCE_DIR}/cutlass_extensions/include ${API_INCLUDE_DIR} ${MPI_INCLUDE_PATH}) @@ -66,10 +66,9 @@ endif() set(TRTLLM_LINK_LIBS ${CUBLAS_LIB} ${CUBLASLT_LIB} - ${CUDART_LIB} ${CUDNN_LIB} ${CMAKE_DL_LIBS} - ${MPI_LIBRARIES} + ${MPI_CXX_LIBRARIES} ${TRT_LIB} common_src kernels_src @@ -86,8 +85,14 @@ set_target_properties( ${SHARED_TARGET} PROPERTIES CXX_STANDARD "17" CXX_STANDARD_REQUIRED "YES" CXX_EXTENSIONS "NO") +if(NOT MSVC) # Unix-like compilers + set(ALLOW_UNDEFINED_FLAG "-Wl, --no-undefined") +else() # MSVC + set(UNDEFINED_FLAG "") +endif() + target_link_libraries(${SHARED_TARGET} PUBLIC ${TRTLLM_LINK_LIBS} - "-Wl,--no-undefined") + ${UNDEFINED_FLAG}) # ################################# STATIC LIBRARY # ############################################################################## diff --git a/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.a deleted file mode 100644 index c8105658ffa..00000000000 --- a/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.a +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5a549ddc6871f9499ee1321bd66d8d30c291af3f6320c7a1c6b9276a19bad62a -size 10941362 diff --git a/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.pre_cxx11.a deleted file mode 100644 index 85380f34812..00000000000 --- a/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5a2c0d092b5f2bfa4d57528994048b8bf00010a502e543c2cc26a41f3d788ae4 -size 10932338 diff --git a/cpp/tensorrt_llm/common/assert.h b/cpp/tensorrt_llm/common/assert.h index a54ebba1307..1c4bca699b6 100644 --- a/cpp/tensorrt_llm/common/assert.h +++ b/cpp/tensorrt_llm/common/assert.h @@ -30,7 +30,11 @@ namespace tensorrt_llm::common } // namespace tensorrt_llm::common +#if defined(_WIN32) +#define TLLM_LIKELY(x) (__assume((x) == 1), (x)) +#else #define TLLM_LIKELY(x) __builtin_expect((x), 1) +#endif #define TLLM_CHECK(val) \ do \ diff --git a/cpp/tensorrt_llm/common/cublasMMWrapper.cpp b/cpp/tensorrt_llm/common/cublasMMWrapper.cpp index d7a660f1e81..07e50a9ecbc 100644 --- a/cpp/tensorrt_llm/common/cublasMMWrapper.cpp +++ b/cpp/tensorrt_llm/common/cublasMMWrapper.cpp @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/cublasMMWrapper.h" +#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/cublasVersionCheck.h" #include @@ -70,38 +71,48 @@ void 
cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, c Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f); } +void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, + const void* A, const int lda, const void* B, const int ldb, void* C, const int ldc, + const std::optional& heuristic) +{ + if (heuristic) + { + Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f, (*heuristic).algo, + (*heuristic).state == CUBLAS_STATUS_SUCCESS && (*heuristic).workspaceSize < CUBLAS_WORKSPACE_SIZE); + } + else + { + Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f, {}, false); + } +} + void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, const void* A, const int lda, const void* B, const int ldb, void* C, const int ldc, float f_alpha, float f_beta) { - half h_alpha = (half) (f_alpha); - half h_beta = (half) (f_beta); + bool usingCublasLt = Atype_ == CUDA_R_16F; + bool isFp16ComputeType = computeType_ == CUDA_R_16F; - mu_->lock(); - // TODO: default cublas libs - int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; - bool using_cublasLt = (Atype_ == CUDA_R_16F) ? true : false; int batch_count = 1; - // fp32 use cublas as default - // fp16 use cublasLt as default - const void* alpha = is_fp16_computeType ? reinterpret_cast(&h_alpha) : reinterpret_cast(&f_alpha); - const void* beta = is_fp16_computeType ? reinterpret_cast(&h_beta) : reinterpret_cast(&f_beta); - int findAlgo = cublas_algo_map_->isExist(batch_count, m, n, k, getCublasDataType(Atype_)); cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_)); + + cublasLtMatmulAlgo_t algo; + void* workSpace = cublas_workspace_; + int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE; if (findAlgo) { if (info.stages != -1) { - using_cublasLt = true; + usingCublasLt = true; } else { - using_cublasLt = false; + usingCublasLt = false; } } - if (using_cublasLt) + if (usingCublasLt) { cublasLtMatmulDesc_t operationDesc = NULL; cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; @@ -112,7 +123,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, c cudaDataType_t computeType; #endif - if (is_fp16_computeType) + if (isFp16ComputeType) { #if (CUDART_VERSION >= 11000) computeType = CUBLAS_COMPUTE_16F; @@ -131,23 +142,6 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, c scaleType = CUDA_R_32F; } - // -------------------------------------- - // Create descriptors for the original matrices - cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda); - cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb); - cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType); -#else - cublasLtMatmulDescCreate(&operationDesc, computeType); -#endif - - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t)); - - cublasLtMatmulAlgo_t algo; - void* workSpace = cublas_workspace_; - int workspaceSize = cublas_workspace_ == NULL ? 
0 : CUBLAS_WORKSPACE_SIZE; if (findAlgo) { if (info.workspaceSize > workspaceSize) @@ -174,9 +168,103 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, c #endif } } + } + + Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, f_alpha, f_beta, algo, findAlgo); +} - cublasLtMatmul(*cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, Cdesc, - (findAlgo == 1 ? (&algo) : NULL), workSpace, workspaceSize, stream_); +void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, + const void* A, const int lda, const void* B, const int ldb, void* C, const int ldc, float f_alpha, float f_beta, + const cublasLtMatmulAlgo_t& algo, bool hasAlgo) +{ + half h_alpha = (half) (f_alpha); + half h_beta = (half) (f_beta); + + std::lock_guard lock(*mu_); + + // TODO: default cublas libs + bool usingCublasLt = Atype_ == CUDA_R_16F; + bool isFp16ComputeType = computeType_ == CUDA_R_16F; + int batch_count = 1; + // fp32 use cublas as default + // fp16 use cublasLt as default + const void* alpha = isFp16ComputeType ? reinterpret_cast(&h_alpha) : reinterpret_cast(&f_alpha); + const void* beta = isFp16ComputeType ? reinterpret_cast(&h_beta) : reinterpret_cast(&f_beta); + if (hasAlgo) + { + int32_t stages; + cublasLtMatmulAlgoConfigGetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL); + if (stages != -1) + { + usingCublasLt = true; + } + else + { + usingCublasLt = false; + } + } + + if (usingCublasLt) + { + cublasLtMatmulDesc_t operationDesc = NULL; + cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; + cudaDataType_t scaleType; +#if (CUDART_VERSION >= 11000) + cublasComputeType_t computeType; +#else + cudaDataType_t computeType; +#endif + + if (isFp16ComputeType) + { +#if (CUDART_VERSION >= 11000) + computeType = CUBLAS_COMPUTE_16F; +#else + computeType = CUDA_R_16F; +#endif + scaleType = CUDA_R_16F; + } + else + { +#if (CUDART_VERSION >= 11000) + computeType = CUBLAS_COMPUTE_32F; +#else + computeType = CUDA_R_32F; +#endif + scaleType = CUDA_R_32F; + } + // -------------------------------------- + // Create descriptors for the original matrices + cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda); + cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb); + cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc); +#if (CUDART_VERSION >= 11000) + cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType); +#else + cublasLtMatmulDescCreate(&operationDesc, computeType); +#endif + cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t)); + cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t)); + + void* workSpace = cublas_workspace_; + int workspaceSize = cublas_workspace_ == NULL ? 
0 : CUBLAS_WORKSPACE_SIZE; + if (hasAlgo) + { + cublasLtMatmulHeuristicResult_t heurResult; + // We have to check if the heruistic is correct given current shape size + cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck( + getCublasLtHandle(), operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult); + + if (algoStatus != CUBLAS_STATUS_SUCCESS || heurResult.state != CUBLAS_STATUS_SUCCESS + || heurResult.workspaceSize > CUBLAS_WORKSPACE_SIZE) + { + // Rely on runtime based heruistic + hasAlgo = false; + } + } + + check_cuda_error(cublasLtMatmul(*cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, + Cdesc, (hasAlgo ? (&algo) : NULL), workSpace, workspaceSize, stream_)); cublasLtMatmulDescDestroy(operationDesc); cublasLtMatrixLayoutDestroy(Adesc); @@ -186,12 +274,12 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa, cublasOperation_t transb, c } else { - int cublasAlgo = info.algoId; + // Go with default heruistic to choose tactic as cuBLAS does not allow to choose tactics in Ampere+ + cublasGemmAlgo_t cublasAlgo = CUBLAS_GEMM_DEFAULT; check_cuda_error(cublasGemmEx(*cublas_handle_, transa, transb, m, n, k, alpha, A, Atype_, lda, B, Btype_, ldb, beta, C, Ctype_, ldc, computeType_, static_cast(cublasAlgo))); sync_check_cuda_error(); } - mu_->unlock(); } void cublasMMWrapper::setWorkspace(void* workspace) @@ -201,27 +289,25 @@ void cublasMMWrapper::setWorkspace(void* workspace) void cublasMMWrapper::setFP32GemmConfig() { - Atype_ = CUDA_R_32F; - Btype_ = CUDA_R_32F; - Ctype_ = CUDA_R_32F; - computeType_ = CUDA_R_32F; + setGemmConfig(CUDA_R_32F, CUDA_R_32F, CUDA_R_32F, CUDA_R_32F); } void cublasMMWrapper::setFP16GemmConfig() { - Atype_ = CUDA_R_16F; - Btype_ = CUDA_R_16F; - Ctype_ = CUDA_R_16F; - computeType_ = CUDA_R_32F; + setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); } #ifdef ENABLE_BF16 void cublasMMWrapper::setBF16GemmConfig() { - Atype_ = CUDA_R_16BF; - Btype_ = CUDA_R_16BF; - Ctype_ = CUDA_R_16BF; - computeType_ = CUDA_R_32F; + setGemmConfig(CUDA_R_16BF, CUDA_R_16BF, CUDA_R_16BF, CUDA_R_32F); +} +#endif + +#ifdef ENABLE_FP8 +void cublasMMWrapper::setFP8GemmConfig(cudaDataType_t outputType) +{ + setGemmConfig(CUDA_R_8F_E4M3, CUDA_R_8F_E4M3, outputType, CUDA_R_32F); } #endif @@ -410,20 +496,150 @@ bool cublasMMWrapper::isFuseBatchGemm(const int batch_count, const int m, const } } -std::pair cublasMMWrapper::findBestAlgo(cublasLtHandle_t lightHandle, - cublasLtMatmulDesc_t computeDesc, const void* alpha, const void* A, cublasLtMatrixLayout_t Adesc, const void* B, - cublasLtMatrixLayout_t Bdesc, const void* beta, const void* C, cublasLtMatrixLayout_t Cdesc, void* D, - cublasLtMatrixLayout_t Ddesc, cudaStream_t stream) +std::vector cublasMMWrapper::getTactics(cublasOperation_t transa, + cublasOperation_t transb, const int m, const int n, const int k, const int lda, const int ldb, const int ldc) +{ + int is_fp16_computeType = computeType_ == CUDA_R_16F ? 
1 : 0; + cublasLtMatmulDesc_t operationDesc = NULL; + cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL; + cudaDataType_t scaleType; +#if (CUDART_VERSION >= 11000) + cublasComputeType_t computeType; +#else + cudaDataType_t computeType; +#endif + + if (is_fp16_computeType) + { +#if (CUDART_VERSION >= 11000) + computeType = CUBLAS_COMPUTE_16F; +#else + computeType = CUDA_R_16F; +#endif + scaleType = CUDA_R_16F; + } + else + { +#if (CUDART_VERSION >= 11000) + computeType = CUBLAS_COMPUTE_32F; +#else + computeType = CUDA_R_32F; +#endif + scaleType = CUDA_R_32F; + } + + // -------------------------------------- + // Create descriptors for the original matrices + check_cuda_error( + cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda)); + check_cuda_error( + cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb)); + check_cuda_error(cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc)); +#if (CUDART_VERSION >= 11000) + check_cuda_error(cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType)); +#else + check_cuda_error(cublasLtMatmulDescCreate(&operationDesc, computeType)); +#endif + + check_cuda_error( + cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t))); + check_cuda_error( + cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t))); + + void* workSpace = cublas_workspace_; + int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE; + + const auto heuristics = getTactics(getCublasLtHandle(), operationDesc, Adesc, Bdesc, Cdesc, Cdesc); + + check_cuda_error(cublasLtMatmulDescDestroy(operationDesc)); + check_cuda_error(cublasLtMatrixLayoutDestroy(Adesc)); + check_cuda_error(cublasLtMatrixLayoutDestroy(Bdesc)); + check_cuda_error(cublasLtMatrixLayoutDestroy(Cdesc)); + sync_check_cuda_error(); + + return heuristics; +} + +bool cublasMMWrapper::checkTactic(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, + const int k, const int lda, const int ldb, const int ldc, const cublasLtMatmulHeuristicResult_t& heuristic) const +{ + int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; + cublasLtMatmulDesc_t operationDesc = NULL; + cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL; + cudaDataType_t scaleType; +#if (CUDART_VERSION >= 11000) + cublasComputeType_t computeType; +#else + cudaDataType_t computeType; +#endif + + if (is_fp16_computeType) + { +#if (CUDART_VERSION >= 11000) + computeType = CUBLAS_COMPUTE_16F; +#else + computeType = CUDA_R_16F; +#endif + scaleType = CUDA_R_16F; + } + else + { +#if (CUDART_VERSION >= 11000) + computeType = CUBLAS_COMPUTE_32F; +#else + computeType = CUDA_R_32F; +#endif + scaleType = CUDA_R_32F; + } + + // -------------------------------------- + // Create descriptors for the original matrices + check_cuda_error( + cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda)); + check_cuda_error( + cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? 
n : k, ldb)); + check_cuda_error(cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc)); +#if (CUDART_VERSION >= 11000) + check_cuda_error(cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType)); +#else + check_cuda_error(cublasLtMatmulDescCreate(&operationDesc, computeType)); +#endif + + check_cuda_error( + cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t))); + check_cuda_error( + cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t))); + + void* workSpace = cublas_workspace_; + int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE; + + cublasLtMatmulHeuristicResult_t heurResult; + cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck( + getCublasLtHandle(), operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &heuristic.algo, &heurResult); + + if (algoStatus != CUBLAS_STATUS_SUCCESS || heurResult.state != CUBLAS_STATUS_SUCCESS + || heurResult.workspaceSize > CUBLAS_WORKSPACE_SIZE) + { + return false; + } + + check_cuda_error(cublasLtMatmulDescDestroy(operationDesc)); + check_cuda_error(cublasLtMatrixLayoutDestroy(Adesc)); + check_cuda_error(cublasLtMatrixLayoutDestroy(Bdesc)); + check_cuda_error(cublasLtMatrixLayoutDestroy(Cdesc)); + sync_check_cuda_error(); + + return true; +} + +std::vector cublasMMWrapper::getTactics(cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t computeDesc, cublasLtMatrixLayout_t Adesc, cublasLtMatrixLayout_t Bdesc, + cublasLtMatrixLayout_t Cdesc, cublasLtMatrixLayout_t Ddesc) { #if TLLM_CUBLAS_VER_LE(11, 4, 2) TLLM_CHECK_WITH_INFO(false, "CUBLAS version too low, must be > 11.4.2."); - return {false, cublasLtMatmulAlgo_t{}}; + return {}; #else - size_t returnSize; - int32_t pointer_mode; - cublasLtMatmulDescGetAttribute( - computeDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode), &returnSize); - std::vector heuristics(200); cublasLtMatmulPreference_t preference; check_cuda_error(cublasLtMatmulPreferenceCreate(&preference)); @@ -431,6 +647,10 @@ std::pair cublasMMWrapper::findBestAlgo(cublasLtHand uint64_t workspace_size = CUBLAS_WORKSPACE_SIZE; check_cuda_error(cublasLtMatmulPreferenceSetAttribute( preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); + // Restrict reduction algorithms for numerical stability and better determenism + uint32_t reduction_mask = CUBLASLT_REDUCTION_SCHEME_INPLACE; + check_cuda_error(cublasLtMatmulPreferenceSetAttribute( + preference, CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, &reduction_mask, sizeof(reduction_mask))); #if TLLM_CUBLAS_VER_LT(12, 0, 0) uint32_t pointer_mode_mask = 0; check_cuda_error(cublasLtMatmulPreferenceSetAttribute( @@ -438,10 +658,30 @@ std::pair cublasMMWrapper::findBestAlgo(cublasLtHand #endif int return_count = 0; - auto ret = cublasLtMatmulAlgoGetHeuristic(lightHandle, computeDesc, Adesc, Bdesc, Cdesc, Ddesc, preference, - heuristics.size(), heuristics.data(), &return_count); + check_cuda_error(cublasLtMatmulAlgoGetHeuristic(lightHandle, computeDesc, Adesc, Bdesc, Cdesc, Ddesc, preference, + heuristics.size(), heuristics.data(), &return_count)); heuristics.resize(return_count); + return heuristics; +#endif +} + +std::pair cublasMMWrapper::findBestAlgo(cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t computeDesc, const void* alpha, const void* A, cublasLtMatrixLayout_t Adesc, const void* B, + cublasLtMatrixLayout_t Bdesc, const void* beta, const void* C, cublasLtMatrixLayout_t Cdesc, void* D, + 
cublasLtMatrixLayout_t Ddesc, cudaStream_t stream) +{ +#if TLLM_CUBLAS_VER_LE(11, 4, 2) + TLLM_CHECK_WITH_INFO(false, "CUBLAS version too low, must be > 11.4.2."); + return {false, cublasLtMatmulAlgo_t{}}; +#else + size_t returnSize; + int32_t pointer_mode; + cublasLtMatmulDescGetAttribute( + computeDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode), &returnSize); + + const auto heuristics = getTactics(lightHandle, computeDesc, Adesc, Bdesc, Cdesc, Ddesc); + std::map> algo_results; for (const auto& heuristic : heuristics) { diff --git a/cpp/tensorrt_llm/common/cublasMMWrapper.h b/cpp/tensorrt_llm/common/cublasMMWrapper.h index d1302d46e35..1fd8b64a0b0 100644 --- a/cpp/tensorrt_llm/common/cublasMMWrapper.h +++ b/cpp/tensorrt_llm/common/cublasMMWrapper.h @@ -23,6 +23,7 @@ #include #include #include +#include #include namespace tensorrt_llm @@ -65,6 +66,16 @@ class cublasMMWrapper const void* beta, const void* C, cublasLtMatrixLayout_t Cdesc, void* D, cublasLtMatrixLayout_t Ddesc, const cublasLtMatmulAlgo_t* algo, void* workspace, size_t workspaceSizeInBytes, cudaStream_t stream); + bool checkTactic(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, + const int lda, const int ldb, const int ldc, const cublasLtMatmulHeuristicResult_t& algo) const; + + std::vector getTactics(cublasOperation_t transa, cublasOperation_t transb, + const int m, const int n, const int k, const int lda, const int ldb, const int ldc); + + std::vector getTactics(cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t computeDesc, cublasLtMatrixLayout_t Adesc, cublasLtMatrixLayout_t Bdesc, + cublasLtMatrixLayout_t Cdesc, cublasLtMatrixLayout_t Ddesc); + std::pair findBestAlgo(cublasLtHandle_t lightHandle, cublasLtMatmulDesc_t computeDesc, const void* alpha, const void* A, cublasLtMatrixLayout_t Adesc, const void* B, cublasLtMatrixLayout_t Bdesc, const void* beta, const void* C, cublasLtMatrixLayout_t Cdesc, void* D, cublasLtMatrixLayout_t Ddesc, @@ -83,9 +94,17 @@ class cublasMMWrapper void Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, const void* A, const int lda, const void* B, const int ldb, void* C, const int ldc); + void Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, const void* A, + const int lda, const void* B, const int ldb, void* C, const int ldc, + const std::optional& algo); + void Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, const void* A, const int lda, const void* B, const int ldb, void* C, const int ldc, float f_alpha, float f_beta); + void Gemm(cublasOperation_t transa, cublasOperation_t transb, const int m, const int n, const int k, const void* A, + const int lda, const void* B, const int ldb, void* C, const int ldc, float f_alpha, float f_beta, + const cublasLtMatmulAlgo_t& algo, bool hasAlgo); + void setWorkspace(void* workspace); void Int8Gemm(const int m, const int n, const int k, const int8_t* A, const int lda, const int8_t* B, const int ldb, @@ -99,6 +118,10 @@ class cublasMMWrapper #ifdef ENABLE_BF16 void setBF16GemmConfig(); #endif +#ifdef ENABLE_FP8 + void setFP8GemmConfig(cudaDataType_t outputType = CUDA_R_16F); +#endif + void setStream(cudaStream_t stream); void setGemmConfig(cudaDataType_t aType, cudaDataType_t bType, cudaDataType_t cType, cudaDataType_t computeType); @@ -131,7 +154,7 @@ class cublasMMWrapper return *(this->cublas_handle_); } - cublasLtHandle_t getCublasLtHandle() + 
cublasLtHandle_t getCublasLtHandle() const { return *(this->cublaslt_handle_); } diff --git a/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh b/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh index 47827e42a6b..13a6165dc4b 100644 --- a/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh +++ b/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh @@ -18,6 +18,7 @@ #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include +#include namespace tensorrt_llm { @@ -198,6 +199,7 @@ inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x) } #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) +#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020) inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hmul2(x, y); @@ -215,7 +217,7 @@ inline __device__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __n t.y = y; return t; } - +#endif #endif inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp b/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp index 43199d7194f..bbee01c3679 100644 --- a/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp +++ b/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp @@ -16,10 +16,17 @@ #define CUDA_LIB_NAME "cuda" +#if defined(_WIN32) +#include +#define dllOpen(name) LoadLibrary("nv" name ".dll") +#define dllClose(handle) FreeLibrary(static_cast(handle)) +#define dllGetSym(handle, name) static_cast(GetProcAddress(static_cast(handle), name)) +#else // For non-Windows platforms #include #define dllOpen(name) dlopen("lib" name ".so.1", RTLD_LAZY) #define dllClose(handle) dlclose(handle) #define dllGetSym(handle, name) dlsym(handle, name) +#endif // defined(_WIN32) #include "cudaDriverWrapper.h" #include "tensorrt_llm/common/assert.h" diff --git a/cpp/tensorrt_llm/common/cudaTypeUtils.cuh b/cpp/tensorrt_llm/common/cudaTypeUtils.cuh index 027a8fb6909..64f26b430f9 100644 --- a/cpp/tensorrt_llm/common/cudaTypeUtils.cuh +++ b/cpp/tensorrt_llm/common/cudaTypeUtils.cuh @@ -19,8 +19,12 @@ #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" +#include #include #include +#if ENABLE_BF16 +#include +#endif namespace tensorrt_llm { @@ -508,7 +512,11 @@ __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, half2>(half2 val) #endif // ENABLE BF16 template -__device__ inline T cuda_abs(T val); +__device__ inline T cuda_abs(T val) +{ + assert(false); + return {}; +} template <> __device__ inline float cuda_abs(float val) @@ -548,18 +556,6 @@ __device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) { return __habs2(val); } -#else -template <> -__device__ inline __nv_bfloat16 cuda_abs(__nv_bfloat16 val) -{ - return fabs(val); -} - -template <> -__device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) -{ - return make_bfloat162(fabs(val.x), fabs(val.y)); -} #endif #endif // ENABLE_FP16 diff --git a/cpp/tensorrt_llm/common/cudaUtils.h b/cpp/tensorrt_llm/common/cudaUtils.h index f457c1aabcd..d43a93030c8 100644 --- a/cpp/tensorrt_llm/common/cudaUtils.h +++ b/cpp/tensorrt_llm/common/cudaUtils.h @@ -20,6 +20,7 @@ #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/tllmException.h" +#include #include #include #include @@ -268,6 +269,15 @@ inline int getDeviceCount() return count; } +/// Get the memory info +/// \return The free and total amount of memory in bytes +inline std::tuple 
getDeviceMemoryInfo() +{ + size_t free, total; + check_cuda_error(cudaMemGetInfo(&free, &total)); + return {free, total}; +} + inline int getMultiProcessorCount() { int device_id; @@ -301,8 +311,15 @@ inline int divUp(int a, int n) return (a + n - 1) / n; } +template ::value>, + typename = std::enable_if_t::value>> +auto constexpr ceilDiv(T numerator, U denominator) +{ + return (numerator + denominator - 1) / denominator; +} + template -void printAbsMean(const T* buf, uint size, cudaStream_t stream, std::string name = "") +void printAbsMean(const T* buf, uint64_t size, cudaStream_t stream, std::string name = "") { if (buf == nullptr) { @@ -319,7 +336,7 @@ void printAbsMean(const T* buf, uint size, cudaStream_t stream, std::string name uint64_t zero_count = 0; float max_val = -1e10; bool find_inf = false; - for (uint i = 0; i < size; i++) + for (uint64_t i = 0; i < size; i++) { if (std::isinf((float) (h_tmp[i]))) { @@ -412,19 +429,24 @@ inline void print_element_(__nv_bfloat16 x) print_float_((float) x); } #endif -inline void print_element_(unsigned long long ull) +inline void print_element_(uint32_t ul) +{ + printf("%7" PRIu32, ul); +} + +inline void print_element_(uint64_t ull) { - printf("%7llu ", ull); + printf("%7" PRIu64, ull); } -inline void print_element_(int i) +inline void print_element_(int32_t il) { - printf("%7d ", i); + printf("%7" PRId32, il); } -inline void print_element_(size_t s) +inline void print_element_(int64_t ill) { - printf("%7ld ", s); + printf("%7" PRId64, ill); } template @@ -478,9 +500,9 @@ template void printMatrix(const half* ptr, int m, int k, int stride, bool is_dev #ifdef ENABLE_BF16 template void printMatrix(const __nv_bfloat16* ptr, int m, int k, int stride, bool is_device_ptr); #endif -template void printMatrix(const unsigned long long* ptr, int m, int k, int stride, bool is_device_ptr); +template void printMatrix(const uint32_t* ptr, int m, int k, int stride, bool is_device_ptr); +template void printMatrix(const uint64_t* ptr, int m, int k, int stride, bool is_device_ptr); template void printMatrix(const int* ptr, int m, int k, int stride, bool is_device_ptr); -template void printMatrix(const size_t* ptr, int m, int k, int stride, bool is_device_ptr); } // namespace tensorrt_llm::common diff --git a/cpp/tensorrt_llm/common/int8Utils.cuh b/cpp/tensorrt_llm/common/int8Utils.cuh deleted file mode 100644 index 3c2b01de5cd..00000000000 --- a/cpp/tensorrt_llm/common/int8Utils.cuh +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once -#include -#include -#include - -namespace tensorrt_llm -{ -namespace kernels -{ - -static inline __device__ int8_t float_to_int8_rn(float x) -{ - uint32_t dst; - asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x)); - return reinterpret_cast(dst); -} - -static inline __device__ uint32_t float4_to_char4(float x, float y, float z, float w) -{ - uint32_t dst; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 720 - uint32_t a; - asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(a) : "f"(x)); - uint32_t b; - asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(b) : "f"(y)); - uint32_t c; - asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(c) : "f"(z)); - uint32_t d; - asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(d) : "f"(w)); - - asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, 0;\n" : "=r"(dst) : "r"(d), "r"(c)); - asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, %0;\n" : "+r"(dst) : "r"(b), "r"(a)); -#else - char4 tmp; - tmp.x = x; - tmp.y = y; - tmp.z = z; - tmp.w = w; - dst = reinterpret_cast(tmp); -#endif - return dst; -} -} // namespace kernels -} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/common/logger.h b/cpp/tensorrt_llm/common/logger.h index 0f15599d339..9173e1b1ce9 100644 --- a/cpp/tensorrt_llm/common/logger.h +++ b/cpp/tensorrt_llm/common/logger.h @@ -30,6 +30,14 @@ namespace tensorrt_llm::common class Logger { +#if _WIN32 +// On Windows, the file wingdi.h is included which has +// #define ERROR 0 +// This breaks everywhere ERROR is used in the Level enum +// Alternative, untested solution to #undef: compile with NOGDI flag defined +#undef ERROR +#endif // _WIN32 + public: enum Level { diff --git a/cpp/tensorrt_llm/common/memoryUtils.cu b/cpp/tensorrt_llm/common/memoryUtils.cu index 730913153d6..f613414fc87 100644 --- a/cpp/tensorrt_llm/common/memoryUtils.cu +++ b/cpp/tensorrt_llm/common/memoryUtils.cu @@ -224,7 +224,7 @@ template void cudaAutoCpy(int* tgt, const int* src, size_t size, cudaStream_t st template void cudaAutoCpy(bool* tgt, const bool* src, size_t size, cudaStream_t stream); template void cudaAutoCpy(int8_t* tgt, const int8_t* src, size_t size, cudaStream_t stream); template void cudaAutoCpy(uint8_t* tgt, const uint8_t* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(uint* tgt, const uint* src, size_t size, cudaStream_t stream); +template void cudaAutoCpy(uint32_t* tgt, const uint32_t* src, size_t size, cudaStream_t stream); template void cudaAutoCpy(unsigned long long* tgt, const unsigned long long* src, size_t size, cudaStream_t stream); template void cudaAutoCpy(char* tgt, const char* src, size_t size, cudaStream_t stream); @@ -467,8 +467,8 @@ template void invokeCudaD2DcpyConvert(half* tgt, const int* src, const size_t si template void invokeCudaD2DcpyConvert(float* tgt, const float* src, const size_t size, cudaStream_t stream); template void invokeCudaD2DcpyConvert(half* tgt, const float* src, const size_t size, cudaStream_t stream); template void invokeCudaD2DcpyConvert(float* tgt, const half* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(uint* tgt, const int* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, const uint* src, const size_t size, cudaStream_t stream); +template void invokeCudaD2DcpyConvert(uint32_t* tgt, const int* src, const size_t size, cudaStream_t stream); +template void invokeCudaD2DcpyConvert(int* tgt, const uint32_t* src, const size_t size, cudaStream_t stream); template void 
invokeCudaD2DcpyConvert(int* tgt, const float* src, const size_t size, cudaStream_t stream); template void invokeCudaD2DcpyConvert(int* tgt, const half* src, const size_t size, cudaStream_t stream); diff --git a/cpp/tensorrt_llm/common/nvtxUtils.h b/cpp/tensorrt_llm/common/nvtxUtils.h new file mode 100644 index 00000000000..ed2065e551c --- /dev/null +++ b/cpp/tensorrt_llm/common/nvtxUtils.h @@ -0,0 +1,44 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace tensorrt_llm::common::nvtx +{ +inline nvtx3::color nextColor() +{ +#if !defined(NVTX_DISABLE) + constexpr std::array kColors{nvtx3::color{0xff00ff00}, nvtx3::color{0xff0000ff}, nvtx3::color{0xffffff00}, + nvtx3::color{0xffff00ff}, nvtx3::color{0xff00ffff}, nvtx3::color{0xffff0000}, nvtx3::color{0xffffffff}}; + constexpr auto numColors = kColors.size(); + + static thread_local int colorId = 0; + auto const color = kColors[colorId]; + colorId = colorId + 1 >= numColors ? 0 : colorId + 1; + return color; +#else + return nvtx3::color{0}; +#endif +} + +} // namespace tensorrt_llm::common::nvtx + +#define NVTX3_SCOPED_RANGE(range) ::nvtx3::scoped_range range##_range(::tensorrt_llm::common::nvtx::nextColor(), #range) diff --git a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh index 82e4e8c9b00..37501c5f601 100644 --- a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh +++ b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh @@ -72,7 +72,7 @@ __device__ inline void copy(const void* local, void* data) *out = *in; } -static const float HALF_FLT_MAX = 65504.F; +static float constexpr HALF_FLT_MAX = 65504.F; #define FINAL_MASK 0xffffffff template diff --git a/cpp/tensorrt_llm/common/stringUtils.h b/cpp/tensorrt_llm/common/stringUtils.h index 3eac937cfd8..071c2279327 100644 --- a/cpp/tensorrt_llm/common/stringUtils.h +++ b/cpp/tensorrt_llm/common/stringUtils.h @@ -48,6 +48,12 @@ std::string fmtstr(char const* format, ...); std::string fmtstr(char const* format, ...) 
__attribute__((format(printf, 1, 2))); #endif +// __PRETTY_FUNCTION__ is used for neat debugging printing but is not supported on Windows +// The alternative is __FUNCSIG__, which is similar but not identical +#if defined(_WIN32) +#define __PRETTY_FUNCTION__ __FUNCSIG__ +#endif + template inline TStream& arr2outCasted(TStream& out, T* arr, size_t size) { diff --git a/cpp/tensorrt_llm/common/tensor.cpp b/cpp/tensorrt_llm/common/tensor.cpp index 464bc275e8a..059e69e0844 100644 --- a/cpp/tensorrt_llm/common/tensor.cpp +++ b/cpp/tensorrt_llm/common/tensor.cpp @@ -23,7 +23,6 @@ #include "stdlib.h" #include #include -#include #include #include #include @@ -32,6 +31,10 @@ #include #include +#if !defined(_WIN32) +#include +#endif // !defined(_WIN32) + namespace tensorrt_llm { namespace common @@ -152,7 +155,7 @@ Tensor Tensor::loadNpy(const std::string& npy_file, const MemoryType where) parseNpyIntro(f_ptr, header_len, start_data); parseNpyHeader(f_ptr, header_len, type, shape); - const size_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + const size_t size = std::accumulate(shape.begin(), shape.end(), size_t{1}, std::multiplies()); void* data_cpu = malloc(size * Tensor::getTypeSize(type)); void* data = data_cpu; @@ -338,7 +341,7 @@ Tensor Tensor::slice(std::vector shape, size_t offset) const if (this->data != nullptr) { size_t n_elts = this->size(); - size_t n_sliced_elts = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + size_t n_sliced_elts = std::accumulate(shape.begin(), shape.end(), size_t{1}, std::multiplies()); TLLM_CHECK_WITH_INFO(n_sliced_elts + offset <= n_elts, fmtstr("The number (%ld) of elements of sliced tensor exceeds that (%ld) of the original tensor", n_sliced_elts + offset, n_elts)); @@ -418,6 +421,7 @@ std::string TensorMap::toString() TensorMap TensorMap::fromNpyFolder(const std::string& base_folder) { +#if !defined(_WIN32) DIR* dir_p = opendir(base_folder.c_str()); TLLM_CHECK_WITH_INFO(dir_p != nullptr, fmtstr("Could not open folder %s. 
", base_folder.c_str())); struct dirent* dp; @@ -460,10 +464,15 @@ TensorMap TensorMap::fromNpyFolder(const std::string& base_folder) closedir(dir_p); return ret_tensor; +#else + throw std::runtime_error("TensorMap::fromNpyFolder is not implemented on Windows."); + return {}; +#endif // !defined(_WIN32) } void TensorMap::saveNpy(const std::string& base_folder) { +#if !defined(_WIN32) mode_t mode_0755 = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; int ret = mkdir(base_folder.c_str(), mode_0755); TLLM_CHECK_WITH_INFO(ret == 0 || errno == EEXIST, fmtstr("Could not create folder %s.\n", base_folder.c_str())); @@ -472,6 +481,9 @@ void TensorMap::saveNpy(const std::string& base_folder) { item.second.saveNpy(base_folder + "/" + item.second.whereToString() + "-" + item.first + ".npy"); } +#else + throw std::runtime_error("TensorMap::saveNpy is not implemented on Windows."); +#endif // !defined(_WIN32) } } // namespace common diff --git a/cpp/tensorrt_llm/common/tensor.h b/cpp/tensorrt_llm/common/tensor.h index a0513a36761..b1dcbc626f3 100644 --- a/cpp/tensorrt_llm/common/tensor.h +++ b/cpp/tensorrt_llm/common/tensor.h @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -98,11 +97,13 @@ struct TensorDataType static constexpr DataType value = TYPE_UINT64; }; +#if !defined(_WIN32) template <> struct TensorDataType { static constexpr DataType value = TYPE_UINT64; }; +#endif // !defined(_WIN32) static_assert(sizeof(std::uint64_t) == sizeof(unsigned long long), ""); diff --git a/cpp/tensorrt_llm/common/tllmException.cpp b/cpp/tensorrt_llm/common/tllmException.cpp index 52354b6e66b..2d2831c1477 100644 --- a/cpp/tensorrt_llm/common/tllmException.cpp +++ b/cpp/tensorrt_llm/common/tllmException.cpp @@ -44,8 +44,8 @@ TllmException::TllmException(char const* file, std::size_t line, const std::stri } #else TllmException::TllmException(char const* file, std::size_t line, const std::string& msg) - : _mNbFrames{} - , runtime_error{fmtstr("%s (%s:%zu)", msg.c_str(), file, line)} + : mNbFrames{} + , std::runtime_error{fmtstr("%s (%s:%zu)", msg.c_str(), file, line)} { } #endif diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h index dead7975f6f..bdac36fd95d 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h @@ -415,12 +415,12 @@ class MmaTensorOpDequantizer mul_op; - plus plus_op; - ExpandedMmaOperandB* operand_frag_ptr = reinterpret_cast(&operand_frag); if constexpr (hasZero(QuantOp)) { + plus plus_op; + CUTLASS_PRAGMA_UNROLL for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) { diff --git a/cpp/tensorrt_llm/kernels/banBadWords.cu b/cpp/tensorrt_llm/kernels/banBadWords.cu index 82d169d9d51..45cce552dea 100644 --- a/cpp/tensorrt_llm/kernels/banBadWords.cu +++ b/cpp/tensorrt_llm/kernels/banBadWords.cu @@ -98,7 +98,8 @@ void invokeBanBadWords(T* logits, const int** output_ids_ptr, const int** parent int vocab_size_padded, const int* sequence_lengths, int max_seq_len, cudaStream_t stream) { dim3 block, grid; - block.x = min(((bad_words_len + 32 - 1) / 32) * 32, 256UL); + constexpr size_t max_blocks{256}; + block.x = min(((bad_words_len + 32 - 1) / 32) * 32, max_blocks); grid.x = (bad_words_len + block.x - 1) / block.x; 
grid.y = local_batch_size * beam_width; diff --git a/cpp/tensorrt_llm/kernels/banRepeatNgram.cu b/cpp/tensorrt_llm/kernels/banRepeatNgram.cu index 3f232c2120c..5f2c686baae 100644 --- a/cpp/tensorrt_llm/kernels/banRepeatNgram.cu +++ b/cpp/tensorrt_llm/kernels/banRepeatNgram.cu @@ -146,7 +146,8 @@ void invokeBanRepeatNgram(T* logits, const int** output_ids_buf, const bool* fin // step (current generated length, except start token) is from 1 ~ max_seq_len dim3 block, grid; - block.x = min(((step + 32 - 1) / 32) * 32, 256UL); + constexpr size_t max_blocks{256}; + block.x = min(((step + 32 - 1) / 32) * 32, max_blocks); grid.x = (step + block.x - 1) / block.x; grid.y = local_batch_size * beam_width; diff --git a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu index e06cee8c7e9..2415d10e7cf 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu @@ -91,20 +91,18 @@ __global__ void add_bias_temperature(half2* logits, const half2* bias, const int template __global__ void apply_repetition_penalty(T* logits, const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int** output_ids_ptr, const int** parent_ids_ptr, const int* input_lengths, - const int* sequence_lengths, const int max_input_length, const float repetition_penalty, int max_seq_len) + const int* sequence_lengths, const float repetition_penalty, int max_seq_len) { const int tid = threadIdx.x; const int bbid = blockIdx.x; const int batch_id = bbid / beam_width; const int beam_idx{bbid % beam_width}; - const int bbsize = batch_size * beam_width; logits += bbid * vocab_size_padded; extern __shared__ char sbuf[]; T* penalty_logits = reinterpret_cast(sbuf); // prevent misaligment when sizeof(T) = 2 int* penalty_indices = reinterpret_cast(sbuf + (sizeof(T) * max_seq_len + 31) / 32 * 32); - const int input_length = (input_lengths != nullptr) ? input_lengths[bbid] : max_input_length; const int current_step{sequence_lengths[bbid]}; if (tid == 0) { @@ -127,11 +125,6 @@ __global__ void apply_repetition_penalty(T* logits, const int batch_size, const int parent_beam = bbid % beam_width; for (int i = current_step - 2; i >= 0; --i) { - // Skip the padded tokens. - if (i >= input_length && i < max_input_length) - { - continue; - } parent_beam = parent_ids_ptr[batch_id][parent_beam * max_seq_len + i]; prev_id = output_ids_ptr[batch_id][parent_beam * max_seq_len + i]; prev_logit = logits[prev_id]; @@ -150,23 +143,20 @@ __global__ void apply_repetition_penalty(T* logits, const int batch_size, const __syncthreads(); for (int i = tid; i < current_step; i += blockDim.x) { - if (i >= input_length && i < max_input_length) - { - continue; - } logits[penalty_indices[i]] = penalty_logits[i]; } } template __global__ void apply_min_length_penalty(T* logits, const int min_length, const int* end_ids, - const int* sequence_lengths, const int max_input_length, const int beam_width, const int vocab_size_padded) + const int* sequence_lengths, const int* input_lengths, const int beam_width, const int vocab_size_padded) { int bbid = threadIdx.x + blockIdx.x * blockDim.x; // batch-beam index int bid = bbid / beam_width; // batch index - // We need +1 because sequence_lengths = max_input_length + num_gen_tokens - - // 1, which is equal to the length of k/v caches. - if (sequence_lengths[bbid] + 1 - max_input_length < min_length) + auto const input_length{input_lengths == nullptr ? 
0 : input_lengths[bbid]}; + // We need +1 because sequence_lengths = num_gen_tokens + input_length - 1, + // which is equal to the length of k/v caches. + if (sequence_lengths[bbid] + 1 - input_length < min_length) { T mask_val = (std::is_same::value) ? -HALF_FLT_MAX : -FLT_MAX; logits[bbid * vocab_size_padded + end_ids[bid]] = mask_val; @@ -175,11 +165,10 @@ __global__ void apply_min_length_penalty(T* logits, const int min_length, const template void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const int** parent_ids_ptr, - const int* input_lengths, const int* sequence_lengths, const T* bias, const int ite, const int max_input_length, - const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, - const int vocab_size_padded, const int* end_ids, const float temperature, const float repetition_penalty, - const RepetitionPenaltyType repetition_penalty_type, const int min_length, const int max_seq_len, - cudaStream_t stream) + const int* input_lengths, const int* sequence_lengths, const T* bias, const int ite, const int local_batch_size, + const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, + const float temperature, const float repetition_penalty, const RepetitionPenaltyType repetition_penalty_type, + const int min_length, const int max_seq_len, cudaStream_t stream) { if (bias != nullptr || temperature != 1.0f || vocab_size != vocab_size_padded) { @@ -210,14 +199,14 @@ void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const in { apply_repetition_penalty<<>>(logits, batch_size, beam_width, vocab_size, vocab_size_padded, output_ids_ptr, parent_ids_ptr, input_lengths, sequence_lengths, - max_input_length, repetition_penalty, max_seq_len); + repetition_penalty, max_seq_len); sync_check_cuda_error(); } else if (repetition_penalty_type == RepetitionPenaltyType::Additive) { apply_repetition_penalty<<>>(logits, batch_size, beam_width, vocab_size, vocab_size_padded, output_ids_ptr, parent_ids_ptr, input_lengths, sequence_lengths, - max_input_length, repetition_penalty, max_seq_len); + repetition_penalty, max_seq_len); sync_check_cuda_error(); } } @@ -229,21 +218,21 @@ void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const in const int block_size = min(local_batch_size * beam_width, 1024); const int grid_size = (local_batch_size * beam_width + block_size - 1) / block_size; apply_min_length_penalty<<>>( - logits, min_length, end_ids, sequence_lengths, max_input_length, beam_width, vocab_size_padded); + logits, min_length, end_ids, sequence_lengths, input_lengths, beam_width, vocab_size_padded); sync_check_cuda_error(); } template void invokeAddBiasApplyPenalties(float* logits, const int** output_ids_ptr, const int** parent_ids_ptr, - const int* input_lengths, const int* sequence_lengths, const float* bias, const int ite, const int max_input_length, - const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, - const int vocab_size_padded, const int* end_ids, const float temperature, const float repetition_penalty, - const RepetitionPenaltyType repetition_penalty_type, const int min_length, int max_seq_len, cudaStream_t stream); + const int* input_lengths, const int* sequence_lengths, const float* bias, const int ite, const int local_batch_size, + const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, + const float temperature, const float 
repetition_penalty, const RepetitionPenaltyType repetition_penalty_type, + const int min_length, int max_seq_len, cudaStream_t stream); template void invokeAddBiasApplyPenalties(half* logits, const int** output_ids_ptr, const int** parent_ids_ptr, - const int* input_lengths, const int* sequence_lengths, const half* bias, const int ite, const int max_input_length, - const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, - const int vocab_size_padded, const int* end_ids, const float temperature, const float repetition_penalty, - const RepetitionPenaltyType repetition_penalty_type, const int min_length, int max_seq_len, cudaStream_t stream); + const int* input_lengths, const int* sequence_lengths, const half* bias, const int ite, const int local_batch_size, + const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, + const float temperature, const float repetition_penalty, const RepetitionPenaltyType repetition_penalty_type, + const int min_length, int max_seq_len, cudaStream_t stream); } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h index 888a1951873..c8321f6e8d2 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h +++ b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h @@ -26,10 +26,10 @@ namespace kernels template void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const int** parent_ids_ptr, - const int* input_lengths, const int* sequence_lengths, const T* bias, const int ite, const int max_input_length, - const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, - const int vocab_size_padded, const int* end_ids, const float temperature, const float repetition_penalty, - const RepetitionPenaltyType repetition_penalty_type, const int min_length, int max_seq_len, cudaStream_t stream); + const int* input_lengths, const int* sequence_lengths, const T* bias, const int ite, const int local_batch_size, + const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, + const float temperature, const float repetition_penalty, const RepetitionPenaltyType repetition_penalty_type, + const int min_length, int max_seq_len, cudaStream_t stream); } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/beamSearchTopkKernels.cu b/cpp/tensorrt_llm/kernels/beamSearchTopkKernels.cu index 0eeb3166a28..f8ecab4a0a2 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchTopkKernels.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchTopkKernels.cu @@ -691,11 +691,11 @@ template void invokeTopkBeamSearch(void* workspace, size_t& workspace_size, floa template __global__ void tileEncoderResults(T* tiled_output, int* tiled_sequence_length, const T* output, - const int* sequence_length, const uint batch_size, const uint beam_width, const uint d_model) + const int* sequence_length, const uint32_t batch_size, const uint32_t beam_width, const uint32_t d_model) { if (blockIdx.x == 0) { - for (uint i = threadIdx.x; i < batch_size * beam_width; i += blockDim.x) + for (uint32_t i = threadIdx.x; i < batch_size * beam_width; i += blockDim.x) { tiled_sequence_length[i] = sequence_length[i / beam_width]; } @@ -704,7 +704,7 @@ __global__ void tileEncoderResults(T* tiled_output, int* tiled_sequence_length, int tgt_offset = blockIdx.x * gridDim.y * gridDim.z * d_model + 
blockIdx.y * gridDim.z * d_model + blockIdx.z * d_model; int src_offset = blockIdx.x * gridDim.z * d_model + blockIdx.z * d_model; - for (uint i = threadIdx.x; i < d_model; i += blockDim.x) + for (uint32_t i = threadIdx.x; i < d_model; i += blockDim.x) { tiled_output[i + tgt_offset] = output[i + src_offset]; } @@ -785,29 +785,16 @@ __global__ void insertUnfinishedPath(BeamHypotheses beam_hyps, const bool* finis int prev_id = beam_hyps.parent_ids_src[src_beam_idx * max_seq_len + last_token_idx]; for (int token_idx = last_token_idx - 1; token_idx >= 0; token_idx--) { - int src_offset; - // skip the padding between inputs and outputs - if (token_idx > max_seq_len) - { - src_offset = max_seq_len - beam_hyps.input_lengths[src_beam_idx]; - } - else - { - src_offset = 0; - } // output_ids_tgt need to use max_seq_len + 1 because its shape is // [bs, beam_width, max_seq_len + 1] beam_hyps.output_ids_tgt[tgt_beam_idx * max_seq_len + token_idx] - = beam_hyps.output_ids_src[bid * beam_width * max_seq_len + prev_id * max_seq_len + token_idx - + src_offset]; + = beam_hyps.output_ids_src[bid * beam_width * max_seq_len + prev_id * max_seq_len + token_idx]; if (beam_hyps.log_probs != nullptr && beam_hyps.log_probs_src != nullptr) { beam_hyps.log_probs[tgt_beam_idx * max_seq_len + token_idx] - = beam_hyps.log_probs_src[token_idx * batch_size * beam_width + bid * beam_width + prev_id - + src_offset]; + = beam_hyps.log_probs_src[token_idx * batch_size * beam_width + bid * beam_width + prev_id]; } - prev_id = beam_hyps.parent_ids_src[bid * beam_width * max_seq_len + prev_id * max_seq_len + token_idx - + src_offset]; + prev_id = beam_hyps.parent_ids_src[bid * beam_width * max_seq_len + prev_id * max_seq_len + token_idx]; } beam_hyps.sequence_lengths_tgt[tgt_beam_idx] = last_token_idx + 1; diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp index 8af213fcd4f..98caa3d5637 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp @@ -14,6 +14,11 @@ * limitations under the License. 
*/ +#define _USE_MATH_DEFINES +// Include cmath with M_LOG2E defined +#include +#undef _USE_MATH_DEFINES + #include "fmhaRunner.h" #include "fused_multihead_attention_v2.h" diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp index df6d08f64aa..9673bdec21b 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp @@ -17,11 +17,17 @@ #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" +#ifndef _WIN32 #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // #ifndef _WIN32 + #include "cutlass/gemm/gemm.h" #include "cutlass/numeric_types.h" + +#ifndef _WIN32 #pragma GCC diagnostic pop +#endif // #ifndef _WIN32 #include #include diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h index 23ae3b59206..a9b7c16de84 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h @@ -59,16 +59,23 @@ class CutlassFpAIntBGemmRunnerInterface virtual ~CutlassFpAIntBGemmRunnerInterface() {} virtual void gemm(const void* A, const void* B, const void* weight_scales, void* C, int m, int n, int k, - char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) + tkc::CutlassGemmConfig gemmConfig, char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) = 0; virtual void gemm(const void* A, const void* B, const void* weight_scales, const void* weight_zero_points, - const void* biases, void* C, int m, int n, int k, const int group_size, char* workspace_ptr, - const size_t workspace_bytes, cudaStream_t stream) + const void* biases, void* C, int m, int n, int k, const int group_size, tkc::CutlassGemmConfig gemmConfig, + char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) = 0; // Returns desired workspace size in bytes. - virtual int getWorkspaceSize(const int m, const int n, const int k) = 0; + virtual size_t getWorkspaceSize(const int m, const int n, const int k) = 0; + + virtual std::vector getConfigs() const = 0; + +protected: + static constexpr int SPLIT_K_LIMIT = 7; + static constexpr int MIN_M_TILE = 32; + static constexpr int MIN_N_TILE = 128; }; template @@ -79,11 +86,12 @@ class CutlassFpAIntBGemmRunner : public virtual CutlassFpAIntBGemmRunnerInterfac ~CutlassFpAIntBGemmRunner(); void gemm(const void* A, const void* B, const void* weight_scales, void* C, int m, int n, int k, - char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) override; + tkc::CutlassGemmConfig gemmConfig, char* workspace_ptr, const size_t workspace_bytes, + cudaStream_t stream) override; void gemm(const void* A, const void* B, const void* weight_scales, const void* weight_zero_points, - const void* biases, void* C, int m, int n, int k, const int group_size, char* workspace_ptr, - const size_t workspace_bytes, cudaStream_t stream) override; + const void* biases, void* C, int m, int n, int k, const int group_size, tkc::CutlassGemmConfig gemmConfig, + char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) override; // Disabled since the fused GEMM, activation kernels will not be used in v1. 
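Note on usage: with this reworked interface the runner no longer picks a tactic internally; the caller enumerates candidates via getConfigs(), selects one, and passes it to every gemm() call together with a workspace sized by getWorkspaceSize(). A minimal sketch of that calling pattern is below; it assumes the tkc namespace alias from the header above, and pickBestConfig() is a hypothetical caller-side helper (for example, one that benchmarks each candidate), not part of this patch.

#include <vector>

// Illustrative only: tactic selection happens outside the runner now.
template <typename Runner>
void runWeightOnlyGemm(Runner& runner, const void* A, const void* B, const void* scales, void* C,
    int m, int n, int k, char* workspace, cudaStream_t stream)
{
    // Enumerate the candidate tile configurations exposed by the runner.
    std::vector<tkc::CutlassGemmConfig> candidates = runner.getConfigs();
    // Hypothetical helper: choose a config (e.g. by timing each candidate once).
    tkc::CutlassGemmConfig chosen = pickBestConfig(candidates, m, n, k);
    // Workspace bound is independent of the chosen config.
    size_t wsBytes = runner.getWorkspaceSize(m, n, k);
    runner.gemm(A, B, scales, C, m, n, k, chosen, workspace, wsBytes, stream);
}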
@@ -92,7 +100,9 @@ class CutlassFpAIntBGemmRunner : public virtual CutlassFpAIntBGemmRunnerInterfac // stream); // Returns desired workspace size in bytes. - int getWorkspaceSize(const int m, const int n, const int k) override; + size_t getWorkspaceSize(const int m, const int n, const int k) override; + + std::vector getConfigs() const override; private: template @@ -100,14 +110,7 @@ class CutlassFpAIntBGemmRunner : public virtual CutlassFpAIntBGemmRunnerInterfac const T* biases, T* C, int m, int n, int k, const int group_size, tkc::CutlassGemmConfig gemm_config, char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream, int* occupancy = nullptr); - template - void run_gemm(const T* A, const WeightType* B, const T* weight_scales, const T* weight_zero_points, const T* biases, - T* C, int m, int n, int k, const int group_size, char* workspace_ptr, const size_t workspace_bytes, - cudaStream_t stream); - private: - static constexpr int split_k_limit = 7; - int sm_; int multi_processor_count_; }; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h index d61165c99fb..2d60cbc02b4 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h @@ -14,8 +14,10 @@ * limitations under the License. */ +#ifndef _WIN32 #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // #ifndef _WIN32 #include "cutlass/gemm/device/gemm_universal_base.h" #include "cutlass/gemm/kernel/default_gemm.h" @@ -27,7 +29,10 @@ #include "cutlass_extensions/gemm/threadblock/default_mma.h" #include "cutlass_extensions/gemm_configs.h" +#ifndef _WIN32 #pragma GCC diagnostic pop +#endif // #ifndef _WIN32 + #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" @@ -384,31 +389,6 @@ void CutlassFpAIntBGemmRunner::dispatch_to_arch -template -void CutlassFpAIntBGemmRunner::run_gemm(const T* A, const WeightType* B, - const T* weight_scales, const T* weight_zero_points, const T* biases, T* C, int m, int n, int k, - const int group_size, char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) -{ - TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); - static constexpr bool is_weight_only = !std::is_same::value; - std::vector candidate_configs = get_candidate_configs(sm_, is_weight_only, false); - std::vector occupancies(candidate_configs.size()); - - for (size_t ii = 0; ii < candidate_configs.size(); ++ii) - { - dispatch_to_arch(A, B, weight_scales, weight_zero_points, biases, C, m, n, k, group_size, - candidate_configs[ii], workspace_ptr, workspace_bytes, stream, &occupancies[ii]); - } - // Standard GEMM, so 1 "expert". We use the same function for MoE and regular FFN. - static constexpr int num_experts = 1; - tkc::CutlassGemmConfig chosen_config = estimate_best_config_from_occupancies(candidate_configs, occupancies, m, n, - k, num_experts, split_k_limit, workspace_bytes, multi_processor_count_, is_weight_only); - - dispatch_to_arch(A, B, weight_scales, weight_zero_points, biases, C, m, n, k, group_size, - chosen_config, workspace_ptr, workspace_bytes, stream); -} - // Disabled since the fused GEMM, activation kernels will not be used in v1. 
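The recurring #ifndef _WIN32 guards exist because #pragma GCC diagnostic is a GCC/Clang extension; MSVC does not recognize it and would emit unknown-pragma warnings. The generic shape of the guard, shown here in isolation with a placeholder header name, is:

#ifndef _WIN32
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // #ifndef _WIN32

#include "header_that_triggers_strict_aliasing_warnings.h" // placeholder name

#ifndef _WIN32
#pragma GCC diagnostic pop
#endif // #ifndef _WIN32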
// template @@ -447,15 +427,15 @@ void CutlassFpAIntBGemmRunner::run_gemm(con template void CutlassFpAIntBGemmRunner::gemm(const void* A, const void* B, const void* weight_scales, const void* weight_zero_points, const void* biases, void* C, int m, int n, int k, const int group_size, - char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) + tkc::CutlassGemmConfig gemmConfig, char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) { TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); if constexpr ((QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS) || (QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY)) { - run_gemm((const T*) A, (const WeightType*) B, (const T*) weight_scales, - (const T*) weight_zero_points, (const T*) biases, (T*) C, m, n, k, group_size, workspace_ptr, - workspace_bytes, stream); + dispatch_to_arch((const T*) A, (const WeightType*) B, (const T*) weight_scales, + (const T*) weight_zero_points, (const T*) biases, (T*) C, m, n, k, group_size, gemmConfig, workspace_ptr, + workspace_bytes, stream, nullptr); } else { @@ -466,14 +446,15 @@ void CutlassFpAIntBGemmRunner::gemm(const void* A, const template void CutlassFpAIntBGemmRunner::gemm(const void* A, const void* B, const void* weight_scales, - void* C, int m, int n, int k, char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) + void* C, int m, int n, int k, tkc::CutlassGemmConfig gemmConfig, char* workspace_ptr, const size_t workspace_bytes, + cudaStream_t stream) { TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); if constexpr (QuantOp == cutlass::WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY) { - run_gemm((const T*) A, (const WeightType*) B, (const T*) weight_scales, nullptr, nullptr, - (T*) C, m, n, k, k, workspace_ptr, workspace_bytes, stream); + dispatch_to_arch((const T*) A, (const WeightType*) B, (const T*) weight_scales, nullptr, + nullptr, (T*) C, m, n, k, k, gemmConfig, workspace_ptr, workspace_bytes, stream, nullptr); } else { @@ -482,14 +463,22 @@ void CutlassFpAIntBGemmRunner::gemm(const void* A, const } template -int CutlassFpAIntBGemmRunner::getWorkspaceSize(const int m, const int n, const int k) +std::vector CutlassFpAIntBGemmRunner::getConfigs() const +{ + static constexpr bool is_weight_only = !std::is_same::value; + std::vector candidateConfigs = get_candidate_configs(sm_, is_weight_only, false); + return candidateConfigs; +} + +template +size_t CutlassFpAIntBGemmRunner::getWorkspaceSize(const int m, const int n, const int k) { TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); // These are the min tile sizes for each config, which would launch the maximum number of blocks - const int max_grid_m = (m + 31) / 32; - const int max_grid_n = (n + 127) / 128; + const int max_grid_m = cutlass::ceil_div(m, MIN_M_TILE); + const int max_grid_n = cutlass::ceil_div(n, MIN_N_TILE); // We need 4 bytes per block in the worst case. We launch split_k_limit in z dim. - return max_grid_m * max_grid_n * split_k_limit * 4; + return static_cast(max_grid_m * max_grid_n * SPLIT_K_LIMIT * 4); } } // namespace cutlass_kernels diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h index b1120bc6385..256cd91b544 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h @@ -40,8 +40,6 @@ namespace cutlass_kernels Weights are assumed to be column-major. 
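As a quick illustration of the workspace bound computed above (one 4-byte slot per block in the worst case, with the grid sized by the minimum tile sizes and split-k launched in the z dimension), here is a standalone sketch using the fpA_intB constants MIN_M_TILE = 32, MIN_N_TILE = 128, SPLIT_K_LIMIT = 7; the function name is illustrative.

#include <cstddef>
#include <cstdio>

// Same arithmetic as getWorkspaceSize(): maximum grid extent times split-k slices times 4 bytes.
static size_t worstCaseWorkspaceBytes(int m, int n)
{
    constexpr int kMinMTile = 32;
    constexpr int kMinNTile = 128;
    constexpr int kSplitKLimit = 7;
    const int maxGridM = (m + kMinMTile - 1) / kMinMTile; // equivalent to cutlass::ceil_div
    const int maxGridN = (n + kMinNTile - 1) / kMinNTile;
    return static_cast<size_t>(maxGridM) * maxGridN * kSplitKLimit * 4;
}

int main()
{
    // For m = n = 4096 this yields 128 * 32 * 7 * 4 = 114688 bytes (about 112 KiB).
    std::printf("%zu\n", worstCaseWorkspaceBytes(4096, 4096));
    return 0;
}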
*/ -using perfMapType = std::unordered_map; - class CutlassInt8GemmRunnerInterface { public: @@ -50,52 +48,19 @@ class CutlassInt8GemmRunnerInterface virtual ~CutlassInt8GemmRunnerInterface() {} virtual void gemm(const int8_t* A, const int8_t* B, tk::QuantMode quantOption, const float* alphaCol, - const float* alphaRow, void* C, int m, int n, int k, char* workspacePtr, const size_t workspaceBytes, - cudaStream_t stream) - = 0; - - virtual void profileGemms(tk::QuantMode quantOption, int minM, int maxM, int n, int k, int8_t* A, int8_t* B, - void* C, float* alphaCol, float* alphaRow, char* workspace) + const float* alphaRow, void* C, int m, int n, int k, tkc::CutlassGemmConfig gemmConfig, char* workspacePtr, + const size_t workspaceBytes, cudaStream_t stream) = 0; // Returns desired workspace size in bytes. - virtual int getWorkspaceSize(const int m, const int n, const int k) = 0; - - // Returns True if tactics has already been selected - bool hasSelectedTactics() const - { - return mTacticsMap.size() > 0; - } - - void setSelectedTactics(const perfMapType& tacticsMap) - { - mTacticsMap = tacticsMap; - } - - const perfMapType& getSelectedTactics() const - { - return mTacticsMap; - } - - void setMaxM(int maxM) - { - mMaxM = maxM; - } - - int getMaxM() const - { - return mMaxM; - } + virtual size_t getWorkspaceSize(const int m, const int n, const int k) = 0; + + virtual std::vector getConfigs() const = 0; protected: static constexpr int SPLIT_K_LIMIT = 7; - static constexpr int MAX_STEP_M = 32768; static constexpr int MIN_M_TILE = 32; static constexpr int MIN_N_TILE = 64; - - int mMaxM; - - perfMapType mTacticsMap; }; template @@ -106,25 +71,19 @@ class CutlassInt8GemmRunner : public virtual CutlassInt8GemmRunnerInterface ~CutlassInt8GemmRunner(); void gemm(const int8_t* A, const int8_t* B, tk::QuantMode quantOption, const float* alphaCol, const float* alphaRow, - void* C, int m, int n, int k, char* workspacePtr, const size_t workspaceBytes, cudaStream_t stream) override; - - void profileGemms(tk::QuantMode quantOption, int minM, int maxM, int n, int k, int8_t* A, int8_t* B, void* C, - float* alphaCol, float* alphaRow, char* workspace) override; + void* C, int m, int n, int k, tkc::CutlassGemmConfig gemmConfig, char* workspacePtr, + const size_t workspaceBytes, cudaStream_t stream) override; // Returns desired workspace size in bytes. 
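With the built-in profiling removed from the int8 runner as well, timing a candidate configuration becomes the caller's job. A sketch of how one candidate could be timed with CUDA events is below; it mirrors the warmup-plus-timed-runs idea, assumes the tk/tkc namespace aliases from the headers above, leaves device buffers and error handling to the caller, and is illustrative rather than the actual profiler used by the plugins.

// Sketch: average runtime in milliseconds of one gemm config on a given stream.
template <typename Runner>
float timeConfigMs(Runner& runner, const tkc::CutlassGemmConfig& config, const int8_t* A, const int8_t* B,
    tk::QuantMode quantOption, const float* alphaCol, const float* alphaRow, void* C, int m, int n, int k,
    char* workspace, size_t workspaceBytes, cudaStream_t stream, int warmup = 3, int runs = 10)
{
    // Warm up to exclude one-time costs from the measurement.
    for (int i = 0; i < warmup; ++i)
        runner.gemm(A, B, quantOption, alphaCol, alphaRow, C, m, n, k, config, workspace, workspaceBytes, stream);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, stream);
    for (int i = 0; i < runs; ++i)
        runner.gemm(A, B, quantOption, alphaCol, alphaRow, C, m, n, k, config, workspace, workspaceBytes, stream);
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);

    float elapsedMs = 0.0f;
    cudaEventElapsedTime(&elapsedMs, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return elapsedMs / runs;
}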
- int getWorkspaceSize(const int m, const int n, const int k) override; + size_t getWorkspaceSize(const int m, const int n, const int k) override; + + std::vector getConfigs() const override; private: void dispatchToArch(const int8_t* A, const int8_t* B, tk::QuantMode quantOption, const float* alphaCol, const float* alphaRow, T* C, int m, int n, int k, tkc::CutlassGemmConfig gemmConfig, char* workspacePtr, const size_t workspaceBytes, cudaStream_t stream, int* occupancy = nullptr); - tkc::CutlassGemmConfig profileGemm(tk::QuantMode quant_option, int m, int n, int k, int8_t* A, int8_t* B, void* C, - float* alphaCol, float* alphaRow, char* workspace); - - float profileConfig(const tkc::CutlassGemmConfig& config, tk::QuantMode quantOption, int m, int n, int k, int8_t* A, - int8_t* B, void* C, float* alphaCol, float* alphaRow, char* workspace); - int mSm; int mMultiProcessorCount; }; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h index a92a3c3419d..4137c9a9faa 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h @@ -14,8 +14,10 @@ * limitations under the License. */ +#ifndef _WIN32 #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // #ifndef _WIN32 // clang-format off #include @@ -33,7 +35,9 @@ #include "cutlass_extensions/gemm/kernel/default_int8_traits.h" #include "cutlass_extensions/gemm/kernel/gemm_with_epilogue_visitor.h" +#ifndef _WIN32 #pragma GCC diagnostic pop +#endif // #ifndef _WIN32 #include "tensorrt_llm/common/allocator.h" #include "tensorrt_llm/common/cudaUtils.h" @@ -60,7 +64,6 @@ void genericInt8GemmKernelLauncher(const int8_t* A, const int8_t* B, tk::QuantMo size_t workspaceBytes, cudaStream_t stream, int* occupancy = nullptr) { TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); - // #ifdef BUILD_CUTLASS_MIXED_GEMM using ElementInput = int8_t; @@ -165,11 +168,6 @@ void genericInt8GemmKernelLauncher(const int8_t* A, const int8_t* B, tk::QuantMo = "Failed to run cutlass int8 gemm. Error: " + std::string(cutlassGetStatusString(runStatus)); throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg); } - // #else - // throw std::runtime_error( - // "[TensorRT-LLM Error][int8gemm] TensorRT-LLM was built was mixed gemm support off. 
Please rebuild with - // cmake option -DBUILD_CUTLASS_MIXED_GEMM=ON"); - // #endif } template @@ -355,136 +353,33 @@ void CutlassInt8GemmRunner::dispatchToArch(const int8_t* A, const int8_t* B, template void CutlassInt8GemmRunner::gemm(const int8_t* A, const int8_t* B, tk::QuantMode quantOption, const float* alphaCol, - const float* alphaRow, void* C, int m, int n, int k, char* workspacePtr, const size_t workspaceBytes, - cudaStream_t stream) + const float* alphaRow, void* C, int m, int n, int k, tkc::CutlassGemmConfig gemmConfig, char* workspacePtr, + const size_t workspaceBytes, cudaStream_t stream) { TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); - int mRounded = cutlass::round_up(m, MAX_STEP_M); - if (m < MAX_STEP_M) - { - mRounded = mmha::next_power_of_two(m); - } - mRounded = std::min(mMaxM, mRounded); - dispatchToArch(A, B, quantOption, alphaCol, alphaRow, reinterpret_cast(C), m, n, k, mTacticsMap[mRounded], - workspacePtr, workspaceBytes, stream); -} - -template -float CutlassInt8GemmRunner::profileConfig(const tkc::CutlassGemmConfig& config, tk::QuantMode quantOption, int m, - int n, int k, int8_t* A, int8_t* B, void* C, float* alphaCol, float* alphaRow, char* workspace) -{ - constexpr int warmup = 3; - constexpr int runs = 10; - - const auto workspaceBytes = getWorkspaceSize(m, n, k); - - cudaStream_t stream = cudaStreamDefault; - for (int i = 0; i < warmup; ++i) - { - dispatchToArch(A, B, quantOption, alphaCol, alphaRow, reinterpret_cast(C), m, n, k, config, workspace, - workspaceBytes, stream); - } - - cudaEvent_t start; - cudaEvent_t stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaDeviceSynchronize(); - cudaEventRecord(start, 0); - std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); - - for (int i = 0; i < runs; ++i) - { - dispatchToArch(A, B, quantOption, alphaCol, alphaRow, reinterpret_cast(C), m, n, k, config, workspace, - workspaceBytes, stream); - } - - cudaEventRecord(stop, 0); - - cudaEventSynchronize(stop); - - float elapsed; - cudaEventElapsedTime(&elapsed, start, stop); - - cudaEventDestroy(start); - cudaEventDestroy(stop); - - return elapsed / runs; + dispatchToArch(A, B, quantOption, alphaCol, alphaRow, reinterpret_cast(C), m, n, k, gemmConfig, workspacePtr, + workspaceBytes, stream); } template -tkc::CutlassGemmConfig CutlassInt8GemmRunner::profileGemm(tk::QuantMode quantOption, int m, int n, int k, int8_t* A, - int8_t* B, void* C, float* alphaCol, float* alphaRow, char* workspace) +std::vector CutlassInt8GemmRunner::getConfigs() const { - TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); static constexpr bool isWeightOnly = false; std::vector candidateConfigs = get_candidate_configs(mSm, isWeightOnly, mSm <= 70, /* SIMT configs */ true); /* INT8 configs */ - - float bestTime = std::numeric_limits::max(); - tkc::CutlassGemmConfig bestConfig; - bool foundOne = false; - - for (int ii = 0; ii < candidateConfigs.size(); ++ii) - { - tkc::CutlassGemmConfig candidateConfig = candidateConfigs[ii]; - float time = std::numeric_limits::max(); - try - { - time = profileConfig(candidateConfig, quantOption, m, n, k, A, B, C, alphaCol, alphaRow, workspace); - foundOne = true; - } - catch (...) - { - std::ostringstream msg; - msg << "Cannot profile configuration " << ii << " (for" - << " m=" << m << ", n=" << n << ", k=" << k << "). Skipped"; - TLLM_LOG_DEBUG(msg.str()); - } - - if (time < bestTime) - { - bestConfig = candidateConfig; - bestTime = time; - } - } - - if (!foundOne) - { - TLLM_LOG_ERROR("Have not found any valid GEMM config. 
Abort."); - } - - return bestConfig; -} - -template -void CutlassInt8GemmRunner::profileGemms(tk::QuantMode quantOption, int minM, int maxM, int n, int k, int8_t* A, - int8_t* B, void* C, float* alphaCol, float* alphaRow, char* workspace) -{ - TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); - - const int startMinMRounded = mmha::next_power_of_two(minM); - for (int m = startMinMRounded; m < maxM;) - { - mTacticsMap[m] = profileGemm(quantOption, m, n, k, A, B, C, alphaCol, alphaRow, workspace); - // Profile different Ms increasing it in powers of 2 up to MAX_STEP_M - // From there step linearly with MAX_STEP_M step - m += min(m, MAX_STEP_M); - } - // Profile the largest possible M - mTacticsMap[maxM] = profileGemm(quantOption, maxM, n, k, A, B, C, alphaCol, alphaRow, workspace); + return candidateConfigs; } template -int CutlassInt8GemmRunner::getWorkspaceSize(const int m, const int n, const int k) +size_t CutlassInt8GemmRunner::getWorkspaceSize(const int m, const int n, const int k) { TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); // These are the min tile sizes for each config, which would launch the maximum number of blocks const int maxGridM = cutlass::ceil_div(m, MIN_M_TILE); const int maxGridN = cutlass::ceil_div(m, MIN_N_TILE); // We need 4 bytes per block in the worst case. We launch SPLIT_K_LIMIT in z dim. - return maxGridM * maxGridN * SPLIT_K_LIMIT * 4; + return static_cast(maxGridM * maxGridN * SPLIT_K_LIMIT * 4); } } // namespace cutlass_kernels diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h index dcf97b7336c..de44584aef4 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h @@ -115,6 +115,8 @@ struct Multihead_attention_params_base PositionEmbeddingType position_embedding_type = PositionEmbeddingType::kLEARNED_ABSOLUTE; // The per-head latent space reserved for rotary embeddings. int rotary_embedding_dim = 0; + float rotary_embedding_base = 0.0f; + float rotary_embedding_scale = 0.0f; // The current timestep. TODO(bhsueh) Check that do we only this param in cross attention? int timestep = 0; // The current timestep of each sentences (support different timestep for different sentences) diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h index 90dacc3ea70..70bb2b395b8 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h @@ -117,10 +117,10 @@ inline size_t multi_block_grid_setup( #define MMHA_LAUNCH_CHECK(DYNAMIC_THDS_PER_BLOCK) \ std::size_t const dynamic_smem_sz{ \ mmha::smem_size_in_bytes(params, DYNAMIC_THDS_PER_BLOCK)}; \ - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&available_blocks, \ + TLLM_CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&available_blocks, \ mmha::masked_multihead_attention_kernel, \ - DYNAMIC_THDS_PER_BLOCK, dynamic_smem_sz); + DYNAMIC_THDS_PER_BLOCK, dynamic_smem_sz)); #define MMHA_KERNEL(DYNAMIC_THDS_PER_BLOCK) \ std::size_t const dynamic_smem_sz{ \ @@ -191,10 +191,10 @@ void mmha_launch_kernel_ex( // Tune block size based on batchxhead to increase occupancy. 
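The new rotary_embedding_base and rotary_embedding_scale parameters feed the coefficient computation shown further below: the angle becomes (t_step * scale) / base^(zid / rot_embed_dim), so base 10000 and scale 1.0 reproduce the previous fixed-base formula. A small host-side sketch of that per-channel coefficient and its rotation; the helper names are illustrative, not part of the patch.

#include <cmath>
#include <utility>

// Per-channel rotary coefficient: angle = (t_step * scale) / base^(zid / rot_embed_dim).
static std::pair<float, float> rotaryCoefficient(int zid, int rotEmbedDim, float base, float scale, float tStep)
{
    const float angle = (tStep * scale) / std::pow(base, zid / static_cast<float>(rotEmbedDim));
    return {std::cos(angle), std::sin(angle)}; // (cos, sin) applied as a 2x2 rotation
}

// Rotating one (x, y) channel pair, in the spirit of rotary_embedding_transform.
static std::pair<float, float> rotatePair(float x, float y, std::pair<float, float> coef)
{
    return {x * coef.first - y * coef.second, x * coef.second + y * coef.first};
}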
int num_blocks_per_sm = -1; std::size_t const smem_sz{mmha::smem_size_in_bytes(params, THDS_PER_BLOCK)}; - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm, + TLLM_CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm, mmha::masked_multihead_attention_kernel, - THDS_PER_BLOCK, smem_sz); + THDS_PER_BLOCK, smem_sz)); TLLM_CHECK_WITH_INFO( num_blocks_per_sm >= 1, "Sequence Length is too long for the MMHA kernel (not enough shared memory)."); diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h index df3059dcab5..42fe37d34cc 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h @@ -1246,7 +1246,8 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params case PositionEmbeddingType::kALIBI: break; case PositionEmbeddingType::kROPE_GPTJ: { - apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, tlength); + apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, params.rotary_embedding_base, + params.rotary_embedding_scale, tlength); break; } case PositionEmbeddingType::kROPE_GPT_NEOX: @@ -1278,7 +1279,8 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params mmha::vec_from_smem_transpose(q, q_smem_, transpose_idx, smem_pitch); mmha::vec_from_smem_transpose(k, k_smem, transpose_idx, smem_pitch); - mmha::apply_rotary_embedding(q, k, transpose_idx / tidx_factor, params.rotary_embedding_dim, tlength); + mmha::apply_rotary_embedding(q, k, transpose_idx / tidx_factor, params.rotary_embedding_dim, + params.rotary_embedding_base, params.rotary_embedding_scale, tlength); mmha::write_smem_transpose(k, k_smem, transpose_idx, smem_pitch); mmha::write_smem_transpose(q, q_smem_, transpose_idx, smem_pitch); diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h index 7253756c4a9..a514e759992 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h @@ -1508,9 +1508,10 @@ inline __device__ void zero(T& dst) //////////////////////////////////////////////////////////////////////////////////////////////////// -inline __device__ float2 rotary_embedding_coefficient(const int zid, const int rot_embed_dim, const float t_step) +inline __device__ float2 rotary_embedding_coefficient( + const int zid, const int rot_embed_dim, const float base, const float scale, const float t_step) { - const float inv_freq = t_step / pow(10000.0f, zid / (float) rot_embed_dim); + const float inv_freq = (t_step * scale) / pow(base, zid / (float) rot_embed_dim); return {cos(inv_freq), sin(inv_freq)}; } @@ -1538,38 +1539,42 @@ inline __device__ __nv_bfloat162 rotary_embedding_transform(const __nv_bfloat162 } #endif -inline __device__ void apply_rotary_embedding(float& q, int zid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding(float& q, int zid, int rot_embed_dim, float base, float scale, int t_step) { return; } -inline __device__ void apply_rotary_embedding(float& q, float& k, int zid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + float& q, float& k, int zid, int rot_embed_dim, float 
base, float scale, int t_step) { return; } -inline __device__ void apply_rotary_embedding(float2& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + float2& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (2 * tid >= rot_embed_dim) { return; } - const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step); + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); q = rotary_embedding_transform(q, coef); } -inline __device__ void apply_rotary_embedding(float2& q, float2& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + float2& q, float2& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (2 * tid >= rot_embed_dim) { return; } - const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step); + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); q = rotary_embedding_transform(q, coef); k = rotary_embedding_transform(k, coef); } -inline __device__ void apply_rotary_embedding(float4& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + float4& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (4 * tid >= rot_embed_dim) { @@ -1577,13 +1582,14 @@ inline __device__ void apply_rotary_embedding(float4& q, int tid, int rot_embed_ } Float4_& q_ = *reinterpret_cast(&q); - const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); q_.x = rotary_embedding_transform(q_.x, coef0); - const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q_.y = rotary_embedding_transform(q_.y, coef1); } -inline __device__ void apply_rotary_embedding(float4& q, float4& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + float4& q, float4& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (4 * tid >= rot_embed_dim) { @@ -1592,189 +1598,199 @@ inline __device__ void apply_rotary_embedding(float4& q, float4& k, int tid, int Float4_& q_ = *reinterpret_cast(&q); Float4_& k_ = *reinterpret_cast(&k); - const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); q_.x = rotary_embedding_transform(q_.x, coef0); k_.x = rotary_embedding_transform(k_.x, coef0); - const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q_.y = rotary_embedding_transform(q_.y, coef1); k_.y = rotary_embedding_transform(k_.y, coef1); } -inline __device__ void apply_rotary_embedding(uint32_t& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + uint32_t& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (2 * tid >= rot_embed_dim) { return; } - const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step); + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); q = rotary_embedding_transform(q, coef); } -inline __device__ void apply_rotary_embedding(uint32_t& q, uint32_t& k, int tid, int 
rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + uint32_t& q, uint32_t& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (2 * tid >= rot_embed_dim) { return; } - const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step); + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); q = rotary_embedding_transform(q, coef); k = rotary_embedding_transform(k, coef); } -inline __device__ void apply_rotary_embedding(half2& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding(half2& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { - return apply_rotary_embedding(*reinterpret_cast(&q), tid, rot_embed_dim, t_step); + return apply_rotary_embedding(*reinterpret_cast(&q), tid, rot_embed_dim, base, scale, t_step); } -inline __device__ void apply_rotary_embedding(half2& q, half2& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + half2& q, half2& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { return apply_rotary_embedding( - *reinterpret_cast(&q), *reinterpret_cast(&k), tid, rot_embed_dim, t_step); + *reinterpret_cast(&q), *reinterpret_cast(&k), tid, rot_embed_dim, base, scale, t_step); } -inline __device__ void apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (4 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); - const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); } -inline __device__ void apply_rotary_embedding(uint2& q, uint2& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + uint2& q, uint2& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (4 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); k.x = rotary_embedding_transform(k.x, coef0); - const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); k.y = rotary_embedding_transform(k.y, coef1); } -inline __device__ void apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (8 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); - const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, 
t_step); q.y = rotary_embedding_transform(q.y, coef1); - const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step); q.z = rotary_embedding_transform(q.z, coef2); - const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step); q.w = rotary_embedding_transform(q.w, coef3); } -inline __device__ void apply_rotary_embedding(uint4& q, uint4& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + uint4& q, uint4& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (8 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); k.x = rotary_embedding_transform(k.x, coef0); - const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); k.y = rotary_embedding_transform(k.y, coef1); - const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step); q.z = rotary_embedding_transform(q.z, coef2); k.z = rotary_embedding_transform(k.z, coef2); - const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step); q.w = rotary_embedding_transform(q.w, coef3); k.w = rotary_embedding_transform(k.w, coef3); } #ifdef ENABLE_BF16 -inline __device__ void apply_rotary_embedding(__nv_bfloat162& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + __nv_bfloat162& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (2 * tid >= rot_embed_dim) { return; } - const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step); + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); q = rotary_embedding_transform(q, coef); } inline __device__ void apply_rotary_embedding( - __nv_bfloat162& q, __nv_bfloat162& k, int tid, int rot_embed_dim, int t_step) + __nv_bfloat162& q, __nv_bfloat162& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (2 * tid >= rot_embed_dim) { return; } - const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step); + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, scale, t_step); q = rotary_embedding_transform(q, coef); k = rotary_embedding_transform(k, coef); } -inline __device__ void apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + bf16_4_t& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (4 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); - const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, 
t_step); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); } -inline __device__ void apply_rotary_embedding(bf16_4_t& q, bf16_4_t& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + bf16_4_t& q, bf16_4_t& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (4 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); k.x = rotary_embedding_transform(k.x, coef0); - const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); k.y = rotary_embedding_transform(k.y, coef1); } -inline __device__ void apply_rotary_embedding(bf16_8_t& q, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + bf16_8_t& q, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (8 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); - const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); - const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step); q.z = rotary_embedding_transform(q.z, coef2); - const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step); q.w = rotary_embedding_transform(q.w, coef3); } -inline __device__ void apply_rotary_embedding(bf16_8_t& q, bf16_8_t& k, int tid, int rot_embed_dim, int t_step) +inline __device__ void apply_rotary_embedding( + bf16_8_t& q, bf16_8_t& k, int tid, int rot_embed_dim, float base, float scale, int t_step) { if (8 * tid >= rot_embed_dim) { return; } - const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step); + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, scale, t_step); q.x = rotary_embedding_transform(q.x, coef0); k.x = rotary_embedding_transform(k.x, coef0); - const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, scale, t_step); q.y = rotary_embedding_transform(q.y, coef1); k.y = rotary_embedding_transform(k.y, coef1); - const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, scale, t_step); q.z = rotary_embedding_transform(q.z, coef2); k.z = rotary_embedding_transform(k.z, coef2); - const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, scale, t_step); q.w = 
rotary_embedding_transform(q.w, coef3); k.w = rotary_embedding_transform(k.w, coef3); } @@ -2067,10 +2083,10 @@ inline __device__ void convert_from_fp8(bf16_4_t* v, const fp8_4_t u) inline __device__ void convert_from_fp8(bf16_8_t* v, const fp8_8_t u) { __nv_bfloat162* v2 = reinterpret_cast<__nv_bfloat162*>(v); - convert_from_fp8(v2, u.x); + convert_from_fp8(v2 + 0, u.x); convert_from_fp8(v2 + 1, u.y); - convert_from_fp8(v2 + 1, u.z); - convert_from_fp8(v2 + 2, u.w); + convert_from_fp8(v2 + 2, u.z); + convert_from_fp8(v2 + 3, u.w); } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.cu b/cpp/tensorrt_llm/kernels/decodingKernels.cu index 2abbbf0c48b..4d40df54b2d 100644 --- a/cpp/tensorrt_llm/kernels/decodingKernels.cu +++ b/cpp/tensorrt_llm/kernels/decodingKernels.cu @@ -30,8 +30,6 @@ namespace kernels __global__ void gatherTree(gatherTreeParam param) { - const int max_input_length = param.input_lengths == nullptr ? 0 : param.max_input_length; - for (int batchbeam_idx = blockIdx.x * blockDim.x + threadIdx.x; batchbeam_idx < param.batch_size * param.beam_width; batchbeam_idx += gridDim.x * blockDim.x) { @@ -66,43 +64,36 @@ __global__ void gatherTree(gatherTreeParam param) continue; } - const int padding_offset = param.has_padding ? max_input_length - input_len : 0; - const int initial_tgt_ix = batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len - + max_seq_len_b - 1 - padding_offset; + const int initial_tgt_ix + = batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + max_seq_len_b - 1; const int initial_parent_ix = batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + max_seq_len_b - 1; - param.beams[initial_tgt_ix] = __ldg(step_ids + initial_parent_ix); + param.output_ids[initial_tgt_ix] = __ldg(step_ids + initial_parent_ix); int parent = parent_ids == nullptr ? 0 : __ldg(parent_ids + initial_parent_ix) % param.beam_width; bool found_bad = false; for (int level = max_seq_len_b - 2; level >= 0; --level) { - if (param.has_padding && level >= input_len && level < max_input_length) - { - continue; - } - const int tgt_level{level >= max_input_length ? level - padding_offset : level}; - const int level_beam_ix - = batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + tgt_level; + const int level_beam_ix = batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + level; const int level_parent_ix = batch * param.beam_width * param.max_seq_len + parent * param.max_seq_len + level; if (parent < 0 || parent > param.beam_width) { - param.beams[level_beam_ix] = param.end_tokens[batch]; + param.output_ids[level_beam_ix] = param.end_tokens[batch]; parent = -1; found_bad = true; } else { - param.beams[level_beam_ix] = __ldg(step_ids + level_parent_ix); + param.output_ids[level_beam_ix] = __ldg(step_ids + level_parent_ix); parent = parent_ids == nullptr ? 
0 : __ldg(parent_ids + level_parent_ix) % param.beam_width; } } // set the padded part as end_token // input_len - for (int index = max_len - padding_offset; index < param.max_seq_len; ++index) + for (int index = max_len; index < param.max_seq_len; ++index) { - param.beams[batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + index] + param.output_ids[batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + index] = param.end_tokens[batch]; } @@ -113,40 +104,21 @@ __global__ void gatherTree(gatherTreeParam param) { bool finished = false; // skip the step 0 because it is often the start token - int start_step = max_input_length == 0 ? 1 : max_input_length; + int start_step = 1; for (int time = start_step; time < max_seq_len_b; ++time) { const int level_beam_ix = batch * param.beam_width * param.max_seq_len + beam * param.max_seq_len + time; if (finished) { - param.beams[level_beam_ix] = param.end_tokens[batch]; + param.output_ids[level_beam_ix] = param.end_tokens[batch]; } - else if (param.beams[level_beam_ix] == param.end_tokens[batch]) + else if (param.output_ids[level_beam_ix] == param.end_tokens[batch]) { finished = true; } } } - - // transpose on output_ids - if (param.output_ids != nullptr) - { - for (int step_idx = 0; step_idx < param.max_seq_len; step_idx++) - { - param.output_ids[batchbeam_idx * param.max_seq_len + step_idx] - = param.beams[batchbeam_idx * param.max_seq_len + step_idx]; - } - } - } - - // remove the pad length from sequence lengths - for (int batchbeam_idx = blockIdx.x * blockDim.x + threadIdx.x; batchbeam_idx < param.batch_size * param.beam_width; - batchbeam_idx += gridDim.x * blockDim.x) - { - const int input_len = param.input_lengths == nullptr ? 0 : param.input_lengths[batchbeam_idx]; - const int pad_len = max_input_length - input_len; - param.sequence_lengths[batchbeam_idx] -= pad_len; } } @@ -319,7 +291,7 @@ void invokeGatherTree(gatherTreeParam param) __global__ void finalize(int* output_ids, int* sequence_lengths, float* cum_log_probs, float* output_log_probs, const int* topk_output_ids, const int* topk_sequence_lengths, const float* scores, const float* topk_cum_log_probs, const float* topk_log_probs, const int* num_beams, const int* input_lengths, const int beam_width, - const int max_input_length, const int max_seq_len, bool do_remove_padding) + const int max_seq_len) { // output_ids: [bs, beam_width, max_seq_len] // sequence_lengths: [bs, beam_width] @@ -337,11 +309,9 @@ __global__ void finalize(int* output_ids, int* sequence_lengths, float* cum_log_ // Note that we remove the start_token (the id at first position) from topk_output_ids extern __shared__ char array[]; - int* s_rank = (int*) (array); // [beam_width] - float* s_scores = (float*) (s_rank + beam_width); // [2 * beam_width] - int* s_sequence_lengths = (int*) (s_scores + beam_width * 2); // [beam_width] - const int input_length = input_lengths[blockIdx.x * beam_width]; // input_lengths of same batch must be same - const int pad_len = do_remove_padding ? 
max_input_length - input_length : 0; + int* s_rank = (int*) (array); // [beam_width] + float* s_scores = (float*) (s_rank + beam_width); // [2 * beam_width] + int* s_sequence_lengths = (int*) (s_scores + beam_width * 2); // [beam_width] const int num_beam = num_beams[blockIdx.x]; if (threadIdx.x < num_beam) { @@ -426,8 +396,7 @@ __global__ void finalize(int* output_ids, int* sequence_lengths, float* cum_log_ if (threadIdx.x < beam_width) { - s_sequence_lengths[threadIdx.x] - = topk_sequence_lengths[blockIdx.x * beam_width * 2 + s_rank[threadIdx.x]] - pad_len; + s_sequence_lengths[threadIdx.x] = topk_sequence_lengths[blockIdx.x * beam_width * 2 + s_rank[threadIdx.x]]; sequence_lengths[blockIdx.x * beam_width + threadIdx.x] = s_sequence_lengths[threadIdx.x]; if (cum_log_probs != nullptr) @@ -443,15 +412,12 @@ __global__ void finalize(int* output_ids, int* sequence_lengths, float* cum_log_ // start from step 1 to skip the start token for (int i = threadIdx.x; i < s_sequence_lengths[beam_idx]; i += blockDim.x) { - int src_pad_offset = do_remove_padding ? ((i >= input_length) ? pad_len : 0) : 0; output_ids[blockIdx.x * beam_width * max_seq_len + beam_idx * max_seq_len + i] - = topk_output_ids[blockIdx.x * (beam_width * 2) * max_seq_len + s_rank[beam_idx] * max_seq_len - + (i + src_pad_offset)]; + = topk_output_ids[blockIdx.x * (beam_width * 2) * max_seq_len + s_rank[beam_idx] * max_seq_len + i]; if (output_log_probs != nullptr) { output_log_probs[blockIdx.x * beam_width * max_seq_len + beam_idx * max_seq_len + i] - = topk_log_probs[blockIdx.x * (beam_width * 2) * max_seq_len + s_rank[beam_idx] * max_seq_len - + (i + src_pad_offset)]; + = topk_log_probs[blockIdx.x * (beam_width * 2) * max_seq_len + s_rank[beam_idx] * max_seq_len + i]; } } } @@ -460,8 +426,7 @@ __global__ void finalize(int* output_ids, int* sequence_lengths, float* cum_log_ void invokeFinalize(int* output_ids, int* sequence_lengths, float* cum_log_probs, float* output_log_probs, const int* topk_output_ids, const int* topk_sequence_lengths, const float* scores, const float* topk_cum_log_probs, const float* topk_log_probs, const int* num_beams, const int* input_lengths, const int beam_width, - const int max_seq_len, const int batch_size, const int max_input_length, cudaStream_t stream, - bool do_remove_padding) + const int max_seq_len, const int batch_size, cudaStream_t stream) { TLLM_LOG_DEBUG("%s %s start", __FILE__, __PRETTY_FUNCTION__); dim3 block(beam_width * 2); @@ -469,8 +434,7 @@ void invokeFinalize(int* output_ids, int* sequence_lengths, float* cum_log_probs TLLM_CHECK(block.x < 1024); finalize<<>>(output_ids, sequence_lengths, cum_log_probs, output_log_probs, topk_output_ids, topk_sequence_lengths, scores, - topk_cum_log_probs, topk_log_probs, num_beams, input_lengths, beam_width, max_input_length, max_seq_len, - do_remove_padding); + topk_cum_log_probs, topk_log_probs, num_beams, input_lengths, beam_width, max_seq_len); } __global__ void initializeOutput(int* output_ids, const int* end_ids, const int max_seq_len) diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.h b/cpp/tensorrt_llm/kernels/decodingKernels.h index 9525c90d781..d942025e0aa 100644 --- a/cpp/tensorrt_llm/kernels/decodingKernels.h +++ b/cpp/tensorrt_llm/kernels/decodingKernels.h @@ -41,13 +41,10 @@ struct gatherTreeParam const int* step_ids = nullptr; // [max_seq_len, batch_size, beam_width] const int* parent_ids = nullptr; // [max_seq_len, batch_size, beam_width] const int* end_tokens = nullptr; // [batch_size], end token ids of each query - int 
max_input_length = 0; // max(input_lengths) int* output_ids = nullptr; // the buffer to put finalized ids - // True if we have virtual padding tokens to fill up to max_input_len - bool has_padding = true; cudaStream_t stream; - float* cum_log_probs = nullptr; // [batch_size, beam_width] - float length_penalty = 1.0f; // on cpu + float* cum_log_probs = nullptr; // [batch_size, beam_width] + float length_penalty = 1.0f; // on cpu }; /* @@ -58,8 +55,7 @@ void invokeGatherTree(gatherTreeParam param); void invokeFinalize(int* output_ids, int* sequence_lengths, float* cum_log_probs, float* output_log_probs, const int* topk_output_ids, const int* topk_sequence_lengths, const float* scores, const float* topk_cum_log_probs, const float* topk_log_probs, const int* num_beams, const int* input_lengths, const int beam_width, - const int max_seq_len, const int batch_size, const int max_input_length, cudaStream_t stream, - bool do_remove_padding = true); + const int max_seq_len, const int batch_size, cudaStream_t stream); void invokeInitializeOutput(int* output_ids, const int* end_ids, int batch_beam, int max_seq_len, cudaStream_t stream); diff --git a/cpp/tensorrt_llm/kernels/gptKernels.h b/cpp/tensorrt_llm/kernels/gptKernels.h index 76815203997..4130c357080 100644 --- a/cpp/tensorrt_llm/kernels/gptKernels.h +++ b/cpp/tensorrt_llm/kernels/gptKernels.h @@ -42,6 +42,13 @@ enum class PositionEmbeddingType : int8_t kALIBI = 3 }; +enum class RotaryScalingType : int8_t +{ + kNONE = 0, + kLINEAR = 1, + kDYNAMIC = 2, +}; + template struct BuildDecoderInfoParams { diff --git a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu index 508c695b1dc..6d00a43fa46 100644 --- a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu +++ b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu @@ -30,12 +30,11 @@ void topK_softMax_kernelLauncher(const T* log_probs, const T* bias, const bool* const int temp_storage_size, BeamHypotheses* beam_hyps, const int batch_size, const int beam_width, const int vocab_size, const int* end_ids, T diversity_rate, const float length_penalty, cudaStream_t stream); -#define CASE_K(K, MAX_K) \ - case K ... 
MAX_K: \ - topK_softMax_kernelLauncher(log_probs, bias, finished, sequence_lengths, cum_log_probs, \ - output_log_probs, output_ids_ptr, temp_storage, temp_storage_size, beam_hyps, batch_size, beam_width, \ - vocab_size, end_ids, diversity_rate, length_penalty, stream); \ - break; +#define CASE_K(MAX_K) \ + topK_softMax_kernelLauncher(log_probs, bias, finished, sequence_lengths, cum_log_probs, \ + output_log_probs, output_ids_ptr, temp_storage, temp_storage_size, beam_hyps, batch_size, beam_width, \ + vocab_size, end_ids, diversity_rate, length_penalty, stream); \ + break; template void invokeTopkSoftMax(const T* log_probs, const T* bias, const bool* finished, const int* sequence_lengths, @@ -44,13 +43,25 @@ void invokeTopkSoftMax(const T* log_probs, const T* bias, const bool* finished, const int vocab_size, const int* end_ids, const float diversity_rate, const float length_penalty, cudaStream_t stream) { - switch (beam_width) + int log_beam_width(0); + int recursor(beam_width - 1); + while (recursor >>= 1) + ++log_beam_width; + + switch (log_beam_width) { - CASE_K(1, 4); - CASE_K(5, 8); - CASE_K(9, 16); - CASE_K(17, 32); - CASE_K(33, 64); + // 0 < beam_width <= 4 + case 0: // 1, 2 + case 1: // 3, 4 + CASE_K(4) + case 2: // 4 < beam_width <= 8 + CASE_K(8) + case 3: // 9 < beam_width <= 16 + CASE_K(16) + case 4: // 16 < beam_width <= 32 + CASE_K(32) + case 5: // 32 < beam_width <= 64 + CASE_K(64) default: throw std::runtime_error(fmtstr("Topk kernel of beam search does not support beam_width=%d", beam_width)); } } diff --git a/cpp/tensorrt_llm/kernels/quantization.cu b/cpp/tensorrt_llm/kernels/quantization.cu index 4417f9d8d43..5214fb02c8f 100644 --- a/cpp/tensorrt_llm/kernels/quantization.cu +++ b/cpp/tensorrt_llm/kernels/quantization.cu @@ -17,7 +17,6 @@ #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/int8Utils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/quantization.h" diff --git a/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.cu b/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.cu index 7380ed0d2ec..281cb7417a2 100644 --- a/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.cu @@ -192,22 +192,16 @@ template void invokeBatchApplyTemperaturePenalty(half* logits, const half* bias, template __global__ void applyRepetitionPenalty(T* logits, const float penalty, const int* start_ids, int* output_ids, const int batch_size, const int local_batch_size, const int vocab_size, const int vocab_size_padd, - const int* input_lengths, const int max_input_len, const int step) + const int* input_lengths, const int step) { extern __shared__ float penalty_logits[]; int* penalty_indices = (int*) (penalty_logits + step); logits = logits + blockIdx.x * vocab_size_padd; - const int input_length = input_lengths != nullptr ? input_lengths[blockIdx.x] : max_input_len; + const int input_length = input_lengths != nullptr ? 
input_lengths[blockIdx.x] : 0; for (int index = threadIdx.x; index < step; index += blockDim.x) { - - if (index >= input_length && index < max_input_len) - { - continue; - } - - // output_ids shape: (input_len + output_len, batch_size) + // output_ids shape: (batch_size, input_len + output_len) int penalty_index = output_ids[index * batch_size + blockIdx.x]; if (penalty_index >= vocab_size) { @@ -241,13 +235,7 @@ __global__ void applyRepetitionPenalty(T* logits, const float penalty, const int for (int index = threadIdx.x; index < step; index += blockDim.x) { - - if (index >= input_length && index < max_input_len) - { - continue; - } - - // output_ids shape: (input_len + output_len, batch_size) + // output_ids shape: (batch_size, input_len + output_len) if (penalty_indices[index] >= vocab_size) { continue; @@ -256,54 +244,15 @@ __global__ void applyRepetitionPenalty(T* logits, const float penalty, const int } } -template -void invokeApplyRepetitionPenalty(T* logits, const float penalty, const int* start_ids, int* output_ids, - const int batch_size, const int local_batch_size, const int vocab_size, const int vocab_size_padd, - const int* input_lengths, const int max_input_len, const int step, const RepetitionPenaltyType penalty_type, - cudaStream_t stream) -{ - dim3 block(min(step, 1024)); - dim3 grid(local_batch_size); - size_t smem_size = step * (sizeof(float) + sizeof(int)); - - if (penalty_type == RepetitionPenaltyType::Additive) - { - applyRepetitionPenalty<<>>(logits, penalty, - start_ids, output_ids, batch_size, local_batch_size, vocab_size, vocab_size_padd, input_lengths, - max_input_len, step); - } - else if (penalty_type == RepetitionPenaltyType::Multiplicative) - { - applyRepetitionPenalty<<>>(logits, - penalty, start_ids, output_ids, batch_size, local_batch_size, vocab_size, vocab_size_padd, input_lengths, - max_input_len, step); - } - else if (penalty_type == RepetitionPenaltyType::None) - { - // do nothing - } -} - -template void invokeApplyRepetitionPenalty(float* logits, const float penalty, const int* start_ids, int* output_ids, - const int batch_size, const int local_batch_size, const int vocab_size, const int vocab_size_padd, - const int* input_lengths, const int max_input_len, const int step, const RepetitionPenaltyType penalty_type, - cudaStream_t stream); - -template void invokeApplyRepetitionPenalty(half* logits, const float penalty, const int* start_ids, int* output_ids, - const int batch_size, const int local_batch_size, const int vocab_size, const int vocab_size_padd, - const int* input_lengths, const int max_input_len, const int step, const RepetitionPenaltyType penalty_type, - cudaStream_t stream); - template __global__ void batchApplyRepetitionPenalty(T* logits, const float* penalties, const int** output_ids, const int* sequence_lengths, const int batch_size, const int vocab_size, const int* input_lengths, - const int max_input_length, const int max_seq_len) + const int max_seq_len) { extern __shared__ float penalty_logits[]; int* penalty_indices = (int*) (penalty_logits + max_seq_len); const int batch_idx = blockIdx.x; const float penalty = penalties[batch_idx]; - const int input_length = input_lengths != nullptr ? input_lengths[batch_idx] : max_input_length; const int current_step = sequence_lengths[batch_idx]; logits += batch_idx * vocab_size; @@ -312,12 +261,7 @@ __global__ void batchApplyRepetitionPenalty(T* logits, const float* penalties, c // A vocab id can appear multiple times but should be penalized once. 
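// The loop below is phase 1 of a two-phase scheme: for every step in [0, current_step) it looks up the
// generated token id, records it in penalty_indices and (in the elided part of this loop) the penalized
// logit in penalty_logits, both in shared memory; phase 2 further down then writes those values back into
// `logits`. Because the write-back is keyed by token id, a vocab id that occurs several times (e.g. ids
// [5, 9, 5, 7] after four steps) is still penalized exactly once. The exact penalty arithmetic sits in the
// elided middle of this kernel and depends on the penalty type (Additive vs. Multiplicative).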
for (int index = threadIdx.x; index < current_step; index += blockDim.x) { - // Skip the padding tokens in input sequences. - if (index >= input_length && index < max_input_length) - { - continue; - } - // output_ids shape: (input_len + output_len, batch_size) + // output_ids shape: (batch_size, input_len + output_len) int penalty_index = output_ids[batch_idx][blockIdx.y * max_seq_len + index]; assert(penalty_index < vocab_size); penalty_indices[index] = penalty_index; @@ -349,11 +293,6 @@ __global__ void batchApplyRepetitionPenalty(T* logits, const float* penalties, c // Phase 2. Replace a logit value by the penalized one. for (int index = threadIdx.x; index < current_step; index += blockDim.x) { - // Skip the padding tokens in input sequences. - if (index >= input_length && index < max_input_length) - { - continue; - } logits[penalty_indices[index]] = penalty_logits[index]; } } @@ -361,8 +300,7 @@ __global__ void batchApplyRepetitionPenalty(T* logits, const float* penalties, c template void invokeBatchApplyRepetitionPenalty(T* logits, const float* penalties, const int** output_ids, const int* sequence_lengths, const int batch_size, const int local_batch_size, const int vocab_size, - const int* input_lengths, const int max_input_length, RepetitionPenaltyType penalty_type, int max_seq_len, - cudaStream_t stream) + const int* input_lengths, RepetitionPenaltyType penalty_type, int max_seq_len, cudaStream_t stream) { // Inputs // logits [local_batch_size, vocab_size] : logit values. @@ -370,24 +308,20 @@ void invokeBatchApplyRepetitionPenalty(T* logits, const float* penalties, const // output_ids int**, [bs] array, each array has [1, max_seq_len] // sequence_lengths int*, [bs] // input_lengths [local_batch_size], input lengths - // (optional). - // Padding tokens at [input_length, max_input_length) of input will not - // be penalized. 
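// Call-site sketch for the updated launcher (hypothetical buffer names; every pointer is assumed to be a
// device allocation shaped as described in the comments above). The max_input_length argument is gone
// together with the padding-skip logic:
//
//   invokeBatchApplyRepetitionPenalty(d_logits, d_penalties, d_output_ids_ptrs,
//       d_sequence_lengths, batch_size, local_batch_size, vocab_size, d_input_lengths,
//       RepetitionPenaltyType::Multiplicative, max_seq_len, stream);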
+ TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); dim3 block(min(max_seq_len, 1024)); dim3 grid(batch_size); size_t smem_size = max_seq_len * (sizeof(float) + sizeof(int)); if (penalty_type == RepetitionPenaltyType::Additive) { - batchApplyRepetitionPenalty<<>>(logits, - penalties, output_ids, sequence_lengths, batch_size, vocab_size, input_lengths, max_input_length, - max_seq_len); + batchApplyRepetitionPenalty<<>>( + logits, penalties, output_ids, sequence_lengths, batch_size, vocab_size, input_lengths, max_seq_len); } else if (penalty_type == RepetitionPenaltyType::Multiplicative) { - batchApplyRepetitionPenalty - <<>>(logits, penalties, output_ids, sequence_lengths, batch_size, - vocab_size, input_lengths, max_input_length, max_seq_len); + batchApplyRepetitionPenalty<<>>( + logits, penalties, output_ids, sequence_lengths, batch_size, vocab_size, input_lengths, max_seq_len); } else if (penalty_type == RepetitionPenaltyType::None) { @@ -397,22 +331,21 @@ void invokeBatchApplyRepetitionPenalty(T* logits, const float* penalties, const template void invokeBatchApplyRepetitionPenalty(float* logits, const float* penalties, const int** output_ids, const int* sequence_lengths, const int batch_size, const int local_batch_size, const int vocab_size, - const int* input_lengths, const int max_input_length, RepetitionPenaltyType penalty_type, int max_seq_len, - cudaStream_t stream); + const int* input_lengths, RepetitionPenaltyType penalty_type, int max_seq_len, cudaStream_t stream); template void invokeBatchApplyRepetitionPenalty(half* logits, const float* penalties, const int** output_ids, const int* sequence_lengths, const int batch_size, const int local_batch_size, const int vocab_size, - const int* input_lengths, const int max_input_length, RepetitionPenaltyType penalty_type, int max_seq_len, - cudaStream_t stream); + const int* input_lengths, RepetitionPenaltyType penalty_type, int max_seq_len, cudaStream_t stream); template __global__ void batchApplyMinLengthPenalty(T* logits, const int* min_lengths, const int* end_ids, - const int* sequence_lengths, const int max_input_length, const int vocab_size_padded) + const int* sequence_lengths, const int* input_lengths, const int vocab_size_padded) { int bid = threadIdx.x + blockIdx.x * blockDim.x; // batch index - // We need +1 because sequence_lengths = max_input_length + num_gen_tokens - - // 1, which is equal to the length of k/v caches. - if (sequence_lengths[bid] + 1 - max_input_length < min_lengths[bid]) + auto const input_length{input_lengths == nullptr ? 0 : input_lengths[bid]}; + // We need +1 because sequence_lengths = num_gen_tokens + input_length - 1, which is equal to the length of k/v + // caches. + if (sequence_lengths[bid] + 1 - input_length < min_lengths[bid]) { T mask_val = (std::is_same::value) ? 
-65504.0f : -FLT_MAX; logits[bid * vocab_size_padded + end_ids[bid]] = mask_val; @@ -421,21 +354,21 @@ __global__ void batchApplyMinLengthPenalty(T* logits, const int* min_lengths, co template void invokeMinLengthPenalty(T* logits, const int* min_lengths, const int* end_ids, const int* sequnece_lengths, - const int max_input_length, const int batch_size, const int vocab_size_padded, cudaStream_t stream) + const int* input_lengths, const int batch_size, const int vocab_size_padded, cudaStream_t stream) { const int block_size = min(batch_size, 1024); const int grid_size = (batch_size + block_size - 1) / block_size; batchApplyMinLengthPenalty<<>>( - logits, min_lengths, end_ids, sequnece_lengths, max_input_length, vocab_size_padded); + logits, min_lengths, end_ids, sequnece_lengths, input_lengths, vocab_size_padded); } template void invokeMinLengthPenalty(float* logits, const int* min_lengths, const int* end_ids, - const int* sequnece_lengths, const int max_input_length, const int batch_size, const int vocab_size_padded, + const int* sequnece_lengths, const int* input_lengths, const int batch_size, const int vocab_size_padded, cudaStream_t stream); template void invokeMinLengthPenalty(half* logits, const int* min_lengths, const int* end_ids, - const int* sequnece_lengths, const int max_input_length, const int batch_size, const int vocab_size_padded, + const int* sequnece_lengths, const int* input_lengths, const int batch_size, const int vocab_size_padded, cudaStream_t stream); } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.h b/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.h index 701a1ebe8a6..0dc754ca82b 100644 --- a/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.h +++ b/cpp/tensorrt_llm/kernels/samplingPenaltyKernels.h @@ -25,17 +25,10 @@ namespace tensorrt_llm namespace kernels { -template -void invokeApplyRepetitionPenalty(T* logits, const float penalty, const int* start_ids, int* output_ids, - const int batch_size, const int local_batch_size, const int vocab_size, const int vocab_size_padd, - const int* input_lengths, const int max_input_len, const int step, const RepetitionPenaltyType penalty_type, - cudaStream_t stream); - template void invokeBatchApplyRepetitionPenalty(T* logits, const float* penalties, const int** output_ids, const int* sequence_lengths, const int batch_size, const int local_batch_size, const int vocab_size, - const int* input_lengths, const int max_input_length, const RepetitionPenaltyType penalty_type, int max_seq_len, - cudaStream_t stream); + const int* input_lengths, const RepetitionPenaltyType penalty_type, int max_seq_len, cudaStream_t stream); template void invokeApplyTemperaturePenalty(T* logits, const T* bias, const float temperature, const int batch_size, @@ -47,7 +40,7 @@ void invokeBatchApplyTemperaturePenalty(T* logits, const T* bias, const float* t template void invokeMinLengthPenalty(T* logits, const int* min_lengths, const int* end_ids, const int* sequnece_lengths, - const int max_input_length, const int batch_size, const int vocab_size_padded, cudaStream_t stream); + const int* input_lengths, const int batch_size, const int vocab_size_padded, cudaStream_t stream); } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu index 6ba4f0a780f..d10f2cc290b 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu @@ -186,7 +186,7 @@ __global__ void 
topk_stage1(const T* __restrict log_probs, T* tmp_log_probs, int const int index = tmp_topk_buf_index + ite; topk_tmp_id_buf[index] = total.p; topk_tmp_val_buf[index] = total.u; - if (total.p >= 0 && total.p < vocab_size) + if (total.p >= 0) { tmp_log_probs[total.p] = -MAX_T_VAL; } @@ -312,16 +312,15 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf, T* t } } -#define CASE_K(K_MIN, K_MAX, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \ - case K_MIN ... K_MAX: \ - topk_stage1 \ - <<>>(log_probs, temp_log_probs, topk_tmp_id_buf, \ - topk_tmp_val_buf, finished, max_top_k, top_ks, vocab_size, end_ids, skip_decode); \ - topk_stage2_sampling \ - <<>>(topk_tmp_id_buf, \ - topk_tmp_val_buf, ids, sequence_lengths, finished, cum_log_probs, output_log_probs, max_top_k, top_ks, \ - top_p, top_ps, curandstate, end_ids, vocab_size, skip_decode); \ - break; +#define CASE_K(K_MAX, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \ + topk_stage1 \ + <<>>(log_probs, temp_log_probs, topk_tmp_id_buf, \ + topk_tmp_val_buf, finished, max_top_k, top_ks, vocab_size, end_ids, skip_decode); \ + topk_stage2_sampling \ + <<>>(topk_tmp_id_buf, \ + topk_tmp_val_buf, ids, sequence_lengths, finished, cum_log_probs, output_log_probs, max_top_k, top_ks, \ + top_p, top_ps, curandstate, end_ids, vocab_size, skip_decode); \ + break; template void invokeBatchTopKSampling(void* workspace, size_t& workspace_size, const T* log_probs, int** ids, @@ -355,12 +354,27 @@ void invokeBatchTopKSampling(void* workspace, size_t& workspace_size, const T* l int* topk_tmp_id_buf = (int*) (temp_log_probs + temp_log_probs_buf_size); T* topk_tmp_val_buf = (T*) (topk_tmp_id_buf + topk_tmp_ids_buf_size); - switch (max_top_k) + // TODO (bhsueh) need to support case top_k = [2, 17] (use different cases of max_top_k) + int log_max_top_k(0); + int recursor(max_top_k - 1); + while (recursor >>= 1) + ++log_max_top_k; + switch (log_max_top_k) { - CASE_K(1, 16, 128, 128, 8); - CASE_K(17, 32, 256, 128, 8); - CASE_K(33, 64, 256, 256, 8); - CASE_K(65, 1024, 256, 256, 8); + case 0: + case 1: + case 2: + case 3: // 0 < max_top_k <= 16 + CASE_K(16, 128, 128, 8); + case 4: // 16 < max_top_k <= 32 + CASE_K(32, 256, 128, 8); + case 5: // 32 < max_top_k <= 64 + CASE_K(64, 256, 256, 8); + case 6: + case 7: + case 8: + case 9: // 64 < max_top_k <= 1024 + CASE_K(1024, 256, 256, 8); default: throw std::domain_error(fmtstr("top-k kernel supports 1<=k<=1024 but got k=%d", max_top_k)); } } diff --git a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu index be4a2ba0601..7172251bfea 100644 --- a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu +++ b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu @@ -93,7 +93,8 @@ void invokeStopWordsCriterion(const int** output_ids, const int** parent_ids, co // Check if we have sampled a word from the stop_words list. If so, stop the // sequence. 
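// Launch-shape sketch (the numbers follow from the launch configuration just below): block.x is
// stop_words_len rounded up to a multiple of 32 (one warp) and capped at 256, and grid.x covers whatever
// is left. For example, stop_words_len = 40 gives block.x = 64 and grid.x = 1, while stop_words_len = 600
// gives block.x = 256 and grid.x = 3. grid.y spans batch_size * beam_width, i.e. one row of blocks per
// sequence.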
dim3 block, grid; - block.x = min(((stop_words_len + 32 - 1) / 32) * 32, 256UL); + constexpr size_t max_block_size{256}; + block.x = min(((stop_words_len + 32 - 1) / 32) * 32, max_block_size); grid.x = (stop_words_len + block.x - 1) / block.x; grid.y = batch_size * beam_width; diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu index 1e95938dcc6..3ef48a0c2b5 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu @@ -1252,8 +1252,8 @@ struct Vec_t<__nv_bfloat16> template __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, T* k_buf, T* v_buf, T* QKV, const T* __restrict qkv_bias, const int* seq_lens, const int* padding_offset, const int batch_size, const int seq_len, const int head_num, - const int kv_head_num, const int size_per_head, const int rotary_embedding_dim, - PositionEmbeddingType const position_embedding_type) + const int kv_head_num, const int size_per_head, const int rotary_embedding_dim, const float rotary_embedding_base, + const float rotary_embedding_scale, PositionEmbeddingType const position_embedding_type) { // This kernel add bias to QKV, which has shape [batch_size, seq_len, 3, head_num, size_per_head], and // QKV split to 3 split buffer q, k, v and transpose them to [batch_size, head_num, seq_len, size_per_head]. @@ -1351,11 +1351,10 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, T* k_buf, T* v_buf, switch (position_embedding_type) { - case PositionEmbeddingType::kLEARNED_ABSOLUTE: case PositionEmbeddingType::kROPE_GPTJ: - case PositionEmbeddingType::kALIBI: { - mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, dst_kv_seq_idx); + mmha::apply_rotary_embedding( + q, k, tidx, rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale, dst_kv_seq_idx); break; } case PositionEmbeddingType::kROPE_GPT_NEOX: @@ -1385,7 +1384,8 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, T* k_buf, T* v_buf, mmha::vec_from_smem_transpose(q, q_smem, transpose_idx, smem_pitch); mmha::vec_from_smem_transpose(k, k_smem, transpose_idx, smem_pitch); - mmha::apply_rotary_embedding(q, k, transpose_idx / tidx_factor, rotary_embedding_dim, dst_kv_seq_idx); + mmha::apply_rotary_embedding(q, k, transpose_idx / tidx_factor, rotary_embedding_dim, rotary_embedding_base, + rotary_embedding_scale, dst_kv_seq_idx); mmha::write_smem_transpose(q, q_smem, transpose_idx, smem_pitch); mmha::write_smem_transpose(k, k_smem, transpose_idx, smem_pitch); @@ -1456,12 +1456,13 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, T* k_buf, T* v_buf, #define FUSED_QKV_BIAS_ROTARY_TRANSPOSE_LAUNCH(T, ADD_BIAS, USING_CONTEXT_FMHA) \ add_fusedQKV_bias_transpose_kernel<<>>(q_buf, \ k_buf, v_buf, QKV, qkv_bias, seq_lens, padding_offset, batch_size, seq_len, head_num, kv_head_num, \ - size_per_head, rotary_embedding_dim, position_embedding_type); + size_per_head, rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale, position_embedding_type); template void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, T* v_buf, T* QKV, const T* qkv_bias, const int* seq_lens, const int* padding_offset, const int batch_size, const int seq_len, const int token_num, const int head_num, const int kv_head_num, const int size_per_head, const bool using_context_fmha, const int rotary_embedding_dim, + const float rotary_embedding_base, const float rotary_embedding_scale, const PositionEmbeddingType position_embedding_type, const 
float* scale, const int int8_mode, cudaStream_t stream) { // [bs, seq_len, 3, head, Dh] @@ -1534,8 +1535,9 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, T* v_buf, T* QKV, const template void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, T* v_buf, T* QKV, const T* qkv_bias, \ const int* seq_lens, const int* padding_offset, const int batch_size, const int seq_len, const int token_num, \ const int head_num, const int kv_head_num, const int size_per_head, const bool using_context_fmha, \ - const int rotary_embedding_dim, const PositionEmbeddingType position_embedding_type, const float* scale, \ - const int int8_mode, cudaStream_t stream) + const int rotary_embedding_dim, const float rotary_embedding_base, const float rotary_embedding_scale, \ + const PositionEmbeddingType position_embedding_type, const float* scale, const int int8_mode, \ + cudaStream_t stream) INSTANTIATE_ADDFUSEDQKVBIAS_TRANSPOSE(float); INSTANTIATE_ADDFUSEDQKVBIAS_TRANSPOSE(half); #ifdef ENABLE_BF16 diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h index 51ccc08cb34..f5ec68e3a57 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h @@ -76,6 +76,7 @@ template void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, T* v_buf, T* QKV, const T* qkv_bias, const int* seq_lens, const int* padding_offset, const int batch_size, const int seq_len, const int token_num, const int head_num, const int kv_head_num, const int size_per_head, const bool using_context_fmha, const int rotary_embedding_dim, + const float rotary_embedding_base, const float rotary_embedding_scale, PositionEmbeddingType const position_embedding_type, const float* scale, const int int8_mode, cudaStream_t stream); template @@ -91,11 +92,12 @@ template void invokeAddFusedQKVBiasTranspose(T* q_buf, T* k_buf, T* v_buf, T* QKV, const int* seq_lens, const int* padding_offset, const int batch_size, const int seq_len, const int token_num, const int head_num, const int kv_head_num, const int size_per_head, const bool using_context_fmha, const int rotary_embedding_dim, + const float rotary_embedding_base, const float rotary_embedding_scale, PositionEmbeddingType const position_embedding_type, const float* scale, const int int8_mode, cudaStream_t stream) { invokeAddFusedQKVBiasTranspose(q_buf, k_buf, v_buf, QKV, (const T*) nullptr, seq_lens, padding_offset, batch_size, seq_len, token_num, head_num, kv_head_num, size_per_head, using_context_fmha, rotary_embedding_dim, - position_embedding_type, scale, int8_mode, stream); + rotary_embedding_base, rotary_embedding_scale, position_embedding_type, scale, int8_mode, stream); } template diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h new file mode 100644 index 00000000000..415f2d7b361 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace tensorrt_llm +{ +namespace kernels +{ +struct WeightOnlyParams +{ + const uint8_t* qweight; + const half* scales; + const half* zeros; + const half* in; + const half* bias; + half* out; + const int m; + const int n; + const int k; + const int group_size; + + WeightOnlyParams(const uint8_t* _qweight, const half* _scales, const half* _zeros, const half* _in, + const half* _bias, half* _out, const int _m, const int _n, const int _k, const int _group_size) + : qweight(_qweight) + , scales(_scales) + , zeros(_zeros) + , in(_in) + , bias(_bias) + , out(_out) + , m(_m) + , n(_n) + , k(_k) + , group_size(_group_size) + { + } +}; +enum class WeightOnlyQuantType +{ + Int4b, + Int8b +}; +enum class WeightOnlyType +{ + PerChannel, + GroupWise +}; + +struct WeightOnlyPerChannel; +template +struct WeightOnlyGroupWise; + +enum class WeightOnlyActivationType +{ + Gelu, + Relu, + Identity, + InvalidType +}; +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h new file mode 100644 index 00000000000..9a58c352b45 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +template +struct WeightLayoutDetails; + +template <> +struct WeightLayoutDetails +{ + // Every four rows of the original weights are interleaved into a row with stride of 64, so if each thread + // processes 32 elements(for int4, we can use ldg.128 to load weights), then every group of two adjacent threads + // will alternately process four different row weights + // for example + // every 256 consecutive int4 elements [256*i, 256*(i+1)-1] of row N under interleave layout, + // the first 64 are from [64*i, 64*(i+1)-1] of row 4N before interleaving, + // and the second 64 are from [64*i, 64*(i+1)-1] of row 4N+1 before interleaving, and so on. + // So if each thread loads 32 int4 elements, then the elements of each 2 adjacent threads of each 8 + // consecutive threads will come from row 4N ~ 4N+3 respectively before interleaving. + static constexpr int kElemBits = 4; + static constexpr int kInterleave = 4; + static constexpr int kStride = 64; + + // The index remapping here is to counteracts the effect of cutlass::permute_B_rows_for_mixed_gemm + // input 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ... 
31 + // weight 0 1 8 9 16 17 24 25 2 3 10 11 18 19 26 27 4 5 12 13 20 21 28 29 6 7 14 15 22 23 30 31 + static constexpr int kShuffleSize = 32; + static constexpr int kShuffleBasicTile = 2; + static constexpr int kShuffleContinous = 4; + static constexpr int kShuffleStrided = 4; + + // The rearrangement here counteracts the effect of cutlass::add_bias_and_interleave_int4s_inplace + // Input int8 data layout + // [elt_7 elt_5 elt_3 elt_1 elt_6 elt_4 elt_2 elt_0] (each elt occupies 4 bits) + // + // Converted fp16 data layout + // [elt_7 elt_6 elt_5 elt_4 elt_3 elt_2 elt_1 elt_0] (each elt occupies 16 bits) + static constexpr int kConvertCount = 8; + using Converter + = cutlass::FastInterleavedAndBiasedNumericArrayConverter; + + // Each warp completes the internal reduce and writes the [Batch * NPerBlock * Interleave] results to the + // corresponding address in shared memory + template + __device__ __forceinline__ static void sync(float* res, float (*sm)[Num * kInterleave]) + { +#pragma unroll + for (int i = 0; i < Num; ++i) + { + res[i] += __shfl_xor_sync(~0, res[i], 16); + res[i] += __shfl_xor_sync(~0, res[i], 8); + res[i] += __shfl_xor_sync(~0, res[i], 1); + } + __syncthreads(); + int warp = threadIdx.x / WarpSize, lane = threadIdx.x % WarpSize; + if (lane == 0 || lane == 2 || lane == 4 || lane == 6) + { +#pragma unroll + for (int i = 0; i < Num; ++i) + { + sm[warp][i * kInterleave + lane / 2] = res[i]; + } + } + __syncthreads(); + } +}; + +template <> +struct WeightLayoutDetails +{ + // Every two rows of the original weights are interleaved into a row with stride of 64, so if each thread + // processes 16 elements(for int8, we can use ldg.128 to load weights), then every group of four adjacent threads + // will alternately process two different row weights + // for example + // every 128 consecutive int8 elements [128*i, 128*(i+1)-1] of row N under interleave layout, + // the first 64 are from [64*i, 64*(i+1)-1] of row 2N before interleaving, + // and the last 64 are from [64*i, 64*(i+1)-1] of row 2N+1 before interleaving. + // So if each thread loads 16 int8 elements, then the elements of the first four and last four threads of each 8 + // consecutive threads will come from row 2N and row 2N+1 respectively before interleaving. 
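// Worked mapping for the int8 interleave described above (derived directly from that comment): for
// interleaved row N and flattened element index p, with kInterleave = 2 and kStride = 64,
//   chunk = p / 128, within = p % 128,
//   source row    = 2 * N + within / 64,
//   source column = chunk * 64 + within % 64.
// E.g. p = 70 maps to row 2N+1, column 6, and p = 130 maps to row 2N, column 66.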
+ static constexpr int kElemBits = 8; + static constexpr int kInterleave = 2; + static constexpr int kStride = 64; + + // The index remapping here counteracts the effect of cutlass::permute_B_rows_for_mixed_gemm + // input 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // weight 0 1 8 9 2 3 10 11 4 5 12 13 6 7 14 15 + static constexpr int kShuffleSize = 16; + static constexpr int kShuffleBasicTile = 2; + static constexpr int kShuffleContinous = 2; + static constexpr int kShuffleStrided = 4; + + // The rearrangement here counteracts the effect of cutlass::add_bias_and_interleave_int8s_inplace + // Input int8 data layout + // [elt_3 elt_1 elt_2 elt_0] (each elt occupies 8 bits) + // + // Converted fp16 data layout + // [elt_3 elt_2 elt_1 elt_0] (each elt occupies 16 bits) + static constexpr int kConvertCount = 4; + using Converter = cutlass::FastInterleavedAndBiasedNumericArrayConverter; + + // Each warp completes the internal reduce and writes the [Batch * NPerBlock * Interleave] results to the + // corresponding address in shared memory + template + __device__ __forceinline__ static void sync(float* res, float (*sm)[Num * kInterleave]) + { +#pragma unroll + for (int i = 0; i < Num; ++i) + { + res[i] += __shfl_xor_sync(~0, res[i], 16); + res[i] += __shfl_xor_sync(~0, res[i], 8); + res[i] += __shfl_xor_sync(~0, res[i], 2); + res[i] += __shfl_xor_sync(~0, res[i], 1); + } + __syncthreads(); + int warp = threadIdx.x / WarpSize, lane = threadIdx.x % WarpSize; + if (lane == 0 || lane == 4) + { +#pragma unroll + for (int i = 0; i < Num; ++i) + { + sm[warp][i * kInterleave + lane / 4] = res[i]; + } + } + __syncthreads(); + } +}; + +template +struct WeightOnlyKernelDetails +{ + using Layout = WeightLayoutDetails; + + static constexpr int kElemBits = Layout::kElemBits; + static constexpr int kInterleave = Layout::kInterleave; + static constexpr int kStride = Layout::kStride; + + static constexpr int kShuffleSize = Layout::kShuffleSize; + static constexpr int kShuffleBasicTile = Layout::kShuffleBasicTile; + static constexpr int kShuffleContinous = Layout::kShuffleContinous; + static constexpr int kShuffleStrided = Layout::kShuffleStrided; + + using Converter = typename Layout::Converter; + static constexpr int kConvertCount = Layout::kConvertCount; + + // Use ldg.128 to load data from global memory + static constexpr int kAccessSize = 128; + using AccessType = uint4; + + static constexpr int kElemsPerByte = 8 / kElemBits; + static constexpr int kElemsPerThread = kAccessSize / kElemBits; + static constexpr int kBytePerThread = kElemsPerThread / kElemsPerByte; + static constexpr int kThreadsNumPerTile = kStride / kElemsPerThread; + static constexpr int kThreadsNumPerInterleave = kThreadsNumPerTile * kInterleave; + + static constexpr int kConvertIters = kElemsPerThread / kConvertCount; + + // Each thread loads 16 (int8b) / 32 (int4b) quantized weight elements with each ldg.128, + // so more ldg.128 accesses are needed to load the same number of fp16 activation elements. 
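// Plugging the two specializations into the definitions above gives, per thread:
//   int4: kElemsPerByte = 2, kElemsPerThread = 32, kBytePerThread = 16,
//         kThreadsNumPerTile = 2, kThreadsNumPerInterleave = 8, kConvertIters = 4;
//   int8: kElemsPerByte = 1, kElemsPerThread = 16, kBytePerThread = 16,
//         kThreadsNumPerTile = 4, kThreadsNumPerInterleave = 8, kConvertIters = 4.
// Either way one ldg.128 moves 16 bytes of quantized weights, and kActivationAccessNum (defined just
// below) works out to 32 / 8 = 4 activation loads for int4 and 16 / 8 = 2 for int8.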
+ static constexpr int kActivationElemNumPerAccess = kAccessSize / (sizeof(half) * 8); + static constexpr int kActivationAccessNum = kElemsPerThread / kActivationElemNumPerAccess; +}; + +template +struct WeightOnlyProperties; + +template <> +struct WeightOnlyProperties +{ + static constexpr bool kIsFineGrained = false; + static constexpr int kGroupSize = 0; +}; + +template +struct WeightOnlyProperties> +{ + static constexpr bool kIsFineGrained = true; + static constexpr int kGroupSize = GS; +}; + +template +struct WeightOnlyScaleLoader +{ + using ElemType = half; + using Details = WeightOnlyKernelDetails; + static constexpr bool kIsFineGrained = WeightOnlyProperties::kIsFineGrained; + static constexpr int kGroupSize = WeightOnlyProperties::kGroupSize; + +private: + const ElemType* _scales; + const ElemType* _zeros; + int _stride; + int _offset; + +public: + __device__ __forceinline__ WeightOnlyScaleLoader( + const ElemType* scales, const ElemType* zeros, int initial_offset, int stride) + : _scales(scales) + , _zeros(zeros) + , _stride(stride) + { + _scales += initial_offset; + if constexpr (Zero) + { + _zeros += initial_offset; + } + // Calculate the k dimension index of the element processed by the current thread of layout before interleave + // Used to load scales and zeros in groupwise weight only quant + _offset = threadIdx.x / Details::kThreadsNumPerInterleave * Details::kStride + + (threadIdx.x % Details::kThreadsNumPerTile) * Details::kElemsPerThread; + } + + __device__ __forceinline__ void load(ElemType& scale, ElemType& zero, int nid) + { + int offset = nid * Details::kInterleave; + if constexpr (kIsFineGrained) + { + offset += _offset / kGroupSize * _stride; + } + scale = _scales[offset]; + if constexpr (Zero) + { + zero = _zeros[offset]; + } + else + { + zero = static_cast(0.f); + } + } + + __device__ __forceinline__ void advance() + { + _offset += BlockSize * Details::kElemsPerThread / Details::kInterleave; + } + + __device__ __forceinline__ int offset() + { + return _offset; + } +}; + +template class ActOp, bool Zero, bool Bias, + int NPerBlock, int Batch, int BlockSize> +__global__ void weight_only_batched_gemv(const uint8_t* qweight, const half* scales, const half* zeros, const half* in, + const half* bias, half* out, const int n, const int k) +{ + static_assert(NPerBlock == 1 || (NPerBlock % 2 == 0)); + using Details = WeightOnlyKernelDetails; + + using Converter = typename Details::Converter; + using AccType = typename Details::AccessType; + using CvtSrcType = typename Converter::source_type; + using CvtResType = typename Converter::result_type; + using ScaleLoader = WeightOnlyScaleLoader; + extern __shared__ uint8_t shmem[]; + constexpr int Interleave = Details::kInterleave; + constexpr int WarpSize = 32; + constexpr int Num = Batch * NPerBlock; + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int n_start_id = bid * NPerBlock * Interleave; + // Calculate the n-dimensional index of the data processed by the current thread in the interleave tile + const int interleave_n_id = (tid / Details::kThreadsNumPerTile) % Interleave; + + qweight += n_start_id * k / Details::kElemsPerByte; + ScaleLoader scale_loader(scales, zeros, n_start_id + interleave_n_id, n); + + float(*sm)[Num * Interleave] = reinterpret_cast(shmem); + + // In order to take advantage of hfma2, we use fp16 for accumulation within threads and fp32 for accumulation + // between threads. 
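// The NPerBlock == 1 path below shows what this comment means in practice: weights and activations are
// consumed as half2, so each __hfma2 retires two multiply-adds; the per-thread partial sums stay in the
// half `accumulator` array, and they are widened to float (`reses` further down) before the
// __shfl_xor_sync reduction in Details::Layout::sync combines them across the warp.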
+ half accumulator[Num]; + for (int i = 0; i < Num; ++i) + { + accumulator[i] = __float2half_rn(0.f); + } + + // Iteration in k dimensions + for (int local_k = tid * Details::kElemsPerThread; local_k < k * Interleave; + local_k += BlockSize * Details::kElemsPerThread) + { + half weights_f16[Details::kElemsPerThread * NPerBlock]; + half scale[NPerBlock], zero[NPerBlock]; +#pragma unroll + for (int idx = 0; idx < NPerBlock; ++idx) + { + // Load quantized weight and scales/zeros + uint8_t weights_quantized[Details::kBytePerThread]; + load(weights_quantized, + qweight + idx * Interleave * k / Details::kElemsPerByte + local_k / Details::kElemsPerByte); + scale_loader.load(scale[idx], zero[idx], idx); + half weights_vec[Details::kElemsPerThread]; +#pragma unroll + for (int i = 0; i < Details::kConvertIters; ++i) + { + // Use cutlass::FastInterleavedAndBiasedNumericArrayConverter for I2F type conversion + assign(weights_vec + i * Details::kConvertCount, + Converter::convert(*reinterpret_cast( + weights_quantized + i * Details::kConvertCount / Details::kElemsPerByte))); + } +#pragma unroll + for (int i = 0; i < Details::kShuffleContinous; ++i) + { +#pragma unroll + for (int j = 0; j < Details::kShuffleStrided; ++j) + { + // Dequantize the weights and arrange the shuffled elements back to the correct order in the + // register array + half2 v = *reinterpret_cast(weights_vec + i * Details::kShuffleBasicTile + + j * Details::kShuffleContinous * Details::kShuffleBasicTile); + v = __hfma2(v, __half2half2(scale[idx]), __half2half2(zero[idx])); + weights_f16[(i * Details::kShuffleStrided * Details::kShuffleBasicTile + + j * Details::kShuffleBasicTile + 0) + * NPerBlock + + idx] + = v.x; + weights_f16[(i * Details::kShuffleStrided * Details::kShuffleBasicTile + + j * Details::kShuffleBasicTile + 1) + * NPerBlock + + idx] + = v.y; + } + } + } +#pragma unroll + for (int b = 0; b < Batch; ++b) + { + half in_v[Details::kElemsPerThread]; +#pragma unroll + for (int idx = 0; idx < Details::kActivationAccessNum; ++idx) + { + // load activation elements + load(in_v + idx * Details::kActivationElemNumPerAccess, + in + b * k + scale_loader.offset() + idx * Details::kActivationElemNumPerAccess); + } + // Perform vector inner product and accumulate + if constexpr (NPerBlock == 1) + { + half2 v = __float2half2_rn(0.f); +#pragma unroll + for (int y = 0; y < Details::kElemsPerThread; y += 2) + { + v = __hfma2(*reinterpret_cast(weights_f16 + y), *reinterpret_cast(in_v + y), v); + } + accumulator[b] += __hadd(v.x, v.y); + } + else + { +#pragma unroll + for (int x = 0; x < NPerBlock / 2; ++x) + { +#pragma unroll + for (int y = 0; y < Details::kElemsPerThread; ++y) + { + *reinterpret_cast(accumulator + b * NPerBlock + x * 2) + = __hfma2(*reinterpret_cast(weights_f16 + y * NPerBlock + x * 2), + __half2half2(in_v[y]), *reinterpret_cast(accumulator + b * NPerBlock + x * 2)); + } + } + } + } + scale_loader.advance(); + } + float reses[Num]; +#pragma unroll + for (int i = 0; i < Num; ++i) + { + reses[i] = __half2float(accumulator[i]); + } + + // Each warp completes the internal reduce and writes the [Batch * NPerBlock * Interleave] results to the + // corresponding address in shared memory + Details::Layout::sync(reses, sm); + + // Each thread is responsible for the accumulation and store to global memory of one element + for (int i = tid; i < Num * Interleave; i += BlockSize) + { + int nid = i % (NPerBlock * Interleave); + float v = 0.f; + for (int j = 0; j < BlockSize / WarpSize; ++j) + { + v += sm[j][i]; + } + float bias_v = 
0.f; + if constexpr (Bias) + { + bias_v = __half2float(bias[n_start_id + nid]); + } + int b = i / NPerBlock / Interleave; + out[b * n + n_start_id + nid] = __float2half_rn(ActOp::apply(v + bias_v)); + } +} + +template class ActOp, bool Zero, bool Bias, + int NPerBlock, int Batch, int BlockSize> +struct WeightOnlyBatchedGemvKernelLauncher +{ + static constexpr int kInterleave = WeightLayoutDetails::kInterleave; + + static void run(const WeightOnlyParams& params, cudaStream_t stream) + { + dim3 grid(params.n / NPerBlock / kInterleave); + dim3 block(BlockSize); + int size = sizeof(float) * BlockSize / 32 * Batch * NPerBlock * kInterleave; + weight_only_batched_gemv + <<>>( + params.qweight, params.scales, params.zeros, params.in, params.bias, params.out, params.n, params.k); + } +}; +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.cu new file mode 100644 index 00000000000..f04b2d354b7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.cu @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +template class ActOp, bool Zero, bool Bias, + int N_PER_BLOCK, int BATCH, int BLOCK_SIZE> +struct WeightOnlyBatchedGemvKernelLauncher +{ + static void run(const WeightOnlyParams& params, cudaStream_t stream); +}; + +template class ActOp, int N_PER_BLOCK, + int BATCH, int BLOCK_SIZE> +void select_zero_bias(const WeightOnlyParams& params, cudaStream_t stream) +{ + if (params.zeros && params.bias) + { + WeightOnlyBatchedGemvKernelLauncher::run(params, stream); + } + else if (params.zeros && !params.bias) + { + WeightOnlyBatchedGemvKernelLauncher::run(params, stream); + } + else if (!params.zeros && params.bias) + { + WeightOnlyBatchedGemvKernelLauncher::run(params, stream); + } + else + { + WeightOnlyBatchedGemvKernelLauncher::run(params, stream); + } +} + +template +void select_activation(WeightOnlyActivationType atype, const WeightOnlyParams& params, cudaStream_t stream) +{ + switch (atype) + { + case WeightOnlyActivationType::Gelu: + { + select_zero_bias(params, stream); + break; + } + case WeightOnlyActivationType::Relu: + { + select_zero_bias(params, stream); + break; + } + case WeightOnlyActivationType::Identity: + { + select_zero_bias(params, stream); + break; + } + default: + { + throw std::runtime_error("Use unsupported activation"); + break; + } + } +} + +template +void select_quant_type( + WeightOnlyQuantType qtype, WeightOnlyActivationType atype, const WeightOnlyParams& params, cudaStream_t stream) +{ + if (qtype == WeightOnlyQuantType::Int4b) + { + select_activation( + atype, params, stream); + } + else if (qtype == WeightOnlyQuantType::Int8b) + { + select_activation( + atype, params, stream); + } + else + { + throw std::runtime_error("Unknown QuantType"); + } +} + +template +void select_groupwise_weight_only(WeightOnlyQuantType qtype, WeightOnlyType wtype, WeightOnlyActivationType atype, + const WeightOnlyParams& params, cudaStream_t stream) +{ + if (wtype == WeightOnlyType::GroupWise && params.group_size == 64) + { + select_quant_type, N_PER_BLOCK, BATCH, BLOCK_SIZE>(qtype, atype, params, stream); + } + else if (wtype == WeightOnlyType::GroupWise && params.group_size == 128) + { + select_quant_type, N_PER_BLOCK, BATCH, BLOCK_SIZE>(qtype, atype, params, stream); + } + else + { + throw std::runtime_error("Only support groupwise weight only for gs=64/128"); + } +} + +void weight_only_batched_gemv_launcher(WeightOnlyQuantType qtype, WeightOnlyType wtype, WeightOnlyActivationType atype, + const WeightOnlyParams& params, cudaStream_t stream) +{ + if (wtype == WeightOnlyType::PerChannel) + { + if (qtype == WeightOnlyQuantType::Int4b) + { + switch (params.m) + { + case 1: + { + select_activation(atype, params, stream); + break; + } + case 2: + { + select_activation(atype, params, stream); + break; + } + case 3: + { + select_activation(atype, params, stream); + break; + } + case 4: + { + select_activation(atype, params, stream); + break; + } + default: + { + throw std::runtime_error("Weight only cuda kernel only supported bs <= 4"); + break; + } + } + } + else if (qtype == WeightOnlyQuantType::Int8b) + { + switch (params.m) + { + case 1: + { + select_activation(atype, params, stream); + break; + } + case 2: + { + select_activation(atype, params, stream); + break; + } + case 3: + { + select_activation(atype, params, stream); + break; + } + case 4: + { + select_activation(atype, params, stream); + break; 
+ } + default: + { + throw std::runtime_error("Weight only cuda kernel only supported bs <= 4"); + break; + } + } + } + } + else if (wtype == WeightOnlyType::GroupWise) + { + switch (params.m) + { + case 1: + { + select_groupwise_weight_only<2, 1, 256>(qtype, wtype, atype, params, stream); + break; + } + case 2: + { + select_groupwise_weight_only<2, 2, 256>(qtype, wtype, atype, params, stream); + break; + } + case 3: + { + select_groupwise_weight_only<2, 3, 128>(qtype, wtype, atype, params, stream); + break; + } + case 4: + { + select_groupwise_weight_only<2, 4, 128>(qtype, wtype, atype, params, stream); + break; + } + default: + { + throw std::runtime_error("Weight only cuda kernel only supported bs <= 4"); + break; + } + } + } +} +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h similarity index 52% rename from cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.h rename to cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h index ed04d343658..b4b032105e7 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h @@ -1,12 +1,11 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -15,18 +14,13 @@ * limitations under the License. */ #pragma once -#include -#include -#include +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" namespace tensorrt_llm { namespace kernels { - -void groupwise_weight_only_matmul_i2f_launcher(const int32_t* qweight, const half* qscales, const half* qzeros, - const half* in, const half* bias, half* out, const int batch, const int n, const int k, const int group_size, - cudaStream_t* stream); - -} // namespace kernels +void weight_only_batched_gemv_launcher(WeightOnlyQuantType qtype, WeightOnlyType wtype, WeightOnlyActivationType atype, + const WeightOnlyParams& params, cudaStream_t stream); +} } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h new file mode 100644 index 00000000000..4decc521653 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass_extensions/interleaved_numeric_conversion.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +__forceinline__ __device__ float copysignf_pos(float a, float b) +{ + float r; + r = __int_as_float(__float_as_int(a) | (__float_as_int(b) & 0x80000000)); + return r; +} + +__inline__ __device__ float tanh_opt(float x) +{ +#if (__CUDA_ARCH__ >= 750 && CUDART_VERSION >= 11000) + float r; + asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(r) : "f"(x)); + return r; +#else + const float exp_val = -1.f * fabs(2 * x); + return copysignf_pos((1.0f - __expf(exp_val)) / (__expf(exp_val) + 1.0f), x); +#endif +} + +template +struct GeluActivation +{ + static __device__ __forceinline__ T apply(const T& val) + { + const float cdf = 0.5f * (1.0f + tanh_opt((0.7978845608028654f * (val + 0.044715f * val * val * val)))); + return val * cdf; + } +}; + +template +struct ReluActivation +{ + static __device__ __forceinline__ T apply(const T& val) + { + return val > static_cast(0.0f) ? val : static_cast(0.0f); + } +}; + +template +struct IdentityActivation +{ + static __device__ __forceinline__ T apply(const T& val) + { + return val; + } +}; + +template +__device__ __forceinline__ void load(T0* dst, T1* src, size_t offset = 0) +{ + *reinterpret_cast(dst) = *(reinterpret_cast(src) + offset); +} + +template +__device__ __forceinline__ void assign(T* dst, const AssignType& val) +{ + *reinterpret_cast(dst) = val; +} + +template +__device__ __forceinline__ void store(T0* src, T1* dst, size_t offset = 0) +{ + *(reinterpret_cast(dst) + offset) = *reinterpret_cast(src); +} +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int4b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int4b.cu new file mode 100644 index 00000000000..cb9ea68fd35 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int4b.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 1, 256>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int8b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int8b.cu new file mode 100644 index 00000000000..59270fdd753 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs1Int8b.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 
2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 1, 256>; 
+template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 1, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 1, 256>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int4b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int4b.cu new file mode 100644 index 00000000000..1302e8dcefb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int4b.cu @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, 
false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 2, 256>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int8b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int8b.cu new file mode 100644 index 00000000000..72a515fe273 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs2Int8b.cu @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 2, 256>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 2, 256>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int4b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int4b.cu new file mode 100644 index 00000000000..4224bdac29a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int4b.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 
2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 3, 128>; 
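Note on the pattern used by these Bs*Int4b/Int8b translation units: each file exists to force the compiler to emit one specialization of WeightOnlyBatchedGemvKernelLauncher per configuration (activation, zero/bias handling, N-per-block, batch size, block size), so the template-heavy work is split across small files that can compile in parallel. What follows is a generic, hypothetical sketch of that explicit-instantiation pattern, with made-up names and a trivial body rather than the real launcher signature.

// Hypothetical stand-in for a launcher template, defined header-style so the
// explicit instantiations below can emit code for each configuration.
#include <cstdio>

template <bool Zero, bool Bias, int NPerBlock, int Batch, int BlockSize>
struct KernelLauncher
{
    static void run()
    {
        // A real launcher would derive grid/block dimensions from these
        // parameters and launch a CUDA kernel; here we only print them.
        std::printf("Zero=%d Bias=%d NPerBlock=%d Batch=%d BlockSize=%d\n",
            int(Zero), int(Bias), NPerBlock, Batch, BlockSize);
    }
};

// One explicit instantiation per supported configuration; spreading such
// lines across per-batch-size .cu files keeps each translation unit small.
template struct KernelLauncher<true, true, 2, 3, 128>;
template struct KernelLauncher<true, false, 2, 3, 128>;
template struct KernelLauncher<false, true, 2, 3, 128>;
template struct KernelLauncher<false, false, 2, 3, 128>;

int main()
{
    KernelLauncher<true, true, 2, 3, 128>::run();
    return 0;
}

In the real files, the equivalent of run() presumably launches a GEMV kernel tuned for the batch size and block size encoded in the instantiation, which is why the trailing integer parameters change between the Bs1/Bs2 (256-thread) and Bs3/Bs4 (128-thread) files.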
+template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 3, 128>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int8b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int8b.cu new file mode 100644 index 00000000000..032aea0cba8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs3Int8b.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + 
GeluActivation, false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 3, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 3, 128>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int4b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int4b.cu new file mode 100644 index 00000000000..b3049c70fa5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int4b.cu @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 4, 128>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int8b.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int8b.cu new file mode 100644 index 00000000000..66cebb38b0f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/weightOnlyBatchedGemvBs4Int8b.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 
2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" + +namespace tensorrt_llm +{ +namespace kernels +{ + +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, GeluActivation, + false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, ReluActivation, + false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + GeluActivation, false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, true, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + ReluActivation, false, false, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, true, false, 2, 4, 128>; 
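The Int4b variants pack two quantized weights per byte, halving weight-memory traffic relative to Int8b at the cost of an extra unpack-and-dequantize step inside the kernel. Below is a minimal host-side sketch of that step with an assumed zero-point and scale; the production kernels instead operate on interleaved weights through cutlass fast converters.

#include <cstdint>
#include <cstdio>

int main()
{
    // One byte holds two 4-bit weights; the values, zero-point and scale
    // here are assumed purely for illustration.
    uint8_t const packed = 0xB4;   // high nibble = 0xB (11), low nibble = 0x4 (4)
    float const scale = 0.05f;     // per-channel dequantization scale (assumed)
    int const zeroPoint = 8;       // assumed zero-point for signed weights

    int const w0 = (packed & 0x0F) - zeroPoint; // 4 - 8  = -4
    int const w1 = (packed >> 4) - zeroPoint;   // 11 - 8 =  3

    std::printf("dequantized: %f %f\n", w0 * scale, w1 * scale);
    return 0;
}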
+template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, true, 2, 4, 128>; +template struct WeightOnlyBatchedGemvKernelLauncher, + IdentityActivation, false, false, 2, 4, 128>; + +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.cu b/cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.cu deleted file mode 100644 index 5349ffdd919..00000000000 --- a/cpp/tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.cu +++ /dev/null @@ -1,236 +0,0 @@ -#include "cutlass/cutlass.h" -#include "cutlass_extensions/interleaved_numeric_conversion.h" -#include "tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.h" -#include -#include -#include -#include -#include - -namespace tensorrt_llm -{ -namespace kernels -{ -template -__global__ void groupwise_weight_only_matmul_i2f(const int32_t* qweight, const half* scales, const half* zeros, - const half* in, const half* bias, half* out, const int n, const int k) -{ - static_assert(N_PER_BLOCK == 1 || (N_PER_BLOCK % 2 == 0)); - using Converter = cutlass::FastInterleavedAndBiasedNumericArrayConverter; - extern __shared__ uint8_t shmem[]; - constexpr int Interleave = 4; - constexpr int NUM = BATCH * N_PER_BLOCK; - const int tid = threadIdx.x; - const int bid = blockIdx.x; - const int n_start_id = bid * N_PER_BLOCK * Interleave; - const int interleave_n_id = (tid / 2) % Interleave; - - qweight += n_start_id * k / 8; - scales += (n_start_id + interleave_n_id); - if constexpr (Zero) - { - zeros += (n_start_id + interleave_n_id); - } - float(*sm)[NUM * Interleave] = reinterpret_cast(shmem); - - half reses[NUM]; - for (int i = 0; i < NUM; ++i) - { - reses[i] = __float2half_rn(0.f); - } - - for (int local_k = tid * 32, real_k = tid / 8 * 64 + (tid % 2) * 32; local_k < k * Interleave; - local_k += BLOCK_SIZE * 32, real_k += BLOCK_SIZE * 32 / Interleave) - { - half weights_f16[32 * N_PER_BLOCK]; - half scale[N_PER_BLOCK], zero[N_PER_BLOCK]; -#pragma unroll - for (int idx = 0; idx < N_PER_BLOCK; ++idx) - { - uint8_t weights_i4[16]; - *reinterpret_cast(weights_i4) - = *reinterpret_cast(qweight + idx * Interleave * k / 8 + local_k / 8); - - scale[idx] = scales[real_k / GROUP_SIZE * n + idx * Interleave]; - if constexpr (Zero) - { - zero[idx] = zeros[real_k / GROUP_SIZE * n + idx * Interleave]; - } - else - { - zero[idx] = __float2half_rn(0.f); - } - half weights_vec[32]; -#pragma unroll - for (int i = 0; i < 4; ++i) - { - *reinterpret_cast(weights_vec + i * 8) - = Converter::convert(*reinterpret_cast(weights_i4 + i * 4)); - } -#pragma unroll - for (int i = 0; i < 4; ++i) - { -#pragma unroll - for (int j = 0; j < 4; ++j) - { - half2 v = *reinterpret_cast(weights_vec + i * 2 + j * 8); - v = __hfma2(v, __half2half2(scale[idx]), __half2half2(zero[idx])); - weights_f16[(i * 8 + j * 2 + 0) * N_PER_BLOCK + idx] = v.x; - weights_f16[(i * 8 + j * 2 + 1) * N_PER_BLOCK + idx] = v.y; - } - } - } - -#pragma unroll - for (int b = 0; b < BATCH; ++b) - { - half in_v[32]; -#pragma unroll - for (int idx = 0; idx < 4; ++idx) - { - *reinterpret_cast(in_v + idx * 8) - = *reinterpret_cast(in + b * k + real_k + idx * 8); - } - if constexpr (N_PER_BLOCK == 1) - { - half2 v = __float2half2_rn(0.f); -#pragma unroll - for (int y = 0; y < 32; y += 2) - { - v = __hfma2(*reinterpret_cast(weights_f16 + y), *reinterpret_cast(in_v + y), v); - } - reses[b] += __hadd(v.x, v.y); - } - else - { -#pragma unroll - for (int x = 0; x < N_PER_BLOCK / 2; ++x) - { 
-#pragma unroll - for (int y = 0; y < 32; ++y) - { - *reinterpret_cast(reses + b * N_PER_BLOCK + x * 2) - = __hfma2(*reinterpret_cast(weights_f16 + y * N_PER_BLOCK + x * 2), - __half2half2(in_v[y]), *reinterpret_cast(reses + b * N_PER_BLOCK + x * 2)); - } - } - } - } - } - float reses2[NUM]; -#pragma unroll - for (int i = 0; i < NUM; ++i) - { - reses2[i] = __half2float(reses[i]); - } -#pragma unroll - for (int i = 0; i < NUM; ++i) - { - reses2[i] += __shfl_xor_sync(~0, reses2[i], 16); - reses2[i] += __shfl_xor_sync(~0, reses2[i], 8); - reses2[i] += __shfl_xor_sync(~0, reses2[i], 1); - } - __syncthreads(); - int warp = tid / 32, lane = tid % 32; - if (lane == 0 || lane == 2 || lane == 4 || lane == 6) - { -#pragma unroll - for (int i = 0; i < NUM; ++i) - { - sm[warp][i * Interleave + lane / 2] = reses2[i]; - } - } - __syncthreads(); - for (int i = tid; i < NUM * Interleave; i += BLOCK_SIZE) - { - int nid = i % (N_PER_BLOCK * Interleave); - float v = 0.f; - for (int j = 0; j < BLOCK_SIZE / 32; ++j) - { - v += sm[j][i]; - } - float bias_v; - if constexpr (Bias) - { - bias_v = __half2float(bias[n_start_id + nid]); - } - else - { - bias_v = 0.f; - } - int b = i / N_PER_BLOCK / Interleave; - out[b * n + n_start_id + nid] = __float2half_rn(v + bias_v); - } -} - -#define RUN_groupwise_weight_only_matmul_i2f_2(Zero, Bias, N_PER_BLOCK, BATCH, BLOCKSIZE) \ - { \ - dim3 grid(n / N_PER_BLOCK / 4); \ - dim3 block(BLOCKSIZE); \ - int size = sizeof(float) * BLOCKSIZE / 32 * BATCH * N_PER_BLOCK * 4; \ - if (group_size == 64) \ - { \ - groupwise_weight_only_matmul_i2f \ - <<>>(qweight, qscales, qzeros, in, bias, out, n, k); \ - } \ - else if (group_size == 128) \ - { \ - groupwise_weight_only_matmul_i2f \ - <<>>(qweight, qscales, qzeros, in, bias, out, n, k); \ - } \ - else \ - { \ - printf("Invalid group size. 
Only group size 64 and 128 supported for fine grained kernels."); \ - std::abort(); \ - } \ - break; \ - } - -#define RUN_groupwise_weight_only_matmul_i2f_1(N_PER_BLOCK, BATCH, BLOCKSIZE) \ - { \ - if (qzeros && bias) \ - { \ - RUN_groupwise_weight_only_matmul_i2f_2(true, true, N_PER_BLOCK, BATCH, BLOCKSIZE); \ - } \ - else if (qzeros && !bias) \ - { \ - RUN_groupwise_weight_only_matmul_i2f_2(true, false, N_PER_BLOCK, BATCH, BLOCKSIZE); \ - } \ - else if (!qzeros && bias) \ - { \ - RUN_groupwise_weight_only_matmul_i2f_2(false, true, N_PER_BLOCK, BATCH, BLOCKSIZE); \ - } \ - else \ - { \ - RUN_groupwise_weight_only_matmul_i2f_2(false, false, N_PER_BLOCK, BATCH, BLOCKSIZE); \ - } \ - } - -void groupwise_weight_only_matmul_i2f_launcher(const int32_t* qweight, const half* qscales, const half* qzeros, - const half* in, const half* bias, half* out, const int batch, const int n, const int k, const int group_size, - cudaStream_t* stream) -{ - switch (batch) - { - case 1: RUN_groupwise_weight_only_matmul_i2f_1(2, 1, 256); - case 2: RUN_groupwise_weight_only_matmul_i2f_1(2, 2, 256); - case 3: RUN_groupwise_weight_only_matmul_i2f_1(2, 3, 128); - case 4: RUN_groupwise_weight_only_matmul_i2f_1(2, 4, 128); - case 5: RUN_groupwise_weight_only_matmul_i2f_1(2, 5, 128); - case 6: RUN_groupwise_weight_only_matmul_i2f_1(2, 6, 256); - case 7: RUN_groupwise_weight_only_matmul_i2f_1(2, 7, 128); - case 8: RUN_groupwise_weight_only_matmul_i2f_1(2, 8, 128); - case 9: RUN_groupwise_weight_only_matmul_i2f_1(2, 9, 128); - case 10: RUN_groupwise_weight_only_matmul_i2f_1(4, 10, 128); - case 11: RUN_groupwise_weight_only_matmul_i2f_1(4, 11, 128); - case 12: RUN_groupwise_weight_only_matmul_i2f_1(2, 12, 128); - case 13: RUN_groupwise_weight_only_matmul_i2f_1(4, 13, 128); - case 14: RUN_groupwise_weight_only_matmul_i2f_1(4, 14, 128); - case 15: RUN_groupwise_weight_only_matmul_i2f_1(4, 15, 128); - case 16: RUN_groupwise_weight_only_matmul_i2f_1(4, 16, 128); - default: printf("vecquant4matmul_nk_kernel_launcher invalid batch!! batch=%d ", batch); std::abort(); - } -} - -} // namespace kernels -} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.cu b/cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.cu deleted file mode 100644 index 79c798bc0e5..00000000000 --- a/cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.cu +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "stdio.h" -#include -#include -#include - -#include "cutlass/cutlass.h" -#include "cutlass_extensions/interleaved_numeric_conversion.h" -#include "tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.h" - -namespace tensorrt_llm -{ -namespace kernels -{ - -///////////////////////////////////////////////////////////////////// -/* Fast convert from weight only int8/int4 to half */ - -template -struct FastWeightOnlyHalfConverter; - -template <> -struct FastWeightOnlyHalfConverter -{ - using Converter = cutlass::FastInterleavedAndBiasedNumericArrayConverter; - static constexpr int kHalfLength = 4; - static constexpr int kWeightOnlyLength = 4; - - __device__ static inline void convert(half halves[kHalfLength], uint8_t chars[kWeightOnlyLength], half scale) - { - *reinterpret_cast(halves) - = Converter::convert(*reinterpret_cast(chars)); -#pragma unroll - for (int i = 0; i < kHalfLength; ++i) - { - halves[i] *= scale; - } - } -}; - -template <> -struct FastWeightOnlyHalfConverter -{ - using Converter = cutlass::FastInterleavedAndBiasedNumericArrayConverter; - static constexpr int kHalfLength = 8; - static constexpr int kWeightOnlyLength = 4; - - __device__ static inline void convert(half halves[kHalfLength], uint8_t chars[kWeightOnlyLength], half scale) - { - *reinterpret_cast(halves) - = Converter::convert(*reinterpret_cast(chars)); -#pragma unroll - for (int i = 0; i < kHalfLength; ++i) - { - halves[i] *= scale; - } - } -}; - -/* Activation */ - -__forceinline__ __device__ float copysignf_pos(float a, float b) -{ - float r; - r = __int_as_float(__float_as_int(a) | (__float_as_int(b) & 0x80000000)); - return r; -} - -__inline__ __device__ float tanh_opt(float x) -{ -#if (__CUDA_ARCH__ >= 750 && CUDART_VERSION >= 11000) - float r; - asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(r) : "f"(x)); - return r; -#else - const float exp_val = -1.f * fabs(2 * x); - return copysignf_pos((1.0f - __expf(exp_val)) / (__expf(exp_val) + 1.0f), x); -#endif -} - -template -struct GeluActivation -{ - static __device__ __forceinline__ T apply(const T& val) - { - const float cdf = 0.5f * (1.0f + tanh_opt((0.7978845608028654f * (val + 0.044715f * val * val * val)))); - return val * cdf; - } -}; - -template -struct ReluActivation -{ - static __device__ __forceinline__ T apply(const T& val) - { - return val > static_cast(0.0f) ? val : static_cast(0.0f); - } -}; - -template -struct IdentityActivation -{ - static __device__ __forceinline__ T apply(const T& val) - { - return val; - } -}; - -template -__device__ __forceinline__ void load(T0* dst, T1* src, size_t offset = 0) -{ - *reinterpret_cast(dst) = *(reinterpret_cast(src) + offset); -} - -template -__device__ __forceinline__ void store(T0* src, T1* dst, size_t offset = 0) -{ - *(reinterpret_cast(dst) + offset) = *reinterpret_cast(src); -} - -template class Activation, int K = 0> -__global__ void int8_weight_only_gemv_interleave(const int8_t* weight, const half* input, const half* scale_list, - const half* bias, half* output, const int n, const int k_) -{ - using Converter = FastWeightOnlyHalfConverter; - int k = K != 0 ? 
K : k_; - uint8_t vec_weight[16]; - half vec_input[16]; - half vec_weight_f16[16]; - int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32; - int tile_id = blockIdx.x * blockDim.x / 32 + warp_id; - // Every two rows of the original weights are interleaved into a row with stride of 64, so if each thread - // processes 16 elements(for int8, we can use ldg.128 to load weights), then every group of four adjacent threads - // will alternately process two different row weights - // for example - // every 128 consecutive int8 elements [128*i, 128*(i+1)-1] of row N under interleave layout, - // the first 64 are from [64*i, 64*(i+1)-1] of row 2N before interleaving, - // and the last 64 are from [64*i, 64*(i+1)-1] of row 2N+1 before interleaving. - // So if each thread loads 16 int8 elements, then the elements of the first four and last four threads of each 8 - // consecutive threads will come from row 2N and row 2N+1 respectively before interleaving. - int row_id = tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0); - weight += tile_id * k * 2; - float v = 0.f, v_bias; - half scale = scale_list[row_id]; - if (Bias) - { - v_bias = __half2float(bias[row_id]); - } -#pragma unroll - for (int i = lane_id * 16; i < k * 2; i += 16 * 32) - { - load(vec_weight, weight + i); - load(vec_input, input + i / 128 * 64 + (i % 64)); - load(vec_input + 8, input + i / 128 * 64 + (i % 64) + 8); -#pragma unroll - for (int p = 0; p < 16; p += Converter::kHalfLength) - { - // The rearrangement here counteracts the effect of cutlass::add_bias_and_interleave_int8s_inplace - // Input int8 data layout - // [elt_3 elt_1 elt_2 elt_0] (each elt occupies 8 bits) - // - // Converted fp16 data layout - // [elt_3 elt_2 elt_1 elt_0] (each elt occupies 16 bits) - Converter::convert(vec_weight_f16 + p, vec_weight + p, scale); - } -#pragma unroll - for (int p = 0; p < 16; ++p) - { - // The index remapping here is to counteracts the effect of cutlass::permute_B_rows_for_mixed_gemm - // input 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - // weight 0 1 8 9 2 3 10 11 4 5 12 13 6 7 14 15 - v += __half2float(__hmul(vec_input[p], vec_weight_f16[4 * ((p % 8) / 2) + p % 2 + 2 * (p / 8)])); - } - } - v += __shfl_xor_sync(0xffffffff, v, 16); - v += __shfl_xor_sync(0xffffffff, v, 8); - v += __shfl_xor_sync(0xffffffff, v, 2); - v += __shfl_xor_sync(0xffffffff, v, 1); - if (lane_id == 0 || lane_id == 4) - { - if (Bias) - { - output[row_id] = __float2half_rn(Activation::apply(v + v_bias)); - } - else - { - output[row_id] = __float2half_rn(Activation::apply(v)); - } - } -} - -template class Activation, int K = 0> -__global__ void int4_weight_only_gemv_interleave(const int8_t* weight, const half* input, const half* scale_list, - const half* bias, half* output, const int n, const int k_) -{ - using Converter = FastWeightOnlyHalfConverter; - int k = K != 0 ? 
K : k_; - uint8_t vec_weight[16]; - half vec_input[32]; - half vec_weight_f16[32]; - int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32; - int tile_id = blockIdx.x * blockDim.x / 32 + warp_id; - // Every four rows of the original weights are interleaved into a row with stride of 64, so if each thread - // processes 32 elements(for int4, we can use ldg.128 to load weights), then every group of two adjacent threads - // will alternately process four different row weights - // for example - // every 256 consecutive int4 elements [256*i, 256*(i+1)-1] of row N under interleave layout, - // the first 64 are from [64*i, 64*(i+1)-1] of row 4N before interleaving, - // and the second 64 are from [64*i, 64*(i+1)-1] of row 4N+1 before interleaving, and so on. - // So if each thread loads 32 int4 elements, then the elements of each 2 adjacent threads of each 8 - // consecutive threads will come from row 4N ~ 4N+3 respectively before interleaving. - int row_id = tile_id * 4 + ((lane_id % 8) / 2); - weight += tile_id * k / 2 * 4; - float v = 0.f, v_bias; - half scale = scale_list[row_id]; - if (Bias) - { - v_bias = __half2float(bias[row_id]); - } -#pragma unroll - for (int i = lane_id * 32; i < k * 4; i += 32 * 32) - { - load(vec_weight, weight + i / 2); - load(vec_input, input + i / 256 * 64 + (i % 64)); - load(vec_input + 8, input + i / 256 * 64 + (i % 64) + 8); - load(vec_input + 16, input + i / 256 * 64 + (i % 64) + 16); - load(vec_input + 24, input + i / 256 * 64 + (i % 64) + 24); -#pragma unroll - for (int p = 0; p < 32; p += Converter::kHalfLength) - { - // The rearrangement here counteracts the effect of cutlass::add_bias_and_interleave_int4s_inplace - // Input int8 data layout - // [elt_7 elt_5 elt_3 elt_1 elt_6 elt_4 elt_2 elt_0] (each elt occupies 4 bits) - // - // Converted fp16 data layout - // [elt_7 elt_6 elt_5 elt_4 elt_3 elt_2 elt_1 elt_0] (each elt occupies 16 bits) - Converter::convert(vec_weight_f16 + p, vec_weight + p / 2, scale); - } -#pragma unroll - for (int p = 0; p < 32; ++p) - { - // The index remapping here is to counteracts the effect of cutlass::permute_B_rows_for_mixed_gemm - // input 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ... 
31 - // weight 0 1 8 9 16 17 24 25 2 3 10 11 18 19 26 27 4 5 12 13 20 21 28 29 6 7 14 15 22 23 30 31 - v += __half2float(__hmul(vec_input[p], vec_weight_f16[8 * ((p % 8) / 2) + p % 2 + 2 * (p / 8)])); - } - } - v += __shfl_xor_sync(0xffffffff, v, 16); - v += __shfl_xor_sync(0xffffffff, v, 8); - v += __shfl_xor_sync(0xffffffff, v, 1); - if (lane_id == 0 || lane_id == 2 || lane_id == 4 || lane_id == 6) - { - if (Bias) - { - output[row_id] = __float2half_rn(Activation::apply(v + v_bias)); - } - else - { - output[row_id] = __float2half_rn(Activation::apply(v)); - } - } -} - -template class Activation, int K = 0> -void weight_only_gemv_kernel_launcher(const int8_t* weight, const half* input, const half* scale_list, const half* bias, - half* output, const int k, const int n, dim3 grid, dim3 block, QuantType qtype, cudaStream_t stream) -{ - if (qtype == QuantType::PACKED_INT4_WEIGHT_ONLY) - { - grid.x /= 2; - int4_weight_only_gemv_interleave - <<>>(weight, input, scale_list, bias, output, n, k); - } - else if (qtype == QuantType::INT8_WEIGHT_ONLY) - { - int8_weight_only_gemv_interleave - <<>>(weight, input, scale_list, bias, output, n, k); - } -} - -#define INVOKE_WEIGHT_ONLY_GEMV(ActivationType, K) \ - do \ - { \ - if (bias) \ - { \ - weight_only_gemv_kernel_launcher( \ - weight, input, scale_list, bias, output, k, n, grid, block, qtype, stream); \ - } \ - else \ - { \ - weight_only_gemv_kernel_launcher( \ - weight, input, scale_list, bias, output, k, n, grid, block, qtype, stream); \ - } \ - } while (0); - -#define SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, K) \ - case K: \ - { \ - INVOKE_WEIGHT_ONLY_GEMV(ActivationType, K); \ - break; \ - } -#define INVOKE_WEIGHT_ONLY_KERNEL_FOR_SPECIFIED_SHAPE(ActivationType) \ - do \ - { \ - switch (k) \ - { \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 1536) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 2048) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 2560) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 4096) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 4608) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 5120) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 6144) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 7680) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 8192) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 10240) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 12288) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 15360) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 16384) \ - SWITCH_CASE_FOR_DIFFERENT_K(ActivationType, 20480) \ - default: \ - { \ - INVOKE_WEIGHT_ONLY_GEMV(ActivationType, 0); \ - break; \ - } \ - } \ - } while (0); -#define INVOKE_WEIGHT_ONLY_KERNEL_FOR_DIFFERENT_ACT() \ - do \ - { \ - switch (activation) \ - { \ - case ActivationType::Gelu: \ - { \ - INVOKE_WEIGHT_ONLY_KERNEL_FOR_SPECIFIED_SHAPE(GeluActivation); \ - break; \ - } \ - case ActivationType::Relu: \ - { \ - INVOKE_WEIGHT_ONLY_KERNEL_FOR_SPECIFIED_SHAPE(ReluActivation); \ - break; \ - } \ - case ActivationType::Identity: \ - { \ - INVOKE_WEIGHT_ONLY_KERNEL_FOR_SPECIFIED_SHAPE(IdentityActivation); \ - break; \ - } \ - default: \ - { \ - assert(false); \ - break; \ - } \ - } \ - } while (0); - -template <> -void weight_only_gemv_launcher(const half* input, const int8_t* weight, const half* scale_list, const half* bias, - half* output, const int k, const int n, ActivationType activation, QuantType qtype, cudaStream_t stream) -{ - dim3 block(512); - dim3 grid(n / 32); - INVOKE_WEIGHT_ONLY_KERNEL_FOR_DIFFERENT_ACT(); -} - -} // namespace kernels -} // 
namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.h b/cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.h deleted file mode 100644 index 7deb48cd70e..00000000000 --- a/cpp/tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "tensorrt_llm/common/int8Utils.cuh" -#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h" -#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" -#include -#include -#include -#include -#include -#include - -namespace tensorrt_llm -{ -namespace kernels -{ - -using cutlass_kernels::QuantType; -using cutlass_kernels::ActivationType; - -template -void weight_only_gemv_launcher(const AT* input, const WT* weight, const AT* scale_list, const AT* bias, AT* output, - const int k, const int n, ActivationType activation, QuantType qtype, cudaStream_t stream) -{ - assert(false); -} - -template <> -void weight_only_gemv_launcher(const half* input, const int8_t* weight, const half* scale_list, const half* bias, - half* output, const int k, const int n, ActivationType activation, QuantType qtype, cudaStream_t stream); -} // namespace kernels -} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu b/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu index 2c199f0eca0..61059967b2e 100644 --- a/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu +++ b/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu @@ -28,7 +28,7 @@ namespace layers __global__ void update_indir_cache_kernel(int* tgt_indir_cache, const int* src_indir_cache, const int** parent_ids, const bool* finished, const int* sequence_lengths, const int* input_lengths, int batch_dim, int local_batch_size, - int beam_width, int max_seq_len, int max_input_length) + int beam_width, int max_seq_len) { int time_step = threadIdx.x + blockIdx.x * blockDim.x; int bb_id = threadIdx.y + blockIdx.y * blockDim.y; @@ -36,34 +36,31 @@ __global__ void update_indir_cache_kernel(int* tgt_indir_cache, const int* src_i const int input_length{input_lengths == nullptr ? 0 : input_lengths[bb_id]}; const int batch_id = bb_id / beam_width; const int beam_id = bb_id % beam_width; - if (bb_id >= beam_width * local_batch_size || time_step < input_length || time_step < max_input_length - || finished[bb_id]) + if (bb_id >= beam_width * local_batch_size || time_step < input_length || finished[bb_id]) { return; } int time_step_circ = time_step % max_seq_len; // FIXME: we will remove all paddings later (@boyang) // Skip input paddings when updating the indir cache table. - const int pad_len = max_input_length - input_length; - time_step_circ = time_step_circ >= max_input_length ? 
(time_step_circ - pad_len) : time_step_circ; const int src_beam = parent_ids[batch_id][beam_id * max_seq_len + current_step]; - const uint tgt_offset = batch_id * beam_width * max_seq_len + beam_id * max_seq_len + time_step_circ; - const uint src_offset = batch_id * beam_width * max_seq_len + src_beam * max_seq_len + time_step_circ; + const uint32_t tgt_offset = batch_id * beam_width * max_seq_len + beam_id * max_seq_len + time_step_circ; + const uint32_t src_offset = batch_id * beam_width * max_seq_len + src_beam * max_seq_len + time_step_circ; tgt_indir_cache[tgt_offset] = (time_step == current_step) ? beam_id : src_indir_cache[src_offset]; } void update_indir_cache_kernelLauncher(int* tgt_indir_cache, const int* src_indir_cache, const int** parent_ids, const bool* finished, const int* sequence_lengths, const int* input_lengths, int batch_dim, int local_batch_size, - int beam_width, int max_seq_len, int max_input_length, cudaStream_t stream) + int beam_width, int max_seq_len, cudaStream_t stream) { const dim3 block(32); // Update indirections steps [input_length[bb_id], sequence_lengths[bb_id]], included const dim3 grid((max_seq_len + block.x - 1) / block.x, local_batch_size * beam_width); update_indir_cache_kernel<<>>(tgt_indir_cache, src_indir_cache, parent_ids, finished, - sequence_lengths, input_lengths, batch_dim, local_batch_size, beam_width, max_seq_len, max_input_length); + sequence_lengths, input_lengths, batch_dim, local_batch_size, beam_width, max_seq_len); } template @@ -129,16 +126,16 @@ void BaseBeamSearchLayer::forward(BeamSearchOutputParams& outputs, ForwardPar TLLM_LOG_DEBUG("%s", __PRETTY_FUNCTION__); Tensor& output_ids_ptr = outputs.output_ids_ptr; - const int batch_size{output_ids_ptr.shape[0]}; - const int beam_width{output_ids_ptr.shape[1]}; - const int max_seq_len{output_ids_ptr.shape[2]}; + const auto batch_size = static_cast(output_ids_ptr.shape[0]); + const auto beam_width = static_cast(output_ids_ptr.shape[1]); + const auto max_seq_len = static_cast(output_ids_ptr.shape[2]); allocateBuffer(batch_size, beam_width); TLLM_CHECK_WITH_INFO(params.ite == 0, "Pipeline Parallelism is not supported yet !"); const int ite{params.ite}; Tensor const& logits = params.logits; - const int local_batch_size = logits.shape[0]; + const auto local_batch_size = logits.shape[0]; const T* embedding_bias = params.embedding_bias ? 
params.embedding_bias->template getPtr() : nullptr; @@ -148,8 +145,8 @@ void BaseBeamSearchLayer::forward(BeamSearchOutputParams& outputs, ForwardPar invokeAddBiasApplyPenalties(logits.getPtr(), output_ids_ptr.template getPtr(), outputs.parent_ids_ptr.template getPtr(), input_lengths, sequence_length, embedding_bias, ite, - params.max_input_length, local_batch_size, batch_size, beam_width, vocab_size_, vocab_size_padded_, end_ids, - mTemperature, mRepetitionPenalty, mRepetitionPenaltyType, mMinLength, max_seq_len, stream_); + local_batch_size, batch_size, beam_width, vocab_size_, vocab_size_padded_, end_ids, mTemperature, + mRepetitionPenalty, mRepetitionPenaltyType, mMinLength, max_seq_len, stream_); sync_check_cuda_error(); invokeSoftMax(outputs, params); @@ -159,8 +156,7 @@ void BaseBeamSearchLayer::forward(BeamSearchOutputParams& outputs, ForwardPar update_indir_cache_kernelLauncher(outputs.tgt_cache_indirection.template getPtr(), params.src_cache_indirection.template getPtr(), outputs.parent_ids_ptr.template getPtr(), outputs.finished->template getPtr(), - sequence_length, input_lengths, batch_size, local_batch_size, beam_width, max_seq_len, - params.max_input_length, stream_); + sequence_length, input_lengths, batch_size, local_batch_size, beam_width, max_seq_len, stream_); sync_check_cuda_error(); } sync_check_cuda_error(); diff --git a/cpp/tensorrt_llm/layers/baseBeamSearchLayer.h b/cpp/tensorrt_llm/layers/baseBeamSearchLayer.h index f0fd7ede40c..ec3bec7625d 100644 --- a/cpp/tensorrt_llm/layers/baseBeamSearchLayer.h +++ b/cpp/tensorrt_llm/layers/baseBeamSearchLayer.h @@ -54,17 +54,15 @@ class BaseBeamSearchLayer : public BaseLayer class ForwardParams : public SoftmaxParams { public: - ForwardParams(int step, int ite, int max_input_length, tc::Tensor logits, tc::Tensor endIds, - tc::Tensor src_cache_indirection, int max_seq_len) + ForwardParams( + int step, int ite, tc::Tensor logits, tc::Tensor endIds, tc::Tensor src_cache_indirection, int max_seq_len) : SoftmaxParams(step, ite, std::move(logits), std::move(endIds)) - , max_input_length{max_input_length} , src_cache_indirection{std::move(src_cache_indirection)} , max_seq_len{max_seq_len} { } // mandatory parameters - int max_input_length; int max_seq_len; tc::Tensor src_cache_indirection; // [local_batch_size, beam_width, max_seq_len] diff --git a/cpp/tensorrt_llm/layers/baseSamplingLayer.cpp b/cpp/tensorrt_llm/layers/baseSamplingLayer.cpp index 1cc7ab27ca7..0f8707b5f8a 100644 --- a/cpp/tensorrt_llm/layers/baseSamplingLayer.cpp +++ b/cpp/tensorrt_llm/layers/baseSamplingLayer.cpp @@ -177,7 +177,7 @@ void BaseSamplingLayer::forward(DecodingOutputParams& outputs, ForwardParams auto const local_batch_size = params.logits.shape[0]; auto const ite = params.ite; auto const step = params.step; - auto const max_input_length = params.max_input_length; + auto* const input_lengths = params.input_lengths ? params.input_lengths->template getPtr() : nullptr; auto* logits = params.logits.template getPtr(); @@ -219,24 +219,16 @@ void BaseSamplingLayer::forward(DecodingOutputParams& outputs, ForwardParams = params.input_lengths ? 
params.input_lengths->template getPtr() : nullptr; invokeBatchApplyRepetitionPenalty(logits, repetition_penalty_buf_ + ite * local_batch_size, outputs.output_ids_ptr.template getPtr(), outputs.sequence_length->getPtr(), - batch_size, local_batch_size, vocab_size_padded_, input_lengths, max_input_length, - repetition_penalty_type_, params.max_seq_len, stream_); + batch_size, local_batch_size, vocab_size_padded_, input_lengths, repetition_penalty_type_, + params.max_seq_len, stream_); sync_check_cuda_error(); } } - const int num_generated_tokens = step - max_input_length; - const auto min_lengths = std::begin(mMinLengths) + ite * local_batch_size; - const bool invoke_min_length_penalty = std::any_of( - min_lengths, min_lengths + local_batch_size, [&](int min_length) { return min_length > num_generated_tokens; }); - if (invoke_min_length_penalty) - { - auto* end_ids = params.end_ids.template getPtr(); - invokeMinLengthPenalty(logits, min_lengths_buf_ + ite * local_batch_size, end_ids, - outputs.sequence_length->getPtr(), max_input_length, local_batch_size, vocab_size_padded_, - stream_); - sync_check_cuda_error(); - } + auto* end_ids = params.end_ids.template getPtr(); + invokeMinLengthPenalty(logits, min_lengths_buf_ + ite * local_batch_size, end_ids, + outputs.sequence_length->getPtr(), input_lengths, local_batch_size, vocab_size_padded_, stream_); + sync_check_cuda_error(); #undef ALL_OF runSampling(outputs, params); diff --git a/cpp/tensorrt_llm/layers/baseSamplingLayer.h b/cpp/tensorrt_llm/layers/baseSamplingLayer.h index fa35a261ec0..3ac1f699717 100644 --- a/cpp/tensorrt_llm/layers/baseSamplingLayer.h +++ b/cpp/tensorrt_llm/layers/baseSamplingLayer.h @@ -54,15 +54,13 @@ class BaseSamplingLayer : public BaseLayer class ForwardParams : public DecodingParams { public: - ForwardParams(int step, int ite, int max_input_length, tc::Tensor logits, tc::Tensor end_ids, int max_seq_len) + ForwardParams(int step, int ite, tc::Tensor logits, tc::Tensor end_ids, int max_seq_len) : DecodingParams{step, ite, std::move(logits), std::move(end_ids)} - , max_input_length{max_input_length} , max_seq_len{max_seq_len} { } // mandatory parameters - int max_input_length; int max_seq_len; // optional parameters diff --git a/cpp/tensorrt_llm/layers/dynamicDecodeLayer.cpp b/cpp/tensorrt_llm/layers/dynamicDecodeLayer.cpp index d597d1510b3..7849debf008 100644 --- a/cpp/tensorrt_llm/layers/dynamicDecodeLayer.cpp +++ b/cpp/tensorrt_llm/layers/dynamicDecodeLayer.cpp @@ -267,8 +267,7 @@ void DynamicDecodeLayer::forward(OutputParams& outputs, ForwardParams const& } // common inputs - auto const max_input_length = params.max_input_length; - auto const& end_id = params.end_ids; + auto const& end_ids = params.end_ids; // dynamic decode GPT if (beam_width > 1) @@ -286,7 +285,7 @@ void DynamicDecodeLayer::forward(OutputParams& outputs, ForwardParams const& const size_t dynamic_decode_batch_size = has_diff_runtime_args_ ? 
1 : local_batch_size; const int dynamic_decode_total_iteration = local_batch_size / dynamic_decode_batch_size; - for (uint dynamic_ite = ite * dynamic_decode_total_iteration; + for (uint32_t dynamic_ite = ite * dynamic_decode_total_iteration; dynamic_ite < (ite + 1) * dynamic_decode_total_iteration; ++dynamic_ite) { const int dynamic_id_offset = dynamic_ite * dynamic_decode_batch_size * beam_width; @@ -295,9 +294,9 @@ void DynamicDecodeLayer::forward(OutputParams& outputs, ForwardParams const& auto const logits_offset = logits.slice( {dynamic_decode_batch_size, logits.shape[1], logits.shape[2]}, dynamic_decode_vocab_size_units_offset); auto const end_id_offset - = end_id.slice({dynamic_decode_batch_size}, dynamic_ite * dynamic_decode_batch_size); - typename BaseBeamSearchLayer::ForwardParams dynamic_decode_input_tensors{ - step, ite, max_input_length, logits_offset, end_id_offset, *params.src_cache_indirection, max_seq_len}; + = end_ids.slice({dynamic_decode_batch_size}, dynamic_ite * dynamic_decode_batch_size); + typename BaseBeamSearchLayer::ForwardParams dynamic_decode_input_tensors{step, ite, logits_offset, + end_id_offset, *params.src_cache_indirection, static_cast(max_seq_len)}; dynamic_decode_input_tensors.embedding_bias = params.embedding_bias; @@ -337,9 +336,9 @@ void DynamicDecodeLayer::forward(OutputParams& outputs, ForwardParams const& Tensor const logits_slice{ logits.slice({local_batch_size, beam_width, logits.shape[2]}, local_batch_offset * logits.shape[2])}; - Tensor const end_id_slice{end_id.slice({local_batch_size}, ite * local_batch_size)}; + Tensor const end_id_slice{end_ids.slice({local_batch_size}, ite * local_batch_size)}; typename BaseSamplingLayer::ForwardParams decode_input_tensors{ - step, ite, max_input_length, logits_slice, end_id_slice, max_seq_len}; + step, ite, logits_slice, end_id_slice, static_cast(max_seq_len)}; decode_input_tensors.embedding_bias = params.embedding_bias; @@ -368,11 +367,13 @@ void DynamicDecodeLayer::forward(OutputParams& outputs, ForwardParams const& } if (outputs.output_log_probs) { + auto const generationStep = step - params.max_input_length; + TLLM_CHECK(generationStep >= 0); Tensor& output_log_probs = outputs.output_log_probs.value(); - size_t step_offset = (step - max_input_length) * batch_size * beam_width; - decode_outputs.output_log_probs = output_log_probs.slice( - {output_log_probs.shape[0] - (step - max_input_length), local_batch_size * beam_width}, - step_offset + local_batch_offset); + size_t step_offset = generationStep * batch_size * beam_width; + decode_outputs.output_log_probs + = output_log_probs.slice({output_log_probs.shape[0] - generationStep, local_batch_size * beam_width}, + step_offset + local_batch_offset); } // Run topk / topp decode layers. 
diff --git a/cpp/tensorrt_llm/layers/onlineBeamSearchLayer.cu b/cpp/tensorrt_llm/layers/onlineBeamSearchLayer.cu index 340662a24c1..1b1dda10fd1 100644 --- a/cpp/tensorrt_llm/layers/onlineBeamSearchLayer.cu +++ b/cpp/tensorrt_llm/layers/onlineBeamSearchLayer.cu @@ -98,12 +98,12 @@ void OnlineBeamSearchLayer::invokeSoftMax(BeamSearchOutputParams& outputs, So { TLLM_LOG_DEBUG("%s", __PRETTY_FUNCTION__); Tensor const& output_ids_ptr = outputs.output_ids_ptr; - const int batch_size{output_ids_ptr.shape[0]}; - const int beam_width{output_ids_ptr.shape[1]}; - const int max_seq_len{output_ids_ptr.shape[2]}; + const auto batch_size = static_cast(output_ids_ptr.shape[0]); + const auto beam_width = static_cast(output_ids_ptr.shape[1]); + const auto max_seq_len = static_cast(output_ids_ptr.shape[2]); const int ite{params.ite}; Tensor const& logits{params.logits}; - const int local_batch_size = logits.shape[0]; + const auto local_batch_size = logits.shape[0]; BeamHypotheses beamHypotheses; auto* const end_ids = params.end_ids.template getPtr(); diff --git a/cpp/tensorrt_llm/layers/topPSamplingLayer.cu b/cpp/tensorrt_llm/layers/topPSamplingLayer.cu index 4cd9831bb23..f6bb8c8f8ae 100644 --- a/cpp/tensorrt_llm/layers/topPSamplingLayer.cu +++ b/cpp/tensorrt_llm/layers/topPSamplingLayer.cu @@ -232,7 +232,6 @@ void TopPSamplingLayer::runSampling(DecodingOutputParams& outputs, DecodingPa auto const batch_size = outputs.output_ids_ptr.shape[0]; auto const local_batch_size = params.logits.shape[0]; auto const ite = params.ite; - auto const step = params.step; // in case of skip any, the logit value is already copied and processed. auto* logits = !skip_any_ ? params.logits.template getPtr() : runtime_logits_buf_; diff --git a/cpp/tensorrt_llm/plugins/CMakeLists.txt b/cpp/tensorrt_llm/plugins/CMakeLists.txt index 0793d339f67..0a6583c851d 100755 --- a/cpp/tensorrt_llm/plugins/CMakeLists.txt +++ b/cpp/tensorrt_llm/plugins/CMakeLists.txt @@ -60,11 +60,15 @@ add_subdirectory(common) # Set gencodes list(APPEND PLUGIN_SOURCES "${PLUGIN_CU_SOURCES}") -list(APPEND PLUGIN_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/api/InferPlugin.cpp") +list(APPEND PLUGIN_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/api/tllmPlugin.cpp") # ################################# SHARED LIBRARY # ############################################################################## +if(MSVC) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS 1) +endif() + add_library(${PLUGIN_SHARED_TARGET} SHARED ${PLUGIN_SOURCES}) target_include_directories( @@ -85,12 +89,14 @@ set_target_properties( LIBRARY_OUTPUT_DIRECTORY "${TRT_OUT_DIR}" RUNTIME_OUTPUT_DIRECTORY "${TRT_OUT_DIR}") -set_target_properties( - ${PLUGIN_SHARED_TARGET} - PROPERTIES - LINK_FLAGS - "-Wl,--exclude-libs,ALL -Wl,--version-script=${PLUGIN_EXPORT_MAP} -Wl,--no-undefined" -) +if(NOT MSVC) + set_target_properties( + ${PLUGIN_SHARED_TARGET} + PROPERTIES + LINK_FLAGS + "-Wl,--exclude-libs,ALL -Wl,--version-script=${PLUGIN_EXPORT_MAP} ${UNDEFINED_FLAG}" + ) +endif() set_target_properties( ${PLUGIN_SHARED_TARGET} PROPERTIES VERSION ${TRT_VERSION} SOVERSION @@ -102,7 +108,6 @@ target_link_libraries( ${PLUGIN_SHARED_TARGET} ${CUBLAS_LIB} ${CUBLASLT_LIB} - ${CUDART_LIB} ${CUDNN_LIB} nvinfer ${CUDA_DRV_LIB} diff --git a/cpp/tensorrt_llm/plugins/api/InferPlugin.cpp b/cpp/tensorrt_llm/plugins/api/InferPlugin.cpp deleted file mode 100644 index c14452ba1b4..00000000000 --- a/cpp/tensorrt_llm/plugins/api/InferPlugin.cpp +++ /dev/null @@ -1,179 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "NvInfer.h" -#include "NvInferPlugin.h" -#include "tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h" -#include "tensorrt_llm/plugins/common/checkMacrosPlugin.h" -#include "tensorrt_llm/plugins/common/plugin.h" -#include "tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h" -#include "tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h" -#include "tensorrt_llm/plugins/identityPlugin/identityPlugin.h" -#include "tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h" -#include "tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h" -#include "tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h" -#if ENABLE_MULTI_DEVICE -#include "tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h" -#include "tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h" -#include "tensorrt_llm/plugins/ncclPlugin/recvPlugin.h" -#include "tensorrt_llm/plugins/ncclPlugin/sendPlugin.h" -#endif // ENABLE_MULTI_DEVICE -#include "tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h" -#include "tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h" -#include "tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h" -#include "tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h" -#include "tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h" -#include "tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h" -#include "tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h" -#include -#include -#include -#include -#include -#include -#include -using namespace nvinfer1; -using namespace nvinfer1::plugin; - -namespace nvinfer1 -{ - -namespace -{ - -// This singleton ensures that each plugin is only registered once for a given -// namespace and type, and attempts of duplicate registration are ignored. 
-class PluginCreatorRegistry -{ -public: - static PluginCreatorRegistry& getInstance() - { - static PluginCreatorRegistry instance; - return instance; - } - - template - void addPluginCreator(void* logger, const char* libNamespace) - { - // Make accesses to the plugin creator registry thread safe - std::lock_guard lock(mRegistryLock); - - std::string errorMsg; - std::string verboseMsg; - - std::unique_ptr pluginCreator{new CreatorType{}}; - pluginCreator->setPluginNamespace(libNamespace); - - nvinfer1::ILogger* trtLogger = static_cast(logger); - std::string pluginType = std::string{pluginCreator->getPluginNamespace()} - + "::" + std::string{pluginCreator->getPluginName()} + " version " - + std::string{pluginCreator->getPluginVersion()}; - - if (mRegistryList.find(pluginType) == mRegistryList.end()) - { - bool status = getPluginRegistry()->registerCreator(*pluginCreator, libNamespace); - if (status) - { - mRegistry.push(std::move(pluginCreator)); - mRegistryList.insert(pluginType); - verboseMsg = "Registered plugin creator - " + pluginType; - } - else - { - errorMsg = "Could not register plugin creator - " + pluginType; - } - } - else - { - verboseMsg = "Plugin creator already registered - " + pluginType; - } - - if (trtLogger) - { - if (!errorMsg.empty()) - { - trtLogger->log(ILogger::Severity::kERROR, errorMsg.c_str()); - } - - if (!verboseMsg.empty()) - { - trtLogger->log(ILogger::Severity::kVERBOSE, verboseMsg.c_str()); - } - } - } - - ~PluginCreatorRegistry() - { - std::lock_guard lock(mRegistryLock); - - // Release pluginCreators in LIFO order of registration. - while (!mRegistry.empty()) - { - mRegistry.pop(); - } - mRegistryList.clear(); - } - -private: - PluginCreatorRegistry() {} - - std::mutex mRegistryLock; - std::stack> mRegistry; - std::unordered_set mRegistryList; - -public: - PluginCreatorRegistry(PluginCreatorRegistry const&) = delete; - void operator=(PluginCreatorRegistry const&) = delete; -}; - -template -void initializePlugin(void* logger, const char* libNamespace) -{ - PluginCreatorRegistry::getInstance().addPluginCreator(logger, libNamespace); -} - -} // namespace -} // namespace nvinfer1 - -// New Plugin APIs - -extern "C" -{ - bool initLibNvInferPlugins(void* logger, const char* libNamespace) - { - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); -#if ENABLE_MULTI_DEVICE - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); -#endif // ENABLE_MULTI_DEVICE - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - nvinfer1::initializePlugin(logger, libNamespace); - return true; - } -} // extern "C" diff --git a/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp b/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp new file mode 100644 index 00000000000..aa0fde60211 --- /dev/null +++ b/cpp/tensorrt_llm/plugins/api/tllmPlugin.cpp @@ -0,0 +1,209 @@ +/* + * 
SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "tllmPlugin.h" + +#include "tensorrt_llm/common/stringUtils.h" +#include "tensorrt_llm/runtime/tllmLogger.h" + +#include "tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h" +#include "tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h" +#include "tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h" +#include "tensorrt_llm/plugins/identityPlugin/identityPlugin.h" +#include "tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h" +#include "tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h" +#include "tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h" +#if ENABLE_MULTI_DEVICE +#include "tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h" +#include "tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h" +#include "tensorrt_llm/plugins/ncclPlugin/recvPlugin.h" +#include "tensorrt_llm/plugins/ncclPlugin/sendPlugin.h" +#endif // ENABLE_MULTI_DEVICE +#include "tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h" +#include "tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h" +#include "tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h" +#include "tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h" +#include "tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h" +#include "tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h" +#include "tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h" + +#include +#include + +#include + +namespace tc = tensorrt_llm::common; + +namespace +{ + +nvinfer1::IPluginCreator* creatorPtr(nvinfer1::IPluginCreator& creator) +{ + return &creator; +} + +auto tllmLogger = tensorrt_llm::runtime::TllmLogger(); + +nvinfer1::ILogger* gLogger{&tllmLogger}; + +class GlobalLoggerFinder : public nvinfer1::ILoggerFinder +{ +public: + nvinfer1::ILogger* findLogger() override + { + return gLogger; + } +}; + +GlobalLoggerFinder gGlobalLoggerFinder{}; + +#if !defined(_MSC_VER) +__attribute__((constructor)) +#endif +void initOnLoad() +{ + auto constexpr kLoadPlugins = "TRT_LLM_LOAD_PLUGINS"; + auto const loadPlugins = std::getenv(kLoadPlugins); + if (loadPlugins && loadPlugins[0] == '1') + { + initTrtLlmPlugins(gLogger); + } +} + +bool pluginsInitialized = false; + +} // namespace + +// New Plugin APIs + +extern "C" +{ + bool initTrtLlmPlugins(void* logger, const char* libNamespace) + { + if (pluginsInitialized) + return true; + + if (logger) + { + gLogger = static_cast(logger); + } + setLoggerFinder(&gGlobalLoggerFinder); + + auto registry = getPluginRegistry(); + std::int32_t nbCreators; + auto creators = getPluginCreators(nbCreators); + + for (std::int32_t i = 0; i < nbCreators; ++i) + { + auto const creator = creators[i]; + creator->setPluginNamespace(libNamespace); + 
registry->registerCreator(*creator, libNamespace); + if (gLogger) + { + auto const msg = tc::fmtstr("Registered plugin creator %s version %s in namespace %s", + creator->getPluginName(), creator->getPluginVersion(), libNamespace); + gLogger->log(nvinfer1::ILogger::Severity::kVERBOSE, msg.c_str()); + } + } + + pluginsInitialized = true; + return true; + } + + [[maybe_unused]] void setLoggerFinder([[maybe_unused]] nvinfer1::ILoggerFinder* finder) + { + tensorrt_llm::plugins::api::LoggerFinder::getInstance().setLoggerFinder(finder); + } + + [[maybe_unused]] nvinfer1::IPluginCreator* const* getPluginCreators(std::int32_t& nbCreators) + { + static tensorrt_llm::plugins::IdentityPluginCreator identityPluginCreator; + static tensorrt_llm::plugins::BertAttentionPluginCreator bertAttentionPluginCreator; + static tensorrt_llm::plugins::GPTAttentionPluginCreator gptAttentionPluginCreator; + static tensorrt_llm::plugins::GemmPluginCreator gemmPluginCreator; +#if ENABLE_MULTI_DEVICE + static tensorrt_llm::plugins::SendPluginCreator sendPluginCreator; + static tensorrt_llm::plugins::RecvPluginCreator recvPluginCreator; + static tensorrt_llm::plugins::AllreducePluginCreator allreducePluginCreator; + static tensorrt_llm::plugins::AllgatherPluginCreator allgatherPluginCreator; +#endif // ENABLE_MULTI_DEVICE + static tensorrt_llm::plugins::LayernormPluginCreator layernormPluginCreator; + static tensorrt_llm::plugins::RmsnormPluginCreator rmsnormPluginCreator; + static tensorrt_llm::plugins::SmoothQuantGemmPluginCreator smoothQuantGemmPluginCreator; + static tensorrt_llm::plugins::LayernormQuantizationPluginCreator layernormQuantizationPluginCreator; + static tensorrt_llm::plugins::QuantizePerTokenPluginCreator quantizePerTokenPluginCreator; + static tensorrt_llm::plugins::QuantizeTensorPluginCreator quantizeTensorPluginCreator; + static tensorrt_llm::plugins::RmsnormQuantizationPluginCreator rmsnormQuantizationPluginCreator; + static tensorrt_llm::plugins::WeightOnlyGroupwiseQuantMatmulPluginCreator + weightOnlyGroupwiseQuantMatmulPluginCreator; + static tensorrt_llm::plugins::WeightOnlyQuantMatmulPluginCreator weightOnlyQuantMatmulPluginCreator; + static tensorrt_llm::plugins::LookupPluginCreator lookupPluginCreator; + + static std::array pluginCreators + = { creatorPtr(identityPluginCreator), + creatorPtr(bertAttentionPluginCreator), + creatorPtr(gptAttentionPluginCreator), + creatorPtr(gemmPluginCreator), +#if ENABLE_MULTI_DEVICE + creatorPtr(sendPluginCreator), + creatorPtr(recvPluginCreator), + creatorPtr(allreducePluginCreator), + creatorPtr(allgatherPluginCreator), +#endif // ENABLE_MULTI_DEVICE + creatorPtr(layernormPluginCreator), + creatorPtr(rmsnormPluginCreator), + creatorPtr(smoothQuantGemmPluginCreator), + creatorPtr(layernormQuantizationPluginCreator), + creatorPtr(quantizePerTokenPluginCreator), + creatorPtr(quantizeTensorPluginCreator), + creatorPtr(rmsnormQuantizationPluginCreator), + creatorPtr(weightOnlyGroupwiseQuantMatmulPluginCreator), + creatorPtr(weightOnlyQuantMatmulPluginCreator), + creatorPtr(lookupPluginCreator), + }; + nbCreators = pluginCreators.size(); + return pluginCreators.data(); + } + +} // extern "C" + +namespace tensorrt_llm::plugins::api +{ +LoggerFinder& tensorrt_llm::plugins::api::LoggerFinder::getInstance() noexcept +{ + static LoggerFinder instance; + return instance; +} + +void LoggerFinder::setLoggerFinder(nvinfer1::ILoggerFinder* finder) +{ + std::lock_guard lk(mMutex); + if (mLoggerFinder == nullptr && finder != nullptr) + { + mLoggerFinder = finder; + } +} + 
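// Illustrative usage sketch, not part of the patch: a client links this library and
// registers the TRT-LLM creators once before building or deserializing an engine.
// initTrtLlmPlugins() and getPluginCreators() are declared in tllmPlugin.h (included
// above); the helper name below is invented for the example.
inline bool registerTrtLlmPluginsForExample(nvinfer1::ILogger* logger)
{
    // Registers every creator returned by getPluginCreators() under the default
    // "tensorrt_llm" namespace and installs the logger used for subsequent messages.
    return initTrtLlmPlugins(logger);
}
// When TensorRT dlopen()s this library instead, exporting TRT_LLM_LOAD_PLUGINS=1 makes
// the initOnLoad() hook above perform the same registration with the default logger.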
+nvinfer1::ILogger* LoggerFinder::findLogger() +{ + std::lock_guard lk(mMutex); + if (mLoggerFinder != nullptr) + { + return mLoggerFinder->findLogger(); + } + return nullptr; +} +} // namespace tensorrt_llm::plugins::api diff --git a/cpp/tensorrt_llm/plugins/api/tllmPlugin.h b/cpp/tensorrt_llm/plugins/api/tllmPlugin.h new file mode 100644 index 00000000000..bfc034674c6 --- /dev/null +++ b/cpp/tensorrt_llm/plugins/api/tllmPlugin.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace tensorrt_llm::plugins::api +{ + +auto constexpr kDefaultNamespace = "tensorrt_llm"; + +class LoggerFinder : public nvinfer1::ILoggerFinder +{ +public: + //! Set the logger finder. + void setLoggerFinder(nvinfer1::ILoggerFinder* finder); + + //! Get the logger. + nvinfer1::ILogger* findLogger() override; + + static LoggerFinder& getInstance() noexcept; + +private: + LoggerFinder() = default; + + nvinfer1::ILoggerFinder* mLoggerFinder{nullptr}; + std::mutex mMutex; +}; + +} // namespace tensorrt_llm::plugins::api + +extern "C" +{ + // This function is used for explicitly registering the TRT-LLM plugins and the default logger. + bool initTrtLlmPlugins(void* logger, const char* libNamespace = tensorrt_llm::plugins::api::kDefaultNamespace); + + // The functions below are used by TensorRT to when loading a shared plugin library with automatic registering. + // see https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#generating-plugin-library + TENSORRTAPI [[maybe_unused]] void setLoggerFinder([[maybe_unused]] nvinfer1::ILoggerFinder* finder); + TENSORRTAPI [[maybe_unused]] nvinfer1::IPluginCreator* const* getPluginCreators(int32_t& nbCreators); +} diff --git a/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.cpp b/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.cpp index 830000b0477..1ccdef46e48 100644 --- a/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.cpp @@ -14,22 +14,22 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h" +#include "bertAttentionPlugin.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h" #include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/kernels/unfusedAttentionKernels.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; -using namespace tensorrt_llm::common; -using nvinfer1::plugin::BertAttentionPluginCreator; -using nvinfer1::plugin::BertAttentionPlugin; -using nvinfer1::plugin::nextWorkspacePtr; +namespace tc = tensorrt_llm::common; + +using tensorrt_llm::plugins::BertAttentionPluginCreator; +using tensorrt_llm::plugins::BertAttentionPlugin; static const char* BERT_ATTENTION_PLUGIN_VERSION{"1"}; static const char* BERT_ATTENTION_PLUGIN_NAME{"BertAttention"}; PluginFieldCollection BertAttentionPluginCreator::mFC{}; -std::vector BertAttentionPluginCreator::mPluginAttributes; +std::vector BertAttentionPluginCreator::mPluginAttributes; BertAttentionPlugin::BertAttentionPlugin(int num_heads, int head_size, float q_scaling, bool qk_half_accum, ContextFMHAType context_fmha_type, nvinfer1::DataType type) @@ -56,7 +56,7 @@ BertAttentionPlugin::BertAttentionPlugin(const void* data, size_t length) read(d, mEnableContextFMHA); read(d, mFMHAForceFP32Acc); read(d, mType); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -71,7 +71,7 @@ nvinfer1::IPluginV2DynamicExt* BertAttentionPlugin::clone() const noexcept nvinfer1::DimsExprs BertAttentionPlugin::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept { - PLUGIN_ASSERT(outputIndex == 0); + TLLM_CHECK(outputIndex == 0); auto ret = inputs[0]; ret.d[2] = exprBuilder.constant(ret.d[2]->getConstantValue() / 3); return ret; @@ -139,7 +139,7 @@ size_t BertAttentionPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc* i workspaces[8] = qk_buf_float_size; workspaces[9] = padding_offset_size; - return plugin::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); + return tensorrt_llm::plugins::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); } template @@ -168,7 +168,7 @@ int BertAttentionPlugin::enqueueImpl(const nvinfer1::PluginTensorDesc* inputDesc T* context_buf_ = (T*) (outputs[0]); auto cublasHandle = mCublasWrapper->getCublasHandle(); - PLUGIN_CUBLASASSERT(cublasSetStream(cublasHandle, stream)); + TLLM_CUDA_CHECK(cublasSetStream(cublasHandle, stream)); mCublasWrapper->setStream(stream); mCublasWrapper->setWorkspace(workspace); if (inputDesc[0].type == DataType::kHALF) @@ -225,9 +225,9 @@ int BertAttentionPlugin::enqueueImpl(const nvinfer1::PluginTensorDesc* inputDesc // Padding offset = nullptr here (remove padding is not supported). 
invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, v_buf_2_, const_cast(attention_input), input_lengths, nullptr, request_batch_size, request_seq_len, batch_size * input_seq_len, mNumHeads, mNumHeads, mHeadSize, - mEnableContextFMHA, 0, PositionEmbeddingType::kLEARNED_ABSOLUTE, (float*) nullptr, 0, stream); + mEnableContextFMHA, 0, 0.0f, 0.0f, PositionEmbeddingType::kLEARNED_ABSOLUTE, (float*) nullptr, 0, stream); - const cudaDataType_t gemm_data_type = CudaDataType::value; + const auto gemm_data_type = tc::CudaDataType::value; const int attention_seq_len_1 = request_seq_len; // q length const int attention_seq_len_2 = request_seq_len; // kv length const T qk_scale = static_cast(1.0f / (sqrtf(mHeadSize * 1.0f) * q_scaling)); @@ -338,7 +338,7 @@ int BertAttentionPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, nvinfer1::DataType BertAttentionPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(index == 0); return inputTypes[0]; } @@ -363,10 +363,10 @@ int BertAttentionPlugin::initialize() noexcept { auto cublasHandle = getCublasHandle(); auto cublasLtHandle = getCublasLtHandle(); - mCublasAlgoMap = new cublasAlgoMap(GEMM_CONFIG); + mCublasAlgoMap = new tc::cublasAlgoMap(GEMM_CONFIG); mCublasWrapperMutex = new std::mutex(); mCublasWrapper - = new cublasMMWrapper(cublasHandle, cublasLtHandle, nullptr, mCublasAlgoMap, mCublasWrapperMutex, nullptr); + = new tc::cublasMMWrapper(cublasHandle, cublasLtHandle, nullptr, mCublasAlgoMap, mCublasWrapperMutex, nullptr); if (mEnableContextFMHA) { mFMHARunner = new FusedMHARunnerV2(DATA_TYPE_FP16, mNumHeads, mHeadSize, mQScaling); @@ -415,16 +415,6 @@ void BertAttentionPlugin::serialize(void* buffer) const noexcept void BertAttentionPlugin::terminate() noexcept {} -void BertAttentionPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* BertAttentionPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// BertAttentionPluginCreator::BertAttentionPluginCreator() @@ -470,32 +460,32 @@ IPluginV2* BertAttentionPluginCreator::createPlugin(const char* name, const Plug const char* attrName = fields[i].name; if (!strcmp(attrName, "num_heads")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); num_heads = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "head_size")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); head_size = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "q_scaling")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kFLOAT32); q_scaling = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "enable_qk_half_accum")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT8); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT8); qk_half_accum = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "context_fmha_type")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT8); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT8); context_fmha_type = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); 
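            // Illustrative field set for this creator (values invented for the example): a caller
            // building the network would pass a PluginFieldCollection along the lines of
            //   num_heads            = 16    (PluginFieldType::kINT32)
            //   head_size            = 64    (PluginFieldType::kINT32)
            //   q_scaling            = 1.0f  (PluginFieldType::kFLOAT32)
            //   enable_qk_half_accum = 0     (PluginFieldType::kINT8)
            //   context_fmha_type    = 0     (PluginFieldType::kINT8)
            //   type_id              = 1     (PluginFieldType::kINT32, the integer value of the
            //                                 nvinfer1::DataType to run in, e.g. kHALF)
            // and createPlugin() maps each field onto the constructor arguments parsed here.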
type = static_cast(*(static_cast(fields[i].data))); } } @@ -529,13 +519,3 @@ IPluginV2* BertAttentionPluginCreator::deserializePlugin( } return nullptr; } - -void BertAttentionPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* BertAttentionPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h b/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h index 4b68f93e3b7..d8c04d9221c 100644 --- a/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h +++ b/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef TRT_BERT_ATTENTION_PLUGIN_H -#define TRT_BERT_ATTENTION_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h" @@ -27,12 +26,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class BertAttentionPlugin : public IPluginV2DynamicExt +class BertAttentionPlugin : public BasePlugin { public: BertAttentionPlugin() = delete; @@ -74,12 +71,9 @@ class BertAttentionPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; int mNumHeads; int mHeadSize; @@ -101,7 +95,7 @@ class BertAttentionPlugin : public IPluginV2DynamicExt tensorrt_llm::common::cublasMMWrapper* mCublasWrapper; }; -class BertAttentionPluginCreator : public IPluginCreator +class BertAttentionPluginCreator : public BaseCreator { public: BertAttentionPluginCreator(); @@ -117,17 +111,9 @@ class BertAttentionPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_BERT_ATTENTION_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.cpp b/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.cpp index be087319cb5..029c638feb8 100644 --- a/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.cpp @@ -15,135 +15,21 @@ * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/common/checkMacrosPlugin.h" -#include -#include -#include +#include "checkMacrosPlugin.h" -namespace nvinfer1 -{ -namespace plugin -{ - -namespace -{ +#include "tensorrt_llm/common/logger.h" -// This will be populated by the logger supplied by the user to initLibNvInferPlugins() -ILogger* gLogger{}; - -template -int LogStream::Buf::sync() +namespace tensorrt_llm::plugins { - std::string s = str(); - while (!s.empty() && s.back() == '\n') - { - s.pop_back(); - } - if (gLogger != nullptr) - { - gLogger->log(kSeverity, s.c_str()); - } - str(""); - return 0; -} - -// These use gLogger, and therefore require initLibNvInferPlugins() to be called with a logger -// (otherwise, it will not log) -LogStream gLogError; -LogStream gLogWarning; -LogStream gLogInfo; -LogStream gLogVerbose; -} // namespace - -// break-pointable -void throwCudaError(const char* file, const char* function, int line, int status, const char* msg) -{ - CudaError error(file, function, line, status, msg); - error.log(gLogError); - throw error; -} - -// break-pointable -void throwCublasError(const char* file, const char* function, int line, int status, const char* msg) -{ - if (msg == nullptr) - { - auto s_ = static_cast(status); - switch (s_) - { - case CUBLAS_STATUS_SUCCESS: msg = "CUBLAS_STATUS_SUCCESS"; break; - case CUBLAS_STATUS_NOT_INITIALIZED: msg = "CUBLAS_STATUS_NOT_INITIALIZED"; break; - case CUBLAS_STATUS_ALLOC_FAILED: msg = "CUBLAS_STATUS_ALLOC_FAILED"; break; - case CUBLAS_STATUS_INVALID_VALUE: msg = "CUBLAS_STATUS_INVALID_VALUE"; break; - case CUBLAS_STATUS_ARCH_MISMATCH: msg = "CUBLAS_STATUS_ARCH_MISMATCH"; break; - case CUBLAS_STATUS_MAPPING_ERROR: msg = "CUBLAS_STATUS_MAPPING_ERROR"; break; - case CUBLAS_STATUS_EXECUTION_FAILED: msg = "CUBLAS_STATUS_EXECUTION_FAILED"; break; - case CUBLAS_STATUS_INTERNAL_ERROR: msg = "CUBLAS_STATUS_INTERNAL_ERROR"; break; - case CUBLAS_STATUS_NOT_SUPPORTED: msg = "CUBLAS_STATUS_NOT_SUPPORTED"; break; - case CUBLAS_STATUS_LICENSE_ERROR: msg = "CUBLAS_STATUS_LICENSE_ERROR"; break; - } - } - CublasError error(file, function, line, status, msg); - error.log(gLogError); - throw error; -} - -// break-pointable -void throwCudnnError(const char* file, const char* function, int line, int status, const char* msg) -{ - CudnnError error(file, function, line, status, msg); - error.log(gLogError); - throw error; -} - -// break-pointable -void throwPluginError(char const* file, char const* function, int line, int status, char const* msg) -{ - PluginError error(file, function, line, status, msg); - reportValidationFailure(msg, file, line); - throw error; -} void caughtError(const std::exception& e) { - gLogError << e.what() << std::endl; + TLLM_LOG_EXCEPTION(e); } void logError(const char* msg, const char* file, const char* fn, int line) { - gLogError << "Parameter check failed at: " << file << "::" << fn << "::" << line; - gLogError << ", condition: " << msg << std::endl; -} - -void reportValidationFailure(char const* msg, char const* file, int line) -{ - std::ostringstream stream; - stream << "Validation failed: " << msg << std::endl << file << ':' << line << std::endl; - getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); + TLLM_LOG_ERROR("Parameter check failed at: %s::%s::%d, condition: %s", file, fn, line, msg); } -// break-pointable -void reportAssertion(const char* msg, const char* file, int line) -{ - std::ostringstream stream; - stream << "Assertion failed: " << msg << std::endl - << file << ':' << line << std::endl 
- << "Aborting..." << std::endl; - getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); - PLUGIN_CUASSERT(cudaDeviceReset()); - abort(); -} - -void TRTException::log(std::ostream& logStream) const -{ - logStream << file << " (" << line << ") - " << name << " Error in " << function << ": " << status; - if (message != nullptr) - { - logStream << " (" << message << ")"; - } - logStream << std::endl; -} - -} // namespace plugin - -} // namespace nvinfer1 +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.h b/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.h index 1117389ce91..2280306f7b1 100644 --- a/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.h +++ b/cpp/tensorrt_llm/plugins/common/checkMacrosPlugin.h @@ -14,282 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef CHECK_MACROS_PLUGIN_H -#define CHECK_MACROS_PLUGIN_H +#pragma once -#include "NvInfer.h" -#include -#include +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" -#ifdef _MSC_VER -#define FN_NAME __FUNCTION__ -#else -#define FN_NAME __func__ -#endif - -namespace nvinfer1 -{ -namespace plugin -{ - -namespace -{ -template -class LogStream : public std::ostream -{ - class Buf : public std::stringbuf - { - public: - int sync() override; - }; - - Buf buffer; - std::mutex mLogStreamMutex; - -public: - std::mutex& getMutex() - { - return mLogStreamMutex; - } - - LogStream() - : std::ostream(&buffer){}; -}; - -// Use mutex to protect multi-stream write to buffer -template -LogStream& operator<<(LogStream& stream, T const& msg) -{ - std::lock_guard guard(stream.getMutex()); - auto& os = static_cast(stream); - os << msg; - return stream; -} - -// Special handling static numbers -template -inline LogStream& operator<<(LogStream& stream, int32_t num) +namespace tensorrt_llm::plugins { - std::lock_guard guard(stream.getMutex()); - auto& os = static_cast(stream); - os << num; - return stream; -} -// Special handling std::endl -template -inline LogStream& operator<<(LogStream& stream, std::ostream& (*f)(std::ostream&) ) -{ - std::lock_guard guard(stream.getMutex()); - auto& os = static_cast(stream); - os << f; - return stream; -} - -} // namespace - -void reportValidationFailure(char const* msg, char const* file, int line); -[[noreturn]] void reportAssertion(const char* msg, const char* file, int line); void logError(const char* msg, const char* file, const char* fn, int line); -[[noreturn]] void throwCudaError( - const char* file, const char* function, int line, int status, const char* msg = nullptr); -[[noreturn]] void throwCudnnError( - const char* file, const char* function, int line, int status, const char* msg = nullptr); -[[noreturn]] void throwCublasError( - const char* file, const char* function, int line, int status, const char* msg = nullptr); -[[noreturn]] void throwPluginError( - char const* file, char const* function, int line, int status, char const* msg = nullptr); - void caughtError(const std::exception& e); -class TRTException : public std::exception -{ -public: - TRTException(const char* fl, const char* fn, int ln, int st, const char* msg, const char* nm) - : file(fl) - , function(fn) - , line(ln) - , status(st) - , message(msg) - , name(nm) - { - } - - virtual void log(std::ostream& logStream) const; - - void setMessage(const char* msg) - { - message = msg; - } - -protected: - const char* file{nullptr}; - const char* function{nullptr}; 
- int line{0}; - int status{0}; - const char* message{nullptr}; - const char* name{nullptr}; -}; - -class CudaError : public TRTException -{ -public: - CudaError(const char* fl, const char* fn, int ln, int stat, const char* msg = nullptr) - : TRTException(fl, fn, ln, stat, msg, "Cuda") - { - } -}; - -class CudnnError : public TRTException -{ -public: - CudnnError(const char* fl, const char* fn, int ln, int stat, const char* msg = nullptr) - : TRTException(fl, fn, ln, stat, msg, "Cudnn") - { - } -}; - -class CublasError : public TRTException -{ -public: - CublasError(const char* fl, const char* fn, int ln, int stat, const char* msg = nullptr) - : TRTException(fl, fn, ln, stat, msg, "cuBLAS") - { - } -}; - -class PluginError : public TRTException -{ -public: - PluginError(char const* fl, char const* fn, int ln, int stat, char const* msg = nullptr) - : TRTException(fl, fn, ln, stat, msg, "Plugin") - { - } -}; -} // namespace plugin - -} // namespace nvinfer1 - -#define PLUGIN_API_CHECK(condition) \ - { \ - if ((condition) == false) \ - { \ - nvinfer1::plugin::logError(#condition, __FILE__, FN_NAME, __LINE__); \ - return; \ - } \ - } - -#define PLUGIN_API_CHECK_RETVAL(condition, retval) \ - { \ - if ((condition) == false) \ - { \ - nvinfer1::plugin::logError(#condition, __FILE__, FN_NAME, __LINE__); \ - return retval; \ - } \ - } - -#define PLUGIN_API_CHECK_ENUM_RANGE(Type, val) PLUGIN_API_CHECK(int(val) >= 0 && int(val) < EnumMax()) -#define PLUGIN_API_CHECK_ENUM_RANGE_RETVAL(Type, val, retval) \ - PLUGIN_API_CHECK_RETVAL(int(val) >= 0 && int(val) < EnumMax(), retval) - -#define PLUGIN_CHECK_CUDA(call) \ - do \ - { \ - cudaError_t status = call; \ - if (status != cudaSuccess) \ - { \ - return status; \ - } \ - } while (0) - -#define PLUGIN_CHECK_CUDNN(call) \ - do \ - { \ - cudnnStatus_t status = call; \ - if (status != CUDNN_STATUS_SUCCESS) \ - { \ - return status; \ - } \ - } while (0) - -#define PLUGIN_CUBLASASSERT(status_) \ - { \ - auto s_ = status_; \ - if (s_ != CUBLAS_STATUS_SUCCESS) \ - { \ - nvinfer1::plugin::throwCublasError(__FILE__, FN_NAME, __LINE__, s_); \ - } \ - } - -#define PLUGIN_CUDNNASSERT(status_) \ - { \ - auto s_ = status_; \ - if (s_ != CUDNN_STATUS_SUCCESS) \ - { \ - const char* msg = cudnnGetErrorString(s_); \ - nvinfer1::plugin::throwCudnnError(__FILE__, FN_NAME, __LINE__, s_, msg); \ - } \ - } - -#define PLUGIN_CUASSERT(status_) \ - { \ - auto s_ = status_; \ - if (s_ != cudaSuccess) \ - { \ - const char* msg = cudaGetErrorString(s_); \ - nvinfer1::plugin::throwCudaError(__FILE__, FN_NAME, __LINE__, s_, msg); \ - } \ - } - -#define GET_MACRO(_1, _2, NAME, ...) NAME -#define PLUGIN_VALIDATE(...) GET_MACRO(__VA_ARGS__, PLUGIN_VALIDATE_MSG, PLUGIN_VALIDATE_DEFAULT, )(__VA_ARGS__) - -// Logs failed condition and throws a PluginError. -// PLUGIN_ASSERT will eventually perform this function, at which point PLUGIN_VALIDATE -// will be removed. -#define PLUGIN_VALIDATE_DEFAULT(condition) \ - { \ - if (!(condition)) \ - { \ - nvinfer1::plugin::throwPluginError(__FILE__, FN_NAME, __LINE__, 0, #condition); \ - } \ - } - -#define PLUGIN_VALIDATE_MSG(condition, msg) \ - { \ - if (!(condition)) \ - { \ - nvinfer1::plugin::throwPluginError(__FILE__, FN_NAME, __LINE__, 0, msg); \ - } \ - } - -// Logs failed assertion and aborts. -// Aborting is undesirable and will be phased-out from the plugin module, at which point -// PLUGIN_ASSERT will perform the same function as PLUGIN_VALIDATE. 
-#define PLUGIN_ASSERT(assertion) \ - { \ - if (!(assertion)) \ - { \ - nvinfer1::plugin::reportAssertion(#assertion, __FILE__, __LINE__); \ - } \ - } - -#define PLUGIN_FAIL(msg) \ - { \ - nvinfer1::plugin::reportAssertion(msg, __FILE__, __LINE__); \ - } - -#define PLUGIN_ERROR(msg) \ - { \ - nvinfer1::plugin::throwPluginError(__FILE__, FN_NAME, __LINE__, 0, msg); \ - } - -#define PLUGIN_CUERROR(status_) \ - { \ - auto s_ = status_; \ - if (s_ != 0) \ - nvinfer1::plugin::logError(#status_ " failure.", __FILE__, FN_NAME, __LINE__); \ - } - -#endif /*CHECK_MACROS_PLUGIN_H*/ +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.h b/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.h new file mode 100644 index 00000000000..2a975211004 --- /dev/null +++ b/cpp/tensorrt_llm/plugins/common/gemmPluginProfiler.h @@ -0,0 +1,470 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/plugins/common/plugin.h" + +namespace tensorrt_llm::plugins +{ + +struct GemmDims +{ + int32_t minM; + int32_t maxM; + int32_t n; + int32_t k; + + GemmDims() + : minM(-1) + , maxM(-1) + , n(-1) + , k(-1) + { + } + + GemmDims(int32_t minM_, int32_t maxM_, int32_t n_, int32_t k_) + : minM(minM_) + , maxM(maxM_) + , n(n_) + , k(k_) + { + } + + bool isInitialized() const + { + return minM >= 0 && maxM >= 0 && n >= 0 && k >= 0; + } +}; + +// Unique ID of GEMM +// In our case GEMM is uniqly identified by N and K +class GemmIdCore +{ +public: + int n; + int k; + nvinfer1::DataType dtype; + + GemmIdCore(int n_, int k_, const nvinfer1::DataType& dtype_) + : n(n_) + , k(k_) + , dtype(dtype_) + { + } + + GemmIdCore() + : n(-1) + , k(-1) + , dtype(nvinfer1::DataType::kFLOAT) // dtype does not matter here + { + } + + bool operator==(const GemmIdCore& id) const + { + return n == id.n && k == id.k && dtype == id.dtype; + } + + friend std::ostream& operator<<(std::ostream& out, const GemmIdCore& id) + { + out << "(N;K)=(" << id.n << ";" << id.k << "),"; + out << " type=" << static_cast(id.dtype); + return out; + } +}; + +// Hash of GemmId +struct GemmIdCoreHash +{ + std::size_t operator()(const GemmIdCore& id) const + { + auto h1 = std::hash{}(id.n); + auto h2 = std::hash{}(id.k); + auto h3 = std::hash{}(static_cast(id.dtype)); + return h1 ^ h2 ^ h3; + } +}; + +template +class GemmPluginProfiler +{ +public: + static constexpr int MAX_PROFILE_M = 8192; + + // Map for single GEMM for different Ms (GEMM dimension) to the best config for particular M + using MProfileMap = std::unordered_map>; + using MProfileMapPtr = std::shared_ptr; + + // requires exclusive ownership to write to *this + using reader_lock = std::unique_lock; 
+ // requires shared ownership to read from other + using writer_lock = std::shared_lock; + + // Struct of contining map if GEMMs to the best profiles for different Ms + struct MNKProfileMap + { + // Mutex guarding map + std::shared_timed_mutex mutex; + // Map from GEMM Id to profile for particular GEMM + std::unordered_map profileMap; + + bool existsMProfileMap(const GemmIdType& id) + { + const auto iter = profileMap.find(id); + return iter != profileMap.end(); + } + + void createMProfileMap(const GemmIdType& id) + { + profileMap[id] = std::make_shared(); + } + + MProfileMapPtr getMProfileMap(const GemmIdType& id) + { + const auto iter = profileMap.find(id); + if (iter == profileMap.end()) + { + std::ostringstream msg; + msg << "Cannot find ID (" << id << ") in the profile map. Abort."; + TLLM_LOG_ERROR(msg.str()); + } + return iter->second; + } + }; + + using MNKProfileMapPtr = std::shared_ptr; + + GemmPluginProfiler() + { + mMNKProfileMap = std::make_shared(); + + // set SKIP_GEMM_PLUGIN_PROFILINGS=1 to avoid tactics profilings + const auto skip = std::getenv("SKIP_GEMM_PLUGIN_PROFILINGS"); + mSkip = (skip != NULL && std::stoi(skip)); + if (mSkip) + { + TLLM_LOG_DEBUG( + "SKIP_GEMM_PLUGIN_PROFILINGS is set. Skipping GEMM plugin profilings. It could result in runtime error " + "if default tactic is not defined."); + } + } + + void serialize(char* buffer, const GemmIdType& gemmId) const + { + auto mProfileMap = mMNKProfileMap->getMProfileMap(gemmId); + + // Save number of profiles for given GEMM ID + write(buffer, static_cast(mProfileMap->size())); + for (const auto& pair : *mProfileMap) + { + // Save pair of M to the best GEMM config + write(buffer, pair); + } + } + + void deserialize(const char*& data, GemmDims& dims, const GemmIdType& gemmId) + { + // NOTE(nkorobov): this mutex is not needed since each thread owns its own map, but will put here for + // consistency + writer_lock lock(mMNKProfileMap->mutex); + + mDims = dims; + + // GemmId gemmId(dims.n, dims.k); + if (!mMNKProfileMap->existsMProfileMap(gemmId)) + { + // Create GEMM with GEMM ID if it does not exist + mMNKProfileMap->createMProfileMap(gemmId); + } + // Populate map with profiles of GEMM ID + auto profileMap = mMNKProfileMap->getMProfileMap(gemmId); + int selectedMapSize; + read(data, selectedMapSize); + for (int ii = 0; ii < selectedMapSize; ++ii) + { + std::pair> config; + read(data, config); + profileMap->insert(config); + } + } + + size_t getSerializationSize(const GemmIdType& gemmId) const + { + reader_lock lock(mMNKProfileMap->mutex); + return sizeof(int) + // size of the tactics map + mMNKProfileMap->getMProfileMap(gemmId)->size() + * sizeof(std::pair>); // size of the tactics map + } + + void profileTactics(const std::vector& tactics, const RunnerPtr& runner, const nvinfer1::DataType& type, + const GemmDims& dims, const GemmIdType& gemmId) + { + writer_lock lock(mMNKProfileMap->mutex); + + if (!dims.isInitialized()) + { + return; + } + + mRunner = runner; + mType = type; + + const int maxM = std::min(nextPowerOfTwo(dims.maxM), MAX_PROFILE_M); + computeTmpSize(maxM, dims.n, dims.k); + + if (!mMNKProfileMap->existsMProfileMap(gemmId)) + { + // Create map for GEMM ID + mMNKProfileMap->createMProfileMap(gemmId); + } + + if (mSkip) + { + return; + } + + auto mProfileMap = mMNKProfileMap->getMProfileMap(gemmId); + + auto profileTactics = [&tactics, &mProfileMap, this](int m, int n, int k) + { + if (mProfileMap->count(m) == 0) + { + // Profile different tactics for particular m and insert best config to the map + 
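            // Worked example of the M bucketing (numbers invented): with dims.minM = 1 and
            // dims.maxM = 300, maxM above becomes std::min(nextPowerOfTwo(300), MAX_PROFILE_M) = 512,
            // so the loop below profiles m = 1, 2, 4, ..., 256 and finally maxM = 512 itself.
            // getBestConfig() later rounds the runtime m the same way, e.g. m = 300 reads the entry
            // stored for the 512 bucket, so every M the plugin can see maps to a profiled config.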
mProfileMap->insert({m, this->profileTacticsForProblem(m, n, k, tactics)}); + } + }; + + // Allocate tmp data to run GEMMs + allocateTmpData(); + const int startMinMRounded = nextPowerOfTwo(dims.minM); + for (int m = startMinMRounded; m < maxM; m *= 2) + { + profileTactics(m, dims.n, dims.k); + } + + profileTactics(maxM, dims.n, dims.k); + // Free tmp data + freeTmpData(); + } + + void setSelectionTactics(const MNKProfileMapPtr& map) + { + mMNKProfileMap = map; + } + + void setTmpWorkspaceSizeInBytes(size_t bytes) + { + mTmpWorkspaceSizeInBytes = bytes; + } + + std::optional getBestConfig(int m, const GemmIdType& gemmId) const + { + reader_lock lock(mMNKProfileMap->mutex); + + if (mSkip) + { + return std::nullopt; + } + + const int mRounded = std::min(nextPowerOfTwo(m), MAX_PROFILE_M); + return mMNKProfileMap->getMProfileMap(gemmId)->at(mRounded); + } + +protected: + virtual void runTactic(int m, int n, int k, const Config& tactic, char* workspace, const cudaStream_t& stream) = 0; + + virtual void computeTmpSize(int maxM, int n, int k) = 0; + + virtual bool checkTactic(int m, int n, int k, const Config& tactic) const + { + return true; + } + +private: + void allocateTmpData() + { + TLLM_CHECK_WITH_INFO(mTmpWorkspaceSizeInBytes > 0, "tmpWorkspaceSizeInBytes must be larger than 0"); + const auto status = cudaMalloc(&mWorkspaceTmp, mTmpWorkspaceSizeInBytes); + TLLM_CHECK_WITH_INFO(status == cudaSuccess, "Can't allocate tmp workspace for GEMM tactics profiling."); + } + + void freeTmpData() + { + const auto status = cudaFree(mWorkspaceTmp); + TLLM_CHECK_WITH_INFO(status == cudaSuccess, "Can't free tmp workspace for GEMM tactics profiling."); + } + + std::optional profileTacticsForProblem(int m, int n, int k, const std::vector& tactics) + { + TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); + + float bestTime = std::numeric_limits::max(); + Config bestConfig; + bool foundOne = false; + + // Iterate over all tactics for given M, N and K + for (int ii = 0; ii < tactics.size(); ++ii) + { + const Config& candidateConfig = tactics[ii]; + float time = std::numeric_limits::max(); + try + { + if (!checkTactic(m, n, k, candidateConfig)) + { + continue; + } + // Profile particualar tactic for given M, N and K + time = profileTacticForProblem(m, n, k, candidateConfig); + foundOne = true; + } + catch (const std::exception& e) + { + std::ostringstream msg; + msg << "Cannot profile configuration " << ii << " (for" + << " m=" << m << ", n=" << n << ", k=" << k << "). Skipped"; + TLLM_LOG_WARNING(msg.str()); + continue; + } + + // Choose the fastest tactic + if (time < bestTime) + { + bestConfig = candidateConfig; + bestTime = time; + } + } + + if (!foundOne) + { + std::ostringstream msg; + msg << "Have not found any valid GEMM config for shape (" + << "m=" << m << ", n=" << n << ", k=" << k << "). 
Will try to use default or fail at runtime"; + TLLM_LOG_WARNING(msg.str()); + return std::nullopt; + } + return {bestConfig}; + } + + float profileTacticForProblem(int m, int n, int k, const Config& tactic) + { + constexpr int warmup = 5; + constexpr int runs = 10; + + cudaStream_t stream = cudaStreamDefault; + // Warmup the execution + for (int i = 0; i < warmup; ++i) + { + runTactic(m, n, k, tactic, mWorkspaceTmp, stream); + } + + cudaEvent_t start; + cudaEvent_t stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaDeviceSynchronize(); + cudaEventRecord(start, 0); + + // Profile GEMM + for (int i = 0; i < runs; ++i) + { + runTactic(m, n, k, tactic, mWorkspaceTmp, stream); + } + + cudaEventRecord(stop, 0); + + cudaEventSynchronize(stop); + + float elapsed; + cudaEventElapsedTime(&elapsed, start, stop); + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return elapsed / runs; + } + + int nextPowerOfTwo(int v) const + { + --v; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return ++v; + } + +protected: + RunnerPtr mRunner{nullptr}; + + nvinfer1::DataType mType{}; + +private: + MNKProfileMapPtr mMNKProfileMap{}; + + size_t mTmpWorkspaceSizeInBytes{0}; + + char* mWorkspaceTmp{nullptr}; + + GemmDims mDims{}; + + bool mSkip{false}; +}; + +template +class GemmPluginProfilerManager +{ +public: + using MNKProfileMap = typename GemmPluginProfilerType::MNKProfileMap; + using MNKProfileMapPtr = typename GemmPluginProfilerType::MNKProfileMapPtr; + using GemmPluginProfilerPtr = std::shared_ptr; + + GemmPluginProfilerManager() + { + mMNKProfileMap = std::make_shared(); + } + + GemmPluginProfilerPtr createGemmPluginProfiler(bool inference) + { + auto profiler = std::make_shared(); + // If the profiler is created during the engine build, + // mMNKProfileMap is shared between different profilers to minimize the time spent on the profiling + // and do not repeat profiling for the GEMMs of the same shape. 
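        // Usage sketch (member name invented for the example): a GEMM plugin creator would
        // typically own a single
        //   GemmPluginProfilerManager<CublasLtGemmPluginProfiler> profilerManager;
        // and call profilerManager.createGemmPluginProfiler(/*inference=*/false) at engine build
        // time, so all plugin instances share this mMNKProfileMap and each (N, K, dtype) GemmId
        // is profiled only once. At deserialization it would call
        // createGemmPluginProfiler(/*inference=*/true) and the profiler is instead filled from
        // the serialized tactics through deserialize().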
+ if (!inference) + { + profiler->setSelectionTactics(mMNKProfileMap); + } + return profiler; + } + +private: + MNKProfileMapPtr mMNKProfileMap{}; +}; + +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/common/plugin.cpp b/cpp/tensorrt_llm/plugins/common/plugin.cpp index a2a9c325332..64b5ef85a62 100644 --- a/cpp/tensorrt_llm/plugins/common/plugin.cpp +++ b/cpp/tensorrt_llm/plugins/common/plugin.cpp @@ -24,7 +24,11 @@ #include #include -#define CUDA_MEM_ALIGN 128 +#ifdef _MSC_VER +#define FN_NAME __FUNCTION__ +#else +#define FN_NAME __func__ +#endif #if ENABLE_MULTI_DEVICE std::unordered_map* getDtypeMap() @@ -51,10 +55,10 @@ inline CUcontext getCurrentCudaCtx() CUresult err = cuCtxGetCurrent(&ctx); if (err == CUDA_ERROR_NOT_INITIALIZED || ctx == nullptr) { - PLUGIN_CUASSERT(cudaFree(nullptr)); + TLLM_CUDA_CHECK(cudaFree(nullptr)); err = cuCtxGetCurrent(&ctx); } - PLUGIN_ASSERT(err == CUDA_SUCCESS); + TLLM_CHECK(err == CUDA_SUCCESS); return ctx; } @@ -131,12 +135,12 @@ std::shared_ptr getCublasHandle() []() -> auto { auto handle = std::unique_ptr(new cublasHandle_t); - PLUGIN_CUBLASASSERT(cublasCreate(handle.get())); + TLLM_CUDA_CHECK(cublasCreate(handle.get())); return handle; }, [](cublasHandle_t* handle) { - PLUGIN_CUBLASASSERT(cublasDestroy(*handle)); + TLLM_CUDA_CHECK(cublasDestroy(*handle)); delete handle; }); return creator(); @@ -148,19 +152,19 @@ std::shared_ptr getCublasLtHandle() []() -> auto { auto handle = std::unique_ptr(new cublasLtHandle_t); - PLUGIN_CUBLASASSERT(cublasLtCreate(handle.get())); + TLLM_CUDA_CHECK(cublasLtCreate(handle.get())); return handle; }, [](cublasLtHandle_t* handle) { - PLUGIN_CUBLASASSERT(cublasLtDestroy(*handle)); + TLLM_CUDA_CHECK(cublasLtDestroy(*handle)); delete handle; }); return creator(); } // ALIGNPTR -int8_t* nvinfer1::plugin::alignPtr(int8_t* ptr, uintptr_t to) +int8_t* tensorrt_llm::plugins::alignPtr(int8_t* ptr, uintptr_t to) { uintptr_t addr = (uintptr_t) ptr; if (addr % to) @@ -171,32 +175,45 @@ int8_t* nvinfer1::plugin::alignPtr(int8_t* ptr, uintptr_t to) } // NEXTWORKSPACEPTR -int8_t* nvinfer1::plugin::nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize) +int8_t* tensorrt_llm::plugins::nextWorkspacePtrCommon( + int8_t* ptr, uintptr_t previousWorkspaceSize, const uintptr_t alignment) { uintptr_t addr = (uintptr_t) ptr; addr += previousWorkspaceSize; - return alignPtr((int8_t*) addr, CUDA_MEM_ALIGN); + return alignPtr((int8_t*) addr, alignment); } -int8_t* nvinfer1::plugin::nextWorkspacePtr(int8_t* const base, uintptr_t& offset, const uintptr_t size) +int8_t* tensorrt_llm::plugins::nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize) +{ + return nextWorkspacePtrCommon(ptr, previousWorkspaceSize, kCudaMemAlign); +} + +int8_t* tensorrt_llm::plugins::nextWorkspacePtr( + int8_t* const base, uintptr_t& offset, const uintptr_t size, const uintptr_t alignment) { uintptr_t curr_offset = offset; - uintptr_t next_offset = curr_offset + ((size + CUDA_MEM_ALIGN - 1) / CUDA_MEM_ALIGN) * CUDA_MEM_ALIGN; + uintptr_t next_offset = curr_offset + ((size + alignment - 1) / alignment) * alignment; int8_t* newptr = size == 0 ? 
nullptr : base + curr_offset; offset = next_offset; return newptr; } +int8_t* tensorrt_llm::plugins::nextWorkspacePtrWithAlignment( + int8_t* ptr, uintptr_t previousWorkspaceSize, const uintptr_t alignment) +{ + return nextWorkspacePtrCommon(ptr, previousWorkspaceSize, alignment); +} + // CALCULATE TOTAL WORKSPACE SIZE -size_t nvinfer1::plugin::calculateTotalWorkspaceSize(size_t* workspaces, int count) +size_t tensorrt_llm::plugins::calculateTotalWorkspaceSize(size_t* workspaces, int count, const uintptr_t alignment) { size_t total = 0; for (int i = 0; i < count; i++) { total += workspaces[i]; - if (workspaces[i] % CUDA_MEM_ALIGN) + if (workspaces[i] % alignment) { - total += CUDA_MEM_ALIGN - (workspaces[i] % CUDA_MEM_ALIGN); + total += alignment - (workspaces[i] % alignment); } } return total; @@ -219,7 +236,7 @@ PluginFieldParser::~PluginFieldParser() { std::stringstream ss; ss << "unused plugin field with name: " << name; - nvinfer1::plugin::logError(ss.str().c_str(), __FILE__, FN_NAME, __LINE__); + tensorrt_llm::plugins::logError(ss.str().c_str(), __FILE__, FN_NAME, __LINE__); } } } @@ -253,7 +270,7 @@ std::optional PluginFieldParser::getScalar(std::string_view const& name) } auto& record = mMap.at(name); auto const& f = mFields[record.index]; - PLUGIN_ASSERT(toFieldType() == f.type && f.length == 1); + TLLM_CHECK(toFieldType() == f.type && f.length == 1); record.retrieved = true; return std::optional{*static_cast(f.data)}; } diff --git a/cpp/tensorrt_llm/plugins/common/plugin.h b/cpp/tensorrt_llm/plugins/common/plugin.h index 54234a736dd..9f070722d8a 100644 --- a/cpp/tensorrt_llm/plugins/common/plugin.h +++ b/cpp/tensorrt_llm/plugins/common/plugin.h @@ -14,10 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_PLUGIN_H -#define TRT_PLUGIN_H -#include "NvInferPlugin.h" + +#pragma once + +#include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/plugins/common/checkMacrosPlugin.h" + +#include + #include #include #include @@ -34,38 +38,27 @@ #include #include -typedef enum -{ - STATUS_SUCCESS = 0, - STATUS_FAILURE = 1, - STATUS_BAD_PARAM = 2, - STATUS_NOT_SUPPORTED = 3, - STATUS_NOT_INITIALIZED = 4 -} pluginStatus_t; - -namespace nvinfer1 -{ - -namespace pluginInternal +namespace tensorrt_llm::plugins { -class BasePlugin : public IPluginV2 +class BasePlugin : public nvinfer1::IPluginV2DynamicExt { -protected: +public: void setPluginNamespace(const char* libNamespace) noexcept override { mNamespace = libNamespace; } - const char* getPluginNamespace() const noexcept override + [[nodiscard]] char const* getPluginNamespace() const noexcept override { return mNamespace.c_str(); } - std::string mNamespace; +protected: + std::string mNamespace{api::kDefaultNamespace}; }; -class BaseCreator : public IPluginCreator +class BaseCreator : public nvinfer1::IPluginCreator { public: void setPluginNamespace(const char* libNamespace) noexcept override @@ -73,20 +66,15 @@ class BaseCreator : public IPluginCreator mNamespace = libNamespace; } - const char* getPluginNamespace() const noexcept override + [[nodiscard]] char const* getPluginNamespace() const noexcept override { return mNamespace.c_str(); } protected: - std::string mNamespace; + std::string mNamespace{api::kDefaultNamespace}; }; -} // namespace pluginInternal - -namespace plugin -{ - // Write values into buffer template void write(char*& buffer, const T& val) @@ -103,59 +91,41 @@ void read(const char*& buffer, T& val) buffer += sizeof(T); } -inline int32_t getTrtSMVersionDec(int32_t smVersion) -{ - // Treat SM89 as SM86 temporarily. - return (smVersion == 89) ? 
86 : smVersion; -} - -inline int32_t getTrtSMVersionDec(int32_t majorVersion, int32_t minorVersion) -{ - return getTrtSMVersionDec(majorVersion * 10 + minorVersion); -} - -inline int32_t elementSize(DataType type) noexcept +inline cudaDataType_t trtToCublasDtype(nvinfer1::DataType type) { switch (type) { - case DataType::kFLOAT: return 4; - case DataType::kHALF: return 2; - case DataType::kINT8: return 1; - case DataType::kINT32: return 4; - case DataType::kBOOL: return 1; - case DataType::kUINT8: return 1; - case DataType::kFP8: return 1; + case nvinfer1::DataType::kFLOAT: return CUDA_R_32F; + case nvinfer1::DataType::kHALF: return CUDA_R_16F; #if defined(NV_TENSORRT_MAJOR) && NV_TENSORRT_MAJOR >= 9 - case DataType::kBF16: return 2; - case DataType::kINT64: return 8; + case nvinfer1::DataType::kBF16: return CUDA_R_16BF; #endif + default: TLLM_THROW("Not supported data type for cuBLAS"); } - PLUGIN_FAIL("unreachable code path"); } +std::uintptr_t constexpr kCudaMemAlign = 128; + int8_t* alignPtr(int8_t* ptr, uintptr_t to); -int8_t* nextWorkspacePtr(int8_t* const base, uintptr_t& offset, const uintptr_t size); +int8_t* nextWorkspacePtrCommon(int8_t* ptr, uintptr_t previousWorkspaceSize, const uintptr_t alignment); + +int8_t* nextWorkspacePtr( + int8_t* const base, uintptr_t& offset, const uintptr_t size, const uintptr_t alignment = kCudaMemAlign); int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize); -size_t calculateTotalWorkspaceSize(size_t* workspaces, int count); +int8_t* nextWorkspacePtrWithAlignment(int8_t* ptr, uintptr_t previousWorkspaceSize, const uintptr_t alignment); -} // namespace plugin -} // namespace nvinfer1 +size_t calculateTotalWorkspaceSize(size_t* workspaces, int count, const uintptr_t alignment = kCudaMemAlign); + +} // namespace tensorrt_llm::plugins inline bool isBuilding() { - std::string const& key = "IS_BUILDING"; - char* val = getenv(key.c_str()); - if (val == nullptr || std::string(val) != "1") - { - return false; - } - else - { - return true; - } + auto constexpr key = "IS_BUILDING"; + auto const val = getenv(key); + return val != nullptr && std::string(val) == "1"; } #define MPICHECK(cmd) \ @@ -309,5 +279,3 @@ class PluginFieldParser std::unordered_map mMap; }; - -#endif // TRT_PLUGIN_H diff --git a/cpp/tensorrt_llm/plugins/exports.map b/cpp/tensorrt_llm/plugins/exports.map index b0b1d3c5018..9d34b296f08 100644 --- a/cpp/tensorrt_llm/plugins/exports.map +++ b/cpp/tensorrt_llm/plugins/exports.map @@ -18,15 +18,16 @@ /* Hides all symbols except those specified in the global section */ { global: - getInferLibVersion; - getPluginRegistry; - initLibNvInferPlugins; + initTrtLlmPlugins; + setLoggerFinder; + getPluginCreators; extern "C++" { nvinfer1::IPluginCreator::*; nvinfer1::IPluginV2Ext::*; nvinfer1::IPluginV2IOExt::*; nvinfer1::PluginRegistrar*; - nvinfer1::plugin::*; + tensorrt_llm::plugins::api::*; + tensorrt_llm::plugins::*; }; local: *; }; diff --git a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp index c2c3d5c2308..1af7ce946f6 100644 --- a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp @@ -14,41 +14,192 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h" +#include "gemmPlugin.h" using namespace nvinfer1; using namespace tensorrt_llm::common; -using nvinfer1::plugin::GemmPluginCreator; -using nvinfer1::plugin::GemmPlugin; +using tensorrt_llm::plugins::GemmPluginCreator; +using tensorrt_llm::plugins::GemmPlugin; +using tensorrt_llm::plugins::CublasLtGemmPluginProfiler; +using tensorrt_llm::plugins::CublasGemmWrapperPtr; +using tensorrt_llm::plugins::read; +using tensorrt_llm::plugins::write; static const char* GEMM_PLUGIN_VERSION{"1"}; static const char* GEMM_PLUGIN_NAME{"Gemm"}; PluginFieldCollection GemmPluginCreator::mFC{}; -std::vector GemmPluginCreator::mPluginAttributes; +std::vector GemmPluginCreator::mPluginAttributes; -GemmPlugin::GemmPlugin(int transA, int transB, nvinfer1::DataType type) +void getProblemParams(cublasOperation_t& transa, cublasOperation_t& transb, int& m, int& n, int& k, int& lda, int& ldb, + int& ldc, bool transA, bool transB, int M, int N, int K) +{ + transa = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + transb = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + m = N; + n = M; + k = K; + lda = transB ? K : N; + ldb = transA ? M : K; + ldc = N; +} + +void runGemm(const int M, const int N, const int K, const bool transA, const bool transB, const nvinfer1::DataType type, + const CublasGemmWrapperPtr& cublasWrapperPtr, const void* act, const void* weight, void* output, + const std::optional& heuristic, void* workspace, cudaStream_t stream) +{ + auto cublasHandle = cublasWrapperPtr->getCublasHandle(); + TLLM_CUDA_CHECK(cublasSetStream(cublasHandle, stream)); + cublasWrapperPtr->setStream(stream); + cublasWrapperPtr->setWorkspace(workspace); + cublasOperation_t transa, transb; + int m, n, k; + int lda, ldb, ldc; + getProblemParams(transa, transb, m, n, k, lda, ldb, ldc, transA, transB, M, N, K); + cublasWrapperPtr->Gemm(transa, transb, m, n, k, weight, lda, act, ldb, output, ldc, heuristic); +} + +void CublasLtGemmPluginProfiler::runTactic( + int m, int n, int k, const CublasLtGemmPluginProfiler::Config& tactic, char* workspace, const cudaStream_t& stream) +{ + size_t dataSize = sizeof(half); + if (mType == DataType::kFLOAT) + { + dataSize = sizeof(float); + } + + void* actPtr = reinterpret_cast(workspace); + void* weightPtr = reinterpret_cast( + nextWorkspacePtrWithAlignment(reinterpret_cast(actPtr), m * k * dataSize, ALIGNMENT)); + void* outputPtr = reinterpret_cast( + nextWorkspacePtrWithAlignment(reinterpret_cast(weightPtr), n * k * dataSize, ALIGNMENT)); + char* workspacePtr = reinterpret_cast( + nextWorkspacePtrWithAlignment(reinterpret_cast(outputPtr), m * n * dataSize, ALIGNMENT)); + runGemm(m, n, k, mTransA, mTransB, mType, mRunner, actPtr, weightPtr, outputPtr, {tactic}, workspacePtr, stream); +} + +bool CublasLtGemmPluginProfiler::checkTactic(int m, int n, int k, const Config& tactic) const +{ + cublasOperation_t transa, transb; + int M, N, K; + int lda, ldb, ldc; + getProblemParams(transa, transb, m, n, k, lda, ldb, ldc, mTransA, mTransB, n, m, k); + + return mRunner->checkTactic(transa, transb, m, n, k, lda, ldb, ldc, tactic); +} + +void CublasLtGemmPluginProfiler::computeTmpSize(int maxM, int n, int k) +{ + size_t dataSize = sizeof(half); + if (mType == DataType::kFLOAT) + { + dataSize = sizeof(float); + } + + std::vector workspaces = { + maxM * k * dataSize, // A + n * k * dataSize, // B + maxM * n * dataSize, // C + CUBLAS_WORKSPACE_SIZE // workspace + }; + size_t bytes = calculateTotalWorkspaceSize(workspaces.data(), workspaces.size(), ALIGNMENT); + 
setTmpWorkspaceSizeInBytes(bytes); +} + +GemmPlugin::GemmPlugin( + int transA, int transB, nvinfer1::DataType type, bool useFp8, const GemmPlugin::PluginProfilerPtr& pluginProfiler) : mTransA(transA) , mTransB(transB) , mType(type) + , mUseFp8(useFp8) + , mPluginProfiler(pluginProfiler) { + init(); } // Parameterized constructor -GemmPlugin::GemmPlugin(const void* data, size_t length) +GemmPlugin::GemmPlugin(const void* data, size_t length, const GemmPlugin::PluginProfilerPtr& pluginProfiler) + : mPluginProfiler(pluginProfiler) { const char *d = reinterpret_cast(data), *a = d; read(d, mTransA); read(d, mTransB); read(d, mType); - PLUGIN_ASSERT(d == a + length); + read(d, mUseFp8); + read(d, mDims); + + init(); + + mPluginProfiler->deserialize(d, mDims, mGemmId); + + TLLM_CHECK(d == a + length); +} + +void GemmPlugin::init() +{ + auto cublasHandle = getCublasHandle(); + auto cublasLtHandle = getCublasLtHandle(); + mCublasAlgoMap = std::make_shared(GEMM_CONFIG); + mCublasWrapperMutex = std::make_shared(); + mCublasWrapper = std::make_shared( + cublasHandle, cublasLtHandle, nullptr, mCublasAlgoMap.get(), mCublasWrapperMutex.get(), nullptr); + + mPluginProfiler->setTranspose(mTransA, mTransB); + + mGemmId = GemmIdCublas(GemmIdCore(mDims.n, mDims.k, mType), mTransA, mTransB); +} + +void GemmPlugin::setGemmConfig() +{ + if (mType == DataType::kHALF) + { + mCublasWrapper->setFP16GemmConfig(); + } + else if (mType == DataType::kFLOAT) + { + mCublasWrapper->setFP32GemmConfig(); + } +#ifdef ENABLE_BF16 + else if (mType == DataType::kBF16) + { + mCublasWrapper->setBF16GemmConfig(); + } +#endif + +#ifdef ENABLE_FP8 + if (mUseFp8) + { + mCublasWrapper->setFP8GemmConfig(trtToCublasDtype(mType)); + } +#endif +} + +void GemmPlugin::configGemm() +{ + if (!mDims.isInitialized()) + { + return; + } + + setGemmConfig(); + + std::vector totalHeruistics; + for (int mCur = mDims.minM; mCur < mDims.maxM; mCur *= 2) + { + cublasOperation_t transa, transb; + int m, n, k; + int lda, ldb, ldc; + getProblemParams(transa, transb, m, n, k, lda, ldb, ldc, mTransA, mTransB, mCur, mDims.n, mDims.k); + const auto heruistics = mCublasWrapper->getTactics(transa, transb, m, n, k, lda, ldb, ldc); + + totalHeruistics.insert(totalHeruistics.end(), heruistics.begin(), heruistics.end()); + } + mPluginProfiler->profileTactics(totalHeruistics, mCublasWrapper, mType, mDims, mGemmId); } // IPluginV2DynamicExt Methods nvinfer1::IPluginV2DynamicExt* GemmPlugin::clone() const noexcept { auto* plugin = new GemmPlugin(*this); - plugin->setPluginNamespace(mNamespace.c_str()); - plugin->initialize(); return plugin; } @@ -57,8 +208,8 @@ nvinfer1::DimsExprs GemmPlugin::getOutputDimensions( { try { - PLUGIN_ASSERT(nbInputs == 2); - PLUGIN_ASSERT(outputIndex == 0); + TLLM_CHECK(nbInputs == 2); + TLLM_CHECK(outputIndex == 0); const int nbDimsA = inputs[0].nbDims; const int nbDimsB = inputs[1].nbDims; DimsExprs ret; @@ -107,9 +258,62 @@ bool GemmPlugin::supportsFormatCombination( return (inOut[pos].type == mType) && (inOut[pos].format == TensorFormat::kLINEAR); } +int32_t computeMDimension(bool transA, const int32_t nbDims, const int32_t* dims) +{ + int32_t M = 1; + if (transA) + { + for (int i = nbDims - 1; i > 0; --i) + { + M *= dims[i]; + } + } + else + { + for (int i = 0; i < nbDims - 1; ++i) + { + M *= dims[i]; + } + } + return M; +} + +int32_t computeNDimension(bool transB, const int32_t nbDims, const int32_t* dims) +{ + int32_t N = 1; + if (transB) + { + for (int i = 0; i < nbDims - 1; ++i) + { + N *= dims[i]; + } + } + else + { + for (int i 
= nbDims - 1; i > 0; --i) + { + N *= dims[i]; + } + } + return N; +} + void GemmPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept { + const int nbDimsA = in[0].max.nbDims; + const int nbDimsB = in[1].max.nbDims; + + const auto minM = computeMDimension(mTransA, nbDimsA, in[0].min.d); + const auto maxM = computeMDimension(mTransA, nbDimsA, in[0].max.d); + const auto N = computeNDimension(mTransB, nbDimsB, in[1].max.d); + const auto K = mTransA ? in[0].max.d[0] : in[0].max.d[nbDimsA - 1]; + + if (!mDims.isInitialized()) + { + mDims = {minM, maxM, N, K}; + } + mGemmId.gemmIdCore = {N, K, mType}; } size_t GemmPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, @@ -127,68 +331,19 @@ int GemmPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinf // outputs // mat [M, N] - auto cublasHandle = mCublasWrapper->getCublasHandle(); - PLUGIN_CUBLASASSERT(cublasSetStream(cublasHandle, stream)); - mCublasWrapper->setStream(stream); - mCublasWrapper->setWorkspace(workspace); - if (mType == DataType::kHALF) - { - mCublasWrapper->setFP16GemmConfig(); - } - else if (mType == DataType::kFLOAT) - { - mCublasWrapper->setFP32GemmConfig(); - } -#ifdef ENABLE_BF16 - else if (mType == DataType::kBF16) - { - mCublasWrapper->setBF16GemmConfig(); - } -#endif + setGemmConfig(); const int nbDimsA = inputDesc[0].dims.nbDims; - int M = 1, N = 1; - const int K = mTransA ? inputDesc[0].dims.d[0] : inputDesc[0].dims.d[nbDimsA - 1]; - if (mTransA) - { - for (int i = nbDimsA - 1; i > 0; --i) - { - M *= inputDesc[0].dims.d[i]; - } - } - else - { - for (int i = 0; i < nbDimsA - 1; ++i) - { - M *= inputDesc[0].dims.d[i]; - } - } const int nbDimsB = inputDesc[1].dims.nbDims; - if (mTransB) - { - for (int i = 0; i < nbDimsB - 1; ++i) - { - N *= inputDesc[1].dims.d[i]; - } - } - else - { - for (int i = nbDimsB - 1; i > 0; --i) - { - N *= inputDesc[1].dims.d[i]; - } - } - - cublasOperation_t transa = mTransB ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t transb = mTransA ? CUBLAS_OP_T : CUBLAS_OP_N; - const int m = N; - const int n = M; - const int k = K; - const auto lda = mTransB ? K : N; - const auto ldb = mTransA ? M : K; - const auto ldc = N; - mCublasWrapper->Gemm(transa, transb, m, n, k, inputs[1], lda, inputs[0], ldb, outputs[0], ldc); + const auto M = computeMDimension(mTransA, nbDimsA, inputDesc[0].dims.d); + const auto N = computeNDimension(mTransB, nbDimsB, inputDesc[1].dims.d); + const int K = mTransA ? 
inputDesc[0].dims.d[0] : inputDesc[0].dims.d[nbDimsA - 1]; + // FIXME(nkorobov): enable best config selection + // const auto& bestTactic = mPluginProfiler->getBestConfig(M, mGemmId); + const std::optional bestTactic = {}; + runGemm(M, N, K, mTransA, mTransB, mType, mCublasWrapper, inputs[0], inputs[1], outputs[0], bestTactic, workspace, + stream); return 0; } @@ -196,7 +351,7 @@ int GemmPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinf nvinfer1::DataType GemmPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(index == 0); return inputTypes[0]; } @@ -219,30 +374,16 @@ int GemmPlugin::getNbOutputs() const noexcept int GemmPlugin::initialize() noexcept { - auto cublasHandle = getCublasHandle(); - auto cublasLtHandle = getCublasLtHandle(); - mCublasAlgoMap = new cublasAlgoMap(GEMM_CONFIG); - mCublasWrapperMutex = new std::mutex(); - mCublasWrapper - = new cublasMMWrapper(cublasHandle, cublasLtHandle, nullptr, mCublasAlgoMap, mCublasWrapperMutex, nullptr); + configGemm(); return 0; } -void GemmPlugin::destroy() noexcept -{ - delete mCublasAlgoMap; - delete mCublasWrapperMutex; - delete mCublasWrapper; - - mCublasAlgoMap = nullptr; - mCublasWrapperMutex = nullptr; - mCublasWrapper = nullptr; - delete this; -} +void GemmPlugin::destroy() noexcept {} size_t GemmPlugin::getSerializationSize() const noexcept { - return sizeof(mTransA) + sizeof(mTransB) + sizeof(mType); + return sizeof(mTransA) + sizeof(mTransB) + sizeof(mType) + sizeof(mDims) + sizeof(mUseFp8) + + mPluginProfiler->getSerializationSize(mGemmId); // selected tactics container size } void GemmPlugin::serialize(void* buffer) const noexcept @@ -251,21 +392,15 @@ void GemmPlugin::serialize(void* buffer) const noexcept write(d, mTransA); write(d, mTransB); write(d, mType); + write(d, mUseFp8); + write(d, mDims); + mPluginProfiler->serialize(d, mGemmId); + assert(d == a + getSerializationSize()); } void GemmPlugin::terminate() noexcept {} -void GemmPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* GemmPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// GemmPluginCreator::GemmPluginCreator() @@ -275,6 +410,7 @@ GemmPluginCreator::GemmPluginCreator() mPluginAttributes.emplace_back(PluginField("transA", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("transB", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("use_fp8", nullptr, PluginFieldType::kINT32, 0)); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } @@ -299,29 +435,38 @@ IPluginV2* GemmPluginCreator::createPlugin(const char* name, const PluginFieldCo const PluginField* fields = fc->fields; int transA, transB; nvinfer1::DataType type; + int useFp8; // Read configurations from each fields for (int i = 0; i < fc->nbFields; ++i) { const char* attrName = fields[i].name; if (!strcmp(attrName, "transa")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); transA = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "transb")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); transB = 
static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } + else if (!strcmp(attrName, "use_fp8")) + { + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); + useFp8 = static_cast(*(static_cast(fields[i].data))); + } } try { - auto* obj = new GemmPlugin(transA, transB, type); + // GemmPluginCreator is unique and shared for an engine generation + // Create plugin profiler with shared tactics map + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ false); + auto* obj = new GemmPlugin(transA, transB, type, useFp8, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -338,7 +483,10 @@ IPluginV2* GemmPluginCreator::deserializePlugin(const char* name, const void* se // call GemmPlugin::destroy() try { - auto* obj = new GemmPlugin(serialData, serialLength); + // GemmPluginCreator is unique and shared for an engine generation + // Create plugin profiler with shared tactics map + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ true); + auto* obj = new GemmPlugin(serialData, serialLength, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -348,13 +496,3 @@ IPluginV2* GemmPluginCreator::deserializePlugin(const char* name, const void* se } return nullptr; } - -void GemmPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* GemmPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h index 6651f383ad9..8775388b147 100644 --- a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h +++ b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.h @@ -16,27 +16,98 @@ */ #ifndef TRT_GEMM_PLUGIN_H #define TRT_GEMM_PLUGIN_H -#include "NvInferPlugin.h" #include "tensorrt_llm/common/cublasMMWrapper.h" +#include "tensorrt_llm/plugins/common/gemmPluginProfiler.h" #include "tensorrt_llm/plugins/common/plugin.h" #include #include #include #include -namespace nvinfer1 +namespace tensorrt_llm::plugins { -namespace plugin + +using CublasGemmWrapper = tensorrt_llm::common::cublasMMWrapper; +using CublasGemmWrapperPtr = std::shared_ptr; + +class GemmIdCublas { +public: + GemmIdCore gemmIdCore{}; + bool transA{}; + bool transB{}; + + GemmIdCublas(const GemmIdCore& gemmIdCore_, bool transA_, bool transB_) + : gemmIdCore(gemmIdCore_) + , transA(transA_) + , transB(transB_) + { + } + + GemmIdCublas() {} + + bool operator==(const GemmIdCublas& id) const + { + return gemmIdCore == id.gemmIdCore && transA == id.transA && transB == id.transB; + } + + friend std::ostream& operator<<(std::ostream& out, const GemmIdCublas& id) + { + out << "Core ID = {" << id.gemmIdCore << "}"; + out << " transA=" << id.transA; + out << " transB=" << id.transB; + return out; + } +}; + +// Hash of GemmIdCublas +struct GemmIdCublasHash +{ + std::size_t operator()(const GemmIdCublas& id) const + { + auto h1 = GemmIdCoreHash()(id.gemmIdCore); + auto h2 = std::hash{}(id.transA); + auto h3 = std::hash{}(id.transB); + return h1 ^ h2 ^ h3; + } +}; -class GemmPlugin : public IPluginV2DynamicExt +class CublasLtGemmPluginProfiler + : public GemmPluginProfiler { public: + using Config = cublasLtMatmulHeuristicResult_t; + + void 
setTranspose(bool transposeA, bool transposeB) + { + mTransA = transposeA; + mTransB = transposeB; + } + +protected: + void runTactic(int m, int n, int k, const Config& tactic, char* workspace, const cudaStream_t& stream) override; + + void computeTmpSize(int maxM, int n, int k) override; + + bool checkTactic(int m, int n, int k, const Config& tactic) const override; + +private: + bool mTransA; + bool mTransB; + + static constexpr size_t ALIGNMENT = 256; +}; + +class GemmPlugin : public BasePlugin +{ +public: + using PluginProfilerPtr = std::shared_ptr; + GemmPlugin() = delete; - GemmPlugin(int transA, int transB, nvinfer1::DataType type); + GemmPlugin(int transA, int transB, nvinfer1::DataType type, bool useFp8, const PluginProfilerPtr& profiler); - GemmPlugin(const void* data, size_t length); + GemmPlugin(const void* data, size_t length, const PluginProfilerPtr& profiler); ~GemmPlugin() override = default; @@ -66,23 +137,31 @@ class GemmPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; + +private: + void init(); + void configGemm(); + void setGemmConfig(); private: const std::string mLayerName; - std::string mNamespace; int mTransA; int mTransB; nvinfer1::DataType mType; - tensorrt_llm::common::cublasAlgoMap* mCublasAlgoMap; - std::mutex* mCublasWrapperMutex; - tensorrt_llm::common::cublasMMWrapper* mCublasWrapper; + std::shared_ptr mCublasAlgoMap; + std::shared_ptr mCublasWrapperMutex; + CublasGemmWrapperPtr mCublasWrapper; + + GemmDims mDims{}; + GemmIdCublas mGemmId{}; + bool mUseFp8{false}; + + PluginProfilerPtr mPluginProfiler; }; -class GemmPluginCreator : public IPluginCreator +class GemmPluginCreator : public BaseCreator { public: GemmPluginCreator(); @@ -98,17 +177,12 @@ class GemmPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + GemmPluginProfilerManager gemmPluginProfileManager; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 +} // namespace tensorrt_llm::plugins #endif // TRT_GEMM_PLUGIN_H diff --git a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp index 851d725bf98..c1ec28864d8 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp +++ b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp @@ -14,12 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
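The GemmIdCublas/GemmIdCublasHash pair introduced in gemmPlugin.h above exists so that per-shape GEMM tactics can be cached in an unordered map keyed by (n, k, dtype, transA, transB). The sketch below shows the same pattern (operator== plus an XOR-combined std::hash) on a simplified key; TacticKey and TacticKeyHash are illustrative names, not types from the patch.

#include <cstddef>
#include <cstdio>
#include <functional>
#include <unordered_map>

struct TacticKey
{
    int n{};
    int k{};
    bool transA{};
    bool transB{};

    bool operator==(TacticKey const& o) const
    {
        return n == o.n && k == o.k && transA == o.transA && transB == o.transB;
    }
};

struct TacticKeyHash
{
    std::size_t operator()(TacticKey const& id) const
    {
        // Same simple XOR combine as GemmIdCublasHash above.
        auto h1 = std::hash<int>{}(id.n);
        auto h2 = std::hash<int>{}(id.k);
        auto h3 = std::hash<bool>{}(id.transA);
        auto h4 = std::hash<bool>{}(id.transB);
        return h1 ^ h2 ^ h3 ^ h4;
    }
};

int main()
{
    std::unordered_map<TacticKey, int, TacticKeyHash> bestTacticPerShape;
    bestTacticPerShape[{4096, 4096, false, true}] = 7; // pretend tactic index
    std::printf("tactics cached: %zu\n", bestTacticPerShape.size());
    return 0;
}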
*/ -#include "tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h" -#include "checkMacrosPlugin.h" +#include "gptAttentionCommon.h" #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h" #include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/kernels/unfusedAttentionKernels.h" +#include "tensorrt_llm/plugins/common/checkMacrosPlugin.h" +#include "tensorrt_llm/runtime/iBuffer.h" #include #include #include @@ -27,10 +28,9 @@ using namespace nvinfer1; using namespace tensorrt_llm::kernels; -using namespace tensorrt_llm::common; -using nvinfer1::plugin::GPTAttentionPluginCreatorCommon; -using nvinfer1::plugin::GPTAttentionPluginCommon; -using nvinfer1::plugin::nextWorkspacePtr; +namespace tc = tensorrt_llm::common; +using tensorrt_llm::plugins::GPTAttentionPluginCreatorCommon; +using tensorrt_llm::plugins::GPTAttentionPluginCommon; template struct KVCacheBufferDataType @@ -78,6 +78,8 @@ struct FusedQKVMaskedAttentionDispatchParams int kv_head_num; int size_per_head; int rotary_embedding_dim; + float rotary_embedding_base; + float rotary_embedding_scale; PositionEmbeddingType position_embedding_type; int max_seq_len; const int* input_lengths; @@ -90,7 +92,7 @@ struct FusedQKVMaskedAttentionDispatchParams const T* ia3_value_weights; const float* qkv_scale_out; const float* attention_out_scale; - QuantMode quant_option; + tc::QuantMode quant_option; bool multi_block_mode; int max_seq_len_tile; T* partial_out; @@ -99,7 +101,7 @@ struct FusedQKVMaskedAttentionDispatchParams int* block_counter; const float* kv_scale_orig_quant; const float* kv_scale_quant_orig; - QuantMode kv_cache_quant_mode; + tc::QuantMode kv_cache_quant_mode; int multi_processor_count; KVCacheBuffer kv_block_array; }; @@ -157,6 +159,8 @@ void fusedQKV_masked_attention_dispatch( params.num_kv_heads = input_params.kv_head_num; params.hidden_size_per_head = input_params.size_per_head; params.rotary_embedding_dim = input_params.rotary_embedding_dim; + params.rotary_embedding_base = input_params.rotary_embedding_base; + params.rotary_embedding_scale = input_params.rotary_embedding_scale; params.position_embedding_type = input_params.position_embedding_type; // Note: keep norm factor (sqrt(K_dim)) when adopting megatron T5 structure (may adjust) params.inv_sqrt_dh = 1.F / (sqrtf((float) params.hidden_size_per_head) * input_params.q_scaling); @@ -214,7 +218,8 @@ template void fusedQKV_masked_attention_dispatch( GPTAttentionPluginCommon::GPTAttentionPluginCommon(int num_heads, int num_kv_heads, int unidirectional, float q_scaling, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. 
Use 0 for non-RoPE - int tp_size, int tp_rank, // for ALiBi + float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, + float rotary_embedding_scale, int rotary_embedding_max_positions, int tp_size, int tp_rank, // for ALiBi tensorrt_llm::kernels::ContextFMHAType context_fmha_type, bool multi_block_mode, int kv_cache_quant_mode, bool remove_input_padding, tensorrt_llm::kernels::AttentionMaskType mask_type, bool paged_kv_cache, nvinfer1::DataType type, int32_t max_context_length, bool qkv_bias_enabled) @@ -224,6 +229,10 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(int num_heads, int num_kv_hea , mUnidirectional(unidirectional) , mQScaling(q_scaling) , mRotaryEmbeddingDim(rotary_embedding_dim) + , mRotaryEmbeddingBase(rotary_embedding_base) + , mRotaryEmbeddingScaleType(rotary_embedding_scale_type) + , mRotaryEmbeddingScale(rotary_embedding_scale) + , mRotaryEmbeddingMaxPositions(rotary_embedding_max_positions) , mPositionEmbeddingType(position_embedding_type) , mEnableContextFMHA(context_fmha_type != ContextFMHAType::DISABLED) , mFMHAForceFP32Acc(context_fmha_type == ContextFMHAType::ENABLED_WITH_FP32_ACC || type == DataType::kBF16) @@ -239,8 +248,8 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(int num_heads, int num_kv_hea , mQKVBiasEnabled(qkv_bias_enabled) { mEnableContextFMHA = mEnableContextFMHA && (mType == DataType::kHALF || mType == DataType::kBF16); - PLUGIN_ASSERT(isRoPE() == (rotary_embedding_dim != 0)); - TLLM_CHECK_WITH_INFO((getSMVersion() >= 80) || (mType != DataType::kBF16), + TLLM_CHECK(isRoPE() == (rotary_embedding_dim != 0)); + TLLM_CHECK_WITH_INFO((tc::getSMVersion() >= 80) || (mType != DataType::kBF16), "Unsupported data type, pre SM 80 GPUs do not support bfloat16"); } @@ -266,6 +275,10 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(const void* data, size_t leng read(d, mQScaling); read(d, mPositionEmbeddingType); read(d, mRotaryEmbeddingDim); + read(d, mRotaryEmbeddingBase); + read(d, mRotaryEmbeddingScaleType); + read(d, mRotaryEmbeddingScale); + read(d, mRotaryEmbeddingMaxPositions); read(d, mTpSize); read(d, mTpRank); read(d, mEnableContextFMHA); @@ -279,10 +292,10 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(const void* data, size_t leng read(d, mMaxContextLength); read(d, mQKVBiasEnabled); - mKVCacheQuantMode = QuantMode(kvCacheQuantMode); + mKVCacheQuantMode = tc::QuantMode(kvCacheQuantMode); - PLUGIN_ASSERT(d == a + length); - TLLM_CHECK_WITH_INFO((getSMVersion() >= 80) || (mType != DataType::kBF16), + TLLM_CHECK(d == a + length); + TLLM_CHECK_WITH_INFO((tc::getSMVersion() >= 80) || (mType != DataType::kBF16), "Unsupported data type, pre SM 80 GPUs do not support bfloat16"); } @@ -293,7 +306,7 @@ size_t GPTAttentionPluginCommon::getWorkspaceSizeForContext( const int local_hidden_units_qo = mNumHeads * getHeadSize(); const int local_hidden_units_kv = mNumKVHeads * getHeadSize(); - size_t const size = elementSize(type); + auto const size = tensorrt_llm::runtime::BufferDataType(type).getSize(); size_t context_workspace_size = 0; @@ -322,7 +335,7 @@ size_t GPTAttentionPluginCommon::getWorkspaceSizeForContext( workspaces[7] = qkv_buf_2_size; workspaces[8] = qk_buf_float_size; workspaces[9] = padding_offset_size; - context_workspace_size = plugin::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); + context_workspace_size = tensorrt_llm::plugins::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); return context_workspace_size; } @@ -331,7 +344,7 @@ size_t 
GPTAttentionPluginCommon::getWorkspaceSizeForGeneration(DataType type, in const int local_hidden_units_qo = mNumHeads * getHeadSize(); const int local_hidden_units_kv = mNumKVHeads * getHeadSize(); - size_t const size = elementSize(type); + auto const size = tensorrt_llm::runtime::BufferDataType(type).getSize(); size_t context_workspace_size = 0; size_t generation_workspace_size = 0; @@ -350,7 +363,7 @@ size_t GPTAttentionPluginCommon::getWorkspaceSizeForGeneration(DataType type, in workspaces[1] = partial_sum_size; workspaces[2] = partial_max_size; workspaces[3] = block_counter_size; - generation_workspace_size = plugin::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); + generation_workspace_size = tensorrt_llm::plugins::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS); return generation_workspace_size; } @@ -402,7 +415,7 @@ int GPTAttentionPluginCommon::enqueueContext(const EnqueueContextParams(params.key_value_cache); } - const QuantMode quant_option = QuantMode::fromDescription(); + const auto quant_option = tc::QuantMode::fromDescription(); const float* qkv_scale_out = nullptr; const float* attention_out_scale = nullptr; @@ -421,7 +434,7 @@ int GPTAttentionPluginCommon::enqueueContext(const EnqueueContextParamsgetCublasHandle(); - PLUGIN_CUBLASASSERT(cublasSetStream(cublasHandle, stream)); + TLLM_CUDA_CHECK(cublasSetStream(cublasHandle, stream)); mCublasWrapper->setStream(stream); mCublasWrapper->setWorkspace(params.workspace); if constexpr (std::is_same_v) @@ -490,11 +503,15 @@ int GPTAttentionPluginCommon::enqueueContext(const EnqueueContextParams(v_buf_2_) - reinterpret_cast(k_buf_2_) + v_buf_2_size, stream); + float rotary_base, rotary_scale; + const int32_t kv_seq_len = params.input_seq_length; + update_rotary_params(kv_seq_len, rotary_base, rotary_scale); invokeAddFusedQKVBiasTranspose(q_buf_2_, k_buf_2_, v_buf_2_, const_cast(params.attention_input), const_cast(params.qkv_bias), params.context_lengths, mRemovePadding ? 
padding_offset : nullptr, request_batch_size, request_seq_length, params.num_tokens, mNumHeads, mNumKVHeads, getHeadSize(), - mEnableContextFMHA, mRotaryEmbeddingDim, position_embedding_type, (float*) nullptr, 0, stream); + mEnableContextFMHA, mRotaryEmbeddingDim, rotary_base, rotary_scale, position_embedding_type, (float*) nullptr, + 0, stream); sync_check_cuda_error(); @@ -506,7 +523,7 @@ int GPTAttentionPluginCommon::enqueueContext(const EnqueueContextParams::value; + const cudaDataType_t gemm_data_type = tc::CudaDataType::value; const int attention_seq_len_1 = request_seq_length; // q length const int attention_seq_len_2 = request_seq_length; // kv length const T qk_scale = static_cast(1.0f / (sqrtf(getHeadSize() * 1.0f) * q_scaling)); @@ -734,7 +751,7 @@ int GPTAttentionPluginCommon::enqueueGeneration( const bool* finished = nullptr; const bool has_ia3 = false; - const QuantMode quant_option = QuantMode::fromDescription(); + const auto quant_option = tc::QuantMode::fromDescription(); const float* qkv_scale_out = nullptr; const float* attention_out_scale = nullptr; @@ -761,7 +778,7 @@ int GPTAttentionPluginCommon::enqueueGeneration( int* block_counter = reinterpret_cast(nextWorkspacePtr(workspace_byte_ptr, offset, block_counter_size)); if (mMultiBlockMode) { - PLUGIN_CUASSERT(cudaMemsetAsync(block_counter, 0, block_counter_size, stream)); + TLLM_CUDA_CHECK(cudaMemsetAsync(block_counter, 0, block_counter_size, stream)); } KVCacheBuffer kv_cache_buffer; @@ -824,6 +841,8 @@ int GPTAttentionPluginCommon::enqueueGeneration( dispatch_params.kv_scale_quant_orig = params.kv_scale_quant_orig; dispatch_params.kv_block_array = kv_cache_buffer; dispatch_params.multi_processor_count = mMultiProcessorCount; + const int32_t kv_seq_len = step; + update_rotary_params(kv_seq_len, dispatch_params.rotary_embedding_base, dispatch_params.rotary_embedding_scale); fusedQKV_masked_attention_dispatch(dispatch_params, stream); sync_check_cuda_error(); return 0; @@ -856,10 +875,10 @@ int GPTAttentionPluginCommon::initialize() noexcept auto cublasHandle = getCublasHandle(); auto cublasLtHandle = getCublasLtHandle(); - mCublasAlgoMap = new cublasAlgoMap(GEMM_CONFIG); + mCublasAlgoMap = new tc::cublasAlgoMap(GEMM_CONFIG); mCublasWrapperMutex = new std::mutex(); mCublasWrapper - = new cublasMMWrapper(cublasHandle, cublasLtHandle, nullptr, mCublasAlgoMap, mCublasWrapperMutex, nullptr); + = new tc::cublasMMWrapper(cublasHandle, cublasLtHandle, nullptr, mCublasAlgoMap, mCublasWrapperMutex, nullptr); if (mEnableContextFMHA) { // Pre-checked during constructing. 
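getProblemParams/runGemm in gemmPlugin.cpp, and the cuBLAS calls in the attention path above, rely on the usual trick of running a row-major problem on column-major cuBLAS by swapping the operands. A minimal standalone illustration, assuming plain cublasSgemm and FP32 data purely for readability (the plugin itself goes through cublasMMWrapper):

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main()
{
    int const M = 2, N = 3, K = 4;
    std::vector<float> act(M * K, 1.0f);    // row-major [M, K]
    std::vector<float> weight(K * N, 1.0f); // row-major [K, N]
    std::vector<float> out(M * N, 0.0f);    // row-major [M, N]

    float *dA = nullptr, *dB = nullptr, *dC = nullptr;
    cudaMalloc((void**) &dA, act.size() * sizeof(float));
    cudaMalloc((void**) &dB, weight.size() * sizeof(float));
    cudaMalloc((void**) &dC, out.size() * sizeof(float));
    cudaMemcpy(dA, act.data(), act.size() * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, weight.data(), weight.size() * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    // Same mapping as getProblemParams with transA = transB = false:
    // m = N, n = M, k = K, lda = N, ldb = K, ldc = N, weight passed as the first
    // operand and the activation as the second, so column-major cuBLAS computes
    // out^T = weight^T x act^T, i.e. the row-major product act x weight.
    float const alpha = 1.0f, beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, dB, N, dA, K, &beta, dC, N);

    cudaMemcpy(out.data(), dC, out.size() * sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("out[0][0] = %f (expected %d)\n", out[0], K);

    cublasDestroy(handle);
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
    return 0;
}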
@@ -906,9 +925,10 @@ void GPTAttentionPluginCommon::destroy() noexcept size_t GPTAttentionPluginCommon::getCommonSerializationSize() noexcept { return sizeof(mNumHeads) + sizeof(mNumKVHeads) + sizeof(mHeadSize) + sizeof(mUnidirectional) + sizeof(mQScaling) - + sizeof(mPositionEmbeddingType) + sizeof(mRotaryEmbeddingDim) + sizeof(mTpSize) + sizeof(mTpRank) - + sizeof(mEnableContextFMHA) + sizeof(mFMHAForceFP32Acc) + sizeof(mMultiBlockMode) - + sizeof(unsigned int) // mKVCacheQuantMode + + sizeof(mPositionEmbeddingType) + sizeof(mRotaryEmbeddingDim) + sizeof(mRotaryEmbeddingBase) + + sizeof(mRotaryEmbeddingScaleType) + sizeof(mRotaryEmbeddingScale) + sizeof(mRotaryEmbeddingMaxPositions) + + sizeof(mTpSize) + sizeof(mTpRank) + sizeof(mEnableContextFMHA) + sizeof(mFMHAForceFP32Acc) + + sizeof(mMultiBlockMode) + sizeof(unsigned int) // mKVCacheQuantMode + sizeof(mRemovePadding) + sizeof(mMaskType) + sizeof(mPagedKVCache) + sizeof(mType) + sizeof(mMaxContextLength) + sizeof(mQKVBiasEnabled); } @@ -923,6 +943,10 @@ void GPTAttentionPluginCommon::serializeCommon(void* buffer) const noexcept write(d, mQScaling); write(d, mPositionEmbeddingType); write(d, mRotaryEmbeddingDim); + write(d, mRotaryEmbeddingBase); + write(d, mRotaryEmbeddingScaleType); + write(d, mRotaryEmbeddingScale); + write(d, mRotaryEmbeddingMaxPositions); write(d, mTpSize); write(d, mTpRank); write(d, mEnableContextFMHA); @@ -943,16 +967,6 @@ void GPTAttentionPluginCommon::terminate() noexcept // Do nothing, destroy will always be called, so release the resources there. } -void GPTAttentionPluginCommon::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* GPTAttentionPluginCommon::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// GPTAttentionPluginCreatorCommon::GPTAttentionPluginCreatorCommon() @@ -965,6 +979,10 @@ GPTAttentionPluginCreatorCommon::GPTAttentionPluginCreatorCommon() mPluginAttributes.emplace_back(PluginField("q_scaling", nullptr, PluginFieldType::kFLOAT32, 1.0)); mPluginAttributes.emplace_back(PluginField("position_embedding_type", nullptr, PluginFieldType::kINT8, 0)); mPluginAttributes.emplace_back(PluginField("rotary_embedding_dim", nullptr, PluginFieldType::kINT32, 0)); + mPluginAttributes.emplace_back(PluginField("rotary_embedding_base", nullptr, PluginFieldType::kFLOAT32, 0)); + mPluginAttributes.emplace_back(PluginField("rotary_embedding_scale_type", nullptr, PluginFieldType::kINT8, 0)); + mPluginAttributes.emplace_back(PluginField("rotary_embedding_scale", nullptr, PluginFieldType::kFLOAT32, 0)); + mPluginAttributes.emplace_back(PluginField("rotary_embedding_max_positions", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("tp_size", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("tp_rank", nullptr, PluginFieldType::kINT32, 0)); mPluginAttributes.emplace_back(PluginField("context_fmha_type", nullptr, PluginFieldType::kINT8, 0)); @@ -984,13 +1002,3 @@ const PluginFieldCollection* GPTAttentionPluginCreatorCommon::getFieldNames() no { return &mFC; } - -void GPTAttentionPluginCreatorCommon::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* GPTAttentionPluginCreatorCommon::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h index 
de4f63ef608..0de39079344 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h +++ b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef TRT_GPT_ATTENTION_COMMON_H -#define TRT_GPT_ATTENTION_COMMON_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h" @@ -28,12 +27,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class GPTAttentionPluginCommon : public IPluginV2DynamicExt +class GPTAttentionPluginCommon : public BasePlugin { public: GPTAttentionPluginCommon() = delete; @@ -41,7 +38,8 @@ class GPTAttentionPluginCommon : public IPluginV2DynamicExt GPTAttentionPluginCommon(int num_heads, int num_kv_heads, int unidirectional, float q_scaling, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. Use 0 for non-RoPE - int tp_size, int tp_rank, // for ALiBi + float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, + float rotary_embedding_scale, int rotary_embedding_max_positions, int tp_size, int tp_rank, // for ALiBi tensorrt_llm::kernels::ContextFMHAType context_fmha_type, bool multi_block_mode, int kv_cache_quant_mode, bool remove_input_padding, tensorrt_llm::kernels::AttentionMaskType mask_type, bool paged_kv_cache, nvinfer1::DataType type, int32_t max_context_length, bool qkv_bias_enabled); @@ -71,15 +69,13 @@ class GPTAttentionPluginCommon : public IPluginV2DynamicExt static size_t getCommonSerializationSize() noexcept; void serializeCommon(void* buffer) const noexcept; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; const int getHeadSize(bool checkInit = true) const; protected: int getMaxSeqLenTile(int elemSize) const; - size_t getWorkspaceSizeForContext(DataType type, int32_t nbReq, int32_t max_input_length) const noexcept; + size_t getWorkspaceSizeForContext(nvinfer1::DataType type, int32_t nbReq, int32_t max_input_length) const noexcept; // total_num_seq is the sum of beam_width for multiple requests - size_t getWorkspaceSizeForGeneration(DataType type, int32_t total_num_seq) const noexcept; + size_t getWorkspaceSizeForGeneration(nvinfer1::DataType type, int32_t total_num_seq) const noexcept; template struct EnqueueContextParams @@ -142,9 +138,26 @@ class GPTAttentionPluginCommon : public IPluginV2DynamicExt || mPositionEmbeddingType == tensorrt_llm::kernels::PositionEmbeddingType::kROPE_GPT_NEOX; } + inline void update_rotary_params(int32_t kv_seq_len, float& base, float& scale) + { + base = mRotaryEmbeddingBase; + scale = 1.0f / mRotaryEmbeddingScale; // do the division here so that we can avoid it in the kernel + if (mPositionEmbeddingType == tensorrt_llm::kernels::PositionEmbeddingType::kROPE_GPT_NEOX + && mRotaryEmbeddingScaleType == tensorrt_llm::kernels::RotaryScalingType::kDYNAMIC) + { + if (kv_seq_len > mRotaryEmbeddingMaxPositions) + { + const float b + = (mRotaryEmbeddingScale * kv_seq_len / mRotaryEmbeddingMaxPositions) - (mRotaryEmbeddingScale - 1); + const float p = static_cast(mRotaryEmbeddingDim) / (mRotaryEmbeddingDim - 2); + base = mRotaryEmbeddingBase * pow(b, p); + } + scale = 1.0f; // scale factor is 
already used in updated base + } + } + protected: const std::string mLayerName; - std::string mNamespace; int mNumHeads; int mNumKVHeads; @@ -152,6 +165,10 @@ class GPTAttentionPluginCommon : public IPluginV2DynamicExt int mUnidirectional; float mQScaling; int mRotaryEmbeddingDim; + float mRotaryEmbeddingBase; + tensorrt_llm::kernels::RotaryScalingType mRotaryEmbeddingScaleType; + float mRotaryEmbeddingScale; + int mRotaryEmbeddingMaxPositions; tensorrt_llm::kernels::PositionEmbeddingType mPositionEmbeddingType; bool mRemovePadding = false; tensorrt_llm::kernels::AttentionMaskType mMaskType; @@ -178,7 +195,7 @@ class GPTAttentionPluginCommon : public IPluginV2DynamicExt tensorrt_llm::common::cublasMMWrapper* mCublasWrapper; }; -class GPTAttentionPluginCreatorCommon : public IPluginCreator +class GPTAttentionPluginCreatorCommon : public BaseCreator { public: GPTAttentionPluginCreatorCommon(); @@ -188,17 +205,9 @@ class GPTAttentionPluginCreatorCommon : public IPluginCreator template T* deserializePluginImpl(const char* name, const void* serialData, size_t serialLength) noexcept; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - protected: - std::vector mPluginAttributes; - PluginFieldCollection mFC{}; - std::string mNamespace; + std::vector mPluginAttributes; + nvinfer1::PluginFieldCollection mFC{}; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_GPT_ATTENTION_COMMON_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommonImpl.h b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommonImpl.h index 5dfb12dc206..7d9b3b941db 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommonImpl.h +++ b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommonImpl.h @@ -15,12 +15,11 @@ * limitations under the License. */ -#ifndef TRT_GPT_ATTENTION_COMMON_IMPL_H -#define TRT_GPT_ATTENTION_COMMON_IMPL_H +#pragma once #include "gptAttentionCommon.h" -namespace nvinfer1::plugin +namespace tensorrt_llm::plugins { template T* GPTAttentionPluginCommon::cloneImpl() const noexcept @@ -52,6 +51,4 @@ T* GPTAttentionPluginCreatorCommon::deserializePluginImpl( } return nullptr; } -} // namespace nvinfer1::plugin - -#endif // TRT_GPT_ATTENTION_COMMON_IMPL_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp index a5db1cfed9e..4aad4e1f443 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp @@ -14,14 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
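update_rotary_params in gptAttentionCommon.h above folds dynamic NTK-style scaling into the RoPE base whenever the KV sequence length exceeds rotary_embedding_max_positions, and otherwise hands the kernel the reciprocal of the user scale. The standalone restatement below mirrors that branch so the formula is easier to check; RotaryParams and updateRotary are simplified stand-ins for the plugin members, not code from the patch.

#include <cmath>
#include <cstdio>

struct RotaryParams
{
    int dim;
    float base;
    float scale; // user-provided scaling factor (mRotaryEmbeddingScale)
    int maxPositions;
    bool dynamicNeoxScaling; // kROPE_GPT_NEOX position embedding + kDYNAMIC scaling in the plugin
};

static void updateRotary(RotaryParams const& p, int kvSeqLen, float& base, float& scale)
{
    base = p.base;
    scale = 1.0f / p.scale; // division hoisted out of the kernel, as in the patch
    if (p.dynamicNeoxScaling)
    {
        if (kvSeqLen > p.maxPositions)
        {
            float const b = (p.scale * kvSeqLen / p.maxPositions) - (p.scale - 1.0f);
            float const power = static_cast<float>(p.dim) / (p.dim - 2);
            base = p.base * std::pow(b, power);
        }
        scale = 1.0f; // the scaling is folded into the updated base
    }
}

int main()
{
    RotaryParams p{128, 10000.0f, 2.0f, 2048, true};
    float base = 0.0f, scale = 0.0f;
    updateRotary(p, 4096, base, scale);
    std::printf("effective base = %.1f, scale = %.3f\n", base, scale);
    return 0;
}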
*/ -#include "tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h" -#include "checkMacrosPlugin.h" -#include "gptAttentionCommon.h" -#include "gptAttentionCommon/gptAttentionCommonImpl.h" -#include "plugin.h" +#include "gptAttentionPlugin.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h" #include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/kernels/unfusedAttentionKernels.h" +#include "tensorrt_llm/plugins/common/checkMacrosPlugin.h" +#include "tensorrt_llm/plugins/common/plugin.h" +#include "tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h" +#include "tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommonImpl.h" #include #include #include @@ -29,9 +29,8 @@ using namespace nvinfer1; using namespace tensorrt_llm::kernels; -using namespace tensorrt_llm::common; -using nvinfer1::plugin::GPTAttentionPluginCreator; -using nvinfer1::plugin::GPTAttentionPlugin; +using tensorrt_llm::plugins::GPTAttentionPluginCreator; +using tensorrt_llm::plugins::GPTAttentionPlugin; static const char* GPT_ATTENTION_PLUGIN_VERSION{"1"}; static const char* GPT_ATTENTION_PLUGIN_NAME{"GPTAttention"}; @@ -39,27 +38,21 @@ static const char* GPT_ATTENTION_PLUGIN_NAME{"GPTAttention"}; GPTAttentionPlugin::GPTAttentionPlugin(int num_heads, int num_kv_heads, int unidirectional, float q_scaling, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. 0 for non-RoPE - int tp_size, int tp_rank, // for ALiBi + float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, + float rotary_embedding_scale, int rotary_embedding_max_positions, int tp_size, int tp_rank, // for ALiBi tensorrt_llm::kernels::ContextFMHAType context_fmha_type, bool multi_block_mode, int kv_cache_quant_mode, bool remove_input_padding, tensorrt_llm::kernels::AttentionMaskType mask_type, bool paged_kv_cache, - nvinfer1::DataType type, bool in_flight_batching, int32_t max_context_length, bool qkv_bias_enabled) + nvinfer1::DataType type, int32_t max_context_length, bool qkv_bias_enabled) : GPTAttentionPluginCommon(num_heads, num_kv_heads, unidirectional, q_scaling, position_embedding_type, - rotary_embedding_dim, tp_size, tp_rank, context_fmha_type, multi_block_mode, kv_cache_quant_mode, + rotary_embedding_dim, rotary_embedding_base, rotary_embedding_scale_type, rotary_embedding_scale, + rotary_embedding_max_positions, tp_size, tp_rank, context_fmha_type, multi_block_mode, kv_cache_quant_mode, remove_input_padding, mask_type, paged_kv_cache, type, max_context_length, qkv_bias_enabled) - , mInFlightBatching(in_flight_batching) { - TLLM_CHECK(!mInFlightBatching || mRemovePadding); } GPTAttentionPlugin::GPTAttentionPlugin(const void* data, size_t length) - : GPTAttentionPluginCommon(data, GPTAttentionPluginCommon::getCommonSerializationSize()) + : GPTAttentionPluginCommon(data, length) { - const char *d = reinterpret_cast(data), *a = d; - d += GPTAttentionPluginCommon::getCommonSerializationSize(); - - read(d, mInFlightBatching); - TLLM_CHECK(d == a + length); - TLLM_CHECK(!mInFlightBatching || mRemovePadding); } // IPluginV2DynamicExt Methods @@ -157,49 +150,56 @@ int GPTAttentionPlugin::enqueueImpl(const nvinfer1::PluginTensorDesc* inputDesc, cudaStream_t stream) { int32_t const nbSeq = inputDesc[getContextLengthsIdx()].dims.d[0]; - if (!mInFlightBatching) - { - enqueueSome(0, nbSeq, 0, inputDesc, outputDesc, inputs, outputs, workspace, stream); - return 0; - } - // In-flight batching code path int32_t const 
beam_width = inputDesc[getCacheIndirIdx()].dims.d[1]; RequestType const* reqTypes = static_cast(inputs[getRequestTypesIdx()]); int32_t nbContextRequests = 0; int32_t contextTokenIdxEnd = 0; // count context requests - for (int32_t i = 0; i < nbSeq; i++) + for (int32_t seqIdx = 0; seqIdx < nbSeq; seqIdx++) { - if (reqTypes[i] != RequestType::kCONTEXT) + if (reqTypes[seqIdx] != RequestType::kCONTEXT) { break; } ++nbContextRequests; - contextTokenIdxEnd += (mRemovePadding ? getInputLength(inputs, i) : inputDesc[getInputTensorIdx()].dims.d[1]); + contextTokenIdxEnd += mRemovePadding ? static_cast(inputs[getHostContextLengthsIdx()])[seqIdx] + : inputDesc[getInputTensorIdx()].dims.d[1]; } - for (int32_t i = nbContextRequests; i < nbSeq; i++) + for (int32_t seqIdx = nbContextRequests; seqIdx < nbSeq; seqIdx++) + { + TLLM_CHECK(reqTypes[seqIdx] == RequestType::kGENERATION); + } + + // mixed requests require mRemovePadding and mPagedKVCache + if (nbContextRequests != 0 && nbContextRequests != nbSeq) { - TLLM_CHECK(reqTypes[i] == RequestType::kGENERATION); + TLLM_CHECK(mRemovePadding && mPagedKVCache); } if (nbContextRequests > 0) { - enqueueSome( - 0, nbContextRequests, 0, inputDesc, outputDesc, inputs, outputs, workspace, stream); + auto seqIdxBeg = 0; + auto tokenIdxBeg = 0; + auto localNbTokens = contextTokenIdxEnd; + enqueueSome(seqIdxBeg, nbContextRequests, tokenIdxBeg, localNbTokens, inputDesc, outputDesc, + inputs, outputs, workspace, stream); } - if (nbSeq - nbContextRequests > 0) + if (auto nbGenerationSeq = nbSeq - nbContextRequests; nbGenerationSeq > 0) { - enqueueSome(nbContextRequests, nbSeq - nbContextRequests, contextTokenIdxEnd, inputDesc, - outputDesc, inputs, outputs, workspace, stream); + auto seqIdxBeg = nbContextRequests; + auto tokenIdxBeg = contextTokenIdxEnd; + auto localNbTokens = nbGenerationSeq; + enqueueSome(seqIdxBeg, nbGenerationSeq, tokenIdxBeg, localNbTokens, inputDesc, outputDesc, + inputs, outputs, workspace, stream); } return 0; } template -int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32_t tokenIdxBeg, +int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32_t tokenIdxBeg, int32_t localNbTokens, const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) { @@ -217,12 +217,6 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 auto const reqTypeInBatchPtr = static_cast(inputs[getRequestTypesIdx()]) + seqIdxBeg; bool const is_context = (reqTypeInBatchPtr[0] == RequestType::kCONTEXT); - TLLM_CHECK(std::all_of(reqTypeInBatchPtr, reqTypeInBatchPtr + localNbSeq, - [is_context](RequestType reqType) - { - TLLM_CHECK(reqType == RequestType::kCONTEXT || reqType == RequestType::kGENERATION); - return is_context == (reqType == RequestType::kCONTEXT); - })); const int* context_lengths = reinterpret_cast(inputs[getContextLengthsIdx()]) + seqIdxBeg; // Note we still need context length during generation for MMHA optimziation. 
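enqueueImpl above assumes the batch is ordered as context requests followed by generation requests, and derives the token offset where the generation inputs start from the host context lengths (packed, remove-padding layout). A hedged standalone sketch of that partitioning logic, with plain vectors standing in for the plugin's input tensors:

#include <cstdint>
#include <cstdio>
#include <vector>

enum class RequestType : int32_t
{
    kCONTEXT = 0,
    kGENERATION = 1
};

int main()
{
    std::vector<RequestType> reqTypes{RequestType::kCONTEXT, RequestType::kCONTEXT,
        RequestType::kGENERATION, RequestType::kGENERATION, RequestType::kGENERATION};
    std::vector<int32_t> hostContextLengths{7, 5, 1, 1, 1};

    // Count the leading context requests and sum their lengths, exactly like the
    // loop in enqueueImpl.
    int32_t nbContextRequests = 0;
    int32_t contextTokenIdxEnd = 0;
    for (std::size_t seqIdx = 0; seqIdx < reqTypes.size(); ++seqIdx)
    {
        if (reqTypes[seqIdx] != RequestType::kCONTEXT)
        {
            break;
        }
        ++nbContextRequests;
        contextTokenIdxEnd += hostContextLengths[seqIdx];
    }
    int const nbGenerationSeq = static_cast<int>(reqTypes.size()) - nbContextRequests;

    // Context requests would be dispatched over tokens [0, contextTokenIdxEnd);
    // generation requests start at token contextTokenIdxEnd, one token per sequence.
    std::printf("context seqs: %d (tokens 0..%d), generation seqs: %d\n", nbContextRequests,
        contextTokenIdxEnd, nbGenerationSeq);
    return 0;
}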
@@ -235,7 +229,7 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 auto const host_context_lengths = static_cast(inputs[getHostContextLengthsIdx()]) + seqIdxBeg; return *std::max_element(host_context_lengths, host_context_lengths + localNbSeq); }(); - PLUGIN_ASSERT(max_context_len <= mMaxContextLength); + TLLM_CHECK(max_context_len <= mMaxContextLength); const float* kv_scale_orig_quant = nullptr; const float* kv_scale_quant_orig = nullptr; @@ -276,29 +270,10 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 if (is_context) // context stage { const int batch_size = localNbSeq; - const int request_batch_size = batch_size; - const int request_seq_len = max_context_len; - // num of total tokens (without paddings when remove paddings). - int num_tokens = 0; - if (!mRemovePadding) - { - num_tokens = request_batch_size * request_seq_len; - } - else if (mInFlightBatching) - { - auto const host_context_lengths - = static_cast(inputs[getHostContextLengthsIdx()]) + seqIdxBeg; - num_tokens = std::accumulate(host_context_lengths, host_context_lengths + localNbSeq, 0); - } - else - { - num_tokens = inputDesc[getInputTensorIdx()].dims.d[1]; - } - enqueueContext( EnqueueContextParams{attention_input, qkv_bias, max_context_len, maxSeqLen, context_lengths, kv_scale_orig_quant, kv_scale_quant_orig, alibi_slopes, context_buf_, key_value_cache, - block_pointers, batch_size, num_tokens, tokens_per_block, max_blocks_per_sequence, workspace}, + block_pointers, batch_size, localNbTokens, tokens_per_block, max_blocks_per_sequence, workspace}, stream); } else // generation stage; input_seq_len == 1 @@ -387,16 +362,12 @@ int GPTAttentionPlugin::getNbOutputs() const noexcept size_t GPTAttentionPlugin::getSerializationSize() const noexcept { - return GPTAttentionPluginCommon::getCommonSerializationSize() + sizeof(mInFlightBatching); + return GPTAttentionPluginCommon::getCommonSerializationSize(); } void GPTAttentionPlugin::serialize(void* buffer) const noexcept { - char *d = static_cast(buffer), *a = d; GPTAttentionPluginCommon::serializeCommon(buffer); - d += GPTAttentionPluginCommon::getCommonSerializationSize(); - write(d, mInFlightBatching); - PLUGIN_ASSERT(d == a + getSerializationSize()); } /////////////// @@ -435,7 +406,10 @@ IPluginV2* GPTAttentionPluginCreator::createPlugin(const char* name, const Plugi p.getScalar("num_kv_heads").value(), p.getScalar("unidirectional").value(), p.getScalar("q_scaling").value(), static_cast(p.getScalar("position_embedding_type").value()), - p.getScalar("rotary_embedding_dim").value(), + p.getScalar("rotary_embedding_dim").value(), p.getScalar("rotary_embedding_base").value(), + static_cast(p.getScalar("rotary_embedding_scale_type").value()), + p.getScalar("rotary_embedding_scale").value(), + p.getScalar("rotary_embedding_max_positions").value(), static_cast(p.getScalar("tp_size").value()), static_cast(p.getScalar("tp_rank").value()), static_cast(p.getScalar("context_fmha_type").value()), @@ -445,7 +419,7 @@ IPluginV2* GPTAttentionPluginCreator::createPlugin(const char* name, const Plugi static_cast(p.getScalar("mask_type").value()), static_cast(p.getScalar("paged_kv_cache").value()), static_cast(p.getScalar("type_id").value()), - p.getScalar("in_flight_batching").value(), p.getScalar("max_context_length").value(), + p.getScalar("max_context_length").value(), static_cast(p.getScalar("qkv_bias_enabled").value())); obj->setPluginNamespace(mNamespace.c_str()); return obj; @@ -474,13 +448,3 @@ IPluginV2* 
GPTAttentionPluginCreator::deserializePlugin( } return nullptr; } - -void GPTAttentionPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* GPTAttentionPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h index bb27d8b6974..101ef1111c7 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h +++ b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef TRT_GPT_ATTENTION_PLUGIN_H -#define TRT_GPT_ATTENTION_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "checkMacrosPlugin.h" #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/quantization.h" @@ -31,9 +30,7 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { // batch_size = num_ctx_requests + num_gen_requests * beam_width // num_ctx_requests = number of context requests (single sequence per request). @@ -70,10 +67,11 @@ class GPTAttentionPlugin : public GPTAttentionPluginCommon GPTAttentionPlugin(int num_heads, int num_kv_heads, int unidirectional, float q_scaling, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. 0 for non-RoPE - int tp_size, int tp_rank, // for ALiBi + float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, + float rotary_embedding_scale, int rotary_embedding_max_positions, int tp_size, int tp_rank, // for ALiBi tensorrt_llm::kernels::ContextFMHAType context_fmha_type, bool multi_block_mode, int kv_cache_quant_mode, bool remove_input_padding, tensorrt_llm::kernels::AttentionMaskType mask_type, bool paged_kv_cache, - nvinfer1::DataType type, bool in_flight_batching, int32_t max_context_length, bool qkv_bias_enabled); + nvinfer1::DataType type, int32_t max_context_length, bool qkv_bias_enabled); GPTAttentionPlugin(const void* data, size_t length); @@ -120,16 +118,12 @@ class GPTAttentionPlugin : public GPTAttentionPluginCommon enum class RequestType : int32_t { kCONTEXT = 0, - kGENERATION = 1, - kNONE = 2 + kGENERATION = 1 }; -private: - bool mInFlightBatching = false; - private: template - int enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32_t tokenIdxBeg, + int enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32_t tokenIdxBeg, int32_t localNbTokens, const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream); @@ -192,28 +186,16 @@ class GPTAttentionPlugin : public GPTAttentionPluginCommon IndexType getHostContextLengthsIdx() const { - PLUGIN_ASSERT(mRemovePadding); + TLLM_CHECK(mRemovePadding); return (mKVCacheQuantMode.hasKvCacheQuant() ? 9 : 7) + (mPagedKVCache ? 1 : 0) + (isALiBi() ? 1 : 0); } IndexType getQKVBiasTensorIdx() const { - PLUGIN_ASSERT(mQKVBiasEnabled); + TLLM_CHECK(mQKVBiasEnabled); return (mKVCacheQuantMode.hasInt8KvCache() ? 9 : 7) + (mPagedKVCache ? 1 : 0) + (isALiBi() ? 1 : 0) + (mRemovePadding ? 
1 : 0); } - - int32_t getInputLength(const void* const* inputs, int32_t seqIdx) const - { - auto const reqType = static_cast(inputs[getRequestTypesIdx()])[seqIdx]; - switch (reqType) - { - case RequestType::kCONTEXT: return static_cast(inputs[getHostContextLengthsIdx()])[seqIdx]; - case RequestType::kGENERATION: return 1; - case RequestType::kNONE: return 0; - } - PLUGIN_ASSERT(!"Unexpected request type"); - } }; class GPTAttentionPluginCreator : public GPTAttentionPluginCreatorCommon @@ -231,13 +213,6 @@ class GPTAttentionPluginCreator : public GPTAttentionPluginCreatorCommon nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_GPT_ATTENTION_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.cpp b/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.cpp index 36f39a085b7..74ebe14e406 100644 --- a/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.cpp @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "tensorrt_llm/plugins/identityPlugin/identityPlugin.h" +#include "identityPlugin.h" using namespace nvinfer1; -using nvinfer1::plugin::IdentityPluginCreator; -using nvinfer1::plugin::IdentityPlugin; +using tensorrt_llm::plugins::IdentityPluginCreator; +using tensorrt_llm::plugins::IdentityPlugin; static const char* IDENTITY_PLUGIN_VERSION{"1"}; static const char* IDENTITY_PLUGIN_NAME{"Identity"}; PluginFieldCollection IdentityPluginCreator::mFC{}; -std::vector IdentityPluginCreator::mPluginAttributes; +std::vector IdentityPluginCreator::mPluginAttributes; IdentityPlugin::IdentityPlugin() {} @@ -31,7 +31,7 @@ IdentityPlugin::IdentityPlugin() {} IdentityPlugin::IdentityPlugin(const void* data, size_t length) { const char *d = reinterpret_cast(data), *a = d; - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -152,16 +152,6 @@ void IdentityPlugin::destroy() noexcept delete this; } -void IdentityPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* IdentityPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// IdentityPluginCreator::IdentityPluginCreator() @@ -219,13 +209,3 @@ IPluginV2* IdentityPluginCreator::deserializePlugin( } return nullptr; } - -void IdentityPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* IdentityPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.h b/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.h index ca0e7465576..79d9e4104d4 100644 --- a/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.h +++ b/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
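The index helpers above (getHostContextLengthsIdx, getQKVBiasTensorIdx) compute input-tensor positions by adding one slot for each optional input that is enabled. The sketch below condenses that arithmetic; InputLayout and its single kvCacheQuant flag are simplifications of the plugin's distinct quant-mode checks, shown only to make the offset scheme explicit.

#include <cstdio>

struct InputLayout
{
    bool kvCacheQuant;   // KV-cache scale tensors present
    bool pagedKvCache;   // block-pointer tensor present
    bool alibi;          // ALiBi slopes tensor present
    bool removePadding;  // host_context_lengths tensor present

    int hostContextLengthsIdx() const
    {
        return (kvCacheQuant ? 9 : 7) + (pagedKvCache ? 1 : 0) + (alibi ? 1 : 0);
    }

    int qkvBiasIdx() const
    {
        return (kvCacheQuant ? 9 : 7) + (pagedKvCache ? 1 : 0) + (alibi ? 1 : 0) + (removePadding ? 1 : 0);
    }
};

int main()
{
    InputLayout layout{true, true, false, true};
    std::printf("host_context_lengths at input %d, qkv_bias at input %d\n",
        layout.hostContextLengthsIdx(), layout.qkvBiasIdx());
    return 0;
}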
*/ -#ifndef TRT_IDENTITY_PLUGIN_H -#define TRT_IDENTITY_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class IdentityPlugin : public IPluginV2DynamicExt +class IdentityPlugin : public BasePlugin { public: IdentityPlugin(); @@ -64,15 +61,12 @@ class IdentityPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; }; -class IdentityPluginCreator : public IPluginCreator +class IdentityPluginCreator : public BaseCreator { public: IdentityPluginCreator(); @@ -88,17 +82,9 @@ class IdentityPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_IDENTITY_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.cpp b/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.cpp index c67f83cc363..98797a3fb84 100644 --- a/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.cpp @@ -15,20 +15,20 @@ * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h" +#include "layernormPlugin.h" #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/kernels/layernormKernels.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -using nvinfer1::plugin::LayernormPluginCreator; -using nvinfer1::plugin::LayernormPlugin; +using tensorrt_llm::plugins::LayernormPluginCreator; +using tensorrt_llm::plugins::LayernormPlugin; static const char* LAYERNORM_PLUGIN_VERSION{"1"}; static const char* LAYERNORM_PLUGIN_NAME{"Layernorm"}; PluginFieldCollection LayernormPluginCreator::mFC{}; -std::vector LayernormPluginCreator::mPluginAttributes; +std::vector LayernormPluginCreator::mPluginAttributes; LayernormPlugin::LayernormPlugin(float eps, bool useDiffOfSquares, nvinfer1::DataType type) : mEps(eps) @@ -46,7 +46,7 @@ LayernormPlugin::LayernormPlugin(const void* data, size_t length) read(d, mEps); read(d, mUseDiffOfSquares); read(d, mType); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); TLLM_CHECK_WITH_INFO((getSMVersion() >= 80) || (mType != DataType::kBF16), "Unsupported data type"); } @@ -67,7 +67,7 @@ nvinfer1::DimsExprs LayernormPlugin::getOutputDimensions( bool LayernormPlugin::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept { - PLUGIN_ASSERT(0 <= pos && pos < 5); + TLLM_CHECK(0 <= pos && pos < 5); return (inOut[pos].type == mType) && (inOut[pos].format == TensorFormat::kLINEAR); } @@ -181,16 +181,6 @@ void LayernormPlugin::destroy() noexcept delete this; } -void LayernormPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* LayernormPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// LayernormPluginCreator::LayernormPluginCreator() @@ -231,17 +221,17 @@ IPluginV2* LayernormPluginCreator::createPlugin(const char* name, const PluginFi const char* attrName = fields[i].name; if (!strcmp(attrName, "eps")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kFLOAT32); eps = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "use_diff_of_squares")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); useDiffOfSquares = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } @@ -275,13 +265,3 @@ IPluginV2* LayernormPluginCreator::deserializePlugin( } return nullptr; } - -void LayernormPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* LayernormPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h b/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h index 5f3fe6cab18..ef7a1b114e1 100644 --- a/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h +++ b/cpp/tensorrt_llm/plugins/layernormPlugin/layernormPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_LAYERNORM_PLUGIN_H -#define TRT_LAYERNORM_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class LayernormPlugin : public IPluginV2DynamicExt +class LayernormPlugin : public BasePlugin { public: LayernormPlugin(float eps, bool useDiffOfSquares, nvinfer1::DataType type); @@ -64,8 +61,6 @@ class LayernormPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: float mEps; @@ -73,10 +68,9 @@ class LayernormPlugin : public IPluginV2DynamicExt nvinfer1::DataType mType; const std::string mLayerName; - std::string mNamespace; }; -class LayernormPluginCreator : public IPluginCreator +class LayernormPluginCreator : public BaseCreator { public: LayernormPluginCreator(); @@ -92,17 +86,9 @@ class LayernormPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_LAYERNORM_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.cpp b/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.cpp index ec80a967e2c..e0545de42e9 100644 --- a/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.cpp @@ -14,19 +14,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h" +#include "layernormQuantizationPlugin.h" #include "tensorrt_llm/kernels/layernormKernels.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -using nvinfer1::plugin::LayernormQuantizationPluginCreator; -using nvinfer1::plugin::LayernormQuantizationPlugin; +using tensorrt_llm::plugins::LayernormQuantizationPluginCreator; +using tensorrt_llm::plugins::LayernormQuantizationPlugin; static const char* LAYERNORM_QUANTIZATION_PLUGIN_VERSION{"1"}; static const char* LAYERNORM_QUANTIZATION_PLUGIN_NAME{"LayernormQuantization"}; PluginFieldCollection LayernormQuantizationPluginCreator::mFC{}; -std::vector LayernormQuantizationPluginCreator::mPluginAttributes; +std::vector LayernormQuantizationPluginCreator::mPluginAttributes; LayernormQuantizationPlugin::LayernormQuantizationPlugin( float eps, bool useDiffOfSquares, bool dynamicActivationScaling, nvinfer1::DataType type) @@ -45,7 +45,7 @@ LayernormQuantizationPlugin::LayernormQuantizationPlugin(const void* data, size_ read(d, mUseDiffOfSquares); read(d, mDynActScaling); read(d, mType); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -68,7 +68,7 @@ nvinfer1::DimsExprs LayernormQuantizationPlugin::getOutputDimensions( // Dynamic scaling output if enabled try { - PLUGIN_ASSERT(outputIndex == 1); + TLLM_CHECK(outputIndex == 1); DimsExprs ret; ret.nbDims = inputs[0].nbDims; for (int di = 0; di < ret.nbDims - 1; ++di) @@ -89,8 +89,8 @@ bool LayernormQuantizationPlugin::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept { const int totalPoses = 6 + static_cast(mDynActScaling); - PLUGIN_ASSERT(0 <= pos && pos < totalPoses); - PLUGIN_ASSERT(nbInputs == 4); + TLLM_CHECK(0 <= pos && pos < totalPoses); + TLLM_CHECK(nbInputs == 4); if (pos < nbInputs) { switch (pos) @@ -224,16 +224,6 @@ void LayernormQuantizationPlugin::destroy() noexcept delete this; } -void LayernormQuantizationPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* LayernormQuantizationPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// LayernormQuantizationPluginCreator::LayernormQuantizationPluginCreator() @@ -276,22 +266,22 @@ IPluginV2* LayernormQuantizationPluginCreator::createPlugin(const char* name, co const char* attrName = fields[i].name; if (!strcmp(attrName, "eps")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kFLOAT32); eps = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "dyn_act_scaling")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); dynamicActivationScaling = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "use_diff_of_squares")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); useDiffOfSquares = static_cast(*(static_cast(fields[i].data))); } } @@ -325,13 +315,3 @@ IPluginV2* LayernormQuantizationPluginCreator::deserializePlugin( } return nullptr; } - 
-void LayernormQuantizationPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* LayernormQuantizationPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h b/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h old mode 100755 new mode 100644 index b0720d4b224..5d4361d01c4 --- a/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h +++ b/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef TRT_LAYERNORM_QUANTIZATION_PLUGIN_H -#define TRT_LAYERNORM_QUANTIZATION_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class LayernormQuantizationPlugin : public IPluginV2DynamicExt +class LayernormQuantizationPlugin : public BasePlugin { public: LayernormQuantizationPlugin( @@ -65,8 +62,6 @@ class LayernormQuantizationPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: float mEps; @@ -75,10 +70,9 @@ class LayernormQuantizationPlugin : public IPluginV2DynamicExt nvinfer1::DataType mType; const std::string mLayerName; - std::string mNamespace; }; -class LayernormQuantizationPluginCreator : public IPluginCreator +class LayernormQuantizationPluginCreator : public BaseCreator { public: LayernormQuantizationPluginCreator(); @@ -94,17 +88,9 @@ class LayernormQuantizationPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_LAYERNORM_QUANTIZATION_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.cpp b/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.cpp index 5907621951c..9174ebc7f27 100644 --- a/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.cpp @@ -17,20 +17,20 @@ #include +#include "lookupPlugin.h" #include "tensorrt_llm/kernels/lookupKernels.h" #include "tensorrt_llm/plugins/common/plugin.h" -#include "tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -using nvinfer1::plugin::LookupPluginCreator; -using nvinfer1::plugin::LookupPlugin; +using tensorrt_llm::plugins::LookupPluginCreator; +using tensorrt_llm::plugins::LookupPlugin; static const char* LOOKUP_PLUGIN_VERSION{"1"}; static const char* LOOKUP_PLUGIN_NAME{"Lookup"}; 
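The deserialization constructors in this patch (LookupPlugin below, and the plugins above) all follow the same cursor pattern: a read helper copies one field out of the serialized blob and advances the pointer, and the final TLLM_CHECK(d == a + length) asserts that exactly length bytes were consumed. A self-contained sketch of that pattern follows; readValue/writeValue are illustrative stand-ins for the real read/write helpers declared in the common plugin header.

// Illustrative stand-ins for the read/write cursor helpers used by the plugin
// (de)serialization code in this patch; the real ones come from
// tensorrt_llm/plugins/common/plugin.h.
#include <cassert>
#include <cstddef>
#include <cstring>

template <typename T>
void readValue(const char*& d, T& value)
{
    std::memcpy(&value, d, sizeof(T)); // copy one field out of the blob
    d += sizeof(T);                    // advance the cursor
}

template <typename T>
void writeValue(char*& d, const T& value)
{
    std::memcpy(d, &value, sizeof(T));
    d += sizeof(T);
}

struct ExamplePluginState
{
    int mRank{};
    float mEps{};

    std::size_t serializedSize() const { return sizeof(mRank) + sizeof(mEps); }

    void deserialize(const void* data, std::size_t length)
    {
        const char* d = reinterpret_cast<const char*>(data);
        const char* a = d;
        readValue(d, mRank);
        readValue(d, mEps);
        // Same invariant the plugins enforce with TLLM_CHECK(d == a + length).
        assert(d == a + length);
    }
};

The serialize() side mirrors this with the write helper, so getSerializationSize() and the final length check keep the two directions in sync.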
PluginFieldCollection LookupPluginCreator::mFC{}; -std::vector LookupPluginCreator::mPluginAttributes; +std::vector LookupPluginCreator::mPluginAttributes; LookupPlugin::LookupPlugin(nvinfer1::DataType type, int rank) : mType(type) @@ -44,7 +44,7 @@ LookupPlugin::LookupPlugin(const void* data, size_t length) const char *d = reinterpret_cast(data), *a = d; read(d, mType); read(d, mRank); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -61,8 +61,8 @@ nvinfer1::DimsExprs LookupPlugin::getOutputDimensions( { try { - PLUGIN_ASSERT(nbInputs == 2); - PLUGIN_ASSERT(outputIndex == 0); + TLLM_CHECK(nbInputs == 2); + TLLM_CHECK(outputIndex == 0); DimsExprs ret; const int nbDimsInput = inputs[0].nbDims; const int nbDimsWeight = inputs[1].nbDims; @@ -157,7 +157,7 @@ int LookupPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvi nvinfer1::DataType LookupPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(index == 0); return inputTypes[1]; } @@ -204,16 +204,6 @@ void LookupPlugin::serialize(void* buffer) const noexcept void LookupPlugin::terminate() noexcept {} -void LookupPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* LookupPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// LookupPluginCreator::LookupPluginCreator() @@ -252,12 +242,12 @@ IPluginV2* LookupPluginCreator::createPlugin(const char* name, const PluginField const char* attrName = fields[i].name; if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "rank")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); rank = static_cast(*(static_cast(fields[i].data))); } } @@ -291,13 +281,3 @@ IPluginV2* LookupPluginCreator::deserializePlugin( } return nullptr; } - -void LookupPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* LookupPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h b/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h index 681cf0057af..035264715aa 100644 --- a/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h +++ b/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.h @@ -14,21 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_LOOKUP_PLUGIN_H -#define TRT_LOOKUP_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class LookupPlugin : public IPluginV2DynamicExt +class LookupPlugin : public BasePlugin { public: LookupPlugin() = delete; @@ -65,18 +62,15 @@ class LookupPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; nvinfer1::DataType mType; int mRank; }; -class LookupPluginCreator : public IPluginCreator +class LookupPluginCreator : public BaseCreator { public: LookupPluginCreator(); @@ -92,17 +86,9 @@ class LookupPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_LOOKUP_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp index fac13e9d6d0..8a1bca755bf 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h" +#include "allgatherPlugin.h" using namespace nvinfer1; -using nvinfer1::plugin::AllgatherPluginCreator; -using nvinfer1::plugin::AllgatherPlugin; +using tensorrt_llm::plugins::AllgatherPluginCreator; +using tensorrt_llm::plugins::AllgatherPlugin; static const char* ALLGATHER_PLUGIN_VERSION{"1"}; static const char* ALLGATHER_PLUGIN_NAME{"AllGather"}; PluginFieldCollection AllgatherPluginCreator::mFC{}; -std::vector AllgatherPluginCreator::mPluginAttributes; +std::vector AllgatherPluginCreator::mPluginAttributes; AllgatherPlugin::AllgatherPlugin(std::set group, nvinfer1::DataType type) : mGroup(group) @@ -43,7 +43,7 @@ AllgatherPlugin::AllgatherPlugin(const void* data, size_t length) read(d, groupItem); mGroup.insert(groupItem); } - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -200,16 +200,6 @@ void AllgatherPlugin::destroy() noexcept delete this; } -void AllgatherPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* AllgatherPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// AllgatherPluginCreator::AllgatherPluginCreator() @@ -248,7 +238,7 @@ IPluginV2* AllgatherPluginCreator::createPlugin(const char* name, const PluginFi const char* attrName = fields[i].name; if (!strcmp(attrName, "group")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); const auto* r = static_cast(fields[i].data); for (int j = 0; j < fields[i].length; ++j) { @@ -258,7 +248,7 @@ IPluginV2* AllgatherPluginCreator::createPlugin(const char* name, const PluginFi } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } @@ -293,13 +283,3 @@ IPluginV2* AllgatherPluginCreator::deserializePlugin( } return nullptr; } - -void AllgatherPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* AllgatherPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h index 5366a44382a..923f4a2cdb0 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_ALLGATHER_PLUGIN_H -#define TRT_ALLGATHER_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -25,12 +24,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class AllgatherPlugin : public IPluginV2DynamicExt +class AllgatherPlugin : public BasePlugin { public: AllgatherPlugin(std::set group, nvinfer1::DataType type); @@ -65,17 +62,14 @@ class AllgatherPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; std::set mGroup; nvinfer1::DataType mType; }; -class AllgatherPluginCreator : public IPluginCreator +class AllgatherPluginCreator : public BaseCreator { public: AllgatherPluginCreator(); @@ -91,17 +85,9 @@ class AllgatherPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_ALLGATHER_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp index d754e0591b3..546848161b2 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h" +#include "allreducePlugin.h" using namespace nvinfer1; -using nvinfer1::plugin::AllreducePluginCreator; -using nvinfer1::plugin::AllreducePlugin; +using tensorrt_llm::plugins::AllreducePluginCreator; +using tensorrt_llm::plugins::AllreducePlugin; static const char* ALLREDUCE_PLUGIN_VERSION{"1"}; static const char* ALLREDUCE_PLUGIN_NAME{"AllReduce"}; PluginFieldCollection AllreducePluginCreator::mFC{}; -std::vector AllreducePluginCreator::mPluginAttributes; +std::vector AllreducePluginCreator::mPluginAttributes; AllreducePlugin::AllreducePlugin(std::set group, nvinfer1::DataType type) : mGroup(group) @@ -43,7 +43,7 @@ AllreducePlugin::AllreducePlugin(const void* data, size_t length) read(d, groupItem); mGroup.insert(groupItem); } - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -196,16 +196,6 @@ void AllreducePlugin::destroy() noexcept delete this; } -void AllreducePlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* AllreducePlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// AllreducePluginCreator::AllreducePluginCreator() @@ -244,7 +234,7 @@ IPluginV2* AllreducePluginCreator::createPlugin(const char* name, const PluginFi const char* attrName = fields[i].name; if (!strcmp(attrName, "group")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); const auto* r = static_cast(fields[i].data); for (int j = 0; j < fields[i].length; ++j) { @@ -254,7 +244,7 @@ IPluginV2* AllreducePluginCreator::createPlugin(const char* name, const PluginFi } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } @@ -289,13 +279,3 @@ IPluginV2* AllreducePluginCreator::deserializePlugin( } return nullptr; } - -void AllreducePluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* AllreducePluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h index 87bc1495588..a5b6e798604 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_ALLREDUCE_PLUGIN_H -#define TRT_ALLREDUCE_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -25,12 +24,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class AllreducePlugin : public IPluginV2DynamicExt +class AllreducePlugin : public BasePlugin { public: AllreducePlugin(std::set group, nvinfer1::DataType type); @@ -65,17 +62,14 @@ class AllreducePlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; std::set mGroup; nvinfer1::DataType mType; }; -class AllreducePluginCreator : public IPluginCreator +class AllreducePluginCreator : public BaseCreator { public: AllreducePluginCreator(); @@ -91,17 +85,9 @@ class AllreducePluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_ALLREDUCE_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.cpp index f553cbdfe9a..5a2f8036db4 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.cpp @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/ncclPlugin/recvPlugin.h" +#include "recvPlugin.h" using namespace nvinfer1; -using nvinfer1::plugin::RecvPluginCreator; -using nvinfer1::plugin::RecvPlugin; +using tensorrt_llm::plugins::RecvPluginCreator; +using tensorrt_llm::plugins::RecvPlugin; static const char* RECV_PLUGIN_VERSION{"1"}; static const char* RECV_PLUGIN_NAME{"Recv"}; PluginFieldCollection RecvPluginCreator::mFC{}; -std::vector RecvPluginCreator::mPluginAttributes; +std::vector RecvPluginCreator::mPluginAttributes; RecvPlugin::RecvPlugin(int srcRank, nvinfer1::DataType type) : mSrcRank(srcRank) @@ -37,7 +37,7 @@ RecvPlugin::RecvPlugin(const void* data, size_t length) const char *d = reinterpret_cast(data), *a = d; read(d, mType); read(d, mSrcRank); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -158,16 +158,6 @@ void RecvPlugin::destroy() noexcept delete this; } -void RecvPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* RecvPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// RecvPluginCreator::RecvPluginCreator() @@ -207,12 +197,12 @@ IPluginV2* RecvPluginCreator::createPlugin(const char* name, const PluginFieldCo const char* attrName = fields[i].name; if (!strcmp(attrName, "src_rank")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); srcRank = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } @@ -246,13 +236,3 @@ IPluginV2* RecvPluginCreator::deserializePlugin(const char* name, const void* se } return nullptr; } - -void RecvPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* RecvPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.h b/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.h index 37be01ffdee..ac0da643dca 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.h +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_RECV_PLUGIN_H -#define TRT_RECV_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class RecvPlugin : public IPluginV2DynamicExt +class RecvPlugin : public BasePlugin { public: RecvPlugin(int srcRank, nvinfer1::DataType type); @@ -64,17 +61,14 @@ class RecvPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: - std::string mNamespace; ncclComm_t mComm; // TODO: (kaiyu) Remove this int mSrcRank; nvinfer1::DataType mType; }; -class RecvPluginCreator : public IPluginCreator +class RecvPluginCreator : public BaseCreator { public: RecvPluginCreator(); @@ -90,17 +84,9 @@ class RecvPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_RECV_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.cpp index 31af9540344..ab74f500277 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.cpp @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/ncclPlugin/sendPlugin.h" +#include "sendPlugin.h" using namespace nvinfer1; -using nvinfer1::plugin::SendPluginCreator; -using nvinfer1::plugin::SendPlugin; +using tensorrt_llm::plugins::SendPluginCreator; +using tensorrt_llm::plugins::SendPlugin; static const char* SEND_PLUGIN_VERSION{"1"}; static const char* SEND_PLUGIN_NAME{"Send"}; PluginFieldCollection SendPluginCreator::mFC{}; -std::vector SendPluginCreator::mPluginAttributes; +std::vector SendPluginCreator::mPluginAttributes; SendPlugin::SendPlugin(int tgtRank, nvinfer1::DataType type) : mTgtRank(tgtRank) @@ -37,7 +37,7 @@ SendPlugin::SendPlugin(const void* data, size_t length) const char *d = reinterpret_cast(data), *a = d; read(d, mType); read(d, mTgtRank); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -159,16 +159,6 @@ void SendPlugin::destroy() noexcept delete this; } -void SendPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* SendPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// SendPluginCreator::SendPluginCreator() @@ -208,12 +198,12 @@ IPluginV2* SendPluginCreator::createPlugin(const char* name, const PluginFieldCo const char* attrName = fields[i].name; if (!strcmp(attrName, "tgt_rank")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); tgtRank = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } @@ -247,13 +237,3 @@ IPluginV2* SendPluginCreator::deserializePlugin(const char* name, const void* se } return nullptr; } - -void SendPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* SendPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.h b/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.h index fc46643a5f2..70d3c049be2 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.h +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_SEND_PLUGIN_H -#define TRT_SEND_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class SendPlugin : public IPluginV2DynamicExt +class SendPlugin : public BasePlugin { public: SendPlugin(int tgtRank, nvinfer1::DataType type); @@ -64,17 +61,14 @@ class SendPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: - std::string mNamespace; ncclComm_t mComm; // TODO: (kaiyu) Remove this int mTgtRank; nvinfer1::DataType mType; }; -class SendPluginCreator : public IPluginCreator +class SendPluginCreator : public BaseCreator { public: SendPluginCreator(); @@ -90,17 +84,9 @@ class SendPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_SEND_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.cpp b/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.cpp index 443e8494a6f..7bf06144551 100644 --- a/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.cpp @@ -14,18 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h" +#include "quantizePerTokenPlugin.h" #include "tensorrt_llm/kernels/quantization.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; -using nvinfer1::plugin::QuantizePerTokenPluginCreator; -using nvinfer1::plugin::QuantizePerTokenPlugin; +using tensorrt_llm::plugins::QuantizePerTokenPluginCreator; +using tensorrt_llm::plugins::QuantizePerTokenPlugin; static const char* QUANTIZE_PER_TOKEN_PLUGIN_VERSION{"1"}; static const char* QUANTIZE_PER_TOKEN_PLUGIN_NAME{"QuantizePerToken"}; PluginFieldCollection QuantizePerTokenPluginCreator::mFC{}; -std::vector QuantizePerTokenPluginCreator::mPluginAttributes; +std::vector QuantizePerTokenPluginCreator::mPluginAttributes; QuantizePerTokenPlugin::QuantizePerTokenPlugin() {} @@ -33,7 +33,7 @@ QuantizePerTokenPlugin::QuantizePerTokenPlugin() {} QuantizePerTokenPlugin::QuantizePerTokenPlugin(const void* data, size_t length) { const char *d = reinterpret_cast(data), *a = d; - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -49,8 +49,8 @@ nvinfer1::DimsExprs QuantizePerTokenPlugin::getOutputDimensions( { try { - PLUGIN_ASSERT(nbInputs == 1); - PLUGIN_ASSERT(outputIndex < 2); + TLLM_CHECK(nbInputs == 1); + TLLM_CHECK(outputIndex < 2); if (outputIndex == 0) { // Quantized input @@ -142,8 +142,8 @@ int QuantizePerTokenPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, nvinfer1::DataType QuantizePerTokenPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(nbInputs == 1); - PLUGIN_ASSERT(index < 2); + TLLM_CHECK(nbInputs == 1); + TLLM_CHECK(index < 2); return index == 0 ? nvinfer1::DataType::kINT8 : nvinfer1::DataType::kFLOAT; } @@ -188,16 +188,6 @@ void QuantizePerTokenPlugin::destroy() noexcept delete this; } -void QuantizePerTokenPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* QuantizePerTokenPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// QuantizePerTokenPluginCreator::QuantizePerTokenPluginCreator() @@ -255,13 +245,3 @@ IPluginV2* QuantizePerTokenPluginCreator::deserializePlugin( } return nullptr; } - -void QuantizePerTokenPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* QuantizePerTokenPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h b/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h old mode 100755 new mode 100644 index 641042a3c07..c10f0bc773f --- a/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h +++ b/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.h @@ -14,10 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_QUANTIZE_PER_TOKEN_PLUGIN_H -#define TRT_QUANTIZE_PER_TOKEN_PLUGIN_H +#pragma once -#include "NvInferPlugin.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/plugins/common/plugin.h" #include @@ -26,12 +24,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class QuantizePerTokenPlugin : public IPluginV2DynamicExt +class QuantizePerTokenPlugin : public BasePlugin { public: QuantizePerTokenPlugin(); @@ -66,15 +62,12 @@ class QuantizePerTokenPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; }; -class QuantizePerTokenPluginCreator : public IPluginCreator +class QuantizePerTokenPluginCreator : public BaseCreator { public: QuantizePerTokenPluginCreator(); @@ -90,17 +83,9 @@ class QuantizePerTokenPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_QUANTIZE_PER_TOKEN_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp b/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp index 13463fd49b5..27217c59e6c 100644 --- a/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp @@ -14,18 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h" +#include "quantizeTensorPlugin.h" #include "tensorrt_llm/kernels/quantization.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; -using nvinfer1::plugin::QuantizeTensorPluginCreator; -using nvinfer1::plugin::QuantizeTensorPlugin; +using tensorrt_llm::plugins::QuantizeTensorPluginCreator; +using tensorrt_llm::plugins::QuantizeTensorPlugin; static const char* QUANTIZE_TENSOR_PLUGIN_VERSION{"1"}; static const char* QUANTIZE_TENSOR_PLUGIN_NAME{"QuantizeTensor"}; PluginFieldCollection QuantizeTensorPluginCreator::mFC{}; -std::vector QuantizeTensorPluginCreator::mPluginAttributes; +std::vector QuantizeTensorPluginCreator::mPluginAttributes; QuantizeTensorPlugin::QuantizeTensorPlugin() {} @@ -33,7 +33,7 @@ QuantizeTensorPlugin::QuantizeTensorPlugin() {} QuantizeTensorPlugin::QuantizeTensorPlugin(const void* data, size_t length) { const char *d = reinterpret_cast(data), *a = d; - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -47,8 +47,8 @@ nvinfer1::DimsExprs QuantizeTensorPlugin::getOutputDimensions( { try { - PLUGIN_ASSERT(nbInputs == 2); - PLUGIN_ASSERT(outputIndex < 1); + TLLM_CHECK(nbInputs == 2); + TLLM_CHECK(outputIndex < 1); // Quantized input return inputs[0]; } @@ -76,7 +76,7 @@ bool QuantizeTensorPlugin::supportsFormatCombination( return inOut[pos].type == nvinfer1::DataType::kINT8 && inOut[pos].format == TensorFormat::kLINEAR; default: // Never should be here - PLUGIN_ASSERT(false); + TLLM_CHECK(false); return false; } } @@ -126,8 +126,8 @@ int QuantizeTensorPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, nvinfer1::DataType QuantizeTensorPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(nbInputs == 2); - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(nbInputs == 2); + TLLM_CHECK(index == 0); return nvinfer1::DataType::kINT8; } @@ -175,16 +175,6 @@ void QuantizeTensorPlugin::destroy() noexcept delete this; } -void QuantizeTensorPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* QuantizeTensorPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// QuantizeTensorPluginCreator::QuantizeTensorPluginCreator() @@ -242,13 +232,3 @@ IPluginV2* QuantizeTensorPluginCreator::deserializePlugin( } return nullptr; } - -void QuantizeTensorPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* QuantizeTensorPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h b/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h old mode 100755 new mode 100644 index d0369f22f30..ec1d33785a4 --- a/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h +++ b/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.h @@ -14,10 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_QUANTIZE_TENSOR_PLUGIN_H -#define TRT_QUANTIZE_TENSOR_PLUGIN_H +#pragma once -#include "NvInferPlugin.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/plugins/common/plugin.h" #include @@ -26,12 +24,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class QuantizeTensorPlugin : public IPluginV2DynamicExt +class QuantizeTensorPlugin : public BasePlugin { public: QuantizeTensorPlugin(); @@ -66,16 +62,13 @@ class QuantizeTensorPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: const std::string mLayerName; - std::string mNamespace; cudaDeviceProp mProp; }; -class QuantizeTensorPluginCreator : public IPluginCreator +class QuantizeTensorPluginCreator : public BaseCreator { public: QuantizeTensorPluginCreator(); @@ -91,17 +84,9 @@ class QuantizeTensorPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_QUANTIZE_TENSOR_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.cpp b/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.cpp index 3c1d590fadb..f2afe4157bf 100644 --- a/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.cpp @@ -14,20 +14,20 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h" +#include "rmsnormPlugin/rmsnormPlugin.h" #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/kernels/rmsnormKernels.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -using nvinfer1::plugin::RmsnormPluginCreator; -using nvinfer1::plugin::RmsnormPlugin; +using tensorrt_llm::plugins::RmsnormPluginCreator; +using tensorrt_llm::plugins::RmsnormPlugin; static const char* RMSNORM_PLUGIN_VERSION{"1"}; static const char* RMSNORM_PLUGIN_NAME{"Rmsnorm"}; PluginFieldCollection RmsnormPluginCreator::mFC{}; -std::vector RmsnormPluginCreator::mPluginAttributes; +std::vector RmsnormPluginCreator::mPluginAttributes; RmsnormPlugin::RmsnormPlugin(float eps, nvinfer1::DataType type) : mEps(eps) @@ -43,7 +43,7 @@ RmsnormPlugin::RmsnormPlugin(const void* data, size_t length) const char *d = reinterpret_cast(data), *a = d; read(d, mEps); read(d, mType); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); TLLM_CHECK_WITH_INFO((getSMVersion() >= 80) || (mType != DataType::kBF16), "Unsupported data type"); } @@ -64,7 +64,7 @@ nvinfer1::DimsExprs RmsnormPlugin::getOutputDimensions( bool RmsnormPlugin::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept { - PLUGIN_ASSERT(0 <= pos && pos < 5); + TLLM_CHECK(0 <= pos && pos < 5); return (inOut[pos].type == mType) && (inOut[pos].format == TensorFormat::kLINEAR); } @@ -173,16 +173,6 @@ void RmsnormPlugin::destroy() noexcept delete this; } -void RmsnormPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* RmsnormPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// RmsnormPluginCreator::RmsnormPluginCreator() @@ -221,12 +211,12 @@ IPluginV2* RmsnormPluginCreator::createPlugin(const char* name, const PluginFiel const char* attrName = fields[i].name; if (!strcmp(attrName, "eps")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kFLOAT32); eps = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } @@ -260,13 +250,3 @@ IPluginV2* RmsnormPluginCreator::deserializePlugin( } return nullptr; } - -void RmsnormPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* RmsnormPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h b/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h index ca2f6bb5c35..130886127ec 100644 --- a/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h +++ b/cpp/tensorrt_llm/plugins/rmsnormPlugin/rmsnormPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_RMSNORM_PLUGIN_H -#define TRT_RMSNORM_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class RmsnormPlugin : public IPluginV2DynamicExt +class RmsnormPlugin : public BasePlugin { public: RmsnormPlugin(float eps, nvinfer1::DataType type); @@ -64,18 +61,15 @@ class RmsnormPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: float mEps; nvinfer1::DataType mType; const std::string mLayerName; - std::string mNamespace; }; -class RmsnormPluginCreator : public IPluginCreator +class RmsnormPluginCreator : public BaseCreator { public: RmsnormPluginCreator(); @@ -91,17 +85,9 @@ class RmsnormPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_RMSNORM_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.cpp b/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.cpp index b59d9b167ba..c7466d73819 100644 --- a/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.cpp @@ -14,19 +14,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
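The *Quantization plugins' getOutputDimensions (LayernormQuantizationPlugin above, RmsnormQuantizationPlugin below) report the optional dynamic-scaling output as the input activation shape with the last dimension reduced to a single per-token scale. A small sketch of that shape computation; the helper name is illustrative, and collapsing the last dimension to a constant 1 is an assumption consistent with per-token scaling rather than a quote of the actual implementation.

// Illustrative helper mirroring the dynamic-activation-scaling output shape
// computed by the quantization plugins' getOutputDimensions.
#include <NvInferPlugin.h>

static nvinfer1::DimsExprs perTokenScaleDims(
    const nvinfer1::DimsExprs& activation, nvinfer1::IExprBuilder& exprBuilder)
{
    nvinfer1::DimsExprs ret;
    ret.nbDims = activation.nbDims;
    for (int di = 0; di < ret.nbDims - 1; ++di)
    {
        ret.d[di] = activation.d[di]; // keep batch/sequence dimensions
    }
    ret.d[ret.nbDims - 1] = exprBuilder.constant(1); // one scale per token
    return ret;
}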
*/ -#include "tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h" +#include "rmsnormQuantizationPlugin.h" #include "tensorrt_llm/kernels/rmsnormKernels.h" using namespace nvinfer1; using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -using nvinfer1::plugin::RmsnormQuantizationPluginCreator; -using nvinfer1::plugin::RmsnormQuantizationPlugin; +using tensorrt_llm::plugins::RmsnormQuantizationPluginCreator; +using tensorrt_llm::plugins::RmsnormQuantizationPlugin; static const char* RMSNORM_QUANTIZATION_PLUGIN_VERSION{"1"}; static const char* RMSNORM_QUANTIZATION_PLUGIN_NAME{"RmsnormQuantization"}; PluginFieldCollection RmsnormQuantizationPluginCreator::mFC{}; -std::vector RmsnormQuantizationPluginCreator::mPluginAttributes; +std::vector RmsnormQuantizationPluginCreator::mPluginAttributes; RmsnormQuantizationPlugin::RmsnormQuantizationPlugin(float eps, bool dynamicActivationScaling, nvinfer1::DataType type) : mEps(eps) @@ -42,7 +42,7 @@ RmsnormQuantizationPlugin::RmsnormQuantizationPlugin(const void* data, size_t le read(d, mEps); read(d, mDynActScaling); read(d, mType); - PLUGIN_ASSERT(d == a + length); + TLLM_CHECK(d == a + length); } // IPluginV2DynamicExt Methods @@ -65,7 +65,7 @@ nvinfer1::DimsExprs RmsnormQuantizationPlugin::getOutputDimensions( // Dynamic scaling output if enabled try { - PLUGIN_ASSERT(outputIndex == 1); + TLLM_CHECK(outputIndex == 1); DimsExprs ret; ret.nbDims = inputs[0].nbDims; for (int di = 0; di < ret.nbDims - 1; ++di) @@ -86,8 +86,8 @@ bool RmsnormQuantizationPlugin::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept { const int totalPoses = 6 + static_cast(mDynActScaling); - PLUGIN_ASSERT(0 <= pos && pos < totalPoses); - PLUGIN_ASSERT(nbInputs == 4); + TLLM_CHECK(0 <= pos && pos < totalPoses); + TLLM_CHECK(nbInputs == 4); if (pos < nbInputs) { switch (pos) @@ -218,16 +218,6 @@ void RmsnormQuantizationPlugin::destroy() noexcept delete this; } -void RmsnormQuantizationPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* RmsnormQuantizationPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// RmsnormQuantizationPluginCreator::RmsnormQuantizationPluginCreator() @@ -268,17 +258,17 @@ IPluginV2* RmsnormQuantizationPluginCreator::createPlugin(const char* name, cons const char* attrName = fields[i].name; if (!strcmp(attrName, "eps")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kFLOAT32); eps = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "dyn_act_scaling")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); dynamicActivationScaling = static_cast(*(static_cast(fields[i].data))); } } @@ -312,13 +302,3 @@ IPluginV2* RmsnormQuantizationPluginCreator::deserializePlugin( } return nullptr; } - -void RmsnormQuantizationPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* RmsnormQuantizationPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git 
a/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h b/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h index aa08956dec7..761b86cc172 100644 --- a/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h +++ b/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.h @@ -14,9 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef TRT_RMSNORM_QUANTIZATION_PLUGIN_H -#define TRT_RMSNORM_QUANTIZATION_PLUGIN_H -#include "NvInferPlugin.h" +#pragma once + #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -24,12 +23,10 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { -class RmsnormQuantizationPlugin : public IPluginV2DynamicExt +class RmsnormQuantizationPlugin : public BasePlugin { public: RmsnormQuantizationPlugin(float eps, bool dynamicActivationScaling, nvinfer1::DataType type); @@ -64,8 +61,6 @@ class RmsnormQuantizationPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: float mEps; @@ -73,10 +68,9 @@ class RmsnormQuantizationPlugin : public IPluginV2DynamicExt nvinfer1::DataType mType; const std::string mLayerName; - std::string mNamespace; }; -class RmsnormQuantizationPluginCreator : public IPluginCreator +class RmsnormQuantizationPluginCreator : public BaseCreator { public: RmsnormQuantizationPluginCreator(); @@ -92,17 +86,9 @@ class RmsnormQuantizationPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_RMSNORM_QUANTIZATION_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp b/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp index b684aa2707e..b0731cd63bb 100644 --- a/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp @@ -14,28 +14,68 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h" +#include "smoothQuantGemmPlugin.h" #include using namespace nvinfer1; using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels::cutlass_kernels; -using nvinfer1::plugin::SmoothQuantGemmPluginCreator; -using nvinfer1::plugin::SmoothQuantGemmPlugin; +using tensorrt_llm::plugins::SmoothQuantGemmPluginCreator; +using tensorrt_llm::plugins::SmoothQuantGemmPlugin; +using tensorrt_llm::plugins::SmoothQuantGemmPluginProfiler; +using tensorrt_llm::plugins::read; +using tensorrt_llm::plugins::write; static const char* SQ_GEMM_PLUGIN_VERSION{"1"}; static const char* SQ_GEMM_PLUGIN_NAME{"SmoothQuantGemm"}; PluginFieldCollection SmoothQuantGemmPluginCreator::mFC{}; -std::vector SmoothQuantGemmPluginCreator::mPluginAttributes; +std::vector SmoothQuantGemmPluginCreator::mPluginAttributes; -SmoothQuantGemmPlugin::SmoothQuantGemmPlugin(QuantMode quantMode, nvinfer1::DataType type) +void SmoothQuantGemmPluginProfiler::runTactic(int m, int n, int k, const SmoothQuantGemmPluginProfiler::Config& tactic, + char* workspace, const cudaStream_t& stream) +{ + int8_t* aTmp = reinterpret_cast(workspace); + int8_t* bTmp = nextWorkspacePtr(aTmp, m * k * sizeof(int8_t)); + void* cTmp = reinterpret_cast(nextWorkspacePtr(bTmp, n * k * sizeof(int8_t))); + float* alphaRowTmp = reinterpret_cast( + nextWorkspacePtr(reinterpret_cast(cTmp), m * n * (mType == nvinfer1::DataType::kFLOAT ? 4 : 2))); + float* alphaColTmp + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(alphaRowTmp), m * sizeof(float))); + char* workspaceTmp + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(alphaColTmp), n * sizeof(float))); + + const int wsSize = mRunner->getWorkspaceSize(m, n, k); + + mRunner->gemm( + aTmp, bTmp, mQuantMode, alphaColTmp, alphaRowTmp, cTmp, m, n, k, tactic, workspaceTmp, wsSize, stream); +} + +void SmoothQuantGemmPluginProfiler::computeTmpSize(int maxM, int n, int k) +{ + std::vector workspaces = { + maxM * k * sizeof(int8_t), // A + n * k * sizeof(int8_t), // B + maxM * n * (mType == nvinfer1::DataType::kFLOAT ? 
4u : 2u), // C + maxM * sizeof(float), // alphaRow + n * sizeof(float), // alphaCol + mRunner->getWorkspaceSize(maxM, n, k) // workspace + }; + size_t bytes = calculateTotalWorkspaceSize(workspaces.data(), workspaces.size()); + setTmpWorkspaceSizeInBytes(bytes); +} + +SmoothQuantGemmPlugin::SmoothQuantGemmPlugin( + QuantMode quantMode, nvinfer1::DataType type, const SmoothQuantGemmPlugin::PluginProfilerPtr& pluginProfiler) : mQuantMode(quantMode) + , mPluginProfiler(pluginProfiler) { init(type); } // Parameterized constructor -SmoothQuantGemmPlugin::SmoothQuantGemmPlugin(const void* data, size_t length) +SmoothQuantGemmPlugin::SmoothQuantGemmPlugin( + const void* data, size_t length, const SmoothQuantGemmPlugin::PluginProfilerPtr& pluginProfiler) + : mPluginProfiler(pluginProfiler) { const char *d = reinterpret_cast(data), *a = d; bool perChannelScaling = false, perTokenScaling = false; @@ -43,29 +83,15 @@ SmoothQuantGemmPlugin::SmoothQuantGemmPlugin(const void* data, size_t length) unsigned int quantMode; read(d, quantMode); read(d, type); - read(d, mMinM); - read(d, mMaxM); - read(d, mN); - read(d, mK); - int selectedMapSize; - read(d, selectedMapSize); - perfMapType selectedTacticsMap; - for (int ii = 0; ii < selectedMapSize; ++ii) - { - std::pair config; - read(d, config); - selectedTacticsMap.insert(config); - } + read(d, mDims); + mQuantMode = QuantMode(quantMode); + init(type); - m_sqGemmRunner->setSelectedTactics(selectedTacticsMap); - m_sqGemmRunner->setMaxM(mMaxM); - PLUGIN_ASSERT(d == a + length); -} -void SmoothQuantGemmPlugin::setSelectedTactics(const perfMapType& selectedTacticsMap) -{ - m_sqGemmRunner->setSelectedTactics(selectedTacticsMap); + mPluginProfiler->deserialize(d, mDims, mGemmId); + + TLLM_CHECK(d == a + length); } void SmoothQuantGemmPlugin::init(nvinfer1::DataType type) @@ -86,36 +112,30 @@ void SmoothQuantGemmPlugin::init(nvinfer1::DataType type) else { // TODO (nkorobov): add bf16 support - PLUGIN_ASSERT(false); + TLLM_THROW("Support for bf16 is missing"); } + + mPluginProfiler->setQuantMode(mQuantMode); + + mGemmId = GemmIdCore(mDims.n, mDims.k, mType); } // IPluginV2DynamicExt Methods nvinfer1::IPluginV2DynamicExt* SmoothQuantGemmPlugin::clone() const noexcept { - auto* plugin = new SmoothQuantGemmPlugin(mQuantMode, mType); - plugin->setPluginNamespace(mNamespace.c_str()); - plugin->setProblemSize(mMinM, mMaxM, mN, mK); - plugin->setSelectedTactics(m_sqGemmRunner->getSelectedTactics()); - plugin->setMaxM(m_sqGemmRunner->getMaxM()); + auto* plugin = new SmoothQuantGemmPlugin(*this); return plugin; } -void SmoothQuantGemmPlugin::setMaxM(int maxM) -{ - mMaxM = maxM; - m_sqGemmRunner->setMaxM(maxM); -} - nvinfer1::DimsExprs SmoothQuantGemmPlugin::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept { try { - PLUGIN_ASSERT(nbInputs == 4); - PLUGIN_ASSERT(outputIndex == 0); + TLLM_CHECK(nbInputs == 4); + TLLM_CHECK(outputIndex == 0); const int nbDimsA = inputs[0].nbDims; - PLUGIN_ASSERT(nbDimsA >= 2); + TLLM_CHECK(nbDimsA >= 2); DimsExprs ret; ret.nbDims = nbDimsA; for (int ii = 0; ii < nbDimsA - 1; ++ii) @@ -167,8 +187,8 @@ bool SmoothQuantGemmPlugin::supportsFormatCombination( void SmoothQuantGemmPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept { - mMinM = std::accumulate(in[0].min.d, in[0].min.d + in[0].min.nbDims - 1, 1, std::multiplies()); - mMaxM = std::accumulate(in[0].max.d, 
in[0].max.d + in[0].max.nbDims - 1, 1, std::multiplies()); + const auto minM = std::accumulate(in[0].min.d, in[0].min.d + in[0].min.nbDims - 1, 1, std::multiplies()); + const auto maxM = std::accumulate(in[0].max.d, in[0].max.d + in[0].max.nbDims - 1, 1, std::multiplies()); const int maxK = in[0].max.d[in[0].max.nbDims - 1]; const int maxN = in[1].max.d[0]; @@ -178,10 +198,13 @@ void SmoothQuantGemmPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorD TLLM_CHECK_WITH_INFO(minN == maxN, "Variable out channels is not allowed"); TLLM_CHECK_WITH_INFO(minK == maxK, "Variable in channels is not allowed"); - mK = maxK; - mN = maxN; + if (!mDims.isInitialized()) + { + mDims = {minM, maxM, maxN, maxK}; + } + mGemmId = {maxN, maxK, mType}; - m_workspaceMaxSize = m_sqGemmRunner->getWorkspaceSize(mMaxM, maxN, maxK); + m_workspaceMaxSize = m_sqGemmRunner->getWorkspaceSize(maxM, maxN, maxK); } size_t SmoothQuantGemmPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, @@ -210,9 +233,11 @@ int SmoothQuantGemmPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const int k = inputDesc[0].dims.d[inputDesc[0].dims.nbDims - 1]; const int wsSize = m_sqGemmRunner->getWorkspaceSize(m, n, k); + const auto& bestTactic = mPluginProfiler->getBestConfig(m, mGemmId); + TLLM_CHECK_WITH_INFO(bestTactic, "No valid SQ GEMM tactic"); m_sqGemmRunner->gemm(reinterpret_cast(inputs[0]), reinterpret_cast(inputs[1]), mQuantMode, reinterpret_cast(inputs[3]), reinterpret_cast(inputs[2]), - reinterpret_cast(outputs[0]), m, n, k, reinterpret_cast(workspace), wsSize, stream); + reinterpret_cast(outputs[0]), m, n, k, *bestTactic, reinterpret_cast(workspace), wsSize, stream); return 0; } @@ -221,7 +246,7 @@ int SmoothQuantGemmPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, nvinfer1::DataType SmoothQuantGemmPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(index == 0); return mType; } @@ -252,14 +277,10 @@ void SmoothQuantGemmPlugin::terminate() noexcept {} size_t SmoothQuantGemmPlugin::getSerializationSize() const noexcept { - const auto& selectedTactics = m_sqGemmRunner->getSelectedTactics(); - return sizeof(unsigned int) + // QuantMode - sizeof(nvinfer1::DataType) + // dtype - 4 * sizeof(int) + // Problem sizes (minM, maxM, N, K) - sizeof(int) + // selected tactics constainer num of elems - selectedTactics.size() - * sizeof( - std::pair); // selected tactics container size + return sizeof(unsigned int) + // QuantMode + sizeof(nvinfer1::DataType) + // dtype + sizeof(mDims) + // Dimensions + mPluginProfiler->getSerializationSize(mGemmId); // selected tactics container size } void SmoothQuantGemmPlugin::serialize(void* buffer) const noexcept @@ -267,16 +288,9 @@ void SmoothQuantGemmPlugin::serialize(void* buffer) const noexcept char *d = static_cast(buffer), *a = d; write(d, mQuantMode.value()); write(d, mType); - write(d, mMinM); - write(d, m_sqGemmRunner->getMaxM()); - write(d, mN); - write(d, mK); - const auto& selectedTacticsMap = m_sqGemmRunner->getSelectedTactics(); - write(d, static_cast(selectedTacticsMap.size())); - for (const auto& pair : selectedTacticsMap) - { - write(d, pair); - } + write(d, mDims); + + mPluginProfiler->serialize(d, mGemmId); assert(d == a + getSerializationSize()); } @@ -286,58 +300,9 @@ void SmoothQuantGemmPlugin::destroy() noexcept delete this; } -void SmoothQuantGemmPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = 
libNamespace; -} - -const char* SmoothQuantGemmPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - -void SmoothQuantGemmPlugin::setProblemSize(int minM, int maxM, int n, int k) -{ - mMinM = minM; - mMaxM = maxM; - mN = n; - mK = k; -} - -void SmoothQuantGemmPlugin::allocateTmpData() -{ - cudaMalloc(&mATmp, mMaxM * mK * sizeof(int8_t)); - cudaMalloc(&mBTmp, mN * mK * sizeof(int8_t)); - cudaMalloc(&mCTmp, mMaxM * mN * (mType == nvinfer1::DataType::kFLOAT ? 4 : 2)); - cudaMalloc(&mAlphaRowTmp, mMaxM * sizeof(float)); - cudaMalloc(&mAlphaColTmp, mN * sizeof(float)); - cudaMalloc(&mWorkspaceTmp, m_sqGemmRunner->getWorkspaceSize(mMaxM, mN, mK)); -} - -void SmoothQuantGemmPlugin::freeTmpData() -{ - cudaFree(mATmp); - cudaFree(mBTmp); - cudaFree(mCTmp); - cudaFree(mAlphaRowTmp); - cudaFree(mAlphaColTmp); - cudaFree(mWorkspaceTmp); -} - void SmoothQuantGemmPlugin::configGemm() { - if (mMaxM == -1 || mMinM == -1 || mN == -1 || mK == -1) - { - return; - } - if (!m_sqGemmRunner->hasSelectedTactics()) - { - allocateTmpData(); - m_sqGemmRunner->profileGemms( - mQuantMode, mMinM, mMaxM, mN, mK, mATmp, mBTmp, mCTmp, mAlphaColTmp, mAlphaRowTmp, mWorkspaceTmp); - m_sqGemmRunner->setMaxM(mMaxM); - freeTmpData(); - } + mPluginProfiler->profileTactics(m_sqGemmRunner->getConfigs(), m_sqGemmRunner, mType, mDims, mGemmId); } /////////////// @@ -379,24 +344,27 @@ IPluginV2* SmoothQuantGemmPluginCreator::createPlugin(const char* name, const Pl const char* attrName = fields[i].name; if (!strcmp(attrName, "has_per_channel_scaling")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); perChannelScaling = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "has_per_token_scaling")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); perTokenScaling = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } try { + // SmoothQuantGemmPluginCreator is unique and shared for an engine generation + // Create plugin profiler with shared tactics map + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ false); QuantMode quantMode = QuantMode::fromDescription(true, true, perTokenScaling, perChannelScaling); - auto* obj = new SmoothQuantGemmPlugin(quantMode, type); + auto* obj = new SmoothQuantGemmPlugin(quantMode, type, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -414,7 +382,9 @@ IPluginV2* SmoothQuantGemmPluginCreator::deserializePlugin( // call SmoothQuantGemmPlugin::destroy() try { - auto* obj = new SmoothQuantGemmPlugin(serialData, serialLength); + // Create plugin profiler with private tactics map which is read from the serialized engine + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ true); + auto* obj = new SmoothQuantGemmPlugin(serialData, serialLength, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -424,13 +394,3 @@ IPluginV2* SmoothQuantGemmPluginCreator::deserializePlugin( } return nullptr; } - -void SmoothQuantGemmPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* SmoothQuantGemmPluginCreator::getPluginNamespace() const 
noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h b/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h index 1011098e3d5..d0a7fba78aa 100644 --- a/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h +++ b/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.h @@ -14,12 +14,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef TRT_SMOOTH_QUANT_GEMM_PLUGIN_H -#define TRT_SMOOTH_QUANT_GEMM_PLUGIN_H +#pragma once -#include "NvInferPlugin.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h" +#include "tensorrt_llm/plugins/common/gemmPluginProfiler.h" #include "tensorrt_llm/plugins/common/plugin.h" #include #include @@ -27,21 +26,43 @@ #include #include -namespace nvinfer1 -{ -namespace plugin +namespace tensorrt_llm::plugins { using perfMapType = std::unordered_map; +using SqGemmRunnerPtr = std::shared_ptr; + +class SmoothQuantGemmPluginProfiler : public GemmPluginProfiler +{ +public: + using Config = tensorrt_llm::cutlass_extensions::CutlassGemmConfig; + + void setQuantMode(const tensorrt_llm::common::QuantMode& quantMode) + { + mQuantMode = quantMode; + } + +protected: + void runTactic(int m, int n, int k, const Config& tactic, char* workspace, const cudaStream_t& stream) override; + + void computeTmpSize(int maxM, int n, int k) override; + +private: + tensorrt_llm::common::QuantMode mQuantMode; +}; -class SmoothQuantGemmPlugin : public IPluginV2DynamicExt +class SmoothQuantGemmPlugin : public BasePlugin { public: + using PluginProfilerPtr = std::shared_ptr; + SmoothQuantGemmPlugin() = delete; - SmoothQuantGemmPlugin(tensorrt_llm::common::QuantMode quantMode, nvinfer1::DataType type); + SmoothQuantGemmPlugin( + tensorrt_llm::common::QuantMode quantMode, nvinfer1::DataType type, const PluginProfilerPtr& pluginProfiler); - SmoothQuantGemmPlugin(const void* data, size_t length); + SmoothQuantGemmPlugin(const void* data, size_t length, const PluginProfilerPtr& pluginProfiler); ~SmoothQuantGemmPlugin() override = default; @@ -71,43 +92,28 @@ class SmoothQuantGemmPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: void init(nvinfer1::DataType type); - void setProblemSize(int minM, int maxM, int n, int k); void configGemm(); - void setSelectedTactics(const perfMapType& selected_tactics_map); - void setMaxM(int maxM); - - void allocateTmpData(); - void freeTmpData(); private: const std::string mLayerName; - std::string mNamespace; - std::shared_ptr m_sqGemmRunner; + SqGemmRunnerPtr m_sqGemmRunner; tensorrt_llm::common::QuantMode mQuantMode; int m_workspaceMaxSize; - int mMaxM{-1}; - int mMinM{-1}; - int mN{-1}; - int mK{-1}; - - int8_t* mATmp{nullptr}; - int8_t* mBTmp{nullptr}; - void* mCTmp{nullptr}; - float* mAlphaRowTmp{nullptr}; - float* mAlphaColTmp{nullptr}; - char* mWorkspaceTmp{nullptr}; + + GemmDims mDims{}; + GemmIdCore mGemmId{}; + + PluginProfilerPtr mPluginProfiler; nvinfer1::DataType mType; }; -class SmoothQuantGemmPluginCreator : public IPluginCreator +class SmoothQuantGemmPluginCreator : public BaseCreator { public: SmoothQuantGemmPluginCreator(); @@ -123,17 
+129,10 @@ class SmoothQuantGemmPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + GemmPluginProfilerManager gemmPluginProfileManager; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_SMOOTH_QUANT_GEMM_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp b/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp index ae03dbd0334..a01458f8282 100644 --- a/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp @@ -14,27 +14,89 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h" +#include "weightOnlyGroupwiseQuantMatmulPlugin.h" using namespace nvinfer1; using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels::cutlass_kernels; -using nvinfer1::plugin::WeightOnlyGroupwiseQuantMatmulPluginCreator; -using nvinfer1::plugin::WeightOnlyGroupwiseQuantMatmulPlugin; +using tensorrt_llm::plugins::WeightOnlyGroupwiseQuantMatmulPluginCreator; +using tensorrt_llm::plugins::WeightOnlyGroupwiseQuantMatmulPlugin; +using tensorrt_llm::plugins::WeightOnlyGroupwiseQuantGemmPluginProfiler; + +// Flags for indicating whether the corresponding inputs are applied in mQuantAlgo +// mQuantAlgo = pre_quant_scale * PRE_SCALE_QUANT + zero * ZER0 + bias * BIAS +// Here pre_quant_scale, zero and bias are boolean type +static constexpr int BIAS = int(1) << 0; +static constexpr int ZER0 = int(1) << 1; +static constexpr int PRE_SCALE_QUANT = int(1) << 2; +using tensorrt_llm::plugins::read; +using tensorrt_llm::plugins::write; static const char* WOQ_GROUPWISE_MATMUL_PLUGIN_VERSION{"1"}; static const char* WOQ_GROUPWISE_MATMUL_PLUGIN_NAME{"WeightOnlyGroupwiseQuantMatmul"}; PluginFieldCollection WeightOnlyGroupwiseQuantMatmulPluginCreator::mFC{}; -std::vector WeightOnlyGroupwiseQuantMatmulPluginCreator::mPluginAttributes; +std::vector WeightOnlyGroupwiseQuantMatmulPluginCreator::mPluginAttributes; -WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin( - nvinfer1::DataType type, int quant_algo, int group_size) +void WeightOnlyGroupwiseQuantGemmPluginProfiler::runTactic(int m, int n, int k, + const WeightOnlyGroupwiseQuantGemmPluginProfiler::Config& tactic, char* workspace, const cudaStream_t& stream) +{ + const int originalN = n * 8; + half* actPtr = reinterpret_cast(workspace); + cutlass::uint4b_t* weightPtr = reinterpret_cast( + nextWorkspacePtr(reinterpret_cast(actPtr), m * k * sizeof(half))); + half* inputScalesPtr + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(weightPtr), n * k * sizeof(float))); + half* zerosPtr = reinterpret_cast( + nextWorkspacePtr(reinterpret_cast(inputScalesPtr), k * originalN * sizeof(half) / mGroupSize)); + half* biasesPtr = 
reinterpret_cast( + nextWorkspacePtr(reinterpret_cast(zerosPtr), k * originalN * sizeof(half) / mGroupSize)); + half* outputPtr = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(biasesPtr), m * sizeof(half))); + char* workspacePtr + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(outputPtr), m * originalN * sizeof(half))); + + if ((mQuantAlgo & ZER0) == 0) + { + zerosPtr = nullptr; + } + + if ((mQuantAlgo & BIAS) == 0) + { + biasesPtr = nullptr; + } + + const int wsSize = mRunner->getWorkspaceSize(m, n, k); + + mRunner->gemm(actPtr, weightPtr, inputScalesPtr, zerosPtr, biasesPtr, outputPtr, m, originalN, k, mGroupSize, + tactic, workspacePtr, wsSize, stream); +} + +void WeightOnlyGroupwiseQuantGemmPluginProfiler::computeTmpSize(int maxM, int n, int k) +{ + const int originalN = n * 8; + std::vector workspaces = { + maxM * k * sizeof(half), // A + k * n * sizeof(float), // B + k * originalN * sizeof(half) / mGroupSize, // scales + k * originalN * sizeof(half) / mGroupSize, // zeros + maxM * sizeof(half), // biases + maxM * originalN * sizeof(half), // C + mRunner->getWorkspaceSize(maxM, n, k) // workspace + }; + size_t bytes = calculateTotalWorkspaceSize(workspaces.data(), workspaces.size()); + setTmpWorkspaceSizeInBytes(bytes); +} + +WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin(nvinfer1::DataType type, int quant_algo, + int group_size, const WeightOnlyGroupwiseQuantMatmulPlugin::PluginProfilerPtr& pluginProfiler) + : mPluginProfiler(pluginProfiler) { init(type, quant_algo, group_size); } // Parameterized constructor -WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin(const void* data, size_t length) +WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin( + const void* data, size_t length, const WeightOnlyGroupwiseQuantMatmulPlugin::PluginProfilerPtr& pluginProfiler) + : mPluginProfiler(pluginProfiler) { const char *d = reinterpret_cast(data), *a = d; nvinfer1::DataType type; @@ -43,8 +105,13 @@ WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin(const read(d, type); read(d, quant_algo); read(d, group_size); + read(d, mDims); + init(type, quant_algo, group_size); - PLUGIN_ASSERT(d == a + length); + + mPluginProfiler->deserialize(d, mDims, mGemmId); + + TLLM_CHECK(d == a + length); } void WeightOnlyGroupwiseQuantMatmulPlugin::init(nvinfer1::DataType type, int quant_algo, int group_size) @@ -79,18 +146,28 @@ void WeightOnlyGroupwiseQuantMatmulPlugin::init(nvinfer1::DataType type, int qua } else { - PLUGIN_ASSERT(false); + TLLM_THROW("Unsupported data type"); } + + mPluginProfiler->setQuantAlgo(mQuantAlgo); + mPluginProfiler->setGroupSize(mGroupSize); + + mGemmId = GemmIdCore(mDims.n, mDims.k, mType); } // IPluginV2DynamicExt Methods nvinfer1::IPluginV2DynamicExt* WeightOnlyGroupwiseQuantMatmulPlugin::clone() const noexcept { - auto* plugin = new WeightOnlyGroupwiseQuantMatmulPlugin(mType, mQuantAlgo, mGroupSize); - plugin->setPluginNamespace(mNamespace.c_str()); + auto* plugin = new WeightOnlyGroupwiseQuantMatmulPlugin(*this); return plugin; } +void WeightOnlyGroupwiseQuantMatmulPlugin::configGemm() +{ + mPluginProfiler->profileTactics( + m_weightOnlyGroupwiseGemmRunner->getConfigs(), m_weightOnlyGroupwiseGemmRunner, mType, mDims, mGemmId); +} + nvinfer1::DimsExprs WeightOnlyGroupwiseQuantMatmulPlugin::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept { @@ -105,12 +182,12 @@ nvinfer1::DimsExprs 
WeightOnlyGroupwiseQuantMatmulPlugin::getOutputDimensions( try { - PLUGIN_ASSERT(nbInputs == mBiasesInputIdx + 1); - PLUGIN_ASSERT(outputIndex == 0); + TLLM_CHECK(nbInputs == mBiasesInputIdx + 1); + TLLM_CHECK(outputIndex == 0); const int nbDimsA = inputs[0].nbDims; const int nbDimsB = inputs[mWeightInputIdx].nbDims; - PLUGIN_ASSERT(nbDimsA >= 2); - PLUGIN_ASSERT(nbDimsB == 2); + TLLM_CHECK(nbDimsA >= 2); + TLLM_CHECK(nbDimsB == 2); DimsExprs ret; ret.nbDims = nbDimsA; for (int ii = 0; ii < nbDimsA - 1; ++ii) @@ -157,14 +234,22 @@ bool WeightOnlyGroupwiseQuantMatmulPlugin::supportsFormatCombination( void WeightOnlyGroupwiseQuantMatmulPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept { - int maxM = 1; - for (int ii = 0; ii < in[0].max.nbDims - 1; ++ii) - { - maxM *= in[0].max.d[ii]; - } + const auto minM = std::accumulate(in[0].min.d, in[0].min.d + in[0].min.nbDims - 1, 1, std::multiplies()); + const auto maxM = std::accumulate(in[0].max.d, in[0].max.d + in[0].max.nbDims - 1, 1, std::multiplies()); + const int maxK = in[0].max.d[in[0].max.nbDims - 1]; // int32 packed int4 elements const int maxN = in[mWeightInputIdx].max.d[1] * 8; + + const auto K = maxK; + const auto N = maxN / 8; + + if (!mDims.isInitialized()) + { + mDims = {minM, maxM, N, K}; + } + mGemmId = {N, K, mType}; + int smoothedActSize = maxM * maxK * (in[0].desc.type == nvinfer1::DataType::kFLOAT ? 4 : 2); m_workspaceMaxSize = smoothedActSize + m_weightOnlyGroupwiseGemmRunner->getWorkspaceSize(maxM, maxN, maxK); } @@ -212,13 +297,17 @@ int WeightOnlyGroupwiseQuantMatmulPlugin::enqueue(const nvinfer1::PluginTensorDe if (mType == nvinfer1::DataType::kHALF) { - if (m < SMALL_M_FAST_PATH) + if (m < SMALL_M_FAST_PATH && mSM >= 75) { // Use CUDA kernels for small batch size - tensorrt_llm::kernels::groupwise_weight_only_matmul_i2f_launcher( - reinterpret_cast(inputs[mWeightInputIdx]), + // The CUDA kernel is designed for ColumnMajorTileInterleave weight layout used in fpAIntB cutlass kernel + // when sm >= 75 and the preprocessing of cutlass on sm70 does not interleave the weights. 
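The small-batch branch here (and its counterpart in the per-channel plugin further down) chooses between the hand-written batched GEMV kernel and the profiled CUTLASS tactic. A minimal sketch of that decision follows, assuming the same semantics as the plugin's SMALL_M_FAST_PATH member and cached getSMVersion() result; the helper name and the threshold argument are illustrative and not part of the patch, and the actual WeightOnlyParams pack and launcher call continue immediately below.

// Sketch only: mirrors the "m < SMALL_M_FAST_PATH && mSM >= 75" test used by the
// weight-only plugins in this patch. smallMFastPath stands in for the plugin's
// constant and smVersion for the cached getSMVersion() result.
#include <cstdio>

static bool useBatchedGemvFastPath(int m, int smVersion, int smallMFastPath)
{
    // The batched GEMV kernel assumes the ColumnMajorTileInterleave weight layout
    // produced by the CUTLASS preprocessing on SM >= 75, so older architectures
    // always fall back to the CUTLASS GEMM with the profiled tactic.
    return m < smallMFastPath && smVersion >= 75;
}

int main()
{
    std::printf("m=1,sm80 -> %d\n", useBatchedGemvFastPath(1, 80, 5));  // fast path
    std::printf("m=64,sm80 -> %d\n", useBatchedGemvFastPath(64, 80, 5)); // CUTLASS
    std::printf("m=1,sm70 -> %d\n", useBatchedGemvFastPath(1, 70, 5));  // CUTLASS
    return 0;
}

Keeping the predicate in one place makes it clear that SM 70 always takes the CUTLASS path, which is exactly what the comment above states.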
+ tensorrt_llm::kernels::WeightOnlyParams params{reinterpret_cast(inputs[mWeightInputIdx]), reinterpret_cast(inputs[mScalesInputIdx]), zeros_ptr, act_ptr, biases_ptr, - reinterpret_cast(outputs[0]), m, n * 8, k, mGroupSize, &stream); + reinterpret_cast(outputs[0]), m, n * 8, k, mGroupSize}; + tensorrt_llm::kernels::weight_only_batched_gemv_launcher(tensorrt_llm::kernels::WeightOnlyQuantType::Int4b, + tensorrt_llm::kernels::WeightOnlyType::GroupWise, + tensorrt_llm::kernels::WeightOnlyActivationType::Identity, params, stream); } else { @@ -227,10 +316,12 @@ int WeightOnlyGroupwiseQuantMatmulPlugin::enqueue(const nvinfer1::PluginTensorDe int32_t* weight_ptr = const_cast(reinterpret_cast(inputs[mWeightInputIdx])); + const auto& bestTactic = mPluginProfiler->getBestConfig(m, mGemmId); + TLLM_CHECK_WITH_INFO(bestTactic, "No valid SQ GEMM tactic"); m_weightOnlyGroupwiseGemmRunner->gemm(act_ptr, reinterpret_cast(weight_ptr), reinterpret_cast(inputs[mScalesInputIdx]), zeros_ptr, biases_ptr, - reinterpret_cast(outputs[0]), m, n * 8, k, mGroupSize, - reinterpret_cast(workspace + m * k * sizeof(half)), ws_bytes, stream); + reinterpret_cast(outputs[0]), m, n * 8, k, mGroupSize, *bestTactic, + reinterpret_cast(workspace) + m * k * sizeof(half), ws_bytes, stream); } } else @@ -245,7 +336,7 @@ int WeightOnlyGroupwiseQuantMatmulPlugin::enqueue(const nvinfer1::PluginTensorDe nvinfer1::DataType WeightOnlyGroupwiseQuantMatmulPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(index == 0); return mType; } @@ -268,6 +359,7 @@ int WeightOnlyGroupwiseQuantMatmulPlugin::getNbOutputs() const noexcept int WeightOnlyGroupwiseQuantMatmulPlugin::initialize() noexcept { + configGemm(); return 0; } @@ -275,7 +367,11 @@ void WeightOnlyGroupwiseQuantMatmulPlugin::terminate() noexcept {} size_t WeightOnlyGroupwiseQuantMatmulPlugin::getSerializationSize() const noexcept { - return 2 * sizeof(int) + sizeof(nvinfer1::DataType); + return sizeof(int) + // mQuantAlgo + sizeof(int) + // mGroupSize + sizeof(nvinfer1::DataType) + // mType + sizeof(mDims) + // Dimensions + mPluginProfiler->getSerializationSize(mGemmId); // selected tactics container size } void WeightOnlyGroupwiseQuantMatmulPlugin::serialize(void* buffer) const noexcept @@ -284,6 +380,9 @@ void WeightOnlyGroupwiseQuantMatmulPlugin::serialize(void* buffer) const noexcep write(d, mType); write(d, mQuantAlgo); write(d, mGroupSize); + write(d, mDims); + + mPluginProfiler->serialize(d, mGemmId); assert(d == a + getSerializationSize()); } @@ -293,16 +392,6 @@ void WeightOnlyGroupwiseQuantMatmulPlugin::destroy() noexcept delete this; } -void WeightOnlyGroupwiseQuantMatmulPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* WeightOnlyGroupwiseQuantMatmulPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// WeightOnlyGroupwiseQuantMatmulPluginCreator::WeightOnlyGroupwiseQuantMatmulPluginCreator() @@ -344,23 +433,26 @@ IPluginV2* WeightOnlyGroupwiseQuantMatmulPluginCreator::createPlugin( const char* attrName = fields[i].name; if (!strcmp(attrName, "quant_algo")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); QuantAlgo = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "group_size")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == 
PluginFieldType::kINT32); GroupSize = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } try { - auto* obj = new WeightOnlyGroupwiseQuantMatmulPlugin(type, QuantAlgo, GroupSize); + // WeightOnlyGroupwiseQuantMatmulPluginCreator is unique and shared for an engine generation + // Create plugin profiler with shared tactics map + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ false); + auto* obj = new WeightOnlyGroupwiseQuantMatmulPlugin(type, QuantAlgo, GroupSize, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -378,7 +470,9 @@ IPluginV2* WeightOnlyGroupwiseQuantMatmulPluginCreator::deserializePlugin( // call weightOnlyGroupwiseQuantMatmulPlugin::destroy() try { - auto* obj = new WeightOnlyGroupwiseQuantMatmulPlugin(serialData, serialLength); + // Create plugin profiler with private tactics map which is read from the serialized engine + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ true); + auto* obj = new WeightOnlyGroupwiseQuantMatmulPlugin(serialData, serialLength, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -388,13 +482,3 @@ IPluginV2* WeightOnlyGroupwiseQuantMatmulPluginCreator::deserializePlugin( } return nullptr; } - -void WeightOnlyGroupwiseQuantMatmulPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* WeightOnlyGroupwiseQuantMatmulPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h b/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h index 131e93cae86..52cf6adfa67 100644 --- a/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h +++ b/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.h @@ -14,16 +14,17 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_WEIGHT_ONLY_GROUPWISE_QUANT_MATMUL_PLUGIN_H -#define TRT_WEIGHT_ONLY_GROUPWISE_QUANT_MATMUL_PLUGIN_H +#pragma once -#include "NvInferPlugin.h" -#include "cutlass/numeric_types.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" #include "tensorrt_llm/kernels/preQuantScaleKernel.h" -#include "tensorrt_llm/kernels/weightOnlyGroupwiseMatrixVectorMultiplication.h" +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h" +#include "tensorrt_llm/plugins/common/gemmPluginProfiler.h" #include "tensorrt_llm/plugins/common/plugin.h" + +#include + #include #include #include @@ -34,19 +35,50 @@ // breaking dependencies #include "cutlass/integer_subbyte.h" -namespace nvinfer1 +namespace tensorrt_llm::plugins { -namespace plugin + +using WeightOnlyGemmRunner = tensorrt_llm::kernels::cutlass_kernels::CutlassFpAIntBGemmRunnerInterface; +using WeightOnlyGemmRunnerPtr = std::shared_ptr; + +class WeightOnlyGroupwiseQuantGemmPluginProfiler + : public GemmPluginProfiler { +public: + using Config = tensorrt_llm::cutlass_extensions::CutlassGemmConfig; + + void setQuantAlgo(int quantAlgo) + { + mQuantAlgo = quantAlgo; + } + + void setGroupSize(int groupSize) + { + mGroupSize = groupSize; + } + +protected: + void runTactic(int m, int n, int k, const Config& tactic, char* workspace, const cudaStream_t& stream) override; + + void computeTmpSize(int maxM, int n, int k) override; + +private: + int mQuantAlgo; + int mGroupSize; +}; -class WeightOnlyGroupwiseQuantMatmulPlugin : public IPluginV2DynamicExt +class WeightOnlyGroupwiseQuantMatmulPlugin : public BasePlugin { public: + using PluginProfilerPtr = std::shared_ptr; + WeightOnlyGroupwiseQuantMatmulPlugin() = delete; - WeightOnlyGroupwiseQuantMatmulPlugin(nvinfer1::DataType type, int quant_algo, int group_size); + WeightOnlyGroupwiseQuantMatmulPlugin( + nvinfer1::DataType type, int quant_algo, int group_size, const PluginProfilerPtr& profiler); - WeightOnlyGroupwiseQuantMatmulPlugin(const void* data, size_t length); + WeightOnlyGroupwiseQuantMatmulPlugin(const void* data, size_t length, const PluginProfilerPtr& profiler); ~WeightOnlyGroupwiseQuantMatmulPlugin() override = default; @@ -76,21 +108,20 @@ class WeightOnlyGroupwiseQuantMatmulPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: // group_size: 64, 128 void init(nvinfer1::DataType type, int quant_algo, int group_size); + void configGemm(); + private: const std::string mLayerName; - std::string mNamespace; - std::shared_ptr - m_weightOnlyGroupwiseGemmRunner; + WeightOnlyGemmRunnerPtr m_weightOnlyGroupwiseGemmRunner; int m_workspaceMaxSize; nvinfer1::DataType mType; + int mSM = tensorrt_llm::common::getSMVersion(); // When M is smaller than this value, we trigger a fast path // I.e. a tailored kernel instead of cutlass. 
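The profiler subclasses declared in this header implement the two hooks exercised in the .cpp files above: computeTmpSize() reports one scratch allocation large enough for every fake operand, and runTactic() carves that allocation into per-operand pointers in the same order. The following self-contained sketch shows that partitioning scheme; alignUp, nextSubBuffer and the 16-byte alignment are illustrative stand-ins for the nextWorkspacePtr / calculateTotalWorkspaceSize helpers from the common plugin code, whose exact alignment is not shown in this patch.

#include <cstddef>
#include <cstdint>
#include <vector>

constexpr std::uintptr_t kAlign = 16; // assumed alignment for the sketch

static std::uintptr_t alignUp(std::uintptr_t x)
{
    return (x + kAlign - 1) & ~(kAlign - 1);
}

// computeTmpSize-style pass: sum the per-operand byte counts once, so a single
// scratch buffer can hold every fake operand used while timing a tactic.
static std::size_t totalWorkspaceBytes(const std::vector<std::size_t>& sizes)
{
    std::size_t total = 0;
    for (auto s : sizes)
        total += alignUp(s);
    return total;
}

// runTactic-style pass: carve the scratch buffer into sub-buffers in the same
// order as the size list (activations, weights, scales, ..., GEMM workspace).
static char* nextSubBuffer(char*& cursor, std::size_t bytes)
{
    char* here = cursor;
    cursor += alignUp(bytes);
    return here;
}

int main()
{
    std::vector<std::size_t> sizes = {256 * 1024, 1024 * 1024, 256 * 4096, 256 * 4, 1024 * 4};
    std::vector<char> scratch(totalWorkspaceBytes(sizes));
    char* cursor = scratch.data();
    char* a = nextSubBuffer(cursor, sizes[0]);
    char* b = nextSubBuffer(cursor, sizes[1]);
    (void) a;
    (void) b;
    return 0;
}

The per-operand size lists match the ones built in the runTactic / computeTmpSize implementations earlier in this patch, so a tactic can be timed without touching the plugin's real input tensors.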
@@ -98,13 +129,6 @@ class WeightOnlyGroupwiseQuantMatmulPlugin : public IPluginV2DynamicExt int mQuantAlgo; - // Flags for indicating whether the corresponding inputs are applied in mQuantAlgo - // mQuantAlgo = pre_quant_scale * PRE_SCALE_QUANT + zero * ZER0 + bias * BIAS - // Here pre_quant_scale, zero and bias are boolean type - static constexpr int BIAS = int(1) << 0; - static constexpr int ZER0 = int(1) << 1; - static constexpr int PRE_SCALE_QUANT = int(1) << 2; - int mGroupSize; int mPreQuantScaleInputIdx; @@ -112,9 +136,14 @@ class WeightOnlyGroupwiseQuantMatmulPlugin : public IPluginV2DynamicExt int mScalesInputIdx; int mZerosInputIdx; int mBiasesInputIdx; + + GemmDims mDims{}; + GemmIdCore mGemmId{}; + + PluginProfilerPtr mPluginProfiler; }; -class WeightOnlyGroupwiseQuantMatmulPluginCreator : public IPluginCreator +class WeightOnlyGroupwiseQuantMatmulPluginCreator : public BaseCreator { public: WeightOnlyGroupwiseQuantMatmulPluginCreator(); @@ -130,17 +159,10 @@ class WeightOnlyGroupwiseQuantMatmulPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + GemmPluginProfilerManager gemmPluginProfileManager; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_WEIGHT_ONLY_GROUPWISE_QUANT_MATMUL_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.cpp b/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.cpp index 6ce0f163b8a..f612ca7160f 100644 --- a/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.cpp @@ -14,34 +14,87 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h" +#include "weightOnlyQuantMatmulPlugin.h" using namespace nvinfer1; using namespace tensorrt_llm::common; using namespace tensorrt_llm::kernels::cutlass_kernels; -using nvinfer1::plugin::WeightOnlyQuantMatmulPluginCreator; -using nvinfer1::plugin::WeightOnlyQuantMatmulPlugin; +using tensorrt_llm::plugins::WeightOnlyQuantMatmulPluginCreator; +using tensorrt_llm::plugins::WeightOnlyQuantMatmulPlugin; +using tensorrt_llm::plugins::WeightOnlyQuantGemmPluginProfiler; +using tensorrt_llm::plugins::read; +using tensorrt_llm::plugins::write; static const char* WOQ_MATMUL_PLUGIN_VERSION{"1"}; static const char* WOQ_MATMUL_PLUGIN_NAME{"WeightOnlyQuantMatmul"}; PluginFieldCollection WeightOnlyQuantMatmulPluginCreator::mFC{}; -std::vector WeightOnlyQuantMatmulPluginCreator::mPluginAttributes; +std::vector WeightOnlyQuantMatmulPluginCreator::mPluginAttributes; -WeightOnlyQuantMatmulPlugin::WeightOnlyQuantMatmulPlugin(nvinfer1::DataType type, int weightTypeId) +void WeightOnlyQuantGemmPluginProfiler::runTactic(int m, int n, int k, + const WeightOnlyQuantGemmPluginProfiler::Config& tactic, char* workspace, const cudaStream_t& stream) +{ + const int originalN = n * (mWeightTypeId == 1 ? 
4 : 8); + half* actPtr = reinterpret_cast(workspace); + int8_t* weightPtr + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(actPtr), m * k * sizeof(half))); + half* scalesPtr = reinterpret_cast( + nextWorkspacePtr(reinterpret_cast(weightPtr), originalN * k * sizeof(int8_t))); + half* outputPtr + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(scalesPtr), originalN * sizeof(half))); + char* workspacePtr + = reinterpret_cast(nextWorkspacePtr(reinterpret_cast(outputPtr), m * originalN * sizeof(half))); + + const int wsSize = mRunner->getWorkspaceSize(m, n, k); + + if (mWeightTypeId == 1) + { + mRunner->gemm(actPtr, weightPtr, scalesPtr, outputPtr, m, originalN, k, tactic, workspacePtr, wsSize, stream); + } + else + { + mRunner->gemm(actPtr, reinterpret_cast(weightPtr), scalesPtr, outputPtr, m, originalN, k, + tactic, workspacePtr, wsSize, stream); + } +} + +void WeightOnlyQuantGemmPluginProfiler::computeTmpSize(int maxM, int n, int k) +{ + const int originalN = n * (mWeightTypeId == 1 ? 4 : 8); + std::vector workspaces = { + maxM * k * sizeof(half), // A + originalN * k * sizeof(int8_t), // B + originalN * sizeof(half), // scales + maxM * originalN * sizeof(half), // C + mRunner->getWorkspaceSize(maxM, n, k) // workspace + }; + size_t bytes = calculateTotalWorkspaceSize(workspaces.data(), workspaces.size()); + setTmpWorkspaceSizeInBytes(bytes); +} + +WeightOnlyQuantMatmulPlugin::WeightOnlyQuantMatmulPlugin( + nvinfer1::DataType type, int weightTypeId, const WeightOnlyQuantMatmulPlugin::PluginProfilerPtr& pluginProfiler) + : mPluginProfiler(pluginProfiler) { init(type, weightTypeId); } // Parameterized constructor -WeightOnlyQuantMatmulPlugin::WeightOnlyQuantMatmulPlugin(const void* data, size_t length) +WeightOnlyQuantMatmulPlugin::WeightOnlyQuantMatmulPlugin( + const void* data, size_t length, const WeightOnlyQuantMatmulPlugin::PluginProfilerPtr& pluginProfiler) + : mPluginProfiler(pluginProfiler) { const char *d = reinterpret_cast(data), *a = d; nvinfer1::DataType type; int weightTypeId = 0; read(d, type); read(d, weightTypeId); + read(d, mDims); + init(type, weightTypeId); - PLUGIN_ASSERT(d == a + length); + + mPluginProfiler->deserialize(d, mDims, mGemmId); + + TLLM_CHECK(d == a + length); } void WeightOnlyQuantMatmulPlugin::init(nvinfer1::DataType type, int weightTypeId) @@ -60,18 +113,27 @@ void WeightOnlyQuantMatmulPlugin::init(nvinfer1::DataType type, int weightTypeId } else { - PLUGIN_ASSERT(false); + TLLM_CHECK(false); } + + mPluginProfiler->setWeightTypeId(mWeightTypeId); + + mGemmId = GemmIdCore(mDims.n, mDims.k, mType); } // IPluginV2DynamicExt Methods nvinfer1::IPluginV2DynamicExt* WeightOnlyQuantMatmulPlugin::clone() const noexcept { - auto* plugin = new WeightOnlyQuantMatmulPlugin(mType, mWeightTypeId); - plugin->setPluginNamespace(mNamespace.c_str()); + auto* plugin = new WeightOnlyQuantMatmulPlugin(*this); return plugin; } +void WeightOnlyQuantMatmulPlugin::configGemm() +{ + mPluginProfiler->profileTactics( + m_weightOnlyGemmRunner->getConfigs(), m_weightOnlyGemmRunner, mType, mDims, mGemmId); +} + nvinfer1::DimsExprs WeightOnlyQuantMatmulPlugin::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept { @@ -80,12 +142,12 @@ nvinfer1::DimsExprs WeightOnlyQuantMatmulPlugin::getOutputDimensions( try { - PLUGIN_ASSERT(nbInputs == 3); - PLUGIN_ASSERT(outputIndex == 0); + TLLM_CHECK(nbInputs == 3); + TLLM_CHECK(outputIndex == 0); const int nbDimsA = inputs[0].nbDims; const int nbDimsB = 
inputs[1].nbDims; - PLUGIN_ASSERT(nbDimsA >= 2); - PLUGIN_ASSERT(nbDimsB == 2); + TLLM_CHECK(nbDimsA >= 2); + TLLM_CHECK(nbDimsB == 2); DimsExprs ret; ret.nbDims = nbDimsA; for (int ii = 0; ii < nbDimsA - 1; ++ii) @@ -145,13 +207,22 @@ bool WeightOnlyQuantMatmulPlugin::supportsFormatCombination( void WeightOnlyQuantMatmulPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept { - int maxM = 1; - for (int ii = 0; ii < in[0].max.nbDims - 1; ++ii) - { - maxM *= in[0].max.d[ii]; - } + const auto minM = std::accumulate(in[0].min.d, in[0].min.d + in[0].min.nbDims - 1, 1, std::multiplies()); + const auto maxM = std::accumulate(in[0].max.d, in[0].max.d + in[0].max.nbDims - 1, 1, std::multiplies()); + const int maxK = in[0].max.d[in[0].max.nbDims - 1]; const int maxN = in[1].max.d[1] * (mWeightTypeId == 1 ? 4 : 8); + + const auto K = maxK; + const auto N = maxN / (mWeightTypeId == 1 ? 4 : 8); + + if (!mDims.isInitialized()) + { + mDims = {minM, maxM, N, K}; + } + + mGemmId = {N, K, mType}; + m_workspaceMaxSize = m_weightOnlyGemmRunner->getWorkspaceSize(maxM, maxN, maxK); } @@ -181,38 +252,50 @@ int WeightOnlyQuantMatmulPlugin::enqueue(const nvinfer1::PluginTensorDesc* input const int k = inputDesc[0].dims.d[inputDesc[0].dims.nbDims - 1]; const int ws_size = m_weightOnlyGemmRunner->getWorkspaceSize(m, n, k); + const auto& bestTactic = mPluginProfiler->getBestConfig(m, mGemmId); + TLLM_CHECK_WITH_INFO(bestTactic, "No valid SQ GEMM tactic"); if (mType == nvinfer1::DataType::kHALF && mWeightTypeId == 1) { - if (m == 1) + if (m < SMALL_M_FAST_PATH && mSM >= 75) { - const half* bias = nullptr; - tensorrt_llm::kernels::weight_only_gemv_launcher(reinterpret_cast(inputs[0]), - reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), bias, - reinterpret_cast(outputs[0]), k, n * 4, tensorrt_llm::kernels::ActivationType::Identity, - tensorrt_llm::kernels::QuantType::INT8_WEIGHT_ONLY, stream); + // Use CUDA kernels for small batch size + // The CUDA kernel is designed for ColumnMajorTileInterleave weight layout used in fpAIntB cutlass kernel + // when sm >= 75 and the preprocessing of cutlass on sm70 does not interleave the weights. 
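Throughout these plugins the weight tensor's trailing dimension counts packed 32-bit words rather than logical columns, so the code recovers the real output width with a 4x multiplier for int8 weights (weightTypeId == 1) and 8x for int4 weights (weightTypeId == 2), as in the expression n * (mWeightTypeId == 1 ? 4 : 8) above; the per-word reading follows the "int32 packed int4 elements" comment and is stated here as an assumption. A small sketch of that bookkeeping follows; the parameter pack and GEMV launch for this branch continue just below.

// Sketch of the packed-weight dimension bookkeeping used by the weight-only
// plugins: packedN is the stored trailing dimension of the weight tensor,
// and the returned value is the logical GEMM output width.
#include <cassert>

static int unpackedN(int packedN, int weightTypeId)
{
    assert(weightTypeId == 1 || weightTypeId == 2); // 1 = int8, 2 = int4 weights
    return packedN * (weightTypeId == 1 ? 4 : 8);
}

int main()
{
    // e.g. 1024 packed words expand to 4096 int8 or 8192 int4 output columns.
    return (unpackedN(1024, 1) == 4096 && unpackedN(1024, 2) == 8192) ? 0 : 1;
}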
+ tensorrt_llm::kernels::WeightOnlyParams params{reinterpret_cast(inputs[1]), + reinterpret_cast(inputs[2]), nullptr, reinterpret_cast(inputs[0]), nullptr, + reinterpret_cast(outputs[0]), m, n * 4, k, 0}; + tensorrt_llm::kernels::weight_only_batched_gemv_launcher(tensorrt_llm::kernels::WeightOnlyQuantType::Int8b, + tensorrt_llm::kernels::WeightOnlyType::PerChannel, + tensorrt_llm::kernels::WeightOnlyActivationType::Identity, params, stream); } else { m_weightOnlyGemmRunner->gemm(reinterpret_cast(inputs[0]), reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), - reinterpret_cast(outputs[0]), m, n * 4, k, reinterpret_cast(workspace), ws_size, stream); + reinterpret_cast(outputs[0]), m, n * 4, k, *bestTactic, reinterpret_cast(workspace), + ws_size, stream); } } else if (mType == nvinfer1::DataType::kHALF && mWeightTypeId == 2) { - if (m == 1) + if (m < SMALL_M_FAST_PATH && mSM >= 75) { - const half* bias = nullptr; - tensorrt_llm::kernels::weight_only_gemv_launcher(reinterpret_cast(inputs[0]), - reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), bias, - reinterpret_cast(outputs[0]), k, n * 8, tensorrt_llm::kernels::ActivationType::Identity, - tensorrt_llm::kernels::QuantType::PACKED_INT4_WEIGHT_ONLY, stream); + // Use CUDA kernels for small batch size + // The CUDA kernel is designed for ColumnMajorTileInterleave weight layout used in fpAIntB cutlass kernel + // when sm >= 75 and the preprocessing of cutlass on sm70 does not interleave the weights. + tensorrt_llm::kernels::WeightOnlyParams params{reinterpret_cast(inputs[1]), + reinterpret_cast(inputs[2]), nullptr, reinterpret_cast(inputs[0]), nullptr, + reinterpret_cast(outputs[0]), m, n * 8, k, 0}; + tensorrt_llm::kernels::weight_only_batched_gemv_launcher(tensorrt_llm::kernels::WeightOnlyQuantType::Int4b, + tensorrt_llm::kernels::WeightOnlyType::PerChannel, + tensorrt_llm::kernels::WeightOnlyActivationType::Identity, params, stream); } else { m_weightOnlyGemmRunner->gemm(reinterpret_cast(inputs[0]), reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), - reinterpret_cast(outputs[0]), m, n * 8, k, reinterpret_cast(workspace), ws_size, stream); + reinterpret_cast(outputs[0]), m, n * 8, k, *bestTactic, reinterpret_cast(workspace), + ws_size, stream); } } else @@ -227,7 +310,7 @@ int WeightOnlyQuantMatmulPlugin::enqueue(const nvinfer1::PluginTensorDesc* input nvinfer1::DataType WeightOnlyQuantMatmulPlugin::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept { - PLUGIN_ASSERT(index == 0); + TLLM_CHECK(index == 0); return mType; } @@ -250,6 +333,7 @@ int WeightOnlyQuantMatmulPlugin::getNbOutputs() const noexcept int WeightOnlyQuantMatmulPlugin::initialize() noexcept { + configGemm(); return 0; } @@ -257,7 +341,10 @@ void WeightOnlyQuantMatmulPlugin::terminate() noexcept {} size_t WeightOnlyQuantMatmulPlugin::getSerializationSize() const noexcept { - return sizeof(int) + sizeof(nvinfer1::DataType); + return sizeof(int) + // mWeightTypeId + sizeof(nvinfer1::DataType) + // mType + sizeof(mDims) + // Dimensions + mPluginProfiler->getSerializationSize(mGemmId); // selected tactics container size } void WeightOnlyQuantMatmulPlugin::serialize(void* buffer) const noexcept @@ -265,6 +352,9 @@ void WeightOnlyQuantMatmulPlugin::serialize(void* buffer) const noexcept char *d = static_cast(buffer), *a = d; write(d, mType); write(d, mWeightTypeId); + write(d, mDims); + + mPluginProfiler->serialize(d, mGemmId); assert(d == a + getSerializationSize()); } @@ -274,16 +364,6 @@ void 
WeightOnlyQuantMatmulPlugin::destroy() noexcept delete this; } -void WeightOnlyQuantMatmulPlugin::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* WeightOnlyQuantMatmulPlugin::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} - /////////////// WeightOnlyQuantMatmulPluginCreator::WeightOnlyQuantMatmulPluginCreator() @@ -322,18 +402,21 @@ IPluginV2* WeightOnlyQuantMatmulPluginCreator::createPlugin(const char* name, co const char* attrName = fields[i].name; if (!strcmp(attrName, "weight_type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); weightTypeId = static_cast(*(static_cast(fields[i].data))); } else if (!strcmp(attrName, "type_id")) { - PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); + TLLM_CHECK(fields[i].type == PluginFieldType::kINT32); type = static_cast(*(static_cast(fields[i].data))); } } try { - auto* obj = new WeightOnlyQuantMatmulPlugin(type, weightTypeId); + // WeightOnlyGroupwiseQuantMatmulPluginCreator is unique and shared for an engine generation + // Create plugin profiler with shared tactics map + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ false); + auto* obj = new WeightOnlyQuantMatmulPlugin(type, weightTypeId, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -351,7 +434,9 @@ IPluginV2* WeightOnlyQuantMatmulPluginCreator::deserializePlugin( // call WeightOnlyQuantMatmulPlugin::destroy() try { - auto* obj = new WeightOnlyQuantMatmulPlugin(serialData, serialLength); + // Create plugin profiler with private tactics map which is read from the serialized engine + auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ true); + auto* obj = new WeightOnlyQuantMatmulPlugin(serialData, serialLength, pluginProfiler); obj->setPluginNamespace(mNamespace.c_str()); return obj; } @@ -361,13 +446,3 @@ IPluginV2* WeightOnlyQuantMatmulPluginCreator::deserializePlugin( } return nullptr; } - -void WeightOnlyQuantMatmulPluginCreator::setPluginNamespace(const char* libNamespace) noexcept -{ - mNamespace = libNamespace; -} - -const char* WeightOnlyQuantMatmulPluginCreator::getPluginNamespace() const noexcept -{ - return mNamespace.c_str(); -} diff --git a/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h b/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h index c5cb6ed0be9..2cddeca382a 100644 --- a/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h +++ b/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.h @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef TRT_WEIGHT_ONLY_QUANT_MATMUL_PLUGIN_H -#define TRT_WEIGHT_ONLY_QUANT_MATMUL_PLUGIN_H +#pragma once -#include "NvInferPlugin.h" -#include "cutlass/numeric_types.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" -#include "tensorrt_llm/kernels/weightOnlyMatrixVectorMultiplication.h" +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h" +#include "tensorrt_llm/plugins/common/gemmPluginProfiler.h" #include "tensorrt_llm/plugins/common/plugin.h" + #include +#include #include #include #include @@ -33,21 +33,43 @@ // breaking dependencies #include "cutlass/integer_subbyte.h" -namespace nvinfer1 +namespace tensorrt_llm::plugins { -namespace plugin + +using WeightOnlyGemmRunner = tensorrt_llm::kernels::cutlass_kernels::CutlassFpAIntBGemmRunnerInterface; +using WeightOnlyGemmRunnerPtr = std::shared_ptr; + +class WeightOnlyQuantGemmPluginProfiler : public GemmPluginProfiler { +public: + using Config = tensorrt_llm::cutlass_extensions::CutlassGemmConfig; + + void setWeightTypeId(int weightId) + { + mWeightTypeId = weightId; + } + +protected: + void runTactic(int m, int n, int k, const Config& tactic, char* workspace, const cudaStream_t& stream) override; + + void computeTmpSize(int maxM, int n, int k) override; + +private: + int mWeightTypeId; +}; -class WeightOnlyQuantMatmulPlugin : public IPluginV2DynamicExt +class WeightOnlyQuantMatmulPlugin : public BasePlugin { public: + using PluginProfilerPtr = std::shared_ptr; WeightOnlyQuantMatmulPlugin() = delete; // int8 weight only : weightTypeId = 1; // int4 weight only : weightTypeId = 2; - WeightOnlyQuantMatmulPlugin(nvinfer1::DataType type, int weightTypeId); + WeightOnlyQuantMatmulPlugin(nvinfer1::DataType type, int weightTypeId, const PluginProfilerPtr& profiler); - WeightOnlyQuantMatmulPlugin(const void* data, size_t length); + WeightOnlyQuantMatmulPlugin(const void* data, size_t length, const PluginProfilerPtr& profiler); ~WeightOnlyQuantMatmulPlugin() override = default; @@ -77,25 +99,34 @@ class WeightOnlyQuantMatmulPlugin : public IPluginV2DynamicExt size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void destroy() noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - const char* getPluginNamespace() const noexcept override; private: // int8 weight only : weightTypeId = 1; // int4 weight only : weightTypeId = 2; void init(nvinfer1::DataType type, int weightTypeId); + void configGemm(); + private: const std::string mLayerName; - std::string mNamespace; - std::shared_ptr m_weightOnlyGemmRunner; + WeightOnlyGemmRunnerPtr m_weightOnlyGemmRunner; int m_workspaceMaxSize; nvinfer1::DataType mType; int mWeightTypeId; + int mSM = tensorrt_llm::common::getSMVersion(); + + // When M is smaller than this value, we trigger a fast path + // I.e. a tailored kernel instead of cutlass. 
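After this patch the smooth-quant and weight-only plugins all serialize the same way: the fixed-size fields first (data type, quant or weight-type fields, the GemmDims problem bounds), then the profiler's tactic map appended via mPluginProfiler->serialize(d, mGemmId), with getSerializationSize() summing exactly the same terms so the d == a + length checks in the deserializing constructors hold. Below is a byte-layout sketch under the assumption that read/write are plain memcpy-style helpers; FakeDims and the field helpers are illustrative stand-ins, since the real GemmDims and read/write live in the common plugin headers not shown here.

#include <cstring>

struct FakeDims { int minM, maxM, n, k; }; // stand-in for GemmDims

template <typename T>
void writeField(char*& dst, const T& value)
{
    std::memcpy(dst, &value, sizeof(T));
    dst += sizeof(T);
}

template <typename T>
void readField(const char*& src, T& value)
{
    std::memcpy(&value, src, sizeof(T));
    src += sizeof(T);
}

int main()
{
    char buffer[64] = {};
    char* d = buffer;
    int dataType = 0;     // stands in for nvinfer1::DataType
    int weightTypeId = 2; // int4 weights
    FakeDims dims{1, 256, 4096, 4096};
    writeField(d, dataType);
    writeField(d, weightTypeId);
    writeField(d, dims);
    // ...the profiler would append its tactic map for this GemmId here.

    const char* s = buffer;
    int rType = -1, rWeightTypeId = -1;
    FakeDims rDims{};
    readField(s, rType);
    readField(s, rWeightTypeId);
    readField(s, rDims);
    return (rType == dataType && rWeightTypeId == weightTypeId && rDims.maxM == dims.maxM) ? 0 : 1;
}

The fast-path threshold referred to by the comment above is declared immediately below, alongside the GemmDims, GemmIdCore and profiler members that this layout serializes.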
+ static constexpr int SMALL_M_FAST_PATH = 5; + + GemmDims mDims{}; + GemmIdCore mGemmId{}; + + PluginProfilerPtr mPluginProfiler; }; -class WeightOnlyQuantMatmulPluginCreator : public IPluginCreator +class WeightOnlyQuantMatmulPluginCreator : public BaseCreator { public: WeightOnlyQuantMatmulPluginCreator(); @@ -111,17 +142,10 @@ class WeightOnlyQuantMatmulPluginCreator : public IPluginCreator nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serialData, size_t serialLength) noexcept override; - void setPluginNamespace(const char* pluginNamespace) noexcept override; - - const char* getPluginNamespace() const noexcept override; - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; + GemmPluginProfilerManager gemmPluginProfileManager; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; }; -} // namespace plugin -} // namespace nvinfer1 - -#endif // TRT_WEIGHT_ONLY_QUANT_MATMUL_PLUGIN_H +} // namespace tensorrt_llm::plugins diff --git a/cpp/tensorrt_llm/runtime/CMakeLists.txt b/cpp/tensorrt_llm/runtime/CMakeLists.txt index ecf09ca7980..4e5a6ec9373 100644 --- a/cpp/tensorrt_llm/runtime/CMakeLists.txt +++ b/cpp/tensorrt_llm/runtime/CMakeLists.txt @@ -25,6 +25,7 @@ set(SRCS iBuffer.cpp iTensor.cpp memoryCounters.cpp + ncclCommunicator.cpp runtimeBuffers.cpp runtimeKernels.cu statefulGptDecoder.cpp @@ -34,6 +35,7 @@ set(SRCS include_directories(${API_INCLUDE_DIR}/tensorrt_llm/runtime) +add_compile_options(-Wall) add_library(runtime_src OBJECT ${SRCS}) set_property(TARGET runtime_src PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET runtime_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) @@ -43,4 +45,9 @@ target_include_directories(runtime_src PRIVATE ${MPI_INCLUDE_PATH}) set(JSON_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/json) add_subdirectory(${JSON_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/json) -target_link_libraries(runtime_src PUBLIC nlohmann_json::nlohmann_json) +if(ENABLE_MULTI_DEVICE EQUAL 1) + target_link_libraries(runtime_src PUBLIC nlohmann_json::nlohmann_json + ${NCCL_LIB}) +else() + target_link_libraries(runtime_src PUBLIC nlohmann_json::nlohmann_json) +endif() diff --git a/cpp/tensorrt_llm/runtime/bufferManager.cpp b/cpp/tensorrt_llm/runtime/bufferManager.cpp index 796673b91fa..857efd4ea9f 100644 --- a/cpp/tensorrt_llm/runtime/bufferManager.cpp +++ b/cpp/tensorrt_llm/runtime/bufferManager.cpp @@ -86,7 +86,14 @@ void BufferManager::copy(void const* src, IBuffer& dst) const { if (dst.getSizeInBytes() > 0) { - TLLM_CUDA_CHECK(cudaMemcpyAsync(dst.data(), src, dst.getSizeInBytes(), cudaMemcpyDefault, mStream->get())); + if (IBuffer::memoryType(src) != MemoryType::kGPU && dst.getMemoryType() != MemoryType::kGPU) + { + std::memcpy(dst.data(), src, dst.getSizeInBytes()); + } + else + { + TLLM_CUDA_CHECK(cudaMemcpyAsync(dst.data(), src, dst.getSizeInBytes(), cudaMemcpyDefault, mStream->get())); + } } } @@ -94,7 +101,14 @@ void BufferManager::copy(IBuffer const& src, void* dst) const { if (src.getSizeInBytes() > 0) { - TLLM_CUDA_CHECK(cudaMemcpyAsync(dst, src.data(), src.getSizeInBytes(), cudaMemcpyDefault, mStream->get())); + if (IBuffer::memoryType(dst) != MemoryType::kGPU && src.getMemoryType() != MemoryType::kGPU) + { + std::memcpy(dst, src.data(), src.getSizeInBytes()); + } + else + { + TLLM_CUDA_CHECK(cudaMemcpyAsync(dst, src.data(), src.getSizeInBytes(), cudaMemcpyDefault, mStream->get())); + } } } diff --git a/cpp/tensorrt_llm/runtime/gptDecoder.cpp 
b/cpp/tensorrt_llm/runtime/gptDecoder.cpp index fd4af84d886..6e4ba484c7a 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoder.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoder.cpp @@ -131,6 +131,7 @@ template typename tl::DynamicDecodeLayer::OutputParams prepareOutputs( DecodingOutput& output, DecodingInput::TensorPtr const& inputLengths) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); typename tl::DynamicDecodeLayer::OutputParams outputParams(tcc::toTllmTensor(*output.ids)); outputParams.newTokens = tcc::toTllmTensor(*output.newTokens); @@ -271,6 +272,7 @@ template class GptDecoder; void IGptDecoder::gatherTree(ITensor& finalOutputIds, DecodingOutput const& decodingOutput, DecodingInput const& decodingInput, BufferManager const& manager) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const& finalOutputIdsShape = finalOutputIds.getShape(); auto const& decodingOutputIdsShape = decodingOutput.ids->getShape(); auto const batchSize = finalOutputIdsShape.d[0]; @@ -322,39 +324,12 @@ void IGptDecoder::gatherTree(ITensor& finalOutputIds, DecodingOutput const& deco nullptr, // output_logs beamHypotheses.output_ids_tgt, beamHypotheses.sequence_lengths_tgt, beamHypotheses.normed_scores, beamHypotheses.cum_log_probs, beamHypotheses.log_probs, beamHypotheses.num_beams, - beamHypotheses.input_lengths, beamWidth, maxSeqLength, batchSize, decodingInput.maxLength, stream.get()); + beamHypotheses.input_lengths, beamWidth, maxSeqLength, batchSize, stream.get()); sync_check_cuda_error(); } else { - auto workspace = manager.gpu(batchSize * beamWidth * maxSeqLength, nvinfer1::DataType::kINT32); - manager.setZero(*workspace); - - // For sampling, it is equivalent to all parent ids are 0. - tensorrt_llm::kernels::gatherTreeParam param; - param.beams = bufferCast(*workspace); - // Remove prompt length if possible - param.sequence_lengths = bufferCast(*decodingOutput.lengths); - // add sequence_length 1 here because the sequence_length of time step t is t - 1 - param.max_sequence_length_final_step = 1; - // response input lengths (used to slice the ids during postprocessing), used in interactive generation - // This feature is not supported yet, setting it to nullptr temporarily. 
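This file, like most files touched by the patch, gains paired TLLM_LOG_DEBUG("%s start") and ("%s stop") calls at function entry and exit. A small RAII tracer could emit the same pair automatically; a self-contained sketch using printf in place of the project logger:

#include <cstdio>

// RAII tracer that prints the same "start"/"stop" pair as the explicit
// TLLM_LOG_DEBUG calls added throughout this patch.
struct ScopedTrace
{
    explicit ScopedTrace(char const* fn)
        : mFn{fn}
    {
        std::printf("%s start\n", mFn);
    }

    ~ScopedTrace()
    {
        std::printf("%s stop\n", mFn); // emitted on every exit path, including early returns
    }

    char const* mFn;
};

// Usage: void f() { ScopedTrace trace{__PRETTY_FUNCTION__}; /* body */ }

The manual calls used in the patch do not cover early returns, which is one reason such pairs are often wrapped in a scope guard like the one above.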
- param.response_input_lengths = nullptr; - param.max_seq_len = maxSeqLength; - param.batch_size = batchSize; - param.beam_width = beamWidth; - param.step_ids = bufferCast(*decodingOutput.ids); - param.parent_ids = nullptr; - param.end_tokens = bufferCast(*decodingInput.endIds); - param.max_input_length = decodingInput.maxLength; - param.input_lengths = bufferCast(*decodingInput.lengths); - // decoder output has padding - param.has_padding = true; - - param.output_ids = bufferCast(finalOutputIds); - param.stream = stream.get(); - param.cum_log_probs = bufferCast(*decodingOutput.cumLogProbs); - param.length_penalty = 1.0f; - tensorrt_llm::kernels::invokeGatherTree(param); + manager.copy(*decodingOutput.ids, finalOutputIds); + sync_check_cuda_error(); } } diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp index 16331503a9a..cfba542dd22 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp @@ -19,6 +19,7 @@ #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/runtimeKernels.h" +#include #include using namespace tensorrt_llm::runtime; @@ -29,6 +30,7 @@ namespace { SamplingConfig extractSamplingConfig(SamplingConfig const& batchSamplingConfig, SizeType batchIdx) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); SamplingConfig samplingConfig{batchSamplingConfig.beamWidth}; auto extractOptional = [&batchIdx](auto& single, auto const& batch) @@ -59,6 +61,7 @@ SamplingConfig extractSamplingConfig(SamplingConfig const& batchSamplingConfig, samplingConfig.beamSearchDiversityRate = batchSamplingConfig.beamSearchDiversityRate; samplingConfig.lengthPenalty = batchSamplingConfig.lengthPenalty; + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return samplingConfig; } @@ -73,6 +76,7 @@ GptDecoderBatch::GptDecoderBatch( , mEventStart(tc::CreateEvent()) , mEventStop(tc::CreateEvent()) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto constexpr nvTokenIdType = TRTDataType::value; auto constexpr nvSizeType = TRTDataType::value; auto constexpr nvFloatType = TRTDataType::value; @@ -97,11 +101,13 @@ GptDecoderBatch::GptDecoderBatch( dOutput->lengths = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); dOutput->cumLogProbs = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); dOutput->beamHypotheses.empty(mBufferManager); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptDecoderBatch::setup( SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); TLLM_CHECK(maxBatchSize > 0); TLLM_CHECK(maxBeamWidth > 0); TLLM_CHECK(maxSequenceLength > 0); @@ -128,19 +134,19 @@ void GptDecoderBatch::setup( dOutput.newTokens->reshape(maxBatchSizeXmaxBeamWidth); mBufferManager.setZero(*dOutput.newTokens); dOutput.parentIds->reshape(jointOutputIdsShape); + dOutput.lengths->reshape(maxBatchSizeXmaxBeamWidth); + mBufferManager.setZero(*dOutput.lengths); dOutput.finished->reshape(maxBatchSizeXmaxBeamWidth); mBufferManager.setZero(*dOutput.finished); mBufferManager.setZero(*dOutput.finishedSum); - dOutput.lengths->reshape(maxBatchSizeXmaxBeamWidth); - mBufferManager.setZero(*dOutput.lengths); - dOutput.cumLogProbs->reshape(maxBatchSizeXmaxBeamWidth); - mBufferManager.setZero(*dOutput.cumLogProbs); // use batchSize many entries instead of the usual 1 dOutput.finishedSum->reshape(maxBatchSizeShape); mBufferManager.setZero(*dOutput.finishedSum); if (maxBeamWidth > 1) { + 
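The extractSamplingConfig helper above carves one request's sampling parameters out of the batched SamplingConfig. A rough host-side sketch of that extraction pattern, assuming the batched fields are std::optional vectors holding either one shared value or one value per request:

#include <optional>
#include <vector>

// Per-request extraction (simplified): a batched optional parameter is either
// absent, a single value shared by all requests, or one value per request.
template <typename T>
std::optional<std::vector<T>> extractForRequest(std::optional<std::vector<T>> const& batch, size_t batchIdx)
{
    if (!batch)
        return std::nullopt;                    // parameter not set at all
    if (batch->size() == 1)
        return std::vector<T>{batch->front()};  // shared value
    return std::vector<T>{batch->at(batchIdx)}; // per-request value
}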
dOutput.cumLogProbs->reshape(maxBatchSizeXmaxBeamWidth); + mBufferManager.setZero(*dOutput.cumLogProbs); dOutput.beamHypotheses.reshape(maxBatchSize, maxBeamWidth, mMaxSequenceLength); } else @@ -171,6 +177,7 @@ void GptDecoderBatch::setup( mMaxNewTokens[i] = 0; mBeamWidths[i] = 0; } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptDecoderBatch::newRequest( @@ -234,13 +241,13 @@ void GptDecoderBatch::newRequest( manager.setZero(*dOutput->finishedSum); dOutput->lengths = ITensor::slice(dJointOutput.lengths, batchIdx, localBatchSize); kernels::invokeFill(*dOutput->lengths, inputLength, *stream); - dOutput->cumLogProbs = ITensor::slice(dJointOutput.cumLogProbs, batchIdx, localBatchSize); - manager.setZero(*IBuffer::slice(dOutput->cumLogProbs, 0, 1)); dOutput->newTokens = ITensor::slice(dJointOutput.newTokens, batchIdx, localBatchSize); manager.setZero(*dOutput->newTokens); if (beamWidth > 1) { + dOutput->cumLogProbs = ITensor::slice(dJointOutput.cumLogProbs, batchIdx, localBatchSize); + manager.setZero(*IBuffer::slice(dOutput->cumLogProbs, 0, 1)); kernels::invokeFill( *IBuffer::slice(dOutput->cumLogProbs, 1, beamWidth - 1), DecodingOutput::kNegativeInfinity, *stream); @@ -263,6 +270,7 @@ void GptDecoderBatch::newRequest( auto outputIdsView = ITensor::view(outputIds, ITensor::makeShape({beamWidth, mMaxSequenceLength})); kernels::invokeFill(*outputIdsView, endId, *stream); kernels::tileTensor(*outputIdsView, *inputIdsView, beamWidth, *stream); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptDecoderBatch::forward(decoder_batch::Output& output, decoder_batch::Input const& input) @@ -284,6 +292,10 @@ void GptDecoderBatch::forward(decoder_batch::Output& output, decoder_batch::Inpu TLLM_CHECK(!srcCacheIndirection || srcCacheIndirection->getDataType() == TRTDataType::value); TLLM_CHECK(!tgtCacheIndirection || tgtCacheIndirection->getDataType() == TRTDataType::value); + // TODO(bhsueh) should remove this reshape and set shape to [batch_size, beam_width] outside + TensorPtr sequenceLengths = ITensor::view(output.sequenceLengths); + sequenceLengths->reshape(ITensor::makeShape({mActualBatchSize, maxBeamWidth})); + TLLM_CHECK(sequenceLengths); auto constexpr singleRequest = 1; mStream->record(mEventStart.get()); @@ -308,6 +320,8 @@ void GptDecoderBatch::forward(decoder_batch::Output& output, decoder_batch::Inpu dOutput.cacheIndirection = ITensor::view(tgtView, ITensor::makeShape({singleRequest, mBeamWidths[i], tgtView->getShape().d[2]})); } + auto sequenceLengthsView = std::shared_ptr(ITensor::slice(sequenceLengths, i, singleRequest)); + dOutput.lengths = ITensor::view(sequenceLengthsView, ITensor::makeShape({singleRequest, mBeamWidths[i]})); auto& decoder = *mDecoders[i]; decoder.forwardAsync(dOutput, dInput); @@ -321,6 +335,10 @@ void GptDecoderBatch::forward(decoder_batch::Output& output, decoder_batch::Inpu manager.copy(*dOutput.ids, *jointOutputIdsView); + auto jointSequenceLengthsView = ITensor::slice(mJointDecodingOutput->lengths, i, singleRequest); + jointSequenceLengthsView->reshape(ITensor::makeShape({1, mBeamWidths[i]})); + manager.copy(*dOutput.lengths, *jointSequenceLengthsView); + if (mBeamWidths[i] > 1) { auto jointOutputParentIdsView = ITensor::slice(mJointDecodingOutput->parentIds, i, singleRequest); @@ -347,11 +365,13 @@ void GptDecoderBatch::forward(decoder_batch::Output& output, decoder_batch::Inpu // This condition requires the synchronization above || *bufferCast(*dOutput.finishedSum) == static_cast(dOutput.finished->getSize()); } + TLLM_LOG_DEBUG("%s stop", 
__PRETTY_FUNCTION__); } // TODO (rkobus) call this at the end of forward if mFinished[i] changes from false to true? void GptDecoderBatch::postProcessRequest(SizeType batchIdx) const { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& stream = mStreams[batchIdx]; auto manager = BufferManager{stream}; @@ -368,10 +388,12 @@ void GptDecoderBatch::postProcessRequest(SizeType batchIdx) const auto& event = mEvents[batchIdx]; stream->record(event.get()); mStream->wait(event.get()); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptDecoderBatch::newBatch(GenerationInput const& inputs, SamplingConfig const& samplingConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); // split batch into single requests auto const& inputLengths = inputs.lengths; mActualBatchSize = inputLengths->getShape().d[0]; @@ -409,28 +431,34 @@ void GptDecoderBatch::newBatch(GenerationInput const& inputs, SamplingConfig con request.stopWordsList = inputs.stopWordsList; newRequest(batchIdx, request, extractSamplingConfig(samplingConfig, batchIdx)); } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } bool GptDecoderBatch::forward(decoder::Output& output, decoder::Input const& input) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); decoder_batch::Input batchInput{input.logits}; batchInput.cacheIndirection = input.cacheIndirection; decoder_batch::Output batchOutput; batchOutput.cacheIndirection = output.cacheIndirection; + batchOutput.sequenceLengths = output.sequenceLengths; forward(batchOutput, batchInput); auto finished = getFinished(); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return std::all_of(finished.begin(), finished.end(), [](bool x) { return x; }); } IStatefulGptDecoder::TensorPtr GptDecoderBatch::getFinalOutputIds() const { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); for (SizeType batchIdx = 0; batchIdx < mActualBatchSize; ++batchIdx) { postProcessRequest(batchIdx); } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return ITensor::slice(getOutputIds(), 0, mActualBatchSize); } diff --git a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp index f7d12881487..ae5ffee9f48 100644 --- a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp +++ b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp @@ -55,9 +55,10 @@ GptJsonConfig parseJson(InputType&& i) auto const& builderConfig = json.at("builder_config"); auto const name = builderConfig.at("name").template get(); auto const precision = builderConfig.at("precision").template get(); - auto const worldSize = builderConfig.at("tensor_parallel").template get(); - auto const numHeads = builderConfig.at("num_heads").template get() / worldSize; - auto const hiddenSize = builderConfig.at("hidden_size").template get() / worldSize; + auto const tensorParallelism = builderConfig.at("tensor_parallel").template get(); + auto const pipelineParallelism = parseJsonFieldOr(builderConfig, "pipeline_parallel", 1); + auto const numHeads = builderConfig.at("num_heads").template get() / tensorParallelism; + auto const hiddenSize = builderConfig.at("hidden_size").template get() / tensorParallelism; auto const vocabSize = builderConfig.at("vocab_size").template get(); auto const numLayers = builderConfig.at("num_layers").template get(); @@ -74,33 +75,41 @@ GptJsonConfig parseJson(InputType&& i) auto const pagedKvCache = parseJsonFieldOr(builderConfig, "paged_kv_cache", false); auto const tokensPerBlock = parseJsonFieldOr(builderConfig, "tokens_per_block", 0); auto const quantMode = tc::QuantMode(parseJsonFieldOr(builderConfig, 
"quant_mode", tc::QuantMode::none().value())); - auto const numKvHeads = parseJsonFieldOr(builderConfig, "num_kv_heads", numHeads * worldSize) / worldSize; + auto const numKvHeads + = parseJsonFieldOr(builderConfig, "num_kv_heads", numHeads * tensorParallelism) / tensorParallelism; + auto const maxBatchSize = parseJsonFieldOr(builderConfig, "max_batch_size", 0); + auto const maxInputLen = parseJsonFieldOr(builderConfig, "max_input_len", 0); + auto const maxOutputLen = parseJsonFieldOr(builderConfig, "max_output_len", 0); auto const& pluginConfig = json.at("plugin_config"); auto const& gptAttentionPlugin = pluginConfig.at("gpt_attention_plugin"); auto const useGptAttentionPlugin = !gptAttentionPlugin.is_boolean() || gptAttentionPlugin.template get(); auto const removeInputPadding = pluginConfig.at("remove_input_padding").template get(); - auto const inflightBatching = pluginConfig.at("in_flight_batching").template get(); auto modelConfig = GptModelConfig{vocabSize, numLayers, numHeads, hiddenSize, dataType}; modelConfig.useGptAttentionPlugin(useGptAttentionPlugin); modelConfig.usePackedInput(removeInputPadding); modelConfig.usePagedKvCache(pagedKvCache); - modelConfig.useInflightBatching(inflightBatching); modelConfig.setTokensPerBlock(tokensPerBlock); modelConfig.setQuantMode(quantMode); modelConfig.setNbKvHeads(numKvHeads); - return GptJsonConfig{name, precision, worldSize, modelConfig}; + modelConfig.setMaxBatchSize(maxBatchSize); + modelConfig.setMaxInputLen(maxInputLen); + modelConfig.setMaxOutputLen(maxOutputLen); + + return GptJsonConfig{name, precision, tensorParallelism, pipelineParallelism, modelConfig}; } } // namespace std::string GptJsonConfig::engineFilename(WorldConfig const& worldConfig, std::string const& model) const - { - TLLM_CHECK_WITH_INFO(getWorldSize() == worldConfig.getSize(), "world size mismatch"); - return model + "_" + getPrecision() + "_tp" + std::to_string(worldConfig.getSize()) + "_rank" + TLLM_CHECK_WITH_INFO(getTensorParallelism() == worldConfig.getTensorParallelism(), "tensor parallelism mismatch"); + TLLM_CHECK_WITH_INFO( + getPipelineParallelism() == worldConfig.getPipelineParallelism(), "pipeline parallelism mismatch"); + auto pp = worldConfig.isPipelineParallel() ? 
"_pp" + std::to_string(worldConfig.getPipelineParallelism()) : ""; + return model + "_" + getPrecision() + "_tp" + std::to_string(worldConfig.getTensorParallelism()) + pp + "_rank" + std::to_string(worldConfig.getRank()) + ".engine"; } diff --git a/cpp/tensorrt_llm/runtime/gptSession.cpp b/cpp/tensorrt_llm/runtime/gptSession.cpp index 90006df265f..a10c83daa35 100644 --- a/cpp/tensorrt_llm/runtime/gptSession.cpp +++ b/cpp/tensorrt_llm/runtime/gptSession.cpp @@ -24,6 +24,7 @@ #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/decodingKernels.h" #include "tensorrt_llm/runtime/gptDecoderBatch.h" +#include "tensorrt_llm/runtime/ncclCommunicator.h" #include "tensorrt_llm/runtime/runtimeBuffers.h" #include "tensorrt_llm/runtime/runtimeKernels.h" #include "tensorrt_llm/runtime/statefulGptDecoder.h" @@ -50,32 +51,52 @@ GptSession::GptSession(GptModelConfig const& modelConfig, WorldConfig const& wor , mBuffers{std::make_shared()} , mCudaGraphInstances{} { - TLLM_CHECK_WITH_INFO(mRuntime->getNbProfiles() == 1, "GPT only expects one optimization profile"); createContexts(); - mBuffers->create(*mRuntime, mModelConfig); + mBuffers->create(*mRuntime, mModelConfig, mWorldConfig); + + if (mWorldConfig.isPipelineParallel()) + { + mPipelineComm = NcclCommunicator::createPipelineComm(mWorldConfig, *mLogger); + } + // TODO compare expected and runtime tensor names? } -nvinfer1::ILogger& tensorrt_llm::runtime::GptSession::getLogger() const +nvinfer1::ILogger& GptSession::getLogger() const { return *mLogger; } -BufferManager& tensorrt_llm::runtime::GptSession::getBufferManager() const +BufferManager& GptSession::getBufferManager() const { return mRuntime->getBufferManager(); } void GptSession::createContexts() { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); mRuntime->clearContexts(); + auto numProfiles = mRuntime->getNbProfiles(); + TLLM_CHECK_WITH_INFO( + numProfiles == 1 || numProfiles == 2, "GPT only expects one optimization profile or two optimization profiles"); // Instantiate two contexts for flip-flopping - mRuntime->addContext(0); - mRuntime->addContext(0); + if (numProfiles == 1) + { + mRuntime->addContext(0); + mRuntime->addContext(0); + } + else + { + mRuntime->addContext(1); + mRuntime->addContext(1); + mRuntime->addContext(0); + } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptSession::createDecoder(bool decoderPerRequest) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const vocabSize = mModelConfig.getVocabSize(); auto const vocabSizePadded = mModelConfig.getVocabSizePadded(mWorldConfig.getSize()); auto const& stream = mRuntime->getStreamPtr(); @@ -84,6 +105,7 @@ void GptSession::createDecoder(bool decoderPerRequest) mDecoder = std::make_shared(vocabSize, vocabSizePadded, stream); else mDecoder = std::make_shared(vocabSize, vocabSizePadded, stream); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptSession::setup(SizeType const batchSize, SizeType const beamWidth, SizeType const maxSequenceLength, @@ -116,14 +138,17 @@ void GptSession::setup(SizeType const batchSize, SizeType const beamWidth, SizeT tokensPerBlock, maxNumBlocks, batchSize, kvDtype, mRuntime->getStreamPtr()); } - auto const logitsType = utils::getTensorDataType(mRuntime->getEngine(), "logits"); - - createDecoder(decoderPerRequest); - mDecoder->setup(batchSize, beamWidth, maxSequenceLength, logitsType); + if (mWorldConfig.isLastPipelineParallelRank()) + { + auto const logitsType = mRuntime->getEngine().getTensorDataType("logits"); + createDecoder(decoderPerRequest); + 
mDecoder->setup(batchSize, beamWidth, maxSequenceLength, logitsType); + } // reshape does not care about maxInputLength or maxNewTokens auto const generationConfig = RuntimeBuffers::GenerationConfig{batchSize, beamWidth, 0, 0, maxSequenceLength}; - mBuffers->reshape(generationConfig, mModelConfig, mWorldConfig.getSize()); + mBuffers->reshape(generationConfig, mModelConfig, mWorldConfig); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptSession::generate( @@ -151,20 +176,11 @@ void GptSession::generate( auto const beamWidth = generationConfig.beamWidth; auto const maxInputLength = generationConfig.maxInputLength; auto const maxNewTokens = generationConfig.maxNewTokens; - auto const maxSeqLength = generationConfig.maxSeqLength; - auto finalSeqLength = maxSeqLength; TLLM_CHECK_WITH_INFO(buffers.allocated, "Buffers not allocated, please call setup first!"); - buffers.reshape(generationConfig, mModelConfig, mWorldConfig.getSize()); + buffers.reshape(generationConfig, mModelConfig, mWorldConfig); - if (mModelConfig.usePackedInput()) - { - buffers.inputOffsets->reshape(ITensor::makeShape({batchSize + 1})); - manager.setZero(*buffers.inputOffsets); - kernels::invokeInclusiveSum( - *ITensor::slice(buffers.inputOffsets, 1), *buffers.contextLengthsDevice, manager, stream); - } if (mModelConfig.usePagedKvCache()) { auto const contextLengthsHost = bufferCast(*buffers.contextLengthsHost); @@ -174,23 +190,39 @@ void GptSession::generate( } } - mDecoder->newBatch(inputs, samplingConfig); - RuntimeBuffers::TensorMap inputBuffers[2]; RuntimeBuffers::TensorMap outputBuffers[2]; auto& onTokenGenerated = outputs.onTokenGenerated; + outputs.ids->reshape(ITensor::makeShape({batchSize, beamWidth, mDecoderMaxSequenceLength})); + ITensor::SharedPtr newTokens; + if (mWorldConfig.isLastPipelineParallelRank()) + { + mDecoder->newBatch(inputs, samplingConfig); + newTokens = mDecoder->getNewTokens(); + } + else if (mWorldConfig.isFirstPipelineParallelRank()) + { + newTokens = manager.gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kINT32); + } for (SizeType step = 0; step < maxNewTokens; ++step) { auto const contextId = step % 2; + bool enqueueSuccessful = false; if (step == 0) { + SizeType contextIdForContextPhase = 0; + if (mRuntime->getNbProfiles() == 2) + { + contextIdForContextPhase = 2; + } buffers.prepareContextStep( - inputs.ids, inputs.padId, manager, *mKvCacheManager, generationConfig, mModelConfig); - buffers.getRuntimeBuffers( - inputBuffers[contextId], outputBuffers[contextId], step, inputs.ids, *mKvCacheManager, mModelConfig); - mRuntime->setInputTensors(contextId, inputBuffers[contextId]); - mRuntime->setOutputTensors(contextId, outputBuffers[contextId]); + inputs.ids, inputs.padId, manager, *mKvCacheManager, generationConfig, mModelConfig, mWorldConfig); + buffers.getRuntimeBuffers(inputBuffers[contextId], outputBuffers[contextId], step, inputs.ids, + *mKvCacheManager, mModelConfig, mWorldConfig); + mRuntime->setInputTensors(contextIdForContextPhase, inputBuffers[contextId]); + mRuntime->setOutputTensors(contextIdForContextPhase, outputBuffers[contextId]); + if (isCudaGraphMode()) { for (auto& instance : mCudaGraphInstances) @@ -198,16 +230,19 @@ void GptSession::generate( instance.clear(); } } - } - bool enqueueSuccessful = false; - if (isCudaGraphMode() && mCudaGraphInstances[contextId].hasInstance()) - { - mCudaGraphInstances[contextId].launch(stream); - enqueueSuccessful = true; + enqueueSuccessful = mRuntime->executeContext(contextIdForContextPhase); } else { - 
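Taken together, createContexts() and the step-0 branch in generate() above define a simple mapping from decoding step to execution context: contexts 0 and 1 flip-flop across generation steps, and an engine built with two optimization profiles gets a third context (index 2) that serves only the context phase. A sketch of that selection logic, with an illustrative helper name:

#include <cassert>

// Context selection implied by the code above: profile 0 covers the context
// phase, profile 1 the generation phase when two profiles are present.
int selectContextId(int step, int numProfiles)
{
    assert(numProfiles == 1 || numProfiles == 2);
    if (step == 0 && numProfiles == 2)
        return 2;    // dedicated context bound to the context-phase profile
    return step % 2; // alternate between the two generation contexts
}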
enqueueSuccessful = mRuntime->executeContext(contextId); + if (isCudaGraphMode() && mCudaGraphInstances[contextId].hasInstance()) + { + mCudaGraphInstances[contextId].launch(stream); + enqueueSuccessful = true; + } + else + { + enqueueSuccessful = mRuntime->executeContext(contextId); + } } TLLM_CHECK_WITH_INFO(enqueueSuccessful, "Executing TRT engine failed!"); @@ -215,50 +250,25 @@ void GptSession::generate( if (step == 0) { - buffers.postContextStep(manager, generationConfig, mModelConfig); + buffers.postContextStep(manager, generationConfig, mModelConfig, mWorldConfig); } std::swap(buffers.cacheIndirectionDecoderInput, buffers.cacheIndirectionDecoderOutput); - decoder::Input decodingInput{buffers.logits}; - decoder::Output decodingOutput{}; - decodingInput.cacheIndirection = buffers.cacheIndirectionDecoderInput; - decodingOutput.cacheIndirection = buffers.cacheIndirectionDecoderOutput; - if (step < maxNewTokens - 1) { auto const nextStep = step + 1; auto const nextContextId = nextStep % 2; auto nextInputIds = buffers.prepareNextStep( - step, mDecoder->getNewTokens(), manager, *mKvCacheManager, generationConfig, mModelConfig); + step, newTokens, manager, *mKvCacheManager, generationConfig, mModelConfig, mWorldConfig); buffers.getRuntimeBuffers(inputBuffers[nextContextId], outputBuffers[nextContextId], nextStep, nextInputIds, - *mKvCacheManager, mModelConfig); + *mKvCacheManager, mModelConfig, mWorldConfig); mRuntime->setInputTensors(nextContextId, inputBuffers[nextContextId]); mRuntime->setOutputTensors(nextContextId, outputBuffers[nextContextId]); if (isCudaGraphMode()) { - // capture cuda graph - cudaGraph_t next_graph; - TLLM_CUDA_CHECK(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal)); - mRuntime->executeContext(nextContextId); - TLLM_CUDA_CHECK(cudaStreamEndCapture(stream.get(), &next_graph)); - - if (mCudaGraphInstances[nextContextId].hasInstance()) - { - if (mCudaGraphInstances[nextContextId].update(next_graph)) - { - mCudaGraphInstances[nextContextId].clear(); - mCudaGraphInstances[nextContextId].create(next_graph); - } - } - else - { - mCudaGraphInstances[nextContextId].create(next_graph); - } - - TLLM_CUDA_CHECK(cudaGraphDestroy(next_graph)); - mCudaGraphInstances[nextContextId].uploadToStream(stream); + mCudaGraphInstances[nextContextId].prepareNextGraph(*mRuntime, nextContextId); } } @@ -267,17 +277,21 @@ void GptSession::generate( // FIXME(nkorobov): this synchronize is important to get logits right // manager.getStream().synchronize(); - auto const shouldStop = mDecoder->forward(decodingOutput, decodingInput); + auto shouldStop = executeDecoderStep(outputs.ids, newTokens, maxInputLength + step); - if (onTokenGenerated) + if (mWorldConfig.isFirstPipelineParallelRank()) { - // TODO(rkobus) use getNewTokens(), remove step from Callback? - onTokenGenerated(mDecoder->getOutputIds(), step, shouldStop || step == maxNewTokens - 1); + if (onTokenGenerated) + { + // TODO(rkobus) use getNewTokens(), remove step from Callback? + ITensor::SharedPtr outputIds + = mWorldConfig.isPipelineParallel() ? 
outputs.ids : mDecoder->getOutputIds(); + onTokenGenerated(outputIds, step, shouldStop || step == maxNewTokens - 1); + } } if (shouldStop) { - finalSeqLength = maxInputLength + step + 1; mLogger->log(nvinfer1::ILogger::Severity::kVERBOSE, "GPT decoding finished early"); break; } @@ -291,38 +305,161 @@ void GptSession::generate( } } - outputs.ids->reshape(ITensor::makeShape({batchSize, beamWidth, finalSeqLength})); - manager.copy(*mDecoder->getFinalOutputIds(), *outputs.ids); + finalizeOutputIds(*outputs.ids); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + +bool GptSession::executeDecoderStep(ITensor::SharedPtr& outputIds, ITensor::SharedPtr& newTokens, SizeType decoderStep) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto& stream = mRuntime->getStream(); + auto& buffers = *mBuffers; + + auto shouldStopPtr = bufferCast(*buffers.shouldStop); + auto& shouldStop = *shouldStopPtr; + shouldStop = false; + if (mWorldConfig.isLastPipelineParallelRank()) + { + decoder::Input decodingInput{buffers.logits}; + decoder::Output decodingOutput{}; + decodingInput.cacheIndirection = buffers.cacheIndirectionDecoderInput; + decodingOutput.cacheIndirection = buffers.cacheIndirectionDecoderOutput; + decodingOutput.sequenceLengths = buffers.sequenceLengths; + + shouldStop = mDecoder->forward(decodingOutput, decodingInput); + } + + if (mWorldConfig.isPipelineParallel()) + { + if (mWorldConfig.isLastPipelineParallelRank()) + { + for (auto peer = 0; peer < mWorldConfig.getPipelineParallelism() - 1; ++peer) + { + mPipelineComm->send(shouldStopPtr, 1, peer, stream, *mLogger); + } + mPipelineComm->send(bufferCast(*newTokens), newTokens->getSize(), 0, stream, *mLogger); + } + else + { + auto const peer = mWorldConfig.getPipelineParallelism() - 1; + mPipelineComm->receive(shouldStopPtr, 1, peer, stream, *mLogger); + + if (mWorldConfig.isFirstPipelineParallelRank()) + { + mPipelineComm->receive( + bufferCast(*newTokens), newTokens->getSize(), peer, stream, *mLogger); + + auto const& newTokensShape = newTokens->getShape(); + auto newTokensView + = ITensor::view(outputIds, ITensor::makeShape({1, newTokensShape.d[0] * newTokensShape.d[1]})); + auto const& outputIdsShape = outputIds->getShape(); + auto outputIdsView = ITensor::view( + outputIds, ITensor::makeShape({outputIdsShape.d[0] * outputIdsShape.d[1], outputIdsShape.d[2]})); + kernels::invokeTransposeWithOutputOffset(*outputIdsView, *newTokensView, decoderStep, stream); + } + } + } + sync_check_cuda_error(); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + return shouldStop; +} + +void GptSession::finalizeOutputIds(ITensor& outputIds) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto& manager = mRuntime->getBufferManager(); + auto& stream = mRuntime->getStream(); + + ITensor::SharedPtr finalOutputIds; + if (mWorldConfig.isLastPipelineParallelRank()) + { + finalOutputIds = mDecoder->getFinalOutputIds(); + if (mWorldConfig.isPipelineParallel()) + { + mPipelineComm->send( + bufferCast(*finalOutputIds), finalOutputIds->getSize(), 0, stream, *mLogger); + } + } + if (mWorldConfig.isFirstPipelineParallelRank()) + { + if (mWorldConfig.isPipelineParallel()) + { + auto const peer = mWorldConfig.getPipelineParallelism() - 1; + mPipelineComm->receive(bufferCast(outputIds), outputIds.getSize(), peer, stream, *mLogger); + } + else + { + manager.copy(*finalOutputIds, outputIds); + } + } sync_check_cuda_error(); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptSession::CudaGraphExecutor::create(cudaGraph_t const& graph) { + 
TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); assert(mInstance == nullptr); TLLM_CUDA_CHECK(cudaGraphInstantiate(&mInstance, graph, nullptr, nullptr, 0)); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptSession::CudaGraphExecutor::uploadToStream(CudaStream const& stream) { - assert(mInstance.hasInstance()); + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + assert(hasInstance()); TLLM_CUDA_CHECK(cudaGraphUpload(mInstance, stream.get())); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void GptSession::CudaGraphExecutor::launch(CudaStream const& stream) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); TLLM_CUDA_CHECK(cudaGraphLaunch(mInstance, stream.get())); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } bool GptSession::CudaGraphExecutor::update(cudaGraph_t const& graph) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); return cudaGraphExecUpdate(mInstance, graph, nullptr) != cudaSuccess; } void GptSession::CudaGraphExecutor::clear() { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); if (mInstance != nullptr) { TLLM_CUDA_CHECK(cudaGraphExecDestroy(mInstance)); mInstance = nullptr; } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + +void GptSession::CudaGraphExecutor::prepareNextGraph(TllmRuntime const& runtime, SizeType nextContextId) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto& stream = runtime.getStream(); + + cudaGraph_t nextGraph; + TLLM_CUDA_CHECK(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal)); + runtime.executeContext(nextContextId); + TLLM_CUDA_CHECK(cudaStreamEndCapture(stream.get(), &nextGraph)); + + if (hasInstance()) + { + if (update(nextGraph)) + { + clear(); + create(nextGraph); + } + } + else + { + create(nextGraph); + } + + TLLM_CUDA_CHECK(cudaGraphDestroy(nextGraph)); + uploadToStream(stream); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } diff --git a/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp new file mode 100644 index 00000000000..e338c0a46cf --- /dev/null +++ b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/runtime/ncclCommunicator.h" + +#include "tensorrt_llm/runtime/utils/multiDeviceUtils.h" + +#include +#include + +#if ENABLE_MULTI_DEVICE +#include +#endif // ENABLE_MULTI_DEVICE + +using namespace tensorrt_llm::runtime; + +namespace +{ +#if ENABLE_MULTI_DEVICE +//! \brief For converting a C++ data type to a Nccl data type. 
+template +struct NcclDataType +{ +}; + +template <> +struct NcclDataType +{ + static constexpr auto value = ncclDataType_t::ncclHalf; +}; + +template <> +struct NcclDataType +{ + static constexpr auto value = ncclDataType_t::ncclFloat; +}; + +template <> +struct NcclDataType +{ + static constexpr auto value = ncclDataType_t::ncclUint8; +}; + +template <> +struct NcclDataType +{ + static constexpr auto value = ncclDataType_t::ncclInt32; +}; +#endif // ENABLE_MULTI_DEVICE +} // namespace + +template +void NcclCommunicator::send( + T* sendbuff, size_t count, int peer, CudaStream const& stream, nvinfer1::ILogger& logger) const +{ +#if ENABLE_MULTI_DEVICE + auto datatype = NcclDataType::value; + TLLM_NCCL_CHECK(ncclSend(sendbuff, count, datatype, peer, mComm, stream.get()), logger); +#else + TLLM_THROW("Multi device support is disabled."); +#endif // ENABLE_MULTI_DEVICE +} + +template void NcclCommunicator::send(std::uint8_t*, size_t, int, CudaStream const&, nvinfer1::ILogger&) const; +template void NcclCommunicator::send(std::int32_t*, size_t, int, CudaStream const&, nvinfer1::ILogger&) const; + +template +void NcclCommunicator::receive( + T* sendbuff, size_t count, int peer, CudaStream const& stream, nvinfer1::ILogger& logger) const +{ +#if ENABLE_MULTI_DEVICE + auto datatype = NcclDataType::value; + TLLM_NCCL_CHECK(ncclRecv(sendbuff, count, datatype, peer, mComm, stream.get()), logger); +#else + TLLM_THROW("Multi device support is disabled."); +#endif // ENABLE_MULTI_DEVICE +} + +template void NcclCommunicator::receive(std::uint8_t*, size_t, int, CudaStream const&, nvinfer1::ILogger&) const; +template void NcclCommunicator::receive(std::int32_t*, size_t, int, CudaStream const&, nvinfer1::ILogger&) const; + +std::shared_ptr NcclCommunicator::createPipelineComm( + WorldConfig const& worldConfig, nvinfer1::ILogger& logger) +{ +#if ENABLE_MULTI_DEVICE + auto ppGroup = worldConfig.getPipelineParallelGroup(); + + int myRank = worldConfig.getRank(); + int groupRank = 0; + for (auto it = ppGroup.begin(); it != ppGroup.end(); ++it) + { + if (*it == myRank) + { + break; + } + ++groupRank; + } + + ncclUniqueId id; + if (myRank == ppGroup.front()) + { + ncclGetUniqueId(&id); + for (auto it = std::next(std::begin(ppGroup), 1); it != ppGroup.end(); ++it) + { + TLLM_MPI_CHECK(MPI_Send(&id, sizeof(id), MPI_BYTE, *it, 0, MPI_COMM_WORLD), logger); + } + } + else + { + MPI_Status status; + TLLM_MPI_CHECK(MPI_Recv(&id, sizeof(id), MPI_BYTE, ppGroup.front(), 0, MPI_COMM_WORLD, &status), logger); + } + + auto pipelineComm = std::make_shared(); + TLLM_NCCL_CHECK(ncclCommInitRank(&pipelineComm->mComm, ppGroup.size(), id, groupRank), logger); + + return pipelineComm; +#else + TLLM_THROW("Multi device support is disabled."); + return nullptr; +#endif // ENABLE_MULTI_DEVICE +} diff --git a/cpp/tensorrt_llm/runtime/ncclCommunicator.h b/cpp/tensorrt_llm/runtime/ncclCommunicator.h new file mode 100644 index 00000000000..1843cd24a4f --- /dev/null +++ b/cpp/tensorrt_llm/runtime/ncclCommunicator.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/runtime/cudaStream.h" +#include "tensorrt_llm/runtime/worldConfig.h" + +struct ncclComm; +typedef struct ncclComm* ncclComm_t; + +namespace tensorrt_llm::runtime +{ + +class NcclCommunicator +{ +public: + template + void send(T* sendbuff, size_t count, int peer, CudaStream const& stream, nvinfer1::ILogger& logger) const; + + template + void receive(T* sendbuff, size_t count, int peer, CudaStream const& stream, nvinfer1::ILogger& logger) const; + + static std::shared_ptr createPipelineComm( + WorldConfig const& worldConfig, nvinfer1::ILogger& logger); + +private: + ncclComm_t mComm; +}; + +} // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp b/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp index 0875eb59032..7252a633544 100644 --- a/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp +++ b/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp @@ -19,6 +19,9 @@ #include "tensorrt_llm/runtime/runtimeBuffers.h" +#include +#include + #include "tensorrt_llm/batch_manager/kvCacheManager.h" #include "tensorrt_llm/runtime/runtimeKernels.h" #include "tensorrt_llm/runtime/tllmRuntime.h" @@ -30,20 +33,22 @@ RuntimeBuffers::GenerationConfig RuntimeBuffers::GenerationConfig::fromInput(ITe ITensor::SharedPtr const& inputLengthsHost, bool const inputPacked, SizeType const beamWidth, SizeType const maxSequenceLength, std::optional const& maxNewTokensOpt, BufferManager& manager) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const batchSize = static_cast(inputLengthsHost->getSize()); auto const* inputLengthsPtr = bufferCast(*inputLengthsHost); - auto const maxInputLength = *std::max_element(inputLengthsPtr, inputLengthsPtr + batchSize); + SizeType const maxInputLength = *std::max_element(inputLengthsPtr, inputLengthsPtr + batchSize); + auto const& inputShape = inputIds->getShape(); if (inputPacked) { auto const inputLengthSum = std::reduce(inputLengthsPtr, inputLengthsPtr + batchSize); - TLLM_CHECK_WITH_INFO(inputIds->getShape().d[0] == 1 && inputIds->getShape().d[1] == inputLengthSum, + TLLM_CHECK_WITH_INFO(inputShape.d[0] == 1 && inputShape.d[1] == inputLengthSum, "Packed input must have shape [1, ]."); } else { - TLLM_CHECK_WITH_INFO(inputIds->getShape().d[0] == batchSize && inputIds->getShape().d[1] == maxInputLength, + TLLM_CHECK_WITH_INFO(inputShape.d[0] == batchSize && inputShape.d[1] == maxInputLength, "Padded input must have shape [batch size, max input length]"); } @@ -52,55 +57,74 @@ RuntimeBuffers::GenerationConfig RuntimeBuffers::GenerationConfig::fromInput(ITe "Max input length is equal to or larger that maxSequenceLength given in setup. 
No new tokens can be " "generated."); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return GenerationConfig{batchSize, beamWidth, maxInputLength, maxNewTokens, maxSequenceLength}; } void RuntimeBuffers::clear() { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + contextLengthsHost = nullptr; + contextLengthsDevice = nullptr; + logits = nullptr; sequenceLengths = nullptr; pastKeyValueLengths = nullptr; attentionMask = nullptr; positionIds = nullptr; lastTokenIds = nullptr; + requestTypes = nullptr; presentKeysVals.clear(); presentKeysValsAlt.clear(); + kvCacheBlockPointers = nullptr; - contextLengthsHost = nullptr; - requestTypes = nullptr; + cacheIndirectionDecoderInput = nullptr; + cacheIndirectionDecoderOutput = nullptr; + + hiddenStates = nullptr; allocated = false; + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::create(TllmRuntime& runtime, GptModelConfig const& modelConfig) +void RuntimeBuffers::create(TllmRuntime& runtime, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& manager = runtime.getBufferManager(); + auto& engine = runtime.getEngine(); - auto const logitsType = utils::getTensorDataType(runtime.getEngine(), "logits"); - logits = manager.emptyTensor(MemoryType::kGPU, logitsType); + if (worldConfig.isLastPipelineParallelRank()) + { + auto const logitsType = engine.getTensorDataType("logits"); + logits = manager.emptyTensor(MemoryType::kGPU, logitsType); + } contextLengthsHost = manager.emptyTensor(MemoryType::kPINNED, nvinfer1::DataType::kINT32); - inputOffsets = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); + sequenceLengths = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); + lastTokenIds = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); + + auto const localNbLayers = modelConfig.getNbLayers(worldConfig.getPipelineParallelism()); + auto const firstLayerId = worldConfig.getPipelineParallelRank() * localNbLayers; presentKeysVals - = utils::createBufferVector(runtime, modelConfig.getNbLayers(), "present_key_value_", MemoryType::kGPU); + = utils::createBufferVector(runtime, firstLayerId, localNbLayers, "present_key_value_", MemoryType::kGPU); if (modelConfig.useGptAttentionPlugin()) { - sequenceLengths = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); pastKeyValueLengths = manager.emptyTensor(MemoryType::kCPU, nvinfer1::DataType::kINT32); } else { presentKeysValsAlt - = utils::createBufferVector(runtime, modelConfig.getNbLayers(), "present_key_value_", MemoryType::kGPU); + = utils::createBufferVector(runtime, firstLayerId, localNbLayers, "present_key_value_", MemoryType::kGPU); } if (modelConfig.usePagedKvCache()) { - kvCacheBlockPointers = utils::createBufferVector( - runtime, modelConfig.getNbLayers(), "kv_cache_block_pointers_", MemoryType::kGPU); + auto const kvCacheBlockPointersType + = engine.getTensorDataType(("kv_cache_block_pointers_" + std::to_string(firstLayerId)).c_str()); + kvCacheBlockPointers = manager.emptyTensor(MemoryType::kGPU, kvCacheBlockPointersType); } if (modelConfig.useGptAttentionPlugin()) @@ -110,31 +134,51 @@ void RuntimeBuffers::create(TllmRuntime& runtime, GptModelConfig const& modelCon cacheIndirectionDecoderInput = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); cacheIndirectionDecoderOutput = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); + + shouldStop = BufferManager::pinned(ITensor::makeShape({1}), 
nvinfer1::DataType::kUINT8); + + if (worldConfig.isPipelineParallel()) + { + hiddenStates = manager.emptyTensor(MemoryType::kGPU, modelConfig.getDataType()); + } + + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void RuntimeBuffers::reshape( - GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, SizeType worldSize) + GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto const batchSize = generationConfig.batchSize; auto const beamWidth = generationConfig.beamWidth; + auto const maxInputLength = generationConfig.maxInputLength; auto const maxSeqLength = generationConfig.maxSeqLength; - auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldSize); - // logits are tiled to {batchSize, beamWidth, vocabSizePadded} after context step of engine - logits->reshape(ITensor::makeShape({batchSize, 1, vocabSizePadded})); + if (worldConfig.isLastPipelineParallelRank()) + { + auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize()); + // logits are tiled to {batchSize, beamWidth, vocabSizePadded} after context step of engine + logits->reshape(ITensor::makeShape({batchSize, 1, vocabSizePadded})); + } + + sequenceLengths->reshape(ITensor::makeShape({batchSize})); + lastTokenIds->reshape(ITensor::makeShape({batchSize})); auto kvCacheShape = ITensor::makeShape({batchSize, 2, modelConfig.getNbKvHeads(), maxSeqLength, modelConfig.getSizePerHead()}); if (modelConfig.usePagedKvCache()) { + auto const localNbLayers = modelConfig.getNbLayers(worldConfig.getPipelineParallelism()); auto const tokensPerBlock = modelConfig.getTokensPerBlock(); auto const maxBlocksPerSeq = (maxSeqLength + tokensPerBlock - 1) / tokensPerBlock; // reserve batchSize * beamWidth and resize to batchSize - auto cacheBlockPointersShape = ITensor::makeShape({batchSize * beamWidth, 2, maxBlocksPerSeq * 2}); - utils::reshapeBufferVector(kvCacheBlockPointers, cacheBlockPointersShape); - cacheBlockPointersShape.d[0] = batchSize; - utils::reshapeBufferVector(kvCacheBlockPointers, cacheBlockPointersShape); + auto cacheBlockPointersShape + = ITensor::makeShape({localNbLayers, batchSize * beamWidth, 2, maxBlocksPerSeq * 2}); + kvCacheBlockPointers->reshape(cacheBlockPointersShape); + cacheBlockPointersShape.d[1] = batchSize; + kvCacheBlockPointers->reshape(cacheBlockPointersShape); } else { @@ -143,7 +187,6 @@ void RuntimeBuffers::reshape( if (modelConfig.useGptAttentionPlugin()) { - sequenceLengths->reshape(ITensor::makeShape({batchSize})); pastKeyValueLengths->reshape(ITensor::makeShape({batchSize})); requestTypes->reshape(ITensor::makeShape({batchSize})); } @@ -156,27 +199,40 @@ void RuntimeBuffers::reshape( cacheIndirectionDecoderInput->reshape(cacheIndirShape); cacheIndirectionDecoderOutput->reshape(cacheIndirShape); + if (worldConfig.isPipelineParallel()) + { + // reserve max size + auto const maxNumTokens = std::max(batchSize * beamWidth, batchSize * maxInputLength); + auto const hiddenSize = modelConfig.getHiddenSize() * worldConfig.getTensorParallelism(); + auto const hiddenStatesShape = ITensor::makeShape({1, maxNumTokens, hiddenSize}); + hiddenStates->reshape(hiddenStatesShape); + } + allocated = true; + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::tile( - BufferManager& manager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig) +void RuntimeBuffers::tile(BufferManager& manager, GenerationConfig const& 
generationConfig, + GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { - auto const batchSize = generationConfig.batchSize; + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const beamWidth = generationConfig.beamWidth; TLLM_CHECK_WITH_INFO(beamWidth > 1, "Tiling is only necessary for beam search."); - // logits needs beamWidth in second dimension - auto logitsShape = logits->getShape(); - logitsShape.d[1] *= beamWidth; - utils::tileBufferReplace(logits, beamWidth, manager); - logits->reshape(logitsShape); + if (worldConfig.isLastPipelineParallelRank()) + { + // logits needs beamWidth in second dimension + auto logitsShape = logits->getShape(); + logitsShape.d[1] *= beamWidth; + utils::tileBufferReplace(logits, beamWidth, manager); + logits->reshape(logitsShape); + } utils::tileBufferReplace(contextLengthsDevice, beamWidth, manager); + utils::tileBufferReplace(sequenceLengths, beamWidth, manager); if (modelConfig.useGptAttentionPlugin()) { - utils::tileBufferReplace(sequenceLengths, beamWidth, manager); utils::tileCpuBufferReplace(contextLengthsHost, beamWidth, manager); utils::tileCpuBufferReplace(pastKeyValueLengths, beamWidth, manager); } @@ -192,14 +248,15 @@ void RuntimeBuffers::tile( for (auto& buffer : presentKeysValsAlt) utils::tileBufferReplace(buffer, beamWidth, manager); } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::postContextStep( - BufferManager& manager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig) +void RuntimeBuffers::postContextStep(BufferManager& manager, GenerationConfig const& generationConfig, + GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const batchSize = generationConfig.batchSize; auto const beamWidth = generationConfig.beamWidth; - auto const maxSeqLength = generationConfig.maxSeqLength; if (modelConfig.useGptAttentionPlugin()) { @@ -210,7 +267,7 @@ void RuntimeBuffers::postContextStep( if (beamWidth > 1) { - tile(manager, generationConfig, modelConfig); + tile(manager, generationConfig, modelConfig, worldConfig); } // no need to copy data in lastTokenIds because it is overwritten in prepareNextStep @@ -218,21 +275,23 @@ void RuntimeBuffers::postContextStep( if (modelConfig.useGptAttentionPlugin() && modelConfig.usePagedKvCache()) { - auto const& pointersShape = kvCacheBlockPointers[0]->getShape(); - auto const maxBlocksPerSeq = pointersShape.d[pointersShape.nbDims - 1] / 2; - auto cacheBlockPointersShape = ITensor::makeShape({batchSize * beamWidth, 2, maxBlocksPerSeq * 2}); - utils::reshapeBufferVector(kvCacheBlockPointers, cacheBlockPointersShape); + auto cacheBlockPointersShape = kvCacheBlockPointers->getShape(); + cacheBlockPointersShape.d[1] = batchSize * beamWidth; + kvCacheBlockPointers->reshape(cacheBlockPointersShape); } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType const padId, BufferManager& manager, - KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig) + KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& stream = manager.getStream(); SizeType const batchSize = generationConfig.batchSize; - SizeType const beamWidth = generationConfig.beamWidth; SizeType const maxInputLength = 
generationConfig.maxInputLength; - SizeType const maxSeqLength = generationConfig.maxSeqLength; + + manager.copy(*contextLengthsDevice, *sequenceLengths); if (modelConfig.useGptAttentionPlugin()) { @@ -246,24 +305,25 @@ void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType c std::fill_n(RequestTypesPtr, batchSize, 0); } - if (modelConfig.usePackedInput()) + auto const inputSize = inputIds->getSize(); + auto const& inputShape = inputIds->getShape(); + + auto const contextLengthsHostPtr = bufferCast(*contextLengthsHost); + std::vector positionIdsVec(inputSize); + auto begin = std::begin(positionIdsVec); + for (SizeType i = 0; i < batchSize; ++i) { - auto const inputOffsetsHost = manager.copyFrom(*inputOffsets, MemoryType::kCPU); - auto const* inputOffsetsPtr = bufferCast(*inputOffsetsHost); - - std::vector positionIdsVec(inputIds->getShape().d[1]); - for (SizeType i = 0; i < batchSize; ++i) - std::iota(std::begin(positionIdsVec) + inputOffsetsPtr[i], - std::begin(positionIdsVec) + inputOffsetsPtr[i + 1], 0); - positionIds = manager.copyFrom(positionIdsVec, inputIds->getShape(), MemoryType::kGPU); + auto end = begin + (modelConfig.usePackedInput() ? contextLengthsHostPtr[i] : maxInputLength); + std::iota(begin, end, 0); + begin = end; } - else + positionIds = manager.copyFrom(positionIdsVec, inputShape, MemoryType::kGPU); + + if (worldConfig.isPipelineParallel()) { - std::vector positionIdsVec(inputIds->getSize()); - for (SizeType i = 0; i < batchSize; ++i) - std::iota(std::begin(positionIdsVec) + i * maxInputLength, - std::begin(positionIdsVec) + (i + 1) * maxInputLength, 0); - positionIds = manager.copyFrom(positionIdsVec, inputIds->getShape(), MemoryType::kGPU); + auto const hiddenSize = hiddenStates->getShape().d[2]; + auto const hiddenStatesShape = ITensor::makeShape({inputShape.d[0], inputShape.d[1], hiddenSize}); + hiddenStates->reshape(hiddenStatesShape); } } else @@ -285,62 +345,55 @@ void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType c positionIds = manager.copyFrom(positionIdsVec, attentionMask->getShape(), MemoryType::kGPU); } - if (modelConfig.useGptAttentionPlugin()) - { - manager.copy(*contextLengthsDevice, *sequenceLengths); - } - if (modelConfig.useGptAttentionPlugin() && modelConfig.usePagedKvCache()) { auto constexpr contextBeamWidth = 1; - auto const& pointersShape = kvCacheBlockPointers[0]->getShape(); + auto const& pointersShape = kvCacheBlockPointers->getShape(); auto const maxBlocksPerSeq = pointersShape.d[pointersShape.nbDims - 1] / 2; auto const& blockPointersBatch = kvCacheManager.getBlockPointersOfBatch(batchSize, contextBeamWidth, maxBlocksPerSeq); - for (auto layer = 0; layer < modelConfig.getNbLayers(); ++layer) - { - TLLM_CHECK(blockPointersBatch[layer]->getSizeInBytes() == kvCacheBlockPointers[layer]->getSizeInBytes()); - auto pointersPtr = bufferCast(*blockPointersBatch[layer]); - auto pointersPtr32 = reinterpret_cast(pointersPtr); - manager.copy(pointersPtr32, *kvCacheBlockPointers[layer]); - } + TLLM_CHECK(blockPointersBatch->getSizeInBytes() == kvCacheBlockPointers->getSizeInBytes()); + auto pointersPtr = bufferCast(*blockPointersBatch); + auto pointersPtr32 = reinterpret_cast(pointersPtr); + manager.copy(pointersPtr32, *kvCacheBlockPointers); } if (modelConfig.usePackedInput()) { - lastTokenIds = manager.copyFrom(*ITensor::slice(inputOffsets, 1), MemoryType::kGPU); + kernels::invokeInclusiveSum(*lastTokenIds, *contextLengthsDevice, manager, stream); } else { - lastTokenIds = 
manager.copyFrom(*contextLengthsDevice, MemoryType::kGPU); + manager.copy(*contextLengthsDevice, *lastTokenIds); } manager.setZero(*cacheIndirectionDecoderInput); manager.setZero(*cacheIndirectionDecoderOutput); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); }; RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, TensorPtr const& outputIds, BufferManager& manager, KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig) + GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& stream = manager.getStream(); SizeType const batchSize = generationConfig.batchSize; SizeType const beamWidth = generationConfig.beamWidth; - SizeType const maxSeqLength = generationConfig.maxSeqLength; - nvinfer1::Dims nextInputIdsShape; + nvinfer1::Dims inputShape; if (modelConfig.usePackedInput()) { - // squeeze first dim and batch in last dim - nextInputIdsShape = ITensor::makeShape({1, batchSize * beamWidth}); + // batch in last dim + inputShape = ITensor::makeShape({1, batchSize * beamWidth}); } else { - // squeeze first dim - nextInputIdsShape = ITensor::makeShape({batchSize * beamWidth, 1}); + // batch in first dim + inputShape = ITensor::makeShape({batchSize * beamWidth, 1}); } - auto nextInputIds = ITensor::view(outputIds, nextInputIdsShape); + auto nextInputIds = ITensor::view(outputIds, inputShape); if (modelConfig.useGptAttentionPlugin()) { @@ -354,18 +407,16 @@ RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, T pastKeyValueLengthsPtr[i] = contextLengthsHostPtr[i * srcStride] + step; } - // The sequence_lengths = context_lengths + step for generation stage. - kernels::invokeAdd(*sequenceLengths, 1, stream); - - positionIds->reshape(contextLengthsDevice->getShape()); + positionIds->reshape(inputShape); manager.copy(*contextLengthsDevice, *positionIds); kernels::invokeAdd(*positionIds, step, stream); - auto const size = static_cast(positionIds->getSize()); - if (modelConfig.usePackedInput()) - positionIds->reshape(ITensor::makeShape({1, size})); - else - positionIds->reshape(ITensor::makeShape({size, 1})); + if (worldConfig.isPipelineParallel()) + { + auto const hiddenSize = hiddenStates->getShape().d[2]; + auto const hiddenStatesShape = ITensor::makeShape({inputShape.d[0], inputShape.d[1], hiddenSize}); + hiddenStates->reshape(hiddenStatesShape); + } } else { @@ -405,16 +456,13 @@ RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, T { kvCacheManager.addToken(batchIdx); } - auto const& pointersShape = kvCacheBlockPointers[0]->getShape(); + auto const& pointersShape = kvCacheBlockPointers->getShape(); auto const maxBlocksPerSeq = pointersShape.d[pointersShape.nbDims - 1] / 2; auto const& blockPointersBatch = kvCacheManager.getBlockPointersOfBatch(batchSize, beamWidth, maxBlocksPerSeq); - for (auto layer = 0; layer < modelConfig.getNbLayers(); ++layer) - { - TLLM_CHECK(blockPointersBatch[layer]->getSizeInBytes() == kvCacheBlockPointers[layer]->getSizeInBytes()); - auto pointersPtr = bufferCast(*blockPointersBatch[layer]); - auto pointersPtr32 = reinterpret_cast(pointersPtr); - manager.copy(pointersPtr32, *kvCacheBlockPointers[layer]); - } + TLLM_CHECK(blockPointersBatch->getSizeInBytes() == kvCacheBlockPointers->getSizeInBytes()); + auto pointersPtr = bufferCast(*blockPointersBatch); + auto pointersPtr32 = reinterpret_cast(pointersPtr); + manager.copy(pointersPtr32, *kvCacheBlockPointers); } 
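The context-step code above derives last_token_ids from the context lengths (an inclusive sum for packed input, a plain copy otherwise), and the generation-step code just below fills ones and then runs an inclusive sum when the input is packed. A host-side toy of the same arithmetic; the patch performs it on the GPU with invokeInclusiveSum and invokeFill:

#include <numeric>
#include <vector>

// For packed input the values are running offsets into the token buffer;
// for padded input they are per-row indices.
std::vector<int> lastTokenIdsContextStep(std::vector<int> const& contextLengths, bool packedInput)
{
    std::vector<int> ids(contextLengths);
    if (packedInput)
        std::inclusive_scan(ids.begin(), ids.end(), ids.begin());
    return ids;
}

std::vector<int> lastTokenIdsGenerationStep(int batchSize, int beamWidth, bool packedInput)
{
    std::vector<int> ids(static_cast<size_t>(batchSize) * beamWidth, 1); // one new token per sequence
    if (packedInput)
        std::inclusive_scan(ids.begin(), ids.end(), ids.begin());        // 1, 2, ..., batchSize * beamWidth
    return ids;
}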
kernels::invokeFill(*lastTokenIds, 1, stream); @@ -423,23 +471,44 @@ RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, T kernels::invokeInclusiveSum(*lastTokenIds, *lastTokenIds, manager, stream); } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return nextInputIds; }; void RuntimeBuffers::getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, SizeType const step, - TensorPtr const& inputIds, KvCacheManager& kvCacheManager, GptModelConfig const& modelConfig) const + TensorPtr const& inputIds, KvCacheManager& kvCacheManager, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig) const { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); inputBuffers.clear(); outputBuffers.clear(); - outputBuffers.insert_or_assign("logits", ITensor::view(logits)); // feed a view to TensorRT runtime + if (worldConfig.isLastPipelineParallelRank()) + { + // feed a view to TensorRT runtime so reshaping does not change logits buffer + outputBuffers.insert_or_assign("logits", ITensor::view(logits)); + } + else + { + outputBuffers.insert_or_assign("hidden_states_output", hiddenStates); + } - inputBuffers.insert_or_assign("input_ids", inputIds); + if (worldConfig.isFirstPipelineParallelRank()) + { + inputBuffers.insert_or_assign("input_ids", inputIds); + } + else + { + inputBuffers.insert_or_assign("hidden_states_input", hiddenStates); + } inputBuffers.insert_or_assign("context_lengths", contextLengthsDevice); inputBuffers.insert_or_assign("last_token_ids", lastTokenIds); inputBuffers.insert_or_assign("position_ids", positionIds); + auto const localNbLayers = modelConfig.getNbLayers(worldConfig.getPipelineParallelism()); + auto const firstLayerId = worldConfig.getPipelineParallelRank() * localNbLayers; + if (modelConfig.useGptAttentionPlugin()) { inputBuffers.insert_or_assign("cache_indirection", cacheIndirectionDecoderOutput); @@ -453,14 +522,15 @@ void RuntimeBuffers::getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outpu } if (modelConfig.usePagedKvCache()) { - utils::insertTensorVector(inputBuffers, "past_key_value_", kvCacheManager.getMemoryPools()); - utils::insertTensorVector(outputBuffers, "present_key_value_", kvCacheManager.getMemoryPools()); - utils::insertTensorVector(inputBuffers, "kv_cache_block_pointers_", kvCacheBlockPointers); + utils::insertTensorVector(inputBuffers, "past_key_value_", kvCacheManager.getMemoryPools(), firstLayerId); + utils::insertTensorVector( + outputBuffers, "present_key_value_", kvCacheManager.getMemoryPools(), firstLayerId); + utils::insertTensorSlices(inputBuffers, "kv_cache_block_pointers_", kvCacheBlockPointers, firstLayerId); } else { - utils::insertTensorVector(inputBuffers, "past_key_value_", presentKeysVals); - utils::insertTensorVector(outputBuffers, "present_key_value_", presentKeysVals); + utils::insertTensorVector(inputBuffers, "past_key_value_", presentKeysVals, firstLayerId); + utils::insertTensorVector(outputBuffers, "present_key_value_", presentKeysVals, firstLayerId); } } else @@ -468,14 +538,14 @@ void RuntimeBuffers::getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outpu inputBuffers.insert_or_assign("attention_mask", attentionMask); inputBuffers.insert_or_assign("cache_indirection", cacheIndirectionDecoderOutput); utils::insertTensorVector( - outputBuffers, "present_key_value_", (step % 2) ? presentKeysValsAlt : presentKeysVals); + outputBuffers, "present_key_value_", (step % 2) ? 
presentKeysValsAlt : presentKeysVals, firstLayerId); if (step == 0) { auto kvCacheShape = presentKeysValsAlt.at(0)->getShape(); kvCacheShape.d[3] = 0; - for (SizeType i = 0; i < modelConfig.getNbLayers(); ++i) + for (SizeType i = firstLayerId; i < firstLayerId + localNbLayers; ++i) { std::string name = "past_key_value_" + std::to_string(i); TensorPtr tmp = ITensor::view(presentKeysValsAlt[i], kvCacheShape); @@ -485,7 +555,8 @@ void RuntimeBuffers::getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outpu else { utils::insertTensorVector( - inputBuffers, "past_key_value_", (step % 2) ? presentKeysVals : presentKeysValsAlt); + inputBuffers, "past_key_value_", (step % 2) ? presentKeysVals : presentKeysValsAlt, firstLayerId); } } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } diff --git a/cpp/tensorrt_llm/runtime/runtimeBuffers.h b/cpp/tensorrt_llm/runtime/runtimeBuffers.h index 16981c6c655..049b815dca5 100644 --- a/cpp/tensorrt_llm/runtime/runtimeBuffers.h +++ b/cpp/tensorrt_llm/runtime/runtimeBuffers.h @@ -19,6 +19,7 @@ #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/gptModelConfig.h" #include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/worldConfig.h" namespace tensorrt_llm::batch_manager::kv_cache_manager { @@ -40,7 +41,7 @@ class RuntimeBuffers // general TensorPtr contextLengthsHost; TensorPtr contextLengthsDevice; - TensorPtr inputOffsets; + TensorPtr inputOffsets; // helper for packed input // engine TensorPtr logits; @@ -49,16 +50,22 @@ class RuntimeBuffers TensorPtr attentionMask; // without attention plugin TensorPtr positionIds; TensorPtr lastTokenIds; - TensorPtr requestTypes; // with attention plugin and inflight batching. Host tensor + TensorPtr requestTypes; // with attention plugin. 
Host tensor std::vector presentKeysVals; std::vector presentKeysValsAlt; // without attention plugin - std::vector kvCacheBlockPointers; + TensorPtr kvCacheBlockPointers; // [numLayers, batchSize * beamWidth, 2, maxBlocksPerSeq * 2] // beam search (shared between engine and decoder) TensorPtr cacheIndirectionDecoderInput; TensorPtr cacheIndirectionDecoderOutput; + // decoder + TensorPtr shouldStop; + + // pipeline parallelism + TensorPtr hiddenStates; + bool allocated{false}; public: @@ -91,24 +98,28 @@ class RuntimeBuffers public: void clear(); - void create(TllmRuntime& runtime, GptModelConfig const& modelConfig); + void create(TllmRuntime& runtime, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - void reshape(GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, SizeType worldSize); + void reshape( + GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - void postContextStep( - BufferManager& manager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig); + void postContextStep(BufferManager& manager, GenerationConfig const& generationConfig, + GptModelConfig const& modelConfig, WorldConfig const& worldConfig); void prepareContextStep(TensorPtr const& inputIds, TokenIdType padId, BufferManager& manager, - KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig); + KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig); TensorPtr prepareNextStep(SizeType step, TensorPtr const& outputIds, BufferManager& manager, - KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig); + KvCacheManager& kvCacheManager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig); void getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, SizeType step, TensorPtr const& inputIds, - KvCacheManager& kvCacheManager, GptModelConfig const& modelConfig) const; + KvCacheManager& kvCacheManager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) const; private: // Some tensors are properly tiled, some are just reshaped. 
- void tile(BufferManager& manager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig); + void tile(BufferManager& manager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig); }; } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/runtimeKernels.cu b/cpp/tensorrt_llm/runtime/runtimeKernels.cu index a70ad011b1f..892385e8749 100644 --- a/cpp/tensorrt_llm/runtime/runtimeKernels.cu +++ b/cpp/tensorrt_llm/runtime/runtimeKernels.cu @@ -35,9 +35,10 @@ namespace template __global__ void fill(T* data, std::size_t size, T const value) { - auto const idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const tidx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const stride = static_cast(blockDim.x) * gridDim.x; - if (idx < size) + for (auto idx = tidx; idx < size; idx += stride) { data[idx] = value; } @@ -49,14 +50,17 @@ void invokeFill(IBuffer& buffer, T const value, CudaStream const& stream) { auto data = bufferCast(buffer); auto const size = buffer.getSize(); - dim3 const blockSize(256); - dim3 const gridSize((size + blockSize.x - 1) / blockSize.x); + dim3 const blockSize{256}; + std::size_t const gridx{tc::ceilDiv(size, blockSize.x)}; + std::size_t const gridMax{std::numeric_limits::max()}; + dim3 const gridSize{static_cast(std::min(gridx, gridMax))}; fill<<>>(data, size, value); } // template instantiation -template void invokeFill(IBuffer&, SizeType, CudaStream const&); +template void invokeFill(IBuffer&, std::int32_t, CudaStream const&); +template void invokeFill(IBuffer&, std::int8_t, CudaStream const&); template void invokeFill(IBuffer&, float, CudaStream const&); namespace @@ -64,9 +68,10 @@ namespace template __global__ void add(T* data, std::size_t size, T const value) { - auto const idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const tidx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const stride = static_cast(blockDim.x) * gridDim.x; - if (idx < size) + for (auto idx = tidx; idx < size; idx += stride) { data[idx] += value; } @@ -78,13 +83,17 @@ void invokeAdd(IBuffer& buffer, T const value, CudaStream const& stream) { auto data = bufferCast(buffer); auto const size = buffer.getSize(); - dim3 const blockSize(256); - dim3 const gridSize((size + blockSize.x - 1) / blockSize.x); + dim3 const blockSize{256}; + std::size_t const gridx{tc::ceilDiv(size, blockSize.x)}; + std::size_t const gridMax{std::numeric_limits::max()}; + dim3 const gridSize{static_cast(std::min(gridx, gridMax))}; add<<>>(data, size, value); } -template void invokeAdd(IBuffer&, SizeType, CudaStream const&); +template void invokeAdd(IBuffer&, std::int32_t, CudaStream const&); +template void invokeAdd(IBuffer&, std::int8_t, CudaStream const&); +template void invokeAdd(IBuffer&, float, CudaStream const&); namespace { @@ -572,19 +581,21 @@ void invokeCopyPackedInputToOutput(ITensor& outputIds, ITensor const& inputIds, namespace { template -__global__ void scatterTensor(T* output, T const* input, SizeType const batchSize, SizeType const inputRowSize, - SizeType const outputRowSize, SizeType const beamWidth) +__global__ void scatterTensor(T* output, T const* input, std::uint32_t const batchSize, + std::uint32_t const inputRowSize, std::size_t const outputRowSize, std::uint32_t const beamWidth) { - SizeType const tidx = blockIdx.x * blockDim.x + threadIdx.x; - SizeType const tidy = blockIdx.y * blockDim.y + threadIdx.y; + auto const tidx = static_cast(blockIdx.x) * 
blockDim.x + threadIdx.x; + auto const tidy = static_cast(blockIdx.y) * blockDim.y + threadIdx.y; + auto const stridex = static_cast(blockDim.x) * gridDim.x; + auto const stridey = static_cast(blockDim.y) * gridDim.y; - for (SizeType batchIdx = tidy; batchIdx < batchSize; batchIdx += blockDim.y * gridDim.y) + for (auto batchIdx = tidy; batchIdx < batchSize; batchIdx += stridey) { - for (SizeType columnIdx = tidx; columnIdx < inputRowSize; columnIdx += blockDim.x * gridDim.x) + for (auto columnIdx = tidx; columnIdx < inputRowSize; columnIdx += stridex) { auto const inputIdx = batchIdx * inputRowSize + columnIdx; auto const value = input[inputIdx]; - SizeType constexpr beamIdx = 0; + std::size_t constexpr beamIdx{0}; auto const outputIdx = (batchIdx * beamWidth + beamIdx) * outputRowSize + columnIdx; output[outputIdx] = value; } @@ -592,19 +603,21 @@ __global__ void scatterTensor(T* output, T const* input, SizeType const batchSiz } template -__global__ void tileTensor(T* output, T const* input, SizeType const batchSize, SizeType const inputRowSize, - SizeType const outputRowSize, SizeType const beamWidth) +__global__ void tileTensor(T* output, T const* input, std::uint32_t const batchSize, std::size_t const inputRowSize, + std::size_t const outputRowSize, std::uint32_t const beamWidth) { - SizeType const tidx = blockIdx.x * blockDim.x + threadIdx.x; - SizeType const tidy = blockIdx.y * blockDim.y + threadIdx.y; + auto const tidx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const tidy = static_cast(blockIdx.y) * blockDim.y + threadIdx.y; + auto const stridex = static_cast(blockDim.x) * gridDim.x; + auto const stridey = static_cast(blockDim.y) * gridDim.y; - for (SizeType batchIdx = tidy; batchIdx < batchSize; batchIdx += blockDim.y * gridDim.y) + for (auto batchIdx = tidy; batchIdx < batchSize; batchIdx += stridey) { - for (SizeType columnIdx = tidx; columnIdx < inputRowSize; columnIdx += blockDim.x * gridDim.x) + for (auto columnIdx = tidx; columnIdx < inputRowSize; columnIdx += stridex) { auto const inputIdx = batchIdx * inputRowSize + columnIdx; auto const value = input[inputIdx]; - for (SizeType beamIdx = 0; beamIdx < beamWidth; ++beamIdx) + for (std::size_t beamIdx = 0; beamIdx < beamWidth; ++beamIdx) { auto const outputIdx = (batchIdx * beamWidth + beamIdx) * outputRowSize + columnIdx; output[outputIdx] = value; @@ -615,18 +628,20 @@ __global__ void tileTensor(T* output, T const* input, SizeType const batchSize, template __global__ void tileTensorInPlace( - T* inputOutput, SizeType const batchSize, SizeType const inputOutputRowSize, SizeType const beamWidth) + T* inputOutput, std::uint32_t const batchSize, std::size_t const inputOutputRowSize, std::uint32_t const beamWidth) { - SizeType const tidx = blockIdx.x * blockDim.x + threadIdx.x; - SizeType const tidy = blockIdx.y * blockDim.y + threadIdx.y; + auto const tidx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const tidy = static_cast(blockIdx.y) * blockDim.y + threadIdx.y; + auto const stridex = static_cast(blockDim.x) * gridDim.x; + auto const stridey = static_cast(blockDim.y) * gridDim.y; - for (SizeType batchIdx = tidy; batchIdx < batchSize; batchIdx += blockDim.y * gridDim.y) + for (auto batchIdx = tidy; batchIdx < batchSize; batchIdx += stridey) { - for (SizeType columnIdx = tidx; columnIdx < inputOutputRowSize; columnIdx += blockDim.x * gridDim.x) + for (auto columnIdx = tidx; columnIdx < inputOutputRowSize; columnIdx += stridex) { auto const inputIdx = (batchIdx * beamWidth + 0) * 
inputOutputRowSize + columnIdx; auto const value = inputOutput[inputIdx]; - for (SizeType beamIdx = 1; beamIdx < beamWidth; ++beamIdx) + for (std::size_t beamIdx = 1; beamIdx < beamWidth; ++beamIdx) { auto const outputIdx = (batchIdx * beamWidth + beamIdx) * inputOutputRowSize + columnIdx; inputOutput[outputIdx] = value; @@ -641,22 +656,24 @@ template void invokeScatterTensor(ITensor& output, ITensor const& input, SizeType beamWidth, CudaStream const& stream) { auto const& inputShape = input.getShape(); - auto const nbInputRows = inputShape.d[0]; - auto const inputRowSize = static_cast(input.getSize()) / nbInputRows; + auto const nbInputRows = static_cast(inputShape.d[0]); + auto const inputRowSize = input.getSize() / static_cast(nbInputRows); auto const& outputShape = output.getShape(); - auto const nbOutputRows = outputShape.d[0]; - auto const outputRowSize = static_cast(output.getSize()) / nbOutputRows; + auto const nbOutputRows = static_cast(outputShape.d[0]); + auto const outputRowSize = output.getSize() / static_cast(nbOutputRows); TLLM_CHECK_WITH_INFO(nbOutputRows == beamWidth * nbInputRows, common::fmtstr( "nbOutputRows (%d) must be beamWidth (%d) times nbInputRows (%d)", nbOutputRows, beamWidth, nbInputRows)); TLLM_CHECK_WITH_INFO(outputRowSize >= inputRowSize, - common::fmtstr("output row size (%d) must be at least input row size (%d)", outputRowSize, inputRowSize)); + common::fmtstr("output row size (%ld) must be at least input row size (%ld)", outputRowSize, inputRowSize)); - dim3 const blockSize(256, 1); - dim3 const gridSize((inputRowSize + blockSize.x - 1) / blockSize.x, nbInputRows); - scatterTensor<<>>( - bufferCast(output), bufferCast(input), nbInputRows, inputRowSize, outputRowSize, beamWidth); + dim3 const blockSize{256, 1}; + std::size_t const gridx{tc::ceilDiv(inputRowSize, blockSize.x)}; + std::size_t const gridMax{std::numeric_limits::max()}; + dim3 const gridSize{static_cast(std::min(gridx, gridMax)), nbInputRows}; + scatterTensor<<>>(bufferCast(output), bufferCast(input), + nbInputRows, inputRowSize, outputRowSize, static_cast(beamWidth)); } void scatterTensor(ITensor& output, ITensor const& input, SizeType beamWidth, CudaStream const& stream) @@ -676,22 +693,24 @@ template void invokeTileTensor(ITensor& output, ITensor const& input, SizeType const beamWidth, CudaStream const& stream) { auto const& inputShape = input.getShape(); - auto const nbInputRows = inputShape.d[0]; - auto const inputRowSize = static_cast(input.getSize()) / nbInputRows; + auto const nbInputRows = static_cast(inputShape.d[0]); + auto const inputRowSize = input.getSize() / static_cast(nbInputRows); auto const& outputShape = output.getShape(); - auto const nbOutputRows = outputShape.d[0]; - auto const outputRowSize = static_cast(output.getSize()) / nbOutputRows; + auto const nbOutputRows = static_cast(outputShape.d[0]); + auto const outputRowSize = output.getSize() / static_cast(nbOutputRows); TLLM_CHECK_WITH_INFO(nbOutputRows == beamWidth * nbInputRows, common::fmtstr( "nbOutputRows (%d) must be beamWidth (%d) times nbInputRows (%d)", nbOutputRows, beamWidth, nbInputRows)); TLLM_CHECK_WITH_INFO(outputRowSize >= inputRowSize, - common::fmtstr("output row size (%d) must be at least input row size (%d)", outputRowSize, inputRowSize)); + common::fmtstr("output row size (%ld) must be at least input row size (%ld)", outputRowSize, inputRowSize)); - dim3 const blockSize(256, 1); - dim3 const gridSize((inputRowSize + blockSize.x - 1) / blockSize.x, nbInputRows); - tileTensor<<>>( - bufferCast(output), 
bufferCast(input), nbInputRows, inputRowSize, outputRowSize, beamWidth); + dim3 const blockSize{256, 1}; + std::size_t const gridx{tc::ceilDiv(inputRowSize, blockSize.x)}; + std::size_t const gridMax{std::numeric_limits::max()}; + dim3 const gridSize{static_cast(std::min(gridx, gridMax)), nbInputRows}; + tileTensor<<>>(bufferCast(output), bufferCast(input), nbInputRows, + inputRowSize, outputRowSize, static_cast(beamWidth)); } void tileTensor(ITensor& output, ITensor const& input, SizeType beamWidth, CudaStream const& stream) @@ -711,14 +730,16 @@ template void invokeTileTensorInPlace(ITensor& inputOutput, SizeType const beamWidth, CudaStream const& stream) { auto const& inputOutputShape = inputOutput.getShape(); - auto const nbOutputRows = inputOutputShape.d[0]; - auto const nbInputRows = nbOutputRows / beamWidth; - auto const inputOutputRowSize = static_cast(inputOutput.getSize()) / nbOutputRows; - - dim3 const blockSize(256, 1); - dim3 const gridSize((inputOutputRowSize + blockSize.x - 1) / blockSize.x, nbInputRows); + auto const nbOutputRows = static_cast(inputOutputShape.d[0]); + auto const nbInputRows = nbOutputRows / static_cast(beamWidth); + auto const inputOutputRowSize = inputOutput.getSize() / static_cast(nbOutputRows); + + dim3 const blockSize{256, 1}; + std::size_t const gridx{tc::ceilDiv(inputOutputRowSize, blockSize.x)}; + std::size_t const gridMax{std::numeric_limits::max()}; + dim3 const gridSize{static_cast(std::min(gridx, gridMax)), nbInputRows}; tileTensorInPlace<<>>( - bufferCast(inputOutput), nbInputRows, inputOutputRowSize, beamWidth); + bufferCast(inputOutput), nbInputRows, inputOutputRowSize, static_cast(beamWidth)); } void tileTensorInplace(ITensor& tensor, SizeType beamWidth, CudaStream const& stream) diff --git a/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp b/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp index 0a35c93fbaf..dcdbe62b8ec 100644 --- a/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp +++ b/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp @@ -16,12 +16,16 @@ #include "tensorrt_llm/runtime/statefulGptDecoder.h" +#include +#include + #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/runtime/runtimeKernels.h" namespace tc = tensorrt_llm::common; using namespace tensorrt_llm::runtime; + using TensorPtr = ITensor::SharedPtr; StatefulGptDecoder::StatefulGptDecoder(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream) @@ -30,6 +34,7 @@ StatefulGptDecoder::StatefulGptDecoder(std::size_t vocabSize, std::size_t vocabS , mStream{std::move(stream)} , mBufferManager{mStream} { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto constexpr nvTokenIdType = TRTDataType::value; auto constexpr nvSizeType = TRTDataType::value; auto constexpr nvFloatType = TRTDataType::value; @@ -53,18 +58,22 @@ StatefulGptDecoder::StatefulGptDecoder(std::size_t vocabSize, std::size_t vocabS dOutput->lengths = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); dOutput->cumLogProbs = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); dOutput->beamHypotheses.empty(mBufferManager); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void StatefulGptDecoder::setup( SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); mDecoder = IGptDecoder::create(dtype, mVocabSize, mVocabSizePadded, mStream); reshapeBuffers(maxBatchSize, maxBeamWidth, maxSequenceLength); + TLLM_LOG_DEBUG("%s stop", 
__PRETTY_FUNCTION__); } void StatefulGptDecoder::reshapeBuffers(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); TLLM_CHECK(batchSize > 0); TLLM_CHECK(beamWidth > 0); TLLM_CHECK(maxSequenceLength > 0); @@ -93,13 +102,11 @@ void StatefulGptDecoder::reshapeBuffers(SizeType batchSize, SizeType beamWidth, dOutput.finished->reshape(batchSizeXbeamWidth); mBufferManager.setZero(*dOutput.finished); mBufferManager.setZero(*dOutput.finishedSum); - dOutput.lengths->reshape(batchSizeXbeamWidth); - mBufferManager.setZero(*dOutput.lengths); - dOutput.cumLogProbs->reshape(batchSizeXbeamWidth); - mBufferManager.setZero(*dOutput.cumLogProbs); if (beamWidth > 1) { + dOutput.cumLogProbs->reshape(batchSizeXbeamWidth); + mBufferManager.setZero(*dOutput.cumLogProbs); dOutput.beamHypotheses.reshape(batchSize, beamWidth, mMaxSequenceLength); } else @@ -111,6 +118,7 @@ void StatefulGptDecoder::reshapeBuffers(SizeType batchSize, SizeType beamWidth, mNbSteps = 0; mFinished.clear(); mFinished.resize(batchSize, true); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } namespace @@ -119,6 +127,7 @@ void initOutputIds(TensorPtr const& outputIds, TensorPtr const& inputIds, Tensor TensorPtr const& inputOffsets, SizeType const padId, SizeType const endId, SizeType const maxInputLength, bool const inputPacked, CudaStream const& stream) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); kernels::invokeFill(*outputIds, endId, stream); if (inputPacked) @@ -129,11 +138,13 @@ void initOutputIds(TensorPtr const& outputIds, TensorPtr const& inputIds, Tensor { kernels::invokeCopyInputToOutput(*outputIds, *inputIds, *inputLengths, padId, stream); } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } } // namespace void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig const& samplingConfig) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& manager = mBufferManager; auto& stream = mStream; @@ -155,7 +166,7 @@ void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig auto const& inputIds = inputs.ids; auto const inputLengthsHost = manager.copyFrom(*inputLengths, MemoryType::kCPU); auto const* inputLengthsData = bufferCast(*inputLengthsHost); - auto const maxInputLength = *std::max_element(inputLengthsData, inputLengthsData + inputLengths->getSize()); + SizeType const maxInputLength = *std::max_element(inputLengthsData, inputLengthsData + inputLengths->getSize()); TensorPtr inputOffsets = manager.emptyTensor(MemoryType::kGPU, TRTDataType::value); if (inputs.packed) @@ -191,17 +202,17 @@ void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig manager.setZero(*dOutput.newTokens); manager.setZero(*dOutput.finished); manager.setZero(*dOutput.finishedSum); - kernels::invokeFill(*dOutput.lengths, maxInputLength, *stream); - std::vector cumLogProbsHost(batchSize * beamWidth, DecodingOutput::kNegativeInfinity); - // Set the entries for the first beam to 0 - for (SizeType i = 0; i < batchSize; ++i) - { - cumLogProbsHost[tc::flat_index2(i, 0, beamWidth)] = 0; - } - manager.copy(cumLogProbsHost.data(), *dOutput.cumLogProbs); if (beamWidth > 1) { + std::vector cumLogProbsHost(batchSize * beamWidth, DecodingOutput::kNegativeInfinity); + // Set the entries for the first beam to 0 + for (SizeType i = 0; i < batchSize; ++i) + { + cumLogProbsHost[tc::flat_index2(i, 0, beamWidth)] = 0; + } + manager.copy(cumLogProbsHost.data(), *dOutput.cumLogProbs); + // kernels::invokeFill(*dOutput.cumLogProbs, 
DecodingOutput::kNegativeInfinity, *stream); // for (SizeType batchIdx = 0; batchIdx < batchSize; ++batchIdx) // { @@ -225,10 +236,12 @@ void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig mNbSteps = 0; mFinished.clear(); mFinished.resize(batchSize, false); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } bool StatefulGptDecoder::forward(decoder::Output& output, decoder::Input const& input) { + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& logits = input.logits; auto const& logitsShape = logits->getShape(); @@ -245,6 +258,7 @@ bool StatefulGptDecoder::forward(decoder::Output& output, decoder::Input const& "Specify both srcCacheIndirection and tgtCacheIndirection or neither."); TLLM_CHECK(!srcCacheIndirection || srcCacheIndirection->getDataType() == TRTDataType::value); TLLM_CHECK(!tgtCacheIndirection || tgtCacheIndirection->getDataType() == TRTDataType::value); + auto& sequenceLengths = output.sequenceLengths; auto& stream = mStream; auto& dInput = *mDecodingInput; @@ -255,6 +269,7 @@ bool StatefulGptDecoder::forward(decoder::Output& output, decoder::Input const& dInput.cacheIndirection = srcCacheIndirection; dOutput.cacheIndirection = tgtCacheIndirection; } + dOutput.lengths = sequenceLengths; auto& decoder = *mDecoder; decoder.forwardAsync(dOutput, dInput); @@ -269,15 +284,18 @@ bool StatefulGptDecoder::forward(decoder::Output& output, decoder::Input const& || *bufferCast(*dOutput.finishedSum) == static_cast(dOutput.finished->getSize()); std::fill(mFinished.begin(), mFinished.end(), finished); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return finished; } IStatefulGptDecoder::TensorPtr StatefulGptDecoder::getFinalOutputIds() const { // TODO (rkobus) can we do this inplace? + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& outputIds = mDecodingOutput->ids; auto finalOutputIds = mBufferManager.gpu(outputIds->getShape(), outputIds->getDataType()); IGptDecoder::gatherTree(*finalOutputIds, *mDecodingOutput, *mDecodingInput, mBufferManager); mBufferManager.copy(*finalOutputIds, *outputIds); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return outputIds; } diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp index de5e2b8b561..9b7c51284de 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp @@ -15,6 +15,7 @@ */ #include "tllmRuntime.h" #include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/nvtxUtils.h" #include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/tensor.h" #include "tllmBuffers.h" @@ -99,20 +100,24 @@ void TllmRuntime::clearContexts() mContexts.clear(); } -bool TllmRuntime::executeContext(SizeType contextIndex) +bool TllmRuntime::executeContext(SizeType contextIndex) const { + NVTX3_FUNC_RANGE(); auto& context = getContext(contextIndex); return context.enqueueV3(mStream->get()); } void TllmRuntime::setInputTensors(SizeType contextIndex, TensorMap const& tensorMap) { + NVTX3_FUNC_RANGE(); + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& context = getContext(contextIndex); for (std::int32_t i = 0; i < mEngine->getNbIOTensors(); ++i) { auto const name = mEngine->getIOTensorName(i); if (mEngine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kINPUT) { + NVTX3_SCOPED_RANGE(input_tensor); auto pos = tensorMap.find(name); if (pos == tensorMap.end()) { @@ -126,14 +131,14 @@ void TllmRuntime::setInputTensors(SizeType contextIndex, TensorMap const& tensor auto const shapeProvided = 
tensor->getShape(); TLLM_CHECK_WITH_INFO(shapeExpected.nbDims == shapeProvided.nbDims, tc::fmtstr("%s: expected %d dims, provided %d dims", name, shapeExpected.nbDims, shapeProvided.nbDims)); - for (SizeType i = 0; i < shapeExpected.nbDims; ++i) + for (SizeType j = 0; j < shapeExpected.nbDims; ++j) { - auto const dimExpected = shapeExpected.d[i]; - auto const dimProvided = shapeProvided.d[i]; + auto const dimExpected = shapeExpected.d[j]; + auto const dimProvided = shapeProvided.d[j]; if (dimExpected >= 0 && dimExpected != dimProvided) { TLLM_LOG_WARNING( - "%s: expected dim[%d] = %d, provided dim[%d] = %d", name, i, dimExpected, i, dimProvided); + "%s: expected dim[%d] = %d, provided dim[%d] = %d", name, j, dimExpected, j, dimProvided); } } TLLM_CHECK_WITH_INFO(context.setInputShape(name, shapeProvided), name); @@ -155,30 +160,37 @@ void TllmRuntime::setInputTensors(SizeType contextIndex, TensorMap const& tensor } } - char const* missing; - auto const nbMissing = context.inferShapes(1, &missing); - if (nbMissing > 0) { - TLLM_THROW("Input shape not specified: %s", missing); + NVTX3_SCOPED_RANGE(infer_shapes); + char const* missing; + auto const nbMissing = context.inferShapes(1, &missing); + if (nbMissing > 0) + { + TLLM_THROW("Input shape not specified: %s", missing); + } + else if (nbMissing < 0) + { + TLLM_THROW("Invalid input shape"); + } } - else if (nbMissing < 0) + { - TLLM_THROW("Invalid input shape"); + NVTX3_SCOPED_RANGE(final_checks); + TLLM_CHECK_WITH_INFO(context.allInputDimensionsSpecified(), "Input dimensions not specified"); + TLLM_CHECK_WITH_INFO(context.allInputShapesSpecified(), "Input shapes not specified"); } - - TLLM_CHECK_WITH_INFO(context.allInputDimensionsSpecified(), "Input dimensions not specified"); - TLLM_CHECK_WITH_INFO(context.allInputShapesSpecified(), "Input shapes not specified"); } void TllmRuntime::setOutputTensors(SizeType contextIndex, TensorMap& tensorMap) { - + NVTX3_FUNC_RANGE(); auto& context = getContext(contextIndex); for (std::int32_t i = 0; i < mEngine->getNbIOTensors(); ++i) { auto const name = mEngine->getIOTensorName(i); if (mEngine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kOUTPUT) { + NVTX3_SCOPED_RANGE(output_tensor); auto const dims = context.getTensorShape(name); auto const type = mEngine->getTensorDataType(name); auto pos = tensorMap.find(name); diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.h b/cpp/tensorrt_llm/runtime/tllmRuntime.h index c6de6edb1a4..0a1e438445f 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.h +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.h @@ -50,7 +50,7 @@ class TllmRuntime return static_cast(mContexts.size()); } - nvinfer1::IExecutionContext& getContext(SizeType contextIndex) + nvinfer1::IExecutionContext& getContext(SizeType contextIndex) const { return *mContexts.at(contextIndex); } @@ -68,7 +68,7 @@ class TllmRuntime void setOutputTensors(SizeType contextIndex, TensorMap& tensorMap); - bool executeContext(SizeType contextIndex); + bool executeContext(SizeType contextIndex) const; CudaStream const& getStream() const; diff --git a/cpp/tensorrt_llm/runtime/torchView.h b/cpp/tensorrt_llm/runtime/torchView.h index f10555774cb..93d6cdbf491 100644 --- a/cpp/tensorrt_llm/runtime/torchView.h +++ b/cpp/tensorrt_llm/runtime/torchView.h @@ -18,6 +18,7 @@ #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/torchUtils.h" #include #include diff --git a/cpp/tensorrt_llm/runtime/utils/multiDeviceUtils.h b/cpp/tensorrt_llm/runtime/utils/multiDeviceUtils.h 
new file mode 100644 index 00000000000..1acd2e53091 --- /dev/null +++ b/cpp/tensorrt_llm/runtime/utils/multiDeviceUtils.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/common/stringUtils.h" + +#include + +#if ENABLE_MULTI_DEVICE +#include +#endif // ENABLE_MULTI_DEVICE + +#define TLLM_MPI_CHECK(cmd, logger) \ + do \ + { \ + auto e = cmd; \ + if (e != MPI_SUCCESS) \ + { \ + logger.log(nvinfer1::ILogger::Severity::kERROR, \ + tensorrt_llm::common::fmtstr("Failed: MPI error %s:%d '%d'", __FILE__, __LINE__, e).c_str()); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#if ENABLE_MULTI_DEVICE +#define TLLM_NCCL_CHECK(cmd, logger) \ + do \ + { \ + ncclResult_t r = cmd; \ + if (r != ncclSuccess) \ + { \ + logger.log(nvinfer1::ILogger::Severity::kERROR, \ + tensorrt_llm::common::fmtstr( \ + "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(r)) \ + .c_str()); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#endif // ENABLE_MULTI_DEVICE diff --git a/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp b/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp index 984ed057745..c7884b1cfc8 100644 --- a/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp +++ b/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp @@ -50,29 +50,18 @@ std::vector loadEngine(std::string const& enginePath) return engineBlob; } -void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec) -{ - for (std::size_t i = 0; i < vec.size(); ++i) - map.insert_or_assign(key + std::to_string(i), vec[i]); -} - -nvinfer1::DataType getTensorDataType(nvinfer1::ICudaEngine const& engine, std::string const& name) -{ - return engine.getTensorDataType(name.c_str()); -} - -std::vector createBufferVector( - TllmRuntime const& runtime, SizeType const numBuffers, std::string const& prefix, MemoryType memType) +std::vector createBufferVector(TllmRuntime const& runtime, SizeType const indexOffset, + SizeType const numBuffers, std::string const& prefix, MemoryType memType) { auto const& manager = runtime.getBufferManager(); auto const& engine = runtime.getEngine(); std::vector vector; - for (SizeType i = 0; i < numBuffers; ++i) + for (SizeType i = indexOffset; i < indexOffset + numBuffers; ++i) { std::string name{prefix + std::to_string(i)}; - auto type = getTensorDataType(engine, name); + auto type = engine.getTensorDataType(name.c_str()); vector.emplace_back(manager.emptyTensor(memType, type)); } return vector; @@ -86,6 +75,25 @@ void reshapeBufferVector(std::vector& vector, nvinfer1::Dims } } +void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec, + SizeType const indexOffset) +{ + for (std::size_t i = 0; i < vec.size(); ++i) + map.insert_or_assign(key + std::to_string(indexOffset + i), vec[i]); +} + +void insertTensorSlices( + StringPtrMap& map, std::string const& key, ITensor::SharedPtr const& tensor, SizeType const indexOffset) +{ + 
auto const numSlices = tensor->getShape().d[0]; + for (SizeType i = 0; i < numSlices; ++i) + { + ITensor::SharedPtr slice = ITensor::slice(tensor, i, 1); + slice->squeeze(0); + map.insert_or_assign(key + std::to_string(indexOffset + i), slice); + } +} + void setRawPointers(ITensor& pointers, ITensor::SharedPtr const& input, int32_t pointersSlot, int32_t inputSlot) { auto const pointersLength = static_cast(pointers.getSizeInBytes() / sizeof(void**)); diff --git a/cpp/tensorrt_llm/runtime/utils/sessionUtils.h b/cpp/tensorrt_llm/runtime/utils/sessionUtils.h index 48b42c322b1..538d56edd22 100644 --- a/cpp/tensorrt_llm/runtime/utils/sessionUtils.h +++ b/cpp/tensorrt_llm/runtime/utils/sessionUtils.h @@ -37,14 +37,16 @@ int initDevice(WorldConfig const& worldConfig); std::vector loadEngine(std::string const& enginePath); -void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec); +std::vector createBufferVector(TllmRuntime const& runtime, SizeType indexOffset, + SizeType numBuffers, std::string const& prefix, MemoryType memType); -nvinfer1::DataType getTensorDataType(nvinfer1::ICudaEngine const& engine, std::string const& name); +void reshapeBufferVector(std::vector& vector, nvinfer1::Dims const& shape); -std::vector createBufferVector( - TllmRuntime const& runtime, SizeType const numBuffers, std::string const& prefix, MemoryType memType); +void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec, + SizeType indexOffset); -void reshapeBufferVector(std::vector& vector, nvinfer1::Dims const& shape); +void insertTensorSlices( + StringPtrMap& map, std::string const& key, ITensor::SharedPtr const& tensor, SizeType indexOffset); void setRawPointers(ITensor& pointers, ITensor::SharedPtr const& input, int32_t pointersSlot, int32_t inputSlot); diff --git a/cpp/tensorrt_llm/runtime/worldConfig.cpp b/cpp/tensorrt_llm/runtime/worldConfig.cpp index e0523195c80..0ca547e9381 100644 --- a/cpp/tensorrt_llm/runtime/worldConfig.cpp +++ b/cpp/tensorrt_llm/runtime/worldConfig.cpp @@ -19,6 +19,7 @@ #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/runtime/tllmLogger.h" +#include "tensorrt_llm/runtime/utils/multiDeviceUtils.h" #include #include @@ -26,18 +27,6 @@ using namespace tensorrt_llm::runtime; namespace tc = tensorrt_llm::common; -#define TLLM_MPI_CHECK(cmd, logger) \ - do \ - { \ - auto e = cmd; \ - if (e != MPI_SUCCESS) \ - { \ - logger.log(nvinfer1::ILogger::Severity::kERROR, \ - tc::fmtstr("Failed: MPI error %s:%d '%d'", __FILE__, __LINE__, e).c_str()); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - namespace { @@ -67,7 +56,8 @@ void initMpi(nvinfer1::ILogger& logger, int threadMode = MPI_THREAD_FUNNELED) } // namespace -WorldConfig WorldConfig::mpi(nvinfer1::ILogger& logger, SizeType gpusPerNode) +WorldConfig WorldConfig::mpi(nvinfer1::ILogger& logger, SizeType gpusPerNode, std::optional tensorParallelism, + std::optional pipelineParallelism) { initMpi(logger); @@ -75,11 +65,27 @@ WorldConfig WorldConfig::mpi(nvinfer1::ILogger& logger, SizeType gpusPerNode) TLLM_MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &mpiSize), logger); TLLM_MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank), logger); logger.log(nvinfer1::ILogger::Severity::kINFO, tc::fmtstr("MPI size: %d, rank: %d", mpiSize, mpiRank).c_str()); - return WorldConfig{mpiSize, mpiRank, gpusPerNode}; + + auto pp = pipelineParallelism.value_or(1); + auto tp = tensorParallelism.value_or(mpiSize / pp); + TLLM_CHECK(mpiSize == tp * pp); + 
return WorldConfig{tp, pp, mpiRank, gpusPerNode}; } -WorldConfig WorldConfig::mpi(SizeType gpusPerNode) +WorldConfig WorldConfig::mpi( + SizeType gpusPerNode, std::optional tensorParallelism, std::optional pipelineParallelism) { TllmLogger logger{}; - return mpi(logger, gpusPerNode); + return mpi(logger, gpusPerNode, tensorParallelism, pipelineParallelism); +} + +std::vector WorldConfig::getPipelineParallelGroup() const +{ + std::vector group; + auto const groupIdx = getTensorParallelRank(); + for (SizeType i = 0; i < getPipelineParallelism(); ++i) + { + group.push_back(groupIdx + i * getTensorParallelism()); + } + return group; } diff --git a/cpp/tensorrt_llm/thop/CMakeLists.txt b/cpp/tensorrt_llm/thop/CMakeLists.txt index 2bbb7f72ba2..065b9b40617 100644 --- a/cpp/tensorrt_llm/thop/CMakeLists.txt +++ b/cpp/tensorrt_llm/thop/CMakeLists.txt @@ -15,12 +15,12 @@ add_library(th_utils STATIC thUtils.cu torchAllocator.cpp) set_property(TARGET th_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET th_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(th_utils PUBLIC ${TORCH_LIBRARIES} -lcublas -lcudart - -lcurand) +target_link_libraries(th_utils PUBLIC ${TORCH_LIBRARIES} ${CUBLAS_LIB} + ${CURAND_LIB}) add_library(th_common SHARED dynamicDecodeOp.cpp weightOnlyQuantOp.cpp gatherTreeOp.cpp fp8Op.cpp) set_property(TARGET th_common PROPERTY POSITION_INDEPENDENT_CODE ON) target_link_libraries( th_common PRIVATE ${TORCH_LIBRARIES} th_utils ${Python3_LIBRARIES} - ${STATIC_TARGET} "-Wl,--no-undefined") + ${STATIC_TARGET} ${UNDEFINED_FLAG}) diff --git a/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp b/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp index e6e35180005..75d168368a9 100644 --- a/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp +++ b/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp @@ -139,7 +139,7 @@ void FtDynamicDecode::setup(size_t batch_size, size_t beam_width, th::optiona template void FtDynamicDecode::forward(th::Tensor& logits, // (batch_size, beam_width, hidden_size) - int step, int max_input_length, uint ite, int local_batch_size, th::Tensor end_id, + int step, int max_input_length, uint64_t ite, int local_batch_size, th::Tensor end_id, th::optional embedding_bias_opt, th::optional input_lengths_opt, th::optional sequence_limit_length_opt, th::optional stop_words_list_opt, th::optional bad_words_list_opt, th::optional no_repeat_ngram_size_opt, @@ -342,7 +342,7 @@ th::Tensor DynamicDecodeOp::forward(th::Tensor logits, int64_t step, int64_t max dynamic_decode_->forward( // Inputs - logits, static_cast(step), static_cast(max_input_length), static_cast(ite), + logits, static_cast(step), static_cast(max_input_length), static_cast(ite), static_cast(local_batch_size), end_id, embedding_bias_opt, input_lengths_opt, sequence_limit_length_opt, stop_words_list_opt, bad_words_list_opt, no_repeat_ngram_size_opt, src_cache_indirection_opt, // Outputs diff --git a/cpp/tensorrt_llm/thop/dynamicDecodeOp.h b/cpp/tensorrt_llm/thop/dynamicDecodeOp.h index 29cc97ee31c..d451ffadbb0 100644 --- a/cpp/tensorrt_llm/thop/dynamicDecodeOp.h +++ b/cpp/tensorrt_llm/thop/dynamicDecodeOp.h @@ -38,7 +38,7 @@ class IFtDynamicDecode = 0; virtual void forward(th::Tensor& logits, // (batch_size, beam_width, hidden_size) - int step, int max_input_length, uint ite, int local_batch_size, th::Tensor end_id, + int step, int max_input_length, uint64_t ite, int local_batch_size, th::Tensor end_id, th::optional embedding_bias_opt, th::optional input_lengths_opt, th::optional sequence_limit_length_opt, th::optional 
stop_words_list_opt, th::optional bad_words_list_opt, th::optional no_repeat_ngram_size_opt, @@ -76,7 +76,7 @@ class FtDynamicDecode : public IFtDynamicDecode th::optional top_p_reset_ids_opt) override; void forward(th::Tensor& logits, // (batch_size, beam_width, hidden_size) - int step, int max_input_length, uint ite, int local_batch_size, th::Tensor end_id, + int step, int max_input_length, uint64_t ite, int local_batch_size, th::Tensor end_id, th::optional embedding_bias_opt, th::optional input_lengths_opt, th::optional sequence_limit_length_opt, th::optional stop_words_list_opt, th::optional bad_words_list_opt, th::optional no_repeat_ngram_size_opt, diff --git a/cpp/tensorrt_llm/thop/fp8Op.cpp b/cpp/tensorrt_llm/thop/fp8Op.cpp index 011cf92f026..231046a4df6 100644 --- a/cpp/tensorrt_llm/thop/fp8Op.cpp +++ b/cpp/tensorrt_llm/thop/fp8Op.cpp @@ -39,10 +39,10 @@ std::vector e4m3_quantize_helper(Tensor input, QuantizeMode quantize_mod TORCH_CHECK(_st == torch::kFloat32 || _st == torch::kFloat16 || _st == torch::kBFloat16, "Invalid datatype. input must be FP16 or BF16 or FP32"); - std::vector quantized_input_shape; + std::vector quantized_input_shape; for (int i = 0; i < input.dim(); i++) quantized_input_shape.push_back(input.size(i)); - std::vector scale_shape; + std::vector scale_shape; if (quantize_mode == QuantizeMode::PER_TOKEN) { for (int i = 0; i < input.dim() - 1; i++) @@ -113,7 +113,7 @@ Tensor e4m3_dequantize_helper(Tensor input, Tensor scales, QuantizeMode quantize TORCH_CHECK(input.scalar_type() == torch::kInt8, "Invalid datatype. input must be Int8 (Fp8)"); - std::vector dequantized_input_shape; + std::vector dequantized_input_shape; for (int i = 0; i < input.dim(); i++) dequantized_input_shape.push_back(input.size(i)); TORCH_CHECK(scales.dim() == input.dim()); diff --git a/cpp/tensorrt_llm/thop/gatherTreeOp.cpp b/cpp/tensorrt_llm/thop/gatherTreeOp.cpp index 0058528af9c..f71d6df7213 100644 --- a/cpp/tensorrt_llm/thop/gatherTreeOp.cpp +++ b/cpp/tensorrt_llm/thop/gatherTreeOp.cpp @@ -32,13 +32,13 @@ th::Tensor gatherTree(th::Tensor& sequence_lengths, th::Tensor& output_ids, th:: th::optional beam_hyps_log_probs, th::optional beam_hyps_min_normed_scores, th::optional beam_hyps_num_beams, th::optional beam_hyps_is_done, th::optional finished, th::Tensor& length_penalty, int64_t batch_size, int64_t beam_width, - int64_t max_input_length, int64_t max_seq_len, bool use_beam_hyps) + int64_t max_seq_len, bool use_beam_hyps) { + auto stream = at::cuda::getCurrentCUDAStream().stream(); + th::Tensor final_output_ids = torch::zeros( + {batch_size, beam_width, max_seq_len}, torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); if (use_beam_hyps && beam_width > 1) { - auto stream = at::cuda::getCurrentCUDAStream().stream(); - th::Tensor final_output_ids = torch::zeros({batch_size, beam_width, max_seq_len}, - torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); tl::kernels::invokeInitializeOutput(get_ptr(final_output_ids), get_ptr(end_ids), batch_size * beam_width, max_seq_len, stream); @@ -69,19 +69,14 @@ th::Tensor gatherTree(th::Tensor& sequence_lengths, th::Tensor& output_ids, th:: nullptr, // output_logs beamHypotheses.output_ids_tgt, beamHypotheses.sequence_lengths_tgt, beamHypotheses.normed_scores, beamHypotheses.cum_log_probs, beamHypotheses.log_probs, beamHypotheses.num_beams, - get_ptr(tiled_input_lengths), beam_width, max_seq_len, batch_size, max_input_length, stream); + get_ptr(tiled_input_lengths), beam_width, max_seq_len, batch_size, stream); 
sync_check_cuda_error(); - - return final_output_ids; } - else + else if (!use_beam_hyps && beam_width > 1) { th::Tensor workspace = torch::zeros(batch_size * beam_width * max_seq_len * sizeof(int32_t), torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); - th::Tensor final_output_ids = torch::zeros({batch_size, beam_width, max_seq_len}, - torch::dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false)); - // For sampling, it is equivalent to all parent ids are 0. tl::kernels::gatherTreeParam param; param.beams = get_ptr(workspace); @@ -98,10 +93,9 @@ th::Tensor gatherTree(th::Tensor& sequence_lengths, th::Tensor& output_ids, th:: param.step_ids = get_ptr(output_ids); param.parent_ids = beam_width == 1 ? nullptr : get_ptr(parent_ids); param.end_tokens = get_ptr(end_ids); - param.max_input_length = max_input_length; param.input_lengths = get_ptr(tiled_input_lengths); - param.stream = at::cuda::getCurrentCUDAStream().stream(); + param.stream = stream; param.output_ids = get_ptr(final_output_ids); param.cum_log_probs = cum_log_probs_opt.has_value() ? get_ptr(cum_log_probs_opt.value()) : nullptr; param.length_penalty = get_val(length_penalty, 0); @@ -109,8 +103,14 @@ th::Tensor gatherTree(th::Tensor& sequence_lengths, th::Tensor& output_ids, th:: // NOTE: need to remove all prompt virtual tokens tl::kernels::invokeGatherTree(param); sync_check_cuda_error(); - return final_output_ids; } + else + { + cudaMemcpyAsync(get_ptr(final_output_ids), get_ptr(output_ids), + sizeof(int) * batch_size * beam_width * max_seq_len, cudaMemcpyDeviceToDevice, stream); + sync_check_cuda_error(); + } + return final_output_ids; } } // namespace torch_ext diff --git a/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp b/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp index fc9a4daf347..79e8751c8ab 100644 --- a/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp +++ b/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp @@ -164,17 +164,17 @@ std::vector symmetric_quantize_helper( const size_t input_mat_size = num_rows * num_cols; const size_t quantized_mat_size = num_rows * bytes_per_out_col; - std::vector quantized_weight_shape; - std::vector scale_shape; + std::vector quantized_weight_shape; + std::vector scale_shape; if (weight.dim() == 2) { - quantized_weight_shape = {long(num_rows), long(bytes_per_out_col)}; - scale_shape = {long(num_cols)}; + quantized_weight_shape = {int64_t(num_rows), int64_t(bytes_per_out_col)}; + scale_shape = {int64_t(num_cols)}; } else if (weight.dim() == 3) { - quantized_weight_shape = {long(num_experts), long(num_rows), long(bytes_per_out_col)}; - scale_shape = {long(num_experts), long(num_cols)}; + quantized_weight_shape = {int64_t(num_experts), int64_t(num_rows), int64_t(bytes_per_out_col)}; + scale_shape = {int64_t(num_experts), int64_t(num_cols)}; } else { @@ -273,7 +273,7 @@ Tensor unpack_int4_packed_tensor_to_int8(Tensor weight) TORCH_CHECK(weight.numel() != 0, "weight should not be empty tensor"); TORCH_CHECK(weight.dtype() == torch::kInt8, "Weight must be a packed int8 tensor"); - std::vector int8_tensor_size(weight.dim()); + std::vector int8_tensor_size(weight.dim()); for (int i = 0; i < weight.dim(); ++i) { int8_tensor_size[i] = weight.size(i); @@ -307,7 +307,7 @@ Tensor pack_int8_tensor_to_packed_int4(Tensor weight) TORCH_CHECK(weight.numel() != 0, "weight should not be empty tensor"); TORCH_CHECK(weight.dtype() == torch::kInt8, "Weight must be a int8 tensor"); - std::vector packed_tensor_size(weight.dim()); + std::vector packed_tensor_size(weight.dim()); for (int i = 0; i < 
weight.dim(); ++i) { packed_tensor_size[i] = weight.size(i); diff --git a/cpp/tests/README.md b/cpp/tests/README.md index 98506379e71..5c2ee9a84a7 100644 --- a/cpp/tests/README.md +++ b/cpp/tests/README.md @@ -26,6 +26,12 @@ Single tests can be executed from `CPP_BUILD_DIR/tests`, e.g. ### Build engines +To avoid discrepancies between the reference data and the test data, set `SKIP_GEMM_PLUGIN_PROFILINGS=1` to disable GEMM tactic profiling in the GEMM plugins. + +```bash +export SKIP_GEMM_PLUGIN_PROFILINGS=1 +``` + [Scripts](resources/scripts) are provided that download the GPT2 and GPT-J models from Huggingface and convert them to TensorRT engines. The weights and built engines are stored under [cpp/tests/resources/models](resources/models). To build the engines from the top-level directory: @@ -33,6 +39,7 @@ To build the engines from the top-level directory: ```bash PYTHONPATH=examples/gpt python3 cpp/tests/resources/scripts/build_gpt_engines.py PYTHONPATH=examples/gptj python3 cpp/tests/resources/scripts/build_gptj_engines.py +PYTHONPATH=examples/llama python3 cpp/tests/resources/scripts/build_llama_engines.py ``` ### Generate expected output @@ -42,6 +49,7 @@ End-to-end tests read inputs and expected outputs from Numpy files located at [c ```bash PYTHONPATH=examples/gpt python3 cpp/tests/resources/scripts/generate_expected_gpt_output.py PYTHONPATH=examples/gptj python3 cpp/tests/resources/scripts/generate_expected_gptj_output.py +PYTHONPATH=examples/llama python3 cpp/tests/resources/scripts/generate_expected_llama_output.py ``` ### Run test diff --git a/cpp/tests/resources/.gitignore b/cpp/tests/resources/.gitignore index 949c93855ed..d864e5f6cfc 100644 --- a/cpp/tests/resources/.gitignore +++ b/cpp/tests/resources/.gitignore @@ -1,5 +1,6 @@ models/gpt2 models/gpt-j-6b +models/llama-7b-hf models/c-model models/rt_engine /models/v2 diff --git a/cpp/tests/resources/scripts/build_gpt_engines.py b/cpp/tests/resources/scripts/build_gpt_engines.py index 8ed877a6a11..2ac8747d971 100755 --- a/cpp/tests/resources/scripts/build_gpt_engines.py +++ b/cpp/tests/resources/scripts/build_gpt_engines.py @@ -131,15 +131,6 @@ def build_engines(model_cache: _tp.Optional[str] = None, world_size: int = 1): engine_dir / 'fp16-plugin-packed-paged' / tp_dir, world_size, '--dtype=float16', '--use_gpt_attention_plugin=float16', '--remove_input_padding', '--paged_kv_cache') - # build_engine(fp16_weight_dir_x_gpu, - # engine_dir / 'fp16-inflight-batching-plugin' / tp_dir, - # world_size, '--dtype=float16', '--use_inflight_batching', - # '--use_gpt_attention_plugin=float16', '--remove_input_padding') - build_engine(fp16_weight_dir_x_gpu, - engine_dir / 'fp16-inflight-batching-plugin-paged' / tp_dir, - world_size, '--dtype=float16', '--use_inflight_batching', - '--use_gpt_attention_plugin=float16', '--remove_input_padding', - '--paged_kv_cache') print("Done.") diff --git a/cpp/tests/resources/scripts/build_gptj_engines.py b/cpp/tests/resources/scripts/build_gptj_engines.py index 1913065aa43..f6002699480 100755 --- a/cpp/tests/resources/scripts/build_gptj_engines.py +++ b/cpp/tests/resources/scripts/build_gptj_engines.py @@ -106,12 +106,10 @@ def build_engines(model_cache: _tp.Optional[str] = None, only_fp8=False): '--use_gpt_attention_plugin=float16', '--remove_input_padding') - print("\nBuilding fp16-inflight-batching-plugin-paged engine") - build_engine(hf_dir, - engine_dir / 'fp16-inflight-batching-plugin-paged/1-gpu', + print("\nBuilding fp16-plugin-packed-paged engine") + build_engine(hf_dir, engine_dir / 
'fp16-plugin-packed-paged/1-gpu', '--use_gpt_attention_plugin=float16', - '--use_inflight_batching', '--remove_input_padding', - '--paged_kv_cache') + '--use_inflight_batching') print("Done.") diff --git a/cpp/tests/resources/scripts/build_llama_engines.py b/cpp/tests/resources/scripts/build_llama_engines.py new file mode 100644 index 00000000000..30d0eee6330 --- /dev/null +++ b/cpp/tests/resources/scripts/build_llama_engines.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse as _arg +import pathlib as _pl +import subprocess as _sp +import sys as _sys +import typing as _tp + + +def run_command(command: _tp.Sequence[str], *, cwd=None, **kwargs) -> None: + print(f"Running: cd %s && %s" % + (str(cwd or _pl.Path.cwd()), " ".join(command))) + _sp.check_call(command, cwd=cwd, **kwargs) + + +def build_engine(weigth_dir: _pl.Path, engine_dir: _pl.Path, *args): + build_args = [_sys.executable, "examples/llama/build.py"] + ( + ['--model_dir', str(weigth_dir)] if weigth_dir else []) + [ + '--output_dir', + str(engine_dir), + '--dtype=float16', + '--use_gpt_attention_plugin=float16', + '--use_gemm_plugin=float16', + '--max_batch_size=32', + '--max_input_len=40', + '--max_output_len=20', + '--max_beam_width=2', + '--log_level=error', + ] + list(args) + run_command(build_args) + + +def build_engines(model_cache: str): + resources_dir = _pl.Path(__file__).parent.resolve().parent + models_dir = resources_dir / 'models' + model_name = 'llama-7b-hf' + + if model_cache: + print("Copy model from model_cache") + model_cache_dir = _pl.Path(model_cache) / 'llama-models' / model_name + assert (model_cache_dir.is_dir()) + + run_command(["rsync", "-av", str(model_cache_dir), "."], cwd=models_dir) + + hf_dir = models_dir / model_name + assert hf_dir.is_dir() + + engine_dir = models_dir / 'rt_engine' / model_name + + tp_size = 1 + pp_size = 1 + print(f"\nBuilding fp16 tp{tp_size} pp{pp_size} engine") + build_engine(hf_dir, engine_dir / 'fp16-plugin/1-gpu') + + tp_size = 2 + pp_size = 2 + world_size = tp_size * pp_size + print(f"\nBuilding fp16 tp{tp_size} pp{pp_size} engine") + build_engine(hf_dir, engine_dir / f'fp16-plugin/{world_size}-gpu', + f'--world_size={world_size}', f'--tp_size={tp_size}', + f'--pp_size={pp_size}') + + print("Done.") + + +if __name__ == "__main__": + parser = _arg.ArgumentParser() + parser.add_argument("--model_cache", + type=str, + help="Directory where models are stored") + + build_engines(**vars(parser.parse_args())) diff --git a/cpp/tests/resources/scripts/generate_expected_llama_output.py b/cpp/tests/resources/scripts/generate_expected_llama_output.py new file mode 100644 index 00000000000..f263352163e --- /dev/null +++ b/cpp/tests/resources/scripts/generate_expected_llama_output.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path + +import run + + +def generate_output(engine: str, + num_beams: int, + output_name: str, + max_output_len: int = 8): + + model = 'llama-7b-hf' + resources_dir = Path(__file__).parent.resolve().parent + models_dir = resources_dir / 'models' + hf_dir = models_dir / model + engine_dir = models_dir / 'rt_engine' / model / engine / '1-gpu/' + + data_dir = resources_dir / 'data' + input_file = data_dir / 'input_tokens.npy' + model_data_dir = data_dir / model + if num_beams <= 1: + output_dir = model_data_dir / 'sampling' + else: + output_dir = model_data_dir / ('beam_search_' + str(num_beams)) + + run.generate(engine_dir=str(engine_dir), + tokenizer_dir=str(hf_dir), + input_file=str(input_file), + output_npy=str(output_dir / (output_name + '.npy')), + output_csv=str(output_dir / (output_name + '.csv')), + max_output_len=max_output_len, + num_beams=num_beams) + + +def generate_outputs(num_beams): + print('Generating Llama FP16 outputs') + generate_output(engine='fp16-plugin', + num_beams=num_beams, + output_name='output_tokens_fp16_plugin') + + +if __name__ == '__main__': + generate_outputs(num_beams=1) + generate_outputs(num_beams=2) diff --git a/cpp/tests/resources/scripts/test_cpp.py b/cpp/tests/resources/scripts/test_cpp.py index 1f7e3d01460..9ef98e88138 100755 --- a/cpp/tests/resources/scripts/test_cpp.py +++ b/cpp/tests/resources/scripts/test_cpp.py @@ -49,6 +49,7 @@ def run_tests(cuda_architectures: _tp.Optional[str] = None, dist_dir: _tp.Optional[str] = None, model_cache: _tp.Optional[str] = None, skip_gptj=False, + skip_llama=False, only_fp8=False, trt_root: _tp.Optional[str] = None) -> None: root_dir = find_root_dir() @@ -92,7 +93,10 @@ def run_command(command: _tp.Sequence[str], model_cache = ["--model_cache", model_cache] if model_cache else [] only_fp8_arg = ["--only_fp8"] if only_fp8 else [] - gpt_env = {**_os.environ, "PYTHONPATH": "examples/gpt"} + gpt_env = { + **_os.environ, "PYTHONPATH": "examples/gpt", + "SKIP_GEMM_PLUGIN_PROFILINGS": "1" + } build_gpt_engines = [python_exe, str(scripts_dir / "build_gpt_engines.py") ] + model_cache @@ -110,7 +114,10 @@ def run_command(command: _tp.Sequence[str], ] + model_cache + only_fp8_arg run_command(build_gptj_engines) - gptj_env = {**_os.environ, "PYTHONPATH": "examples/gptj"} + gptj_env = { + **_os.environ, "PYTHONPATH": "examples/gptj", + "SKIP_GEMM_PLUGIN_PROFILINGS": "1" + } generate_expected_gptj_output = [ python_exe, str(scripts_dir / "generate_expected_gptj_output.py") @@ -119,22 +126,46 @@ def run_command(command: _tp.Sequence[str], else: _log.info("Skipping GPT-J tests") + if not skip_llama: + build_llama_engines = [ + python_exe, str(scripts_dir / "build_llama_engines.py") + ] + model_cache + run_command(build_llama_engines) + + llama_env = { + **_os.environ, "PYTHONPATH": "examples/llama", + "SKIP_GEMM_PLUGIN_PROFILINGS": "1" + } + generate_expected_llama_output = [ + 
python_exe, + str(scripts_dir / "generate_expected_llama_output.py") + ] + run_command(generate_expected_llama_output, env=llama_env) + else: + _log.info("Skipping Llama tests") + build_dir = build_dir if build_dir.is_absolute() else root_dir / build_dir make_google_tests = ["make", "-j", "google-tests"] run_command(make_google_tests, cwd=build_dir) + cpp_env = {**_os.environ, "SKIP_GEMM_PLUGIN_PROFILINGS": "1"} ctest = ["ctest", "--output-on-failure", "--output-junit", "report.xml"] + excluded_tests = [] if skip_gptj: - ctest.extend(["-E", ".*Gptj.*"]) + excluded_tests.append(".*Gptj.*") + if skip_llama: + excluded_tests.append(".*Llama.*") if only_fp8: ctest.extend(["-R", ".*FP8.*"]) else: - ctest.extend(["-E", ".*FP8.*"]) - run_command(ctest, cwd=build_dir) + excluded_tests.append(".*FP8.*") + if excluded_tests: + ctest.extend(["-E", "|".join(excluded_tests)]) + run_command(ctest, cwd=build_dir, env=cpp_env) make_benchmarks = ["make", "-j", "benchmarks"] - run_command(make_benchmarks, cwd=build_dir) + run_command(make_benchmarks, cwd=build_dir, env=cpp_env) benchmark = [ str(build_dir / "benchmarks" / "gptSessionBenchmark"), "--model", "gpt", @@ -142,7 +173,7 @@ def run_command(command: _tp.Sequence[str], "../tests/resources/models/rt_engine/gpt2/fp16-plugin/1-gpu", "--batch_size", "8", "--input_output_len", "10,20", "--duration", "10" ] - run_command(benchmark, cwd=build_dir) + run_command(benchmark, cwd=build_dir, env=cpp_env) if __name__ == "__main__": @@ -165,6 +196,9 @@ def run_command(command: _tp.Sequence[str], parser.add_argument("--skip_gptj", action="store_true", help="Skip the tests for GPT-J") + parser.add_argument("--skip_llama", + action="store_true", + help="Skip the tests for Llama") parser.add_argument( "--only_fp8", action="store_true", diff --git a/cpp/tests/runtime/gptDecoderBatchTest.cpp b/cpp/tests/runtime/gptDecoderBatchTest.cpp index 02fcbec1570..f0e90ba9f54 100644 --- a/cpp/tests/runtime/gptDecoderBatchTest.cpp +++ b/cpp/tests/runtime/gptDecoderBatchTest.cpp @@ -92,9 +92,11 @@ void verifyResults(BufferManager& manager, GptDecoderBatch const& decoder, void testDecoder(nvinfer1::DataType const dtype, std::vector const& samplingConfigs, int maxBeamWidth) { - SizeType constexpr worldSize{1}; + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + SizeType constexpr tensorParallelism{1}; + SizeType constexpr pipelineParallelism{1}; SizeType constexpr localRank{0}; - WorldConfig constexpr worldConfig{worldSize, localRank}; + WorldConfig constexpr worldConfig{tensorParallelism, pipelineParallelism, localRank}; SizeType constexpr vocabSize{51200}; SizeType constexpr nbLayers{2}; @@ -122,6 +124,14 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector con decoder.setup(batchSize, maxBeamWidth, maxSeqLength, modelConfig.getDataType()); std::vector const inputLengths{4, 5, 6, 7}; + std::vector tiledInputLengths; + for (int batch_id = 0; batch_id < inputLengths.size(); batch_id++) + { + for (int beam_id = 0; beam_id < maxBeamWidth; beam_id++) + { + tiledInputLengths.push_back(inputLengths.at(batch_id)); + } + } // set up inputs auto logits = std::shared_ptr( @@ -147,6 +157,10 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector con manager.setZero(*tgtCacheIndirection); outputs.cacheIndirection = tgtCacheIndirection; } + auto sequenceLengths + = std::shared_ptr(manager.gpu(ITensor::makeShape({batchSize * maxBeamWidth}), TRTDataType::value)); + manager.copy(tiledInputLengths.data(), *sequenceLengths); + outputs.sequenceLengths = sequenceLengths; auto
constexpr tokenId = 1; std::vector inputIds; @@ -198,9 +212,11 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector con void testDecoderWavefront( nvinfer1::DataType const dtype, std::vector const& samplingConfigs, int maxBeamWidth) { - SizeType constexpr worldSize{1}; + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + SizeType constexpr tensorParallelism{1}; + SizeType constexpr pipelineParallelism{1}; SizeType constexpr localRank{0}; - WorldConfig constexpr worldConfig{worldSize, localRank}; + WorldConfig constexpr worldConfig{tensorParallelism, pipelineParallelism, localRank}; SizeType constexpr vocabSize{51200}; SizeType constexpr nbLayers{2}; @@ -228,6 +244,14 @@ void testDecoderWavefront( decoder.setup(batchSize, maxBeamWidth, maxSeqLength, modelConfig.getDataType()); std::vector const inputLengths{4, 5, 6, 7}; + std::vector tiledInputLengths; + for (int batch_id = 0; batch_id < inputLengths.size(); batch_id++) + { + for (int beam_id = 0; beam_id < maxBeamWidth; beam_id++) + { + tiledInputLengths.push_back(inputLengths.at(batch_id)); + } + } // set up inputs auto logits = std::shared_ptr( @@ -253,6 +277,10 @@ void testDecoderWavefront( manager.setZero(*tgtCacheIndirection); outputs.cacheIndirection = tgtCacheIndirection; } + auto sequenceLengths + = std::shared_ptr(manager.gpu(ITensor::makeShape({batchSize * maxBeamWidth}), TRTDataType::value)); + manager.copy(tiledInputLengths.data(), *sequenceLengths); + outputs.sequenceLengths = sequenceLengths; auto const& nbSteps = decoder.getNbSteps(); EXPECT_EQ(nbSteps.size(), batchSize); diff --git a/cpp/tests/runtime/gptDecoderTest.cpp b/cpp/tests/runtime/gptDecoderTest.cpp index 3aeb3c95349..f91a882dd5d 100644 --- a/cpp/tests/runtime/gptDecoderTest.cpp +++ b/cpp/tests/runtime/gptDecoderTest.cpp @@ -31,9 +31,10 @@ namespace void testDecoder(nvinfer1::DataType const dtype, SamplingConfig const& samplingConfig) { - SizeType constexpr worldSize{1}; + SizeType constexpr tensorParallelism{1}; + SizeType constexpr pipelineParallelism{1}; SizeType constexpr localRank{0}; - WorldConfig constexpr worldConfig{worldSize, localRank}; + WorldConfig constexpr worldConfig{tensorParallelism, pipelineParallelism, localRank}; SizeType constexpr vocabSize{51200}; SizeType constexpr nbLayers{2}; diff --git a/cpp/tests/runtime/gptSessionTest.cpp b/cpp/tests/runtime/gptSessionTest.cpp index 24b36a60153..5ba7be6cf03 100644 --- a/cpp/tests/runtime/gptSessionTest.cpp +++ b/cpp/tests/runtime/gptSessionTest.cpp @@ -21,6 +21,7 @@ #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/tensor.h" +#include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/runtime/gptJsonConfig.h" #include "tensorrt_llm/runtime/gptSession.h" #include "tensorrt_llm/runtime/tllmLogger.h" @@ -28,8 +29,6 @@ #include #include -#include - using namespace tensorrt_llm::runtime; namespace tc = tensorrt_llm::common; @@ -43,6 +42,7 @@ auto const DATA_PATH = TEST_RESOURCE_PATH / "data"; auto const GPT_MODEL_DIR = "gpt2"; auto const GPTJ_MODEL_DIR = "gpt-j-6b"; +auto const LLAMA_MODEL_DIR = "llama-7b-hf"; // Engines need to be generated using cpp/tests/resources/scripts/build_gpt_engines.py. 
auto const FP32_GPT_DIR = "fp32-default"; @@ -51,9 +51,6 @@ auto const FP16_GPT_DIR = "fp16-default"; auto const FP16_GPT_ATTENTION_DIR = "fp16-plugin"; auto const FP16_GPT_ATTENTION_PACKED_DIR = FP16_GPT_ATTENTION_DIR + std::string("-packed"); auto const FP16_GPT_ATTENTION_PACKED_PAGED_DIR = FP16_GPT_ATTENTION_PACKED_DIR + std::string("-paged"); -auto const FP16_GPT_ATTENTION_INFLIGHT_BATCHING_DIR = "fp16-inflight-batching-plugin"; -auto const FP16_GPT_ATTENTION_INFLIGHT_BATCHING_PAGED_DIR - = FP16_GPT_ATTENTION_INFLIGHT_BATCHING_DIR + std::string("-paged"); // Expected outputs need to be generated using cpp/tests/resources/scripts/generate_expected_gpt_output.py. auto const FP32_RESULT_FILE = "output_tokens_fp32.npy"; @@ -62,6 +59,18 @@ auto const FP16_RESULT_FILE = "output_tokens_fp16.npy"; auto const FP16_PLUGIN_RESULT_FILE = "output_tokens_fp16_plugin.npy"; auto const FP16_PLUGIN_PACKED_RESULT_FILE = "output_tokens_fp16_plugin_packed.npy"; +struct ModelIds +{ + int endId; + int padId; +}; + +struct ModelParams +{ + char const* baseDir; + ModelIds ids; +}; + class ModelSpec { public: @@ -70,7 +79,6 @@ class ModelSpec , mResultsFile{std::move(resultsFile)} , mDataType{dtype} , mUseGptAttentionPlugin{false} - , mUseInflightBatching{false} , mUsePackedInput{false} , mUsePagedKvCache{false} , mDecoderPerRequest{false} @@ -83,12 +91,6 @@ class ModelSpec return *this; } - ModelSpec& useInflightBatching() - { - mUseInflightBatching = true; - return *this; - } - ModelSpec& usePackedInput() { mUsePackedInput = true; @@ -111,7 +113,6 @@ class ModelSpec std::string mResultsFile; nvinfer1::DataType mDataType; bool mUseGptAttentionPlugin; - bool mUseInflightBatching; bool mUsePackedInput; bool mUsePagedKvCache; bool mDecoderPerRequest; @@ -130,7 +131,7 @@ class SessionTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type- mLogger = std::make_shared(); - initLibNvInferPlugins(mLogger.get(), "tensorrt_llm"); + initTrtLlmPlugins(mLogger.get()); } void TearDown() override {} @@ -149,10 +150,9 @@ void verifyModelConfig(GptModelConfig const& modelConfig, ModelSpec const& model ASSERT_EQ(modelSpec.mDataType, modelConfig.getDataType()); } -template -void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeType beamWidth, +void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds const modelIds, SizeType beamWidth, std::initializer_list const& batchSizes, std::string const& resultsFile, - std::shared_ptr const& logger, bool replicateFirstInput = false, bool cudaGraphMode = false) + std::shared_ptr const& logger, bool cudaGraphMode = false) { ASSERT_TRUE(fs::exists(DATA_PATH)); auto givenInput = tc::Tensor::loadNpy(DATA_PATH / "input_tokens.npy", tc::MEMORY_CPU); @@ -172,7 +172,8 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT auto const decoderPerRequest = modelSpec.mDecoderPerRequest; auto const worldConfig = WorldConfig::mpi(*logger); - auto const enginePath = modelPath / json.engineFilename(worldConfig); + auto enginePath = modelPath / json.engineFilename(worldConfig); + ASSERT_TRUE(fs::exists(enginePath)); auto const maxInputLength = static_cast(givenInput.shape[1]); auto const maxSeqLength = static_cast(expectedOutput.shape[1]); @@ -185,6 +186,9 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT samplingConfig.topK = std::vector{0}; samplingConfig.topP = std::vector{0.0f}; + auto const padId = modelIds.padId; + auto const endId = modelIds.endId; + std::vector 
givenInputLengths(nbGivenInputs); for (SizeType i = 0; i < nbGivenInputs; ++i) { @@ -210,7 +214,7 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT std::vector inputLenghtsHost(batchSize); for (SizeType i = 0; i < batchSize; ++i) { - const int inputIdx = replicateFirstInput ? 0 : i % nbGivenInputs; + const int inputIdx = i % nbGivenInputs; inputLenghtsHost[i] = givenInputLengths[inputIdx]; } auto inputLenghts = bufferManager.copyFrom(inputLenghtsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); @@ -226,7 +230,7 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT std::vector inputsHost(totalInputSize); for (SizeType i = 0; i < batchSize; ++i) { - auto const seqBegin = givenInputData + (replicateFirstInput ? 0 : (i % nbGivenInputs) * maxInputLength); + auto const seqBegin = givenInputData + (i % nbGivenInputs) * maxInputLength; std::copy(seqBegin, seqBegin + inputLenghtsHost[i], inputsHost.begin() + inputOffsetsHost[i]); } inputIds = bufferManager.copyFrom(inputsHost, ITensor::makeShape({1, totalInputSize}), MemoryType::kGPU); @@ -236,7 +240,7 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT std::vector inputsHost(batchSize * maxInputLength, padId); for (SizeType i = 0; i < batchSize; ++i) { - auto const seqBegin = givenInputData + (replicateFirstInput ? 0 : (i % nbGivenInputs) * maxInputLength); + auto const seqBegin = givenInputData + (i % nbGivenInputs) * maxInputLength; std::copy(seqBegin, seqBegin + inputLenghtsHost[i], inputsHost.begin() + i * maxInputLength); } inputIds @@ -282,9 +286,8 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT for (auto i = 0; i < maxSeqLength; ++i) { auto const outputIndex = tc::flat_index3(b, beam, i, beamWidth, maxSeqLength); - const int expectedBatch = replicateFirstInput ? 0 : b; auto const expectIndex - = tc::flat_index2((expectedBatch % nbGivenInputs * beamWidth + beam), i, maxSeqLength); + = tc::flat_index2((b % nbGivenInputs * beamWidth + beam), i, maxSeqLength); EXPECT_EQ(output[outputIndex], expectedOutputData[expectIndex]) << " b: " << b << " beam: " << beam << " i: " << i; anyMismatch |= (output[outputIndex] != expectedOutputData[expectIndex]); @@ -304,7 +307,7 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, SizeT auto constexpr kBatchSizes = {1, 8}; -using ParamType = std::tuple; +using ParamType = std::tuple; std::string generateTestName(const testing::TestParamInfo& info) { @@ -314,8 +317,6 @@ std::string generateTestName(const testing::TestParamInfo& info) name.append(beamWidth == 1 ? 
"Sampling" : "BeamWidth" + std::to_string(beamWidth)); if (modelSpec.mUseGptAttentionPlugin) name.append("GptAttentionPlugin"); - if (modelSpec.mUseInflightBatching) - name.append("WithInflightBatching"); if (modelSpec.mUsePackedInput) name.append("Packed"); if (modelSpec.mUsePagedKvCache) @@ -334,7 +335,9 @@ class ParamTest : public SessionTest, public ::testing::WithParamInterface(GetParam()); + auto const modelParams = std::get<0>(GetParam()); + auto const modelDir = modelParams.baseDir; + auto const modelIds = modelParams.ids; auto const modelSpec = std::get<1>(GetParam()); auto const modelPath{ENGINGE_PATH / modelDir / modelSpec.mModelPath / "1-gpu"}; SizeType const beamWidth{std::get<2>(GetParam())}; @@ -345,15 +348,13 @@ TEST_P(ParamTest, Test) if (!modelSpec.mUseGptAttentionPlugin && beamWidth > 1) GTEST_SKIP(); - auto const replicateFirstInput = false; auto const cudaGraphMode = std::get<3>(GetParam()); - testGptSession( - modelPath, modelSpec, beamWidth, kBatchSizes, resultsFile, mLogger, replicateFirstInput, cudaGraphMode); + testGptSession(modelPath, modelSpec, modelIds, beamWidth, kBatchSizes, resultsFile, mLogger, cudaGraphMode); } INSTANTIATE_TEST_SUITE_P(GptSessionTest, ParamTest, - testing::Combine(testing::Values(GPT_MODEL_DIR), + testing::Combine(testing::Values(ModelParams{GPT_MODEL_DIR, {50256, 50256}}), testing::Values( // single decoder ModelSpec{FP32_GPT_DIR, FP32_RESULT_FILE, nvinfer1::DataType::kFLOAT}, @@ -369,17 +370,6 @@ INSTANTIATE_TEST_SUITE_P(GptSessionTest, ParamTest, .useGptAttentionPlugin() .usePackedInput() .usePagedKvCache(), - // ModelSpec{ - // FP16_GPT_ATTENTION_INFLIGHT_BATCHING_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, nvinfer1::DataType::kHALF} - // .useGptAttentionPlugin() - // .useInflightBatching() - // .usePackedInput(), - ModelSpec{FP16_GPT_ATTENTION_INFLIGHT_BATCHING_PAGED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, - nvinfer1::DataType::kHALF} - .useGptAttentionPlugin() - .useInflightBatching() - .usePackedInput() - .usePagedKvCache(), // decoderBatch ModelSpec{FP32_GPT_DIR, FP32_RESULT_FILE, nvinfer1::DataType::kFLOAT}.useDecoderPerRequest(), ModelSpec{FP32_GPT_ATTENTION_DIR, FP32_PLUGIN_RESULT_FILE, nvinfer1::DataType::kFLOAT} @@ -397,19 +387,6 @@ INSTANTIATE_TEST_SUITE_P(GptSessionTest, ParamTest, .useGptAttentionPlugin() .usePackedInput() .usePagedKvCache() - .useDecoderPerRequest(), - // ModelSpec{ - // FP16_GPT_ATTENTION_INFLIGHT_BATCHING_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, nvinfer1::DataType::kHALF} - // .useGptAttentionPlugin() - // .useInflightBatching() - // .usePackedInput() - // .useDecoderPerRequest(), - ModelSpec{FP16_GPT_ATTENTION_INFLIGHT_BATCHING_PAGED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, - nvinfer1::DataType::kHALF} - .useGptAttentionPlugin() - .useInflightBatching() - .usePackedInput() - .usePagedKvCache() .useDecoderPerRequest() ), @@ -417,7 +394,7 @@ INSTANTIATE_TEST_SUITE_P(GptSessionTest, ParamTest, generateTestName); INSTANTIATE_TEST_SUITE_P(GptjSessionTest, ParamTest, - testing::Combine(testing::Values(GPTJ_MODEL_DIR), + testing::Combine(testing::Values(ModelParams{GPTJ_MODEL_DIR, {50256, 50256}}), testing::Values( // single decoder ModelSpec{FP16_GPT_ATTENTION_DIR, FP16_PLUGIN_RESULT_FILE, nvinfer1::DataType::kHALF} @@ -425,8 +402,7 @@ INSTANTIATE_TEST_SUITE_P(GptjSessionTest, ParamTest, ModelSpec{FP16_GPT_ATTENTION_PACKED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, nvinfer1::DataType::kHALF} .useGptAttentionPlugin() .usePackedInput(), - ModelSpec{FP16_GPT_ATTENTION_INFLIGHT_BATCHING_PAGED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, - 
nvinfer1::DataType::kHALF} + ModelSpec{FP16_GPT_ATTENTION_PACKED_PAGED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, nvinfer1::DataType::kHALF} .useGptAttentionPlugin() .usePackedInput() .usePagedKvCache(), @@ -438,8 +414,7 @@ INSTANTIATE_TEST_SUITE_P(GptjSessionTest, ParamTest, .useGptAttentionPlugin() .usePackedInput() .useDecoderPerRequest(), - ModelSpec{FP16_GPT_ATTENTION_INFLIGHT_BATCHING_PAGED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, - nvinfer1::DataType::kHALF} + ModelSpec{FP16_GPT_ATTENTION_PACKED_PAGED_DIR, FP16_PLUGIN_PACKED_RESULT_FILE, nvinfer1::DataType::kHALF} .useGptAttentionPlugin() .usePackedInput() .usePagedKvCache() @@ -449,11 +424,26 @@ INSTANTIATE_TEST_SUITE_P(GptjSessionTest, ParamTest, testing::Values(1, 2), testing::Values(false)), generateTestName); -class LlamaSessionTest : public SessionTest +INSTANTIATE_TEST_SUITE_P(LlamaSessionTest, ParamTest, + testing::Combine(testing::Values(ModelParams{LLAMA_MODEL_DIR, {2, 2}}), + testing::Values( + // single decoder + ModelSpec{FP16_GPT_ATTENTION_DIR, FP16_PLUGIN_RESULT_FILE, nvinfer1::DataType::kHALF} + .useGptAttentionPlugin(), + // decoderBatch + ModelSpec{FP16_GPT_ATTENTION_DIR, FP16_PLUGIN_RESULT_FILE, nvinfer1::DataType::kHALF} + .useGptAttentionPlugin() + .useDecoderPerRequest() + + ), + testing::Values(1, 2), testing::Values(false)), + generateTestName); + +class LlamaSessionOnDemandTest : public SessionTest { }; -TEST_F(LlamaSessionTest, SamplingFP16WithAttentionPlugin) +TEST_F(LlamaSessionOnDemandTest, SamplingFP16WithAttentionPlugin) { GTEST_SKIP() << "Run only on demand"; auto const modelDir = "llama_7bf"; @@ -465,11 +455,12 @@ TEST_F(LlamaSessionTest, SamplingFP16WithAttentionPlugin) auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); + auto const modelIds = ModelIds{2, 2}; - testGptSession<2, 2>(modelPath, modelSpec, beamWidth, batchSizes, resultsFile, mLogger); + testGptSession(modelPath, modelSpec, modelIds, beamWidth, batchSizes, resultsFile, mLogger); } -TEST_F(LlamaSessionTest, SamplingFP16AttentionPluginDecoderBatch) +TEST_F(LlamaSessionOnDemandTest, SamplingFP16AttentionPluginDecoderBatch) { GTEST_SKIP() << "Run only on demand"; auto const modelDir = "llamav2"; @@ -480,6 +471,7 @@ TEST_F(LlamaSessionTest, SamplingFP16AttentionPluginDecoderBatch) auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin().usePackedInput().useDecoderPerRequest(); + auto const modelIds = ModelIds{2, 2}; - testGptSession<2, 2>(modelPath, modelSpec, beamWidth, batchSizes, resultsFile, mLogger); + testGptSession(modelPath, modelSpec, modelIds, beamWidth, batchSizes, resultsFile, mLogger); } diff --git a/cpp/tests/runtime/runtimeKernelTest.cpp b/cpp/tests/runtime/runtimeKernelTest.cpp index 3af4f9cfb47..e503e912e22 100644 --- a/cpp/tests/runtime/runtimeKernelTest.cpp +++ b/cpp/tests/runtime/runtimeKernelTest.cpp @@ -54,80 +54,87 @@ class RuntimeKernelTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro BufferManager::CudaStreamPtr mStream; }; -TEST_F(RuntimeKernelTest, FillInt32) +namespace { - SizeType constexpr value{3}; - SizeType constexpr size{123}; - auto buffer = mManager->gpu(size, nvinfer1::DataType::kINT32); - kernels::invokeFill(*buffer, value, *mStream); - auto bufferHost = mManager->copyFrom(*buffer, MemoryType::kCPU); - auto bufferPtr = bufferCast(*bufferHost); - std::vector expected(buffer->getSize(), value); +template +void testFill(IBuffer& buffer, BufferManager& manager, CudaStream&
stream) +{ + T constexpr value{3}; + kernels::invokeFill(buffer, value, stream); + auto bufferHost = manager.copyFrom(buffer, MemoryType::kCPU); + auto bufferPtr = bufferCast(*bufferHost); + auto constexpr expected = value; auto anyMismatch = false; - for (std::size_t i = 0; i < buffer->getSize(); ++i) + for (std::size_t i = 0; i < buffer.getSize(); ++i) { - EXPECT_EQ(bufferPtr[i], expected[i]) << "Error at index " << i; - anyMismatch |= bufferPtr[i] != expected[i]; + EXPECT_EQ(bufferPtr[i], expected) << "Error at index " << i; + anyMismatch |= bufferPtr[i] != expected; } - buffer.release(); ASSERT_FALSE(anyMismatch); +} +} // namespace - auto tensor = mManager->gpu(ITensor::makeShape({size, size}), nvinfer1::DataType::kINT32); - kernels::invokeFill(*tensor, value, *mStream); - auto tensorHost = mManager->copyFrom(*tensor, MemoryType::kCPU); - auto tensorPtr = bufferCast(*tensorHost); - expected.clear(); - expected.resize(tensor->getSize(), value); +TEST_F(RuntimeKernelTest, FillBufferInt8) +{ + for (auto size : {123llu, 1025llu, 1llu << 32}) + { + auto buffer = mManager->gpu(size, nvinfer1::DataType::kINT8); + testFill(*buffer, *mManager, *mStream); + buffer.release(); + } +} - anyMismatch = false; - for (std::size_t i = 0; i < tensor->getSize(); ++i) +TEST_F(RuntimeKernelTest, FillTensorInt8) +{ + for (auto size : {123, 1025, std::numeric_limits::max()}) { - EXPECT_EQ(tensorPtr[i], expected[i]) << "Error at index " << i; - anyMismatch |= tensorPtr[i] != expected[i]; + auto tensor = mManager->gpu(ITensor::makeShape({size, 2}), nvinfer1::DataType::kINT8); + testFill(*tensor, *mManager, *mStream); + tensor.release(); } - tensor.release(); - ASSERT_FALSE(anyMismatch); } -TEST_F(RuntimeKernelTest, AddInt32) +namespace +{ +void testAdd(IBuffer& buffer, BufferManager& manager, CudaStream& stream) { SizeType constexpr value{3}; - SizeType constexpr size{123}; - auto buffer = mManager->gpu(size, nvinfer1::DataType::kINT32); - mManager->setZero(*buffer); - kernels::invokeAdd(*buffer, value, *mStream); - kernels::invokeAdd(*buffer, value, *mStream); - auto bufferHost = mManager->copyFrom(*buffer, MemoryType::kCPU); + manager.setZero(buffer); + kernels::invokeAdd(buffer, value, stream); + kernels::invokeAdd(buffer, value, stream); + auto bufferHost = manager.copyFrom(buffer, MemoryType::kCPU); auto bufferPtr = bufferCast(*bufferHost); - std::vector expected(buffer->getSize(), 2 * value); + auto constexpr expected = 2 * value; auto anyMismatch = false; - for (std::size_t i = 0; i < buffer->getSize(); ++i) + for (std::size_t i = 0; i < buffer.getSize(); ++i) { - EXPECT_EQ(bufferPtr[i], expected[i]) << "Error at index " << i; - anyMismatch |= bufferPtr[i] != expected[i]; + EXPECT_EQ(bufferPtr[i], expected) << "Error at index " << i; + anyMismatch |= bufferPtr[i] != expected; } - buffer.release(); ASSERT_FALSE(anyMismatch); +} +} // namespace - auto tensor = mManager->gpu(ITensor::makeShape({size, size}), nvinfer1::DataType::kINT32); - mManager->setZero(*tensor); - kernels::invokeAdd(*tensor, value, *mStream); - kernels::invokeAdd(*tensor, value, *mStream); - auto tensorHost = mManager->copyFrom(*tensor, MemoryType::kCPU); - auto tensorPtr = bufferCast(*tensorHost); - expected.clear(); - expected.resize(tensor->getSize(), 2 * value); - - anyMismatch = false; - for (std::size_t i = 0; i < tensor->getSize(); ++i) +TEST_F(RuntimeKernelTest, AddBufferInt32) +{ + for (auto size : {123, 1025}) { - EXPECT_EQ(tensorPtr[i], expected[i]) << "Error at index " << i; - anyMismatch |= tensorPtr[i] != expected[i]; + 
auto buffer = mManager->gpu(size, nvinfer1::DataType::kINT32); + testAdd(*buffer, *mManager, *mStream); + buffer.release(); + } +} + +TEST_F(RuntimeKernelTest, AddTensorInt32) +{ + for (auto size : {123, 1025}) + { + auto tensor = mManager->gpu(ITensor::makeShape({size, size}), nvinfer1::DataType::kINT32); + testAdd(*tensor, *mManager, *mStream); + tensor.release(); } - tensor.release(); - ASSERT_FALSE(anyMismatch); } TEST_F(RuntimeKernelTest, Transpose) @@ -623,6 +630,35 @@ TEST_F(RuntimeKernelTest, ScatterHalf) } } +namespace +{ +template +void verifyTiling(std::vector const& input, ITensor const& outputTensor, BufferManager& manager) +{ + auto outputHost = manager.copyFrom(outputTensor, MemoryType::kCPU); + auto outputPtr = bufferCast(*outputHost); + + auto const& shape = outputTensor.getShape(); + auto batchSize = static_cast(shape.d[0]); + auto beamWidth = static_cast(shape.d[1]); + auto inputLength = outputTensor.getSize() / batchSize / beamWidth; + + for (std::size_t b = 0; b < batchSize; ++b) + { + for (std::size_t beam = 0; beam < beamWidth; ++beam) + { + for (std::size_t i = 0; i < inputLength; ++i) + { + auto const inputIdx = tc::flat_index2(b, i, inputLength); + auto const outputIdx = tc::flat_index3(b, beam, i, beamWidth, inputLength); + EXPECT_EQ(outputPtr[outputIdx], input[inputIdx]) + << "Error at index (" << b << ',' << beam << ',' << i << ')'; + } + } + } +} +} // namespace + TEST_F(RuntimeKernelTest, TileInt32) { SizeType const beamWidth{3}; @@ -637,22 +673,9 @@ TEST_F(RuntimeKernelTest, TileInt32) auto outputTensor = mManager->gpu(outputShape, nvinfer1::DataType::kINT32); kernels::tileTensor(*outputTensor, *inputTensor, beamWidth, *mStream); - auto outputHost = mManager->copyFrom(*outputTensor, MemoryType::kCPU); - auto outputPtr = bufferCast(*outputHost); - for (SizeType b = 0; b < batchSize; ++b) - { - for (SizeType beam = 0; beam < beamWidth; ++beam) - { - for (SizeType i = 0; i < inputLength; ++i) - { - auto const inputIdx = tc::flat_index2(b, i, inputLength); - auto const outputIdx = tc::flat_index3(b, beam, i, beamWidth, inputLength); - EXPECT_EQ(outputPtr[outputIdx], input[inputIdx]) - << "Error at index (" << b << ',' << beam << ',' << i << ')'; - } - } - } + outputTensor->reshape(ITensor::makeShape({batchSize, beamWidth, inputLength})); + verifyTiling(input, *outputTensor, *mManager); } TEST_F(RuntimeKernelTest, TileHalf) @@ -670,22 +693,9 @@ TEST_F(RuntimeKernelTest, TileHalf) auto outputTensor = mManager->gpu(outputShape, nvinfer1::DataType::kHALF); kernels::tileTensor(*outputTensor, *inputTensor, beamWidth, *mStream); - auto outputHost = mManager->copyFrom(*outputTensor, MemoryType::kCPU); - auto outputPtr = bufferCast(*outputHost); - for (SizeType b = 0; b < batchSize; ++b) - { - for (SizeType beam = 0; beam < beamWidth; ++beam) - { - for (SizeType i = 0; i < inputLength; ++i) - { - auto const inputIdx = tc::flat_index2(b, i, inputLength); - auto const outputIdx = tc::flat_index3(b, beam, i, beamWidth, inputLength); - EXPECT_EQ(outputPtr[outputIdx], input[inputIdx]) - << "Error at index (" << b << ',' << beam << ',' << i << ')'; - } - } - } + outputTensor->reshape(ITensor::makeShape({batchSize, beamWidth, inputLength})); + verifyTiling(input, *outputTensor, *mManager); } TEST_F(RuntimeKernelTest, TileInplaceInt32) @@ -703,22 +713,9 @@ TEST_F(RuntimeKernelTest, TileInplaceInt32) kernels::scatterTensor(*outputTensor, *inputTensor, beamWidth, *mStream); kernels::tileTensorInplace(*outputTensor, beamWidth, *mStream); - auto outputHost = 
mManager->copyFrom(*outputTensor, MemoryType::kCPU); - auto outputPtr = bufferCast(*outputHost); - for (SizeType b = 0; b < batchSize; ++b) - { - for (SizeType beam = 0; beam < beamWidth; ++beam) - { - for (SizeType i = 0; i < inputLength; ++i) - { - auto const inputIdx = tc::flat_index2(b, i, inputLength); - auto const outputIdx = tc::flat_index3(b, beam, i, beamWidth, inputLength); - EXPECT_EQ(outputPtr[outputIdx], input[inputIdx]) - << "Error at index (" << b << ',' << beam << ',' << i << ')'; - } - } - } + outputTensor->reshape(ITensor::makeShape({batchSize, beamWidth, inputLength})); + verifyTiling(input, *outputTensor, *mManager); } TEST_F(RuntimeKernelTest, TileInplaceHalf) @@ -737,20 +734,62 @@ TEST_F(RuntimeKernelTest, TileInplaceHalf) kernels::scatterTensor(*outputTensor, *inputTensor, beamWidth, *mStream); kernels::tileTensorInplace(*outputTensor, beamWidth, *mStream); - auto outputHost = mManager->copyFrom(*outputTensor, MemoryType::kCPU); - auto outputPtr = bufferCast(*outputHost); - for (SizeType b = 0; b < batchSize; ++b) + outputTensor->reshape(ITensor::makeShape({batchSize, beamWidth, inputLength})); + verifyTiling(input, *outputTensor, *mManager); +} + +TEST_F(RuntimeKernelTest, TileInt8Large) +{ + std::int8_t constexpr value{3}; + SizeType constexpr batchSize{1}; + SizeType constexpr beamWidth{2}; + + SizeType const d2{2}; + auto const d3 = std::numeric_limits::max(); + auto const inputShape = ITensor::makeShape({batchSize, d2, d3}); + auto const outputShape = ITensor::makeShape({batchSize * beamWidth, d2, d3}); + + auto inputTensor = mManager->gpu(inputShape, nvinfer1::DataType::kINT8); + kernels::invokeFill(*inputTensor, value, *mStream); + mStream->synchronize(); + + auto outputTensor = mManager->gpu(outputShape, nvinfer1::DataType::kINT8); + kernels::tileTensor(*outputTensor, *inputTensor, beamWidth, *mStream); + mStream->synchronize(); + + auto bufferHost = mManager->copyFrom(*outputTensor, MemoryType::kCPU); + auto bufferPtr = bufferCast(*bufferHost); + auto constexpr expected = value; + for (std::size_t i = 0; i < bufferHost->getSize(); ++i) { - for (SizeType beam = 0; beam < beamWidth; ++beam) - { - for (SizeType i = 0; i < inputLength; ++i) - { - auto const inputIdx = tc::flat_index2(b, i, inputLength); - auto const outputIdx = tc::flat_index3(b, beam, i, beamWidth, inputLength); - EXPECT_EQ(outputPtr[outputIdx], input[inputIdx]) - << "Error at index (" << b << ',' << beam << ',' << i << ')'; - } - } + EXPECT_EQ(bufferPtr[i], expected) << "Error at index " << i; + } +} + +TEST_F(RuntimeKernelTest, TileInplaceInt8Large) +{ + std::int8_t constexpr value{3}; + SizeType constexpr batchSize{1}; + SizeType constexpr beamWidth{2}; + + SizeType const d2{2}; + auto const d3 = std::numeric_limits::max(); + auto const inputShape = ITensor::makeShape({batchSize, d2, d3}); + auto const outputShape = ITensor::makeShape({batchSize * beamWidth, d2, d3}); + + auto inputTensor = mManager->gpu(inputShape, nvinfer1::DataType::kINT8); + kernels::invokeFill(*inputTensor, value, *mStream); + + auto outputTensor = mManager->gpu(outputShape, nvinfer1::DataType::kINT8); + kernels::scatterTensor(*outputTensor, *inputTensor, beamWidth, *mStream); + kernels::tileTensorInplace(*outputTensor, beamWidth, *mStream); + + auto bufferHost = mManager->copyFrom(*outputTensor, MemoryType::kCPU); + auto bufferPtr = bufferCast(*bufferHost); + auto constexpr expected = value; + for (std::size_t i = 0; i < bufferHost->getSize(); ++i) + { + EXPECT_EQ(bufferPtr[i], expected) << "Error at index " << i; } } 
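For reference, the runtimeKernelTest.cpp changes above factor the fill/add/tile checks into shared helpers, and the property that verifyTiling asserts is simply that kernels::tileTensor (and tileTensorInplace) replicate every batch entry beamWidth times. A minimal NumPy sketch of that tiling semantics, illustrative only and not part of the patch (tile_tensor is a hypothetical name; only numpy is assumed):

    import numpy as np

    def tile_tensor(x: np.ndarray, beam_width: int) -> np.ndarray:
        # Repeat each batch entry beam_width times along the batch axis,
        # [batch, ...] -> [batch * beam_width, ...], then view the result as
        # [batch, beam_width, ...] the way the tests reshape before verification.
        tiled = np.repeat(x, beam_width, axis=0)
        return tiled.reshape(x.shape[0], beam_width, *x.shape[1:])

    x = np.arange(8, dtype=np.int32).reshape(2, 4)  # batchSize=2, inputLength=4
    out = tile_tensor(x, beam_width=3)
    # Every beam of a batch entry is a copy of that entry's input row,
    # which is exactly what verifyTiling checks element by element.
    assert all(np.array_equal(out[b, beam], x[b]) for b in range(2) for beam in range(3))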
diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 87d19f07540..e2ae69f7ad7 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -1,6 +1,6 @@ # Multi-stage Dockerfile ARG BASE_IMAGE=nvcr.io/nvidia/pytorch -ARG BASE_TAG=23.07-py3 +ARG BASE_TAG=23.08-py3 FROM ${BASE_IMAGE}:${BASE_TAG} as base @@ -24,15 +24,26 @@ RUN --mount=type=cache,target=/root/.cache \ pip uninstall -y tensorrt # Download & install internal TRT release -ARG TENSOR_RT_VERSION="9.0.1.4" +ARG TENSOR_RT_VERSION="9.1.0.1" ARG CUDA_VERSION="12.2" -ARG RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/secure/9.0.1/tars/TensorRT-${TENSOR_RT_VERSION}.Linux.x86_64-gnu.cuda-${CUDA_VERSION}.tar.gz +ARG RELEASE_URL_TRT +ARG TARGETARCH + RUN --mount=type=cache,target=/root/.cache \ - wget --no-verbose ${RELEASE_URL_TRT} -P /workspace && \ - tar -xf /workspace/TensorRT-${TENSOR_RT_VERSION}.Linux.x86_64-gnu.cuda-${CUDA_VERSION}.tar.gz -C /usr/local/ && \ + if [ -z "$RELEASE_URL_TRT"];then \ + ARCH=${TARGETARCH} && \ + if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi && \ + if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi && \ + if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi &&\ + if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04"; else OS1="Linux" && OS2="Linux";fi &&\ + RELEASE_URL_TRT=http://cuda-repo.nvidia.com/release-candidates/Libraries/TensorRT/v9.1/${TENSOR_RT_VERSION}-b6aa91dc/${CUDA_VERSION}-r535/${OS1}-${DIR_NAME}/tar/TensorRT-${TENSOR_RT_VERSION}.${OS2}.${ARCH}-gnu.cuda-${CUDA_VERSION}.tar.gz;\ + fi &&\ + wget --no-verbose ${RELEASE_URL_TRT} -O /workspace/TensorRT.tar && \ + tar -xf TensorRT.tar -C /usr/local/ && \ mv /usr/local/TensorRT-${TENSOR_RT_VERSION} /usr/local/tensorrt && \ - pip install /usr/local/tensorrt/python/tensorrt-9.0.1*cp310-none-linux_x86_64.whl && \ - rm -rf /workspace/TensorRT-${TENSOR_RT_VERSION}.Linux.x86_64-gnu.cuda-${CUDA_VERSION}.tar.gz + pip install /usr/local/tensorrt/python/tensorrt-*-cp310-*.whl && \ + rm -rf /workspace/TensorRT.tar + ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH} # Install latest Polygraphy @@ -59,6 +70,7 @@ FROM devel as release WORKDIR /app/tensorrt_llm COPY --from=wheel /src/tensorrt_llm/build/tensorrt_llm*.whl . +COPY --from=wheel /src/tensorrt_llm/cpp/include/ include/ RUN pip install tensorrt_llm*.whl && \ rm tensorrt_llm*.whl COPY README.md ./ diff --git a/docker/Makefile b/docker/Makefile index f4aa231251b..3e4153f81d3 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -42,7 +42,7 @@ endef %_build: @echo "Building docker image: $(IMAGE_WITH_TAG)" - docker build $(DOCKER_BUILD_OPTS) $(DOCKER_BUILD_ARGS) \ + DOCKER_BUILDKIT=1 docker build $(DOCKER_BUILD_OPTS) $(DOCKER_BUILD_ARGS) \ --progress $(DOCKER_PROGRESS) \ $(if $(BASE_IMAGE), --build-arg BASE_IMAGE=$(BASE_IMAGE)) \ $(if $(BASE_TAG), --build-arg BASE_TAG=$(BASE_TAG)) \ @@ -77,7 +77,7 @@ endif --workdir $(CODE_DIR) \ --hostname $(shell hostname)-$* \ --name $(CONTAINER_NAME)-$*-$(USER_NAME) \ - --tmpfs /tmp \ + --tmpfs /tmp:exec \ $(IMAGE_WITH_TAG)$(IMAGE_TAG_SUFFIX) $(RUN_CMD) devel_%: STAGE = devel diff --git a/docs/Doxygen b/docs/Doxygen new file mode 100644 index 00000000000..617416c16bf --- /dev/null +++ b/docs/Doxygen @@ -0,0 +1,2658 @@ +# Doxyfile 1.9.1 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. 
+# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "TensorRT-LLM" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = cpp_docs + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. 
+# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# By default Python docstrings are displayed as preformatted text and doxygen's +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the +# doxygen's special commands can be used and the contents of the docstring +# documentation blocks is shown as doxygen documentation. +# The default value is: YES. + +PYTHON_DOCSTRING = YES + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. 
+# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. +# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) + +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files). For instance to make doxygen treat .inc files +# as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. 
+# +# Note see also the list of default file extension mappings. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See https://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. 
+ +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use +# during processing. When set to 0 doxygen will based this on the number of +# cores available in the system. You can set it explicitly to a value larger +# than 0 to get more control over the balance between CPU load and processing +# speed. At this moment only the input processing can be done using multiple +# threads. Since this is still an experimental feature the default is set to 1, +# which efficively disables parallel processing. Please report any issues you +# encounter. Generating dot graphs in parallel is controlled by the +# DOT_NUM_THREADS setting. +# Minimum value: 0, maximum value: 32, default value: 1. 
+ +NUM_PROC_THREADS = 1 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# declarations. 
If set to NO, these declarations will be included in the +# documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. 
Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. 
+# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. 
If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# Possible values are: NO, YES and FAIL_ON_WARNINGS. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = ../cpp/include/tensorrt_llm/runtime + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. 
+# +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), +# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl, +# *.ucf, *.qsf and *.ice. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f18 \ + *.f \ + *.for \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf \ + *.ice + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). 
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+#   <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+ +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# entity all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: +# http://clang.llvm.org/) for more accurate parsing at the cost of reduced +# performance. This can be particularly helpful with template rich C++ code for +# which doxygen's built-in parser lacks the necessary type information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled and the CLANG_ADD_INC_PATHS tag is set to +# YES then doxygen will add the directory of each input to the include path. +# The default value is: YES. + +CLANG_ADD_INC_PATHS = YES + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. 
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the directory containing a file called compile_commands.json. This +# file is the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the +# options used when the source files were built. This is equivalent to +# specifying the -p option to a clang tool, such as clang-check. These options +# will then be passed to the parser. Any options specified with CLANG_OPTIONS +# will be added as well. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. + +CLANG_DATABASE_PATH = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. 
Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8. The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP = NO
+
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consist of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+ +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: +# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the main .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. 
+# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. 
When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg +# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see +# https://inkscape.org) to generate formulas as SVG images instead of PNGs for +# the HTML output. These images will generally look nicer at scaled resolutions. +# Possible values are: png (the default) and svg (looks nicer but requires the +# pdf2svg or inkscape tool). +# The default value is: png. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_FORMULA_FORMAT = png + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands +# to create new LaTeX commands to be used in formulas as building blocks. See +# the section "Including formulas" for details. + +FORMULA_MACROFILE = + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side JavaScript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdn.jsdelivr.net/npm/mathjax@2 + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. 
For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/