17 changes: 4 additions & 13 deletions cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,19 +20,15 @@
#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/common/algorithm.h"
#include "tensorrt_llm/common/optionalRef.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

namespace tensorrt_llm::runtime
{
class DecodingInput;
class DecodingOutput;
class GptDecoderBatched;
class SamplingConfig;
class SpeculativeDecodingMode;

namespace decoder
{
@@ -56,10 +52,6 @@ class CreateNewDecoderRequests : Algorithm
using CudaStream = tensorrt_llm::runtime::CudaStream;
using TensorPtr = runtime::ITensor::SharedPtr;
using SharedConstPtr = runtime::ITensor::SharedConstPtr;
using DecodingInput = runtime::DecodingInput;
using DecodingOutput = runtime::DecodingOutput;
using SpeculativeDecodingMode = runtime::SpeculativeDecodingMode;
using GptDecoderBatched = runtime::GptDecoderBatched;
template <typename T>
using OptionalRef = tensorrt_llm::common::OptionalRef<T>;

@@ -70,16 +62,15 @@ class CreateNewDecoderRequests : Algorithm
{
}

std::tuple<TensorPtr, std::vector<runtime::SamplingConfig>, std::vector<runtime::ITensor::SharedConstPtr>,
[[nodiscard]] std::tuple<TensorPtr, std::vector<SamplingConfig>, std::vector<SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
CudaStream const& runtimeStream, CudaStream const& decoderStream, SizeType32 maxSequenceLength,
SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;

[[nodiscard]] std::tuple<std::vector<runtime::ITensor::SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
[[nodiscard]] std::tuple<std::vector<SharedConstPtr>, std::vector<executor::LookaheadDecodingConfig>>
createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
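The header above leans on two idioms: forward declarations plus class-local aliases keep a public header free of heavy includes, and [[nodiscard]] makes it a compiler warning to ignore the returned tuple. A minimal sketch of the pattern with made-up names (Creator, rt::Config; none of these are the real TensorRT-LLM types):

#include <cstddef>
#include <tuple>
#include <vector>

namespace rt { class Config; }                       // forward declaration is enough for a reference parameter

class Creator
{
public:
    using ConfigVec = std::vector<int>;              // class-local alias reused in signatures

    [[nodiscard]] std::tuple<int, ConfigVec> operator()(rt::Config const& cfg) const;
};

// The .cpp that defines the call operator sees the full type:
namespace rt { class Config { public: int beams = 1; }; }

std::tuple<int, Creator::ConfigVec> Creator::operator()(rt::Config const& cfg) const
{
    return {cfg.beams, ConfigVec(static_cast<std::size_t>(cfg.beams), 0)};
}

int main()
{
    Creator creator;
    auto [beams, vec] = creator(rt::Config{});       // discarding the result would trigger a -Wunused-result warning
    return beams == 1 && vec.size() == 1 ? 0 : 1;
}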
19 changes: 10 additions & 9 deletions cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <list>
#include <memory>
#include <optional>
#include <utility>
@@ -56,9 +58,9 @@ enum class LlmRequestState : int32_t
/// used in layer-wise transmission
kDISAGG_GENERATION_TRANS_COMPLETE = 12, ///< Kv cache transmission are finished
kGENERATION_IN_PROGRESS = 13, ///< Generation phase is in progress
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed

// schedulable states ends
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed
kGENERATION_COMPLETE = 20, ///< Generation phase completed
kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 21, ///< Waiting context-only request transmitting the kv cache,
/// after computation finished
@@ -1074,7 +1076,6 @@ class GenericLlmRequest
TLLM_CHECK_WITH_INFO(prepopulatedPromptLen < promptLen,
"Invalid state: prepopulatedPromptLen (%d) >= promptLen (%d) for request %lu", prepopulatedPromptLen,
promptLen, mRequestId);
TLLM_CHECK(prepopulatedPromptLen < promptLen);

auto& prePromptLen = mUseDraftModel ? mPrepopulatedPromptLenDraft : mPrepopulatedPromptLenTarget;
auto& contextCurrentPosition = mUseDraftModel ? mContextCurrentPositionDraft : mContextCurrentPositionTarget;
@@ -1115,9 +1116,9 @@ class GenericLlmRequest
mDraftLogits = draftLogits;
}

[[nodiscard]] SizeType32 getNumDraftTokens() const
[[nodiscard]] SizeType32 getNumDraftTokens() const noexcept
{
return hasDraftTokens() ? mDraftTokens->size() : 0;
return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
}

void discardDraftTokens(SizeType32 numTokensToDiscard)
@@ -1378,17 +1379,17 @@ class GenericLlmRequest
mGenerationLogitsFragments.push_back(genLogits);
}

SizeType32 getGenerationLogitsFragmentsSize()
[[nodiscard]] SizeType32 getGenerationLogitsFragmentsSize() const noexcept
{
return mGenerationLogitsFragments.size();
return static_cast<SizeType32>(mGenerationLogitsFragments.size());
}

void clearGenerationLogitsFragments()
void clearGenerationLogitsFragments() noexcept
{
mGenerationLogitsFragments.clear();
}

bool hasAdditionalOutputs()
[[nodiscard]] bool hasAdditionalOutputs() const noexcept
{
return !mAdditionalContextOutputTensors.empty() || !mAdditionalGenerationOutputTensors.empty();
}
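A standalone sketch of the accessor pattern used in getNumDraftTokens above (hypothetical Request class; SizeType32 is assumed to be std::int32_t here, matching the runtime alias): container sizes are std::size_t, so the count is narrowed with an explicit cast rather than an implicit conversion, and the trivial accessors are marked [[nodiscard]], const, and noexcept.

#include <cstdint>
#include <memory>
#include <vector>

using SizeType32 = std::int32_t;
using TokenIdType = std::int32_t;

class Request
{
public:
    [[nodiscard]] bool hasDraftTokens() const noexcept { return mDraftTokens && !mDraftTokens->empty(); }

    [[nodiscard]] SizeType32 getNumDraftTokens() const noexcept
    {
        // size() returns std::size_t; narrow explicitly to the 32-bit API type.
        return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
    }

private:
    std::shared_ptr<std::vector<TokenIdType>> mDraftTokens = std::make_shared<std::vector<TokenIdType>>();
};

int main()
{
    Request req;
    return req.getNumDraftTokens() == 0 ? 0 : 1;     // no draft tokens yet
}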
3 changes: 2 additions & 1 deletion cpp/include/tensorrt_llm/executor/executor.h
@@ -1478,7 +1478,8 @@ class CacheTransceiverConfig
class ExecutorConfig
{
public:
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds = 180000000;
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();

static constexpr SizeType32 kDefaultIterStatsMaxIterations = 1000;

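The new spelling of kDefaultMaxSeqIdleMicroseconds is value-preserving: 3 minutes is 180 seconds, i.e. 180,000,000 microseconds, which is the old literal. A standalone check (not part of the real ExecutorConfig):

#include <chrono>
#include <cstdint>

int main()
{
    // 3 min = 180 s = 180,000,000 us, exactly the literal the constant used before.
    constexpr std::uint64_t kDefaultMaxSeqIdleMicroseconds
        = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();
    static_assert(kDefaultMaxSeqIdleMicroseconds == 180000000ULL, "3 minutes in microseconds");
    return 0;
}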
4 changes: 1 addition & 3 deletions cpp/include/tensorrt_llm/runtime/lookaheadModule.h
@@ -19,7 +19,6 @@
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"
#include <memory>

namespace tensorrt_llm::runtime
{
@@ -29,7 +28,6 @@ class LookaheadModule : public SpeculativeDecodingModule
public:
explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
: SpeculativeDecodingModule(maxDraftPathLen, maxDecodingDraftTokens, maxDecodingDraftTokens)
, mExecutionConfig()
{
}

@@ -43,7 +41,7 @@
mExecutionConfig = config;
}

executor::LookaheadDecodingConfig const getExecutionConfig() const
[[nodiscard]] executor::LookaheadDecodingConfig const& getExecutionConfig() const
{
return mExecutionConfig;
}
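getExecutionConfig now returns a const reference instead of a const-qualified value, so callers no longer copy the stored config on every call. A minimal sketch with std::string standing in for the config type (not the real LookaheadModule):

#include <cassert>
#include <string>
#include <utility>

class Module
{
public:
    void setConfig(std::string config) { mConfig = std::move(config); }

    // Hands out a reference to the stored member; the previous signature returned a copy.
    [[nodiscard]] std::string const& getConfig() const noexcept { return mConfig; }

private:
    std::string mConfig;
};

int main()
{
    Module module;
    module.setConfig("lookahead");
    std::string const& view = module.getConfig();    // binds directly to the member, no copy
    assert(&view == &module.getConfig());            // every call refers to the same object
    return 0;
}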
1 change: 1 addition & 0 deletions cpp/include/tensorrt_llm/runtime/modelConfig.h
@@ -21,6 +21,7 @@
#include "tensorrt_llm/runtime/lookaheadModule.h"
#include "tensorrt_llm/runtime/loraModule.h"
#include "tensorrt_llm/runtime/speculativeDecodingMode.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"

#include <NvInferRuntime.h>
#include <array>
@@ -39,7 +39,6 @@ using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;
namespace te = tensorrt_llm::executor;
namespace tk = tensorrt_llm::kernels;
namespace tr = tensorrt_llm::runtime;

namespace tensorrt_llm::batch_manager
Expand Down
4 changes: 2 additions & 2 deletions cpp/tensorrt_llm/runtime/bufferView.h
@@ -39,8 +39,8 @@ class BufferView : virtual public IBuffer

if (offset + size > mBuffer->getSize())
{
throw std::out_of_range(std::string("slice ") + std::to_string(offset + size) + " exceeds buffer size "
+ std::to_string(mBuffer->getSize()));
throw std::out_of_range(std::string("offset ") + std::to_string(offset) + std::string(" + size ")
+ std::to_string(size) + " exceeds buffer size " + std::to_string(mBuffer->getSize()));
}
}

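The reworded exception in BufferView reports the offset and the size separately, which makes it clearer which argument pushed the view past the end of the underlying buffer. A standalone sketch of the check (a free function with an invented name, not the real BufferView class):

#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>

void checkViewBounds(std::size_t offset, std::size_t size, std::size_t bufferSize)
{
    if (offset + size > bufferSize)
    {
        throw std::out_of_range(std::string("offset ") + std::to_string(offset) + " + size "
            + std::to_string(size) + " exceeds buffer size " + std::to_string(bufferSize));
    }
}

int main()
{
    try
    {
        checkViewBounds(8, 16, 20);                  // 8 + 16 = 24 > 20, so this throws
    }
    catch (std::out_of_range const& e)
    {
        std::cout << e.what() << '\n';               // prints: offset 8 + size 16 exceeds buffer size 20
    }
    return 0;
}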
36 changes: 17 additions & 19 deletions cpp/tests/e2e_tests/batch_manager/trtGptModelRealDecoderTest.cpp
@@ -284,8 +284,8 @@ void verifyOutput(RequestList const& finishedRequestList,
}

// Pick a different endId at random from one of the expected tokens
std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, TrtGptModelType const& modelType,
std::vector<SizeType32> const& givenInputLengths, SizeType32 const maxNewTokens, bool replaceLogits)
std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, std::vector<SizeType32> const& givenInputLengths,
SizeType32 const maxNewTokens, bool replaceLogits)
{
auto const nbGivenInputs = testData.nbGivenInputs;
auto const beamWidth = testData.beamWidth;
@@ -328,9 +328,9 @@ std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, TrtGptModelT
return endIds;
}

TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelType, ModelIds const modelIds,
BeamResult const& beamResult, ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId,
bool const replaceLogits, BufferManager& manager)
TestData loadTestData(ModelSpec const& modelSpec, ModelIds const modelIds, BeamResult const& beamResult,
ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
BufferManager& manager)
{
auto const [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(givenInput, modelIds.padId);
auto const& [beamWidth, resultsFile, contextLogitsFile, genLogitsFile, cumLogProbsFile, logProbsFile] = beamResult;
@@ -353,7 +353,7 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelTy

if (useRandomEndId)
{
testData.endIds = pickRandomEndIds(testData, modelType, givenInputLengths, maxNewTokens, replaceLogits);
testData.endIds = pickRandomEndIds(testData, givenInputLengths, maxNewTokens, replaceLogits);
}
else
{
@@ -409,9 +409,8 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelTy
}

std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> loadTestData(ModelSpec const& modelSpec,
TrtGptModelType const& modelType, ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths,
ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
BufferManager& manager)
ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths, ITensor const& givenInput,
SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits, BufferManager& manager)
{
// Map between beam width, and expected results for that beam width
std::unordered_map<SizeType32, TestData> beamWidthTestData;
@@ -424,8 +423,8 @@ std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> lo
EXPECT_EQ(std::find(beamWidths.begin(), beamWidths.end(), beamWidth), beamWidths.end());
beamWidths.push_back(beamWidth);

auto testData = loadTestData(modelSpec, modelType, modelIds, beamResult, givenInput, maxBeamWidth,
useRandomEndId, replaceLogits, manager);
auto testData = loadTestData(
modelSpec, modelIds, beamResult, givenInput, maxBeamWidth, useRandomEndId, replaceLogits, manager);
beamWidthTestData.emplace(beamWidth, std::move(testData));
}

@@ -435,9 +434,8 @@ std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> lo
RequestList runGptModelInference(std::shared_ptr<TrtGptModel>& trtGptModel, std::vector<SizeType32> const& beamWidths,
std::unordered_map<SizeType32, TestData> const& beamWidthTestData, SizeType32 batchSize, SizeType32 nbGivenInputs,
SizeType32 maxInputLength, SizeType32 padId, std::vector<SizeType32> const& givenInputLengths,
TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType,
TrtGptModelType modelType, int maxReqPerStep, bool prepopulateKVCache, bool enableStreamingMode,
bool enableBlockReuse)
TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType, int maxReqPerStep,
bool prepopulateKVCache, bool enableStreamingMode, bool enableBlockReuse)
{
// Fill the requests using givenInput
// requestList will have batchSize requests
@@ -641,8 +639,8 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds

auto const maxBeamWidth = executorConfig.getMaxBeamWidth();
// Load expected outputs for each beam width value
auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelType, modelIds, resultsFilesBeamWidths,
*givenInput, maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);
auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelIds, resultsFilesBeamWidths, *givenInput,
maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);

int const worldSize = modelSpec.mTPSize * modelSpec.mPPSize * modelSpec.mCPSize;
auto const worldConfig = WorldConfig::mpi(worldSize, modelSpec.mTPSize, modelSpec.mPPSize, modelSpec.mCPSize);
@@ -663,14 +661,14 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds
// Prepopulate KV cache for speculative decoding test
bool const prepopulateKVCache = modelSpec.mMaxDraftTokens > 0;
auto finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
maxReqPerStep, prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, maxReqPerStep,
prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);

if (prepopulateKVCache)
{
// Call the 2nd time with prefilled KV cache
finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType,
maxReqPerStep, false, enableStreamingMode, modelSpec.mKVCacheReuse);
}

2 changes: 1 addition & 1 deletion cpp/tests/unit_tests/batch_manager/llmRequestTest.cpp
@@ -56,7 +56,7 @@ TEST_F(LlmRequestTest, fromExecutorRequest)
EXPECT_EQ(llmReq.getState(), tb::LlmRequestState::kCONTEXT_INIT);
EXPECT_FALSE(llmReq.mSeqSlot);
// No speculative decoding config, draft tokens should be empty
EXPECT_EQ(llmReq.getDraftTokens()->size(), 0);
EXPECT_EQ(llmReq.getNumDraftTokens(), 0);
EXPECT_FALSE(llmReq.getEmbeddingBias().has_value());
EXPECT_FALSE(llmReq.getBadWordsList().has_value());
EXPECT_FALSE(llmReq.getStopWordsList().has_value());