17 changes: 4 additions & 13 deletions cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,19 +20,15 @@
#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/common/algorithm.h"
#include "tensorrt_llm/common/optionalRef.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

namespace tensorrt_llm::runtime
{
class DecodingInput;
class DecodingOutput;
class GptDecoderBatched;
class SamplingConfig;
class SpeculativeDecodingMode;

namespace decoder
{
@@ -56,10 +52,6 @@ class CreateNewDecoderRequests : Algorithm
using CudaStream = tensorrt_llm::runtime::CudaStream;
using TensorPtr = runtime::ITensor::SharedPtr;
using SharedConstPtr = runtime::ITensor::SharedConstPtr;
using DecodingInput = runtime::DecodingInput;
using DecodingOutput = runtime::DecodingOutput;
using SpeculativeDecodingMode = runtime::SpeculativeDecodingMode;
using GptDecoderBatched = runtime::GptDecoderBatched;
template <typename T>
using OptionalRef = tensorrt_llm::common::OptionalRef<T>;

@@ -70,16 +62,15 @@ class CreateNewDecoderRequests : Algorithm
{
}

std::tuple<TensorPtr, std::vector<runtime::SamplingConfig>, std::vector<runtime::ITensor::SharedConstPtr>,
[[nodiscard]] std::tuple<TensorPtr, std::vector<SamplingConfig>, std::vector<SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
CudaStream const& runtimeStream, CudaStream const& decoderStream, SizeType32 maxSequenceLength,
SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;

[[nodiscard]] std::tuple<std::vector<runtime::ITensor::SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
[[nodiscard]] std::tuple<std::vector<SharedConstPtr>, std::vector<executor::LookaheadDecodingConfig>>
createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
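The header above leans on two idioms: forward declarations plus class-local aliases keep a public header free of heavy includes, and [[nodiscard]] makes it a compiler warning to ignore the returned tuple. A minimal sketch of the pattern with made-up names (Creator, rt::Config; none of these are the real TensorRT-LLM types):

#include <cstddef>
#include <tuple>
#include <vector>

namespace rt { class Config; }                       // forward declaration is enough for a reference parameter

class Creator
{
public:
    using ConfigVec = std::vector<int>;              // class-local alias reused in signatures

    [[nodiscard]] std::tuple<int, ConfigVec> operator()(rt::Config const& cfg) const;
};

// The .cpp that defines the call operator sees the full type:
namespace rt { class Config { public: int beams = 1; }; }

std::tuple<int, Creator::ConfigVec> Creator::operator()(rt::Config const& cfg) const
{
    return {cfg.beams, ConfigVec(static_cast<std::size_t>(cfg.beams), 0)};
}

int main()
{
    Creator creator;
    auto [beams, vec] = creator(rt::Config{});       // discarding the result would trigger a -Wunused-result warning
    return beams == 1 && vec.size() == 1 ? 0 : 1;
}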
19 changes: 10 additions & 9 deletions cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <list>
#include <memory>
#include <optional>
#include <utility>
@@ -56,9 +58,9 @@ enum class LlmRequestState : int32_t
/// used in layer-wise transmission
kDISAGG_GENERATION_TRANS_COMPLETE = 12, ///< Kv cache transmission are finished
kGENERATION_IN_PROGRESS = 13, ///< Generation phase is in progress
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed

// schedulable states ends
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed
kGENERATION_COMPLETE = 20, ///< Generation phase completed
kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 21, ///< Waiting context-only request transmitting the kv cache,
/// after computation finished
@@ -1074,7 +1076,6 @@ class GenericLlmRequest
TLLM_CHECK_WITH_INFO(prepopulatedPromptLen < promptLen,
"Invalid state: prepopulatedPromptLen (%d) >= promptLen (%d) for request %lu", prepopulatedPromptLen,
promptLen, mRequestId);
TLLM_CHECK(prepopulatedPromptLen < promptLen);

auto& prePromptLen = mUseDraftModel ? mPrepopulatedPromptLenDraft : mPrepopulatedPromptLenTarget;
auto& contextCurrentPosition = mUseDraftModel ? mContextCurrentPositionDraft : mContextCurrentPositionTarget;
@@ -1115,9 +1116,9 @@ class GenericLlmRequest
mDraftLogits = draftLogits;
}

[[nodiscard]] SizeType32 getNumDraftTokens() const
[[nodiscard]] SizeType32 getNumDraftTokens() const noexcept
{
return hasDraftTokens() ? mDraftTokens->size() : 0;
return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
}

void discardDraftTokens(SizeType32 numTokensToDiscard)
@@ -1378,17 +1379,17 @@ class GenericLlmRequest
mGenerationLogitsFragments.push_back(genLogits);
}

SizeType32 getGenerationLogitsFragmentsSize()
[[nodiscard]] SizeType32 getGenerationLogitsFragmentsSize() const noexcept
{
return mGenerationLogitsFragments.size();
return static_cast<SizeType32>(mGenerationLogitsFragments.size());
}

void clearGenerationLogitsFragments()
void clearGenerationLogitsFragments() noexcept
{
mGenerationLogitsFragments.clear();
}

bool hasAdditionalOutputs()
[[nodiscard]] bool hasAdditionalOutputs() const noexcept
{
return !mAdditionalContextOutputTensors.empty() || !mAdditionalGenerationOutputTensors.empty();
}
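A standalone sketch of the accessor pattern used in getNumDraftTokens above (hypothetical Request class; SizeType32 is assumed to be std::int32_t here, matching the runtime alias): container sizes are std::size_t, so the count is narrowed with an explicit cast rather than an implicit conversion, and the trivial accessors are marked [[nodiscard]], const, and noexcept.

#include <cstdint>
#include <memory>
#include <vector>

using SizeType32 = std::int32_t;
using TokenIdType = std::int32_t;

class Request
{
public:
    [[nodiscard]] bool hasDraftTokens() const noexcept { return mDraftTokens && !mDraftTokens->empty(); }

    [[nodiscard]] SizeType32 getNumDraftTokens() const noexcept
    {
        // size() returns std::size_t; narrow explicitly to the 32-bit API type.
        return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
    }

private:
    std::shared_ptr<std::vector<TokenIdType>> mDraftTokens = std::make_shared<std::vector<TokenIdType>>();
};

int main()
{
    Request req;
    return req.getNumDraftTokens() == 0 ? 0 : 1;     // no draft tokens yet
}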
3 changes: 2 additions & 1 deletion cpp/include/tensorrt_llm/executor/executor.h
@@ -1478,7 +1478,8 @@ class CacheTransceiverConfig
class ExecutorConfig
{
public:
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds = 180000000;
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();

static constexpr SizeType32 kDefaultIterStatsMaxIterations = 1000;

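The new spelling of kDefaultMaxSeqIdleMicroseconds is value-preserving: 3 minutes is 180 seconds, i.e. 180,000,000 microseconds, which is the old literal. A standalone check (not part of the real ExecutorConfig):

#include <chrono>
#include <cstdint>

int main()
{
    // 3 min = 180 s = 180,000,000 us, exactly the literal the constant used before.
    constexpr std::uint64_t kDefaultMaxSeqIdleMicroseconds
        = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();
    static_assert(kDefaultMaxSeqIdleMicroseconds == 180000000ULL, "3 minutes in microseconds");
    return 0;
}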
4 changes: 1 addition & 3 deletions cpp/include/tensorrt_llm/runtime/lookaheadModule.h
@@ -19,7 +19,6 @@
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"
#include <memory>

namespace tensorrt_llm::runtime
{
@@ -29,7 +28,6 @@ class LookaheadModule : public SpeculativeDecodingModule
public:
explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
: SpeculativeDecodingModule(maxDraftPathLen, maxDecodingDraftTokens, maxDecodingDraftTokens)
, mExecutionConfig()
{
}

@@ -43,7 +41,7 @@
mExecutionConfig = config;
}

executor::LookaheadDecodingConfig const getExecutionConfig() const
[[nodiscard]] executor::LookaheadDecodingConfig const& getExecutionConfig() const
{
return mExecutionConfig;
}
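getExecutionConfig now returns a const reference instead of a const-qualified value, so callers no longer copy the stored config on every call. A minimal sketch with std::string standing in for the config type (not the real LookaheadModule):

#include <cassert>
#include <string>
#include <utility>

class Module
{
public:
    void setConfig(std::string config) { mConfig = std::move(config); }

    // Hands out a reference to the stored member; the previous signature returned a copy.
    [[nodiscard]] std::string const& getConfig() const noexcept { return mConfig; }

private:
    std::string mConfig;
};

int main()
{
    Module module;
    module.setConfig("lookahead");
    std::string const& view = module.getConfig();    // binds directly to the member, no copy
    assert(&view == &module.getConfig());            // every call refers to the same object
    return 0;
}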
1 change: 1 addition & 0 deletions cpp/include/tensorrt_llm/runtime/modelConfig.h
@@ -21,6 +21,7 @@
#include "tensorrt_llm/runtime/lookaheadModule.h"
#include "tensorrt_llm/runtime/loraModule.h"
#include "tensorrt_llm/runtime/speculativeDecodingMode.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"

#include <NvInferRuntime.h>
#include <array>
@@ -39,7 +39,6 @@ using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;
namespace te = tensorrt_llm::executor;
namespace tk = tensorrt_llm::kernels;
namespace tr = tensorrt_llm::runtime;

namespace tensorrt_llm::batch_manager
Expand Down
4 changes: 2 additions & 2 deletions cpp/tensorrt_llm/runtime/bufferView.h
@@ -39,8 +39,8 @@ class BufferView : virtual public IBuffer

if (offset + size > mBuffer->getSize())
{
throw std::out_of_range(std::string("slice ") + std::to_string(offset + size) + " exceeds buffer size "
+ std::to_string(mBuffer->getSize()));
throw std::out_of_range(std::string("offset ") + std::to_string(offset) + std::string(" + size ")
+ std::to_string(size) + " exceeds buffer size " + std::to_string(mBuffer->getSize()));
}
}

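The reworded exception in BufferView reports the offset and the size separately, which makes it clearer which argument pushed the view past the end of the underlying buffer. A standalone sketch of the check (a free function with an invented name, not the real BufferView class):

#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>

void checkViewBounds(std::size_t offset, std::size_t size, std::size_t bufferSize)
{
    if (offset + size > bufferSize)
    {
        throw std::out_of_range(std::string("offset ") + std::to_string(offset) + " + size "
            + std::to_string(size) + " exceeds buffer size " + std::to_string(bufferSize));
    }
}

int main()
{
    try
    {
        checkViewBounds(8, 16, 20);                  // 8 + 16 = 24 > 20, so this throws
    }
    catch (std::out_of_range const& e)
    {
        std::cout << e.what() << '\n';               // prints: offset 8 + size 16 exceeds buffer size 20
    }
    return 0;
}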
36 changes: 17 additions & 19 deletions cpp/tests/e2e_tests/batch_manager/trtGptModelRealDecoderTest.cpp
@@ -284,8 +284,8 @@ void verifyOutput(RequestList const& finishedRequestList,
}

// Pick a different endId at random from one of the expected tokens
std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, TrtGptModelType const& modelType,
std::vector<SizeType32> const& givenInputLengths, SizeType32 const maxNewTokens, bool replaceLogits)
std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, std::vector<SizeType32> const& givenInputLengths,
SizeType32 const maxNewTokens, bool replaceLogits)
{
auto const nbGivenInputs = testData.nbGivenInputs;
auto const beamWidth = testData.beamWidth;
@@ -328,9 +328,9 @@ std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, TrtGptModelT
return endIds;
}

TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelType, ModelIds const modelIds,
BeamResult const& beamResult, ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId,
bool const replaceLogits, BufferManager& manager)
TestData loadTestData(ModelSpec const& modelSpec, ModelIds const modelIds, BeamResult const& beamResult,
ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
BufferManager& manager)
{
auto const [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(givenInput, modelIds.padId);
auto const& [beamWidth, resultsFile, contextLogitsFile, genLogitsFile, cumLogProbsFile, logProbsFile] = beamResult;
@@ -353,7 +353,7 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelTy

if (useRandomEndId)
{
testData.endIds = pickRandomEndIds(testData, modelType, givenInputLengths, maxNewTokens, replaceLogits);
testData.endIds = pickRandomEndIds(testData, givenInputLengths, maxNewTokens, replaceLogits);
}
else
{
@@ -409,9 +409,8 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelTy
}

std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> loadTestData(ModelSpec const& modelSpec,
TrtGptModelType const& modelType, ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths,
ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
BufferManager& manager)
ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths, ITensor const& givenInput,
SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits, BufferManager& manager)
{
// Map between beam width, and expected results for that beam width
std::unordered_map<SizeType32, TestData> beamWidthTestData;
@@ -424,8 +423,8 @@ std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> lo
EXPECT_EQ(std::find(beamWidths.begin(), beamWidths.end(), beamWidth), beamWidths.end());
beamWidths.push_back(beamWidth);

auto testData = loadTestData(modelSpec, modelType, modelIds, beamResult, givenInput, maxBeamWidth,
useRandomEndId, replaceLogits, manager);
auto testData = loadTestData(
modelSpec, modelIds, beamResult, givenInput, maxBeamWidth, useRandomEndId, replaceLogits, manager);
beamWidthTestData.emplace(beamWidth, std::move(testData));
}

@@ -435,9 +434,8 @@ std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> lo
RequestList runGptModelInference(std::shared_ptr<TrtGptModel>& trtGptModel, std::vector<SizeType32> const& beamWidths,
std::unordered_map<SizeType32, TestData> const& beamWidthTestData, SizeType32 batchSize, SizeType32 nbGivenInputs,
SizeType32 maxInputLength, SizeType32 padId, std::vector<SizeType32> const& givenInputLengths,
TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType,
TrtGptModelType modelType, int maxReqPerStep, bool prepopulateKVCache, bool enableStreamingMode,
bool enableBlockReuse)
TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType, int maxReqPerStep,
bool prepopulateKVCache, bool enableStreamingMode, bool enableBlockReuse)
{
// Fill the requests using givenInput
// requestList will have batchSize requests
@@ -641,8 +639,8 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds

auto const maxBeamWidth = executorConfig.getMaxBeamWidth();
// Load expected outputs for each beam width value
auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelType, modelIds, resultsFilesBeamWidths,
*givenInput, maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);
auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelIds, resultsFilesBeamWidths, *givenInput,
maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);

int const worldSize = modelSpec.mTPSize * modelSpec.mPPSize * modelSpec.mCPSize;
auto const worldConfig = WorldConfig::mpi(worldSize, modelSpec.mTPSize, modelSpec.mPPSize, modelSpec.mCPSize);
@@ -663,14 +661,14 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds
// Prepopulate KV cache for speculative decoding test
bool const prepopulateKVCache = modelSpec.mMaxDraftTokens > 0;
auto finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
maxReqPerStep, prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, maxReqPerStep,
prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);

if (prepopulateKVCache)
{
// Call the 2nd time with prefilled KV cache
finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType,
maxReqPerStep, false, enableStreamingMode, modelSpec.mKVCacheReuse);
}

2 changes: 1 addition & 1 deletion cpp/tests/unit_tests/batch_manager/llmRequestTest.cpp
@@ -56,7 +56,7 @@ TEST_F(LlmRequestTest, fromExecutorRequest)
EXPECT_EQ(llmReq.getState(), tb::LlmRequestState::kCONTEXT_INIT);
EXPECT_FALSE(llmReq.mSeqSlot);
// No speculative decoding config, draft tokens should be empty
EXPECT_EQ(llmReq.getDraftTokens()->size(), 0);
EXPECT_EQ(llmReq.getNumDraftTokens(), 0);
EXPECT_FALSE(llmReq.getEmbeddingBias().has_value());
EXPECT_FALSE(llmReq.getBadWordsList().has_value());
EXPECT_FALSE(llmReq.getStopWordsList().has_value());