diff --git a/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h b/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h index 27b3de2e3d3..272758936ed 100644 --- a/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h +++ b/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h @@ -97,12 +97,6 @@ class MaxUtilizationScheduler : public BaseCapacityScheduler RequestList const& activeRequests) const; private: - /// @return {fitsKvCache, fitsPeft} - std::pair<bool, bool> trySchedulingRequestMaxUtilization(kv_cache_manager::BaseKVCacheManager const& kvCacheManager, - OptionalRef<BasePeftCacheManager const> peftCacheManager, std::shared_ptr<LlmRequest> const& req, - RequestVector& scheduledRequests, SizeType32& numScheduledBlocks, SizeType32& numScheduledPeftPages, - std::unordered_set<uint64_t>& seenTaskIds) const; - SizeType32 mMaxNumRequests; /// @brief Boolean that indicates if multiple micro batches might be in flight bool mManyMicroBatches; diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index 7e89241a2e4..335ce77a4cd 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -52,6 +52,7 @@ static constexpr SizeType32 kPrimaryLevel = 0; static constexpr SizeType32 kSecondaryLevel = 1; class KVCacheBlock; +class BlockManager; class KVCacheManager; class KVCacheTransferManager; @@ -68,6 +69,33 @@ using LoraTaskIdType = tensorrt_llm::runtime::LoraTaskIdType; template <typename T> using OptionalRef = tensorrt_llm::common::OptionalRef<T>; +struct TempAttentionWindowInputs +{ + bool pagedContextFMHA; + SizeType32 maxInputLen; + SizeType32 maxNumTokens; +}; + +struct WindowSizeMetadata +{ + SizeType32 absolutePoolsOffset; // cumulative number of pools up to this manager + SizeType32 numPools; // number of managed pools + SizeType32 maxTokenNum; // Maximum token length (including bubble) + SizeType32 maxBlocksPerSeq; + SizeType32 maxNumBlocks; // Number of primary+secondary blocks allotted to the windowSize + SizeType32 temporaryAttentionWindow; // Temporary kv cache length per sequence. + // Only needed when chunked context + sliding window attention are used + // together, and only considered when allocating blocks.
+ + std::string toString() + { + return tensorrt_llm::common::fmtstr( + "WindowSizeMetadata{ .absolutePoolsOffset=%d, .numPools=%d, .maxTokenNum=%d, .maxBlocksPerSeq=%d, " + ".maxNumBlocks=%d, .temporaryAttentionWindow=%d }", + absolutePoolsOffset, numPools, maxTokenNum, maxBlocksPerSeq, maxNumBlocks, temporaryAttentionWindow); + } +}; + struct BlockKey { bool usesExtraIds = false; @@ -300,22 +328,38 @@ class GenerationRequest using SizeType32 = tensorrt_llm::runtime::SizeType32; explicit GenerationRequest(LlmRequest::RequestIdType requestId, SizeType32 numTokens, SizeType32 beamWidth, - SizeType32 maxBlocks, SizeType32 cyclicThreshold, SizeType32 numPools = 1, + std::map<SizeType32, WindowSizeMetadata> const& windowSizeToMetadata, executor::KvCacheRetentionConfig kvCacheRetentionConfig = executor::KvCacheRetentionConfig()) : mRequestId(requestId) , mNumTokens(numTokens) , mBeamWidth(beamWidth) - , mCacheBlockIds(beamWidth) - , mCacheBlockIndices{runtime::BufferManager::cpu( - runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}), - runtime::TRTDataType<tensorrt_llm::kernels::KVCacheIndex>::value)} , mKvCacheRetentionConfig(std::move(kvCacheRetentionConfig)) - , mCyclicThreshold(cyclicThreshold) - { - auto cacheBlockIdsRange = runtime::BufferRange<tensorrt_llm::kernels::KVCacheIndex>(*mCacheBlockIndices); - std::fill(cacheBlockIdsRange.begin(), cacheBlockIdsRange.end(), - tensorrt_llm::kernels::KVCacheIndex{ - std::numeric_limits<tensorrt_llm::kernels::KVCacheIndex::UnderlyingType>::max()}); + // min window size + sink bubble length + // Why use the minimum window size: + // Chunked Prefill + Reuse calls `setPrepopulatedPromptLen()` which sets + // `mContextCurrentPosition` - this cannot be done for some window sizes and + // not for others; the state needs to remain identical for all window sizes. So + // we currently resort to strictly disabling the reuse code path for all window + // sizes at once or enabling it for all window sizes at once.
, mCyclicThreshold(windowSizeToMetadata.cbegin()->second.maxTokenNum) + { + auto const numWindowSizes = windowSizeToMetadata.size(); + mCacheBlockIds.reserve(numWindowSizes); + mCacheBlockIndices.reserve(numWindowSizes); + for (auto const [windowSize, metadata] : windowSizeToMetadata) + { + mCacheBlockIds[windowSize] = std::vector<std::vector<KVCacheBlock::IdType>>(beamWidth); + auto const numPools = metadata.numPools; + auto const maxBlocks = metadata.maxBlocksPerSeq; + mCacheBlockIndices[windowSize] + = runtime::BufferManager::cpu(runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}), + runtime::TRTDataType<tensorrt_llm::kernels::KVCacheIndex>::value); + auto cacheBlockIdsRange + = runtime::BufferRange<tensorrt_llm::kernels::KVCacheIndex>(*mCacheBlockIndices.at(windowSize)); + std::fill(cacheBlockIdsRange.begin(), cacheBlockIdsRange.end(), + tensorrt_llm::kernels::KVCacheIndex{ + std::numeric_limits<tensorrt_llm::kernels::KVCacheIndex::UnderlyingType>::max()}); + } } void addNewTokens(SizeType32 n) @@ -345,42 +389,43 @@ class GenerationRequest return mBeamWidth; } - [[nodiscard]] std::vector<std::vector<KVCacheBlock::IdType>> const& getCacheBlockIds() const + [[nodiscard]] std::vector<std::vector<KVCacheBlock::IdType>> const& getCacheBlockIds(SizeType32 windowSize) const { - return mCacheBlockIds; + return mCacheBlockIds.at(windowSize); } - [[nodiscard]] runtime::ITensor& getCacheBlockIndices() + [[nodiscard]] runtime::ITensor& getCacheBlockIndices(SizeType32 windowSize) { - return *mCacheBlockIndices; + return *(mCacheBlockIndices.at(windowSize)); } - [[nodiscard]] runtime::ITensor const& getCacheBlockIndices() const + [[nodiscard]] runtime::ITensor const& getCacheBlockIndices(SizeType32 windowSize) const { - return *mCacheBlockIndices; + return *(mCacheBlockIndices.at(windowSize)); } - void addCacheBlock(SizeType32 beamIdx, KVCacheBlock::IdType blockId) + void addCacheBlock(SizeType32 windowSize, SizeType32 beamIdx, KVCacheBlock::IdType blockId) { - mCacheBlockIds.at(beamIdx).push_back(blockId); + mCacheBlockIds.at(windowSize).at(beamIdx).push_back(blockId); } - void changeCacheBlock(SizeType32 beamIdx, SizeType32 pagedBlockIdx, KVCacheBlock::IdType blockId) + void changeCacheBlock( + SizeType32 windowSize, SizeType32 beamIdx, SizeType32 pagedBlockIdx, KVCacheBlock::IdType blockId) { - mCacheBlockIds.at(beamIdx).at(pagedBlockIdx) = blockId; + mCacheBlockIds.at(windowSize).at(beamIdx).at(pagedBlockIdx) = blockId; } - void clearCacheBlocks() + void clearCacheBlocks(SizeType32 windowSize) { - for (auto& beamBlockIds : mCacheBlockIds) + for (auto& beamBlockIds : mCacheBlockIds.at(windowSize)) { beamBlockIds.clear(); } } - void removeLastBlock() + void removeLastBlock(SizeType32 windowSize) { - for (auto& beamBlockIds : mCacheBlockIds) + for (auto& beamBlockIds : mCacheBlockIds.at(windowSize)) { beamBlockIds.pop_back(); } @@ -411,14 +456,14 @@ SizeType32 mNumTokens; // Number of beams SizeType32 mBeamWidth; - // List of block ids allocated for each beam of the sequence - std::vector<std::vector<KVCacheBlock::IdType>> mCacheBlockIds; - // Tensor of block indices allocated for each beam of the sequence - runtime::ITensor::SharedPtr mCacheBlockIndices; + // List of block ids allocated per window size, for each beam of the sequence + std::unordered_map<SizeType32, std::vector<std::vector<KVCacheBlock::IdType>>> mCacheBlockIds; + // Tensor of block indices allocated per window size, for each beam of the sequence + std::unordered_map<SizeType32, runtime::ITensor::SharedPtr> mCacheBlockIndices; // The retention priority to assign to decode blocks executor::KvCacheRetentionConfig mKvCacheRetentionConfig; - // Number of tokens at which the KV Cache begins sliding + // Number of tokens at which the KV Cache begins sliding [for the minimum attention window] SizeType32 mCyclicThreshold; };
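The constructor above keys every per-request bookkeeping structure by window size, so callers address blocks as (windowSize, beam, blockIdx). A minimal sketch of reading that state back through the accessors declared in this header; `seq` and `windowSizeToMetadata` are hypothetical names mirroring the constructor arguments, not part of the patch:

```cpp
// Sketch, assuming `seq` is a GenerationRequest built from `windowSizeToMetadata`.
for (auto const& [windowSize, metadata] : windowSizeToMetadata)
{
    // Each window size owns an independent list of block ids per beam, plus an
    // offset tensor shaped [numPools, beamWidth, 2, maxBlocksPerSeq].
    auto const& beamBlockIds = seq.getCacheBlockIds(windowSize).at(/*beamIdx=*/0);
    TLLM_LOG_DEBUG("windowSize=%d holds %zu blocks (maxBlocksPerSeq=%d)", windowSize,
        beamBlockIds.size(), metadata.maxBlocksPerSeq);
}
```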
@@ -456,7 +501,7 @@ class KVCacheBlockPool } }; -// The BlockManager manages the metadata of KVCacheBlocks. +// The WindowBlockManager manages the metadata of KVCacheBlocks. // It manages multiple arrays of cache blocks called pools. // Layers with the same number of kv heads are grouped under the same pool. // Each pool has shape [max_blocks, num_layers, 2, num_kv_heads, tokens_per_block, head_size], where num_layers refers @@ -464,6 +509,7 @@ class KVCacheBlockPool // The metadata of KVCacheBlocks is shared between layers, so each block spans all of the managed pool - an allocated // block matches some chunk of memory in each pool. The shape of the chunk in every pool is [2, num_kv_heads, // tokens_per_block, head_size]. The size per block and number of blocks are pre-determined and set in the constructor. +// WindowBlockManager maintains a list of free blocks at any time. // // FP4 KV caches allocate additional pools for block scale factors. These pools have the same // shape as the regular KV pools, except that the last dim is head_size / N where N is determined @@ -471,9 +517,9 @@ class KVCacheBlockPool // // BlockManager maintains a list of free blocks at any time. // Alloc pops off the block at the front, and Free pushes it back to the vector. -// BlockManager maintains a vector of lists of request ids to allocated blocks +// WindowBlockManager maintains a vector of lists of request ids to allocated blocks // per sequence. This can be used to Free all blocks belonging to a sequence. -class BlockManager +class WindowBlockManager { public: using SizeType32 = tensorrt_llm::runtime::SizeType32; @@ -482,17 +528,17 @@ class BlockManager using BlockMap = std::unordered_multimap<size_t, BlockPtr>; using BlockMapIterRange = std::pair<BlockMap::const_iterator, BlockMap::const_iterator>; - explicit BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, - SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, - SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks, - CacheType cacheType = CacheType::kSELF, - std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt, - std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false, - bool enablePartialReuse = true, bool copyOnPartialReuse = true); + explicit WindowBlockManager(nvinfer1::DataType dtype, SizeType32 windowSize, + std::vector<SizeType32> const& managedLayers, std::vector<SizeType32> const& numKvHeadsPerLayer, + SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, + SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, + bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority, + std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse, + bool copyOnPartialReuse); - ~BlockManager(); + ~WindowBlockManager(); - void allocatePools(nvinfer1::DataType dtype, bool useUvm); + void allocatePools(bool useUvm); void releasePools(); @@ -507,15 +553,17 @@ class BlockManager //! \brief Allocate new block for each beam of the sequence. //! \details Might free cached blocks if no free blocks are available. - void allocateBlock(GenerationRequest& sequence, bool shareAmongBeams = false); + void allocateBlock(GenerationRequest& sequence, bool shareAmongBeams); void replaceSharedBlock(GenerationRequest& sequence, SizeType32 blockIdx); //! \brief Get the ids of all newly allocated (not reused) blocks for the sequence. std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds(GenerationRequest const& sequence) const; //! \brief Release blocks of the sequence.
Store blocks for reuse if llmRequest is provided. - void releaseBlocks(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt); + void storeBlocksForReuse(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest); + + //! \brief Release blocks of the sequence. + void releaseBlocks(GenerationRequest& sequence); //! \brief Simulate freeing all blocks for that sequence to check impact on number of free blocks void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId); @@ -523,6 +571,16 @@ class BlockManager //! \brief Release last block in the sequence void releaseLastBlock(GenerationRequest& sequence); + [[nodiscard]] SizeType32 getWindowSize() const noexcept + { + return mWindowSize; + } + + [[nodiscard]] std::string const& getLogPrefix() const noexcept + { + return mLogPrefix; + } + [[nodiscard]] SizeType32 getNumFreeBlocks() const noexcept; [[nodiscard]] SizeType32 getNumAllocTotalBlocks() const @@ -550,15 +608,12 @@ class BlockManager return mMissedBlocks; } - [[nodiscard]] std::deque<executor::KVCacheEvent> getLatestEvents( - std::optional<std::chrono::milliseconds> timeout) const; - [[nodiscard]] bool hasFreeBlocks(SizeType32 numRequired = 1) const noexcept { return getNumFreeBlocks() >= numRequired; } - [[nodiscard]] bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const noexcept + [[nodiscard]] bool schedulingHasFreeBlocks(SizeType32 numRequired) const noexcept { return mSchedulingNumFreeBlocks >= numRequired; } @@ -599,14 +654,9 @@ class BlockManager return std::count_if(mPools.begin(), mPools.end(), [](auto const& pool) { return !pool.containsBlockScales; }); } - [[nodiscard]] runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 poolIdx) const - { - return mPools.at(poolIdx).primaryPtr; - } - - [[nodiscard]] runtime::ITensor::SharedPtr getSecondaryPool(SizeType32 poolIdx) const + [[nodiscard]] KVCacheBlockPool const& getPool(SizeType32 poolIdx) const { - return mPools.at(poolIdx).secondaryPtr; + return mPools.at(poolIdx); } [[nodiscard]] bool containsBlockScales(SizeType32 poolIdx) const @@ -614,11 +664,6 @@ class BlockManager return mPools.at(poolIdx).containsBlockScales; } - [[nodiscard]] SizeType32 getNumLayers() const - { - return mNumLayers; - } - [[nodiscard]] SizeType32 getNumPrimaryBlocks() const { return mNumPrimaryBlocks; @@ -629,14 +674,9 @@ class BlockManager return mNumSecondaryBlocks; } - [[nodiscard]] CacheType getCacheType() const - { - return mCacheType; - } - [[nodiscard]] SizeType32 getLayerPoolIdx(SizeType32 layerIdx) const { - return mLayerToPool.at(layerIdx); + return mLayerToPoolIndex.at(layerIdx); } //! \brief Maps a global layer index to its layer index within its pool. //! //! \details gives the layer index into the pool returned by getLayerPoolIdx(i). [[nodiscard]] SizeType32 getPoolLayerIdx(SizeType32 layerIdx) const { - return mLayerIndexToPoolLayerIndex.at(layerIdx); + return mLayerToIndexWithinPool.at(layerIdx); } - //! \brief Get index in pool to K or V block. - //! \param blockId the blockId as returned by getBlockId() - //! \param fieldIdx either 0 (K) or 1 (V), - //! \param poolIdx the index of the pool for which the index is calculated (each pool has different strides) - [[nodiscard]] kernels::KVCacheIndex getKOrVBlockIndex( - KVCacheBlock::IdType blockId, SizeType32 fieldIdx, SizeType32 poolIdx) const; + void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx, + SizeType32 blockIdx, KVCacheBlock::IdType blockId) const; //! \brief Bring offloaded block from secondary to primary memory. //!
\details Does nothing of block is already in primary memory. + //! \details Does nothing if block is already in primary memory. void onboardBlock(BlockPtr const& offloadBlock); //! \brief Bring block from primary to secondary memory. - //! \details Does nothing of block is already in secondary memory. + //! \details Does nothing if block is already in secondary memory. void offloadBlock(BlockPtr const& block); //! \brief Find first new block that must be allocated for context phase and return its concatenated token vectors. @@ -675,24 +711,35 @@ //! \brief Perform per-request bookkeeping void refreshBlocks(); - void flushIterationEvents() - { - if (mEventManager) - { - mEventManager->flush(); - } - } - [[nodiscard]] static bool blockInRadixTree(BlockPtr const& block); - [[nodiscard]] bool verifyQueueIntegrity(); + //! \brief Store blocks in cached blocks. + //! \param blockKeys Key of each block. + //! \param blockIds Id of each block. + void storeBlocks(std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds); - //! \brief Store context blocks - void storeContextBlocks(GenerationRequest& sequence, LlmRequest const& llmRequest); + void addBlockToHashMap(BlockPtr const& block); - [[nodiscard]] bool isEnableHashKey() const + void removeBlockFromHashMap(BlockPtr const& block); + + [[nodiscard]] bool verifyQueueIntegrity(); + + // Only needed when sliding window attention + paged context fmha are used together. + // In that case, a temporary kv cache buffer with maximum chunk size (maxNumTokens) is needed. + // TODO: There are several things that can be improved later. + // 1. A dynamic temporary kv cache allocation based on real chunk size might be needed. + // 2. Reuse the same temporary kv cache buffer among all layers in the same pool. + [[nodiscard]] SizeType32 calculateTemporaryAttentionWindow( + std::optional<TempAttentionWindowInputs> const& inputs) const { - return mEnableHashKey; + + if (inputs && inputs->pagedContextFMHA && (inputs->maxInputLen > mWindowSize)) + { + auto window = std::min(inputs->maxNumTokens, inputs->maxInputLen - mWindowSize); + window = std::max(window, 0); // clamp negative values to 0 + return window; + } + return 0; } private: @@ -702,11 +749,6 @@ class BlockManager //! \brief Add single block to all beams of sequence. void addBlockToAllBeams(BlockPtr& block, GenerationRequest& sequence); - //! \brief Store blocks in cached blocks. - //! \param blockKeys Key of each block. - //! \param blockIds Id of each block. - void storeBlocks(std::vector<BlockKey> blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds); - //! \brief Try to load blocks from cache. Allocate new blocks if necessary. //! \param blockKeys Key of each block. //! \param sequence Sequence to which blocks are assigned. @@ -720,17 +762,16 @@ class BlockManager std::optional<std::chrono::milliseconds> durationMs = std::nullopt); //! \brief Free block from previous block and claim it from free blocks list. - void claimLeafBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt, + void claimLeafBlock(BlockPtr const& block, std::optional<executor::RetentionPriority> priority = std::nullopt, std::optional<std::chrono::milliseconds> durationMs = std::nullopt); - void addBlockToHashMap(BlockPtr block); - - void removeBlockFromHashMap(BlockPtr block); - -private:
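A worked example for `calculateTemporaryAttentionWindow` above: with paged context FMHA enabled, `windowSize = 4096`, `maxInputLen = 10240` and `maxNumTokens = 8192`, the result is `min(8192, 10240 - 4096) = 6144` extra tokens of temporary KV cache per sequence. Below is a self-contained sketch of the same arithmetic; it is a standalone reimplementation for illustration, not the class member itself:

```cpp
#include <algorithm>
#include <cstdint>
#include <optional>

struct TempAttentionWindowInputs
{
    bool pagedContextFMHA;
    std::int32_t maxInputLen;
    std::int32_t maxNumTokens;
};

// Mirrors WindowBlockManager::calculateTemporaryAttentionWindow for one window size:
// only chunked context (paged context FMHA) running past the sliding window needs
// temporary blocks, and never more than one chunk (maxNumTokens) worth.
std::int32_t temporaryAttentionWindow(
    std::optional<TempAttentionWindowInputs> const& inputs, std::int32_t windowSize)
{
    if (inputs && inputs->pagedContextFMHA && inputs->maxInputLen > windowSize)
    {
        return std::max(std::min(inputs->maxNumTokens, inputs->maxInputLen - windowSize), 0);
    }
    return 0; // no chunked-context/sliding-window interaction; nothing extra needed
}

// temporaryAttentionWindow(TempAttentionWindowInputs{true, 10240, 8192}, 4096) == 6144
```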
//! \brief For FP4 quantization. Creates pool objects for FP4 block scalars. void createBlockScalePools(SizeType32 blockSize); +private: + nvinfer1::DataType mDataType; + SizeType32 mWindowSize; + // Number of blocks in pools SizeType32 mNumPrimaryBlocks; SizeType32 mNumSecondaryBlocks; @@ -741,20 +782,17 @@ class BlockManager // Pool per unique numKvHeads in the model std::vector<KVCacheBlockPool> mPools; - // Matching of model layers to their pools - std::vector<SizeType32> mLayerToPool; - // See getPoolLayerIdx - std::vector<SizeType32> mLayerIndexToPoolLayerIndex; + // Matching layers to their respective pools: {<layer idx>: <pool idx>, <layer idx>: <pool idx>}, etc. + std::unordered_map<SizeType32, SizeType32> mLayerToPoolIndex; + // Matching layers to their index *within* their respective pools: {..., <layer idx>: <index within pool>}. See + // getPoolLayerIdx + std::unordered_map<SizeType32, SizeType32> mLayerToIndexWithinPool; // Whether offloaded blocks should be onboarded before reuse. bool mOnboardBlocks; // Buffer manager runtime::BufferManager mBufferManager; - // Size of a single KV heads - SizeType32 mSizePerHead; - // Number of layers - SizeType32 mNumLayers; // Used to keep track of number of free blocks during scheduling SizeType32 mSchedulingNumFreeBlocks; // Number of tokens per one block @@ -789,22 +827,338 @@ class BlockManager // max_num_tokens(For DeepSeek). Controlled by mCacheType SizeType32 mKVFactor; std::set<KVCacheBlock::IdType> reusedBlockIds; + std::string const mLogPrefix; // Number of reused tokens double mReusedTokens; // Total number of input tokens double mTotalInputTokens; - // Whether or not to maintain a hashmap of blocks. bool mEnableHashKey; - // Whether blocks that are partially matched should be reused. bool mEnablePartialReuse; - // Whether partially matched blocks that are already in use should be copied and reused. bool mCopyOnPartialReuse; +}; + +class BlockManager +{ +public: + using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>; + using SizeType32 = tensorrt_llm::runtime::SizeType32; + using BaseEvictionPolicy = tensorrt_llm::batch_manager::eviction_policy::BaseEvictionPolicy; + + explicit BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, + SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, + SizeType32 maxNumSequences, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength, + SizeType32 maxBeamWidth, std::vector<SizeType32> const& maxAttentionWindowVec, + std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType = CacheType::kSELF, + std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt, + std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false, + bool enablePartialReuse = true, bool copyOnPartialReuse = true); + + //! \brief Calculate the number of blocks each window size heap receives of blocksIn{Primary/Secondary}Pool + //! \details Example: (total=16384, uniqueWindowSizeToLayers={1024: [1], 4096: [0, 4, 5], 8192: [2, 3]}) + //! Would Return: {1024: 565, 4096: 6780, 8192: 9039} [sums to total]. + //! See: TEST_F(KVCacheManagerTest, BlockManagerTestBlocksPerWindowSize). + //!
\return Map of each window size to its allotted number of blocks. + static std::map<SizeType32, SizeType32> blocksPerWindowSize( + SizeType32 totalBlocks, std::map<SizeType32, std::vector<SizeType32>> const& uniqueWindowSizeToLayers); + + void allocatePools(bool useUvm); + + void addSequence(GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks, + LlmRequest& llmRequest, SizeType32 windowSize); + + void addSequence( + GenerationRequest& sequence, SizeType32 numBlocks, SizeType32 unsharedBlockIdx, SizeType32 windowSize); + + void allocateBlock(GenerationRequest& sequence, SizeType32 windowSize); + + void replaceSharedBlock(GenerationRequest& sequence, SizeType32 windowSize, SizeType32 blockIdx); + + std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds( + GenerationRequest const& sequence, SizeType32 windowSize) const; + + void releaseBlocks(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt); + + void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId); + + void releaseLastBlock(GenerationRequest& sequence, SizeType32 windowSize); + + void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx, + SizeType32 blockIdx, KVCacheBlock::IdType blockId, SizeType32 windowSize) const; + + // WILL NOT WORK FOR VARIABLE WINDOW ATTENTION + [[nodiscard]] std::optional<BlockKey> findNewContextBlock( + VecUniqueTokens const& uniqueTokens, LlmRequest const& llmRequest) const; + + //! \brief Bring offloaded block from secondary to primary memory for window size. + //! \details Does nothing if block is already in primary memory. + void onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowSize); + + //! \brief Bring block from primary to secondary memory for window size. + //! \details Does nothing if block is already in secondary memory. + void offloadBlock(BlockPtr const& block, SizeType32 windowSize); + + void storeBlocks(std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds, + SizeType32 windowSize) + { + mWindowBlockManagers.at(windowSize).storeBlocks(blockKeys, blockIds); + } + + [[nodiscard]] bool verifyQueueIntegrity(SizeType32 windowSize); + + void releasePools(); + + void startScheduling(); + + [[nodiscard]] std::map<SizeType32, SizeType32> getNumFreeBlocksPerWindowSize() const + { + std::map<SizeType32, SizeType32> numFreeBlocksPerWindowSize; + for (auto const& [windowSize, manager] : mWindowBlockManagers) + { + numFreeBlocksPerWindowSize[windowSize] = manager.getNumFreeBlocks(); + } + return numFreeBlocksPerWindowSize; + } + + [[nodiscard]] SizeType32 getNumFreeBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumFreeBlocks(); }); + } + + [[nodiscard]] bool schedulingHasFreeBlocks(SizeType32 numRequired, SizeType32 windowSize) const + { + return mWindowBlockManagers.at(windowSize).schedulingHasFreeBlocks(numRequired); + } + + [[nodiscard]] SizeType32 getNumAllocTotalBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumAllocTotalBlocks(); }); + } + + [[nodiscard]] SizeType32 getNumAllocNewBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumAllocNewBlocks(); }); + } + + [[nodiscard]] SizeType32 getNumReusedBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumReusedBlocks(); }); + } + + [[nodiscard]] SizeType32 getNumMissedBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumMissedBlocks(); }); + } + + [[nodiscard]] SizeType32 getNumLayers() const + { + return mNumLayers; + } + + [[nodiscard]] CacheType getCacheType() const + { + return mCacheType; + } + + [[nodiscard]] SizeType32 getLayerPoolIdx(SizeType32 layerIdx) const
+ { + auto const& manager = windowManagerByLayer(layerIdx); + auto const absoluteOffset = absolutePoolsOffset(manager); + auto const relativePoolIndex = manager.getLayerPoolIdx(layerIdx); + return absoluteOffset + relativePoolIndex; + } + + [[nodiscard]] SizeType32 getPoolLayerIdx(SizeType32 layerIdx) const + { + return windowManagerByLayer(layerIdx).getPoolLayerIdx(layerIdx); + } + + [[nodiscard]] SizeType32 getTokensPerBlock() const noexcept + { + return mTokensPerBlock; + } + + [[nodiscard]] SizeType32 getStreamDevice() const + { + return mStream->getDevice(); + } + + [[nodiscard]] std::deque<executor::KVCacheEvent> getLatestEvents( + std::optional<std::chrono::milliseconds> timeout) const; + + void flushIterationEvents() + { + if (mEventManager) + { + mEventManager->flush(); + } + } + + [[nodiscard]] SizeType32 getPoolWindowSize(SizeType32 poolIdx) const + { + return mAbsolutePoolToWindowSize.at(poolIdx); + } + + [[nodiscard]] SizeType32 getBlockSize(SizeType32 poolIdx) const + { + return getPool(poolIdx).blockSize; + } + + [[nodiscard]] SizeType32 getNumPools(bool includeBlockScalePools = true) const + { + return sumWindows( + [includeBlockScalePools](auto const& manager) { return manager.getNumPools(includeBlockScalePools); }); + } + + [[nodiscard]] std::map<SizeType32, WindowSizeMetadata> const& getWindowSizesMetadata() const noexcept + { + return mWindowSizeToMetadata; + } + + [[nodiscard]] WindowSizeMetadata getWindowSizeMetadata(SizeType32 windowSize) const noexcept + { + return mWindowSizeToMetadata.at(windowSize); + } + + [[nodiscard]] bool isVariableWindow() const noexcept + { + return mIsVariableWindow; + } + + [[nodiscard]] SizeType32 getMaxBlockPerSeqWhenSingleWindowSize() const + { + TLLM_CHECK_WITH_INFO(!isVariableWindow(), + "This function was called assuming there is only a single window size, and therefore a single " + "maxBlocksPerSeq"); + auto const windowSize = windowManagerByLayer(0).getWindowSize(); + auto const onlyWindowSizeMetadata = getWindowSizeMetadata(windowSize); + return onlyWindowSizeMetadata.maxBlocksPerSeq; + } + + [[nodiscard]] bool isVariableGQA() const noexcept + { + return mIsVariableGQA; + } + + [[nodiscard]] runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 poolIdx) const + { + return getPool(poolIdx).primaryPtr; + } + + [[nodiscard]] runtime::ITensor::SharedPtr getSecondaryPool(SizeType32 poolIdx) const + { + return getPool(poolIdx).secondaryPtr; + } + + [[nodiscard]] SizeType32 getNumAllocatedBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumAllocatedBlocks(); }); + } + + [[nodiscard]] SizeType32 getMaxNumBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getMaxNumBlocks(); }); + } + + [[nodiscard]] BlockPtr const& getBlockById(KVCacheBlock::IdType blockId, SizeType32 windowSize) const + { + return mWindowBlockManagers.at(windowSize).getBlockById(blockId); + } + + [[nodiscard]] WindowBlockManager::BlockMapIterRange getBlocksByHash(size_t hash, SizeType32 windowSize) const + { + return mWindowBlockManagers.at(windowSize).getBlocksByHash(hash); + } + + [[nodiscard]] SizeType32 getNumPrimaryBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumPrimaryBlocks(); }); + } + + [[nodiscard]] bool containsBlockScales(SizeType32 poolIdx) const + { + return getPool(poolIdx).containsBlockScales; + } + + void addBlockToHashMap(BlockPtr const& block, SizeType32 windowSize) + { + mWindowBlockManagers.at(windowSize).addBlockToHashMap(block); + } + + void removeBlockFromHashMap(BlockPtr const& block, SizeType32 windowSize) + { +
mWindowBlockManagers.at(windowSize).removeBlockFromHashMap(block); + } + + //! \brief Store context blocks + void storeContextBlocks(GenerationRequest& sequence, LlmRequest const& llmRequest); + + [[nodiscard]] static bool isUseOneMoreBlock( + SizeType32 windowSize, std::optional<SizeType32> maxSequenceLength, SizeType32 maxBeamWidth) + { + bool const isCyclicWindowSize = maxSequenceLength.has_value() && maxSequenceLength.value() > windowSize; + bool const isBeamSearch = maxBeamWidth > 1; + return isCyclicWindowSize && isBeamSearch; + } + + //! \brief Perform per-request bookkeeping + void refreshBlocks(); + + [[nodiscard]] runtime::BufferManager const& getBufferManager(SizeType32 windowSize) const + { + return mWindowBlockManagers.at(windowSize).getBufferManager(); + } + + [[nodiscard]] KVCacheBlockPool const& getPool(SizeType32 poolIdx) const + { + auto const windowSize = getPoolWindowSize(poolIdx); + auto const relativePoolIndex = mAbsolutePoolToRelativePoolIndex.at(poolIdx); + return mWindowBlockManagers.at(windowSize).getPool(relativePoolIndex); + } private: - friend class KVCacheManager; + [[nodiscard]] WindowBlockManager const& windowManagerByLayer(SizeType32 layerIdx) const + { + return mWindowBlockManagers.at(mLayerToWindowSize.at(layerIdx)); + } + + [[nodiscard]] SizeType32 sumWindows(std::function<SizeType32(WindowBlockManager const&)> produce) const + { + return std::accumulate(mWindowBlockManagers.cbegin(), mWindowBlockManagers.cend(), SizeType32{0}, + [&produce](SizeType32 acc, auto const& manager) { return acc + produce(manager.second); }); + } + + [[nodiscard]] SizeType32 absolutePoolsOffset(WindowBlockManager const& manager) const + { + auto const windowSize = manager.getWindowSize(); + return getWindowSizeMetadata(windowSize).absolutePoolsOffset; + } + +private: + SizeType32 mNumLayers; + SizeType32 mTokensPerBlock; + std::shared_ptr<KVCacheEventManager> mEventManager; + CudaStreamPtr mStream; + CacheType mCacheType; + + bool mIsVariableWindow; + bool mIsVariableGQA; + + std::map<SizeType32, WindowBlockManager> mWindowBlockManagers; + std::map<SizeType32, WindowSizeMetadata> mWindowSizeToMetadata; + std::vector<SizeType32> mLayerToWindowSize; + std::vector<SizeType32> mAbsolutePoolToWindowSize; + std::vector<SizeType32> mAbsolutePoolToRelativePoolIndex; +}; + +struct OffsetTableDimensions +{ + SizeType32 maxBlocksPerSeq; + SizeType32 numPools; + CacheType cacheType; +};
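The `sumWindows` helper above is the single aggregation primitive of the new BlockManager: every whole-manager statistic is the sum of that statistic over the per-window `WindowBlockManager`s. A standalone sketch of the pattern, using simplified stand-in types rather than the actual classes:

```cpp
#include <cstdint>
#include <functional>
#include <map>
#include <numeric>

using SizeType32 = std::int32_t;

struct FakeWindowManager // stand-in for WindowBlockManager
{
    SizeType32 numFreeBlocks;
};

// Accumulate a per-window statistic across all window sizes, as BlockManager does.
SizeType32 sumWindows(std::map<SizeType32, FakeWindowManager> const& managers,
    std::function<SizeType32(FakeWindowManager const&)> produce)
{
    return std::accumulate(managers.cbegin(), managers.cend(), SizeType32{0},
        [&produce](SizeType32 acc, auto const& entry) { return acc + produce(entry.second); });
}

// sumWindows({{1024, {3}}, {4096, {5}}}, [](auto const& m) { return m.numFreeBlocks; }) == 8
```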
class BaseKVCacheManager @@ -816,7 +1170,7 @@ virtual ~BaseKVCacheManager() {} - virtual void allocatePools(nvinfer1::DataType dtype, bool useUvm = false) = 0; + virtual void allocatePools(bool useUvm = false) = 0; virtual void releasePools() = 0; @@ -834,9 +1188,10 @@ // only used by test [[nodiscard]] virtual SizeType32 getNumReusedBlocks() const noexcept = 0; + [[nodiscard]] virtual KvCacheStats getKvCacheStats() const = 0; - [[nodiscard]] virtual SizeType32 getMaxBlocksPerSeq() const = 0; + [[nodiscard]] virtual OffsetTableDimensions getOffsetTableDimensions() const = 0; [[nodiscard]] virtual std::deque<executor::KVCacheEvent> getLatestEvents( std::optional<std::chrono::milliseconds> timeout = std::nullopt) const @@ -848,13 +1203,16 @@ /// iterations /// @param req The request for which we need to calculate the number of needed KV cache blocks /// @return The number of blocks - [[nodiscard]] virtual SizeType32 getNeededBlocksOneStep(LlmRequest const& req, bool twoStepsLookAhead) const = 0; + [[nodiscard]] virtual SizeType32 getNeededBlocksOneStep( + LlmRequest const& req, bool twoStepsLookAhead, SizeType32 windowSize) const + = 0; /// @brief Function that computes the number of KV cache blocks needed to advance a request to completion (i.e. for /// maxNewTokens) /// @param req The request for which we need to calculate the number of needed KV cache blocks /// @return The number of blocks - [[nodiscard]] virtual SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const = 0; + [[nodiscard]] virtual SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req, SizeType32 windowSize) const + = 0; /// @brief Increase size for request at seqSlotIdx. Allocate new KV cache block(s) if needed. virtual void addToken(LlmRequest::RequestIdType requestId) = 0; @@ -890,8 +1248,6 @@ [[nodiscard]] virtual bool isEnableBlockReuse() const = 0; - [[nodiscard]] virtual bool isUseOneMoreBlock() const = 0; - // void removeToken(SizeType32 seqSlotIdx); virtual void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) = 0; @@ -910,18 +1266,18 @@ //! \details These blocks become reusable from next step. virtual void storeContextBlocks(LlmRequest const& llmRequest) = 0; - [[nodiscard]] virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0; - + //! \brief Get the block ids of a request [per beam] **for a given window size block manager** [[nodiscard]] virtual std::vector<std::vector<KVCacheBlock::IdType>> const& getCacheBlockIds( LlmRequest::RequestIdType requestId, SizeType32 windowSize) const = 0; + //! \brief Get the block ids of a batch of requests [per beam] **for a given window size block manager** [[nodiscard]] virtual std::vector<std::vector<std::vector<KVCacheBlock::IdType>>> getBatchCacheBlockIds( std::vector<LlmRequest::RequestIdType> const& requestIds, SizeType32 windowSize) const = 0; [[nodiscard]] virtual std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds( LlmRequest::RequestIdType requestId, SizeType32 windowSize) const = 0; [[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0; @@ -972,9 +1328,9 @@ class KVCacheManager : public BaseKVCacheManager KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector<SizeType32> const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, CudaStreamPtr stream, - std::optional<SizeType32> maxSequenceLength, bool enableBlockReuse = false, bool onboardBlocks = true, - CacheType cacheType = CacheType::kSELF, + std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength, + bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt, std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false, bool enablePartialReuse = true, bool copyOnpartialReuse = true); @@ -982,9 +1338,9 @@ KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector<SizeType32> const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, int64_t stream, - std::optional<SizeType32> maxSequenceLength, bool enableBlockReuse = false, bool onboardBlocks = true, - CacheType cacheType = CacheType::kSELF, + std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs,
nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, int64_t stream, std::optional<SizeType32> maxSequenceLength, + bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt, std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true, bool copyOnpartialReuse = true); @@ -992,9 +1348,9 @@ class KVCacheManager : public BaseKVCacheManager KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector<SizeType32> const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, CudaStreamPtr stream, - std::optional<SizeType32> maxSequenceLength, bool enableBlockReuse = true, bool onboardBlocks = true, - CacheType cacheType = CacheType::kSELF, + std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength, + bool enableBlockReuse = true, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt, std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false, bool enablePartialReuse = true, bool copyOnpartialReuse = true); @@ -1002,13 +1358,14 @@ class KVCacheManager : public BaseKVCacheManager KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector<SizeType32> const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, int64_t stream, - std::optional<SizeType32> maxSequenceLength, bool enableBlockReuse = false, bool onboardBlocks = true, - CacheType cacheType = CacheType::kSELF, bool enablePartialReuse = true, bool copyOnpartialReuse = true); + std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, int64_t stream, std::optional<SizeType32> maxSequenceLength, + bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, + bool enablePartialReuse = true, bool copyOnpartialReuse = true); ~KVCacheManager() override = default; - void allocatePools(nvinfer1::DataType dtype, bool useUvm = false) override; + void allocatePools(bool useUvm = false) override; void releasePools() override; @@ -1076,9 +1433,15 @@ class KVCacheManager : public BaseKVCacheManager return kvCacheStats; } - [[nodiscard]] SizeType32 getMaxBlocksPerSeq() const override + [[nodiscard]] OffsetTableDimensions getOffsetTableDimensions() const override { - return mMaxBlocksPerSeq; + OffsetTableDimensions dims; + // We use mMaxAttentionWindow here because we prefer to have a single offset table for simplicity, + // and we don't mind that it is as wide as the widest window, because that cost is negligible.
+ dims.maxBlocksPerSeq = mBlockManager.getWindowSizeMetadata(mMaxAttentionWindow).maxBlocksPerSeq; + dims.numPools = mBlockManager.getNumPools(); + dims.cacheType = mBlockManager.getCacheType(); + return dims; } [[nodiscard]] std::deque<executor::KVCacheEvent> getLatestEvents( @@ -1096,13 +1459,15 @@ class KVCacheManager : public BaseKVCacheManager /// iterations /// @param req The request for which we need to calculate the number of needed KV cache blocks /// @return The number of blocks - [[nodiscard]] SizeType32 getNeededBlocksOneStep(LlmRequest const& req, bool twoStepsLookAhead) const override; + [[nodiscard]] SizeType32 getNeededBlocksOneStep( + LlmRequest const& req, bool twoStepsLookAhead, SizeType32 windowSize) const override; /// @brief Function that computes the number of KV cache blocks remaining to advance a request to completion (i.e. /// for maxNewTokens); the allocated blocks are excluded /// @param req The request for which we need to calculate the number of needed KV cache blocks /// @return The number of blocks - [[nodiscard]] SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const override; + [[nodiscard]] SizeType32 getRemainingBlocksToCompletion( + LlmRequest const& req, SizeType32 windowSize) const override; /// @brief Increase size for request with requestId. Allocate new KV cache block(s) if needed. void addToken(LlmRequest::RequestIdType requestId) override; @@ -1149,16 +1514,6 @@ class KVCacheManager : public BaseKVCacheManager return mEnableBlockReuse; } - [[nodiscard]] bool isEnableHashKey() const - { - return mEnableHashKey; - } - - [[nodiscard]] bool isUseOneMoreBlock() const override - { - return mUseOneMoreBlock; - } - void removeToken(LlmRequest::RequestIdType requestId); void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override; @@ -1210,14 +1565,14 @@ class KVCacheManager : public BaseKVCacheManager [[nodiscard]] static SizeType32 calculateMaxBlockRequirementsPerBeam(SizeType32 sequenceLength, SizeType32 sinkTokenLength, SizeType32 maxAttentionWindow, SizeType32 tokensPerBlock); - bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const override; - - std::vector<std::vector<KVCacheBlock::IdType>> const& getCacheBlockIds(LlmRequest::RequestIdType requestId) const override; + std::vector<std::vector<KVCacheBlock::IdType>> const& getCacheBlockIds( + LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override; std::vector<std::vector<std::vector<KVCacheBlock::IdType>>> getBatchCacheBlockIds( - std::vector<LlmRequest::RequestIdType> const& requestIds) const override; + std::vector<LlmRequest::RequestIdType> const& requestIds, SizeType32 windowSize) const override; - std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds(LlmRequest::RequestIdType requestId) const override; + std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds( + LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override; runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const override; @@ -1250,12 +1605,9 @@ class KVCacheManager : public BaseKVCacheManager SizeType32 sinkTokenLength, SizeType32 blockCapacity, SizeType32 beamWidth, SizeType32 tokensPerBlock); private: - void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx, - SizeType32 blockIdx, KVCacheBlock::IdType blockId) const; - - void cacheBlockOffsets(GenerationRequest& seq); - void cacheNewBlockOffsets(GenerationRequest& seq); - void updateNewBlockPointer(GenerationRequest& seq, SizeType32 blockIdx); + void cacheBlockOffsets(GenerationRequest& seq, SizeType32 windowSize); + void cacheNewBlockOffsets(GenerationRequest& seq, SizeType32 windowSize); + void updateNewBlockPointer(GenerationRequest& seq,
SizeType32 windowSize, SizeType32 blockIdx); void updateToken(GenerationRequest& sequence, bool addToken); private: @@ -1263,22 +1615,13 @@ class KVCacheManager : public BaseKVCacheManager SizeType32 mMaxNumSequences; // Maximum beam width SizeType32 mMaxBeamWidth; - // Maximum number of blocks per sequence - SizeType32 mMaxBlocksPerSeq; + nvinfer1::DataType mDataType; // Maximum kv cache length per sequence SizeType32 mMaxAttentionWindow; - // Minimum kv cache length per sequence - SizeType32 mMinAttentionWindow; - // Temporary kv cache length per sequence. - // Only needed when chunked context + sliding window attention are used together. - // And it should only be considered when allocating blocks. - SizeType32 mTemporaryAttentionWindow; // Number of tokens per block SizeType32 mTokensPerBlock; // Number of tokens to fill up the sink tokens to a full block size SizeType32 mSinkBubbleLength; - // Maximum token length (including bubble) - SizeType32 mMaxTokenNum; // Number of tokens in the sink blocks SizeType32 mSinkBlockTokenLength; // Block manager @@ -1289,8 +1632,6 @@ class KVCacheManager : public BaseKVCacheManager bool mEnableBlockReuse; // Whether to enable finding blocks by their hash; ignored when reuse is enabled bool mEnableHashKey; - // Whether use one more block for each sequence - bool mUseOneMoreBlock; // Mutex to protect access to mSequences mutable std::mutex mSequencesMtx; // buffers for static tensors, will be created after allocating pools @@ -1299,4 +1640,90 @@ class KVCacheManager : public BaseKVCacheManager runtime::ITensor::SharedPtr mBlockScalePoolPointers; }; +class NoEvictScheduledBlocksManager +{ +public: + explicit NoEvictScheduledBlocksManager(BaseKVCacheManager const& kvCacheManager) + : mKvCacheManager(kvCacheManager) + , mAvailableBlocks(mKvCacheManager.getBlockManager().getNumFreeBlocksPerWindowSize()) + { + } + + void decrementReservedBlocks(LlmRequest const& req) + { + for (auto& [windowSize, availableBlocks] : mAvailableBlocks) + { + availableBlocks -= mKvCacheManager.getRemainingBlocksToCompletion(req, windowSize); + } + } + + bool enoughAvailableBlocks(LlmRequest const& req) + { + return std::all_of(mAvailableBlocks.cbegin(), mAvailableBlocks.cend(), + [this, &req](auto const& pair) + { + auto const& [windowSize, availableBlocks] = pair; + auto const neededBlocks = mKvCacheManager.getRemainingBlocksToCompletion(req, windowSize); + return neededBlocks <= availableBlocks; + }); + } + +private: + BaseKVCacheManager const& mKvCacheManager; + std::map<SizeType32, SizeType32> mAvailableBlocks; +};
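The two helper classes here (this one and `MaxUtilizationScheduledBlocksManager` just below) replace the schedulers' scalar `reservedBlocks`/`numScheduledBlocks` arithmetic: a request now only fits if it fits for every window size. A sketch of the intended call pattern in a no-evict scheduling loop; `kvCacheManager`, `requests` and `scheduledRequests` are hypothetical, mirroring the GuaranteedNoEvictScheduler changes later in this diff:

```cpp
// Sketch only; see GuaranteedNoEvictScheduler::impl below for the real usage.
kv_cache_manager::NoEvictScheduledBlocksManager blocksManager(kvCacheManager);
for (auto const& req : requests)
{
    if (blocksManager.enoughAvailableBlocks(*req)) // must hold for every window size
    {
        scheduledRequests.emplace_back(req);
        blocksManager.decrementReservedBlocks(*req); // reserve blocks per window size
    }
    else
    {
        break; // guaranteed no-evict: stop at the first request that does not fit
    }
}
```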
+class MaxUtilizationScheduledBlocksManager +{ +public: + MaxUtilizationScheduledBlocksManager(BaseKVCacheManager const& kvCacheManager, bool twoStepsLookAhead) + : mKvCacheManager(kvCacheManager) + , mTwoStepsLookAhead(twoStepsLookAhead) + { + auto const& windowSizes = mKvCacheManager.getBlockManager().getWindowSizesMetadata(); + for (auto const& [windowSize, _] : windowSizes) + { + mNumScheduledBlocks[windowSize] = 0; + } + } + + std::optional<std::map<SizeType32, SizeType32>> prepareNewNumberOfBlocksIfWeEndUpScheduling(LlmRequest const& req) + { + std::map<SizeType32, SizeType32> blocksIfScheduled; + for (auto const& [windowSize, numScheduled] : mNumScheduledBlocks) + { + auto const required = mKvCacheManager.getNeededBlocksOneStep(req, mTwoStepsLookAhead, windowSize); + + TLLM_LOG_DEBUG("MaxUtilizationScheduler: request ID %lu required blocks %i for window size %i", + req.mRequestId, required, windowSize); + + auto const scheduledTotal = numScheduled + required; + bool const hasFreeBlocks + = mKvCacheManager.getBlockManager().schedulingHasFreeBlocks(scheduledTotal, windowSize); + if (!hasFreeBlocks) + { + return std::nullopt; + } + blocksIfScheduled[windowSize] = scheduledTotal; + } + return blocksIfScheduled; + } + + void updateScheduledBlocks(std::map<SizeType32, SizeType32> const& numBlocksIfScheduled) + { + assert(numBlocksIfScheduled.size() == mNumScheduledBlocks.size()); + for (auto const& [windowSize, blocksIfScheduled] : numBlocksIfScheduled) + { + TLLM_LOG_DEBUG( + "MaxUtilizationScheduler: scheduled blocks %i for window size %i", blocksIfScheduled, windowSize); + mNumScheduledBlocks.at(windowSize) = blocksIfScheduled; + } + } + +private: + BaseKVCacheManager const& mKvCacheManager; + std::map<SizeType32, SizeType32> mNumScheduledBlocks; + bool const mTwoStepsLookAhead; +}; + } // namespace tensorrt_llm::batch_manager::kv_cache_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h index 476dce90779..8a44e472728 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h @@ -31,24 +31,28 @@ class BlockRange { }; - BlockRange(BaseKVCacheManager const& cacheManager, LlmRequest::RequestIdType requestId, SizeType32 beam, - SizeType32 poolIdx = 0) - : mManager(&cacheManager) - , mPool(cacheManager.getBlockManager().getPrimaryPool(poolIdx)) - , mBlockIds(cacheManager.getSequence(requestId).getCacheBlockIds().at(beam)) + static BlockRange fromOldAllocatedBlockIds(BaseKVCacheManager const& cacheManager, + LlmRequest::RequestIdType requestId, SizeType32 beam = kFIRST_AND_ONLY_BEAM) { + assert(kFIRST_AND_ONLY_BEAM == beam); + auto const windowSize = firstWindowSize(cacheManager); + auto const blockIds = cacheManager.getSequence(requestId).getCacheBlockIds(windowSize).at(kFIRST_AND_ONLY_BEAM); + return BlockRange(cacheManager, blockIds, requestId); } - BlockRange(BaseKVCacheManager const& cacheManager, std::vector<SizeType32> blockIds, SizeType32 poolIdx = 0) - : mManager(&cacheManager) - , mPool(cacheManager.getBlockManager().getPrimaryPool(poolIdx)) - , mBlockIds(std::move(blockIds)) + static BlockRange fromNewlyAllocatedBlockIds( + BaseKVCacheManager const& cacheManager, LlmRequest::RequestIdType requestId) { + auto const windowSize = firstWindowSize(cacheManager); + auto const blockIds = cacheManager.getNewlyAllocatedBlockIds(requestId, windowSize); + return BlockRange(cacheManager, blockIds, requestId); } - BlockRange(runtime::ITensor::SharedPtr pool, std::vector<SizeType32> const& blockIds) + BlockRange(runtime::ITensor::SharedPtr pool, std::vector<SizeType32> const& blockIds) // Only used in tests : mManager{nullptr} , mPool{std::move(pool)} + , mWindowSize{0} + , mRequestId{0} , mBlockIds{blockIds} { TLLM_CHECK(mPool); @@ -84,25 +88,51 @@ class BlockRange auto& blockManager = mManager->getBlockManager(); for (auto id : mBlockIds) { - blockHashes.emplace_back(blockManager.getBlockById(id)->getHash()); + blockHashes.emplace_back(blockManager.getBlockById(id, mWindowSize)->getHash()); } return blockHashes; } void updatePoolIdx(SizeType32 poolIdx) { - if (mManager) + TLLM_CHECK(mManager); + mPool = mManager->getBlockManager().getPrimaryPool(poolIdx); + auto const newWindowSize = mManager->getBlockManager().getPoolWindowSize(poolIdx); + if (newWindowSize != mWindowSize) { - mPool = mManager->getBlockManager().getPrimaryPool(poolIdx); + mWindowSize = newWindowSize; + mBlockIds = mManager->getSequence(mRequestId).getCacheBlockIds(mWindowSize).at(kFIRST_AND_ONLY_BEAM); } } friend class BlockIterator; +private: +
BlockRange( + BaseKVCacheManager const& cacheManager, std::vector<SizeType32> blockIds, LlmRequest::RequestIdType requestId) + : mManager(&cacheManager) + , mPool(cacheManager.getBlockManager().getPrimaryPool(kFIRST_POOL_INDEX)) + , mWindowSize(firstWindowSize(cacheManager)) + , mRequestId(requestId) + , mBlockIds(std::move(blockIds)) + { + } + + static SizeType32 firstWindowSize(BaseKVCacheManager const& cacheManager) + { + constexpr SizeType32 FIRST_POOL_IDX = 0; + return cacheManager.getBlockManager().getPoolWindowSize(FIRST_POOL_IDX); + } + private: BaseKVCacheManager const* mManager; runtime::ITensor::SharedPtr mPool; + SizeType32 mWindowSize; + const LlmRequest::RequestIdType mRequestId; std::vector<SizeType32> mBlockIds; + + static constexpr SizeType32 kFIRST_AND_ONLY_BEAM = 0; + static constexpr SizeType32 kFIRST_POOL_INDEX = 0; }; class BlockIterator diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp index eb787936e50..f87b128ccf7 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp @@ -49,7 +49,7 @@ void CacheFormatter::formatOutput(LlmRequest const& llmRequest, constexpr SizeType32 beam{0}; auto& blockManager = mCacheManager->getBlockManager(); size_t requestBlockNum = llmRequest.getRequestedBlockHashes().size(); - auto blockRange = BlockRange(*mCacheManager, llmRequest.mRequestId, beam); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mCacheManager, llmRequest.mRequestId, beam); if (requestBlockNum < blockRange.size() && requestBlockNum > 0) { // handle block reuse, the prefix blocks are reused @@ -109,7 +109,7 @@ void CacheFormatter::formatOutput(LlmRequest const& llmRequest, } TLLM_CHECK(!inputKvCacheBlocks.empty()); TLLM_CHECK(blockNum > 0); - int deviceId = mCacheManager->getBlockManager().getBufferManager().getStream().getDevice(); + int deviceId = mCacheManager->getBlockManager().getStreamDevice(); if (common::getEnvTryZCopyForKVCacheTransfer() && (destConfig.getParallelConfig().mPipelineParallelism @@ -318,8 +318,7 @@ void CacheFormatter::formatInput(LlmRequest const& llmRequest, "Start receiving KV cache for request ID: %ld, context request ID: %ld.", llmRequest.mRequestId, llmRequest.getContextPhaseParams().value().getReqId()); TLLM_CHECK(!connections.empty()); - auto blockRange = BlockRange(*mCacheManager, mCacheManager->getNewlyAllocatedBlockIds(llmRequest.mRequestId)); - + auto blockRange = BlockRange::fromNewlyAllocatedBlockIds(*mCacheManager, llmRequest.mRequestId); std::vector<runtime::ITensor::SharedPtr> recvBufferTmps; std::vector<runtime::ITensor::SharedPtr> outputBuffers; auto const numPools = mCacheManager->getBlockManager().getNumPools();
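With the named constructors above, callers no longer pick a pool index up front: a range starts at the first pool, and `updatePoolIdx` re-resolves the per-window block ids whenever the new pool belongs to a different window size. A sketch of walking every pool of one request, using only the API shown in this diff (`cacheManager` and `requestId` are hypothetical):

```cpp
// Sketch only: visit each pool of a request's KV cache, window-size aware.
auto range = BlockRange::fromOldAllocatedBlockIds(cacheManager, requestId);
auto const numPools = cacheManager.getBlockManager().getNumPools();
for (SizeType32 poolIdx = 0; poolIdx < numPools; ++poolIdx)
{
    // Swaps the pool pointer and, if the window size changed, the block-id list.
    range.updatePoolIdx(poolIdx);
    auto const hashes = range.getBlockHashes(); // hashes of this pool's window blocks
}
```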
diff --git a/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp b/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp index 023a7c30e18..6805be14f2a 100644 --- a/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp +++ b/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp @@ -199,25 +199,32 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl( RequestVector scheduledRequests; // Now check if we can add pending requests - auto const numFreeBlocks = kvCacheManager.getNumFreeBlocks(); - auto const numFreeCrossBlocks = crossKvCacheManager ? crossKvCacheManager->getNumFreeBlocks() : 0; auto const maxPeftCachePages = peftCacheManager ? peftCacheManager->getMaxDevicePages() : std::numeric_limits<SizeType32>::max(); + // The optimization of delaying requests won't work for variable window attention + bool skippingIsRelevant = (!kvCacheManager.getBlockManager().isVariableWindow()) + && (!crossKvCacheManager || !crossKvCacheManager->getBlockManager().isVariableWindow()); + // Keep track of blocks contributed by requests in context phase std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedContextBlocks; std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedCrossContextBlocks; if constexpr (!StaticBatchScheduling) { - std::tie(newlyContributedContextBlocks, newlyContributedCrossContextBlocks) - = prefillWithChunkedContextsAlreadyExecuting(activeRequests, kvCacheManager, crossKvCacheManager); + if (skippingIsRelevant) + { + std::tie(newlyContributedContextBlocks, newlyContributedCrossContextBlocks) + = prefillWithChunkedContextsAlreadyExecuting(activeRequests, kvCacheManager, crossKvCacheManager); + } } // If a request is already in progress, include it // If it's been allocated, it had resource to run to completion // Also keep track of blocks needed to drive all in-progress requests to completion - SizeType32 reservedBlocks{0}; - SizeType32 reservedCrossBlocks{0}; + auto reservedBlocks = kv_cache_manager::NoEvictScheduledBlocksManager(kvCacheManager); + auto reservedCrossBlocks = crossKvCacheManager + ? std::optional(kv_cache_manager::NoEvictScheduledBlocksManager(*crossKvCacheManager)) + : std::nullopt; SizeType32 claimedPeftPages{0}; std::unordered_set<uint64_t> uniqTaskIds{}; RequestVector pendingRequests; @@ -242,8 +249,9 @@ else if (req->isGenerationInProgressState()) { scheduledRequests.emplace_back(req); - reservedBlocks += kvCacheManager.getRemainingBlocksToCompletion(*req); - + reservedBlocks.decrementReservedBlocks(*req); + if (reservedCrossBlocks) + reservedCrossBlocks->decrementReservedBlocks(*req); bool const reqHasLora = req->getLoraTaskId().has_value(); bool const isNewTask = reqHasLora && !uniqTaskIds.count(req->getLoraTaskId().value()); if (isNewTask) { claimedPeftPages += peftCacheManager ? peftCacheManager->determineNumPages(req) : 0; uniqTaskIds.insert(req->getLoraTaskId().value()); } - reservedCrossBlocks += crossKvCacheManager ?
crossKvCacheManager->getRemainingBlocksToCompletion(*req) : 0; } else if (req->isDisaggGenerationInitState()) { @@ -268,8 +275,6 @@ if (!StaticBatchScheduling || scheduledRequests.size() == 0) { // Now check if we can add pending requests - auto availableBlocks = numFreeBlocks - reservedBlocks; - auto availableCrossBlocks = numFreeCrossBlocks - reservedCrossBlocks; auto availablePeftPages = maxPeftCachePages - claimedPeftPages; // Loop over pending requests and add them if they can be scheduled @@ -279,7 +284,7 @@ for (auto const& req : requests) { // if context request can reuse blocks contributed by another context request, skip - if (!StaticBatchScheduling && !req->isDisaggGenerationInitState() + if (!StaticBatchScheduling && skippingIsRelevant && !req->isDisaggGenerationInitState() && beneficialToSkip(req, kvCacheManager, crossKvCacheManager, newlyContributedContextBlocks, newlyContributedCrossContextBlocks)) { @@ -292,27 +297,26 @@ } else if (req->isContextInitState() || req->isDisaggGenerationInitState()) { - auto const neededBlocks = kvCacheManager.getRemainingBlocksToCompletion(*req); - auto const neededCrossBlocks = crossKvCacheManager ? crossKvCacheManager->getRemainingBlocksToCompletion(*req) : 0; - bool const reqHasLora = req->getLoraTaskId().has_value(); - bool const isNewTask = reqHasLora && !uniqTaskIds.count(req->getLoraTaskId().value()); - auto const neededPeftPages - = (isNewTask && peftCacheManager) ? peftCacheManager->determineNumPages(req) : 0; - - if (neededBlocks <= availableBlocks && neededCrossBlocks <= availableCrossBlocks - && neededPeftPages <= availablePeftPages) + bool enoughBlocks = reservedBlocks.enoughAvailableBlocks(*req); + bool enoughCrossBlocks = reservedCrossBlocks ? reservedCrossBlocks->enoughAvailableBlocks(*req) : true; + bool reqHasLora = req->getLoraTaskId().has_value(); + bool isNewTask = reqHasLora && !uniqTaskIds.count(req->getLoraTaskId().value()); + auto neededPeftPages = isNewTask && peftCacheManager ? peftCacheManager->determineNumPages(req) : 0; + + if (enoughBlocks && enoughCrossBlocks && neededPeftPages <= availablePeftPages) { scheduledRequests.emplace_back(req); - availableBlocks -= neededBlocks; - availableCrossBlocks -= neededCrossBlocks; + reservedBlocks.decrementReservedBlocks(*req); + if (reservedCrossBlocks) + reservedCrossBlocks->decrementReservedBlocks(*req); availablePeftPages -= neededPeftPages; if (isNewTask) { uniqTaskIds.insert(req->getLoraTaskId().value()); } } - else if (neededBlocks > availableBlocks || neededCrossBlocks > availableCrossBlocks) + else if (!enoughBlocks || !enoughCrossBlocks) { // If one request fails to be scheduled, break break; } @@ -324,14 +328,25 @@ return {std::move(scheduledRequests), RequestVector{}}; } +// TODO(nhaber): remove the forward declaration and keep the function itself here, right before merging.
It is placed below just so +// the remote diff is easier to review and rebase +bool trySchedulingRequestMaxUtilization(std::shared_ptr const& req, SizeType32 maxNumRequests, + RequestVector& scheduledRequests, kv_cache_manager::MaxUtilizationScheduledBlocksManager& blocksManager, + OptionalRef peftCacheManager, SizeType32& numScheduledPeftPages, + std::unordered_set& seenTaskIds); + std::tuple MaxUtilizationScheduler::operator()( kv_cache_manager::BaseKVCacheManager& kvCacheManager, OptionalRef peftCacheManager, RequestList const& activeRequests) const { kvCacheManager.startScheduling(); + // The optimization of delaying requests won't work for variable window attention + bool skippingIsRelevant = !kvCacheManager.getBlockManager().isVariableWindow(); + // Keep track of the number of requests and blocks needed for the scheduled requests - SizeType32 numScheduledBlocks{0}; + auto scheduledBlocksManager + = kv_cache_manager::MaxUtilizationScheduledBlocksManager(kvCacheManager, mManyMicroBatches); SizeType32 numScheduledPeftPages{0}; std::unordered_set seenTaskIds; @@ -366,16 +381,17 @@ std::tuple MaxUtilizationScheduler::operator()( } // if context request can reuse blocks contributed by another context request, skip - if (beneficialToSkip( + if (skippingIsRelevant + && beneficialToSkip( req, kvCacheManager, std::nullopt, newlyContributedContextBlocks, newlyContributedCrossContextBlocks)) { reqIt++; continue; } - auto const [fitsKvCache, fitsPeftCache] = trySchedulingRequestMaxUtilization(kvCacheManager, peftCacheManager, - req, scheduledRequests, numScheduledBlocks, numScheduledPeftPages, seenTaskIds); - if (fitsKvCache && fitsPeftCache) + bool const wasScheduled = trySchedulingRequestMaxUtilization(req, mMaxNumRequests, scheduledRequests, + scheduledBlocksManager, peftCacheManager, numScheduledPeftPages, seenTaskIds); + if (wasScheduled) { TLLM_LOG_DEBUG("MaxUtilizationScheduler: request ID %lu -> start", req->mRequestId); reqIt++; @@ -405,34 +421,27 @@ std::tuple MaxUtilizationScheduler::operator()( return {std::move(scheduledRequests), std::move(pausedRequests)}; } -std::pair MaxUtilizationScheduler::trySchedulingRequestMaxUtilization( - kv_cache_manager::BaseKVCacheManager const& kvCacheManager, - OptionalRef peftCacheManager, std::shared_ptr const& req, - RequestVector& scheduledRequests, SizeType32& numScheduledBlocks, SizeType32& numScheduledPeftPages, - std::unordered_set& seenTaskIds) const +bool trySchedulingRequestMaxUtilization(std::shared_ptr const& req, SizeType32 maxNumRequests, + RequestVector& scheduledRequests, kv_cache_manager::MaxUtilizationScheduledBlocksManager& blocksManager, + OptionalRef peftCacheManager, SizeType32& numScheduledPeftPages, + std::unordered_set& seenTaskIds) { - if (scheduledRequests.size() < static_cast(mMaxNumRequests)) + if (scheduledRequests.size() < static_cast(maxNumRequests)) { - SizeType32 numRequiredBlocks = kvCacheManager.getNeededBlocksOneStep(*req, mManyMicroBatches); - TLLM_LOG_DEBUG( - "MaxUtilizationScheduler: request ID %lu required blocks: %i", req->mRequestId, numRequiredBlocks); - - bool const reqHasLora = req->getLoraTaskId().has_value(); - bool const isNewTask = reqHasLora && !seenTaskIds.count(req->getLoraTaskId().value()); - auto const numRequiredPeftPages + bool reqHasLora = req->getLoraTaskId().has_value(); + bool isNewTask = reqHasLora && !seenTaskIds.count(req->getLoraTaskId().value()); + SizeType32 numRequiredPeftPages = (isNewTask && peftCacheManager) ?
peftCacheManager->determineNumPages(req) : 0; TLLM_LOG_DEBUG( "MaxUtilizationScheduler: request ID %lu required peft pages: %i", req->mRequestId, numRequiredPeftPages); - bool const fitsKvCache - = kvCacheManager.getBlockManager().schedulingHasFreeBlocks(numScheduledBlocks + numRequiredBlocks); - bool const fitsPeft + auto const scheduledBlocksIfFitsKvCache = blocksManager.prepareNewNumberOfBlocksIfWeEndUpScheduling(*req); + bool fitsPeft = (peftCacheManager ? numRequiredPeftPages + numScheduledPeftPages <= peftCacheManager->getMaxDevicePages() : true); - if (fitsKvCache && fitsPeft) + if (scheduledBlocksIfFitsKvCache && fitsPeft) { - numScheduledBlocks += numRequiredBlocks; - TLLM_LOG_DEBUG("MaxUtilizationScheduler: scheduled blocks: %i", numScheduledBlocks); + blocksManager.updateScheduledBlocks(scheduledBlocksIfFitsKvCache.value()); numScheduledPeftPages += numRequiredPeftPages; TLLM_LOG_DEBUG("MaxUtilizationScheduler: scheduled peft pages: %i", numRequiredPeftPages); scheduledRequests.emplace_back(req); @@ -440,10 +449,10 @@ std::pair MaxUtilizationScheduler::trySchedulingRequestMaxUtilizatio { seenTaskIds.insert(req->getLoraTaskId().value()); } + return true; } - return std::make_pair(fitsKvCache, fitsPeft); } - return std::make_pair(false, false); + return false; } CapacityScheduler::CapacityScheduler(SizeType32 maxNumRequests, diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp index 3597626c84f..8f9cfa7763a 100644 --- a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp +++ b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp @@ -142,8 +142,8 @@ void DataReceiverImpl::sendRequestInfo(LlmRequest const& llmRequest) if (cacheFormatter != nullptr) { auto* cacheManager = cacheFormatter->getCacheManager(); - auto blockRange = kv_cache_manager::BlockRange( - *cacheManager, cacheManager->getNewlyAllocatedBlockIds(llmRequest.mRequestId)); + auto blockRange + = kv_cache_manager::BlockRange::fromNewlyAllocatedBlockIds(*cacheManager, llmRequest.mRequestId); requestInfo = RequestInfo(requestId, blockRange.getBlockHashes(), mSelfState); } diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp index 943b95dfe15..6cf74333056 100644 --- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp @@ -323,18 +323,170 @@ bool KVCacheBlock::isLeaf() const return mNextBlocks.empty(); } +/* +Example: +``` +totalBlocks = 16384, uniqueWindowSizeToLayers = {1024: [1], 4096: [0, 4, 5], 8192: [2, 3]} +windowSizeToContribution = {1024: (1024*1=1024), 4096: (4096*3=12288), 8192: (8192*2=16384)}; +return windowSizeToAllottedBlocks = { + 1024: (1024.0 /29696)*16384 + 1 = 565, + 4096: (12288.0 /29696)*16384 + 1 = 6780, + 8192: (16384.0 /29696)*16384 = 9039 + }; +``` +*/ +std::map BlockManager::blocksPerWindowSize( + SizeType32 totalBlocks, std::map> const& uniqueWindowSizeToLayers) +{ + TLLM_CHECK(totalBlocks > 0); + std::map windowSizeToContribution; + + for (auto const& [windowSize, layers] : uniqueWindowSizeToLayers) + { + windowSizeToContribution[windowSize] = windowSize * layers.size(); + } + auto const windowSizesTotalSum = std::accumulate(windowSizeToContribution.begin(), windowSizeToContribution.end(), + SizeType32{0}, [](auto sum, auto const& windowSize) { return sum + windowSize.second; }); + + std::map windowSizeToAllottedBlocks; + SizeType32 remainingBlocks = totalBlocks; + // First pass: allocate blocks proportionally +
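The split above floors each window size's proportional share, so up to one block per window size is left over; a second pass hands those back out one at a time. A self-contained sketch of the two-pass split follows, using plain int in place of SizeType32 and local names; it is an illustration that reproduces the worked example, not the library's code.

```cpp
#include <cstdio>
#include <map>
#include <vector>

int main()
{
    int const totalBlocks = 16384;
    std::map<int, std::vector<int>> const uniqueWindowSizeToLayers
        = {{1024, {1}}, {4096, {0, 4, 5}}, {8192, {2, 3}}};

    // Contribution of each window size: windowSize * number of layers using it.
    std::map<int, long long> windowSizeToContribution;
    long long windowSizesTotalSum = 0;
    for (auto const& [windowSize, layers] : uniqueWindowSizeToLayers)
    {
        windowSizeToContribution[windowSize] = 1LL * windowSize * layers.size();
        windowSizesTotalSum += windowSizeToContribution[windowSize];
    }

    // First pass: proportional allotment, rounded down.
    std::map<int, int> allotted;
    int remaining = totalBlocks;
    for (auto const& [windowSize, contribution] : windowSizeToContribution)
    {
        float const fraction = static_cast<float>(contribution) / windowSizesTotalSum;
        allotted[windowSize] = static_cast<int>(fraction * totalBlocks);
        remaining -= allotted[windowSize];
    }

    // Second pass: hand the blocks lost to rounding back out, one per window size.
    for (auto& [windowSize, blocks] : allotted)
    {
        if (remaining == 0)
        {
            break;
        }
        ++blocks;
        --remaining;
    }

    for (auto const& [windowSize, blocks] : allotted)
    {
        std::printf("%d -> %d\n", windowSize, blocks); // 1024 -> 565, 4096 -> 6780, 8192 -> 9039
    }
    return 0;
}
```

Note that the allotments always sum to totalBlocks, since at most one block per window size is lost to rounding down.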
for (auto const& [windowSize, windowSizeSum] : windowSizeToContribution) + { + float const fraction = static_cast(windowSizeSum) / windowSizesTotalSum; + TLLM_CHECK(0.0f < fraction && fraction <= 1.0f); + SizeType32 const allotted = static_cast(fraction * totalBlocks); + windowSizeToAllottedBlocks[windowSize] = allotted; + remainingBlocks -= allotted; + } + + // Second pass: hand the blocks lost to rounding down back out, one per window size. + for (auto& [windowSize, allottedBlocks] : windowSizeToAllottedBlocks) + { + if (remainingBlocks == 0) + { + break; + } + allottedBlocks++; + remainingBlocks--; + } + return windowSizeToAllottedBlocks; +} + BlockManager::BlockManager(std::vector const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, + SizeType32 maxNumSequences, std::shared_ptr stream, + std::optional maxSequenceLength, SizeType32 maxBeamWidth, + std::vector const& maxAttentionWindowVec, + std::optional const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType, + std::optional secondaryOffloadMinPriority, + std::shared_ptr eventManager, bool enableHashKey, bool enablePartialReuse, + bool copyOnPartialReuse) + : mNumLayers{static_cast(numKvHeadsPerLayer.size())} + , mTokensPerBlock{tokensPerBlock} + , mEventManager{std::move(eventManager)} + , mStream{stream} + , mCacheType{cacheType} +{ + auto const numNonUniqueWindowSizes = static_cast(maxAttentionWindowVec.size()); + + std::map> uniqueWindowSizeToLayers; + for (SizeType32 layerIdx = 0; layerIdx < mNumLayers; layerIdx++) + { + /* + At this point (deep in the construction of TrtGptModel), maxAttentionWindowVec isn't "stretched" to the + length of numLayers yet. So we need to cycle through the window sizes per layer with a modulo. + */ + auto const windowSize = maxAttentionWindowVec.at(layerIdx % numNonUniqueWindowSizes); + uniqueWindowSizeToLayers[windowSize].push_back(layerIdx); + } + + auto const numUniqueWindowSizes = static_cast(uniqueWindowSizeToLayers.size()); + + mIsVariableWindow = numUniqueWindowSizes > 1; + mIsVariableGQA = std::unordered_set(numKvHeadsPerLayer.begin(), numKvHeadsPerLayer.end()).size() > 1; + + auto const primaryBlocksPerWindowSize = blocksPerWindowSize(blocksInPrimaryPool, uniqueWindowSizeToLayers); + std::optional> secondaryBlocksPerWindowSize; + if (blocksInSecondaryPool > 0) + { + secondaryBlocksPerWindowSize = blocksPerWindowSize(blocksInSecondaryPool, uniqueWindowSizeToLayers); + } + + mLayerToWindowSize.resize(mNumLayers); + for (auto const& [windowSize, layersWithWindowSize] : uniqueWindowSizeToLayers) + { + for (auto& layerIdx : layersWithWindowSize) + { + mLayerToWindowSize.at(layerIdx) = windowSize; + } + SizeType32 const allottedPrimaryBlocks = primaryBlocksPerWindowSize.at(windowSize); + TLLM_CHECK(allottedPrimaryBlocks > 0); // Every window size must be allotted at least one primary block. + SizeType32 const allottedSecondaryBlocks + = secondaryBlocksPerWindowSize ?
secondaryBlocksPerWindowSize->at(windowSize) : 0; + mWindowBlockManagers.try_emplace(windowSize, dtype, windowSize, layersWithWindowSize, numKvHeadsPerLayer, + sizePerHead, tokensPerBlock, allottedPrimaryBlocks, allottedSecondaryBlocks, maxNumSequences, stream, + onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager, enableHashKey, enablePartialReuse, + copyOnPartialReuse); + } + + auto const numAllPools = getNumPools(); + mAbsolutePoolToWindowSize.reserve(numAllPools); + mAbsolutePoolToRelativePoolIndex.reserve(numAllPools); + auto absolutePoolsOffset = SizeType32{0}; + for (auto const& [windowSize, manager] : mWindowBlockManagers) + { + auto const numPools = manager.getNumPools(); + for (auto i = 0; i < numPools; ++i) + { + mAbsolutePoolToWindowSize.push_back(windowSize); + mAbsolutePoolToRelativePoolIndex.push_back(i); + } + auto const maxTokenNum = windowSize + sinkBubbleLength + + (isUseOneMoreBlock(windowSize, maxSequenceLength, maxBeamWidth) ? tokensPerBlock : 0); + auto const temporaryAttentionWindow = manager.calculateTemporaryAttentionWindow(tempAttentionWindowInputs); + // Consider the temporaryAttentionWindow when allocating blocks. + auto const maxBlocksPerSeq = tc::ceilDiv(maxTokenNum + temporaryAttentionWindow, tokensPerBlock); + TLLM_LOG_INFO("Max KV cache pages per sequence: %d [window size=%d]", maxBlocksPerSeq, windowSize); + mWindowSizeToMetadata[windowSize] = WindowSizeMetadata{absolutePoolsOffset, numPools, maxTokenNum, + maxBlocksPerSeq, manager.getMaxNumBlocks(), temporaryAttentionWindow}; + TLLM_LOG_DEBUG( + "%s Metadata: %s", manager.getLogPrefix().c_str(), mWindowSizeToMetadata[windowSize].toString().c_str()); + absolutePoolsOffset += numPools; + } + + TLLM_CHECK_WITH_INFO(mWindowBlockManagers.size() == mWindowSizeToMetadata.size() + && std::equal(mWindowBlockManagers.cbegin(), mWindowBlockManagers.cend(), mWindowSizeToMetadata.cbegin(), + mWindowSizeToMetadata.cend(), + [](auto const& window1, auto const& window2) { return window1.first == window2.first; }), + "Iteration order of window sizes between mWindowBlockManagers and mWindowSizeToMetadata *must* be ensured. 
" + "Maybe you tried changing either of them to an std::unordered_map?"); +} + +namespace +{ +inline SizeType32 digits(SizeType32 number) +{ + TLLM_CHECK(number > 0); + return static_cast(std::log10(number)) + 1; +} + +} // namespace + +WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 windowSize, + std::vector const& managedLayers, std::vector const& numKvHeadsPerLayer, + SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr stream, bool onboardBlocks, CacheType cacheType, std::optional secondaryOffloadMinPriority, std::shared_ptr eventManager, bool enableHashKey, bool enablePartialReuse, bool copyOnPartialReuse) - : mNumPrimaryBlocks{blocksInPrimaryPool} + : mDataType{dtype} + , mWindowSize{windowSize} + , mNumPrimaryBlocks{blocksInPrimaryPool} , mNumSecondaryBlocks{blocksInSecondaryPool} , mOnboardBlocks(onboardBlocks) , mBufferManager{std::move(stream)} - , mSizePerHead{sizePerHead} - , mNumLayers{static_cast(numKvHeadsPerLayer.size())} , mSchedulingNumFreeBlocks{0} , mTokensPerBlock{tokensPerBlock} , mCachedBlocksRoot{std::make_shared(KVCacheBlock::kCachedBlocksRootId, tk::KVCacheIndex{0})} @@ -346,6 +498,8 @@ BlockManager::BlockManager(std::vector const& numKvHeadsPerLayer, Si , mReusedBlocks{0} , mReusedUniqueBlocks{0} , mMissedBlocks{0} + , mKVFactor{mCacheType == CacheType::kSELFKONLY ? 1 : 2} + , mLogPrefix{tensorrt_llm::common::fmtstr("BlockManager[windowSize=%*u]", digits(windowSize), mWindowSize)} , mReusedTokens{0.0} , mTotalInputTokens{0.0} , mEnableHashKey{enableHashKey} @@ -354,32 +508,36 @@ BlockManager::BlockManager(std::vector const& numKvHeadsPerLayer, Si { std::map numLayersPerPool; - mKVFactor = (cacheType == CacheType::kSELFKONLY) ? 1 : 2; - - // count how many layers should go in each pool - mLayerIndexToPoolLayerIndex.reserve(mNumLayers); - for (SizeType32 layerIdx = 0; layerIdx < mNumLayers; layerIdx++) + for (auto const layerIdx : managedLayers) { - auto const numKvHeads = numKvHeadsPerLayer.at(layerIdx); - auto search = numLayersPerPool.find(numKvHeads); - numLayersPerPool[numKvHeads] = search == numLayersPerPool.end() ? 1 : search->second + 1; - mLayerIndexToPoolLayerIndex.emplace_back(numLayersPerPool[numKvHeads] - 1); + auto const& layerIndexWithinPool = numLayersPerPool[numKvHeadsPerLayer.at(layerIdx)]++; + mLayerToIndexWithinPool[layerIdx] = layerIndexWithinPool; } + size_t poolIndex = 0; for (auto const [numKvHeads, numLayers] : numLayersPerPool) { + for (auto const layerIdx : managedLayers) + { + if (numKvHeadsPerLayer.at(layerIdx) == numKvHeads) + { + mLayerToPoolIndex[layerIdx] = poolIndex; + } + } mPools.emplace_back(numLayers, numKvHeads, sizePerHead, tokensPerBlock, 1); + ++poolIndex; } - // assign each layer to its pool - mLayerToPool.reserve(mNumLayers); - for (SizeType32 layerIdx = 0; layerIdx < mNumLayers; layerIdx++) +#ifdef ENABLE_FP4 + // TODO(miovine): make the block size configurable. Should we have an additional argument + // to specify FP4 related parameters (scale dtypes, etc)? This can also be passed + // in the constructor. 
+ constexpr SizeType32 kQuantBlockSizeNVFP4 = 16; + if (dtype == nvinfer1::DataType::kFP4) { - auto poolPos = std::find_if(mPools.cbegin(), mPools.cend(), - [numKvHeads = numKvHeadsPerLayer[layerIdx]](KVCacheBlockPool const& pool) - { return numKvHeads == pool.numKvHeads; }); - mLayerToPool.emplace_back(poolPos - mPools.cbegin()); + createBlockScalePools(kQuantBlockSizeNVFP4); } +#endif // Create free blocks mAllBlocksById.reserve(blocksInPrimaryPool + blocksInSecondaryPool); @@ -403,7 +561,7 @@ BlockManager::BlockManager(std::vector const& numKvHeadsPerLayer, Si } } -BlockManager::~BlockManager() +WindowBlockManager::~WindowBlockManager() { float reusedUniqueBlocksPercentage = mReusedUniqueBlocks == 0 || mAllocTotalBlocks == 0 ? 0 @@ -411,19 +569,25 @@ BlockManager::~BlockManager() float cacheHitRate = mReusedBlocks == 0 ? 0 : static_cast(mReusedBlocks) / (static_cast(mReusedBlocks + mMissedBlocks)); - TLLM_LOG_DEBUG("BlockManager - total allocated blocks: %lu ", mAllocTotalBlocks); - TLLM_LOG_DEBUG("BlockManager - allocated new blocks: %lu ", mAllocNewBlocks); - TLLM_LOG_DEBUG("BlockManager - missed blocks: %lu ", mMissedBlocks); - TLLM_LOG_DEBUG("BlockManager - reused blocks: %lu ", mReusedBlocks); - TLLM_LOG_DEBUG("BlockManager - reused unique blocks: %lu ", mReusedUniqueBlocks); - TLLM_LOG_DEBUG("BlockManager - reused unique blocks percentage (%%): %.2f ", reusedUniqueBlocksPercentage); - TLLM_LOG_DEBUG("BlockManager - cache hit rate: %.2f ", cacheHitRate); - TLLM_LOG_DEBUG("BlockManager - reused tokens: %.0f ", mReusedTokens); + TLLM_LOG_DEBUG("%s - total allocated blocks: %lu ", mLogPrefix.c_str(), mAllocTotalBlocks); + TLLM_LOG_DEBUG("%s - allocated new blocks: %lu ", mLogPrefix.c_str(), mAllocNewBlocks); + TLLM_LOG_DEBUG("%s - missed blocks: %lu ", mLogPrefix.c_str(), mMissedBlocks); + TLLM_LOG_DEBUG("%s - reused blocks: %lu ", mLogPrefix.c_str(), mReusedBlocks); + TLLM_LOG_DEBUG("%s - reused unique blocks: %lu ", mLogPrefix.c_str(), mReusedUniqueBlocks); TLLM_LOG_DEBUG( - "BlockManager - reused tokens percentage (%%): %.2f ", 100.0 * mReusedTokens / mTotalInputTokens); + "%s - reused unique blocks percentage (%%): %.2f ", mLogPrefix.c_str(), reusedUniqueBlocksPercentage); + TLLM_LOG_DEBUG("%s - cache hit rate: %.2f ", mLogPrefix.c_str(), cacheHitRate); + TLLM_LOG_DEBUG("%s - reused tokens: %.0f ", mLogPrefix.c_str(), mReusedTokens); + TLLM_LOG_DEBUG("%s - reused tokens percentage (%%): %.2f ", mLogPrefix.c_str(), + 100.0 * mReusedTokens / mTotalInputTokens); +} + +bool BlockManager::verifyQueueIntegrity(SizeType32 windowSize) +{ + return mWindowBlockManagers.at(windowSize).verifyQueueIntegrity(); } -bool BlockManager::verifyQueueIntegrity() +bool WindowBlockManager::verifyQueueIntegrity() { return mEvictionPolicy->verifyQueueIntegrity(); } @@ -431,16 +595,19 @@ bool BlockManager::verifyQueueIntegrity() void BlockManager::storeContextBlocks(GenerationRequest& sequence, LlmRequest const& llmRequest) { constexpr int beamIdx = 0; // no need to consider more than one beam for input tokens - auto const& cacheBlockIds = sequence.getCacheBlockIds(); - auto const& uniqueTokens = llmRequest.getUniqueTokens(beamIdx); + for (auto const& [windowSize, _] : mWindowBlockManagers) + { + auto cacheBlockIds = sequence.getCacheBlockIds(windowSize); + auto const& uniqueTokens = llmRequest.getUniqueTokens(beamIdx); - auto blockedUniqueTokens - = chopVectorIntoBlocks(uniqueTokens, uniqueTokens.size() - 1, getTokensPerBlock(), false); - auto blockKeys = buildBlockKeys(blockedUniqueTokens, llmRequest); - 
storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx]); + auto blockedUniqueTokens + = chopVectorIntoBlocks(uniqueTokens, uniqueTokens.size() - 1, getTokensPerBlock(), false); + auto blockKeys = buildBlockKeys(blockedUniqueTokens, llmRequest); + storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx], windowSize); + } } -void BlockManager::createBlockScalePools(SizeType32 quantBlockSize) +void WindowBlockManager::createBlockScalePools(SizeType32 quantBlockSize) { auto num_pools = mPools.size(); for (size_t i = 0; i < num_pools; ++i) @@ -457,30 +624,24 @@ void BlockManager::createBlockScalePools(SizeType32 quantBlockSize) } } -void BlockManager::allocatePools(nvinfer1::DataType dtype, bool useUvm) +void BlockManager::allocatePools(bool useUvm) { - // TODO: make the block size configurable. Should we have an additional argument - // to specify FP4 related parameters (scale dtypes, etc)? This can also be passed - // in the constructor. - constexpr SizeType32 kQuantBlockSizeNVFP4 = 16; - constexpr nvinfer1::DataType kScaleDtypeNVFP4 = nvinfer1::DataType::kFP8; - -#ifdef ENABLE_FP4 - if (dtype == nvinfer1::DataType::kFP4) + for (auto& [_, manager] : mWindowBlockManagers) { - // It would be slightly better to move construction of these objects to the BlockManager - // constructor for consistency. We would have to do a bit of refactoring to pass - // the dtype in earlier. - createBlockScalePools(kQuantBlockSizeNVFP4); + manager.allocatePools(useUvm); } -#endif +} + +void WindowBlockManager::allocatePools(bool useUvm) +{ + constexpr nvinfer1::DataType kScaleDtypeNVFP4 = nvinfer1::DataType::kFP8; // Allocate a memory pool backing the blocks for each numKvHeads // TODO(oargov): allocate pools in a single buffer and split it, to avoid fragmentation for (auto& pool : mPools) { auto blockSize = pool.blockSize; - auto poolDtype = pool.containsBlockScales ? kScaleDtypeNVFP4 : dtype; + auto poolDtype = pool.containsBlockScales ? 
kScaleDtypeNVFP4 : mDataType; #ifdef ENABLE_FP4 auto const poolIsFP4 = poolDtype == nvinfer1::DataType::kFP4; @@ -499,7 +660,7 @@ void BlockManager::allocatePools(nvinfer1::DataType dtype, bool useUvm) nvinfer1::Dims const cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, mKVFactor, blockSize}); - TLLM_LOG_DEBUG("[BlockManager] Allocating primary pool with %d blocks for %d layers with %d kv heads", + TLLM_LOG_DEBUG("[%s] Allocating primary pool with %d blocks for %d layers with %d kv heads", mLogPrefix.c_str(), mNumPrimaryBlocks, pool.numLayers, pool.numKvHeads); if (useUvm) @@ -511,14 +672,22 @@ void BlockManager::allocatePools(nvinfer1::DataType dtype, bool useUvm) { nvinfer1::Dims const cacheShapeOffload = ITensor::makeShape({mNumSecondaryBlocks, pool.numLayers, mKVFactor, blockSize}); - TLLM_LOG_DEBUG("[BlockManager] Allocating secondary pool with %d blocks for %d layers with %d kv heads", - mNumSecondaryBlocks, pool.numLayers, pool.numKvHeads); + TLLM_LOG_DEBUG("[%s] Allocating secondary pool with %d blocks for %d layers with %d kv heads", + mLogPrefix.c_str(), mNumSecondaryBlocks, pool.numLayers, pool.numKvHeads); pool.secondaryPtr = BufferManager::pinned(cacheShapeOffload, poolDtype); } } } void BlockManager::releasePools() +{ + for (auto& [_, manager] : mWindowBlockManagers) + { + manager.releasePools(); + } +} + +void WindowBlockManager::releasePools() { for (auto& pool : mPools) { @@ -536,6 +705,14 @@ void BlockManager::releasePools() } void BlockManager::startScheduling() +{ + for (auto& [_, manager] : mWindowBlockManagers) + { + manager.startScheduling(); + } +} + +void WindowBlockManager::startScheduling() { mSchedulingNumFreeBlocks = mEvictionPolicy->getNumFreeBlocks(kPrimaryLevel); for (auto& [requestId, slotAllocatedBlocks] : mAllocatedBlocksPerSeq) @@ -547,7 +724,7 @@ void BlockManager::startScheduling() } } -void BlockManager::claimLeafBlock(BlockPtr block, std::optional priority, +void WindowBlockManager::claimLeafBlock(BlockPtr const& block, std::optional priority, std::optional durationMs) { // The eviction policy needs blocks to still be linked to their old parents when they're reclaimed. 
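As a rough sanity check on the memory footprint of a primary pool allocated with the cache shape {numBlocks, numLayers, kvFactor, blockSize} above: assuming blockSize works out to numKvHeads * sizePerHead * tokensPerBlock for a plain KV pool (an assumption of this sketch, not something stated in the change), the arithmetic is just a product. Made-up sizes below.

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // Made-up sizes for illustration only.
    int64_t const numBlocks = 2048, numLayers = 32, kvFactor = 2; // kvFactor is 1 for kSELFKONLY
    int64_t const numKvHeads = 8, sizePerHead = 128, tokensPerBlock = 64;
    int64_t const dtypeBytes = 2; // e.g. FP16
    int64_t const blockSize = numKvHeads * sizePerHead * tokensPerBlock;
    // Mirrors the cache shape {numBlocks, numLayers, kvFactor, blockSize}.
    int64_t const poolBytes = numBlocks * numLayers * kvFactor * blockSize * dtypeBytes;
    std::printf("primary pool: %.2f GiB\n", poolBytes / static_cast<double>(1LL << 30)); // 16.00 GiB
    return 0;
}
```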
@@ -556,7 +733,7 @@ void BlockManager::claimLeafBlock(BlockPtr block, std::optionalfreeLeafBlock(); } -BlockPtr BlockManager::getFreeBlock( +BlockPtr WindowBlockManager::getFreeBlock( executor::RetentionPriority priority, std::optional durationMs) { // eviction policy get free primary block @@ -595,40 +772,35 @@ BlockPtr BlockManager::getFreeBlock( return block; } -tk::KVCacheIndex BlockManager::getKOrVBlockIndex( - KVCacheBlock::IdType blockId, SizeType32 fieldIdx, SizeType32 poolIdx) const -{ - TLLM_CHECK_WITH_INFO(poolIdx < getNumPools(), "Pool index %d is out of bounds", poolIdx); - auto const& block = mAllBlocksById[blockId]; - auto const& pool = mPools.at(poolIdx); - if (mCacheType == CacheType::kSELFKONLY) - { - fieldIdx = 0; - } - auto constexpr layerIdx = 0; - return tk::KVCacheIndex{ - common::flat_index3(block->getMemoryPoolBlockIndex(), layerIdx, fieldIdx, pool.numLayers, mKVFactor)}; -} - -void KVCacheManager::setOffsets(tk::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx, - SizeType32 blockIdx, KVCacheBlock::IdType blockId) const +void WindowBlockManager::setOffsets(tk::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, + SizeType32 beamIdx, SizeType32 blockIdx, KVCacheBlock::IdType blockId) const { auto constexpr kIdx = 0; auto constexpr vIdx = 1; - auto const numPools = mBlockManager.getNumPools(); - - for (SizeType32 poolIdx = 0; poolIdx < numPools; poolIdx++) + auto const& block = mAllBlocksById[blockId]; + for (SizeType32 poolIdx = 0; poolIdx < static_cast(mPools.size()); poolIdx++) { - for (auto xIdx : {kIdx, vIdx}) + auto const& pool = mPools.at(poolIdx); + for (auto const xIdx : {kIdx, vIdx}) { + auto constexpr layerIdx = 0; auto const offsetIndex = tensorrt_llm::common::flat_index(offsetsShape.d, poolIdx, beamIdx, xIdx, blockIdx); - offsetsPtr[offsetIndex] = mBlockManager.getKOrVBlockIndex(blockId, xIdx, poolIdx); + auto const fieldIdx = mCacheType == CacheType::kSELFKONLY ? 
0 : xIdx; + auto const blockIndex = tk::KVCacheIndex{ + common::flat_index3(block->getMemoryPoolBlockIndex(), layerIdx, fieldIdx, pool.numLayers, mKVFactor)}; + offsetsPtr[offsetIndex] = blockIndex; } } } -void BlockManager::addBlockToHashMap(BlockPtr block) +void BlockManager::setOffsets(tk::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx, + SizeType32 blockIdx, KVCacheBlock::IdType blockId, SizeType32 windowSize) const +{ + mWindowBlockManagers.at(windowSize).setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId); +} + +void WindowBlockManager::addBlockToHashMap(BlockPtr const& block) { if (!mEnableHashKey) { @@ -650,7 +822,7 @@ void BlockManager::addBlockToHashMap(BlockPtr block) mContextBlocksByHash.emplace(block->getHash(), std::move(block)); } -void BlockManager::removeBlockFromHashMap(BlockPtr block) +void WindowBlockManager::removeBlockFromHashMap(BlockPtr const& block) { if (mContextBlocksByHash.empty() || block->getBlockKey().uniqueTokens.empty()) { @@ -672,7 +844,12 @@ void BlockManager::removeBlockFromHashMap(BlockPtr block) TLLM_LOG_DEBUG("Trying to remove block %d by %zx that is not in hash map", block->getBlockId(), block->getHash()); } -void BlockManager::onboardBlock(BlockPtr const& offloadBlock) +void BlockManager::onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowSize) +{ + mWindowBlockManagers.at(windowSize).onboardBlock(offloadBlock); +} + +void WindowBlockManager::onboardBlock(BlockPtr const& offloadBlock) { if (mOnboardBlocks && !offloadBlock->isPrimary()) { @@ -691,7 +868,12 @@ void BlockManager::onboardBlock(BlockPtr const& offloadBlock) } } -void BlockManager::offloadBlock(BlockPtr const& block) +void BlockManager::offloadBlock(BlockPtr const& block, SizeType32 windowSize) +{ + mWindowBlockManagers.at(windowSize).offloadBlock(block); +} + +void WindowBlockManager::offloadBlock(BlockPtr const& block) { if (mOnboardBlocks && block->isPrimary()) { @@ -713,7 +895,16 @@ void BlockManager::offloadBlock(BlockPtr const& block) } } -std::optional BlockManager::findNewContextBlock( +[[nodiscard]] std::optional BlockManager::findNewContextBlock( + VecUniqueTokens const& uniqueTokens, LlmRequest const& llmRequest) const +{ + TLLM_CHECK_WITH_INFO( + !isVariableWindow(), "The optimization of delaying requests won't work for variable window attention"); + auto const& onlyManager = mWindowBlockManagers.cbegin()->second; + return onlyManager.findNewContextBlock(uniqueTokens, llmRequest); +} + +std::optional WindowBlockManager::findNewContextBlock( VecUniqueTokens const& uniqueTokens, LlmRequest const& llmRequest) const { auto blockedUniqueTokens @@ -737,12 +928,12 @@ std::optional BlockManager::findNewContextBlock( return std::nullopt; } -bool BlockManager::blockInRadixTree(BlockPtr const& block) +bool WindowBlockManager::blockInRadixTree(BlockPtr const& block) { return !block->getUniqueTokens().empty() && block->getPrevBlock() != nullptr; } -SizeType32 BlockManager::loadOrAllocateBlocks(std::vector const& blockKeys, SizeType32 numContextBlocks, +SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector const& blockKeys, SizeType32 numContextBlocks, GenerationRequest& sequence, std::vector const& perBlockRetentions) { SizeType32 numMatchedTokens{0}; @@ -780,16 +971,16 @@ SizeType32 BlockManager::loadOrAllocateBlocks(std::vector const& block mTransferManager->onboard(matchingBlock, newBlock, mPools, numMatched); // TODO: (optional) Send out event matchingBlock = newBlock; - TLLM_LOG_DEBUG( - 
"BlockManager::loadOrAllocateBlocks - Copied partially filled block %d", matchingBlockId); + TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Copied partially filled block %d", mLogPrefix.c_str(), + matchingBlockId); } else { // Leaf block that nobody is using. Make block private and reuse claimLeafBlock( matchingBlock, perBlockRetentions[bi].retentionPriority, perBlockRetentions[bi].durationMs); - TLLM_LOG_DEBUG( - "BlockManager::loadOrAllocateBlocks - Reused partially filled block %d", matchingBlockId); + TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Reused partially filled block %d", mLogPrefix.c_str(), + matchingBlockId); addBlockToHashMap(matchingBlock); } searchRoot = nullptr; // no matching needed for following blocks @@ -799,7 +990,7 @@ SizeType32 BlockManager::loadOrAllocateBlocks(std::vector const& block // Recover block and reuse mEvictionPolicy->claimBlock( matchingBlock, perBlockRetentions[bi].retentionPriority, perBlockRetentions[bi].durationMs); - TLLM_LOG_DEBUG("BlockManager::loadOrAllocateBlocks - Matched full block %d", matchingBlockId); + TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Matched full block %d", mLogPrefix.c_str(), matchingBlockId); addBlockToHashMap(matchingBlock); searchRoot = matchingBlock; } @@ -821,8 +1012,8 @@ SizeType32 BlockManager::loadOrAllocateBlocks(std::vector const& block executor::KvCacheRetentionConfig::kDefaultRetentionPriority), perBlockRetentions[bi].durationMs); addBlockToAllBeams(freeBlock, sequence); - TLLM_LOG_DEBUG("BlockManager::loadOrAllocateBlocks - No match, allocated new block %d for sequence %lu", - freeBlock->getBlockId(), sequence.getRequestId()); + TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - No match, allocated new block %d for sequence %lu", + mLogPrefix.c_str(), freeBlock->getBlockId(), sequence.getRequestId()); searchRoot = nullptr; // no matching needed for following blocks if (blockItr != blockKeys.end()) { @@ -856,8 +1047,8 @@ SizeType32 BlockManager::loadOrAllocateBlocks(std::vector const& block } freeBlock->setHash(); addBlockToHashMap(freeBlock); - TLLM_LOG_DEBUG("BlockManager::loadOrAllocateBlocks - Beam %d. Allocated non-shared block %d for bi %d", - beamIdx, freeBlock->getBlockId(), bi); + TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Beam %d. 
Allocated non-shared block %d for bi %d", + mLogPrefix.c_str(), beamIdx, freeBlock->getBlockId(), bi); } ++mMissedBlocks; if (blockItr != blockKeys.end()) @@ -870,12 +1061,26 @@ SizeType32 BlockManager::loadOrAllocateBlocks(std::vector const& block } void BlockManager::refreshBlocks() +{ + for (auto& [_, manager] : mWindowBlockManagers) + { + manager.refreshBlocks(); + } +} + +void WindowBlockManager::refreshBlocks() { mEvictionPolicy->refresh(); mTransferManager->syncTransfers(); } -void BlockManager::addSequence( +void BlockManager::addSequence(GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks, + LlmRequest& llmRequest, SizeType32 windowSize) +{ + mWindowBlockManagers.at(windowSize).addSequence(sequence, inputLength, numContextBlocks, llmRequest); +} + +void WindowBlockManager::addSequence( GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks, LlmRequest& llmRequest) { auto const requestId = sequence.getRequestId(); @@ -912,7 +1117,13 @@ void BlockManager::addSequence( inputLength, prepopulatedPromptLen); } -void BlockManager::addSequence(GenerationRequest& sequence, SizeType32 numBlocks, SizeType32 unsharedBlockIdx) +void BlockManager::addSequence( + GenerationRequest& sequence, SizeType32 numBlocks, SizeType32 unsharedBlockIdx, SizeType32 windowSize) +{ + mWindowBlockManagers.at(windowSize).addSequence(sequence, numBlocks, unsharedBlockIdx); +} + +void WindowBlockManager::addSequence(GenerationRequest& sequence, SizeType32 numBlocks, SizeType32 unsharedBlockIdx) { auto const requestId = sequence.getRequestId(); auto const [seqIt, emplaceDone] = mAllocatedBlocksPerSeq.emplace(requestId, std::vector{}); @@ -926,23 +1137,23 @@ void BlockManager::addSequence(GenerationRequest& sequence, SizeType32 numBlocks } } -void BlockManager::addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx) +void WindowBlockManager::addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx) { auto const requestId = sequence.getRequestId(); block->incRefCount(); - if (sequence.getCacheBlockIds().at(beamIdx).size() == 0) + if (sequence.getCacheBlockIds(mWindowSize).at(beamIdx).size() == 0) { block->setPrevBlockInSeq(nullptr); } else { - block->setPrevBlockInSeq(mAllBlocksById.at(sequence.getCacheBlockIds()[beamIdx].back())); + block->setPrevBlockInSeq(mAllBlocksById.at(sequence.getCacheBlockIds(mWindowSize)[beamIdx].back())); } - sequence.addCacheBlock(beamIdx, block->getBlockId()); + sequence.addCacheBlock(mWindowSize, beamIdx, block->getBlockId()); mAllocatedBlocksPerSeq.at(requestId).push_back(block); } -void BlockManager::addBlockToAllBeams(BlockPtr& block, GenerationRequest& sequence) +void WindowBlockManager::addBlockToAllBeams(BlockPtr& block, GenerationRequest& sequence) { auto const beamWidth = sequence.getBeamWidth(); @@ -952,7 +1163,12 @@ void BlockManager::addBlockToAllBeams(BlockPtr& block, GenerationRequest& sequen } } -void BlockManager::allocateBlock(GenerationRequest& sequence, bool shareAmongBeams) +void BlockManager::allocateBlock(GenerationRequest& sequence, SizeType32 windowSize) +{ + mWindowBlockManagers.at(windowSize).allocateBlock(sequence, false); +} + +void WindowBlockManager::allocateBlock(GenerationRequest& sequence, bool shareAmongBeams) { auto const beamWidth = sequence.getBeamWidth(); auto const requiredBlocks = shareAmongBeams ? 
1 : beamWidth; @@ -979,9 +1195,11 @@ void BlockManager::allocateBlock(GenerationRequest& sequence, bool shareAmongBea } } -void BlockManager::storeBlocks(std::vector blockKeys, std::vector const& blockIds) +void WindowBlockManager::storeBlocks( + std::vector const& blockKeys, std::vector const& blockIds) { - TLLM_LOG_DEBUG("BlockManager::storeBlocks - %zu blockKeys, %zu blockIds", blockKeys.size(), blockIds.size()); + TLLM_LOG_DEBUG( + "%s::storeBlocks - %zu blockKeys, %zu blockIds", mLogPrefix.c_str(), blockKeys.size(), blockIds.size()); auto searchRoot = mCachedBlocksRoot; bool needMatch = true; @@ -991,7 +1209,7 @@ void BlockManager::storeBlocks(std::vector blockKeys, std::vector blockKeys, std::vectorgetBlockId()); + TLLM_LOG_DEBUG( + "%s::storeBlocks - Found matching block %d, traverse", mLogPrefix.c_str(), matchedBlock->getBlockId()); searchRoot = matchedBlock; // TODO possible optimization: if bid != matchedBlock->getBlockId(), // block can be freed and inserted at mFreePrimaryBlocks.begin() @@ -1008,8 +1227,8 @@ void BlockManager::storeBlocks(std::vector blockKeys, std::vectorgetBlockId()); + TLLM_LOG_DEBUG("%s::storeBlocks - No match, inserting block %d into search structure", mLogPrefix.c_str(), + block->getBlockId()); needMatch = false; // no matching needed for following blocks block->setBlockKey(blockKey, static_cast(blockKey.uniqueTokens.size()) == mTokensPerBlock); block->setPrevBlock(searchRoot); @@ -1040,7 +1259,12 @@ void BlockManager::storeBlocks(std::vector blockKeys, std::vectorincRefCount(); - if (sequence.getCacheBlockIds().at(beamIdx).size() == 0) + if (sequence.getCacheBlockIds(mWindowSize).at(beamIdx).size() == 0) { block->setPrevBlockInSeq(nullptr); } else { - block->setPrevBlockInSeq(mAllBlocksById.at(sequence.getCacheBlockIds()[beamIdx].back())); + block->setPrevBlockInSeq(mAllBlocksById.at(sequence.getCacheBlockIds(mWindowSize)[beamIdx].back())); } block->setBlockKey(blockKey, isFull); block->setHash(); - sequence.changeCacheBlock(beamIdx, blockIdx, block->getBlockId()); + sequence.changeCacheBlock(mWindowSize, beamIdx, blockIdx, block->getBlockId()); allocatedBlocks.at(blockIdx * beamWidth + beamIdx) = block; } } -std::vector BlockManager::getNewlyAllocatedBlockIds(GenerationRequest const& sequence) const +std::vector BlockManager::getNewlyAllocatedBlockIds( + GenerationRequest const& sequence, SizeType32 windowSize) const +{ + return mWindowBlockManagers.at(windowSize).getNewlyAllocatedBlockIds(sequence); +} + +std::vector WindowBlockManager::getNewlyAllocatedBlockIds(GenerationRequest const& sequence) const { std::vector allocatedBlockIds; - for (auto const& beamBlockIds : sequence.getCacheBlockIds()) + for (auto const& beamBlockIds : sequence.getCacheBlockIds(mWindowSize)) { for (auto const& blockId : beamBlockIds) { @@ -1103,7 +1333,12 @@ std::vector BlockManager::getNewlyAllocatedBlockIds(Genera return allocatedBlockIds; } -void BlockManager::releaseLastBlock(GenerationRequest& sequence) +void BlockManager::releaseLastBlock(GenerationRequest& sequence, SizeType32 windowSize) +{ + mWindowBlockManagers.at(windowSize).releaseLastBlock(sequence); +} + +void WindowBlockManager::releaseLastBlock(GenerationRequest& sequence) { auto const requestId = sequence.getRequestId(); auto& allocatedBlocks = mAllocatedBlocksPerSeq.at(requestId); @@ -1120,10 +1355,10 @@ void BlockManager::releaseLastBlock(GenerationRequest& sequence) // Remove block from allocated blocks allocatedBlocks.pop_back(); // Remove stored block ids in sequence - sequence.removeLastBlock(); + 
sequence.removeLastBlock(mWindowSize); } -[[nodiscard]] SizeType32 BlockManager::getNumFreeBlocks() const noexcept +[[nodiscard]] SizeType32 WindowBlockManager::getNumFreeBlocks() const noexcept { return mEvictionPolicy->getNumFreeBlocks(kPrimaryLevel); } @@ -1135,10 +1370,6 @@ std::deque BlockManager::getLatestEvents(std::optional llmRequest) { - auto const requestId = sequence.getRequestId(); - - // TODO: refactor this method in two: store blocks for reuse and just 'release blocks'. Only the caller - // can know which blocks to store for reuse and which to just release. // When releasing the blocks for a sequence, we store those blocks for potential reuse only if: // - Block reuse is enabled. // - A request was provided to this function call to identify which tokens these blocks cover @@ -1146,23 +1377,39 @@ void BlockManager::releaseBlocks(GenerationRequest& sequence, OptionalRefgetUniqueTokens(beamIdx); - auto const& cacheBlockIds = sequence.getCacheBlockIds(); - - // TODO: get the caller to mark tokens as filled / not filled, so that the kv-cache manager doesn't - // have to guess. Only (length - 1) tokens of the sequence have their kv-state recorded in kv-cache. We assume - // the last token's state is not filled yet. - auto const usableSize = static_cast(uniqueTokens.size()) - 1; - auto blockedUniqueTokens = chopVectorIntoBlocks(uniqueTokens, usableSize, mTokensPerBlock, true); - auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest); - storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx]); + if (storeBlocksForReuse) + { + manager.storeBlocksForReuse(sequence, llmRequest); + } + manager.releaseBlocks(sequence); } +} + +void WindowBlockManager::storeBlocksForReuse(GenerationRequest& sequence, OptionalRef llmRequest) +{ + auto constexpr beamIdx = 0; + auto const& uniqueTokens = llmRequest->getUniqueTokens(beamIdx); + auto const& cacheBlockIds = sequence.getCacheBlockIds(mWindowSize); + + // TODO: get the caller to mark tokens as filled / not filled, so that the kv-cache manager doesn't + // have to guess. Only (length - 1) tokens of the sequence have their kv-state recorded in kv-cache. We assume + // the last token's state is not filled yet. 
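storeBlocksForReuse stores only the first (length - 1) tokens, chopped into tokensPerBlock-sized chunks with the partial tail kept. A minimal sketch of what such a chopping helper could look like follows; the real chopVectorIntoBlocks lives elsewhere in the codebase, so this stand-in and its signature are assumptions for illustration.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical stand-in: split the first usableSize tokens into
// tokensPerBlock-sized chunks, optionally keeping the partial tail chunk.
std::vector<std::vector<int>> chopVectorIntoBlocks(
    std::vector<int> const& tokens, size_t usableSize, size_t tokensPerBlock, bool keepPartial)
{
    std::vector<std::vector<int>> blocks;
    for (size_t begin = 0; begin < usableSize; begin += tokensPerBlock)
    {
        size_t const end = std::min(begin + tokensPerBlock, usableSize);
        if (end - begin == tokensPerBlock || keepPartial)
        {
            blocks.emplace_back(tokens.begin() + begin, tokens.begin() + end);
        }
    }
    return blocks;
}

int main()
{
    std::vector<int> tokens(11);
    for (int i = 0; i < 11; ++i)
    {
        tokens[i] = i;
    }
    // Only (length - 1) tokens have their kv-state recorded, hence size() - 1.
    auto const blocks = chopVectorIntoBlocks(tokens, tokens.size() - 1, 4, /*keepPartial=*/true);
    std::printf("%zu blocks\n", blocks.size()); // 3 blocks: sizes 4, 4, 2
    return 0;
}
```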
+ auto const usableSize = static_cast(uniqueTokens.size()) - 1; + auto blockedUniqueTokens = chopVectorIntoBlocks(uniqueTokens, usableSize, mTokensPerBlock, true); + auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest); + storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx]); +} + +void WindowBlockManager::releaseBlocks(GenerationRequest& sequence) +{ + auto const requestId = sequence.getRequestId(); auto node = mAllocatedBlocksPerSeq.extract(requestId); + TLLM_CHECK(node); auto& allocatedBlocks = node.mapped(); for (auto it = allocatedBlocks.rbegin(); it != allocatedBlocks.rend(); ++it) { @@ -1177,10 +1424,18 @@ void BlockManager::releaseBlocks(GenerationRequest& sequence, OptionalRef const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, int64_t stream, - std::optional maxSequenceLength, bool enableBlockReuse, bool onboardBlocks, - CacheType cacheType, bool enablePartialReuse, bool copyOnPartialReuse) + std::optional const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, int64_t stream, std::optional maxSequenceLength, + bool enableBlockReuse, bool onboardBlocks, CacheType cacheType, bool enablePartialReuse, bool copyOnPartialReuse) : KVCacheManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, temporaryAttentionWindow, + blocksInSecondaryPool, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength, std::make_shared(reinterpret_cast(stream)), maxSequenceLength, enableBlockReuse, onboardBlocks, cacheType, std::nullopt, nullptr, false, enablePartialReuse, copyOnPartialReuse) @@ -1211,102 +1466,89 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size KVCacheManager::KVCacheManager(std::vector const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, int64_t stream, - std::optional maxSequenceLength, bool enableBlockReuse, bool onboardBlocks, - CacheType cacheType, std::optional secondaryOffloadMinPriority, + std::optional const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, int64_t stream, std::optional maxSequenceLength, + bool enableBlockReuse, bool onboardBlocks, CacheType cacheType, + std::optional secondaryOffloadMinPriority, std::shared_ptr eventManager, bool enablePartialReuse, bool copyOnPartialReuse) : KVCacheManager(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, - maxNumSequences, maxBeamWidth, maxAttentionWindowVec, temporaryAttentionWindow, sinkTokenLength, + maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength, std::make_shared(reinterpret_cast(stream)), maxSequenceLength, - enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority, std::move(eventManager), false, - enablePartialReuse, copyOnPartialReuse) + enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority, eventManager, enablePartialReuse, + copyOnPartialReuse) { } KVCacheManager::KVCacheManager(std::vector const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 
blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, CudaStreamPtr stream, - std::optional maxSequenceLength, bool enableBlockReuse, bool onboardBlocks, - CacheType cacheType, std::optional secondaryOffloadMinPriority, + std::optional const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional maxSequenceLength, + bool enableBlockReuse, bool onboardBlocks, CacheType cacheType, + std::optional secondaryOffloadMinPriority, std::shared_ptr eventManager, bool enableHashKey, bool enablePartialReuse, bool copyOnPartialReuse) : mMaxBeamWidth(maxBeamWidth) + , mDataType(dtype) , mMaxAttentionWindow(*std::max_element(maxAttentionWindowVec.begin(), maxAttentionWindowVec.end())) - , mMinAttentionWindow(*std::min_element(maxAttentionWindowVec.begin(), maxAttentionWindowVec.end())) - , mTemporaryAttentionWindow(temporaryAttentionWindow) , mTokensPerBlock(tokensPerBlock) , mSinkBubbleLength(BaseKVCacheManager::getSinkBubbleLength(sinkTokenLength, tokensPerBlock)) , mSinkBlockTokenLength(mSinkBubbleLength + sinkTokenLength) , mBlockManager(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, - maxNumSequences, std::move(stream), onboardBlocks, cacheType, secondaryOffloadMinPriority, + maxNumSequences, std::move(stream), maxSequenceLength, maxBeamWidth, maxAttentionWindowVec, + tempAttentionWindowInputs, dtype, mSinkBubbleLength, onboardBlocks, cacheType, secondaryOffloadMinPriority, std::move(eventManager), enableHashKey, enablePartialReuse, copyOnPartialReuse) // disable block reuse for sink bubble since chopVectorIntoBlocks does not match KV cache blocks in this case , mEnableBlockReuse{mSinkBubbleLength > 0 ? false : enableBlockReuse} , mEnableHashKey{enableHashKey} { + TLLM_CHECK_DEBUG(std::find(maxAttentionWindowVec.begin(), maxAttentionWindowVec.end(), mMaxAttentionWindow) + != maxAttentionWindowVec.end()); // The sink tokens are stored in blocks separate from other tokens. // If the last block of sink tokens is only partially filled, // we fill that block with a "bubble" to reach the number of tokens per block. TLLM_CHECK(mSinkBlockTokenLength % tokensPerBlock == 0); - - mMaxTokenNum = mMaxAttentionWindow + mSinkBubbleLength; - - // If maxBeamWidth > 1, use one more block for each sequence in the paged kv cache to avoid dropping the needed - // tokens, when enabling cyclic kv cache. - mUseOneMoreBlock - = maxSequenceLength.has_value() && maxSequenceLength.value() > mMaxAttentionWindow && maxBeamWidth > 1; - TLLM_CHECK_WITH_INFO(!mUseOneMoreBlock || mTemporaryAttentionWindow == 0, - "Can't support sliding window attention, paged context fmha, and beam search are used together."); - if (mUseOneMoreBlock) - { - mMaxTokenNum += tokensPerBlock; - } - - // Consider the mTemporaryAttentionWindow when allocating blocks. - mMaxBlocksPerSeq = tc::ceilDiv(mMaxTokenNum + mTemporaryAttentionWindow, tokensPerBlock); - TLLM_LOG_DEBUG("KV cache block reuse is %s", mEnableBlockReuse ? 
"enabled" : "disabled"); - TLLM_LOG_DEBUG("Max KV cache pages per sequence: %d", mMaxBlocksPerSeq); mSequences.reserve(maxNumSequences); } KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, CudaStreamPtr stream, - std::optional maxSequenceLength, bool enableBlockReuse, bool onboardBlocks, - CacheType cacheType, std::optional secondaryOffloadMinPriority, + std::optional const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional maxSequenceLength, + bool enableBlockReuse, bool onboardBlocks, CacheType cacheType, + std::optional secondaryOffloadMinPriority, std::shared_ptr eventManager, bool enableHashKey, bool enablePartialReuse, bool copyOnPartialReuse) : KVCacheManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, temporaryAttentionWindow, + blocksInSecondaryPool, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength, std::move(stream), maxSequenceLength, enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority, std::move(eventManager), enableHashKey, enablePartialReuse, copyOnPartialReuse) { } -void KVCacheManager::allocatePools(nvinfer1::DataType dtype, bool useUvm) +void KVCacheManager::allocatePools(bool useUvm) { - mBlockManager.allocatePools(dtype, useUvm); + mBlockManager.allocatePools(useUvm); + auto const numPools = mBlockManager.getNumPools(); if (tc::Logger::getLogger()->getLevel() == tc::Logger::INFO) { uint64_t cacheSizeBytes = 0; - auto const numPools = mBlockManager.getNumPools(); for (SizeType32 poolIdx = 0; poolIdx < numPools; poolIdx++) { auto const cacheShape = mBlockManager.getPrimaryPool(poolIdx)->getShape(); auto const cacheVolume = ITensor::volume(cacheShape); #ifdef ENABLE_FP4 - auto const isFp4 = dtype == nvinfer1::DataType::kFP4; + auto const isFp4 = mDataType == nvinfer1::DataType::kFP4; #else auto const isFp4 = false; #endif if (!isFp4) { - cacheSizeBytes += cacheVolume * BufferDataType(dtype).getSize(); + cacheSizeBytes += cacheVolume * BufferDataType(mDataType).getSize(); } else { @@ -1320,7 +1562,6 @@ void KVCacheManager::allocatePools(nvinfer1::DataType dtype, bool useUvm) cacheSizeBytes / static_cast(1 << 30), maxNumTokens); } - auto const numPools = mBlockManager.getNumPools(); auto const numKVPools = mBlockManager.getNumPools(/*include_block_scalar_pools=*/false); auto const numBlockScalePools = numPools - numKVPools; @@ -1336,13 +1577,12 @@ void KVCacheManager::allocatePools(nvinfer1::DataType dtype, bool useUvm) for (SizeType32 poolIdx = 0; poolIdx < numPools; poolIdx++) { - auto const isBlockScale = mBlockManager.containsBlockScales(poolIdx); - auto& outIdx = isBlockScale ? blockScalePoolIdx : kvPoolIdx; - auto& outRange = isBlockScale ? blockScalePtrsRange : poolPtrsRange; + auto const& pool = mBlockManager.getPool(poolIdx); + auto& outIdx = pool.containsBlockScales ? blockScalePoolIdx : kvPoolIdx; + auto& outRange = pool.containsBlockScales ? 
blockScalePtrsRange : poolPtrsRange; - outRange[outIdx * 2] = mBlockManager.getPrimaryPool(poolIdx)->data(); - auto secondaryPool = mBlockManager.getSecondaryPool(poolIdx); - outRange[outIdx * 2 + 1] = secondaryPool ? secondaryPool->data() : nullptr; + outRange[outIdx * 2] = pool.primaryPtr->data(); + outRange[outIdx * 2 + 1] = pool.secondaryPtr ? pool.secondaryPtr->data() : nullptr; ++outIdx; } @@ -1368,7 +1608,8 @@ void KVCacheManager::startScheduling() mBlockManager.startScheduling(); } -SizeType32 KVCacheManager::getNeededBlocksOneStep(LlmRequest const& req, bool twoStepsLookAhead) const +SizeType32 KVCacheManager::getNeededBlocksOneStep( + LlmRequest const& req, bool twoStepsLookAhead, SizeType32 windowSize) const { SizeType32 numRequiredBlocks = 0; SizeType32 const numDraftTokens = req.getNumDraftTokens(); @@ -1380,8 +1621,7 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(LlmRequest const& req, bool tw { // Assumes shared among beam = True auto const promptCacheLen - = std::min((isCrossKv() ? req.getEncoderOutputLen() : req.mPromptLen) + numDraftTokensPerStep, - mMaxAttentionWindow) + = std::min((isCrossKv() ? req.getEncoderOutputLen() : req.mPromptLen) + numDraftTokensPerStep, windowSize) + mSinkBubbleLength; auto const numSharedBlocks = promptCacheLen / getTokensPerBlock(); auto const numUnSharedTokens = promptCacheLen % getTokensPerBlock(); @@ -1401,7 +1641,7 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(LlmRequest const& req, bool tw auto const numPastTokens = req.mPromptLen + generatedTokens + mSinkBubbleLength - 1; auto const numNextTokens = numPastTokens + (twoStepsLookAhead ? 2 : 1) * numTokensPerStep; - if (numNextTokens > mMaxTokenNum) + if (numNextTokens > mBlockManager.getWindowSizeMetadata(windowSize).maxTokenNum) { return 0; } @@ -1413,8 +1653,9 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(LlmRequest const& req, bool tw return numRequiredBlocks; } -SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req) const +SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req, SizeType32 windowSize) const { + if (isCrossKv()) { if (req.isContextInitState() && req.getContextCurrentPosition() == 0) @@ -1424,14 +1665,15 @@ SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req) return 0; // cross KV cache doesn't grow after the initial context phase } + + auto const temporaryAttentionWindow = mBlockManager.getWindowSizeMetadata(windowSize).temporaryAttentionWindow; + SizeType32 const numContextBlocks - = (std::min(req.mPromptLen, mMaxAttentionWindow + mTemporaryAttentionWindow) + mSinkBubbleLength) - / getTokensPerBlock(); + = (std::min(req.mPromptLen, windowSize + temporaryAttentionWindow) + mSinkBubbleLength) / getTokensPerBlock(); - SizeType32 const numTotalBlocksPerBeam - = tc::ceilDiv(std::min(req.mPromptLen + req.mMaxNewTokens, mMaxAttentionWindow + mTemporaryAttentionWindow) - + mSinkBubbleLength, - getTokensPerBlock()); + SizeType32 const numTotalBlocksPerBeam = tc::ceilDiv( + std::min(req.mPromptLen + req.mMaxNewTokens, windowSize + temporaryAttentionWindow) + mSinkBubbleLength, + getTokensPerBlock()); SizeType32 const numGenBlocksPerBeam = numTotalBlocksPerBeam - numContextBlocks; @@ -1442,7 +1684,7 @@ SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req) if (seqIt != mSequences.end()) { auto const& seq = seqIt->second; - numAllocBlocksPerBeam = seq.getCacheBlockIds().at(0).size(); + numAllocBlocksPerBeam = seq.getCacheBlockIds(windowSize).at(0).size(); } 
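To make the per-window arithmetic of getRemainingBlocksToCompletion concrete, here is the computation on made-up numbers (tokensPerBlock=64, no sink bubble, no temporary attention window); a sketch, not the member function itself.

```cpp
#include <algorithm>
#include <cstdio>

int ceilDiv(int a, int b)
{
    return (a + b - 1) / b;
}

int main()
{
    // Made-up sizes for illustration only.
    int const tokensPerBlock = 64, sinkBubbleLength = 0;
    int const windowSize = 4096, temporaryAttentionWindow = 0;
    int const promptLen = 1000, maxNewTokens = 200, beamWidth = 1;
    int const numAllocBlocksPerBeam = 10; // blocks the sequence already holds

    // Context blocks: prompt clamped to the window, rounded down.
    int const numContextBlocks
        = (std::min(promptLen, windowSize + temporaryAttentionWindow) + sinkBubbleLength) / tokensPerBlock;
    // Total blocks per beam: prompt plus generation, clamped to the window, rounded up.
    int const numTotalBlocksPerBeam = ceilDiv(
        std::min(promptLen + maxNewTokens, windowSize + temporaryAttentionWindow) + sinkBubbleLength, tokensPerBlock);
    int const remaining = (numTotalBlocksPerBeam - numAllocBlocksPerBeam) * beamWidth;
    std::printf("context=%d total=%d remaining=%d\n", numContextBlocks, numTotalBlocksPerBeam, remaining);
    // prints: context=15 total=19 remaining=9
    return 0;
}
```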
} @@ -1454,10 +1696,10 @@ SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req) return (numTotalBlocksPerBeam - numAllocBlocksPerBeam) * req.mSamplingConfig.beamWidth; } -void KVCacheManager::cacheBlockOffsets(GenerationRequest& sequence) +void KVCacheManager::cacheBlockOffsets(GenerationRequest& sequence, SizeType32 windowSize) { - auto const& cacheBlocks = sequence.getCacheBlockIds(); - auto& cacheBlocksTensor = sequence.getCacheBlockIndices(); + auto const& cacheBlocks = sequence.getCacheBlockIds(windowSize); + auto& cacheBlocksTensor = sequence.getCacheBlockIndices(windowSize); auto const beamWidth = sequence.getBeamWidth(); auto* offsetsPtr = bufferCast(cacheBlocksTensor); @@ -1469,15 +1711,15 @@ void KVCacheManager::cacheBlockOffsets(GenerationRequest& sequence) for (SizeType32 blockIdx = 0; blockIdx < static_cast(beamCacheBlock.size()); ++blockIdx) { auto const blockId = beamCacheBlock.at(blockIdx); - setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId); + mBlockManager.setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId, windowSize); } } } -void KVCacheManager::cacheNewBlockOffsets(GenerationRequest& sequence) +void KVCacheManager::cacheNewBlockOffsets(GenerationRequest& sequence, SizeType32 windowSize) { - auto const& cacheBlocks = sequence.getCacheBlockIds(); - auto& cacheBlocksTensor = sequence.getCacheBlockIndices(); + auto const& cacheBlocks = sequence.getCacheBlockIds(windowSize); + auto& cacheBlocksTensor = sequence.getCacheBlockIndices(windowSize); auto const beamWidth = sequence.getBeamWidth(); auto* offsetsPtr = bufferCast(cacheBlocksTensor); @@ -1488,14 +1730,14 @@ void KVCacheManager::cacheNewBlockOffsets(GenerationRequest& sequence) auto const& beamCacheBlock = cacheBlocks[beamIdx]; auto const blockId = beamCacheBlock.back(); auto const blockIdx = static_cast(beamCacheBlock.size() - 1); - setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId); + mBlockManager.setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId, windowSize); } } -void KVCacheManager::updateNewBlockPointer(GenerationRequest& sequence, SizeType32 blockIdx) +void KVCacheManager::updateNewBlockPointer(GenerationRequest& sequence, SizeType32 windowSize, SizeType32 blockIdx) { - auto const& cacheBlocks = sequence.getCacheBlockIds(); - auto& cacheBlocksTensor = sequence.getCacheBlockIndices(); + auto const& cacheBlocks = sequence.getCacheBlockIds(windowSize); + auto& cacheBlocksTensor = sequence.getCacheBlockIndices(windowSize); auto const beamWidth = sequence.getBeamWidth(); auto* offsetsPtr = bufferCast(cacheBlocksTensor); @@ -1505,7 +1747,7 @@ void KVCacheManager::updateNewBlockPointer(GenerationRequest& sequence, SizeType { auto const& beamCacheBlock = cacheBlocks[beamIdx]; auto const blockId = beamCacheBlock.at(blockIdx); - setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId); + mBlockManager.setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId, windowSize); } } @@ -1529,39 +1771,43 @@ void KVCacheManager::updateToken(GenerationRequest& sequence, bool addToken) std::swap(currNumTokens, newNumTokens); } - SizeType32 const cyclicTokenNum = mMaxTokenNum - mSinkBlockTokenLength; - SizeType32 const nextTokenIdxInCycle = (currNumTokens - mSinkBlockTokenLength) % cyclicTokenNum; - SizeType32 const nextTokenIdxInCache = mSinkBlockTokenLength + nextTokenIdxInCycle; + for (auto const [windowSize, metadata] : mBlockManager.getWindowSizesMetadata()) + { + auto const maxTokenNum = metadata.maxTokenNum; + SizeType32 const 
cyclicTokenNum = maxTokenNum - mSinkBlockTokenLength; + SizeType32 const nextTokenIdxInCycle = (currNumTokens - mSinkBlockTokenLength) % cyclicTokenNum; + SizeType32 const nextTokenIdxInCache = mSinkBlockTokenLength + nextTokenIdxInCycle; - // (nextTokenIdxInCache - mSinkBlockTokenLength) % cyclicTokenNum == 0) - // <=> nextTokenIdxInCycle == 0 - // <=> nextTokenIdxInCache == mSinkBlockTokenLength - // => nextTokenIdxInCache % getTokensPerBlock() == 0 + // (nextTokenIdxInCache - mSinkBlockTokenLength) % cyclicTokenNum == 0) + // <=> nextTokenIdxInCycle == 0 + // <=> nextTokenIdxInCache == mSinkBlockTokenLength + // => nextTokenIdxInCache % getTokensPerBlock() == 0 - // Check if require a new block - if (nextTokenIdxInCache % getTokensPerBlock() == 0) - { - if (newNumTokens <= mMaxTokenNum) + // Check if require a new block + if (nextTokenIdxInCache % getTokensPerBlock() == 0) { - if (addToken) + if (newNumTokens <= maxTokenNum) { - mBlockManager.allocateBlock(sequence); - cacheNewBlockOffsets(sequence); + if (addToken) + { + mBlockManager.allocateBlock(sequence, windowSize); + cacheNewBlockOffsets(sequence, windowSize); + } + else + { + mBlockManager.releaseLastBlock(sequence, windowSize); + } } - else + else if (sequence.getBeamWidth() > 1) { - mBlockManager.releaseLastBlock(sequence); + TLLM_CHECK_WITH_INFO(addToken, "Remove token is not supported with beam search"); + // Get next block index + SizeType32 nextBlockIdx = nextTokenIdxInCache / getTokensPerBlock(); + // Replace the shared block with the unshared ones + mBlockManager.replaceSharedBlock(sequence, windowSize, nextBlockIdx); + updateNewBlockPointer(sequence, windowSize, nextBlockIdx); } } - else if (sequence.getBeamWidth() > 1 || mEnableBlockReuse) - { - TLLM_CHECK_WITH_INFO(addToken, "Remove token is not supported with beam search"); - // Get next block index - SizeType32 nextBlockIdx = nextTokenIdxInCache / getTokensPerBlock(); - // Replace the shared block with the unshared ones - mBlockManager.replaceSharedBlock(sequence, nextBlockIdx); - updateNewBlockPointer(sequence, nextBlockIdx); - } } } @@ -1591,78 +1837,83 @@ void KVCacheManager::addSequence( auto const [seqIt, emplaceDone] = [&] { auto lck = std::scoped_lock(mSequencesMtx); - return mSequences.emplace(requestId, - GenerationRequest(requestId, inputLength, beamWidth, mMaxBlocksPerSeq, - mMinAttentionWindow + mSinkBubbleLength, // When the request has started cycling, disable reuse. - mBlockManager.getNumPools(), kvCacheRetentionConfig)); + return mSequences.try_emplace(requestId, requestId, inputLength, beamWidth, + mBlockManager.getWindowSizesMetadata(), kvCacheRetentionConfig); }(); TLLM_CHECK(emplaceDone); auto& sequence = seqIt->second; - // Get the final token index in kv cache - SizeType32 const finalTokenKVIdx - = mSinkBlockTokenLength + ((inputLength - 1 - mSinkBlockTokenLength) % (mMaxTokenNum - mSinkBlockTokenLength)); - - // Get block index that with shareAmongBeams=False. - // For cross kv cache in encoder-decoder models, always shareAmongBeams=True. - SizeType32 unsharedBlockIdx = -1; - if ((!sequence.isCyclic() || beamWidth > 1 || finalTokenKVIdx % getTokensPerBlock() > 0) && !isCrossKv()) - { - unsharedBlockIdx = ((finalTokenKVIdx + 1) % getTokensPerBlock() == 0) - ? finalTokenKVIdx / getTokensPerBlock() + 1 - : finalTokenKVIdx / getTokensPerBlock(); - } - - // Consider the mTemporaryAttentionWindow when allocating blocks. 
- inputLength = std::min(inputLength, mMaxTokenNum + mTemporaryAttentionWindow); - auto const numContextBlocks = tc::ceilDiv(inputLength, getTokensPerBlock()); - // Get statistics for block allocations/reuse pre request. SizeType32 const numAllocTotalBlocksPreRequest = mBlockManager.getNumAllocTotalBlocks(); SizeType32 const numAllocNewBlocksPreRequest = mBlockManager.getNumAllocNewBlocks(); SizeType32 const numReusedBlocksPreRequest = mBlockManager.getNumReusedBlocks(); SizeType32 const numMissedBlocksPreRequest = mBlockManager.getNumMissedBlocks(); - if (!sequence.isCyclic() && mEnableBlockReuse) - { - mBlockManager.addSequence(sequence, inputLength, numContextBlocks, *llmRequest); - } - else + for (auto const [windowSize, metadata] : mBlockManager.getWindowSizesMetadata()) { - if (!mEnableBlockReuse && llmRequest && llmRequest->getKvCacheRetentionConfig().has_value()) + auto const maxTokenNum = metadata.maxTokenNum; + auto const temporaryAttentionWindow = metadata.temporaryAttentionWindow; + + // Get the final token index in kv cache + SizeType32 const finalTokenKVIdx = mSinkBlockTokenLength + + ((inputLength - 1 - mSinkBlockTokenLength) % (maxTokenNum - mSinkBlockTokenLength)); + + // Get block index that with shareAmongBeams=False. + // For cross kv cache in encoder-decoder models, always shareAmongBeams=True. + SizeType32 unsharedBlockIdx = -1; + if ((!sequence.isCyclic() || beamWidth > 1 || finalTokenKVIdx % getTokensPerBlock() > 0) && !isCrossKv()) { - TLLM_LOG_WARNING( - "Request %d has a retention configuration set, but block reuse is disabled. The retention config will " - "have no effect.", - llmRequest->mRequestId); + unsharedBlockIdx = ((finalTokenKVIdx + 1) % getTokensPerBlock() == 0) + ? finalTokenKVIdx / getTokensPerBlock() + 1 + : finalTokenKVIdx / getTokensPerBlock(); } - mBlockManager.addSequence(sequence, numContextBlocks, unsharedBlockIdx); - if (mEnableHashKey && llmRequest.has_value() && beamWidth == 1) + + // Consider the temporaryAttentionWindow when allocating blocks. + auto const effectiveInputLength = std::min(inputLength, maxTokenNum + temporaryAttentionWindow); + auto const numContextBlocks = tc::ceilDiv(effectiveInputLength, getTokensPerBlock()); + if (!sequence.isCyclic() && mEnableBlockReuse) { - constexpr SizeType32 beamIdx = 0; - auto const& blockIds = sequence.getCacheBlockIds().at(beamIdx); - auto const& uniqueTokens = llmRequest->getUniqueTokens(beamIdx); - auto blockedUniqueTokens - = chopVectorIntoBlocks(uniqueTokens, uniqueTokens.size() - 1, getTokensPerBlock(), true); - auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest); - auto tokensPerBlock = static_cast(getTokensPerBlock()); - for (size_t i = 0; i < blockIds.size(); i++) + mBlockManager.addSequence(sequence, effectiveInputLength, numContextBlocks, *llmRequest, windowSize); + } + else + { + if (!mEnableBlockReuse && llmRequest && llmRequest->getKvCacheRetentionConfig().has_value()) { - auto const& block = mBlockManager.getBlockById(blockIds[i]); - if (i < blockKeys.size()) - { - block->setBlockKey(blockKeys[i], blockKeys[i].uniqueTokens.size() == tokensPerBlock); - } - else + TLLM_LOG_WARNING( + "Request %d has a retention configuration set, but block reuse is disabled. 
The retention " + "config " + "will " + "have no effect.", + llmRequest->mRequestId); + } + mBlockManager.addSequence(sequence, numContextBlocks, unsharedBlockIdx, windowSize); + if (mEnableHashKey && llmRequest.has_value() && beamWidth == 1) + { + constexpr SizeType32 beamIdx = 0; + auto const& blockIds = sequence.getCacheBlockIds(windowSize).at(beamIdx); + auto const& uniqueTokens = llmRequest->getUniqueTokens(beamIdx); + auto blockedUniqueTokens = chopVectorIntoBlocks( + uniqueTokens, uniqueTokens.size() - 1, getTokensPerBlock(), true); + auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest); + auto tokensPerBlock = static_cast(getTokensPerBlock()); + for (size_t i = 0; i < blockIds.size(); i++) { - block->setBlockKey({}, false); + auto const& block = mBlockManager.getBlockById(blockIds[i], windowSize); + if (i < blockKeys.size()) + { + block->setBlockKey(blockKeys[i], blockKeys[i].uniqueTokens.size() == tokensPerBlock); + } + else + { + block->setBlockKey({}, false); + } + block->setHash(); + mBlockManager.addBlockToHashMap(block, windowSize); } - block->setHash(); - mBlockManager.addBlockToHashMap(block); } } + cacheBlockOffsets(sequence, windowSize); } - cacheBlockOffsets(sequence); if (llmRequest) { @@ -1702,7 +1953,7 @@ void KVCacheManager::removeSequence(RequestIdType requestId, OptionalRef(output); - auto const* srcPtr = bufferCast(cacheBlocksTensor); auto const& dstShape = output.getShape(); - auto const& srcShape = cacheBlocksTensor.getShape(); SizeType32 constexpr kIdx = 0; SizeType32 constexpr vIdx = 1; SizeType32 maxBlockCount{0}; // Get page table for each KV cache pool - auto const numPools = mBlockManager.getNumPools(); - - for (SizeType32 poolIdx = 0; poolIdx < numPools; poolIdx++) - { - for (SizeType32 beamIdx = 0; beamIdx < beamWidth; ++beamIdx) + SizeType32 absolutePoolIdx = 0; + for (auto const [windowSize, metadata] : mBlockManager.getWindowSizesMetadata()) + { + auto const& cacheBlocksTensor = sequence.getCacheBlockIndices(windowSize); + auto const* srcPtr = bufferCast(cacheBlocksTensor); + auto const& srcShape = cacheBlocksTensor.getShape(); + auto const& cacheBlockIds = sequence.getCacheBlockIds(windowSize); + for (SizeType32 poolIdx = 0; poolIdx < metadata.numPools; poolIdx++, absolutePoolIdx++) { - auto const beamBlockCount = sequence.getCacheBlockIds()[beamIdx].size(); - auto const copyChunkSize = beamBlockCount * sizeof(tk::KVCacheIndex); - for (auto xIdx : {kIdx, vIdx}) + for (SizeType32 beamIdx = 0; beamIdx < beamWidth; ++beamIdx) { - auto const srcIndex = tc::flat_index(srcShape.d, poolIdx, beamIdx, xIdx, 0); - auto const dstIndex = tc::flat_index(dstShape.d, poolIdx, outputSlotOffset + beamIdx, xIdx, 0); - std::memcpy(dstPtr + dstIndex, srcPtr + srcIndex, copyChunkSize); + auto const beamBlockCount = cacheBlockIds[beamIdx].size(); + auto const copyChunkSize = beamBlockCount * sizeof(tk::KVCacheIndex); + for (auto xIdx : {kIdx, vIdx}) + { + auto const srcIndex = tc::flat_index(srcShape.d, poolIdx, beamIdx, xIdx, 0); + auto const dstIndex + = tc::flat_index(dstShape.d, absolutePoolIdx, outputSlotOffset + beamIdx, xIdx, 0); + std::memcpy(dstPtr + dstIndex, srcPtr + srcIndex, copyChunkSize); + } + maxBlockCount = std::max(maxBlockCount, static_cast(beamBlockCount)); } - maxBlockCount = std::max(maxBlockCount, static_cast(beamBlockCount)); } } return maxBlockCount; @@ -1860,32 +2115,29 @@ SizeType32 BaseKVCacheManager::getSinkBubbleLength(SizeType32 sinkTokenLen, Size return sinkBubbleLength; } -bool 
KVCacheManager::schedulingHasFreeBlocks(SizeType32 numRequired) const -{ - return mBlockManager.schedulingHasFreeBlocks(numRequired); -} - -std::vector> const& KVCacheManager::getCacheBlockIds(RequestIdType requestId) const +std::vector> const& KVCacheManager::getCacheBlockIds( + RequestIdType requestId, SizeType32 windowSize) const { - return getSequence(requestId).getCacheBlockIds(); + return getSequence(requestId).getCacheBlockIds(windowSize); } std::vector>> KVCacheManager::getBatchCacheBlockIds( - std::vector const& requestIds) const + std::vector const& requestIds, SizeType32 windowSize) const { std::vector>> result{}; result.reserve(requestIds.size()); for (auto const& requestId : requestIds) { auto const& sequence = getSequence(requestId); - result.emplace_back(sequence.getCacheBlockIds()); + result.emplace_back(sequence.getCacheBlockIds(windowSize)); } return result; } -std::vector KVCacheManager::getNewlyAllocatedBlockIds(LlmRequest::RequestIdType requestId) const +std::vector KVCacheManager::getNewlyAllocatedBlockIds( + LlmRequest::RequestIdType requestId, SizeType32 windowSize) const { - return mBlockManager.getNewlyAllocatedBlockIds(mSequences.at(requestId)); + return mBlockManager.getNewlyAllocatedBlockIds(mSequences.at(requestId), windowSize); } runtime::ITensor::SharedPtr KVCacheManager::getPrimaryPool(SizeType32 layer_idx) const @@ -1895,7 +2147,7 @@ runtime::ITensor::SharedPtr KVCacheManager::getPrimaryPool(SizeType32 layer_idx) SizeType32 KVCacheManager::getMaxCapacityBatchSize(SizeType32 inputLength, SizeType32 outputLength) const { - + // TODO(nhaber): mMaxAttentionWindow -> Check this call auto const blockRequirementsPerSequence = KVCacheManager::calculateMaxBlockRequirements( inputLength, outputLength, mSinkBlockTokenLength, mMaxAttentionWindow, mMaxBeamWidth, mTokensPerBlock); @@ -1922,8 +2174,8 @@ SizeType32 KVCacheManager::calculateMaxBlockRequirements(SizeType32 inputLength, wholeSequenceLength, sinkTokenLength, maxAttentionWindow, tokensPerBlock); } - // If the whole attention window can fit in the output, then we can simply multiply the cost of a sequence of length - // max attention window by the beam width. + // If the whole attention window can fit in the output, then we can simply multiply the cost of a sequence of + // length max attention window by the beam width. if (maxAttentionWindow <= outputLength) { return KVCacheManager::calculateMaxBlockRequirementsPerBeam( @@ -1955,9 +2207,9 @@ SizeType32 KVCacheManager::calculateMaxBlockRequirements(SizeType32 inputLength, return (blockCapacity / beamWidth) * tokensPerBlock; } - // Otherwise, we need to determine how many context tokens we can fit on top of the output tokens. First, there are - // a few context tokens we might be able to fit 'for free' because the output is not a multiple of the number of - // tokens per block. + // Otherwise, we need to determine how many context tokens we can fit on top of the output tokens. First, there + // are a few context tokens we might be able to fit 'for free' because the output is not a multiple of the + // number of tokens per block. 
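// Worked example (hypothetical values, not taken from this change): with
// tokensPerBlock = 64, blockCapacity = 20, outputLength = 100 and, say,
// outputBlockRequirements = 2 (two blocks hold the 100 output tokens with 28
// slots of slack), the computation below gives
//   leftoverBlockCapacity = 20 - 2 = 18
//   result = min(100 + 18 * 64, inputLength + 100) = min(1252, inputLength + 100)
// so up to 1152 context tokens fit on top of the output for this capacity.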
auto const leftoverBlockCapacity = blockCapacity - outputBlockRequirements; return std::min(outputLength + leftoverBlockCapacity * tokensPerBlock, inputLength + outputLength); } diff --git a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp index 329147b4d0c..78badb9bd55 100644 --- a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp @@ -107,7 +107,7 @@ void MLACacheFormatter::formatOutput(LlmRequest const& llmRequest, constexpr SizeType32 beam{0}; auto const numPools = mCacheManager->getBlockManager().getNumPools(); - auto blockRange = BlockRange(*mCacheManager, llmRequest.mRequestId, beam); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mCacheManager, llmRequest.mRequestId, beam); int blockNum = 0; std::vector inputKvCacheBlocks; @@ -121,7 +121,7 @@ void MLACacheFormatter::formatOutput(LlmRequest const& llmRequest, } } TLLM_CHECK(blockNum > 0); - int deviceId = mCacheManager->getBlockManager().getBufferManager().getStream().getDevice(); + int deviceId = mCacheManager->getBlockManager().getStreamDevice(); if (common::getEnvTryZCopyForKVCacheTransfer() && destConfig.getParallelConfig().mPipelineParallelism == selfConfig.getParallelConfig().mPipelineParallelism) @@ -330,7 +330,7 @@ void MLACacheFormatter::formatInput(LlmRequest const& llmRequest, auto pickUpConnections = pickRecvConnections(connections, selfConfig, selfIdx, destConfig); // diff end constexpr SizeType32 beam{0}; - auto blockRange = BlockRange(*mCacheManager, llmRequest.mRequestId, beam); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mCacheManager, llmRequest.mRequestId, beam); std::vector recvBufferTmps; std::vector outputBuffers; auto const numPools = mCacheManager->getBlockManager().getNumPools(); diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp index b6ad1a1f585..5d3e6620081 100644 --- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp +++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp @@ -200,12 +200,6 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr(optionalParams.guidedDecodingConfig.value(), @@ -299,6 +293,31 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptrgetBlockManager(); + + TLLM_CHECK_WITH_INFO(blockManager.getNumPools() == 1, + "Rewinding KV cache blocks for models with multiple pools is not supported"); + + // Two "redundant" checks given the pool size check above, but those below don't rely on an implementation + // detail I guess. 
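// Concretely: a single pool implies both a single attention window size and a
// uniform KV-head count, so the two checks below are implied by the pool-count
// check above; they are kept because they state the requirement directly
// rather than relying on how pools happen to be laid out.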
+ TLLM_CHECK_WITH_INFO( + !blockManager.isVariableWindow(), "Rewinding KV cache blocks for variable SWA models isn't supported"); + auto const maxBlocksPerSeq = blockManager.getMaxBlockPerSeqWhenSingleWindowSize(); + auto const isUseOneMoreBlock = kv_cache_manager::BlockManager::isUseOneMoreBlock( + getMaxAttentionWindow(), getMaxSequenceLen(), getMaxBeamWidth()); + + // TODO(oargov): VGQA is not supported, assume all layers have the same num_kv_heads + TLLM_CHECK_WITH_INFO( + !blockManager.isVariableGQA(), "Rewinding KV cache blocks for variable GQA models isn't supported"); + auto const numKvHeads = mModelConfig.getNbKvHeads(0); + + mRewindInputs = RewindInputs{maxBlocksPerSeq, isUseOneMoreBlock, numKvHeads}; + } + if (mWorldConfig.isPipelineParallel()) { mAsyncSendWaitThread = std::make_unique( @@ -347,7 +366,7 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptrgetTokensPerBlock(); // If sliding window attention is used, then make sure the unit size aligns with the paged context fmha's kv // step size. - if (getMaxInputLen() > getMaxAttentionWindow()) + if (getMaxInputLen() > getMaxAttentionWindow()) // TODO(nhaber): minAttentionWindow { chunkUnitSize = std::max(/* maxKvStepSizeInFmha */ 256, chunkUnitSize); TLLM_LOG_INFO("ChunkUnitSize is set to %d as sliding window attention is used.", chunkUnitSize); @@ -484,8 +503,7 @@ void TrtGptModelInflightBatching::setupSpeculativeDecodingModule(executor::Decod } } -void TrtGptModelInflightBatching::reshapeKvTensors( - SizeType32 maxBlocksPerSeq, kv_cache_manager::CacheType kvCacheType, SizeType32 numPools) +void TrtGptModelInflightBatching::reshapeKvTensors(OffsetTableDimensions const& dims) { TLLM_CHECK(mBuffers.size() == static_cast(mNumBuffers)); auto const& manager = mRuntime->getBufferManager(); @@ -495,7 +513,7 @@ void TrtGptModelInflightBatching::reshapeKvTensors( // any method that operates on transformerBuffers must distinguish between self and cross cache, because // transformerBuffers is not managed by KVCacheManager same rule applies to kv pool pointers below buffers->transformerBuffers->reshapeKvTensors( - getMaxBatchSize(), mOperatingBeamWidth, maxBlocksPerSeq, kvCacheType, numPools, manager); + getMaxBatchSize(), mOperatingBeamWidth, dims.maxBlocksPerSeq, dims.cacheType, dims.numPools, manager); } } @@ -558,6 +576,7 @@ std::shared_ptr TrtGptModelInflightBatching::c break; } } + // Below assertion should be removed once SWA/VSWA is no longer cyclic. TLLM_CHECK_WITH_INFO( getMaxBeamWidth() == 1 || !enableCyclicKvCache, "Can't support cyclic kv cache with beam search."); @@ -575,23 +594,18 @@ std::shared_ptr TrtGptModelInflightBatching::c adjustMaxAttentionWindow(blocksInPrimaryPool, tokensPerBlock); } - auto const& maxAttentionWindowVec = kvCacheType == KvCacheType::kSELF - ? getMaxAttentionWindowVec() - : std::vector{mModelConfig.getMaxEncoderLen()}; + auto maxAttentionWindowVec = getMaxAttentionWindowVec(); - // Only needed when sliding window attention + paged context fmha are used together. - // In that case, a temporary kv cache buffer with maximum chunk size (maxNumTokens) is needed. - // TODO: There are several things that can be improved later. - // 1. a dynamic temporary kv cache allocation based on real chunk size might be needed. - // 2. reuse the same temporary kv cache buffer among all layers in the same pool. 
- SizeType32 temporaryKvCacheLength{0}; - if (mModelConfig.getPagedContextFMHA() && (getMaxInputLen() > getMaxAttentionWindow())) + if (kvCacheType != KvCacheType::kSELF) // TODO(nhaber): more foolproof way of initing cross-kvcache-manager { - TLLM_CHECK_WITH_INFO(getMaxNumTokens(), "Max number of tokens is not set in model config."); - temporaryKvCacheLength = std::min(getMaxNumTokens().value(), getMaxInputLen() - getMaxAttentionWindow()); - TLLM_LOG_INFO("TemporaryKvCacheLength for sliding window attention: %d", temporaryKvCacheLength); + maxAttentionWindowVec = std::vector{mModelConfig.getMaxEncoderLen()}; } + kv_cache_manager::TempAttentionWindowInputs tempAttentionWindowInputs; + tempAttentionWindowInputs.pagedContextFMHA = mModelConfig.getPagedContextFMHA(); + tempAttentionWindowInputs.maxInputLen = getMaxInputLen(); + tempAttentionWindowInputs.maxNumTokens = getMaxNumTokens().value(); + if (kvCacheType == KvCacheType::kCROSS && kvCacheConfig.enableBlockReuse) { TLLM_LOG_INFO( @@ -602,18 +616,16 @@ std::shared_ptr TrtGptModelInflightBatching::c auto kvCacheManager = std::make_shared(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, getMaxNumSequences(), getMaxBeamWidth(), maxAttentionWindowVec, - temporaryKvCacheLength, getSinkTokenLen(), mRuntime->getStreamPtr(), std::nullopt, enableBlockReuse, + tempAttentionWindowInputs, kvDtype, getSinkTokenLen(), mRuntime->getStreamPtr(), std::nullopt, enableBlockReuse, kvCacheConfig.onboardBlocks, kvCacheType, kvCacheConfig.secondaryOffloadMinPriority, kvCacheConfig.eventBufferMaxSize > 0 ? std::make_unique(kvCacheConfig.eventBufferMaxSize) : nullptr, false, kvCacheConfig.enablePartialReuse, kvCacheConfig.copyOnPartialReuse); - auto const& blockManager = kvCacheManager->getBlockManager(); + reshapeKvTensors(kvCacheManager->getOffsetTableDimensions()); - reshapeKvTensors(kvCacheManager->getMaxBlocksPerSeq(), blockManager.getCacheType(), blockManager.getNumPools()); - - kvCacheManager->allocatePools(kvDtype, kvCacheConfig.useUvm); + kvCacheManager->allocatePools(kvCacheConfig.useUvm); TensorMap inputBuffers; TensorPtr poolPointers = kvCacheManager->getBlockPoolPointers(); @@ -1047,11 +1059,20 @@ void TrtGptModelInflightBatching::forwardAsync(RequestList const& activeRequests mMicroBatchId = getNextMicroBatchId(mMicroBatchId); } // In case of error, we need to free the batch slot associated with those requests - catch (std::exception const& e) + catch (std::exception const&) { - for (auto const& llmReq : activeRequests) + try + { + for (auto const& llmReq : activeRequests) + { + terminateRequest(llmReq); + } + } + catch (std::exception const& e) { - terminateRequest(llmReq); + TLLM_LOG_ERROR("forwardAsync catch-all catch block that runs `terminateRequest` has failed with:"); + TLLM_LOG_EXCEPTION(e); + TLLM_LOG_ERROR("Rethrowing *outer* exception:"); } throw; } @@ -2308,8 +2329,6 @@ std::vector> TrtGptModelInflightBatching:: void TrtGptModelInflightBatching::rewindKVCacheBlocks(SizeType32 numSequences) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - TLLM_CHECK_WITH_INFO(mKvCacheManager->getBlockManager().getNumPools() == 1, - "Rewinding KV cache blocks for models with multiple pools is not supported"); auto const bufferId = getFusedBufferId(); auto& runtimeBuffers = *mBuffers.at(bufferId); @@ -2323,13 +2342,9 @@ void TrtGptModelInflightBatching::rewindKVCacheBlocks(SizeType32 numSequences) localNbLayers -= eagleModulePtr->getNumTransformerLayers(); } - // TODO: VGQA is not supported, assume all layers 
have the same num_kv_heads - auto const numKvHeads = mModelConfig.getNbKvHeads(0); auto const tokensPerBlock = mModelConfig.getTokensPerBlock(); auto const elemSize = BufferDataType(mModelConfig.getKvDataType()).getSize(); auto const sizeInBytesPerKVHead = mModelConfig.getSizePerHead() * elemSize; - auto const maxBlocksPerSeq = mKvCacheManager->getMaxBlocksPerSeq(); - auto const useOneMoreBlock = mKvCacheManager->isUseOneMoreBlock(); auto const poolPointers = mKvCacheManager->getBlockPoolPointers(); auto* const* pointerArrayPtr = bufferCast(*poolPointers); @@ -2347,9 +2362,10 @@ void TrtGptModelInflightBatching::rewindKVCacheBlocks(SizeType32 numSequences) tensorrt_llm::runtime::kernels::invokeUpdateKVBlockArrayDraftTokenLocation( *mDecoderBuffers->draftBuffers.acceptedLengthsCumSumDevice, *mDecoderBuffers->draftBuffers.acceptedPackedPathsDevice, *runtimeBuffers.sequenceLengthsDevice, - pointerArrayPtr, offsetArrayPtr, localNbLayers, numSequences, numKvHeads, sizeInBytesPerKVHead, commonRewindLen, - rewindLens, *runtimeBuffers.seqSlotRemappingDevice, *runtimeBuffers.sortedSeqSlots, getMaxAttentionWindow(), - maxBlocksPerSeq, tokensPerBlock, useOneMoreBlock, mRuntime->getStreamPtr()->get()); + pointerArrayPtr, offsetArrayPtr, localNbLayers, numSequences, mRewindInputs.numKvHeads, sizeInBytesPerKVHead, + commonRewindLen, rewindLens, *runtimeBuffers.seqSlotRemappingDevice, *runtimeBuffers.sortedSeqSlots, + getMaxAttentionWindow(), mRewindInputs.maxBlocksPerSeq, tokensPerBlock, mRewindInputs.isUseOneMoreBlock, + mRuntime->getStreamPtr()->get()); sync_check_cuda_error(mRuntime->getStream().get()); @@ -2377,15 +2393,13 @@ void TrtGptModelInflightBatching::changeBeamWidth(SizeType32 beamWidth) if (static_cast(mKvCacheManager)) { - auto const& blockManager = mKvCacheManager->getBlockManager(); - reshapeKvTensors( - mKvCacheManager->getMaxBlocksPerSeq(), blockManager.getCacheType(), blockManager.getNumPools()); + auto const dims = mKvCacheManager->getOffsetTableDimensions(); + reshapeKvTensors(dims); } if (static_cast(mCrossKvCacheManager)) { - auto const& blockManager = mCrossKvCacheManager->getBlockManager(); - reshapeKvTensors( - mCrossKvCacheManager->getMaxBlocksPerSeq(), blockManager.getCacheType(), blockManager.getNumPools()); + auto const dims = mCrossKvCacheManager->getOffsetTableDimensions(); + reshapeKvTensors(dims); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h index 91cf6d76df6..e66e0ff765b 100644 --- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h +++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h @@ -50,6 +50,7 @@ namespace tensorrt_llm::batch_manager namespace kv_cache_manager { class KVCacheManager; +struct OffsetTableDimensions; } // namespace kv_cache_manager namespace rnn_state_manager @@ -86,9 +87,17 @@ namespace utils class CudaGraphExecutorCache; } // namespace utils +struct RewindInputs +{ + SizeType32 maxBlocksPerSeq; + bool isUseOneMoreBlock; + SizeType32 numKvHeads; +}; + class TrtGptModelInflightBatching : public TrtGptModel { using BaseKVCacheManager = kv_cache_manager::BaseKVCacheManager; + using OffsetTableDimensions = kv_cache_manager::OffsetTableDimensions; using KVCacheManager = kv_cache_manager::KVCacheManager; using KvCacheType = kv_cache_manager::CacheType; using KvCacheConfig = kv_cache_manager::KvCacheConfig; @@ -330,7 +339,7 @@ class TrtGptModelInflightBatching : public TrtGptModel 
[[nodiscard]] nvinfer1::Dims getTensorShape(std::string const& name) const override; - void reshapeKvTensors(SizeType32 maxBlocksPerSeq, kv_cache_manager::CacheType kvCacheType, SizeType32 numPools); + void reshapeKvTensors(OffsetTableDimensions const& dims); [[nodiscard]] bool hasSpeculativeDecodingFastLogits() const noexcept override { @@ -533,6 +542,7 @@ class TrtGptModelInflightBatching : public TrtGptModel RequestVector mDraftRequestsWaitingToSendLogits; SizeType32 mSeamlessLADMaxDraftLen{0}; bool mUseSeamlessLookahead{false}; + RewindInputs mRewindInputs; /******************** Algorithms ********************/ // Algorithms are reentrant, they are assigned a state at diff --git a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp index 926d3733f9c..d39ca65db7c 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp @@ -55,9 +55,9 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager { public: // using BaseKVCacheManager::BaseKVCacheManager; // Inherit constructors - void allocatePools(nvinfer1::DataType dtype, bool useUvm = false) override + void allocatePools(bool useUvm = false) override { - PYBIND11_OVERLOAD_PURE(void, tbk::BaseKVCacheManager, allocatePools, dtype, useUvm); + PYBIND11_OVERLOAD_PURE(void, tbk::BaseKVCacheManager, allocatePools, useUvm); } void releasePools() override @@ -90,21 +90,6 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager PYBIND11_OVERLOAD_PURE(tbk::KvCacheStats, tbk::BaseKVCacheManager, getKvCacheStats); } - SizeType32 getMaxBlocksPerSeq() const override - { - PYBIND11_OVERLOAD_PURE(SizeType32, tbk::BaseKVCacheManager, getMaxBlocksPerSeq); - } - - SizeType32 getNeededBlocksOneStep(tb::LlmRequest const& req, bool twoStepsLookAhead) const override - { - PYBIND11_OVERLOAD_PURE(SizeType32, tbk::BaseKVCacheManager, getNeededBlocksOneStep, req, twoStepsLookAhead); - } - - SizeType32 getRemainingBlocksToCompletion(tb::LlmRequest const& req) const override - { - PYBIND11_OVERLOAD_PURE(SizeType32, tbk::BaseKVCacheManager, getRemainingBlocksToCompletion, req); - } - void addToken(tb::LlmRequest::RequestIdType requestId) override { PYBIND11_OVERLOAD_PURE(void, tbk::BaseKVCacheManager, addToken, requestId); @@ -164,11 +149,6 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager PYBIND11_OVERLOAD_PURE(bool, tbk::BaseKVCacheManager, isEnableBlockReuse); } - bool isUseOneMoreBlock() const override - { - PYBIND11_OVERLOAD_PURE(bool, tbk::BaseKVCacheManager, isUseOneMoreBlock); - } - void rewindKVCache(tb::LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override { PYBIND11_OVERLOAD_PURE(void, tbk::BaseKVCacheManager, rewindKVCache, requestId, rewindLengths); @@ -191,27 +171,25 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager PYBIND11_OVERLOAD_PURE(void, tbk::BaseKVCacheManager, storeContextBlocks, llmRequest); } - bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const override + std::vector> const& getCacheBlockIds( + tb::LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override { - PYBIND11_OVERLOAD_PURE(bool, tbk::BaseKVCacheManager, schedulingHasFreeBlocks, numRequired); - } - - std::vector> const& getCacheBlockIds(tb::LlmRequest::RequestIdType requestId) const override - { - PYBIND11_OVERLOAD_PURE( - std::vector> const&, tbk::BaseKVCacheManager, getCacheBlockIds, requestId); + PYBIND11_OVERLOAD_PURE(std::vector> const&, tbk::BaseKVCacheManager, 
getCacheBlockIds, + requestId, windowSize); } std::vector>> getBatchCacheBlockIds( - std::vector const& requestIds) const override + std::vector const& requestIds, SizeType32 windowSize) const override { PYBIND11_OVERLOAD_PURE(std::vector>>, tbk::BaseKVCacheManager, - getBatchCacheBlockIds, requestIds); + getBatchCacheBlockIds, requestIds, windowSize); } - std::vector getNewlyAllocatedBlockIds(tb::LlmRequest::RequestIdType requestId) const override + std::vector getNewlyAllocatedBlockIds( + tb::LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override { - PYBIND11_OVERLOAD_PURE(std::vector, tbk::BaseKVCacheManager, getNewlyAllocatedBlockIds, requestId); + PYBIND11_OVERLOAD_PURE( + std::vector, tbk::BaseKVCacheManager, getNewlyAllocatedBlockIds, requestId, windowSize); } SizeType32 getUsedNumBlocks() const override @@ -322,6 +300,12 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m) .def_readwrite("missed_blocks", &tbk::KvCacheStats::missedBlocks) .def_readwrite("cache_hit_rate", &tbk::KvCacheStats::cacheHitRate); + py::class_(m, "TempAttentionWindowInputs") + .def(py::init<>()) + .def_readwrite("paged_context_fmha", &tbk::TempAttentionWindowInputs::pagedContextFMHA) + .def_readwrite("max_input_len", &tbk::TempAttentionWindowInputs::maxInputLen) + .def_readwrite("max_num_tokens", &tbk::TempAttentionWindowInputs::maxNumTokens); + py::class_>(m, "KVCacheEventManager") .def(py::init(), py::arg("max_kv_event_entries")); @@ -336,7 +320,8 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m) .def_property_readonly("max_num_blocks", &BaseKVCacheManager::getMaxNumBlocks) .def_property_readonly("num_pools", &BaseKVCacheManager::getNumPools) .def("get_kv_cache_stats", &BaseKVCacheManager::getKvCacheStats) - .def_property_readonly("max_blocks_per_seq", &BaseKVCacheManager::getMaxBlocksPerSeq) + .def_property_readonly("max_blocks_per_seq", + [](tbk::BaseKVCacheManager& self) { return self.getOffsetTableDimensions().maxBlocksPerSeq; }) .def("get_needed_blocks_one_step", &BaseKVCacheManager::getNeededBlocksOneStep) .def("get_remaining_blocks_to_completion", &BaseKVCacheManager::getRemainingBlocksToCompletion) .def("add_token", &BaseKVCacheManager::addToken) @@ -414,11 +399,9 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m) }, py::arg("timeout_ms") = std::nullopt) .def_property_readonly("enable_block_reuse", &BaseKVCacheManager::isEnableBlockReuse) - .def_property_readonly("use_one_more_block", &BaseKVCacheManager::isUseOneMoreBlock) .def("rewind_kv_cache", &BaseKVCacheManager::rewindKVCache) .def_property_readonly("cross_kv", &BaseKVCacheManager::isCrossKv) .def("store_context_blocks", &BaseKVCacheManager::storeContextBlocks) - .def("scheduling_has_free_blocks", &BaseKVCacheManager::schedulingHasFreeBlocks) .def("get_cache_block_ids", &BaseKVCacheManager::getCacheBlockIds) .def("get_batch_cache_block_ids", &BaseKVCacheManager::getBatchCacheBlockIds) .def("get_newly_allocated_block_ids", &BaseKVCacheManager::getNewlyAllocatedBlockIds) @@ -431,13 +414,14 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m) py::classh(m, "KVCacheManager") .def(py::init const&, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, - SizeType32, std::vector const&, SizeType32, SizeType32, bool, int64_t, bool, bool, - tbk::CacheType, std::optional, - std::shared_ptr, bool, bool>(), + SizeType32, std::vector const&, std::optional const&, + nvinfer1::DataType, SizeType32, bool, 
int64_t, bool, bool, tbk::CacheType, + std::optional, std::shared_ptr, + bool, bool>(), py::arg("num_kv_heads_per_layer"), py::arg("size_per_head"), py::arg("tokens_per_block"), py::arg("blocks_in_primary_pool"), py::arg("blocks_in_secondary_pool"), py::arg("max_num_sequences"), - py::arg("max_beam_width"), py::arg("max_attention_window_vec"), py::arg("temporary_attention_window"), - py::arg("sink_token_length"), py::arg("stream"), py::arg("max_sequence_length"), + py::arg("max_beam_width"), py::arg("max_attention_window_vec"), py::arg("temp_attention_window_inputs"), + py::arg("dtype"), py::arg("sink_token_length"), py::arg("stream"), py::arg("max_sequence_length"), py::arg("enable_block_reuse") = false, py::arg("onboard_blocks") = true, py::arg_v("cache_type", tbk::CacheType::kSELF, "bindings.internal.batch_manager.CacheType.SELF"), py::arg("secondary_offload_min_priority") = std::nullopt, py::arg("event_manager") = nullptr, diff --git a/cpp/tensorrt_llm/runtime/gptSession.cpp b/cpp/tensorrt_llm/runtime/gptSession.cpp index 7147b0f1252..4e5732a992f 100644 --- a/cpp/tensorrt_llm/runtime/gptSession.cpp +++ b/cpp/tensorrt_llm/runtime/gptSession.cpp @@ -241,10 +241,10 @@ void GptSession::createKvCacheManager(SizeType32 maxBatchSize, SizeType32 maxBea mKvCacheManager = std::make_shared( std::vector(numKvHeadsPerLayerBegin, numKvHeadsPerLayerEnd), sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxBatchSize, maxBeamWidth, mDecoderMaxAttentionWindowVec, - /*temporaryAttentionWindow*/ 0, sinkTokenLength, mRuntime->getStreamPtr(), maxSequenceLength, enableBlockReuse, - kvCacheConfig.onboardBlocks); + /*tempAttentionWindowInputs*/ std::nullopt, kvDtype, sinkTokenLength, mRuntime->getStreamPtr(), + maxSequenceLength, enableBlockReuse, kvCacheConfig.onboardBlocks); - auto const maxBlocksPerSeq = mKvCacheManager->getMaxBlocksPerSeq(); + auto const maxBlocksPerSeq = mKvCacheManager->getOffsetTableDimensions().maxBlocksPerSeq; TLLM_CHECK(mBuffers.size() == static_cast(mMicroBatchConfig.numGenBatches)); for (auto& buffers : mBuffers) @@ -253,7 +253,7 @@ void GptSession::createKvCacheManager(SizeType32 maxBatchSize, SizeType32 maxBea buffers->transformerBuffers->reshapeKvTensors(maxBatchSize, maxBeamWidth, maxBlocksPerSeq, *mRuntime); } - mKvCacheManager->allocatePools(kvDtype, kvCacheConfig.useUvm); + mKvCacheManager->allocatePools(kvCacheConfig.useUvm); for (auto& buffers : mBuffers) { diff --git a/cpp/tests/batch_manager/cacheTransceiverTest.cpp b/cpp/tests/batch_manager/cacheTransceiverTest.cpp index 047df9a2bbc..7db3bb89c1b 100644 --- a/cpp/tests/batch_manager/cacheTransceiverTest.cpp +++ b/cpp/tests/batch_manager/cacheTransceiverTest.cpp @@ -295,7 +295,6 @@ class SymmetricalCacheTest : public ::testing::Test // NOLINT(cppcoreguidelines- auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxAttentionWindow = maxNumTokens; - auto constexpr temporaryAttentionWindow = 0; auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1; auto constexpr numSharedBlocks = inputLength / tokensPerBlock; auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth; @@ -308,9 +307,9 @@ class SymmetricalCacheTest : public ::testing::Test // NOLINT(cppcoreguidelines- auto constexpr dataType = nvinfer1::DataType::kFLOAT; mManager = std::make_unique(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, - blocksInSecondaryPool, mMaxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, - 
temporaryAttentionWindow, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks, - CacheType::kSELF, std::nullopt, nullptr, true); + blocksInSecondaryPool, mMaxNumSequences, maxBeamWidth, + std::vector{maxAttentionWindow}, std::nullopt, dataType, sinkTokenLength, stream, + std::nullopt, enableBlockReuse, onboardBlocks, CacheType::kSELF, std::nullopt, nullptr, true); mCacheState = std::make_unique( numLayers, numHeads, sizePerHead, tokensPerBlock, 1, 1, dataType); @@ -377,7 +376,7 @@ class SymmetricalCacheTest : public ::testing::Test // NOLINT(cppcoreguidelines- } // UVM seems to be incompatible with MPI, and it is continuing to investigate. bool constexpr useUvm = false; - mManager->allocatePools(dataType, useUvm); + mManager->allocatePools(useUvm); } void setUpCacheTransceiver() @@ -414,7 +413,7 @@ class SymmetricalCacheTest : public ::testing::Test // NOLINT(cppcoreguidelines- mManager->addSequence(llmRequest->mRequestId, llmRequest->getNumTokens(beamIdx), beamWidth, llmRequest); if (isSender) { - auto blockRange = BlockRange(*mManager, llmRequest->mRequestId, beamIdx, 0); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mManager, llmRequest->mRequestId); for (auto& block : blockRange) { // fill cache with tokens (= request length), for reuse test @@ -427,7 +426,7 @@ class SymmetricalCacheTest : public ::testing::Test // NOLINT(cppcoreguidelines- auto future = mRequester->requestAndReceiveAsync(*llmRequest); future.get(); TLLM_CUDA_CHECK(cudaDeviceSynchronize()); - auto blockRange = BlockRange(*mManager, llmRequest->mRequestId, beamIdx, 0); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mManager, llmRequest->mRequestId); for (auto& block : blockRange) { std::vector bytes(block.getSizeInBytes()); @@ -619,7 +618,6 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam(numLayers / mPpSize, numHeadsPerRank, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, mMaxNumSequences, maxBeamWidth, - std::vector{maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream, + std::vector{maxAttentionWindow}, std::nullopt, dataType, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks, cacheType, std::nullopt, nullptr, true); texec::kv_cache::CacheState::AttentionType attentionType = isMLA ? 
texec::kv_cache::CacheState::AttentionType::kMLA @@ -670,7 +668,7 @@ class AsymmetricalCacheTest : public ::testing::TestWithParamallocatePools(dataType, useUvm); + mManager->allocatePools(useUvm); } void setUpCacheTransceiver() @@ -830,14 +828,21 @@ class AsymmetricalCacheTest : public ::testing::TestWithParamaddSequence(llmRequest->mRequestId, llmRequest->getNumTokens(beamIdx), beamWidth, llmRequest); - auto blockRange = BlockRange(*mManager, llmRequest->mRequestId, beamIdx, 0); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mManager, llmRequest->mRequestId); int blockIdx = 0; for (auto& block : blockRange) { fillBlockData(block, blockIdx, llmRequest->getPromptLen()); blockIdx++; } - mManager->getBlockManager().getBufferManager().getStream().synchronize(); + auto const& blockManager = mManager->getBlockManager(); + if (blockManager.getNumPools() != 1) + { + throw std::runtime_error("Test assumes that just a single pool (single window size) is used"); + } + auto const onlyWindowSize = blockManager.getPoolWindowSize(0); + + blockManager.getBufferManager(onlyWindowSize).getStream().synchronize(); auto future = mResponder->respondAndSendAsync(*llmRequest); return future; } @@ -859,7 +864,7 @@ class AsymmetricalCacheTest : public ::testing::TestWithParammRequestId, beamIdx, 0); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mManager, llmRequest->mRequestId); for (auto& block : blockRange) { verifyBlockData(block, blockIdx, llmRequest->getPromptLen()); @@ -869,8 +874,12 @@ class AsymmetricalCacheTest : public ::testing::TestWithParamgetBlockManager().getBufferManager().cpu(blockData.getShape(), blockData.getDataType()); + auto const& blockManager = mManager->getBlockManager(); + ASSERT_EQ(blockManager.getNumPools(), 1); + auto const onlyWindowSize = blockManager.getPoolWindowSize(0); + auto const& bufferManager = blockManager.getBufferManager(onlyWindowSize); + + auto hostTensor = tensorrt_llm::runtime::BufferManager::cpu(blockData.getShape(), blockData.getDataType()); int layerSizePerRank = mCacheState->getModelConfig().mNbKvHeadsPerLayer.size() / mPpSize; int startLayerId = layerSizePerRank * mPpRank; int headSizePerRank = mCacheState->getModelConfig().mNbKvHeadsPerLayer.at(0); @@ -924,13 +933,17 @@ class AsymmetricalCacheTest : public ::testing::TestWithParamgetBlockManager().getBufferManager().copy(*hostTensor, blockData); + bufferManager.copy(*hostTensor, blockData); } void verifyBlockData(tensorrt_llm::runtime::ITensor& blockData, int blockId, size_t initial) { - auto hostTensor - = mManager->getBlockManager().getBufferManager().cpu(blockData.getShape(), blockData.getDataType()); + auto const& blockManager = mManager->getBlockManager(); + ASSERT_EQ(blockManager.getNumPools(), 1); + auto const onlyWindowSize = blockManager.getPoolWindowSize(0); + auto const& bufferManager = blockManager.getBufferManager(onlyWindowSize); + + auto hostTensor = tensorrt_llm::runtime::BufferManager::cpu(blockData.getShape(), blockData.getDataType()); int layerSizePerRank = mCacheState->getModelConfig().mNbKvHeadsPerLayer.size() / mPpSize; int startLayerId = layerSizePerRank * mPpRank; int headSizePerRank = mCacheState->getModelConfig().mNbKvHeadsPerLayer.at(0); @@ -945,8 +958,8 @@ class AsymmetricalCacheTest : public ::testing::TestWithParamgetModelConfig().mSizePerHead; - mManager->getBlockManager().getBufferManager().copy(blockData, *hostTensor); - mManager->getBlockManager().getBufferManager().getStream().synchronize(); + bufferManager.copy(blockData, *hostTensor); + 
bufferManager.getStream().synchronize(); for (int layerId = 0; layerId < layerSizePerRank; layerId++) { diff --git a/cpp/tests/unit_tests/batch_manager/capacitySchedulerTest.cpp b/cpp/tests/unit_tests/batch_manager/capacitySchedulerTest.cpp index e420e8b8124..372de1b109d 100644 --- a/cpp/tests/unit_tests/batch_manager/capacitySchedulerTest.cpp +++ b/cpp/tests/unit_tests/batch_manager/capacitySchedulerTest.cpp @@ -131,8 +131,8 @@ class CapacitySchedulerTest : public ::testing::Test // NOLINT(cppcoreguidelines // init KV cache block manager return std::make_shared(numLayers, nbKvHeads, sizePerHead, tokensPerBlock, - maxNumBlocks, 0, maxNumRequests, 1, std::vector{maxNumTokensPerSeq}, 0, sinkTokenLength, - streamPtr, std::nullopt, enableReuse, onboardBlocks, cacheType); + maxNumBlocks, 0, maxNumRequests, 1, std::vector{maxNumTokensPerSeq}, std::nullopt, kvDtype, + sinkTokenLength, streamPtr, std::nullopt, enableReuse, onboardBlocks, cacheType); } static std::shared_ptr getPeftCacheManager() diff --git a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp index 8dbee68798a..470911ec9b5 100644 --- a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp +++ b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp @@ -47,6 +47,8 @@ namespace tr = tensorrt_llm::runtime; using ParamType = bool; +namespace +{ std::string generateTestName(testing::TestParamInfo const& info) { auto const homogeneousLayers = info.param; @@ -62,6 +64,14 @@ std::string generateTestName(testing::TestParamInfo const& info) return name; } +SizeType32 theOnlyWindowSize(KVCacheManager const& kvCacheManager) +{ + auto const& blockManager = kvCacheManager.getBlockManager(); + EXPECT_EQ(blockManager.getWindowSizesMetadata().size(), 1) << "Assuming a single window size"; + return blockManager.getPoolWindowSize(0); +} +} // namespace + class KVCacheManagerTest : public ::testing::Test, public ::testing::WithParamInterface // NOLINT(cppcoreguidelines-pro-type-member-init) @@ -81,14 +91,6 @@ class KVCacheManagerTest namespace { -void allocateBlocks(BlockManager& manager, GenerationRequest& sequence, std::size_t numBlocks, bool shareAmongBeams) -{ - for (std::size_t i = 0; i < numBlocks; ++i) - { - manager.allocateBlock(sequence, shareAmongBeams); - } -} - // TODO: This is really ugly. Flushing the event queue is done in a separate thread, so if we want to check the value we // need to wait for the thread to complete. It works, but it's technically not deterministic. 
std::deque getEvents(KVCacheManager& kvCacheManager) @@ -112,25 +114,27 @@ TEST_F(KVCacheManagerTest, BlockManagerTest) auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; - BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, - blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks); - blockManager.allocatePools(nvinfer1::DataType::kHALF, false); + auto constexpr beamWidth = 8; + auto constexpr numBlocksPerBeam = blocksInPrimaryPool / beamWidth; + auto constexpr numTokens = tokensPerBlock * numBlocksPerBeam; + auto constexpr maxAttentionWindow = numTokens; + + BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, + blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth, + std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, + onboardBlocks); + blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); auto constexpr requestId = 42; - auto constexpr beamWidth = 8; - auto constexpr numBlocksPerBeam = blocksInPrimaryPool / beamWidth; - auto constexpr numTokens = tokensPerBlock * numBlocksPerBeam; - auto constexpr maxAttentionWindow = numTokens; - auto const maxAttentionWindowVec = std::vector{maxAttentionWindow}; - GenerationRequest seq0{requestId, numTokens, beamWidth, numBlocksPerBeam, maxAttentionWindow}; - blockManager.addSequence(seq0, numBlocksPerBeam, numBlocksPerBeam - 1); + GenerationRequest seq0{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; + blockManager.addSequence(seq0, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow); auto constexpr occupiedBlocks = (numBlocksPerBeam - 1) + beamWidth; EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - occupiedBlocks); - auto const& ids = seq0.getCacheBlockIds(); + auto const& ids = seq0.getCacheBlockIds(maxAttentionWindow); std::set idSet{}; EXPECT_EQ(ids.size(), beamWidth); for (auto const& beam : ids) @@ -142,7 +146,7 @@ TEST_F(KVCacheManagerTest, BlockManagerTest) blockManager.releaseBlocks(seq0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); - blockManager.addSequence(seq0, numBlocksPerBeam, -1); + blockManager.addSequence(seq0, numBlocksPerBeam, -1, maxAttentionWindow); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocksPerBeam); EXPECT_EQ(ids.size(), beamWidth); for (std::size_t i = 0u; i < ids.front().size(); ++i) @@ -156,15 +160,17 @@ TEST_F(KVCacheManagerTest, BlockManagerTest) EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); // occupy 22/24 blocks - EXPECT_NO_THROW(blockManager.addSequence(seq0, numBlocksPerBeam, numBlocksPerBeam - 1)); - GenerationRequest seq1{requestId + 1, numTokens, beamWidth, numBlocksPerBeam, maxAttentionWindow}; - EXPECT_NO_THROW(blockManager.addSequence(seq1, numBlocksPerBeam, numBlocksPerBeam - 1)); + EXPECT_NO_THROW(blockManager.addSequence(seq0, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow)); + GenerationRequest seq1{requestId + 1, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; + EXPECT_NO_THROW(blockManager.addSequence(seq1, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow)); // same requestId not allowed - GenerationRequest seq2{requestId, numTokens, beamWidth, numBlocksPerBeam, maxAttentionWindow}; - 
EXPECT_THROW(blockManager.addSequence(seq2, numBlocksPerBeam, numBlocksPerBeam - 1), std::runtime_error); + GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; + EXPECT_THROW( + blockManager.addSequence(seq2, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow), std::runtime_error); // no more blocks - GenerationRequest seq3{requestId + 2, numTokens, beamWidth, numBlocksPerBeam, maxAttentionWindow}; - EXPECT_THROW(blockManager.addSequence(seq3, numBlocksPerBeam, numBlocksPerBeam - 1), std::runtime_error); + GenerationRequest seq3{requestId + 2, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; + EXPECT_THROW( + blockManager.addSequence(seq3, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow), std::runtime_error); } template @@ -195,8 +201,9 @@ void runPartialCopyTest() bool constexpr isStreaming{false}; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, - blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks); - blockManager.allocatePools(type, false); + blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth, + std::vector{maxAttentionWindow}, std::nullopt, type, 0, onboardBlocks); + blockManager.allocatePools(false); auto oneLayerBlockSize = blockManager.getBlockSize(0); EXPECT_EQ(oneLayerBlockSize, numKvHeads * sizePerHead * tokensPerBlock); @@ -224,21 +231,21 @@ void runPartialCopyTest() auto const inputLength = static_cast(inputTokens->size()); LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); - GenerationRequest seq0{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow}; + GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); - blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0); + blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); - auto cacheBlockIds = seq0.getCacheBlockIds().at(beamIdx); + auto cacheBlockIds = seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds, ::testing::ElementsAreArray({0, 1, 2})); // Offload all 3 blocks, fill with predictable pattern, onboard for (auto cacheBlockId : cacheBlockIds) { - auto block = blockManager.getBlockById(cacheBlockId); + auto block = blockManager.getBlockById(cacheBlockId, maxAttentionWindow); EXPECT_TRUE(block->isPrimary()); // offload so we can write to block in CPU code - blockManager.offloadBlock(block); + blockManager.offloadBlock(block, maxAttentionWindow); EXPECT_FALSE(block->isPrimary()); // need to sync so D2H transfer is done before accessing blocks EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); @@ -251,10 +258,10 @@ void runPartialCopyTest() rawBlockPtr[i] = i & mask; } // onboard - blockManager.onboardBlock(block); + blockManager.onboardBlock(block, maxAttentionWindow); EXPECT_TRUE(block->isPrimary()); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); - EXPECT_TRUE(blockManager.verifyQueueIntegrity()); + EXPECT_TRUE(blockManager.verifyQueueIntegrity(maxAttentionWindow)); } blockManager.releaseBlocks(seq0, llmRequest0); @@ -263,12 +270,12 @@ void runPartialCopyTest() auto const inputLength1 = 
static_cast(inputTokens1->size()); requestId = 1; auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming); - GenerationRequest seq1{requestId, inputLength1, beamWidth, maxBlocksPerSeq, maxAttentionWindow}; + GenerationRequest seq1{requestId, inputLength1, beamWidth, blockManager.getWindowSizesMetadata()}; auto promptLen1 = llmRequest1->getNumTokens(beamIdx); auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); - blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1); + blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 16); - auto cacheBlockIds1 = seq1.getCacheBlockIds().at(beamIdx); + auto cacheBlockIds1 = seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds1, ::testing::ElementsAreArray({0, 1, 6})); // store blocks 0, 1 ([0,1,2,3,4,5,6,7], [8,9,10,11,12,13,14,15]) blockManager.storeContextBlocks(seq1, *llmRequest1); @@ -277,7 +284,7 @@ void runPartialCopyTest() // Add sequence [0,1,2,3,4,5,6,7,8,9,10,11] again. // Reuse blocks 0 and 1(pc). Block 1 is partially reused, but already referenced by seq1 so must be partial copied // into new block 2. Clear block 2 so we can see what was partial copied. - auto block2 = blockManager.getBlockById(2); + auto block2 = blockManager.getBlockById(2, maxAttentionWindow); auto memoryPoolIndex2 = block2->getMemoryPoolBlockIndex(); auto block2Ptr{tr::ITensor::slice(primaryPoolPtr, memoryPoolIndex2, 1)}; EXPECT_EQ(cudaMemset(block2Ptr->data(), 0, blockSize * sizeof(T)), cudaSuccess); @@ -287,18 +294,18 @@ void runPartialCopyTest() auto const inputLength2 = static_cast(inputTokens2->size()); requestId = 2; auto llmRequest2 = std::make_shared(requestId, maxNewTokens, inputTokens2, samplingConfig, isStreaming); - GenerationRequest seq2{requestId, inputLength2, beamWidth, maxBlocksPerSeq, maxAttentionWindow}; + GenerationRequest seq2{requestId, inputLength2, beamWidth, blockManager.getWindowSizesMetadata()}; auto promptLen2 = llmRequest2->getNumTokens(beamIdx); auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock()); - blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2); + blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 11); - auto cacheBlockIds2 = seq2.getCacheBlockIds().at(beamIdx); + auto cacheBlockIds2 = seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds2, ::testing::ElementsAreArray({0, 2})); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); // Verify partial copied block 2 // Block has shape [2, numLayers, numKvHeads, tokensPerBlock, sizePerHead] - blockManager.offloadBlock(block2); + blockManager.offloadBlock(block2, maxAttentionWindow); EXPECT_FALSE(block2->isPrimary()); // need to sync so D2H transfer is done before accessing blocks EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); @@ -326,7 +333,7 @@ void runPartialCopyTest() } } EXPECT_EQ(numBad, 0); - blockManager.onboardBlock(block2); + blockManager.onboardBlock(block2, maxAttentionWindow); EXPECT_TRUE(block2->isPrimary()); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); @@ -383,6 +390,34 @@ TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyFP8) } #endif +TEST_F(KVCacheManagerTest, BlockManagerTestBlocksPerWindowSize) +{ + auto constexpr numPrimaryBlocks = 16384; + // Single window size + { + 
std::map<SizeType32, std::vector<SizeType32>> windowSizeToLayers{{1024, {0, 1, 2}}};
+        auto result = BlockManager::blocksPerWindowSize(numPrimaryBlocks, windowSizeToLayers);
+        EXPECT_EQ(result.size(), 1);
+        EXPECT_EQ(result.at(1024), numPrimaryBlocks);
+    }
+    // Variable window size
+    {
+        std::map<SizeType32, std::vector<SizeType32>> windowSizeToLayers{
+            {1024, {1}},       // contribution = 1024*1 = 1024  / 29696
+            {4096, {0, 4, 5}}, // contribution = 4096*3 = 12288 / 29696
+            {8192, {2, 3}},    // contribution = 8192*2 = 16384 / 29696
+        };
+        auto result = BlockManager::blocksPerWindowSize(numPrimaryBlocks, windowSizeToLayers);
+        EXPECT_EQ(result.size(), 3);
+        EXPECT_EQ(
+            std::accumulate(result.begin(), result.end(), 0, [](auto sum, auto cur) { return sum + cur.second; }),
+            numPrimaryBlocks);
+        // Two blocks that were lost due to rounding down were awarded in order to the smallest window sizes:
+        EXPECT_EQ(result.at(1024), 565);  // 564 + 1
+        EXPECT_EQ(result.at(4096), 6780); // 6779 + 1
+        EXPECT_EQ(result.at(8192), 9039); // 9039 + 0
+    }
+}
+
 #ifdef ENABLE_FP4
 TEST_F(KVCacheManagerTest, FP4BlockScaleManagementTest)
 {
@@ -398,11 +433,12 @@ TEST_F(KVCacheManagerTest, FP4BlockScaleManagementTest)
     auto const stream = std::make_shared<tr::CudaStream>();
     auto constexpr beamWidth = 1;
+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, false, stream, true,
-        onboardBlocks);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kFP4, false, stream, true, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kFP4, /*useUvm=*/false);
+    kvCacheManager.allocatePools(/*useUvm=*/false);
     // We should have one additional pool for the block scales.
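// (Presumably one pool holds the FP4 KV data and the additional one its
// per-block scaling factors, hence getNumPools() == 2 below.)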
 #ifdef ENABLE_FP4
 TEST_F(KVCacheManagerTest, FP4BlockScaleManagementTest)
 {
@@ -398,11 +433,12 @@ TEST_F(KVCacheManagerTest, FP4BlockScaleManagementTest)
     auto const stream = std::make_shared<tr::CudaStream>();
     auto constexpr beamWidth = 1;
+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, false, stream, true,
-        onboardBlocks);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kFP4, false, stream, true, onboardBlocks);

-    kvCacheManager.allocatePools(nvinfer1::DataType::kFP4, /*useUvm=*/false);
+    kvCacheManager.allocatePools(/*useUvm=*/false);

     // We should have one additional pool for the block scales.
     EXPECT_EQ(kvCacheManager.getBlockManager().getNumPools(), 2);
@@ -429,16 +465,19 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto constexpr onboardBlocks = true;
     auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;

-    BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock,
-        blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks);
-    blockManager.allocatePools(nvinfer1::DataType::kHALF, false);
+    auto constexpr beamWidth = 1;
+
+    BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool,
+        blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth,
+        std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0,
+        onboardBlocks);
+    blockManager.allocatePools(false);

     EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

     SizeType32 constexpr maxNewTokens{0};
-    auto constexpr beamWidth = 1;
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};
@@ -447,7 +486,7 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     LlmRequest::RequestIdType requestId{0};
     auto llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
-    GenerationRequest seq0{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};

     ///////////////////////////////////////////////////////////////////////////
     // add request and then remove it
@@ -455,9 +494,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto constexpr beamIdx = 0;
     auto promptLen0 = llmRequest0->getNumTokens(beamIdx);
     auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest0->addNewToken(9, beamIdx);  // block 2 contains [8]
     llmRequest0->addNewToken(10, beamIdx); // block 2 contains [8, 9]
     auto numTokens = llmRequest0->getNumTokens(beamIdx);
@@ -475,14 +514,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     // new request with same tokens [0, 1, 2, 3, 4, 5, 6, 7, 8] and then remove it
     requestId = 1;
     auto llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
-    GenerationRequest seq1{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 0, 1 ([0, 1, 2, 3], [4, 5, 6, 7]) and get new block 3
     auto promptLen1 = llmRequest1->getNumTokens(beamIdx);
     auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 3}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 3}));
     llmRequest1->addNewToken(9, beamIdx);  // block 3 contains [8]
     llmRequest1->addNewToken(10, beamIdx); // block 3 contains [8, 9]
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
@@ -502,9 +541,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens0, samplingConfig, isStreaming);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), promptLen0 - 1);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
     // note that seq0 is holding blocks 0, 1 and 2 until releaseBlocks is called
@@ -515,9 +554,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     llmRequest1->addNewToken(10, beamIdx); // block 4 contains [8, 9, 10]
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1);
@@ -541,13 +580,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto llmRequest2 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens2, samplingConfig, isStreaming);
     numTokens = llmRequest2->getNumTokens(beamIdx);
-    GenerationRequest seq2{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse block 0 ([0, 1, 2, 3]), get new block 5
     auto promptLen2 = llmRequest2->getNumTokens(beamIdx);
     auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2);
+    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow);
     EXPECT_EQ(llmRequest2->getContextCurrentPosition(), tokensPerBlock);
-    EXPECT_THAT(seq2.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 5}));
+    EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 5}));
     llmRequest2->addNewToken(5, beamIdx); // block 5 contains [4]
     numTokens = llmRequest2->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -562,13 +601,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto llmRequest3 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens3, samplingConfig, isStreaming);
     numTokens = llmRequest3->getNumTokens(beamIdx);
-    GenerationRequest seq3{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 0, 1, 4(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8, 9])
     auto promptLen3 = llmRequest3->getNumTokens(beamIdx);
     auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3);
+    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow);
     EXPECT_EQ(llmRequest3->getContextCurrentPosition(), numTokens - 1);
-    EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     llmRequest3->addNewToken(11, beamIdx); // block 4 contains [8, 9, 11]
     numTokens = llmRequest3->getNumTokens(beamIdx);
     // one block used by both seq2 and seq3
@@ -591,14 +630,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto llmRequest4 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens4, samplingConfig, isStreaming);
     numTokens = llmRequest4->getNumTokens(beamIdx);
-    GenerationRequest seq4{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq4{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 0, 1, 4(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8,9])
     auto promptLen4 = llmRequest4->getNumTokens(beamIdx);
     auto numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4);
+    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow);
     EXPECT_EQ(llmRequest4->getContextCurrentPosition(), promptLen4 - 1);
-    EXPECT_THAT(seq4.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     numTokens = llmRequest4->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
@@ -624,9 +663,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     llmRequest4 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens4, samplingConfig, isStreaming);
     promptLen4 = llmRequest4->getNumTokens(beamIdx);
     numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4);
+    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow);
     EXPECT_EQ(llmRequest4->getContextCurrentPosition(), promptLen4 - 2);
-    EXPECT_THAT(seq4.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     numTokens = llmRequest4->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
@@ -645,11 +684,11 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto llmRequest5 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens5, samplingConfig, isStreaming);
     numTokens = llmRequest5->getNumTokens(beamIdx);
-    GenerationRequest seq5{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq5{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, all blocks need to be freed
     auto promptLen5 = llmRequest5->getNumTokens(beamIdx);
     auto numContextBlocks5 = tc::ceilDiv(promptLen5, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq5, promptLen5, numContextBlocks5, *llmRequest5);
+    blockManager.addSequence(seq5, promptLen5, numContextBlocks5, *llmRequest5, maxAttentionWindow);
     llmRequest5->addNewToken(0, beamIdx);
     EXPECT_EQ(llmRequest5->getContextCurrentPosition(), 1); // incidental reuse
@@ -668,11 +707,11 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto llmRequest6 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens6, samplingConfig, isStreaming);
     numTokens = llmRequest6->getNumTokens(beamIdx);
-    GenerationRequest seq6{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq6{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, all blocks need to be freed
     auto promptLen6 = llmRequest6->getNumTokens(beamIdx);
     auto numContextBlocks6 = tc::ceilDiv(promptLen6, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq6, promptLen6, numContextBlocks6, *llmRequest6);
+    blockManager.addSequence(seq6, promptLen6, numContextBlocks6, *llmRequest6, maxAttentionWindow);
     llmRequest6->addNewToken(0, beamIdx);
     // no reuse occurs because we are unable to reuse last input token and inputLength6 == 1.
     EXPECT_EQ(llmRequest6->getContextCurrentPosition(), 0);
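Every reuse test that follows migrates the same way: GenerationRequest is constructed from the per-window metadata map instead of the old (maxBlocksPerSeq, maxAttentionWindow) pair, and block operations are parameterized by the window size they act on. A sketch of the recurring pattern, using the names from these tests (not itself part of the patch; the metadata map type is assumed from the header changes):

    // Sketch of the migrated call pattern shared by the reuse tests below.
    // GenerationRequest now takes blockManager.getWindowSizesMetadata(), and
    // addSequence/getCacheBlockIds take the window size they operate on.
    void addAndCheck(BlockManager& blockManager, LlmRequest& llmRequest, GenerationRequest& seq,
        SizeType32 promptLen, SizeType32 numContextBlocks, SizeType32 windowSize, SizeType32 beamIdx)
    {
        blockManager.addSequence(seq, promptLen, numContextBlocks, llmRequest, windowSize);
        auto const& blockIds = seq.getCacheBlockIds(windowSize).at(beamIdx);
        EXPECT_FALSE(blockIds.empty());
    }
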
@@ -703,16 +742,19 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
     auto constexpr numReturnSequences = 1;
     auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;

+    auto constexpr beamWidth = 1;
+
     BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks);
-    blockManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth,
+        std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0,
+        onboardBlocks);
+    blockManager.allocatePools(false);

     EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

     SizeType32 constexpr maxNewTokens{0};
-    auto constexpr beamWidth = 1;
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};
@@ -728,16 +770,16 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
        std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
        inputTokenExtraIds, numReturnSequences);
-    GenerationRequest seq0{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};

     ///////////////////////////////////////////////////////////////////////////
     // add request and then remove it
     auto constexpr beamIdx = 0;
     auto promptLen0 = llmRequest0->getNumTokens(beamIdx);
     auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest0->addNewToken(3, beamIdx);
     llmRequest0->addNewToken(4, beamIdx);
     auto numTokens = llmRequest0->getNumTokens(beamIdx);
@@ -760,14 +802,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
        false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5,
        std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
        inputTokenExtraIds, numReturnSequences);
-    GenerationRequest seq1{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 0, 1 and get new block 3
     auto promptLen1 = llmRequest1->getNumTokens(beamIdx);
     auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 3}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 3}));
     llmRequest1->addNewToken(3, beamIdx);
     llmRequest1->addNewToken(4, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
@@ -789,10 +831,10 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
        inputTokenExtraIds, numReturnSequences);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     llmRequest0->addNewToken(3, beamIdx);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
@@ -809,9 +851,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
        inputTokenExtraIds1, numReturnSequences);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest1->addNewToken(5, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1);
@@ -836,13 +878,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
        inputTokenExtraIds2, numReturnSequences);
     numTokens = llmRequest2->getNumTokens(beamIdx);
-    GenerationRequest seq2{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, get new block 5, 6, 7
     auto promptLen2 = llmRequest2->getNumTokens(beamIdx);
     auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2);
+    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow);
     EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq2.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
+    EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
     llmRequest2->addNewToken(3, beamIdx);
     numTokens = llmRequest2->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -861,13 +903,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
        inputTokenExtraIds3, numReturnSequences);
     numTokens = llmRequest3->getNumTokens(beamIdx);
-    GenerationRequest seq3{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse block 0, get new block 8, 9
     auto promptLen3 = llmRequest3->getNumTokens(beamIdx);
     auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3);
+    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow);
     EXPECT_EQ(llmRequest3->getContextCurrentPosition(), tokensPerBlock);
-    EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 8, 9}));
+    EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 8, 9}));
     llmRequest3->addNewToken(3, beamIdx);
     numTokens = llmRequest3->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -897,16 +939,19 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     auto constexpr onboardBlocks = true;
     auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;

+    auto constexpr beamWidth = 1;
+
     BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks);
-    blockManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth,
+        std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0,
+        onboardBlocks);
+    blockManager.allocatePools(false);

     EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

     SizeType32 constexpr maxNewTokens{0};
-    auto constexpr beamWidth = 1;
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};
@@ -918,7 +963,7 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     auto llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
        std::nullopt, std::nullopt, std::nullopt, loraTaskId);
-    GenerationRequest seq0{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};

     ///////////////////////////////////////////////////////////////////////////
     // add request and then remove it
@@ -926,9 +971,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     auto promptLen0 = llmRequest0->getNumTokens(beamIdx);
     auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
     // get new blocks 0, 1, 2 ([0,1,2,3], [4,5,6,7], [8])
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest0->addNewToken(9, beamIdx);
     llmRequest0->addNewToken(10, beamIdx);
     auto numTokens = llmRequest0->getNumTokens(beamIdx);
@@ -949,14 +994,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     auto llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
        std::nullopt, std::nullopt, std::nullopt, loraTaskId);
-    GenerationRequest seq1{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 0, 1 and get new block 3
     auto promptLen1 = llmRequest1->getNumTokens(beamIdx);
     auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 3}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 3}));
     llmRequest1->addNewToken(9, beamIdx);
     llmRequest1->addNewToken(10, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
@@ -976,7 +1021,7 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
        std::nullopt, std::nullopt, loraTaskId);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     // nb! addNewToken adds new generated token, number of input tokens stay the same.
     // calling addNewToken before addSequence potentially triggers this error message:
     // Assertion failed: prepopulatedPromptLen < promptLen
@@ -984,7 +1029,7 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     // but promptLen is number of input tokens.
     llmRequest0->addNewToken(9, beamIdx);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
@@ -996,9 +1041,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
     // reuse 0, 1, 2(p) ([0,1,2,3], [4,5,6,7], [8])
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest1->addNewToken(10, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1);
@@ -1021,14 +1066,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
        std::nullopt, std::nullopt, loraTaskId);
     numTokens = llmRequest2->getNumTokens(beamIdx);
-    GenerationRequest seq2{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, get new block 5, 6, 7
     auto promptLen2 = llmRequest2->getNumTokens(beamIdx);
     auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2);
+    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow);
     // no reuse expected. Input tokens match blocks 0 and 1, but lora task id differs.
     EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq2.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
+    EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
     llmRequest2->addNewToken(9, beamIdx);
     numTokens = llmRequest2->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -1048,13 +1093,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
        std::nullopt, std::nullopt, loraTaskId);
     numTokens = llmRequest3->getNumTokens(beamIdx);
-    GenerationRequest seq3{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 5, 6, 7(p) ([0,1,2,3], [4,5,6,7], [8])
     auto promptLen3 = llmRequest3->getNumTokens(beamIdx);
     auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3);
+    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow);
     EXPECT_EQ(llmRequest3->getContextCurrentPosition(), promptLen3 - 2);
-    EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
+    EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
     llmRequest3->addNewToken(11, beamIdx);
     numTokens = llmRequest3->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -1076,13 +1121,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
        std::nullopt, std::nullopt, loraTaskId);
     numTokens = llmRequest4->getNumTokens(beamIdx);
-    GenerationRequest seq4{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq4{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 0, get new block 8
     auto promptLen4 = llmRequest4->getNumTokens(beamIdx);
     auto numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4);
+    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow);
     EXPECT_EQ(llmRequest4->getContextCurrentPosition(), tokensPerBlock);
-    EXPECT_THAT(seq4.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 8}));
+    EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 8}));
     llmRequest4->addNewToken(5, beamIdx);
     numTokens = llmRequest4->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -1099,13 +1144,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
        std::nullopt);
     numTokens = llmRequest5->getNumTokens(beamIdx);
-    GenerationRequest seq5{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq5{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, get new block 9, 10, 11
     auto promptLen5 = llmRequest5->getNumTokens(beamIdx);
     auto numContextBlocks5 = tc::ceilDiv(promptLen5, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq5, promptLen5, numContextBlocks5, *llmRequest5);
+    blockManager.addSequence(seq5, promptLen5, numContextBlocks5, *llmRequest5, maxAttentionWindow);
     EXPECT_EQ(llmRequest5->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq5.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({9, 10, 11}));
+    EXPECT_THAT(seq5.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({9, 10, 11}));
     llmRequest5->addNewToken(9, beamIdx);
     numTokens = llmRequest5->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
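The two preceding tests check that block reuse keys on more than the token sequence: identical tokens with different extra ids or a different LoRA task id must not match, and the combined test that follows exercises both at once. Conceptually the lookup key behaves like the struct below (an illustration only, not the real BlockKey from kvCacheManager.h):

    #include <cstdint>
    #include <optional>
    #include <vector>

    // Illustrative reuse key: equal tokens alone are not enough for a match.
    struct ReuseKeySketch
    {
        std::vector<std::int32_t> tokens;
        std::optional<std::uint64_t> loraTaskId;                  // differs => no reuse
        std::vector<std::optional<std::uint64_t>> tokenExtraIds;  // differs => no reuse

        bool operator==(ReuseKeySketch const& other) const
        {
            return tokens == other.tokens && loraTaskId == other.loraTaskId
                && tokenExtraIds == other.tokenExtraIds;
        }
    };
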
@@ -1134,16 +1179,19 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
     auto constexpr onboardBlocks = true;
     auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;

+    auto constexpr beamWidth = 1;
+
     BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks);
-    blockManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth,
+        std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0,
+        onboardBlocks);
+    blockManager.allocatePools(false);

     EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

     SizeType32 constexpr maxNewTokens{0};
-    auto constexpr beamWidth = 1;
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};
@@ -1160,16 +1208,16 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
        std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
        inputTokenExtraIds);
-    GenerationRequest seq0{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};

     ///////////////////////////////////////////////////////////////////////////
     // add request with loraTaskId 1 and then remove it
     auto constexpr beamIdx = 0;
     auto promptLen0 = llmRequest0->getNumTokens(beamIdx);
     auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest0->addNewToken(3, beamIdx);
     llmRequest0->addNewToken(4, beamIdx);
     auto numTokens = llmRequest0->getNumTokens(beamIdx);
@@ -1193,14 +1241,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
        false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5,
        std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
        inputTokenExtraIds);
-    GenerationRequest seq1{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, get new block 3, 4, 5
     auto promptLen1 = llmRequest1->getNumTokens(beamIdx);
     auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({3, 4, 5}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5}));
     llmRequest1->addNewToken(3, beamIdx);
     llmRequest1->addNewToken(4, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
@@ -1222,10 +1270,10 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
     // reuse blocks 0, 1 and get new block 6
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     llmRequest0->addNewToken(3, beamIdx);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 6}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 6}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
@@ -1242,9 +1290,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
        inputTokenExtraIds1);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({3, 4, 5}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5}));
     llmRequest1->addNewToken(5, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 2);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 2);
@@ -1268,13 +1316,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
        inputTokenExtraIds2);
     numTokens = llmRequest2->getNumTokens(beamIdx);
-    GenerationRequest seq2{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, get new block 7, 8, 9
     auto promptLen2 = llmRequest2->getNumTokens(beamIdx);
     auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2);
+    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow);
     EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq2.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({7, 8, 9}));
+    EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({7, 8, 9}));
     llmRequest2->addNewToken(3, beamIdx);
     numTokens = llmRequest2->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -1293,13 +1341,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
        inputTokenExtraIds3);
     numTokens = llmRequest3->getNumTokens(beamIdx);
-    GenerationRequest seq3{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse block 0, get new block 10, 11
     auto promptLen3 = llmRequest3->getNumTokens(beamIdx);
     auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3);
+    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow);
     EXPECT_EQ(llmRequest3->getContextCurrentPosition(), tokensPerBlock);
-    EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 10, 11}));
+    EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 10, 11}));
     llmRequest3->addNewToken(3, beamIdx);
     numTokens = llmRequest3->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -1317,13 +1365,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
        inputTokenExtraIds3);
     numTokens = llmRequest4->getNumTokens(beamIdx);
-    GenerationRequest seq4{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq4{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse block 3, get new block 12, 13
     auto promptLen4 = llmRequest4->getNumTokens(beamIdx);
     auto numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4);
+    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow);
     EXPECT_EQ(llmRequest4->getContextCurrentPosition(), tokensPerBlock);
-    EXPECT_THAT(seq4.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({3, 12, 13}));
+    EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 12, 13}));
     llmRequest4->addNewToken(3, beamIdx);
     numTokens = llmRequest4->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -1355,10 +1403,11 @@ TEST_F(KVCacheManagerTest, KVCacheManagerPerRequestStatsTest)
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};

+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream,
-        std::nullopt, true, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks);
+    kvCacheManager.allocatePools(false);

     auto inputTokens = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8});
     auto const inputLength = static_cast<SizeType32>(inputTokens->size());
@@ -1406,16 +1455,19 @@ TEST_F(KVCacheManagerTest, BlockManagerBlockPriorityTest)
     auto constexpr onboardBlocks = true;
     auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;

-    BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock,
-        blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks);
-    blockManager.allocatePools(nvinfer1::DataType::kHALF, false);
+    auto constexpr beamWidth = 1;
+
+    BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool,
+        blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth,
+        std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0,
+        onboardBlocks);
+    blockManager.allocatePools(false);

     EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

     SizeType32 constexpr maxNewTokens{0};
-    auto constexpr beamWidth = 1;
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};
@@ -1427,17 +1479,17 @@ TEST_F(KVCacheManagerTest, BlockManagerBlockPriorityTest)
        KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, 4, 90),
            KvCacheRetentionConfig::TokenRangeRetentionConfig(4, 8, 10)}, 20));
-    GenerationRequest seq0{0, inputLength0, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq0{0, inputLength0, beamWidth, blockManager.getWindowSizesMetadata()};
     auto numContextBlocks0 = tc::ceilDiv(inputLength0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, llmRequest0->getNumTokens(0), numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, llmRequest0->getNumTokens(0), numContextBlocks0, *llmRequest0, maxAttentionWindow);

     // Add another sequence with different tokens, at a low priority
     auto inputTokens1 = std::make_shared<VecTokens>(VecTokens{8, 9, 10, 11, 12, 13, 14, 15});
     auto const inputLength1 = static_cast<SizeType32>(inputTokens1->size());
     auto llmRequest1 = std::make_shared<LlmRequest>(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming);
-    GenerationRequest seq1{1, inputLength1, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq1{1, inputLength1, beamWidth, blockManager.getWindowSizesMetadata()};
     auto numContextBlocks1 = tc::ceilDiv(inputLength1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, llmRequest1->getNumTokens(0), numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, llmRequest1->getNumTokens(0), numContextBlocks1, *llmRequest1, maxAttentionWindow);

     // Release both sequences
     blockManager.releaseBlocks(seq0, llmRequest0);
@@ -1449,18 +1501,18 @@ TEST_F(KVCacheManagerTest, BlockManagerBlockPriorityTest)
     auto llmRequest2 = std::make_shared<LlmRequest>(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming);
     llmRequest2->setKvCacheRetentionConfig(
        KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 20)}, 20));
-    GenerationRequest seq2{2, inputLength2, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq2{2, inputLength2, beamWidth, blockManager.getWindowSizesMetadata()};
     auto numContextBlocks2 = tc::ceilDiv(inputLength2, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq2, llmRequest2->getNumTokens(0), numContextBlocks2, *llmRequest2);
+    blockManager.addSequence(seq2, llmRequest2->getNumTokens(0), numContextBlocks2, *llmRequest2, maxAttentionWindow);
     blockManager.releaseBlocks(seq2, llmRequest2);

     // Check that request 1 blocks were overwritten
     auto inputTokens3 = std::make_shared<VecTokens>(VecTokens{8, 9, 10, 11, 12, 13, 14, 15});
     auto const inputLength3 = static_cast<SizeType32>(inputTokens3->size());
     auto llmRequest3 = std::make_shared<LlmRequest>(3, maxNewTokens, inputTokens3, samplingConfig, isStreaming);
-    GenerationRequest seq3{3, inputLength3, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq3{3, inputLength3, beamWidth, blockManager.getWindowSizesMetadata()};
     auto numContextBlocks3 = tc::ceilDiv(inputLength3, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq3, llmRequest3->getNumTokens(0), numContextBlocks3, *llmRequest3);
+    blockManager.addSequence(seq3, llmRequest3->getNumTokens(0), numContextBlocks3, *llmRequest3, maxAttentionWindow);

     EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 4);
@@ -1471,9 +1523,9 @@ TEST_F(KVCacheManagerTest, BlockManagerBlockPriorityTest)
     auto inputTokens4 = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7});
     auto const inputLength4 = static_cast<SizeType32>(inputTokens4->size());
     auto llmRequest4 = std::make_shared<LlmRequest>(4, maxNewTokens, inputTokens4, samplingConfig, isStreaming);
-    GenerationRequest seq4{4, inputLength3, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq4{4, inputLength3, beamWidth, blockManager.getWindowSizesMetadata()};
     auto numContextBlocks4 = tc::ceilDiv(inputLength4, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq4, llmRequest4->getNumTokens(0), numContextBlocks4, *llmRequest4);
+    blockManager.addSequence(seq4, llmRequest4->getNumTokens(0), numContextBlocks4, *llmRequest4, maxAttentionWindow);

     EXPECT_EQ(llmRequest4->getContextCurrentPosition(), 4);
@@ -1481,9 +1533,9 @@ TEST_F(KVCacheManagerTest, BlockManagerBlockPriorityTest)
     auto inputTokens5 = std::make_shared<VecTokens>(VecTokens{16, 17, 18, 19, 20, 21, 22, 23});
     auto const inputLength5 = static_cast<SizeType32>(inputTokens5->size());
     auto llmRequest5 = std::make_shared<LlmRequest>(5, maxNewTokens, inputTokens5, samplingConfig, isStreaming);
-    GenerationRequest seq5{5, inputLength5, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq5{5, inputLength5, beamWidth, blockManager.getWindowSizesMetadata()};
     auto numContextBlocks5 = tc::ceilDiv(inputLength5, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq5, llmRequest5->getNumTokens(0), numContextBlocks5, *llmRequest5);
+    blockManager.addSequence(seq5, llmRequest5->getNumTokens(0), numContextBlocks5, *llmRequest5, maxAttentionWindow);

     EXPECT_EQ(llmRequest5->getContextCurrentPosition(), 0);
 }
@@ -1506,10 +1558,11 @@ TEST_F(KVCacheManagerTest, KVCacheManagerDecodeBlockPriorityTest)
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};

+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream,
-        std::nullopt, true, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks);
+    kvCacheManager.allocatePools(false);

     auto const& blockManager = kvCacheManager.getBlockManager();
@@ -1608,10 +1661,11 @@ TEST_F(KVCacheManagerTest, KVCacheManagerTimedEvictionTest)
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};

+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream,
-        std::nullopt, true, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks);
+    kvCacheManager.allocatePools(false);

     auto inputTokens0 = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
     auto const inputLength0 = static_cast<SizeType32>(inputTokens0->size());
@@ -1672,10 +1726,11 @@ TEST_F(KVCacheManagerTest, KVCacheManagerDecodeTimedEvictionTest)
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};

+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream,
-        std::nullopt, true, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks);
+    kvCacheManager.allocatePools(false);
     {
         auto inputTokens0 = std::make_shared<VecTokens>(VecTokens{1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
         auto const inputLength0 = static_cast<SizeType32>(inputTokens0->size());
@@ -1745,10 +1800,11 @@ TEST_F(KVCacheManagerTest, KVCacheManagerSecondaryBlockPrimaryChildTest)
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};

+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, false, stream, true,
-        onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, false, stream, true, onboardBlocks);
+    kvCacheManager.allocatePools(false);

     auto inputTokens0 = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
     auto const inputLength0 = static_cast<SizeType32>(inputTokens0->size());
@@ -1817,10 +1873,11 @@ TEST_F(KVCacheManagerTest, KVCacheManagerLeafBlockTest)
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};

+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, false, stream, true,
-        onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, false, stream, true, onboardBlocks);
+    kvCacheManager.allocatePools(false);

     auto inputTokens0 = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3});
     auto const inputLength0 = static_cast<SizeType32>(inputTokens0->size());
@@ -1842,7 +1899,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerLeafBlockTest)
     GenerationRequest const& seq1 = kvCacheManager.getSequence(1);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 0);
     // Block 1 should NOT be reused. It was not freed even if partial.
-    EXPECT_THAT(seq1.getCacheBlockIds().at(0), ::testing::ElementsAreArray({2}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(0), ::testing::ElementsAreArray({2}));

     // Allocate the remaining 3 blocks in primary
     auto inputTokens2 = std::make_shared<VecTokens>(VecTokens{2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
@@ -1890,7 +1947,6 @@ TEST_P(KVCacheManagerTest, DISABLED_KVCacheManagerAllocationTest)
     auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq;
     auto constexpr maxAttentionWindow = maxNumTokens;
-    auto constexpr temporaryAttentionWindow = 0;
     auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1;
     auto constexpr numSharedBlocks = inputLength / tokensPerBlock;
     auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth;
@@ -1906,15 +1962,17 @@ TEST_P(KVCacheManagerTest, DISABLED_KVCacheManagerAllocationTest)
     auto const granularity = tensorrt_llm::common::getAllocationGranularity();
     KVCacheManager kvCacheManager = homogeneousLayers
         ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool,
-            maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream,
-            std::nullopt, enableBlockReuse, onboardBlocks)
+            maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
+            nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks)
         : KVCacheManager(std::vector<SizeType32>(numLayers, numHeads), sizePerHead, tokensPerBlock,
-            totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, {maxAttentionWindow},
-            temporaryAttentionWindow, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);
+            totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth,
+            std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF,
+            sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);

-    auto const& bufferManager = kvCacheManager.getBlockManager().getBufferManager();
+    auto const blockManager = kvCacheManager.getBlockManager();
+    auto const& bufferManager = blockManager.getBufferManager(theOnlyWindowSize(kvCacheManager));
     auto const memoryPoolUsedBefore = bufferManager.memoryPoolUsed();
-    kvCacheManager.allocatePools(dtype, useUvm);
+    kvCacheManager.allocatePools(useUvm);
     auto const memoryPoolUsedAfter = bufferManager.memoryPoolUsed();
     EXPECT_GT(memoryPoolUsedAfter, memoryPoolUsedBefore);
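The helper theOnlyWindowSize used above is defined elsewhere in the test file; judging by this call site it resolves the single configured window size so the per-window getBufferManager overload can be used. A plausible reconstruction, assuming the metadata map is keyed by window size (hypothetical, the actual definition is not part of this diff):

    // Hypothetical helper: valid only when exactly one window size is configured.
    SizeType32 theOnlyWindowSize(KVCacheManager const& kvCacheManager)
    {
        auto const& windowSizeToMetadata = kvCacheManager.getBlockManager().getWindowSizesMetadata();
        EXPECT_EQ(windowSizeToMetadata.size(), 1);
        return windowSizeToMetadata.begin()->first;
    }
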
@@ -1948,7 +2006,6 @@ TEST_P(KVCacheManagerTest, KVCacheManagerTest)
     auto constexpr requestId = 7;
     auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq;
     auto constexpr maxAttentionWindow = maxNumTokens;
-    auto constexpr temporaryAttentionWindow = 0;
     auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1;
     auto constexpr numSharedBlocks = inputLength / tokensPerBlock;
     auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth;
@@ -1963,14 +2020,14 @@ TEST_P(KVCacheManagerTest, KVCacheManagerTest)
     KVCacheManager kvCacheManager = homogeneousLayers
         ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool,
-            maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream,
-            std::nullopt, enableBlockReuse, onboardBlocks)
+            maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
+            nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks)
         : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool,
-            maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream,
-            std::nullopt, enableBlockReuse, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+            maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
+            nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);
+    kvCacheManager.allocatePools(false);

-    EXPECT_EQ(kvCacheManager.getMaxBlocksPerSeq(), maxBlocksPerSeq);
+    EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq);
     EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks);
@@ -2096,7 +2153,6 @@ TEST_P(KVCacheManagerTest, KVCacheManagerRewindTokensTest)
     auto constexpr requestId = 7;
     auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq;
     auto constexpr maxAttentionWindow = maxNumTokens;
-    auto constexpr temporaryAttentionWindow = 0;
     auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1;
     auto constexpr numSharedBlocks = inputLength / tokensPerBlock;
     auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth;
@@ -2110,12 +2166,13 @@ TEST_P(KVCacheManagerTest, KVCacheManagerRewindTokensTest)
     KVCacheManager kvCacheManager = homogeneousLayers
         ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool,
-            maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream,
-            std::nullopt, enableBlockReuse, onboardBlocks)
+            maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
+            nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks)
         : KVCacheManager(std::vector<SizeType32>(numLayers, numHeads), sizePerHead, tokensPerBlock,
-            totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, {maxAttentionWindow},
-            temporaryAttentionWindow, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+            totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth,
+            std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF,
+            sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);
+    kvCacheManager.allocatePools(false);

     EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks);
@@ -2181,7 +2238,6 @@ TEST_P(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowTest)
     auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1;
     // Enable cyclic kv cache for all new generated tokens.
     auto constexpr maxAttentionWindow = inputLength;
-    auto constexpr temporaryAttentionWindow = 0;
     auto constexpr numSharedBlocks = std::min(inputLength, maxAttentionWindow) / tokensPerBlock;
     auto constexpr numBlocksPerSeq = numSharedBlocks + (blockLengthPerSeq - numSharedBlocks) * maxBeamWidth;
     auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxAttentionWindow, tokensPerBlock);
@@ -2196,14 +2252,14 @@ TEST_P(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowTest)
     KVCacheManager kvCacheManager = homogeneousLayers
         ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool,
-            maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream,
-            std::nullopt, enableBlockReuse, onboardBlocks)
+            maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
+            nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks)
         : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool,
-            maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream,
-            std::nullopt, enableBlockReuse, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+            maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
+            nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);
+    kvCacheManager.allocatePools(false);

-    EXPECT_EQ(kvCacheManager.getMaxBlocksPerSeq(), maxBlocksPerSeq);
+    EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq);
     EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks);
@@ -2303,7 +2359,6 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
     // Enable cyclic kv cache for long input tokens.
     auto constexpr maxAttentionWindow = 16;
-    auto constexpr temporaryAttentionWindow = 0;
     auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxAttentionWindow, tokensPerBlock);
     auto constexpr blocksInPrimaryPool = 16;
@@ -2313,9 +2368,10 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
     auto constexpr onboardBlocks = true;

     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow,
-        sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse,
+        onboardBlocks);
+    kvCacheManager.allocatePools(false);

     auto const& blockManager = kvCacheManager.getBlockManager();
@@ -2340,7 +2396,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
     kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
     GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId);
     EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2, 3}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2, 3}));

     // add tokens to enable cyclic kv cache
     llmRequest->addNewToken(1016, beamIdx);
@@ -2348,7 +2404,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
     llmRequest->addNewToken(1017, beamIdx);
     kvCacheManager.addToken(requestId);
     auto numTokens = llmRequest->getNumTokens(beamIdx);
-    auto numBlocks = seq0.getCacheBlockIds()[beamIdx].size();
+    auto numBlocks = seq0.getCacheBlockIds(maxAttentionWindow)[beamIdx].size();
     EXPECT_EQ(numBlocks, maxBlocksPerSeq);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
@@ -2366,14 +2422,14 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
     kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
     GenerationRequest const& seq1 = kvCacheManager.getSequence(requestId);
     EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({4, 5}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({4, 5}));
     llmRequest->addNewToken(1007, beamIdx);
     kvCacheManager.addToken(requestId);
     llmRequest->addNewToken(1008, beamIdx);
     kvCacheManager.addToken(requestId);
     numTokens = llmRequest->getNumTokens(beamIdx);
-    numBlocks = seq1.getCacheBlockIds()[beamIdx].size();
+    numBlocks = seq1.getCacheBlockIds(maxAttentionWindow)[beamIdx].size();
     EXPECT_EQ(numBlocks, 3);
     EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
     // store blocks 4, 5 for reuse ([1000,1001,1002,1003], [1004,1005,1006,1007])
@@ -2389,7 +2445,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
     kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
     GenerationRequest const& seq2 = kvCacheManager.getSequence(requestId);
     EXPECT_EQ(llmRequest->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq2.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({4, 5, 7}));
+
EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({4, 5, 7})); numTokens = llmRequest->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); @@ -2400,7 +2456,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest) /////////////////////////////////////////////////////////////////////////// // add a longer request within attention window and try to reuse // reuse blocks 4, 5, 7(p) and get new block 8 - // upon reached attention window, shared block 4 is replaced with unshared block 9 + // upon reaching the attention window, the block ids shouldn't change requestId = 3; inputLength = 15; inputTokens->resize(inputLength); @@ -2409,13 +2465,15 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq3 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 9); - EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({4, 5, 7, 8})); + EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({4, 5, 7, 8})); llmRequest->addNewToken(1015, beamIdx); kvCacheManager.addToken(requestId); llmRequest->addNewToken(1016, beamIdx); kvCacheManager.addToken(requestId); - EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({9, 5, 7, 8})); + // FIXME: This means that reuse will break here - the window will start writing to a reused block, and the following + // sequence that tries to reuse the block will read garbage. This will be fixed by removing the cyclic kv cache. + EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({4, 5, 7, 8})); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); /////////////////////////////////////////////////////////////////////////// @@ -2428,7 +2486,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); GenerationRequest const& seq4 = kvCacheManager.getSequence(requestId); - EXPECT_THAT(seq4.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({10, 11, 12, 13})); + EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({9, 10, 11, 12})); } TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest) @@ -2440,13 +2498,13 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest) auto constexpr maxNumSequences = 8; auto constexpr maxBeamWidth = 1; auto constexpr sinkTokenLength = 0; + auto constexpr dtype = nvinfer1::DataType::kHALF; auto const stream = std::make_shared(); // Enable cyclic kv cache for long input tokens. 
auto constexpr minAttentionWindow = 8; auto constexpr maxAttentionWindow = 16; auto const maxAttentionWindowVec = std::vector<SizeType32>{maxAttentionWindow, minAttentionWindow}; - auto constexpr temporaryAttentionWindow = 0; auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxAttentionWindow, tokensPerBlock); auto constexpr blocksInPrimaryPool = 16; @@ -2456,12 +2514,18 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest) auto constexpr onboardBlocks = true; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, temporaryAttentionWindow, + blocksInSecondaryPool, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, std::nullopt, dtype, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks); - kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false); + kvCacheManager.allocatePools(false); auto const& blockManager = kvCacheManager.getBlockManager(); + auto const allBlocksInPrimaryPools = blockManager.getNumPrimaryBlocks(); + EXPECT_EQ(allBlocksInPrimaryPools, blocksInPrimaryPool); + + ASSERT_TRUE(blockManager.isVariableWindow()); + ASSERT_FALSE(blockManager.isVariableGQA()); + SizeType32 constexpr maxNewTokens = 4; // prepare tokens with token[i] = 1000 + i @@ -2483,21 +2547,31 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); - EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1})); - // add tokens to enable cyclic kv cache + auto const assertBlocks + = [minAttentionWindow, maxAttentionWindow, beamIdx](GenerationRequest const& seq, + std::initializer_list<SizeType32> expectedBlocksMin, std::initializer_list<SizeType32> expectedBlocksMax) + { + auto blocksMin = seq.getCacheBlockIds(minAttentionWindow).at(beamIdx); + auto blocksMax = seq.getCacheBlockIds(maxAttentionWindow).at(beamIdx); + EXPECT_THAT(blocksMin, ::testing::ElementsAreArray(expectedBlocksMin)); + EXPECT_THAT(blocksMax, ::testing::ElementsAreArray(expectedBlocksMax)); + return blocksMin.size() + blocksMax.size(); + }; + + assertBlocks(seq0, {0, 1}, {0, 1}); + + // add tokens to enable cyclic kv cache for minimum but not maximum llmRequest->addNewToken(1016, beamIdx); kvCacheManager.addToken(requestId); llmRequest->addNewToken(1017, beamIdx); kvCacheManager.addToken(requestId); - auto numTokens = llmRequest->getNumTokens(beamIdx); - auto numBlocks = seq0.getCacheBlockIds()[beamIdx].size(); - EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); - EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); + auto const numBlocks = assertBlocks(seq0, {0, 1}, {0, 1, 2}); + EXPECT_EQ(blockManager.getNumFreeBlocks(), allBlocksInPrimaryPools - numBlocks); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); // no blocks stored because cyclic KV cache was enabled EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); - EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); + EXPECT_EQ(blockManager.getNumFreeBlocks(), allBlocksInPrimaryPools); /////////////////////////////////////////////////////////////////////////// // add a short request that is between the min and max attention window @@ -2508,15 +2582,12 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest)
kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq1 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); - EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({3, 4, 5})); - + assertBlocks(seq1, {2, 3}, {3, 4, 5}); llmRequest->addNewToken(1007, beamIdx); kvCacheManager.addToken(requestId); llmRequest->addNewToken(1008, beamIdx); kvCacheManager.addToken(requestId); - numTokens = llmRequest->getNumTokens(beamIdx); - numBlocks = seq1.getCacheBlockIds()[beamIdx].size(); - EXPECT_EQ(numBlocks, 3); + assertBlocks(seq1, {2, 3}, {3, 4, 5}); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); /////////////////////////////////////////////////////////////////////////// @@ -2529,11 +2600,10 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq2 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); - EXPECT_THAT(seq2.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({6})); + assertBlocks(seq2, {4}, {6}); - numTokens = llmRequest->getNumTokens(beamIdx); - numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); - EXPECT_EQ(numBlocks, 1); + auto const numTokens = llmRequest->getNumTokens(beamIdx); + EXPECT_EQ(tc::ceilDiv(numTokens, tokensPerBlock), 1); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); // store block 6 for reuse @@ -2547,7 +2617,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq3 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 3); - EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({6})); + assertBlocks(seq3, {4}, {6}); } namespace @@ -2566,31 +2636,31 @@ KVCacheManager setupKvCacheManagerForHashTest(bool enableBlockReuse) auto constexpr maxBlocksPerSeq = 8; auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxAttentionWindow = maxNumTokens; - auto constexpr temporaryAttentionWindow = 0; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; - return {std::vector(numLayers, numHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, + return KVCacheManager(std::vector(numLayers, numHeads), sizePerHead, tokensPerBlock, + blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, + std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks, CacheType::kSELF, std::nullopt, nullptr, - /*enableHashKey*/ true}; + /*enableHashKey*/ true); } std::vector getHashAndRetrieveBlocksByHashTest( - BlockManager const& blockManager, std::vector const& blockIds) + BlockManager const& blockManager, std::vector const& blockIds, SizeType32 windowSize) { std::vector blockHashes; for (auto blockId : blockIds) { - blockHashes.emplace_back(blockManager.getBlockById(blockId)->getHash()); + blockHashes.emplace_back(blockManager.getBlockById(blockId, windowSize)->getHash()); } std::vector blockPtrs; for (auto hash : blockHashes) { - auto range = blockManager.getBlocksByHash(hash); + 
auto range = blockManager.getBlocksByHash(hash, windowSize); BlockPtr const prevBlock = blockPtrs.empty() ? nullptr : blockPtrs.back(); BlockPtr thisBlock = nullptr; for (auto it = range.first; it != range.second; ++it) @@ -2607,7 +2677,7 @@ std::vector getHashAndRetrieveBlocksByHashTest( EXPECT_EQ(blockHashes.size(), blockPtrs.size()); for (size_t i = 0; i < blockHashes.size(); i++) { - EXPECT_EQ(blockManager.getBlockById(blockIds[i]), blockPtrs[i]); + EXPECT_EQ(blockManager.getBlockById(blockIds[i], windowSize), blockPtrs[i]); } return blockHashes; } @@ -2640,18 +2710,21 @@ TEST_F(KVCacheManagerTest, KVCacheManagerHashKeyTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); - auto& blockIds = seq.getCacheBlockIds().at(beamIdx); + + auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager); + + auto& blockIds = seq.getCacheBlockIds(onlyWindowSize).at(beamIdx); EXPECT_THAT(blockIds, ::testing::ElementsAreArray({0, 1, 2, 3})); // get blocks by hash and try to retrieve them by hash - auto blockHashes = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds); + auto blockHashes = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds, onlyWindowSize); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); // blocks are all removed for (auto hash : blockHashes) { - auto range = blockManager.getBlocksByHash(hash); + auto range = blockManager.getBlocksByHash(hash, onlyWindowSize); EXPECT_EQ(range.first, range.second); } EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); @@ -2684,14 +2757,17 @@ TEST_F(KVCacheManagerTest, KVCacheManagerHashKeyWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); - auto& blockIds0 = seq0.getCacheBlockIds().at(beamIdx); + + EXPECT_EQ(blockManager.getNumPools(), 1); + auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager); + + auto& blockIds0 = seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx); EXPECT_THAT(blockIds0, ::testing::ElementsAreArray({0, 1, 2, 3})); // get blocks by hash and try to retrieve them by hash - auto blockHashes = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds0); + auto blockHashes = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds0, onlyWindowSize); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); - // store 4 blocks with total of 15 reusable tokens (last token is not stored). // TODO: Make reused blocks accessible by hash, after sequence removed. Test here. 
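Note: the hash tests above (and several earlier hunks) call a theOnlyWindowSize(kvCacheManager) helper whose definition is outside this excerpt. A minimal sketch of what such a helper presumably looks like, assuming it merely asserts that exactly one window size is configured and returns it, and that getWindowSizesMetadata() exposes the window-size-to-metadata map introduced in kvCacheManager.h (the exact signature is an assumption):

// Sketch only, not the actual helper: resolves the single configured window
// size so that the now window-size-aware APIs (getCacheBlockIds,
// getBlocksByHash, getBlockById, getBufferManager, ...) can be called
// unambiguously in single-window tests.
SizeType32 theOnlyWindowSize(KVCacheManager const& kvCacheManager)
{
    auto const& windowSizeToMetadata = kvCacheManager.getBlockManager().getWindowSizesMetadata();
    EXPECT_EQ(windowSizeToMetadata.size(), 1);
    return windowSizeToMetadata.cbegin()->first;
}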
@@ -2705,21 +2781,21 @@ TEST_F(KVCacheManagerTest, KVCacheManagerHashKeyWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq1 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 15); - auto& blockIds1 = seq1.getCacheBlockIds().at(beamIdx); + auto& blockIds1 = seq1.getCacheBlockIds(onlyWindowSize).at(beamIdx); EXPECT_THAT(blockIds1, ::testing::ElementsAreArray({0, 1, 2, 3, 4})); - std::ignore = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds1); + std::ignore = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds1, onlyWindowSize); // blocks are reused, so reused blocks are still accessible by previous hashes for (size_t i = 0; i < 4; i++) { - auto range = blockManager.getBlocksByHash(blockHashes[i]); + auto range = blockManager.getBlocksByHash(blockHashes[i], onlyWindowSize); EXPECT_NE(range.first, range.second); } // evicted block is not accessible { size_t i = 4; - auto range = blockManager.getBlocksByHash(blockHashes[i]); + auto range = blockManager.getBlocksByHash(blockHashes[i], onlyWindowSize); EXPECT_EQ(range.first, range.second); } EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 5); @@ -2744,11 +2820,12 @@ TEST_F(KVCacheManagerTest, KVCacheManagerEventStream) tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; + auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream, - std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, + blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, + std::nullopt, dtype, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, std::make_unique(1024)); - kvCacheManager.allocatePools(dtype, false); + kvCacheManager.allocatePools(false); auto events = getEvents(kvCacheManager); EXPECT_EQ(events.size(), 1); @@ -2896,11 +2973,12 @@ TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamOverflow) tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; + auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream, - std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, + blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, + std::nullopt, dtype, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, std::make_unique(1)); - kvCacheManager.allocatePools(dtype, false); + kvCacheManager.allocatePools(false); auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); auto llmRequest0 = std::make_shared(0, 0, inputTokens0, samplingConfig, true); @@ -2950,11 +3028,12 @@ TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamPriority) tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; + auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream, - std::nullopt, true, onboardBlocks, 
CacheType::kSELF, std::nullopt, + blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, + std::nullopt, dtype, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, std::make_unique(1024)); - kvCacheManager.allocatePools(dtype, false); + kvCacheManager.allocatePools(false); auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7}); auto llmRequest0 = std::make_shared(0, 0, inputTokens0, samplingConfig, true); @@ -3021,18 +3100,19 @@ TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamBlocking) tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; + auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManagerTest(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream, - std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt); + blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, + std::nullopt, dtype, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt); EXPECT_EQ(getEvents(kvCacheManagerTest).size(), 0); KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream, - std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, - std::make_unique(1024)); + blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, + std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, + std::nullopt, std::make_unique(1024)); - kvCacheManager.allocatePools(dtype, false); + kvCacheManager.allocatePools(false); kvCacheManager.flushIterationEvents(); auto events = kvCacheManager.getLatestEvents(std::chrono::seconds(1)); @@ -3113,7 +3193,6 @@ TEST_P(KVCacheManagerTest, KVCacheManagerSinkTokenLengthTest) auto constexpr bubbleLength = (sinkTokensInLastBlock) ? tokensPerBlock - sinkTokensInLastBlock : 0; auto constexpr inputLength = tokensPerBlock * maxBlocksPerSeq - bubbleLength - 1; auto constexpr maxAttentionWindow = inputLength - tokensPerBlock; - auto constexpr temporaryAttentionWindow = 0; auto constexpr numSharedBlocks = (sinkTokenLength + bubbleLength) / tokensPerBlock; auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth; @@ -3130,14 +3209,14 @@ TEST_P(KVCacheManagerTest, KVCacheManagerSinkTokenLengthTest) auto const maxSequenceLength = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager = homogeneousLayers ? 
KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, - maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream, - maxSequenceLength, enableBlockReuse, onboardBlocks) + maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, + nvinfer1::DataType::kHALF, sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks) : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, - maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream, - maxSequenceLength, enableBlockReuse, onboardBlocks); - kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false); + maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, + nvinfer1::DataType::kHALF, sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks); + kvCacheManager.allocatePools(false); - EXPECT_EQ(kvCacheManager.getMaxBlocksPerSeq(), maxBlocksPerSeq); + EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq); EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks); @@ -3252,7 +3331,6 @@ TEST_P(KVCacheManagerTest, KVCacheManagerBatchTest) auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxAttentionWindow = maxNumTokens; - auto constexpr temporaryAttentionWindow = 0; auto constexpr inputLength = maxNumTokens - 2; auto constexpr numBlocksPerSeq = maxBlocksPerSeq - 1 + maxBeamWidth; @@ -3266,14 +3344,14 @@ TEST_P(KVCacheManagerTest, KVCacheManagerBatchTest) KVCacheManager kvCacheManager = homogeneousLayers ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, - maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream, - std::nullopt, enableBlockReuse, onboardBlocks) + maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, + nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks) : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, - maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream, - std::nullopt, enableBlockReuse, onboardBlocks); - kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false); + maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, + nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks); + kvCacheManager.allocatePools(false); - EXPECT_EQ(kvCacheManager.getMaxBlocksPerSeq(), maxBlocksPerSeq); + EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq); EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks); @@ -3395,22 +3473,23 @@ void testNeededBlocksOneStep(bool kv_cache_block_reuse, int beamWidth, int draft auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq; // auto constexpr maxAttentionWindow = maxNumTokens / 2; auto constexpr maxAttentionWindow = 46; - auto constexpr temporaryAttentionWindow = 0; auto constexpr totalNumBlocks = maxNumSequences * maxBlocksPerSeq; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; KVCacheManager kvCacheManager = homogeneousLayers ? 
KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, - blocksInSecondaryPool, maxNumSequences, maxBeamWidth, {maxAttentionWindow}, - temporaryAttentionWindow, sinkTokenLength, stream, std::nullopt, kv_cache_block_reuse, - onboardBlocks) + blocksInSecondaryPool, maxNumSequences, maxBeamWidth, + std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, + sinkTokenLength, stream, std::nullopt, kv_cache_block_reuse, onboardBlocks) : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, - maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, - stream, std::nullopt, kv_cache_block_reuse, onboardBlocks); - kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false); + maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, + std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, + kv_cache_block_reuse, onboardBlocks); + kvCacheManager.allocatePools(false); - EXPECT_EQ(kvCacheManager.getMaxBlocksPerSeq(), tc::ceilDiv(maxAttentionWindow, tokensPerBlock)); + EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, + tc::ceilDiv(maxAttentionWindow, tokensPerBlock)); auto inputTokens = std::make_shared(VecTokens(inputLength, 0)); @@ -3419,8 +3498,11 @@ void testNeededBlocksOneStep(bool kv_cache_block_reuse, int beamWidth, int draft = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); llmRequest->setDraftTokens(draftTokens); - auto remainingBlocksToCompletion = kvCacheManager.getRemainingBlocksToCompletion(*llmRequest); - auto neededBlocksOneStep = kvCacheManager.getNeededBlocksOneStep(*llmRequest, false); + auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager); + + auto remainingBlocksToCompletion + = kvCacheManager.getRemainingBlocksToCompletion(*llmRequest, onlyWindowSize); + auto neededBlocksOneStep = kvCacheManager.getNeededBlocksOneStep(*llmRequest, false, onlyWindowSize); EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth, llmRequest)); for (int di = 0; di < draftLen && di < maxNewTokens && (inputLength + di) < maxAttentionWindow; ++di) @@ -3445,7 +3527,7 @@ void testNeededBlocksOneStep(bool kv_cache_block_reuse, int beamWidth, int draft llmRequest->addNewToken(1, beam); } - neededBlocksOneStep = kvCacheManager.getNeededBlocksOneStep(*llmRequest, false); + neededBlocksOneStep = kvCacheManager.getNeededBlocksOneStep(*llmRequest, false, onlyWindowSize); for (int beam = 0; beam < maxBeamWidth; beam++) { @@ -3468,7 +3550,7 @@ void testNeededBlocksOneStep(bool kv_cache_block_reuse, int beamWidth, int draft // After adding all tokens, we should match remainingBlocksToCompletion EXPECT_EQ(remainingBlocksToCompletion, kvCacheManager.getUsedNumBlocks()); - EXPECT_EQ(kvCacheManager.getRemainingBlocksToCompletion(*llmRequest), 0); + EXPECT_EQ(kvCacheManager.getRemainingBlocksToCompletion(*llmRequest, onlyWindowSize), 0); } } } @@ -3587,6 +3669,8 @@ struct KvCacheManagerInstantiationParameters SizeType32 maxBeamWidth; SizeType32 maxNumTokens; bool kvCacheBlockReuse; + std::vector maxAttentionWindowVec = {maxAttentionWindow}; + nvinfer1::DataType dtype = nvinfer1::DataType::kFLOAT; }; struct GetRemainingBlocksToCompletionOneRequestParameters @@ -3608,8 +3692,8 @@ std::shared_ptr createKvCacheManager( KvCacheManagerInstantiationParameters const& kvCacheInstantiationParameters, StreamPtr stream) { auto const maxInputLength = 
kvCacheInstantiationParameters.maxNumTokens - 1; - auto const temporaryKvCacheLength = std::min(kvCacheInstantiationParameters.maxNumTokens, - maxInputLength - kvCacheInstantiationParameters.maxAttentionWindow); + auto const temporaryKvCacheInputs + = TempAttentionWindowInputs{/*pagedContextFMHA=*/true, /*maxInputLen=*/maxInputLength, /*maxNumTokens=*/kvCacheInstantiationParameters.maxNumTokens}; if (std::holds_alternative<SizeType32>(kvCacheInstantiationParameters.numHeadsPerLayer)) { @@ -3619,9 +3703,9 @@ std::shared_ptr<KVCacheManager> createKvCacheManager( return std::make_shared<KVCacheManager>(numHeadsPerLayerVec, kvCacheInstantiationParameters.sizePerHead, kvCacheInstantiationParameters.tokensPerBlock, kvCacheInstantiationParameters.numBlocksInPrimaryPool, 0, kvCacheInstantiationParameters.numBlocksInPrimaryPool, kvCacheInstantiationParameters.maxBeamWidth, - std::vector<SizeType32>{kvCacheInstantiationParameters.maxAttentionWindow}, temporaryKvCacheLength, - kvCacheInstantiationParameters.sinkTokenLength, stream, std::nullopt, - kvCacheInstantiationParameters.kvCacheBlockReuse, true); + std::vector<SizeType32>{kvCacheInstantiationParameters.maxAttentionWindow}, temporaryKvCacheInputs, + kvCacheInstantiationParameters.dtype, kvCacheInstantiationParameters.sinkTokenLength, stream, std::nullopt, + kvCacheInstantiationParameters.kvCacheBlockReuse, true, CacheType::kSELF); } if (std::holds_alternative<std::vector<SizeType32>>(kvCacheInstantiationParameters.numHeadsPerLayer)) { @@ -3630,9 +3714,9 @@ std::shared_ptr<KVCacheManager> createKvCacheManager( return std::make_shared<KVCacheManager>(numHeadsPerLayer, kvCacheInstantiationParameters.sizePerHead, kvCacheInstantiationParameters.tokensPerBlock, kvCacheInstantiationParameters.numBlocksInPrimaryPool, 0, kvCacheInstantiationParameters.numBlocksInPrimaryPool, kvCacheInstantiationParameters.maxBeamWidth, - std::vector<SizeType32>{kvCacheInstantiationParameters.maxAttentionWindow}, temporaryKvCacheLength, - kvCacheInstantiationParameters.sinkTokenLength, stream, std::nullopt, - kvCacheInstantiationParameters.kvCacheBlockReuse, true); + std::vector<SizeType32>{kvCacheInstantiationParameters.maxAttentionWindow}, temporaryKvCacheInputs, + kvCacheInstantiationParameters.dtype, kvCacheInstantiationParameters.sinkTokenLength, stream, std::nullopt, + kvCacheInstantiationParameters.kvCacheBlockReuse, true, CacheType::kSELF); } TLLM_THROW("Unhandled type of num heads per layer provided."); } @@ -3652,7 +3736,8 @@ std::vector fillKvCacheManager(KVCacheManager& kvCacheManager, SizeT // Adding enough requests to fill the kv-cache.
auto remainingFreeBlocks = kvCacheManager.getNumFreeBlocks(); auto llmRequests = std::vector{}; - auto const remainingBlocksToCompletionFromStart = kvCacheManager.getRemainingBlocksToCompletion(llmRequest); + auto const remainingBlocksToCompletionFromStart + = kvCacheManager.getRemainingBlocksToCompletion(llmRequest, theOnlyWindowSize(kvCacheManager)); do { ++requestIdStart; @@ -3682,7 +3767,7 @@ class RemainingBlocksToCompletionTest auto const stream = std::make_shared(); auto const params = GetParam(); kvCacheManager = createKvCacheManager(params.kvCacheManagerInstantiationParameters, stream); - kvCacheManager->allocatePools(nvinfer1::DataType::kFLOAT); + kvCacheManager->allocatePools(false); } void TearDown() override {} @@ -3701,7 +3786,7 @@ TEST_P(RemainingBlocksToCompletionTest, RemainingBlocksToCompletionCorrectlyEsti tensorrt_llm::runtime::SamplingConfig{params.kvCacheManagerInstantiationParameters.maxBeamWidth}, true, }; - auto const result = kvCacheManager->getRemainingBlocksToCompletion(llmRequest); + auto const result = kvCacheManager->getRemainingBlocksToCompletion(llmRequest, theOnlyWindowSize(*kvCacheManager)); ASSERT_EQ(result, params.expectedRemainingBlocksToCompletion); } @@ -3766,7 +3851,7 @@ class FillKvCacheAndCompleteRequestsTest : public ::testing::TestWithParam(); auto const params = GetParam(); kvCacheManager = createKvCacheManager(params.kvCacheManagerInstantiationParameters, stream); - kvCacheManager->allocatePools(nvinfer1::DataType::kFLOAT); + kvCacheManager->allocatePools(false); } void TearDown() override {} diff --git a/cpp/tests/unit_tests/batch_manager/kvCacheUtilsTest.cpp b/cpp/tests/unit_tests/batch_manager/kvCacheUtilsTest.cpp index 7bb9a096193..8c167e1219f 100644 --- a/cpp/tests/unit_tests/batch_manager/kvCacheUtilsTest.cpp +++ b/cpp/tests/unit_tests/batch_manager/kvCacheUtilsTest.cpp @@ -84,16 +84,21 @@ TEST_F(BlockIteratorTest, CacheManagerTest) auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; + auto constexpr beamWidth = 1; + auto constexpr numBlocksPerBeam = blocksInPrimaryPool / beamWidth; + auto constexpr maxSequenceLength = tokensPerBlock * numBlocksPerBeam; + auto const maxAttentionWindowVec = std::vector{maxAttentionWindow}; + BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, - blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks); - blockManager.allocatePools(dataType, false); + blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, maxSequenceLength, beamWidth, + maxAttentionWindowVec, std::nullopt, dataType, 0, onboardBlocks); + blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); SizeType32 constexpr maxNewTokens{0}; - auto constexpr beamWidth = 1; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; @@ -102,14 +107,14 @@ TEST_F(BlockIteratorTest, CacheManagerTest) LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); - GenerationRequest seq0{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow}; + GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; auto constexpr beamIdx = 0; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = 
tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); - blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0); + blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); - auto const blockIds = seq0.getCacheBlockIds().at(beamIdx); + auto const blockIds = seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(blockIds, ::testing::ElementsAreArray({0, 1, 2})); auto const pool = blockManager.getPrimaryPool(0); diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index 817d030b27d..498d4ef565a 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -147,7 +147,10 @@ def __init__( if kv_cache_config.max_attention_window is None: max_attention_window = max_seq_len else: - max_attention_window = max(kv_cache_config.max_attention_window) + assert len( + kv_cache_config.max_attention_window + ) == 1, "Python KvCacheManager doesn't currently support variable window attention" + max_attention_window = kv_cache_config.max_attention_window[0] sink_token_length = (kv_cache_config.sink_token_length if kv_cache_config.sink_token_length is not None @@ -176,8 +179,7 @@ def __init__( max_attention_window = max_atten_window_upper_bound self.max_seq_len = max_atten_window_upper_bound - max_kv_cache_len = (max_attention_window if kv_cache_type - == CacheTypeCpp.SELF else self.max_seq_len) + self.max_attention_window = max_attention_window if kv_cache_type == CacheTypeCpp.SELF else self.max_seq_len # Note that this stream is unused for now. Will be used for copying to host # when that feature is enabled. @@ -190,8 +192,9 @@ def __init__( 'blocks_in_secondary_pool': self.blocks_in_secondary_pool, 'max_num_sequences': max_batch_size, 'max_beam_width': 1, # TODO: more than 1 beam? - 'max_attention_window_vec': [max_kv_cache_len], - 'temporary_attention_window': 0, + 'max_attention_window_vec': [self.max_attention_window], + 'temp_attention_window_inputs': None, + 'dtype': dtype, 'sink_token_length': sink_token_length, 'stream': self._stream.cuda_stream, 'max_sequence_length': max_seq_len, @@ -207,7 +210,7 @@ def __init__( self.impl = KVCacheManagerCpp(**kwargs) - self.impl.allocate_pools(dtype, False) + self.impl.allocate_pools(False) self.kv_cache_pool_pointers = self.impl.get_block_pool_pointers() self.kv_cache_pool_mapping = self.impl.get_layer_to_pool_mapping() self.num_pools = self.impl.num_pools @@ -407,7 +410,8 @@ def get_max_atten_window_upper_bound(self, blocks_in_primary_pool, return max_atten_window_upper_bound def get_cache_indices(self, request: LlmRequest) -> List[int]: - result = self.impl.get_cache_block_ids(request.py_request_id) + result = self.impl.get_cache_block_ids(request.py_request_id, + self.max_attention_window) assert len(result) == 1 return result[0] @@ -415,7 +419,8 @@ def get_batch_cache_indices( self, request_ids: List[int], ) -> Dict[int, List[int]]: - result = self.impl.get_batch_cache_block_ids(request_ids) + result = self.impl.get_batch_cache_block_ids(request_ids, + self.max_attention_window) for i in range(len(result)): assert (len(result[i])) == 1 result[i] = result[i][0] diff --git a/tensorrt_llm/models/gemma/config.py b/tensorrt_llm/models/gemma/config.py index d95244f79d2..e37926818ba 100644 --- a/tensorrt_llm/models/gemma/config.py +++ b/tensorrt_llm/models/gemma/config.py @@ -12,6 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from json import loads +from pathlib import Path from typing import TYPE_CHECKING, Optional, Union from tensorrt_llm.functional import PositionEmbeddingType @@ -145,6 +147,19 @@ def to_dict(self): } if self.is_gemma_3 else {}) } + @staticmethod + def get_hf_config(config_dir: "Union[str, PathLike]"): + import transformers + model_type = loads( + (Path(config_dir) / "config.json").read_text())["model_type"] + HFConfigClass = { + "gemma": transformers.GemmaConfig, + "gemma2": transformers.Gemma2Config, + "gemma3_text": transformers.Gemma3TextConfig, + }[model_type] + hf_config = HFConfigClass.from_pretrained(config_dir) + return hf_config + @classmethod def from_hugging_face( cls, @@ -158,8 +173,7 @@ def from_hugging_face( if isinstance(hf_config_or_dir, transformers.PretrainedConfig): hf_config = hf_config_or_dir else: - hf_config = transformers.GemmaConfig.from_pretrained( - hf_config_or_dir) + hf_config = cls.get_hf_config(hf_config_or_dir) dtype = infer_dtype(dtype, getattr(hf_config, 'torch_dtype', None)) diff --git a/tensorrt_llm/models/generation_mixin.py b/tensorrt_llm/models/generation_mixin.py index d3da7e46711..049f177de65 100644 --- a/tensorrt_llm/models/generation_mixin.py +++ b/tensorrt_llm/models/generation_mixin.py @@ -339,17 +339,17 @@ def prepare_attention_inputs( math.ceil(kv_cache_range[0][2] / tokens_per_block) ]] * num_profiles - num_kv_cache_pools = 1 if num_kv_heads_per_layer is None else len( - set(num_kv_heads_per_layer[num_attn_layers_lower_ranks: - num_attn_layers_lower_ranks + - len(local_attn_layers)])) + NUM_KV_CACHE_POOLS = -1 # dynamic: the number of pools depends on the number of unique window sizes (and of unique GQA group sizes), which is only known at runtime. + # dim_range: min=1, opt=1 (the usual case: no variable GQA and no variable sliding window), max=len(local_attn_layers). + # TODO(nhaber): Benchmark whether making NUM_KV_CACHE_POOLS dynamic incurs a significant performance hit.
+ + kv_pools_range = [[1, 1, len(local_attn_layers)]] * num_profiles kv_cache_block_offsets = Tensor( name=f'kv_cache_block_offsets', dtype=trt.int32, - shape=[num_kv_cache_pools, -1, 2, -1], + shape=[NUM_KV_CACHE_POOLS, -1, 2, -1], dim_range=OrderedDict([ - ('num_kv_cache_pools', - [num_kv_cache_pools] * num_profiles), + ('num_kv_cache_pools', kv_pools_range), ('batch_size_beam_width', bb_range), ('kv', [2] * num_profiles), ('max_blocks_per_seq', max_blocks_per_seq_range), @@ -357,10 +357,9 @@ def prepare_attention_inputs( host_kv_cache_block_offsets = Tensor( name=f'host_kv_cache_block_offsets', dtype=trt.int32, - shape=[num_kv_cache_pools, -1, 2, -1], + shape=[NUM_KV_CACHE_POOLS, -1, 2, -1], dim_range=OrderedDict([ - ('num_kv_cache_pools', - [num_kv_cache_pools] * num_profiles), + ('num_kv_cache_pools', kv_pools_range), ('batch_size_beam_width', bb_range), ('kv', [2] * num_profiles), ('max_blocks_per_seq', max_blocks_per_seq_range), @@ -368,10 +367,9 @@ def prepare_attention_inputs( host_kv_cache_pool_pointers = Tensor( name=f'host_kv_cache_pool_pointers', dtype=trt.int64, - shape=[num_kv_cache_pools, 2], + shape=[NUM_KV_CACHE_POOLS, 2], dim_range=OrderedDict([ - ('num_pools_layers', - [num_kv_cache_pools] * num_profiles), + ('num_pools_layers', kv_pools_range), ('num_pools_kv', [2] * num_profiles), ])) diff --git a/tests/integration/defs/common.py b/tests/integration/defs/common.py index cdeaf6eb8ac..67d2a084f08 100644 --- a/tests/integration/defs/common.py +++ b/tests/integration/defs/common.py @@ -561,6 +561,8 @@ def generate_summary_cmd(example_root, *args, **kwargs): if isinstance(value, bool): if value: summary_cmd.append(f"--{key}") + elif isinstance(value, list): # Support max_attention_window + summary_cmd.extend([f"--{key}", *map(str, value)]) else: summary_cmd.extend([f"--{key}", f"{value}"]) diff --git a/tests/integration/defs/examples/test_gemma.py b/tests/integration/defs/examples/test_gemma.py index f16bfb6deff..5194f7dfa84 100644 --- a/tests/integration/defs/examples/test_gemma.py +++ b/tests/integration/defs/examples/test_gemma.py @@ -71,19 +71,71 @@ def get_ckpt_type(model_path): return ckpt_type +GEMMA_2_9B_IT = "gemma-2-9b-it" +GEMMA_2_27B_IT = "gemma-2-27b-it" +GEMMA_3_1B_IT = "gemma-3-1b-it" +VSWA_ATTENTION = { + GEMMA_2_9B_IT: [4096, 8192], + GEMMA_2_27B_IT: [4096, 8192], + GEMMA_3_1B_IT: [512, 512, 512, 512, 512, 32768] +} +""" +* Gemma-2: (local `4096`: https://huggingface.co/google/gemma-2-9b-it/blob/main/config.json#L27, global `8192`: https://huggingface.co/google/gemma-2-9b-it/blob/main/config.json#L18) +* Gemma-3-1b: (local `512`: https://huggingface.co/google/gemma-3-1b-it/blob/main/config.json#L31, global `32768`: https://huggingface.co/google/gemma-3-1b-it/blob/9b99be8/config.json#L20) +* (global `131072`: All other gemma 3 models https://github.com/huggingface/transformers/blob/ae5ce226644c8576c9047987e6b1d2e9bdeaed24/src/transformers/models/gemma3/modular_gemma3.py#L200C33-L200C40) +""" +VSWA_MODELS = VSWA_ATTENTION.keys() + +GEMMA2_MODELS = {GEMMA_2_9B_IT, GEMMA_2_27B_IT} + + +@pytest.mark.skip(reason="untested") +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("data_type", ['bfloat16']) +@pytest.mark.parametrize("qformat", ['fp8']) +@pytest.mark.parametrize("gemma_model_root", VSWA_MODELS, indirect=True) +def test_llm_hf_gemma_quantization_1gpu_vswa(batch_size, data_type, + gemma_model_root, llm_venv, + cmodel_dir, engine_dir, + gemma_example_root, + llm_datasets_root, llm_rouge_root, + qformat): + max_attention_window = 
VSWA_ATTENTION[Path(gemma_model_root).stem] hf_gemma_quantization_1gpu(batch_size, data_type, gemma_model_root, llm_venv, cmodel_dir, engine_dir, gemma_example_root, llm_datasets_root, llm_rouge_root, qformat, max_attention_window) + + @skip_post_blackwell @skip_pre_hopper @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("data_type", ['bfloat16', 'float16']) @pytest.mark.parametrize("qformat", ['fp8', 'int4_awq', 'int8_sq']) -@pytest.mark.parametrize( - "gemma_model_root", - ["gemma-2b", "gemma-7b", "gemma-2-9b-it", "gemma-2-27b-it"], - indirect=True) +@pytest.mark.parametrize("gemma_model_root", + ["gemma-2b", "gemma-7b", *GEMMA2_MODELS], + indirect=True) def test_llm_hf_gemma_quantization_1gpu(batch_size, data_type, gemma_model_root, llm_venv, cmodel_dir, engine_dir, gemma_example_root, llm_datasets_root, llm_rouge_root, qformat): + hf_gemma_quantization_1gpu(batch_size, data_type, gemma_model_root, + llm_venv, cmodel_dir, engine_dir, + gemma_example_root, llm_datasets_root, + llm_rouge_root, qformat) + + +def hf_gemma_quantization_1gpu(batch_size, + data_type, + gemma_model_root, + llm_venv, + cmodel_dir, + engine_dir, + gemma_example_root, + llm_datasets_root, + llm_rouge_root, + qformat, + max_attention_window: list[int] | None = None): "run gemma quantization tests" print("Convert checkpoint by modelopt...") kv_cache_dtype = 'fp8' if qformat == 'fp8' else 'int8' @@ -121,6 +173,11 @@ def test_llm_hf_gemma_quantization_1gpu(batch_size, data_type, gemma_model_root, if "gemma-7b" in gemma_model_root: threshold_score = 18 + window = [ + "--max_attention_window_size", + *map(str, max_attention_window), + ] if max_attention_window is not None else [] + summary_cmd = [ f"{gemma_example_root}/../summarize.py", "--test_trt_llm", @@ -133,10 +190,27 @@ def test_llm_hf_gemma_quantization_1gpu(batch_size, data_type, gemma_model_root, f"--batch_size={batch_size}", f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}", + *window, ] venv_check_call(llm_venv, summary_cmd) +# max_seq_len=3100: one local window value that won't slide, and one that will +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("data_type", ['bfloat16']) +@pytest.mark.parametrize("test_case", ['other']) +@pytest.mark.parametrize("gemma_model_root", VSWA_MODELS, indirect=True) +def test_llm_gemma_1gpu_summary_vswa(batch_size, data_type, gemma_model_root, + llm_venv, cmodel_dir, engine_dir, + gemma_example_root, llm_datasets_root, + llm_rouge_root, test_case): + max_attention_window = VSWA_ATTENTION[Path(gemma_model_root).stem] + gemma_1gpu_summary(batch_size, data_type, gemma_model_root, llm_venv, + cmodel_dir, engine_dir, gemma_example_root, + llm_datasets_root, llm_rouge_root, test_case, + max_attention_window) + + @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("data_type", ['float16', 'bfloat16']) @pytest.mark.parametrize("test_case", [ @@ -150,13 +224,29 @@ def test_llm_hf_gemma_quantization_1gpu(batch_size, data_type, gemma_model_root, @pytest.mark.parametrize("gemma_model_root", [ "gemma-2b", "gemma-7b", "gemma-2b-torch", "gemma-7b-torch", "gemma-2b-keras", "gemma-7b-keras", "gemma-2b-it-flax", "gemma-7b-it-flax", - "gemma-2-9b-it", "gemma-2-27b-it", "gemma-3-1b-it" + *GEMMA2_MODELS ], indirect=True) def test_llm_gemma_1gpu_summary(batch_size, data_type, gemma_model_root, llm_venv, cmodel_dir, engine_dir, gemma_example_root, llm_datasets_root, llm_rouge_root, test_case): + gemma_1gpu_summary(batch_size, data_type, gemma_model_root, llm_venv, + cmodel_dir, 
engine_dir, gemma_example_root, + llm_datasets_root, llm_rouge_root, test_case) + + +def gemma_1gpu_summary(batch_size, + data_type, + gemma_model_root, + llm_venv, + cmodel_dir, + engine_dir, + gemma_example_root, + llm_datasets_root, + llm_rouge_root, + test_case, + max_attention_window: list[int] | None = None): "run gemma test on 1 gpu" skip_fp8_pre_ada(use_fp8=test_case == "fp8_kv_cache") if "smooth_quant" in test_case and "bfloat16" in data_type: @@ -215,6 +305,10 @@ def test_llm_gemma_1gpu_summary(batch_size, data_type, gemma_model_root, check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env) + window = { + 'max_attention_window_size': max_attention_window + } if max_attention_window is not None else {} + print("Run summarize...") summary_cmd = generate_summary_cmd(gemma_example_root, engine_dir=engine_dir, @@ -222,7 +316,8 @@ def test_llm_gemma_1gpu_summary(batch_size, data_type, gemma_model_root, batch_size=batch_size, tensorrt_llm_rouge1_threshold=15, dataset_dir=llm_datasets_root, - rouge_dir=llm_rouge_root) + rouge_dir=llm_rouge_root, + **window) if ckpt_type == "hf": summary_cmd.extend([ @@ -232,13 +327,6 @@ def test_llm_gemma_1gpu_summary(batch_size, data_type, gemma_model_root, else: summary_cmd.append(f"--vocab_file={vocab_file}") - os.path.basename(gemma_model_root) - if 'gemma-3-1b-it' in gemma_model_root: - max_attention_window_size = [512, 512, 512, 512, 512, 3100] - summary_cmd.append(f"--max_attention_window_size") - for window_size in max_attention_window_size: - summary_cmd.append(str(window_size)) - venv_check_call(llm_venv, summary_cmd) @@ -436,10 +524,9 @@ def test_llm_gemma_1gpu_evaltool(gemma_model_root, llm_venv, cmodel_dir, @skip_pre_hopper -@pytest.mark.parametrize( - "gemma_model_root", - ["gemma-2b", "gemma-7b", "gemma-2-9b-it", "gemma-2-27b-it"], - indirect=True) +@pytest.mark.parametrize("gemma_model_root", + ["gemma-2b", "gemma-7b", *GEMMA2_MODELS], + indirect=True) def test_hf_gemma_fp8_base_bf16_multi_lora(gemma_model_root, llm_venv, cmodel_dir, diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt index f392317cccc..e4e88d623d4 100644 --- a/tests/integration/test_lists/qa/examples_test_list.txt +++ b/tests/integration/test_lists/qa/examples_test_list.txt @@ -35,7 +35,7 @@ examples/test_exaone.py::test_llm_exaone_1gpu[disable_weight_only-exaone_3.0_7.8 examples/test_exaone.py::test_llm_exaone_1gpu[enable_weight_only-exaone_deep_2.4b-float16-nb:1] examples/test_exaone.py::test_llm_exaone_2gpu[exaone_3.0_7.8b_instruct-float16-nb:1] examples/test_gemma.py::test_llm_gemma_1gpu_summary[gemma-2-27b-it-other-bfloat16-8] -examples/test_gemma.py::test_llm_gemma_1gpu_summary[gemma-3-1b-it-other-bfloat16-8] +examples/test_gemma.py::test_llm_gemma_1gpu_summary_vswa[gemma-3-1b-it-other-bfloat16-8] examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8] examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 554dda10c74..89b64d0eb88 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -113,7 +113,7 @@ l0_h100: - examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.2-1b] - examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_1.5b_instruct] - 
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] - - examples/test_gemma.py::test_llm_gemma_1gpu_summary[gemma-3-1b-it-other-bfloat16-8] + - examples/test_gemma.py::test_llm_gemma_1gpu_summary_vswa[gemma-3-1b-it-other-bfloat16-8] - examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16] - unittest/trt/model_api/test_model_level_api.py # 9 mins on H100 - unittest/trt/model_api/test_model_api_multi_gpu.py # 0.5 mins on H100 diff --git a/tests/unittest/bindings/test_bindings_ut.py b/tests/unittest/bindings/test_bindings_ut.py index 3849018bf8c..de5a8a811a4 100644 --- a/tests/unittest/bindings/test_bindings_ut.py +++ b/tests/unittest/bindings/test_bindings_ut.py @@ -555,8 +555,10 @@ def test_KvCache_events_binding(): 'max_beam_width': 1, 'max_attention_window_vec': [10], - 'temporary_attention_window': - 0, + 'temp_attention_window_inputs': + None, + 'dtype': + _tb.DataType.FLOAT, 'sink_token_length': 0, 'stream':