diff --git a/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h b/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h index 27b3de2e3d3..272758936ed 100644 --- a/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h +++ b/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h @@ -97,12 +97,6 @@ class MaxUtilizationScheduler : public BaseCapacityScheduler RequestList const& activeRequests) const; private: - /// @return {fitsKvCache, fitsPeft} - std::pair<bool, bool> trySchedulingRequestMaxUtilization(kv_cache_manager::BaseKVCacheManager const& kvCacheManager, - OptionalRef<BasePeftCacheManager const> peftCacheManager, std::shared_ptr<LlmRequest> const& req, - RequestVector& scheduledRequests, SizeType32& numScheduledBlocks, SizeType32& numScheduledPeftPages, - std::unordered_set<uint64_t>& seenTaskIds) const; - SizeType32 mMaxNumRequests; /// @brief Boolean that indicates if multiple micro batches might be in flight bool mManyMicroBatches; diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index 7e89241a2e4..335ce77a4cd 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -52,6 +52,7 @@ static constexpr SizeType32 kPrimaryLevel = 0; static constexpr SizeType32 kSecondaryLevel = 1; class KVCacheBlock; +class BlockManager; class KVCacheManager; class KVCacheTransferManager; @@ -68,6 +69,33 @@ using LoraTaskIdType = tensorrt_llm::runtime::LoraTaskIdType; template <typename T> using OptionalRef = tensorrt_llm::common::OptionalRef<T>; +struct TempAttentionWindowInputs +{ + bool pagedContextFMHA; + SizeType32 maxInputLen; + SizeType32 maxNumTokens; +}; + +struct WindowSizeMetadata +{ + SizeType32 absolutePoolsOffset; // cumulative number of pools up to this manager + SizeType32 numPools; // number of managed pools + SizeType32 maxTokenNum; // Maximum token length (including bubble) + SizeType32 maxBlocksPerSeq; + SizeType32 maxNumBlocks; // Number of primary+secondary blocks allotted to the windowSize + SizeType32 temporaryAttentionWindow; // Temporary kv cache length per sequence. + // Only needed when chunked context + sliding window attention are used + // together, and only considered when allocating blocks.
+ + std::string toString() + { + return tensorrt_llm::common::fmtstr( + "WindowSizeMetadata{ .absolutePoolsOffset=%d, .numPools=%d, .maxTokenNum=%d, .maxBlocksPerSeq=%d, " + ".maxNumBlocks=%d, .temporaryAttentionWindow=%d }", + absolutePoolsOffset, numPools, maxTokenNum, maxBlocksPerSeq, maxNumBlocks, temporaryAttentionWindow); + } +}; + struct BlockKey { bool usesExtraIds = false; @@ -300,22 +328,38 @@ class GenerationRequest using SizeType32 = tensorrt_llm::runtime::SizeType32; explicit GenerationRequest(LlmRequest::RequestIdType requestId, SizeType32 numTokens, SizeType32 beamWidth, - SizeType32 maxBlocks, SizeType32 cyclicThreshold, SizeType32 numPools = 1, + std::map<SizeType32, WindowSizeMetadata> const& windowSizeToMetadata, executor::KvCacheRetentionConfig kvCacheRetentionConfig = executor::KvCacheRetentionConfig()) : mRequestId(requestId) , mNumTokens(numTokens) , mBeamWidth(beamWidth) - , mCacheBlockIds(beamWidth) - , mCacheBlockIndices{runtime::BufferManager::cpu( - runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}), - runtime::TRTDataType<tensorrt_llm::kernels::KVCacheIndex>::value)} , mKvCacheRetentionConfig(std::move(kvCacheRetentionConfig)) - , mCyclicThreshold(cyclicThreshold) - { - auto cacheBlockIdsRange = runtime::BufferRange<tensorrt_llm::kernels::KVCacheIndex>(*mCacheBlockIndices); - std::fill(cacheBlockIdsRange.begin(), cacheBlockIdsRange.end(), - tensorrt_llm::kernels::KVCacheIndex{ - std::numeric_limits<tensorrt_llm::kernels::KVCacheIndex::UnderlyingType>::max()}); + // min window size + sink bubble length + // Why use the minimum window size: + // Chunked Prefill + Reuse calls `setPrepopulatedPromptLen()` which sets + // `mContextCurrentPosition` - this cannot be done for some window sizes and + // not for others; the state needs to remain identical for all window sizes. So + // we currently resort to strictly disabling the reuse code path for all window + // sizes at once or enabling it for all window sizes at once.
, mCyclicThreshold(windowSizeToMetadata.cbegin()->second.maxTokenNum) + { + auto const numWindowSizes = windowSizeToMetadata.size(); + mCacheBlockIds.reserve(numWindowSizes); + mCacheBlockIndices.reserve(numWindowSizes); + for (auto const [windowSize, metadata] : windowSizeToMetadata) + { + mCacheBlockIds[windowSize] = std::vector<std::vector<KVCacheBlock::IdType>>(beamWidth); + auto const numPools = metadata.numPools; + auto const maxBlocks = metadata.maxBlocksPerSeq; + mCacheBlockIndices[windowSize] + = runtime::BufferManager::cpu(runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}), + runtime::TRTDataType<tensorrt_llm::kernels::KVCacheIndex>::value); + auto cacheBlockIdsRange + = runtime::BufferRange<tensorrt_llm::kernels::KVCacheIndex>(*mCacheBlockIndices.at(windowSize)); + std::fill(cacheBlockIdsRange.begin(), cacheBlockIdsRange.end(), + tensorrt_llm::kernels::KVCacheIndex{ + std::numeric_limits<tensorrt_llm::kernels::KVCacheIndex::UnderlyingType>::max()}); + } } void addNewTokens(SizeType32 n) @@ -345,42 +389,43 @@ class GenerationRequest return mBeamWidth; } - [[nodiscard]] std::vector<std::vector<KVCacheBlock::IdType>> const& getCacheBlockIds() const + [[nodiscard]] std::vector<std::vector<KVCacheBlock::IdType>> const& getCacheBlockIds(SizeType32 windowSize) const { - return mCacheBlockIds; + return mCacheBlockIds.at(windowSize); } - [[nodiscard]] runtime::ITensor& getCacheBlockIndices() + [[nodiscard]] runtime::ITensor& getCacheBlockIndices(SizeType32 windowSize) { - return *mCacheBlockIndices; + return *(mCacheBlockIndices.at(windowSize)); } - [[nodiscard]] runtime::ITensor const& getCacheBlockIndices() const + [[nodiscard]] runtime::ITensor const& getCacheBlockIndices(SizeType32 windowSize) const { - return *mCacheBlockIndices; + return *(mCacheBlockIndices.at(windowSize)); } - void addCacheBlock(SizeType32 beamIdx, KVCacheBlock::IdType blockId) + void addCacheBlock(SizeType32 windowSize, SizeType32 beamIdx, KVCacheBlock::IdType blockId) { - mCacheBlockIds.at(beamIdx).push_back(blockId); + mCacheBlockIds.at(windowSize).at(beamIdx).push_back(blockId); } - void changeCacheBlock(SizeType32 beamIdx, SizeType32 pagedBlockIdx, KVCacheBlock::IdType blockId) + void changeCacheBlock( + SizeType32 windowSize, SizeType32 beamIdx, SizeType32 pagedBlockIdx, KVCacheBlock::IdType blockId) { - mCacheBlockIds.at(beamIdx).at(pagedBlockIdx) = blockId; + mCacheBlockIds.at(windowSize).at(beamIdx).at(pagedBlockIdx) = blockId; } - void clearCacheBlocks() + void clearCacheBlocks(SizeType32 windowSize) { - for (auto& beamBlockIds : mCacheBlockIds) + for (auto& beamBlockIds : mCacheBlockIds.at(windowSize)) { beamBlockIds.clear(); } } - void removeLastBlock() + void removeLastBlock(SizeType32 windowSize) { - for (auto& beamBlockIds : mCacheBlockIds) + for (auto& beamBlockIds : mCacheBlockIds.at(windowSize)) { beamBlockIds.pop_back(); } @@ -411,14 +456,14 @@ SizeType32 mNumTokens; // Number of beams SizeType32 mBeamWidth; - // List of block ids allocated for each beam of the sequence - std::vector<std::vector<KVCacheBlock::IdType>> mCacheBlockIds; - // Tensor of block indices allocated for each beam of the sequence - runtime::ITensor::SharedPtr mCacheBlockIndices; + // List of block ids allocated per window size, for each beam of the sequence + std::unordered_map<SizeType32, std::vector<std::vector<KVCacheBlock::IdType>>> mCacheBlockIds; + // Tensor of block indices allocated per window size, for each beam of the sequence + std::unordered_map<SizeType32, runtime::ITensor::SharedPtr> mCacheBlockIndices; // The retention priority to assign to decode blocks executor::KvCacheRetentionConfig mKvCacheRetentionConfig; - // Number of tokens at which the KV Cache begins sliding + // Number of tokens at which the KV Cache begins sliding [for the minimum attention window] SizeType32 mCyclicThreshold; };
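The constructor above keys every per-request bookkeeping structure by window size, so callers address blocks as (windowSize, beam, blockIdx). A minimal sketch of reading that state back through the accessors declared in this header; `seq` and `windowSizeToMetadata` are hypothetical names mirroring the constructor arguments, not part of the patch:

```cpp
// Sketch, assuming `seq` is a GenerationRequest built from `windowSizeToMetadata`.
for (auto const& [windowSize, metadata] : windowSizeToMetadata)
{
    // Each window size owns an independent list of block ids per beam, plus an
    // offset tensor shaped [numPools, beamWidth, 2, maxBlocksPerSeq].
    auto const& beamBlockIds = seq.getCacheBlockIds(windowSize).at(/*beamIdx=*/0);
    TLLM_LOG_DEBUG("windowSize=%d holds %zu blocks (maxBlocksPerSeq=%d)", windowSize,
        beamBlockIds.size(), metadata.maxBlocksPerSeq);
}
```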
@@ -456,7 +501,7 @@ class KVCacheBlockPool } }; -// The BlockManager manages the metadata of KVCacheBlocks. +// The WindowBlockManager manages the metadata of KVCacheBlocks. // It manages multiple arrays of cache blocks called pools. // Layers with the same number of kv heads are grouped under the same pool. // Each pool has shape [max_blocks, num_layers, 2, num_kv_heads, tokens_per_block, head_size], where num_layers refers @@ -464,6 +509,7 @@ class KVCacheBlockPool // The metadata of KVCacheBlocks is shared between layers, so each block spans all of the managed pool - an allocated // block matches some chunk of memory in each pool. The shape of the chunk in every pool is [2, num_kv_heads, // tokens_per_block, head_size]. The size per block and number of blocks are pre-determined and set in the constructor. +// WindowBlockManager maintains a list of free blocks at any time. // // FP4 KV caches allocate additional pools for block scale factors. These pools have the same // shape as the regular KV pools, except that the last dim is head_size / N where N is determined @@ -471,9 +517,9 @@ class KVCacheBlockPool // // BlockManager maintains a list of free blocks at any time. // Alloc pops off the block at the front, and Free pushes it back to the vector. -// BlockManager maintains a vector of lists of request ids to allocated blocks +// WindowBlockManager maintains a vector of lists of request ids to allocated blocks // per sequence. This can be used to Free all blocks belonging to a sequence. -class BlockManager +class WindowBlockManager { public: using SizeType32 = tensorrt_llm::runtime::SizeType32; @@ -482,17 +528,17 @@ class BlockManager using BlockMap = std::unordered_multimap<size_t, BlockPtr>; using BlockMapIterRange = std::pair<BlockMap::const_iterator, BlockMap::const_iterator>; - explicit BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, - SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, - SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks, - CacheType cacheType = CacheType::kSELF, - std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt, - std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false, - bool enablePartialReuse = true, bool copyOnPartialReuse = true); + explicit WindowBlockManager(nvinfer1::DataType dtype, SizeType32 windowSize, + std::vector<SizeType32> const& managedLayers, std::vector<SizeType32> const& numKvHeadsPerLayer, + SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, + SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, + bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority, + std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse, + bool copyOnPartialReuse); - ~BlockManager(); + ~WindowBlockManager(); - void allocatePools(nvinfer1::DataType dtype, bool useUvm); + void allocatePools(bool useUvm); void releasePools(); @@ -507,15 +553,17 @@ class BlockManager //! \brief Allocate new block for each beam of the sequence. //! \details Might free cached blocks if no free blocks are available. - void allocateBlock(GenerationRequest& sequence, bool shareAmongBeams = false); + void allocateBlock(GenerationRequest& sequence, bool shareAmongBeams); void replaceSharedBlock(GenerationRequest& sequence, SizeType32 blockIdx); //! \brief Get the ids of all newly allocated (not reused) blocks for the sequence. std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds(GenerationRequest const& sequence) const; //! \brief Release blocks of the sequence.
Store blocks for reuse if llmRequest is provided. - void releaseBlocks(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt); + void storeBlocksForReuse(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest); + + //! \brief Release blocks of the sequence. + void releaseBlocks(GenerationRequest& sequence); //! \brief Simulate freeing all blocks for that sequence to check impact on number of free blocks void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId); @@ -523,6 +571,16 @@ class BlockManager //! \brief Release last block in the sequence void releaseLastBlock(GenerationRequest& sequence); + [[nodiscard]] SizeType32 getWindowSize() const noexcept + { + return mWindowSize; + } + + [[nodiscard]] std::string const& getLogPrefix() const noexcept + { + return mLogPrefix; + } + [[nodiscard]] SizeType32 getNumFreeBlocks() const noexcept; [[nodiscard]] SizeType32 getNumAllocTotalBlocks() const @@ -550,15 +608,12 @@ class BlockManager return mMissedBlocks; } - [[nodiscard]] std::deque<executor::KVCacheEvent> getLatestEvents( - std::optional<std::chrono::milliseconds> timeout) const; - [[nodiscard]] bool hasFreeBlocks(SizeType32 numRequired = 1) const noexcept { return getNumFreeBlocks() >= numRequired; } - [[nodiscard]] bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const noexcept + [[nodiscard]] bool schedulingHasFreeBlocks(SizeType32 numRequired) const noexcept { return mSchedulingNumFreeBlocks >= numRequired; } @@ -599,14 +654,9 @@ class BlockManager return std::count_if(mPools.begin(), mPools.end(), [](auto const& pool) { return !pool.containsBlockScales; }); } - [[nodiscard]] runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 poolIdx) const - { - return mPools.at(poolIdx).primaryPtr; - } - - [[nodiscard]] runtime::ITensor::SharedPtr getSecondaryPool(SizeType32 poolIdx) const + [[nodiscard]] KVCacheBlockPool const& getPool(SizeType32 poolIdx) const { - return mPools.at(poolIdx).secondaryPtr; + return mPools.at(poolIdx); } [[nodiscard]] bool containsBlockScales(SizeType32 poolIdx) const @@ -614,11 +664,6 @@ class BlockManager return mPools.at(poolIdx).containsBlockScales; } - [[nodiscard]] SizeType32 getNumLayers() const - { - return mNumLayers; - } - [[nodiscard]] SizeType32 getNumPrimaryBlocks() const { return mNumPrimaryBlocks; @@ -629,14 +674,9 @@ class BlockManager return mNumSecondaryBlocks; } - [[nodiscard]] CacheType getCacheType() const - { - return mCacheType; - } - [[nodiscard]] SizeType32 getLayerPoolIdx(SizeType32 layerIdx) const { - return mLayerToPool.at(layerIdx); + return mLayerToPoolIndex.at(layerIdx); } //! \brief Maps a global layer index to its layer index within its pool. //! //! \details gives the layer index into the pool returned by getLayerPoolIdx(i). [[nodiscard]] SizeType32 getPoolLayerIdx(SizeType32 layerIdx) const { - return mLayerIndexToPoolLayerIndex.at(layerIdx); + return mLayerToIndexWithinPool.at(layerIdx); } - //! \brief Get index in pool to K or V block. - //! \param blockId the blockId as returned by getBlockId() - //! \param fieldIdx either 0 (K) or 1 (V), - //! \param poolIdx the index of the pool for which the index is calculated (each pool has different strides) - [[nodiscard]] kernels::KVCacheIndex getKOrVBlockIndex( - KVCacheBlock::IdType blockId, SizeType32 fieldIdx, SizeType32 poolIdx) const; + void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx, + SizeType32 blockIdx, KVCacheBlock::IdType blockId) const; //! \brief Bring offloaded block from secondary to primary memory. //!
\details Does nothing of block is already in primary memory. + //! \details Does nothing if block is already in primary memory. void onboardBlock(BlockPtr const& offloadBlock); //! \brief Bring block from primary to secondary memory. - //! \details Does nothing of block is already in secondary memory. + //! \details Does nothing if block is already in secondary memory. void offloadBlock(BlockPtr const& block); //! \brief Find first new block that must be allocated for context phase and return its concatenated token vectors. @@ -675,24 +711,35 @@ //! \brief Perform per-request bookkeeping void refreshBlocks(); - void flushIterationEvents() - { - if (mEventManager) - { - mEventManager->flush(); - } - } - [[nodiscard]] static bool blockInRadixTree(BlockPtr const& block); - [[nodiscard]] bool verifyQueueIntegrity(); + //! \brief Store blocks in cached blocks. + //! \param blockKeys Key of each block. + //! \param blockIds Id of each block. + void storeBlocks(std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds); - //! \brief Store context blocks - void storeContextBlocks(GenerationRequest& sequence, LlmRequest const& llmRequest); + void addBlockToHashMap(BlockPtr const& block); - [[nodiscard]] bool isEnableHashKey() const + void removeBlockFromHashMap(BlockPtr const& block); + + [[nodiscard]] bool verifyQueueIntegrity(); + + // Only needed when sliding window attention + paged context fmha are used together. + // In that case, a temporary kv cache buffer with maximum chunk size (maxNumTokens) is needed. + // TODO: There are several things that can be improved later. + // 1. A dynamic temporary kv cache allocation based on real chunk size might be needed. + // 2. Reuse the same temporary kv cache buffer among all layers in the same pool. + [[nodiscard]] SizeType32 calculateTemporaryAttentionWindow( + std::optional<TempAttentionWindowInputs> const& inputs) const { - return mEnableHashKey; + + if (inputs && inputs->pagedContextFMHA && (inputs->maxInputLen > mWindowSize)) + { + auto window = std::min(inputs->maxNumTokens, inputs->maxInputLen - mWindowSize); + window = std::max(window, 0); // clamp negative values to 0 + return window; + } + return 0; } private: @@ -702,11 +749,6 @@ class BlockManager //! \brief Add single block to all beams of sequence. void addBlockToAllBeams(BlockPtr& block, GenerationRequest& sequence); - //! \brief Store blocks in cached blocks. - //! \param blockKeys Key of each block. - //! \param blockIds Id of each block. - void storeBlocks(std::vector<BlockKey> blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds); - //! \brief Try to load blocks from cache. Allocate new blocks if necessary. //! \param blockKeys Key of each block. //! \param sequence Sequence to which blocks are assigned. @@ -720,17 +762,16 @@ class BlockManager std::optional<std::chrono::milliseconds> durationMs = std::nullopt); //! \brief Free block from previous block and claim it from free blocks list. - void claimLeafBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt, + void claimLeafBlock(BlockPtr const& block, std::optional<executor::RetentionPriority> priority = std::nullopt, std::optional<std::chrono::milliseconds> durationMs = std::nullopt); - void addBlockToHashMap(BlockPtr block); - - void removeBlockFromHashMap(BlockPtr block); - -private:
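A worked example for `calculateTemporaryAttentionWindow` above: with paged context FMHA enabled, `windowSize = 4096`, `maxInputLen = 10240` and `maxNumTokens = 8192`, the result is `min(8192, 10240 - 4096) = 6144` extra tokens of temporary KV cache per sequence. Below is a self-contained sketch of the same arithmetic; it is a standalone reimplementation for illustration, not the class member itself:

```cpp
#include <algorithm>
#include <cstdint>
#include <optional>

struct TempAttentionWindowInputs
{
    bool pagedContextFMHA;
    std::int32_t maxInputLen;
    std::int32_t maxNumTokens;
};

// Mirrors WindowBlockManager::calculateTemporaryAttentionWindow for one window size:
// only chunked context (paged context FMHA) running past the sliding window needs
// temporary blocks, and never more than one chunk (maxNumTokens) worth.
std::int32_t temporaryAttentionWindow(
    std::optional<TempAttentionWindowInputs> const& inputs, std::int32_t windowSize)
{
    if (inputs && inputs->pagedContextFMHA && inputs->maxInputLen > windowSize)
    {
        return std::max(std::min(inputs->maxNumTokens, inputs->maxInputLen - windowSize), 0);
    }
    return 0; // no chunked-context/sliding-window interaction; nothing extra needed
}

// temporaryAttentionWindow(TempAttentionWindowInputs{true, 10240, 8192}, 4096) == 6144
```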
//! \brief For FP4 quantization. Creates pool objects for FP4 block scalars. void createBlockScalePools(SizeType32 blockSize); +private: + nvinfer1::DataType mDataType; + SizeType32 mWindowSize; + // Number of blocks in pools SizeType32 mNumPrimaryBlocks; SizeType32 mNumSecondaryBlocks; @@ -741,20 +782,17 @@ class BlockManager // Pool per unique numKvHeads in the model std::vector<KVCacheBlockPool> mPools; - // Matching of model layers to their pools - std::vector<SizeType32> mLayerToPool; - // See getPoolLayerIdx - std::vector<SizeType32> mLayerIndexToPoolLayerIndex; + // Matching layers to their respective pools: {<layer idx>: <pool idx>, <layer idx>: <pool idx>}, etc. + std::unordered_map<SizeType32, SizeType32> mLayerToPoolIndex; + // Matching layers to their index *within* their respective pools: {..., <layer idx>: <index within pool>}. See + // getPoolLayerIdx + std::unordered_map<SizeType32, SizeType32> mLayerToIndexWithinPool; // Whether offloaded blocks should be onboarded before reuse. bool mOnboardBlocks; // Buffer manager runtime::BufferManager mBufferManager; - // Size of a single KV heads - SizeType32 mSizePerHead; - // Number of layers - SizeType32 mNumLayers; // Used to keep track of number of free blocks during scheduling SizeType32 mSchedulingNumFreeBlocks; // Number of tokens per one block @@ -789,22 +827,338 @@ class BlockManager // max_num_tokens(For DeepSeek). Controlled by mCacheType SizeType32 mKVFactor; std::set<KVCacheBlock::IdType> reusedBlockIds; + std::string const mLogPrefix; // Number of reused tokens double mReusedTokens; // Total number of input tokens double mTotalInputTokens; - // Whether or not to maintain a hashmap of blocks. bool mEnableHashKey; - // Whether blocks that are partially matched should be reused. bool mEnablePartialReuse; - // Whether partially matched blocks that are already in use should be copied and reused. bool mCopyOnPartialReuse; +}; + +class BlockManager +{ +public: + using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>; + using SizeType32 = tensorrt_llm::runtime::SizeType32; + using BaseEvictionPolicy = tensorrt_llm::batch_manager::eviction_policy::BaseEvictionPolicy; + + explicit BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, + SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, + SizeType32 maxNumSequences, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength, + SizeType32 maxBeamWidth, std::vector<SizeType32> const& maxAttentionWindowVec, + std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType = CacheType::kSELF, + std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt, + std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false, + bool enablePartialReuse = true, bool copyOnPartialReuse = true); + + //! \brief Calculate the number of blocks each window size heap receives of blocksIn{Primary/Secondary}Pool + //! \details Example: (total=16384, uniqueWindowSizeToLayers={1024: [1], 4096: [0, 4, 5], 8192: [2, 3]}) + //! Would Return: {1024: 565, 4096: 6780, 8192: 9039} [sums to total]. + //! See: TEST_F(KVCacheManagerTest, BlockManagerTestBlocksPerWindowSize). + //!
\return Map of each window size to its allotted number of blocks. + static std::map<SizeType32, SizeType32> blocksPerWindowSize( + SizeType32 totalBlocks, std::map<SizeType32, std::vector<SizeType32>> const& uniqueWindowSizeToLayers); + + void allocatePools(bool useUvm); + + void addSequence(GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks, + LlmRequest& llmRequest, SizeType32 windowSize); + + void addSequence( + GenerationRequest& sequence, SizeType32 numBlocks, SizeType32 unsharedBlockIdx, SizeType32 windowSize); + + void allocateBlock(GenerationRequest& sequence, SizeType32 windowSize); + + void replaceSharedBlock(GenerationRequest& sequence, SizeType32 windowSize, SizeType32 blockIdx); + + std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds( + GenerationRequest const& sequence, SizeType32 windowSize) const; + + void releaseBlocks(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt); + + void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId); + + void releaseLastBlock(GenerationRequest& sequence, SizeType32 windowSize); + + void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx, + SizeType32 blockIdx, KVCacheBlock::IdType blockId, SizeType32 windowSize) const; + + // WILL NOT WORK FOR VARIABLE WINDOW ATTENTION + [[nodiscard]] std::optional<BlockKey> findNewContextBlock( + VecUniqueTokens const& uniqueTokens, LlmRequest const& llmRequest) const; + + //! \brief Bring offloaded block from secondary to primary memory for window size. + //! \details Does nothing if block is already in primary memory. + void onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowSize); + + //! \brief Bring block from primary to secondary memory for window size. + //! \details Does nothing if block is already in secondary memory. + void offloadBlock(BlockPtr const& block, SizeType32 windowSize); + + void storeBlocks(std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds, + SizeType32 windowSize) + { + mWindowBlockManagers.at(windowSize).storeBlocks(blockKeys, blockIds); + } + + [[nodiscard]] bool verifyQueueIntegrity(SizeType32 windowSize); + + void releasePools(); + + void startScheduling(); + + [[nodiscard]] std::map<SizeType32, SizeType32> getNumFreeBlocksPerWindowSize() const + { + std::map<SizeType32, SizeType32> numFreeBlocksPerWindowSize; + for (auto const& [windowSize, manager] : mWindowBlockManagers) + { + numFreeBlocksPerWindowSize[windowSize] = manager.getNumFreeBlocks(); + } + return numFreeBlocksPerWindowSize; + } + + [[nodiscard]] SizeType32 getNumFreeBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumFreeBlocks(); }); + } + + [[nodiscard]] bool schedulingHasFreeBlocks(SizeType32 numRequired, SizeType32 windowSize) const + { + return mWindowBlockManagers.at(windowSize).schedulingHasFreeBlocks(numRequired); + } + + [[nodiscard]] SizeType32 getNumAllocTotalBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumAllocTotalBlocks(); }); + } + + [[nodiscard]] SizeType32 getNumAllocNewBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumAllocNewBlocks(); }); + } + + [[nodiscard]] SizeType32 getNumReusedBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumReusedBlocks(); }); + } + + [[nodiscard]] SizeType32 getNumMissedBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumMissedBlocks(); }); + } + + [[nodiscard]] SizeType32 getNumLayers() const + { + return mNumLayers; + } + + [[nodiscard]] CacheType getCacheType() const + { + return mCacheType; + } + + [[nodiscard]] SizeType32 getLayerPoolIdx(SizeType32 layerIdx) const
+ { + auto const& manager = windowManagerByLayer(layerIdx); + auto const absoluteOffset = absolutePoolsOffset(manager); + auto const relativePoolIndex = manager.getLayerPoolIdx(layerIdx); + return absoluteOffset + relativePoolIndex; + } + + [[nodiscard]] SizeType32 getPoolLayerIdx(SizeType32 layerIdx) const + { + return windowManagerByLayer(layerIdx).getPoolLayerIdx(layerIdx); + } + + [[nodiscard]] SizeType32 getTokensPerBlock() const noexcept + { + return mTokensPerBlock; + } + + [[nodiscard]] SizeType32 getStreamDevice() const + { + return mStream->getDevice(); + } + + [[nodiscard]] std::deque<executor::KVCacheEvent> getLatestEvents( + std::optional<std::chrono::milliseconds> timeout) const; + + void flushIterationEvents() + { + if (mEventManager) + { + mEventManager->flush(); + } + } + + [[nodiscard]] SizeType32 getPoolWindowSize(SizeType32 poolIdx) const + { + return mAbsolutePoolToWindowSize.at(poolIdx); + } + + [[nodiscard]] SizeType32 getBlockSize(SizeType32 poolIdx) const + { + return getPool(poolIdx).blockSize; + } + + [[nodiscard]] SizeType32 getNumPools(bool includeBlockScalePools = true) const + { + return sumWindows( + [includeBlockScalePools](auto const& manager) { return manager.getNumPools(includeBlockScalePools); }); + } + + [[nodiscard]] std::map<SizeType32, WindowSizeMetadata> const& getWindowSizesMetadata() const noexcept + { + return mWindowSizeToMetadata; + } + + [[nodiscard]] WindowSizeMetadata getWindowSizeMetadata(SizeType32 windowSize) const noexcept + { + return mWindowSizeToMetadata.at(windowSize); + } + + [[nodiscard]] bool isVariableWindow() const noexcept + { + return mIsVariableWindow; + } + + [[nodiscard]] SizeType32 getMaxBlockPerSeqWhenSingleWindowSize() const + { + TLLM_CHECK_WITH_INFO(!isVariableWindow(), + "This function was called assuming there is only a single window size, and therefore a single " + "maxBlocksPerSeq"); + auto const windowSize = windowManagerByLayer(0).getWindowSize(); + auto const onlyWindowSizeMetadata = getWindowSizeMetadata(windowSize); + return onlyWindowSizeMetadata.maxBlocksPerSeq; + } + + [[nodiscard]] bool isVariableGQA() const noexcept + { + return mIsVariableGQA; + } + + [[nodiscard]] runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 poolIdx) const + { + return getPool(poolIdx).primaryPtr; + } + + [[nodiscard]] runtime::ITensor::SharedPtr getSecondaryPool(SizeType32 poolIdx) const + { + return getPool(poolIdx).secondaryPtr; + } + + [[nodiscard]] SizeType32 getNumAllocatedBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumAllocatedBlocks(); }); + } + + [[nodiscard]] SizeType32 getMaxNumBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getMaxNumBlocks(); }); + } + + [[nodiscard]] BlockPtr const& getBlockById(KVCacheBlock::IdType blockId, SizeType32 windowSize) const + { + return mWindowBlockManagers.at(windowSize).getBlockById(blockId); + } + + [[nodiscard]] WindowBlockManager::BlockMapIterRange getBlocksByHash(size_t hash, SizeType32 windowSize) const + { + return mWindowBlockManagers.at(windowSize).getBlocksByHash(hash); + } + + [[nodiscard]] SizeType32 getNumPrimaryBlocks() const + { + return sumWindows([](auto const& manager) { return manager.getNumPrimaryBlocks(); }); + } + + [[nodiscard]] bool containsBlockScales(SizeType32 poolIdx) const + { + return getPool(poolIdx).containsBlockScales; + } + + void addBlockToHashMap(BlockPtr const& block, SizeType32 windowSize) + { + mWindowBlockManagers.at(windowSize).addBlockToHashMap(block); + } + + void removeBlockFromHashMap(BlockPtr const& block, SizeType32 windowSize) + { +
mWindowBlockManagers.at(windowSize).removeBlockFromHashMap(block); + } + + //! \brief Store context blocks + void storeContextBlocks(GenerationRequest& sequence, LlmRequest const& llmRequest); + + [[nodiscard]] static bool isUseOneMoreBlock( + SizeType32 windowSize, std::optional<SizeType32> maxSequenceLength, SizeType32 maxBeamWidth) + { + bool const isCyclicWindowSize = maxSequenceLength.has_value() && maxSequenceLength.value() > windowSize; + bool const isBeamSearch = maxBeamWidth > 1; + return isCyclicWindowSize && isBeamSearch; + } + + //! \brief Perform per-request bookkeeping + void refreshBlocks(); + + [[nodiscard]] runtime::BufferManager const& getBufferManager(SizeType32 windowSize) const + { + return mWindowBlockManagers.at(windowSize).getBufferManager(); + } + + [[nodiscard]] KVCacheBlockPool const& getPool(SizeType32 poolIdx) const + { + auto const windowSize = getPoolWindowSize(poolIdx); + auto const relativePoolIndex = mAbsolutePoolToRelativePoolIndex.at(poolIdx); + return mWindowBlockManagers.at(windowSize).getPool(relativePoolIndex); + } private: - friend class KVCacheManager; + [[nodiscard]] WindowBlockManager const& windowManagerByLayer(SizeType32 layerIdx) const + { + return mWindowBlockManagers.at(mLayerToWindowSize.at(layerIdx)); + } + + [[nodiscard]] SizeType32 sumWindows(std::function<SizeType32(WindowBlockManager const&)> produce) const + { + return std::accumulate(mWindowBlockManagers.cbegin(), mWindowBlockManagers.cend(), SizeType32{0}, + [&produce](SizeType32 acc, auto const& manager) { return acc + produce(manager.second); }); + } + + [[nodiscard]] SizeType32 absolutePoolsOffset(WindowBlockManager const& manager) const + { + auto const windowSize = manager.getWindowSize(); + return getWindowSizeMetadata(windowSize).absolutePoolsOffset; + } + +private: + SizeType32 mNumLayers; + SizeType32 mTokensPerBlock; + std::shared_ptr<KVCacheEventManager> mEventManager; + CudaStreamPtr mStream; + CacheType mCacheType; + + bool mIsVariableWindow; + bool mIsVariableGQA; + + std::map<SizeType32, WindowBlockManager> mWindowBlockManagers; + std::map<SizeType32, WindowSizeMetadata> mWindowSizeToMetadata; + std::vector<SizeType32> mLayerToWindowSize; + std::vector<SizeType32> mAbsolutePoolToWindowSize; + std::vector<SizeType32> mAbsolutePoolToRelativePoolIndex; +}; + +struct OffsetTableDimensions +{ + SizeType32 maxBlocksPerSeq; + SizeType32 numPools; + CacheType cacheType; +};
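The `sumWindows` helper above is the single aggregation primitive of the new BlockManager: every whole-manager statistic is the sum of that statistic over the per-window `WindowBlockManager`s. A standalone sketch of the pattern, using simplified stand-in types rather than the actual classes:

```cpp
#include <cstdint>
#include <functional>
#include <map>
#include <numeric>

using SizeType32 = std::int32_t;

struct FakeWindowManager // stand-in for WindowBlockManager
{
    SizeType32 numFreeBlocks;
};

// Accumulate a per-window statistic across all window sizes, as BlockManager does.
SizeType32 sumWindows(std::map<SizeType32, FakeWindowManager> const& managers,
    std::function<SizeType32(FakeWindowManager const&)> produce)
{
    return std::accumulate(managers.cbegin(), managers.cend(), SizeType32{0},
        [&produce](SizeType32 acc, auto const& entry) { return acc + produce(entry.second); });
}

// sumWindows({{1024, {3}}, {4096, {5}}}, [](auto const& m) { return m.numFreeBlocks; }) == 8
```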
class BaseKVCacheManager @@ -816,7 +1170,7 @@ virtual ~BaseKVCacheManager() {} - virtual void allocatePools(nvinfer1::DataType dtype, bool useUvm = false) = 0; + virtual void allocatePools(bool useUvm = false) = 0; virtual void releasePools() = 0; @@ -834,9 +1188,10 @@ // only used by test [[nodiscard]] virtual SizeType32 getNumReusedBlocks() const noexcept = 0; + [[nodiscard]] virtual KvCacheStats getKvCacheStats() const = 0; - [[nodiscard]] virtual SizeType32 getMaxBlocksPerSeq() const = 0; + [[nodiscard]] virtual OffsetTableDimensions getOffsetTableDimensions() const = 0; [[nodiscard]] virtual std::deque<executor::KVCacheEvent> getLatestEvents( std::optional<std::chrono::milliseconds> timeout = std::nullopt) const @@ -848,13 +1203,16 @@ /// iterations /// @param req The request for which we need to calculate the number of needed KV cache blocks /// @return The number of blocks - [[nodiscard]] virtual SizeType32 getNeededBlocksOneStep(LlmRequest const& req, bool twoStepsLookAhead) const = 0; + [[nodiscard]] virtual SizeType32 getNeededBlocksOneStep( + LlmRequest const& req, bool twoStepsLookAhead, SizeType32 windowSize) const + = 0; /// @brief Function that computes the number of KV cache blocks needed to advance a request to completion (i.e. for /// maxNewTokens) /// @param req The request for which we need to calculate the number of needed KV cache blocks /// @return The number of blocks - [[nodiscard]] virtual SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const = 0; + [[nodiscard]] virtual SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req, SizeType32 windowSize) const + = 0; /// @brief Increase size for request at seqSlotIdx. Allocate new KV cache block(s) if needed. virtual void addToken(LlmRequest::RequestIdType requestId) = 0; @@ -890,8 +1248,6 @@ [[nodiscard]] virtual bool isEnableBlockReuse() const = 0; - [[nodiscard]] virtual bool isUseOneMoreBlock() const = 0; - // void removeToken(SizeType32 seqSlotIdx); virtual void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) = 0; @@ -910,18 +1266,18 @@ //! \details These blocks become reusable from next step. virtual void storeContextBlocks(LlmRequest const& llmRequest) = 0; - [[nodiscard]] virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0; - + //! \brief Get the block ids of a request [per beam] **for a given window size block manager** [[nodiscard]] virtual std::vector<std::vector<KVCacheBlock::IdType>> const& getCacheBlockIds( LlmRequest::RequestIdType requestId, SizeType32 windowSize) const = 0; + //! \brief Get the block ids of a batch of requests [per beam] **for a given window size block manager** [[nodiscard]] virtual std::vector<std::vector<std::vector<KVCacheBlock::IdType>>> getBatchCacheBlockIds( std::vector<LlmRequest::RequestIdType> const& requestIds, SizeType32 windowSize) const = 0; [[nodiscard]] virtual std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds( LlmRequest::RequestIdType requestId, SizeType32 windowSize) const = 0; [[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0; @@ -972,9 +1328,9 @@ class KVCacheManager : public BaseKVCacheManager KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector<SizeType32> const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, CudaStreamPtr stream, - std::optional<SizeType32> maxSequenceLength, bool enableBlockReuse = false, bool onboardBlocks = true, - CacheType cacheType = CacheType::kSELF, + std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength, + bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt, std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false, bool enablePartialReuse = true, bool copyOnpartialReuse = true); @@ -982,9 +1338,9 @@ KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector<SizeType32> const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, int64_t stream, - std::optional<SizeType32> maxSequenceLength, bool enableBlockReuse = false, bool onboardBlocks = true, - CacheType cacheType = CacheType::kSELF, + std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs,
nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, int64_t stream, std::optional<SizeType32> maxSequenceLength, + bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt, std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true, bool copyOnpartialReuse = true); @@ -992,9 +1348,9 @@ class KVCacheManager : public BaseKVCacheManager KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector<SizeType32> const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, CudaStreamPtr stream, - std::optional<SizeType32> maxSequenceLength, bool enableBlockReuse = true, bool onboardBlocks = true, - CacheType cacheType = CacheType::kSELF, + std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength, + bool enableBlockReuse = true, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt, std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false, bool enablePartialReuse = true, bool copyOnpartialReuse = true); @@ -1002,13 +1358,14 @@ class KVCacheManager : public BaseKVCacheManager KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector<SizeType32> const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, int64_t stream, - std::optional<SizeType32> maxSequenceLength, bool enableBlockReuse = false, bool onboardBlocks = true, - CacheType cacheType = CacheType::kSELF, bool enablePartialReuse = true, bool copyOnpartialReuse = true); + std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, int64_t stream, std::optional<SizeType32> maxSequenceLength, + bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, + bool enablePartialReuse = true, bool copyOnpartialReuse = true); ~KVCacheManager() override = default; - void allocatePools(nvinfer1::DataType dtype, bool useUvm = false) override; + void allocatePools(bool useUvm = false) override; void releasePools() override; @@ -1076,9 +1433,15 @@ class KVCacheManager : public BaseKVCacheManager return kvCacheStats; } - [[nodiscard]] SizeType32 getMaxBlocksPerSeq() const override + [[nodiscard]] OffsetTableDimensions getOffsetTableDimensions() const override { - return mMaxBlocksPerSeq; + OffsetTableDimensions dims; + // We use mMaxAttentionWindow here because we prefer to have a single offset table for simplicity, + // and we don't mind that it is as wide as the widest window, because that cost is negligible.
+ dims.maxBlocksPerSeq = mBlockManager.getWindowSizeMetadata(mMaxAttentionWindow).maxBlocksPerSeq; + dims.numPools = mBlockManager.getNumPools(); + dims.cacheType = mBlockManager.getCacheType(); + return dims; } [[nodiscard]] std::deque<executor::KVCacheEvent> getLatestEvents( @@ -1096,13 +1459,15 @@ class KVCacheManager : public BaseKVCacheManager /// iterations /// @param req The request for which we need to calculate the number of needed KV cache blocks /// @return The number of blocks - [[nodiscard]] SizeType32 getNeededBlocksOneStep(LlmRequest const& req, bool twoStepsLookAhead) const override; + [[nodiscard]] SizeType32 getNeededBlocksOneStep( + LlmRequest const& req, bool twoStepsLookAhead, SizeType32 windowSize) const override; /// @brief Function that computes the number of KV cache blocks remaining to advance a request to completion (i.e. /// for maxNewTokens); the allocated blocks are excluded /// @param req The request for which we need to calculate the number of needed KV cache blocks /// @return The number of blocks - [[nodiscard]] SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const override; + [[nodiscard]] SizeType32 getRemainingBlocksToCompletion( + LlmRequest const& req, SizeType32 windowSize) const override; /// @brief Increase size for request with requestId. Allocate new KV cache block(s) if needed. void addToken(LlmRequest::RequestIdType requestId) override; @@ -1149,16 +1514,6 @@ class KVCacheManager : public BaseKVCacheManager return mEnableBlockReuse; } - [[nodiscard]] bool isEnableHashKey() const - { - return mEnableHashKey; - } - - [[nodiscard]] bool isUseOneMoreBlock() const override - { - return mUseOneMoreBlock; - } - void removeToken(LlmRequest::RequestIdType requestId); void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override; @@ -1210,14 +1565,14 @@ class KVCacheManager : public BaseKVCacheManager [[nodiscard]] static SizeType32 calculateMaxBlockRequirementsPerBeam(SizeType32 sequenceLength, SizeType32 sinkTokenLength, SizeType32 maxAttentionWindow, SizeType32 tokensPerBlock); - bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const override; - - std::vector<std::vector<KVCacheBlock::IdType>> const& getCacheBlockIds(LlmRequest::RequestIdType requestId) const override; + std::vector<std::vector<KVCacheBlock::IdType>> const& getCacheBlockIds( + LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override; std::vector<std::vector<std::vector<KVCacheBlock::IdType>>> getBatchCacheBlockIds( - std::vector<LlmRequest::RequestIdType> const& requestIds) const override; + std::vector<LlmRequest::RequestIdType> const& requestIds, SizeType32 windowSize) const override; - std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds(LlmRequest::RequestIdType requestId) const override; + std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds( + LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override; runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const override; @@ -1250,12 +1605,9 @@ class KVCacheManager : public BaseKVCacheManager SizeType32 sinkTokenLength, SizeType32 blockCapacity, SizeType32 beamWidth, SizeType32 tokensPerBlock); private: - void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx, - SizeType32 blockIdx, KVCacheBlock::IdType blockId) const; - - void cacheBlockOffsets(GenerationRequest& seq); - void cacheNewBlockOffsets(GenerationRequest& seq); - void updateNewBlockPointer(GenerationRequest& seq, SizeType32 blockIdx); + void cacheBlockOffsets(GenerationRequest& seq, SizeType32 windowSize); + void cacheNewBlockOffsets(GenerationRequest& seq, SizeType32 windowSize); + void updateNewBlockPointer(GenerationRequest& seq,
SizeType32 windowSize, SizeType32 blockIdx); void updateToken(GenerationRequest& sequence, bool addToken); private: @@ -1263,22 +1615,13 @@ class KVCacheManager : public BaseKVCacheManager SizeType32 mMaxNumSequences; // Maximum beam width SizeType32 mMaxBeamWidth; - // Maximum number of blocks per sequence - SizeType32 mMaxBlocksPerSeq; + nvinfer1::DataType mDataType; // Maximum kv cache length per sequence SizeType32 mMaxAttentionWindow; - // Minimum kv cache length per sequence - SizeType32 mMinAttentionWindow; - // Temporary kv cache length per sequence. - // Only needed when chunked context + sliding window attention are used together. - // And it should only be considered when allocating blocks. - SizeType32 mTemporaryAttentionWindow; // Number of tokens per block SizeType32 mTokensPerBlock; // Number of tokens to fill up the sink tokens to a full block size SizeType32 mSinkBubbleLength; - // Maximum token length (including bubble) - SizeType32 mMaxTokenNum; // Number of tokens in the sink blocks SizeType32 mSinkBlockTokenLength; // Block manager @@ -1289,8 +1632,6 @@ class KVCacheManager : public BaseKVCacheManager bool mEnableBlockReuse; // Whether to enable finding blocks by their hash; ignored when reuse is enabled bool mEnableHashKey; - // Whether use one more block for each sequence - bool mUseOneMoreBlock; // Mutex to protect access to mSequences mutable std::mutex mSequencesMtx; // buffers for static tensors, will be created after allocating pools @@ -1299,4 +1640,90 @@ class KVCacheManager : public BaseKVCacheManager runtime::ITensor::SharedPtr mBlockScalePoolPointers; }; +class NoEvictScheduledBlocksManager +{ +public: + explicit NoEvictScheduledBlocksManager(BaseKVCacheManager const& kvCacheManager) + : mKvCacheManager(kvCacheManager) + , mAvailableBlocks(mKvCacheManager.getBlockManager().getNumFreeBlocksPerWindowSize()) + { + } + + void decrementReservedBlocks(LlmRequest const& req) + { + for (auto& [windowSize, availableBlocks] : mAvailableBlocks) + { + availableBlocks -= mKvCacheManager.getRemainingBlocksToCompletion(req, windowSize); + } + } + + bool enoughAvailableBlocks(LlmRequest const& req) + { + return std::all_of(mAvailableBlocks.cbegin(), mAvailableBlocks.cend(), + [this, &req](auto const& pair) + { + auto const& [windowSize, availableBlocks] = pair; + auto const neededBlocks = mKvCacheManager.getRemainingBlocksToCompletion(req, windowSize); + return neededBlocks <= availableBlocks; + }); + } + +private: + BaseKVCacheManager const& mKvCacheManager; + std::map<SizeType32, SizeType32> mAvailableBlocks; +};
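The two helper classes here (this one and `MaxUtilizationScheduledBlocksManager` just below) replace the schedulers' scalar `reservedBlocks`/`numScheduledBlocks` arithmetic: a request now only fits if it fits for every window size. A sketch of the intended call pattern in a no-evict scheduling loop; `kvCacheManager`, `requests` and `scheduledRequests` are hypothetical, mirroring the GuaranteedNoEvictScheduler changes later in this diff:

```cpp
// Sketch only; see GuaranteedNoEvictScheduler::impl below for the real usage.
kv_cache_manager::NoEvictScheduledBlocksManager blocksManager(kvCacheManager);
for (auto const& req : requests)
{
    if (blocksManager.enoughAvailableBlocks(*req)) // must hold for every window size
    {
        scheduledRequests.emplace_back(req);
        blocksManager.decrementReservedBlocks(*req); // reserve blocks per window size
    }
    else
    {
        break; // guaranteed no-evict: stop at the first request that does not fit
    }
}
```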
+class MaxUtilizationScheduledBlocksManager +{ +public: + MaxUtilizationScheduledBlocksManager(BaseKVCacheManager const& kvCacheManager, bool twoStepsLookAhead) + : mKvCacheManager(kvCacheManager) + , mTwoStepsLookAhead(twoStepsLookAhead) + { + auto const& windowSizes = mKvCacheManager.getBlockManager().getWindowSizesMetadata(); + for (auto const& [windowSize, _] : windowSizes) + { + mNumScheduledBlocks[windowSize] = 0; + } + } + + std::optional<std::map<SizeType32, SizeType32>> prepareNewNumberOfBlocksIfWeEndUpScheduling(LlmRequest const& req) + { + std::map<SizeType32, SizeType32> blocksIfScheduled; + for (auto const& [windowSize, numScheduled] : mNumScheduledBlocks) + { + auto const required = mKvCacheManager.getNeededBlocksOneStep(req, mTwoStepsLookAhead, windowSize); + + TLLM_LOG_DEBUG("MaxUtilizationScheduler: request ID %lu required blocks %i for window size %i", + req.mRequestId, required, windowSize); + + auto const scheduledTotal = numScheduled + required; + bool const hasFreeBlocks + = mKvCacheManager.getBlockManager().schedulingHasFreeBlocks(scheduledTotal, windowSize); + if (!hasFreeBlocks) + { + return std::nullopt; + } + blocksIfScheduled[windowSize] = scheduledTotal; + } + return blocksIfScheduled; + } + + void updateScheduledBlocks(std::map<SizeType32, SizeType32> const& numBlocksIfScheduled) + { + assert(numBlocksIfScheduled.size() == mNumScheduledBlocks.size()); + for (auto const& [windowSize, blocksIfScheduled] : numBlocksIfScheduled) + { + TLLM_LOG_DEBUG( + "MaxUtilizationScheduler: scheduled blocks %i for window size %i", blocksIfScheduled, windowSize); + mNumScheduledBlocks.at(windowSize) = blocksIfScheduled; + } + } + +private: + BaseKVCacheManager const& mKvCacheManager; + std::map<SizeType32, SizeType32> mNumScheduledBlocks; + bool const mTwoStepsLookAhead; +}; + } // namespace tensorrt_llm::batch_manager::kv_cache_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h index 476dce90779..8a44e472728 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h @@ -31,24 +31,28 @@ class BlockRange { }; - BlockRange(BaseKVCacheManager const& cacheManager, LlmRequest::RequestIdType requestId, SizeType32 beam, - SizeType32 poolIdx = 0) - : mManager(&cacheManager) - , mPool(cacheManager.getBlockManager().getPrimaryPool(poolIdx)) - , mBlockIds(cacheManager.getSequence(requestId).getCacheBlockIds().at(beam)) + static BlockRange fromOldAllocatedBlockIds(BaseKVCacheManager const& cacheManager, + LlmRequest::RequestIdType requestId, SizeType32 beam = kFIRST_AND_ONLY_BEAM) { + assert(kFIRST_AND_ONLY_BEAM == beam); + auto const windowSize = firstWindowSize(cacheManager); + auto const blockIds = cacheManager.getSequence(requestId).getCacheBlockIds(windowSize).at(kFIRST_AND_ONLY_BEAM); + return BlockRange(cacheManager, blockIds, requestId); } - BlockRange(BaseKVCacheManager const& cacheManager, std::vector<SizeType32> blockIds, SizeType32 poolIdx = 0) - : mManager(&cacheManager) - , mPool(cacheManager.getBlockManager().getPrimaryPool(poolIdx)) - , mBlockIds(std::move(blockIds)) + static BlockRange fromNewlyAllocatedBlockIds( + BaseKVCacheManager const& cacheManager, LlmRequest::RequestIdType requestId) { + auto const windowSize = firstWindowSize(cacheManager); + auto const blockIds = cacheManager.getNewlyAllocatedBlockIds(requestId, windowSize); + return BlockRange(cacheManager, blockIds, requestId); } - BlockRange(runtime::ITensor::SharedPtr pool, std::vector<SizeType32> const& blockIds) + BlockRange(runtime::ITensor::SharedPtr pool, std::vector<SizeType32> const& blockIds) // Only used in tests : mManager{nullptr} , mPool{std::move(pool)} + , mWindowSize{0} + , mRequestId{0} , mBlockIds{blockIds} { TLLM_CHECK(mPool); @@ -84,25 +88,51 @@ class BlockRange auto& blockManager = mManager->getBlockManager(); for (auto id : mBlockIds) { - blockHashes.emplace_back(blockManager.getBlockById(id)->getHash()); + blockHashes.emplace_back(blockManager.getBlockById(id, mWindowSize)->getHash()); } return blockHashes; } void updatePoolIdx(SizeType32 poolIdx) { - if (mManager) + TLLM_CHECK(mManager); + mPool = mManager->getBlockManager().getPrimaryPool(poolIdx); + auto const newWindowSize = mManager->getBlockManager().getPoolWindowSize(poolIdx); + if (newWindowSize != mWindowSize) { - mPool = mManager->getBlockManager().getPrimaryPool(poolIdx); + mWindowSize = newWindowSize; + mBlockIds = mManager->getSequence(mRequestId).getCacheBlockIds(mWindowSize).at(kFIRST_AND_ONLY_BEAM); } } friend class BlockIterator; +private: +
BlockRange( + BaseKVCacheManager const& cacheManager, std::vector<SizeType32> blockIds, LlmRequest::RequestIdType requestId) + : mManager(&cacheManager) + , mPool(cacheManager.getBlockManager().getPrimaryPool(kFIRST_POOL_INDEX)) + , mWindowSize(firstWindowSize(cacheManager)) + , mRequestId(requestId) + , mBlockIds(std::move(blockIds)) + { + } + + static SizeType32 firstWindowSize(BaseKVCacheManager const& cacheManager) + { + constexpr SizeType32 FIRST_POOL_IDX = 0; + return cacheManager.getBlockManager().getPoolWindowSize(FIRST_POOL_IDX); + } + private: BaseKVCacheManager const* mManager; runtime::ITensor::SharedPtr mPool; + SizeType32 mWindowSize; + const LlmRequest::RequestIdType mRequestId; std::vector<SizeType32> mBlockIds; + + static constexpr SizeType32 kFIRST_AND_ONLY_BEAM = 0; + static constexpr SizeType32 kFIRST_POOL_INDEX = 0; }; class BlockIterator diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp index eb787936e50..f87b128ccf7 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp @@ -49,7 +49,7 @@ void CacheFormatter::formatOutput(LlmRequest const& llmRequest, constexpr SizeType32 beam{0}; auto& blockManager = mCacheManager->getBlockManager(); size_t requestBlockNum = llmRequest.getRequestedBlockHashes().size(); - auto blockRange = BlockRange(*mCacheManager, llmRequest.mRequestId, beam); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mCacheManager, llmRequest.mRequestId, beam); if (requestBlockNum < blockRange.size() && requestBlockNum > 0) { // handle block reuse, the prefix blocks are reused @@ -109,7 +109,7 @@ void CacheFormatter::formatOutput(LlmRequest const& llmRequest, } TLLM_CHECK(!inputKvCacheBlocks.empty()); TLLM_CHECK(blockNum > 0); - int deviceId = mCacheManager->getBlockManager().getBufferManager().getStream().getDevice(); + int deviceId = mCacheManager->getBlockManager().getStreamDevice(); if (common::getEnvTryZCopyForKVCacheTransfer() && (destConfig.getParallelConfig().mPipelineParallelism @@ -318,8 +318,7 @@ void CacheFormatter::formatInput(LlmRequest const& llmRequest, "Start receiving KV cache for request ID: %ld, context request ID: %ld.", llmRequest.mRequestId, llmRequest.getContextPhaseParams().value().getReqId()); TLLM_CHECK(!connections.empty()); - auto blockRange = BlockRange(*mCacheManager, mCacheManager->getNewlyAllocatedBlockIds(llmRequest.mRequestId)); - + auto blockRange = BlockRange::fromNewlyAllocatedBlockIds(*mCacheManager, llmRequest.mRequestId); std::vector<runtime::ITensor::SharedPtr> recvBufferTmps; std::vector<runtime::ITensor::SharedPtr> outputBuffers; auto const numPools = mCacheManager->getBlockManager().getNumPools();
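With the named constructors above, callers no longer pick a pool index up front: a range starts at the first pool, and `updatePoolIdx` re-resolves the per-window block ids whenever the new pool belongs to a different window size. A sketch of walking every pool of one request, using only the API shown in this diff (`cacheManager` and `requestId` are hypothetical):

```cpp
// Sketch only: visit each pool of a request's KV cache, window-size aware.
auto range = BlockRange::fromOldAllocatedBlockIds(cacheManager, requestId);
auto const numPools = cacheManager.getBlockManager().getNumPools();
for (SizeType32 poolIdx = 0; poolIdx < numPools; ++poolIdx)
{
    // Swaps the pool pointer and, if the window size changed, the block-id list.
    range.updatePoolIdx(poolIdx);
    auto const hashes = range.getBlockHashes(); // hashes of this pool's window blocks
}
```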
diff --git a/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp b/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp index 023a7c30e18..6805be14f2a 100644 --- a/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp +++ b/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp @@ -199,25 +199,32 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl( RequestVector scheduledRequests; // Now check if we can add pending requests - auto const numFreeBlocks = kvCacheManager.getNumFreeBlocks(); - auto const numFreeCrossBlocks = crossKvCacheManager ? crossKvCacheManager->getNumFreeBlocks() : 0; auto const maxPeftCachePages = peftCacheManager ? peftCacheManager->getMaxDevicePages() : std::numeric_limits<SizeType32>::max(); + // The optimization of delaying requests won't work for variable window attention + bool skippingIsRelevant = (!kvCacheManager.getBlockManager().isVariableWindow()) + && (!crossKvCacheManager || !crossKvCacheManager->getBlockManager().isVariableWindow()); + // Keep track of blocks contributed by requests in context phase std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedContextBlocks; std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedCrossContextBlocks; if constexpr (!StaticBatchScheduling) { - std::tie(newlyContributedContextBlocks, newlyContributedCrossContextBlocks) - = prefillWithChunkedContextsAlreadyExecuting(activeRequests, kvCacheManager, crossKvCacheManager); + if (skippingIsRelevant) + { + std::tie(newlyContributedContextBlocks, newlyContributedCrossContextBlocks) + = prefillWithChunkedContextsAlreadyExecuting(activeRequests, kvCacheManager, crossKvCacheManager); + } } // If a request is already in progress, include it // If it's been allocated, it had resource to run to completion // Also keep track of blocks needed to drive all in-progress requests to completion - SizeType32 reservedBlocks{0}; - SizeType32 reservedCrossBlocks{0}; + auto reservedBlocks = kv_cache_manager::NoEvictScheduledBlocksManager(kvCacheManager); + auto reservedCrossBlocks = crossKvCacheManager + ? std::optional(kv_cache_manager::NoEvictScheduledBlocksManager(*crossKvCacheManager)) + : std::nullopt; SizeType32 claimedPeftPages{0}; std::unordered_set<uint64_t> uniqTaskIds{}; RequestVector pendingRequests; @@ -242,8 +249,9 @@ else if (req->isGenerationInProgressState()) { scheduledRequests.emplace_back(req); - reservedBlocks += kvCacheManager.getRemainingBlocksToCompletion(*req); - + reservedBlocks.decrementReservedBlocks(*req); + if (reservedCrossBlocks) + reservedCrossBlocks->decrementReservedBlocks(*req); bool const reqHasLora = req->getLoraTaskId().has_value(); bool const isNewTask = reqHasLora && !uniqTaskIds.count(req->getLoraTaskId().value()); if (isNewTask) { claimedPeftPages += peftCacheManager ? peftCacheManager->determineNumPages(req) : 0; uniqTaskIds.insert(req->getLoraTaskId().value()); } - reservedCrossBlocks += crossKvCacheManager ?
crossKvCacheManager->getRemainingBlocksToCompletion(*req) : 0; } else if (req->isDisaggGenerationInitState()) { @@ -268,8 +275,6 @@ if (!StaticBatchScheduling || scheduledRequests.size() == 0) { // Now check if we can add pending requests - auto availableBlocks = numFreeBlocks - reservedBlocks; - auto availableCrossBlocks = numFreeCrossBlocks - reservedCrossBlocks; auto availablePeftPages = maxPeftCachePages - claimedPeftPages; // Loop over pending requests and add them if they can be scheduled @@ -279,7 +284,7 @@ for (auto const& req : requests) { // if context request can reuse blocks contributed by another context request, skip - if (!StaticBatchScheduling && !req->isDisaggGenerationInitState() + if (!StaticBatchScheduling && skippingIsRelevant && !req->isDisaggGenerationInitState() && beneficialToSkip(req, kvCacheManager, crossKvCacheManager, newlyContributedContextBlocks, newlyContributedCrossContextBlocks)) { @@ -292,27 +297,26 @@ } else if (req->isContextInitState() || req->isDisaggGenerationInitState()) { - auto const neededBlocks = kvCacheManager.getRemainingBlocksToCompletion(*req); - auto const neededCrossBlocks = crossKvCacheManager ? crossKvCacheManager->getRemainingBlocksToCompletion(*req) : 0; - bool const reqHasLora = req->getLoraTaskId().has_value(); - bool const isNewTask = reqHasLora && !uniqTaskIds.count(req->getLoraTaskId().value()); - auto const neededPeftPages - = (isNewTask && peftCacheManager) ? peftCacheManager->determineNumPages(req) : 0; - - if (neededBlocks <= availableBlocks && neededCrossBlocks <= availableCrossBlocks - && neededPeftPages <= availablePeftPages) + bool enoughBlocks = reservedBlocks.enoughAvailableBlocks(*req); + bool enoughCrossBlocks = reservedCrossBlocks ? reservedCrossBlocks->enoughAvailableBlocks(*req) : true; + bool reqHasLora = req->getLoraTaskId().has_value(); + bool isNewTask = reqHasLora && !uniqTaskIds.count(req->getLoraTaskId().value()); + auto neededPeftPages = isNewTask && peftCacheManager ? peftCacheManager->determineNumPages(req) : 0; + + if (enoughBlocks && enoughCrossBlocks && neededPeftPages <= availablePeftPages) { scheduledRequests.emplace_back(req); - availableBlocks -= neededBlocks; - availableCrossBlocks -= neededCrossBlocks; + reservedBlocks.decrementReservedBlocks(*req); + if (reservedCrossBlocks) + reservedCrossBlocks->decrementReservedBlocks(*req); availablePeftPages -= neededPeftPages; if (isNewTask) { uniqTaskIds.insert(req->getLoraTaskId().value()); } } - else if (neededBlocks > availableBlocks || neededCrossBlocks > availableCrossBlocks) + else if (!enoughBlocks || !enoughCrossBlocks) { // If one request fails to be scheduled, break break; } @@ -324,14 +328,25 @@ return {std::move(scheduledRequests), RequestVector{}}; } +// TODO(nhaber): remove the forward declaration and keep the function itself here, right before merging.
It is placed below just so +// the remote diff is easier to review and rebase +bool trySchedulingRequestMaxUtilization(std::shared_ptr const& req, SizeType32 maxNumRequests, + RequestVector& scheduledRequests, kv_cache_manager::MaxUtilizationScheduledBlocksManager& blocksManager, + OptionalRef peftCacheManager, SizeType32& numScheduledPeftPages, + std::unordered_set& seenTaskIds); + std::tuple MaxUtilizationScheduler::operator()( kv_cache_manager::BaseKVCacheManager& kvCacheManager, OptionalRef peftCacheManager, RequestList const& activeRequests) const { kvCacheManager.startScheduling(); + // The optimization of delaying requests won't work for variable window attention + bool skippingIsRelevant = !kvCacheManager.getBlockManager().isVariableWindow(); + // Keep track of the number of requests and blocks needed for the scheduled requests - SizeType32 numScheduledBlocks{0}; + auto scheduledBlocksManager + = kv_cache_manager::MaxUtilizationScheduledBlocksManager(kvCacheManager, mManyMicroBatches); SizeType32 numScheduledPeftPages{0}; std::unordered_set seenTaskIds; @@ -366,16 +381,17 @@ std::tuple MaxUtilizationScheduler::operator()( } // if context request can reuse blocks contributed by another context request, skip - if (beneficialToSkip( + if (skippingIsRelevant + && beneficialToSkip( req, kvCacheManager, std::nullopt, newlyContributedContextBlocks, newlyContributedCrossContextBlocks)) { reqIt++; continue; } - auto const [fitsKvCache, fitsPeftCache] = trySchedulingRequestMaxUtilization(kvCacheManager, peftCacheManager, - req, scheduledRequests, numScheduledBlocks, numScheduledPeftPages, seenTaskIds); - if (fitsKvCache && fitsPeftCache) + bool const wasScheduled = trySchedulingRequestMaxUtilization(req, mMaxNumRequests, scheduledRequests, + scheduledBlocksManager, peftCacheManager, numScheduledPeftPages, seenTaskIds); + if (wasScheduled) { TLLM_LOG_DEBUG("MaxUtilizationScheduler: request ID %lu -> start", req->mRequestId); reqIt++; @@ -405,34 +421,27 @@ std::tuple MaxUtilizationScheduler::operator()( return {std::move(scheduledRequests), std::move(pausedRequests)}; } -std::pair MaxUtilizationScheduler::trySchedulingRequestMaxUtilization( - kv_cache_manager::BaseKVCacheManager const& kvCacheManager, - OptionalRef peftCacheManager, std::shared_ptr const& req, - RequestVector& scheduledRequests, SizeType32& numScheduledBlocks, SizeType32& numScheduledPeftPages, - std::unordered_set& seenTaskIds) const +bool trySchedulingRequestMaxUtilization(std::shared_ptr const& req, SizeType32 maxNumRequests, + RequestVector& scheduledRequests, kv_cache_manager::MaxUtilizationScheduledBlocksManager& blocksManager, + OptionalRef peftCacheManager, SizeType32& numScheduledPeftPages, + std::unordered_set& seenTaskIds) { - if (scheduledRequests.size() < static_cast(mMaxNumRequests)) + if (scheduledRequests.size() < static_cast(maxNumRequests)) { - SizeType32 numRequiredBlocks = kvCacheManager.getNeededBlocksOneStep(*req, mManyMicroBatches); - TLLM_LOG_DEBUG( - "MaxUtilizationScheduler: request ID %lu required blocks: %i", req->mRequestId, numRequiredBlocks); - - bool const reqHasLora = req->getLoraTaskId().has_value(); - bool const isNewTask = reqHasLora && !seenTaskIds.count(req->getLoraTaskId().value()); - auto const numRequiredPeftPages + bool reqHasLora = req->getLoraTaskId().has_value(); + bool isNewTask = reqHasLora && !seenTaskIds.count(req->getLoraTaskId().value()); + SizeType32 numRequiredPeftPages = (isNewTask && peftCacheManager) ?
peftCacheManager->determineNumPages(req) : 0; TLLM_LOG_DEBUG( "MaxUtilizationScheduler: request ID %lu required peft pages: %i", req->mRequestId, numRequiredPeftPages); - bool const fitsKvCache - = kvCacheManager.getBlockManager().schedulingHasFreeBlocks(numScheduledBlocks + numRequiredBlocks); - bool const fitsPeft + auto const scheduledBlocksIfFitsKvCache = blocksManager.prepareNewNumberOfBlocksIfWeEndUpScheduling(*req); + bool fitsPeft = (peftCacheManager ? numRequiredPeftPages + numScheduledPeftPages <= peftCacheManager->getMaxDevicePages() : true); - if (fitsKvCache && fitsPeft) + if (scheduledBlocksIfFitsKvCache && fitsPeft) { - numScheduledBlocks += numRequiredBlocks; - TLLM_LOG_DEBUG("MaxUtilizationScheduler: scheduled blocks: %i", numScheduledBlocks); + blocksManager.updateScheduledBlocks(scheduledBlocksIfFitsKvCache.value()); numScheduledPeftPages += numRequiredPeftPages; TLLM_LOG_DEBUG("MaxUtilizationScheduler: scheduled peft pages: %i", numRequiredPeftPages); scheduledRequests.emplace_back(req); @@ -440,10 +449,10 @@ std::pair MaxUtilizationScheduler::trySchedulingRequestMaxUtilizatio { seenTaskIds.insert(req->getLoraTaskId().value()); } + return true; } - return std::make_pair(fitsKvCache, fitsPeft); } - return std::make_pair(false, false); + return false; } CapacityScheduler::CapacityScheduler(SizeType32 maxNumRequests, diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp index 3597626c84f..8f9cfa7763a 100644 --- a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp +++ b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp @@ -142,8 +142,8 @@ void DataReceiverImpl::sendRequestInfo(LlmRequest const& llmRequest) if (cacheFormatter != nullptr) { auto* cacheManager = cacheFormatter->getCacheManager(); - auto blockRange = kv_cache_manager::BlockRange( - *cacheManager, cacheManager->getNewlyAllocatedBlockIds(llmRequest.mRequestId)); + auto blockRange + = kv_cache_manager::BlockRange::fromNewlyAllocatedBlockIds(*cacheManager, llmRequest.mRequestId); requestInfo = RequestInfo(requestId, blockRange.getBlockHashes(), mSelfState); } diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp index 943b95dfe15..6cf74333056 100644 --- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp @@ -323,18 +323,170 @@ bool KVCacheBlock::isLeaf() const return mNextBlocks.empty(); } +/* +Example: +``` +totalBlocks = 16384, uniqueWindowSizeToLayers = {1024: [1], 4096: [0, 4, 5], 8192: [2, 3]} +windowSizeToContribution = {1024: (1024*1=1024), 4096: (4096*3=12288), 8192: (8192*2=16384)}; +return windowSizeToAllottedBlocks = { + 1024: (1024.0 /29696)*16384 + 1 = 565, + 4096: (12288.0 /29696)*16384 + 1 = 6780, + 8192: (16384.0 /29696)*16384 = 9039 + }; +``` +*/ +std::map BlockManager::blocksPerWindowSize( + SizeType32 totalBlocks, std::map> const& uniqueWindowSizeToLayers) +{ + TLLM_CHECK(totalBlocks > 0); + std::map windowSizeToContribution; + + for (auto const& [windowSize, layers] : uniqueWindowSizeToLayers) + { + windowSizeToContribution[windowSize] = windowSize * layers.size(); + } + auto const windowSizesTotalSum = std::accumulate(windowSizeToContribution.begin(), windowSizeToContribution.end(), + SizeType32{0}, [](auto sum, auto const& windowSize) { return sum + windowSize.second; }); + + std::map windowSizeToAllottedBlocks; + SizeType32 remainingBlocks = totalBlocks; + // First pass: allocate blocks proportionally +
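The split above floors each window size's proportional share, so up to one block per window size is left over; a second pass hands those back out one at a time. A self-contained sketch of the two-pass split follows, using plain int in place of SizeType32 and local names; it is an illustration that reproduces the worked example, not the library's code.

```cpp
#include <cstdio>
#include <map>
#include <vector>

int main()
{
    int const totalBlocks = 16384;
    std::map<int, std::vector<int>> const uniqueWindowSizeToLayers
        = {{1024, {1}}, {4096, {0, 4, 5}}, {8192, {2, 3}}};

    // Contribution of each window size: windowSize * number of layers using it.
    std::map<int, long long> windowSizeToContribution;
    long long windowSizesTotalSum = 0;
    for (auto const& [windowSize, layers] : uniqueWindowSizeToLayers)
    {
        windowSizeToContribution[windowSize] = 1LL * windowSize * layers.size();
        windowSizesTotalSum += windowSizeToContribution[windowSize];
    }

    // First pass: proportional allotment, rounded down.
    std::map<int, int> allotted;
    int remaining = totalBlocks;
    for (auto const& [windowSize, contribution] : windowSizeToContribution)
    {
        float const fraction = static_cast<float>(contribution) / windowSizesTotalSum;
        allotted[windowSize] = static_cast<int>(fraction * totalBlocks);
        remaining -= allotted[windowSize];
    }

    // Second pass: hand the blocks lost to rounding back out, one per window size.
    for (auto& [windowSize, blocks] : allotted)
    {
        if (remaining == 0)
        {
            break;
        }
        ++blocks;
        --remaining;
    }

    for (auto const& [windowSize, blocks] : allotted)
    {
        std::printf("%d -> %d\n", windowSize, blocks); // 1024 -> 565, 4096 -> 6780, 8192 -> 9039
    }
    return 0;
}
```

Note that the allotments always sum to totalBlocks, since at most one block per window size is lost to rounding down.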
for (auto const& [windowSize, windowSizeSum] : windowSizeToContribution) + { + float const fraction = static_cast(windowSizeSum) / windowSizesTotalSum; + TLLM_CHECK(0.0f < fraction && fraction <= 1.0f); + SizeType32 const allotted = static_cast(fraction * totalBlocks); + windowSizeToAllottedBlocks[windowSize] = allotted; + remainingBlocks -= allotted; + } + + // Second pass: hand the blocks lost to rounding down back out, one per window size. + for (auto& [windowSize, allottedBlocks] : windowSizeToAllottedBlocks) + { + if (remainingBlocks == 0) + { + break; + } + allottedBlocks++; + remainingBlocks--; + } + return windowSizeToAllottedBlocks; +} + BlockManager::BlockManager(std::vector const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, + SizeType32 maxNumSequences, std::shared_ptr stream, + std::optional maxSequenceLength, SizeType32 maxBeamWidth, + std::vector const& maxAttentionWindowVec, + std::optional const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType, + std::optional secondaryOffloadMinPriority, + std::shared_ptr eventManager, bool enableHashKey, bool enablePartialReuse, + bool copyOnPartialReuse) + : mNumLayers{static_cast(numKvHeadsPerLayer.size())} + , mTokensPerBlock{tokensPerBlock} + , mEventManager{std::move(eventManager)} + , mStream{stream} + , mCacheType{cacheType} +{ + auto const numNonUniqueWindowSizes = static_cast(maxAttentionWindowVec.size()); + + std::map> uniqueWindowSizeToLayers; + for (SizeType32 layerIdx = 0; layerIdx < mNumLayers; layerIdx++) + { + /* + At this point (deep in the construction of TrtGptModel), maxAttentionWindowVec isn't "stretched" to the + length of numLayers yet. So we need to cycle through the window sizes per layer with a modulo. + */ + auto const windowSize = maxAttentionWindowVec.at(layerIdx % numNonUniqueWindowSizes); + uniqueWindowSizeToLayers[windowSize].push_back(layerIdx); + } + + auto const numUniqueWindowSizes = static_cast(uniqueWindowSizeToLayers.size()); + + mIsVariableWindow = numUniqueWindowSizes > 1; + mIsVariableGQA = std::unordered_set(numKvHeadsPerLayer.begin(), numKvHeadsPerLayer.end()).size() > 1; + + auto const primaryBlocksPerWindowSize = blocksPerWindowSize(blocksInPrimaryPool, uniqueWindowSizeToLayers); + std::optional> secondaryBlocksPerWindowSize; + if (blocksInSecondaryPool > 0) + { + secondaryBlocksPerWindowSize = blocksPerWindowSize(blocksInSecondaryPool, uniqueWindowSizeToLayers); + } + + mLayerToWindowSize.resize(mNumLayers); + for (auto const& [windowSize, layersWithWindowSize] : uniqueWindowSizeToLayers) + { + for (auto& layerIdx : layersWithWindowSize) + { + mLayerToWindowSize.at(layerIdx) = windowSize; + } + SizeType32 const allottedPrimaryBlocks = primaryBlocksPerWindowSize.at(windowSize); + TLLM_CHECK(allottedPrimaryBlocks > 0); // Every window size must be allotted at least one primary block. + SizeType32 const allottedSecondaryBlocks + = secondaryBlocksPerWindowSize ?
secondaryBlocksPerWindowSize->at(windowSize) : 0; + mWindowBlockManagers.try_emplace(windowSize, dtype, windowSize, layersWithWindowSize, numKvHeadsPerLayer, + sizePerHead, tokensPerBlock, allottedPrimaryBlocks, allottedSecondaryBlocks, maxNumSequences, stream, + onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager, enableHashKey, enablePartialReuse, + copyOnPartialReuse); + } + + auto const numAllPools = getNumPools(); + mAbsolutePoolToWindowSize.reserve(numAllPools); + mAbsolutePoolToRelativePoolIndex.reserve(numAllPools); + auto absolutePoolsOffset = SizeType32{0}; + for (auto const& [windowSize, manager] : mWindowBlockManagers) + { + auto const numPools = manager.getNumPools(); + for (auto i = 0; i < numPools; ++i) + { + mAbsolutePoolToWindowSize.push_back(windowSize); + mAbsolutePoolToRelativePoolIndex.push_back(i); + } + auto const maxTokenNum = windowSize + sinkBubbleLength + + (isUseOneMoreBlock(windowSize, maxSequenceLength, maxBeamWidth) ? tokensPerBlock : 0); + auto const temporaryAttentionWindow = manager.calculateTemporaryAttentionWindow(tempAttentionWindowInputs); + // Consider the temporaryAttentionWindow when allocating blocks. + auto const maxBlocksPerSeq = tc::ceilDiv(maxTokenNum + temporaryAttentionWindow, tokensPerBlock); + TLLM_LOG_INFO("Max KV cache pages per sequence: %d [window size=%d]", maxBlocksPerSeq, windowSize); + mWindowSizeToMetadata[windowSize] = WindowSizeMetadata{absolutePoolsOffset, numPools, maxTokenNum, + maxBlocksPerSeq, manager.getMaxNumBlocks(), temporaryAttentionWindow}; + TLLM_LOG_DEBUG( + "%s Metadata: %s", manager.getLogPrefix().c_str(), mWindowSizeToMetadata[windowSize].toString().c_str()); + absolutePoolsOffset += numPools; + } + + TLLM_CHECK_WITH_INFO(mWindowBlockManagers.size() == mWindowSizeToMetadata.size() + && std::equal(mWindowBlockManagers.cbegin(), mWindowBlockManagers.cend(), mWindowSizeToMetadata.cbegin(), + mWindowSizeToMetadata.cend(), + [](auto const& window1, auto const& window2) { return window1.first == window2.first; }), + "Iteration order of window sizes between mWindowBlockManagers and mWindowSizeToMetadata *must* be ensured. 
" + "Maybe you tried changing either of them to an std::unordered_map?"); +} + +namespace +{ +inline SizeType32 digits(SizeType32 number) +{ + TLLM_CHECK(number > 0); + return static_cast(std::log10(number)) + 1; +} + +} // namespace + +WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 windowSize, + std::vector const& managedLayers, std::vector const& numKvHeadsPerLayer, + SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr stream, bool onboardBlocks, CacheType cacheType, std::optional secondaryOffloadMinPriority, std::shared_ptr eventManager, bool enableHashKey, bool enablePartialReuse, bool copyOnPartialReuse) - : mNumPrimaryBlocks{blocksInPrimaryPool} + : mDataType{dtype} + , mWindowSize{windowSize} + , mNumPrimaryBlocks{blocksInPrimaryPool} , mNumSecondaryBlocks{blocksInSecondaryPool} , mOnboardBlocks(onboardBlocks) , mBufferManager{std::move(stream)} - , mSizePerHead{sizePerHead} - , mNumLayers{static_cast(numKvHeadsPerLayer.size())} , mSchedulingNumFreeBlocks{0} , mTokensPerBlock{tokensPerBlock} , mCachedBlocksRoot{std::make_shared(KVCacheBlock::kCachedBlocksRootId, tk::KVCacheIndex{0})} @@ -346,6 +498,8 @@ BlockManager::BlockManager(std::vector const& numKvHeadsPerLayer, Si , mReusedBlocks{0} , mReusedUniqueBlocks{0} , mMissedBlocks{0} + , mKVFactor{mCacheType == CacheType::kSELFKONLY ? 1 : 2} + , mLogPrefix{tensorrt_llm::common::fmtstr("BlockManager[windowSize=%*u]", digits(windowSize), mWindowSize)} , mReusedTokens{0.0} , mTotalInputTokens{0.0} , mEnableHashKey{enableHashKey} @@ -354,32 +508,36 @@ BlockManager::BlockManager(std::vector const& numKvHeadsPerLayer, Si { std::map numLayersPerPool; - mKVFactor = (cacheType == CacheType::kSELFKONLY) ? 1 : 2; - - // count how many layers should go in each pool - mLayerIndexToPoolLayerIndex.reserve(mNumLayers); - for (SizeType32 layerIdx = 0; layerIdx < mNumLayers; layerIdx++) + for (auto const layerIdx : managedLayers) { - auto const numKvHeads = numKvHeadsPerLayer.at(layerIdx); - auto search = numLayersPerPool.find(numKvHeads); - numLayersPerPool[numKvHeads] = search == numLayersPerPool.end() ? 1 : search->second + 1; - mLayerIndexToPoolLayerIndex.emplace_back(numLayersPerPool[numKvHeads] - 1); + auto const& layerIndexWithinPool = numLayersPerPool[numKvHeadsPerLayer.at(layerIdx)]++; + mLayerToIndexWithinPool[layerIdx] = layerIndexWithinPool; } + size_t poolIndex = 0; for (auto const [numKvHeads, numLayers] : numLayersPerPool) { + for (auto const layerIdx : managedLayers) + { + if (numKvHeadsPerLayer.at(layerIdx) == numKvHeads) + { + mLayerToPoolIndex[layerIdx] = poolIndex; + } + } mPools.emplace_back(numLayers, numKvHeads, sizePerHead, tokensPerBlock, 1); + ++poolIndex; } - // assign each layer to its pool - mLayerToPool.reserve(mNumLayers); - for (SizeType32 layerIdx = 0; layerIdx < mNumLayers; layerIdx++) +#ifdef ENABLE_FP4 + // TODO(miovine): make the block size configurable. Should we have an additional argument + // to specify FP4 related parameters (scale dtypes, etc)? This can also be passed + // in the constructor. 
+ constexpr SizeType32 kQuantBlockSizeNVFP4 = 16; + if (dtype == nvinfer1::DataType::kFP4) { - auto poolPos = std::find_if(mPools.cbegin(), mPools.cend(), - [numKvHeads = numKvHeadsPerLayer[layerIdx]](KVCacheBlockPool const& pool) - { return numKvHeads == pool.numKvHeads; }); - mLayerToPool.emplace_back(poolPos - mPools.cbegin()); + createBlockScalePools(kQuantBlockSizeNVFP4); } +#endif // Create free blocks mAllBlocksById.reserve(blocksInPrimaryPool + blocksInSecondaryPool); @@ -403,7 +561,7 @@ BlockManager::BlockManager(std::vector const& numKvHeadsPerLayer, Si } } -BlockManager::~BlockManager() +WindowBlockManager::~WindowBlockManager() { float reusedUniqueBlocksPercentage = mReusedUniqueBlocks == 0 || mAllocTotalBlocks == 0 ? 0 @@ -411,19 +569,25 @@ BlockManager::~BlockManager() float cacheHitRate = mReusedBlocks == 0 ? 0 : static_cast(mReusedBlocks) / (static_cast(mReusedBlocks + mMissedBlocks)); - TLLM_LOG_DEBUG("BlockManager - total allocated blocks: %lu ", mAllocTotalBlocks); - TLLM_LOG_DEBUG("BlockManager - allocated new blocks: %lu ", mAllocNewBlocks); - TLLM_LOG_DEBUG("BlockManager - missed blocks: %lu ", mMissedBlocks); - TLLM_LOG_DEBUG("BlockManager - reused blocks: %lu ", mReusedBlocks); - TLLM_LOG_DEBUG("BlockManager - reused unique blocks: %lu ", mReusedUniqueBlocks); - TLLM_LOG_DEBUG("BlockManager - reused unique blocks percentage (%%): %.2f ", reusedUniqueBlocksPercentage); - TLLM_LOG_DEBUG("BlockManager - cache hit rate: %.2f ", cacheHitRate); - TLLM_LOG_DEBUG("BlockManager - reused tokens: %.0f ", mReusedTokens); + TLLM_LOG_DEBUG("%s - total allocated blocks: %lu ", mLogPrefix.c_str(), mAllocTotalBlocks); + TLLM_LOG_DEBUG("%s - allocated new blocks: %lu ", mLogPrefix.c_str(), mAllocNewBlocks); + TLLM_LOG_DEBUG("%s - missed blocks: %lu ", mLogPrefix.c_str(), mMissedBlocks); + TLLM_LOG_DEBUG("%s - reused blocks: %lu ", mLogPrefix.c_str(), mReusedBlocks); + TLLM_LOG_DEBUG("%s - reused unique blocks: %lu ", mLogPrefix.c_str(), mReusedUniqueBlocks); TLLM_LOG_DEBUG( - "BlockManager - reused tokens percentage (%%): %.2f ", 100.0 * mReusedTokens / mTotalInputTokens); + "%s - reused unique blocks percentage (%%): %.2f ", mLogPrefix.c_str(), reusedUniqueBlocksPercentage); + TLLM_LOG_DEBUG("%s - cache hit rate: %.2f ", mLogPrefix.c_str(), cacheHitRate); + TLLM_LOG_DEBUG("%s - reused tokens: %.0f ", mLogPrefix.c_str(), mReusedTokens); + TLLM_LOG_DEBUG("%s - reused tokens percentage (%%): %.2f ", mLogPrefix.c_str(), + 100.0 * mReusedTokens / mTotalInputTokens); +} + +bool BlockManager::verifyQueueIntegrity(SizeType32 windowSize) +{ + return mWindowBlockManagers.at(windowSize).verifyQueueIntegrity(); } -bool BlockManager::verifyQueueIntegrity() +bool WindowBlockManager::verifyQueueIntegrity() { return mEvictionPolicy->verifyQueueIntegrity(); } @@ -431,16 +595,19 @@ bool BlockManager::verifyQueueIntegrity() void BlockManager::storeContextBlocks(GenerationRequest& sequence, LlmRequest const& llmRequest) { constexpr int beamIdx = 0; // no need to consider more than one beam for input tokens - auto const& cacheBlockIds = sequence.getCacheBlockIds(); - auto const& uniqueTokens = llmRequest.getUniqueTokens(beamIdx); + for (auto const& [windowSize, _] : mWindowBlockManagers) + { + auto cacheBlockIds = sequence.getCacheBlockIds(windowSize); + auto const& uniqueTokens = llmRequest.getUniqueTokens(beamIdx); - auto blockedUniqueTokens - = chopVectorIntoBlocks(uniqueTokens, uniqueTokens.size() - 1, getTokensPerBlock(), false); - auto blockKeys = buildBlockKeys(blockedUniqueTokens, llmRequest); - 
storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx]); + auto blockedUniqueTokens + = chopVectorIntoBlocks(uniqueTokens, uniqueTokens.size() - 1, getTokensPerBlock(), false); + auto blockKeys = buildBlockKeys(blockedUniqueTokens, llmRequest); + storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx], windowSize); + } } -void BlockManager::createBlockScalePools(SizeType32 quantBlockSize) +void WindowBlockManager::createBlockScalePools(SizeType32 quantBlockSize) { auto num_pools = mPools.size(); for (size_t i = 0; i < num_pools; ++i) @@ -457,30 +624,24 @@ void BlockManager::createBlockScalePools(SizeType32 quantBlockSize) } } -void BlockManager::allocatePools(nvinfer1::DataType dtype, bool useUvm) +void BlockManager::allocatePools(bool useUvm) { - // TODO: make the block size configurable. Should we have an additional argument - // to specify FP4 related parameters (scale dtypes, etc)? This can also be passed - // in the constructor. - constexpr SizeType32 kQuantBlockSizeNVFP4 = 16; - constexpr nvinfer1::DataType kScaleDtypeNVFP4 = nvinfer1::DataType::kFP8; - -#ifdef ENABLE_FP4 - if (dtype == nvinfer1::DataType::kFP4) + for (auto& [_, manager] : mWindowBlockManagers) { - // It would be slightly better to move construction of these objects to the BlockManager - // constructor for consistency. We would have to do a bit of refactoring to pass - // the dtype in earlier. - createBlockScalePools(kQuantBlockSizeNVFP4); + manager.allocatePools(useUvm); } -#endif +} + +void WindowBlockManager::allocatePools(bool useUvm) +{ + constexpr nvinfer1::DataType kScaleDtypeNVFP4 = nvinfer1::DataType::kFP8; // Allocate a memory pool backing the blocks for each numKvHeads // TODO(oargov): allocate pools in a single buffer and split it, to avoid fragmentation for (auto& pool : mPools) { auto blockSize = pool.blockSize; - auto poolDtype = pool.containsBlockScales ? kScaleDtypeNVFP4 : dtype; + auto poolDtype = pool.containsBlockScales ? 
kScaleDtypeNVFP4 : mDataType; #ifdef ENABLE_FP4 auto const poolIsFP4 = poolDtype == nvinfer1::DataType::kFP4; @@ -499,7 +660,7 @@ void BlockManager::allocatePools(nvinfer1::DataType dtype, bool useUvm) nvinfer1::Dims const cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, mKVFactor, blockSize}); - TLLM_LOG_DEBUG("[BlockManager] Allocating primary pool with %d blocks for %d layers with %d kv heads", + TLLM_LOG_DEBUG("[%s] Allocating primary pool with %d blocks for %d layers with %d kv heads", mLogPrefix.c_str(), mNumPrimaryBlocks, pool.numLayers, pool.numKvHeads); if (useUvm) @@ -511,14 +672,22 @@ void BlockManager::allocatePools(nvinfer1::DataType dtype, bool useUvm) { nvinfer1::Dims const cacheShapeOffload = ITensor::makeShape({mNumSecondaryBlocks, pool.numLayers, mKVFactor, blockSize}); - TLLM_LOG_DEBUG("[BlockManager] Allocating secondary pool with %d blocks for %d layers with %d kv heads", - mNumSecondaryBlocks, pool.numLayers, pool.numKvHeads); + TLLM_LOG_DEBUG("[%s] Allocating secondary pool with %d blocks for %d layers with %d kv heads", + mLogPrefix.c_str(), mNumSecondaryBlocks, pool.numLayers, pool.numKvHeads); pool.secondaryPtr = BufferManager::pinned(cacheShapeOffload, poolDtype); } } } void BlockManager::releasePools() +{ + for (auto& [_, manager] : mWindowBlockManagers) + { + manager.releasePools(); + } +} + +void WindowBlockManager::releasePools() { for (auto& pool : mPools) { @@ -536,6 +705,14 @@ void BlockManager::releasePools() } void BlockManager::startScheduling() +{ + for (auto& [_, manager] : mWindowBlockManagers) + { + manager.startScheduling(); + } +} + +void WindowBlockManager::startScheduling() { mSchedulingNumFreeBlocks = mEvictionPolicy->getNumFreeBlocks(kPrimaryLevel); for (auto& [requestId, slotAllocatedBlocks] : mAllocatedBlocksPerSeq) @@ -547,7 +724,7 @@ void BlockManager::startScheduling() } } -void BlockManager::claimLeafBlock(BlockPtr block, std::optional priority, +void WindowBlockManager::claimLeafBlock(BlockPtr const& block, std::optional priority, std::optional durationMs) { // The eviction policy needs blocks to still be linked to their old parents when they're reclaimed. 
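As a rough sanity check on the memory footprint of a primary pool allocated with the cache shape {numBlocks, numLayers, kvFactor, blockSize} above: assuming blockSize works out to numKvHeads * sizePerHead * tokensPerBlock for a plain KV pool (an assumption of this sketch, not something stated in the change), the arithmetic is just a product. Made-up sizes below.

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // Made-up sizes for illustration only.
    int64_t const numBlocks = 2048, numLayers = 32, kvFactor = 2; // kvFactor is 1 for kSELFKONLY
    int64_t const numKvHeads = 8, sizePerHead = 128, tokensPerBlock = 64;
    int64_t const dtypeBytes = 2; // e.g. FP16
    int64_t const blockSize = numKvHeads * sizePerHead * tokensPerBlock;
    // Mirrors the cache shape {numBlocks, numLayers, kvFactor, blockSize}.
    int64_t const poolBytes = numBlocks * numLayers * kvFactor * blockSize * dtypeBytes;
    std::printf("primary pool: %.2f GiB\n", poolBytes / static_cast<double>(1LL << 30)); // 16.00 GiB
    return 0;
}
```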
@@ -556,7 +733,7 @@ void BlockManager::claimLeafBlock(BlockPtr block, std::optionalfreeLeafBlock(); } -BlockPtr BlockManager::getFreeBlock( +BlockPtr WindowBlockManager::getFreeBlock( executor::RetentionPriority priority, std::optional durationMs) { // eviction policy get free primary block @@ -595,40 +772,35 @@ BlockPtr BlockManager::getFreeBlock( return block; } -tk::KVCacheIndex BlockManager::getKOrVBlockIndex( - KVCacheBlock::IdType blockId, SizeType32 fieldIdx, SizeType32 poolIdx) const -{ - TLLM_CHECK_WITH_INFO(poolIdx < getNumPools(), "Pool index %d is out of bounds", poolIdx); - auto const& block = mAllBlocksById[blockId]; - auto const& pool = mPools.at(poolIdx); - if (mCacheType == CacheType::kSELFKONLY) - { - fieldIdx = 0; - } - auto constexpr layerIdx = 0; - return tk::KVCacheIndex{ - common::flat_index3(block->getMemoryPoolBlockIndex(), layerIdx, fieldIdx, pool.numLayers, mKVFactor)}; -} - -void KVCacheManager::setOffsets(tk::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx, - SizeType32 blockIdx, KVCacheBlock::IdType blockId) const +void WindowBlockManager::setOffsets(tk::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, + SizeType32 beamIdx, SizeType32 blockIdx, KVCacheBlock::IdType blockId) const { auto constexpr kIdx = 0; auto constexpr vIdx = 1; - auto const numPools = mBlockManager.getNumPools(); - - for (SizeType32 poolIdx = 0; poolIdx < numPools; poolIdx++) + auto const& block = mAllBlocksById[blockId]; + for (SizeType32 poolIdx = 0; poolIdx < static_cast(mPools.size()); poolIdx++) { - for (auto xIdx : {kIdx, vIdx}) + auto const& pool = mPools.at(poolIdx); + for (auto const xIdx : {kIdx, vIdx}) { + auto constexpr layerIdx = 0; auto const offsetIndex = tensorrt_llm::common::flat_index(offsetsShape.d, poolIdx, beamIdx, xIdx, blockIdx); - offsetsPtr[offsetIndex] = mBlockManager.getKOrVBlockIndex(blockId, xIdx, poolIdx); + auto const fieldIdx = mCacheType == CacheType::kSELFKONLY ? 
0 : xIdx; + auto const blockIndex = tk::KVCacheIndex{ + common::flat_index3(block->getMemoryPoolBlockIndex(), layerIdx, fieldIdx, pool.numLayers, mKVFactor)}; + offsetsPtr[offsetIndex] = blockIndex; } } } -void BlockManager::addBlockToHashMap(BlockPtr block) +void BlockManager::setOffsets(tk::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx, + SizeType32 blockIdx, KVCacheBlock::IdType blockId, SizeType32 windowSize) const +{ + mWindowBlockManagers.at(windowSize).setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId); +} + +void WindowBlockManager::addBlockToHashMap(BlockPtr const& block) { if (!mEnableHashKey) { @@ -650,7 +822,7 @@ void BlockManager::addBlockToHashMap(BlockPtr block) mContextBlocksByHash.emplace(block->getHash(), std::move(block)); } -void BlockManager::removeBlockFromHashMap(BlockPtr block) +void WindowBlockManager::removeBlockFromHashMap(BlockPtr const& block) { if (mContextBlocksByHash.empty() || block->getBlockKey().uniqueTokens.empty()) { @@ -672,7 +844,12 @@ void BlockManager::removeBlockFromHashMap(BlockPtr block) TLLM_LOG_DEBUG("Trying to remove block %d by %zx that is not in hash map", block->getBlockId(), block->getHash()); } -void BlockManager::onboardBlock(BlockPtr const& offloadBlock) +void BlockManager::onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowSize) +{ + mWindowBlockManagers.at(windowSize).onboardBlock(offloadBlock); +} + +void WindowBlockManager::onboardBlock(BlockPtr const& offloadBlock) { if (mOnboardBlocks && !offloadBlock->isPrimary()) { @@ -691,7 +868,12 @@ void BlockManager::onboardBlock(BlockPtr const& offloadBlock) } } -void BlockManager::offloadBlock(BlockPtr const& block) +void BlockManager::offloadBlock(BlockPtr const& block, SizeType32 windowSize) +{ + mWindowBlockManagers.at(windowSize).offloadBlock(block); +} + +void WindowBlockManager::offloadBlock(BlockPtr const& block) { if (mOnboardBlocks && block->isPrimary()) { @@ -713,7 +895,16 @@ void BlockManager::offloadBlock(BlockPtr const& block) } } -std::optional BlockManager::findNewContextBlock( +[[nodiscard]] std::optional BlockManager::findNewContextBlock( + VecUniqueTokens const& uniqueTokens, LlmRequest const& llmRequest) const +{ + TLLM_CHECK_WITH_INFO( + !isVariableWindow(), "The optimization of delaying requests won't work for variable window attention"); + auto const& onlyManager = mWindowBlockManagers.cbegin()->second; + return onlyManager.findNewContextBlock(uniqueTokens, llmRequest); +} + +std::optional WindowBlockManager::findNewContextBlock( VecUniqueTokens const& uniqueTokens, LlmRequest const& llmRequest) const { auto blockedUniqueTokens @@ -737,12 +928,12 @@ std::optional BlockManager::findNewContextBlock( return std::nullopt; } -bool BlockManager::blockInRadixTree(BlockPtr const& block) +bool WindowBlockManager::blockInRadixTree(BlockPtr const& block) { return !block->getUniqueTokens().empty() && block->getPrevBlock() != nullptr; } -SizeType32 BlockManager::loadOrAllocateBlocks(std::vector const& blockKeys, SizeType32 numContextBlocks, +SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector const& blockKeys, SizeType32 numContextBlocks, GenerationRequest& sequence, std::vector const& perBlockRetentions) { SizeType32 numMatchedTokens{0}; @@ -780,16 +971,16 @@ SizeType32 BlockManager::loadOrAllocateBlocks(std::vector const& block mTransferManager->onboard(matchingBlock, newBlock, mPools, numMatched); // TODO: (optional) Send out event matchingBlock = newBlock; - TLLM_LOG_DEBUG( - 
"BlockManager::loadOrAllocateBlocks - Copied partially filled block %d", matchingBlockId); + TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Copied partially filled block %d", mLogPrefix.c_str(), + matchingBlockId); } else { // Leaf block that nobody is using. Make block private and reuse claimLeafBlock( matchingBlock, perBlockRetentions[bi].retentionPriority, perBlockRetentions[bi].durationMs); - TLLM_LOG_DEBUG( - "BlockManager::loadOrAllocateBlocks - Reused partially filled block %d", matchingBlockId); + TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Reused partially filled block %d", mLogPrefix.c_str(), + matchingBlockId); addBlockToHashMap(matchingBlock); } searchRoot = nullptr; // no matching needed for following blocks @@ -799,7 +990,7 @@ SizeType32 BlockManager::loadOrAllocateBlocks(std::vector const& block // Recover block and reuse mEvictionPolicy->claimBlock( matchingBlock, perBlockRetentions[bi].retentionPriority, perBlockRetentions[bi].durationMs); - TLLM_LOG_DEBUG("BlockManager::loadOrAllocateBlocks - Matched full block %d", matchingBlockId); + TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Matched full block %d", mLogPrefix.c_str(), matchingBlockId); addBlockToHashMap(matchingBlock); searchRoot = matchingBlock; } @@ -821,8 +1012,8 @@ SizeType32 BlockManager::loadOrAllocateBlocks(std::vector const& block executor::KvCacheRetentionConfig::kDefaultRetentionPriority), perBlockRetentions[bi].durationMs); addBlockToAllBeams(freeBlock, sequence); - TLLM_LOG_DEBUG("BlockManager::loadOrAllocateBlocks - No match, allocated new block %d for sequence %lu", - freeBlock->getBlockId(), sequence.getRequestId()); + TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - No match, allocated new block %d for sequence %lu", + mLogPrefix.c_str(), freeBlock->getBlockId(), sequence.getRequestId()); searchRoot = nullptr; // no matching needed for following blocks if (blockItr != blockKeys.end()) { @@ -856,8 +1047,8 @@ SizeType32 BlockManager::loadOrAllocateBlocks(std::vector const& block } freeBlock->setHash(); addBlockToHashMap(freeBlock); - TLLM_LOG_DEBUG("BlockManager::loadOrAllocateBlocks - Beam %d. Allocated non-shared block %d for bi %d", - beamIdx, freeBlock->getBlockId(), bi); + TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Beam %d. 
Allocated non-shared block %d for bi %d", + mLogPrefix.c_str(), beamIdx, freeBlock->getBlockId(), bi); } ++mMissedBlocks; if (blockItr != blockKeys.end()) @@ -870,12 +1061,26 @@ SizeType32 BlockManager::loadOrAllocateBlocks(std::vector const& block } void BlockManager::refreshBlocks() +{ + for (auto& [_, manager] : mWindowBlockManagers) + { + manager.refreshBlocks(); + } +} + +void WindowBlockManager::refreshBlocks() { mEvictionPolicy->refresh(); mTransferManager->syncTransfers(); } -void BlockManager::addSequence( +void BlockManager::addSequence(GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks, + LlmRequest& llmRequest, SizeType32 windowSize) +{ + mWindowBlockManagers.at(windowSize).addSequence(sequence, inputLength, numContextBlocks, llmRequest); +} + +void WindowBlockManager::addSequence( GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks, LlmRequest& llmRequest) { auto const requestId = sequence.getRequestId(); @@ -912,7 +1117,13 @@ void BlockManager::addSequence( inputLength, prepopulatedPromptLen); } -void BlockManager::addSequence(GenerationRequest& sequence, SizeType32 numBlocks, SizeType32 unsharedBlockIdx) +void BlockManager::addSequence( + GenerationRequest& sequence, SizeType32 numBlocks, SizeType32 unsharedBlockIdx, SizeType32 windowSize) +{ + mWindowBlockManagers.at(windowSize).addSequence(sequence, numBlocks, unsharedBlockIdx); +} + +void WindowBlockManager::addSequence(GenerationRequest& sequence, SizeType32 numBlocks, SizeType32 unsharedBlockIdx) { auto const requestId = sequence.getRequestId(); auto const [seqIt, emplaceDone] = mAllocatedBlocksPerSeq.emplace(requestId, std::vector{}); @@ -926,23 +1137,23 @@ void BlockManager::addSequence(GenerationRequest& sequence, SizeType32 numBlocks } } -void BlockManager::addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx) +void WindowBlockManager::addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx) { auto const requestId = sequence.getRequestId(); block->incRefCount(); - if (sequence.getCacheBlockIds().at(beamIdx).size() == 0) + if (sequence.getCacheBlockIds(mWindowSize).at(beamIdx).size() == 0) { block->setPrevBlockInSeq(nullptr); } else { - block->setPrevBlockInSeq(mAllBlocksById.at(sequence.getCacheBlockIds()[beamIdx].back())); + block->setPrevBlockInSeq(mAllBlocksById.at(sequence.getCacheBlockIds(mWindowSize)[beamIdx].back())); } - sequence.addCacheBlock(beamIdx, block->getBlockId()); + sequence.addCacheBlock(mWindowSize, beamIdx, block->getBlockId()); mAllocatedBlocksPerSeq.at(requestId).push_back(block); } -void BlockManager::addBlockToAllBeams(BlockPtr& block, GenerationRequest& sequence) +void WindowBlockManager::addBlockToAllBeams(BlockPtr& block, GenerationRequest& sequence) { auto const beamWidth = sequence.getBeamWidth(); @@ -952,7 +1163,12 @@ void BlockManager::addBlockToAllBeams(BlockPtr& block, GenerationRequest& sequen } } -void BlockManager::allocateBlock(GenerationRequest& sequence, bool shareAmongBeams) +void BlockManager::allocateBlock(GenerationRequest& sequence, SizeType32 windowSize) +{ + mWindowBlockManagers.at(windowSize).allocateBlock(sequence, false); +} + +void WindowBlockManager::allocateBlock(GenerationRequest& sequence, bool shareAmongBeams) { auto const beamWidth = sequence.getBeamWidth(); auto const requiredBlocks = shareAmongBeams ? 
1 : beamWidth; @@ -979,9 +1195,11 @@ void BlockManager::allocateBlock(GenerationRequest& sequence, bool shareAmongBea } } -void BlockManager::storeBlocks(std::vector blockKeys, std::vector const& blockIds) +void WindowBlockManager::storeBlocks( + std::vector const& blockKeys, std::vector const& blockIds) { - TLLM_LOG_DEBUG("BlockManager::storeBlocks - %zu blockKeys, %zu blockIds", blockKeys.size(), blockIds.size()); + TLLM_LOG_DEBUG( + "%s::storeBlocks - %zu blockKeys, %zu blockIds", mLogPrefix.c_str(), blockKeys.size(), blockIds.size()); auto searchRoot = mCachedBlocksRoot; bool needMatch = true; @@ -991,7 +1209,7 @@ void BlockManager::storeBlocks(std::vector blockKeys, std::vector blockKeys, std::vectorgetBlockId()); + TLLM_LOG_DEBUG( + "%s::storeBlocks - Found matching block %d, traverse", mLogPrefix.c_str(), matchedBlock->getBlockId()); searchRoot = matchedBlock; // TODO possible optimization: if bid != matchedBlock->getBlockId(), // block can be freed and inserted at mFreePrimaryBlocks.begin() @@ -1008,8 +1227,8 @@ void BlockManager::storeBlocks(std::vector blockKeys, std::vectorgetBlockId()); + TLLM_LOG_DEBUG("%s::storeBlocks - No match, inserting block %d into search structure", mLogPrefix.c_str(), + block->getBlockId()); needMatch = false; // no matching needed for following blocks block->setBlockKey(blockKey, static_cast(blockKey.uniqueTokens.size()) == mTokensPerBlock); block->setPrevBlock(searchRoot); @@ -1040,7 +1259,12 @@ void BlockManager::storeBlocks(std::vector blockKeys, std::vectorincRefCount(); - if (sequence.getCacheBlockIds().at(beamIdx).size() == 0) + if (sequence.getCacheBlockIds(mWindowSize).at(beamIdx).size() == 0) { block->setPrevBlockInSeq(nullptr); } else { - block->setPrevBlockInSeq(mAllBlocksById.at(sequence.getCacheBlockIds()[beamIdx].back())); + block->setPrevBlockInSeq(mAllBlocksById.at(sequence.getCacheBlockIds(mWindowSize)[beamIdx].back())); } block->setBlockKey(blockKey, isFull); block->setHash(); - sequence.changeCacheBlock(beamIdx, blockIdx, block->getBlockId()); + sequence.changeCacheBlock(mWindowSize, beamIdx, blockIdx, block->getBlockId()); allocatedBlocks.at(blockIdx * beamWidth + beamIdx) = block; } } -std::vector BlockManager::getNewlyAllocatedBlockIds(GenerationRequest const& sequence) const +std::vector BlockManager::getNewlyAllocatedBlockIds( + GenerationRequest const& sequence, SizeType32 windowSize) const +{ + return mWindowBlockManagers.at(windowSize).getNewlyAllocatedBlockIds(sequence); +} + +std::vector WindowBlockManager::getNewlyAllocatedBlockIds(GenerationRequest const& sequence) const { std::vector allocatedBlockIds; - for (auto const& beamBlockIds : sequence.getCacheBlockIds()) + for (auto const& beamBlockIds : sequence.getCacheBlockIds(mWindowSize)) { for (auto const& blockId : beamBlockIds) { @@ -1103,7 +1333,12 @@ std::vector BlockManager::getNewlyAllocatedBlockIds(Genera return allocatedBlockIds; } -void BlockManager::releaseLastBlock(GenerationRequest& sequence) +void BlockManager::releaseLastBlock(GenerationRequest& sequence, SizeType32 windowSize) +{ + mWindowBlockManagers.at(windowSize).releaseLastBlock(sequence); +} + +void WindowBlockManager::releaseLastBlock(GenerationRequest& sequence) { auto const requestId = sequence.getRequestId(); auto& allocatedBlocks = mAllocatedBlocksPerSeq.at(requestId); @@ -1120,10 +1355,10 @@ void BlockManager::releaseLastBlock(GenerationRequest& sequence) // Remove block from allocated blocks allocatedBlocks.pop_back(); // Remove stored block ids in sequence - sequence.removeLastBlock(); + 
sequence.removeLastBlock(mWindowSize); } -[[nodiscard]] SizeType32 BlockManager::getNumFreeBlocks() const noexcept +[[nodiscard]] SizeType32 WindowBlockManager::getNumFreeBlocks() const noexcept { return mEvictionPolicy->getNumFreeBlocks(kPrimaryLevel); } @@ -1135,10 +1370,6 @@ std::deque BlockManager::getLatestEvents(std::optional llmRequest) { - auto const requestId = sequence.getRequestId(); - - // TODO: refactor this method in two: store blocks for reuse and just 'release blocks'. Only the caller - // can know which blocks to store for reuse and which to just release. // When releasing the blocks for a sequence, we store those blocks for potential reuse only if: // - Block reuse is enabled. // - A request was provided to this function call to identify which tokens these blocks cover @@ -1146,23 +1377,39 @@ void BlockManager::releaseBlocks(GenerationRequest& sequence, OptionalRefgetUniqueTokens(beamIdx); - auto const& cacheBlockIds = sequence.getCacheBlockIds(); - - // TODO: get the caller to mark tokens as filled / not filled, so that the kv-cache manager doesn't - // have to guess. Only (length - 1) tokens of the sequence have their kv-state recorded in kv-cache. We assume - // the last token's state is not filled yet. - auto const usableSize = static_cast(uniqueTokens.size()) - 1; - auto blockedUniqueTokens = chopVectorIntoBlocks(uniqueTokens, usableSize, mTokensPerBlock, true); - auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest); - storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx]); + if (storeBlocksForReuse) + { + manager.storeBlocksForReuse(sequence, llmRequest); + } + manager.releaseBlocks(sequence); } +} + +void WindowBlockManager::storeBlocksForReuse(GenerationRequest& sequence, OptionalRef llmRequest) +{ + auto constexpr beamIdx = 0; + auto const& uniqueTokens = llmRequest->getUniqueTokens(beamIdx); + auto const& cacheBlockIds = sequence.getCacheBlockIds(mWindowSize); + + // TODO: get the caller to mark tokens as filled / not filled, so that the kv-cache manager doesn't + // have to guess. Only (length - 1) tokens of the sequence have their kv-state recorded in kv-cache. We assume + // the last token's state is not filled yet. 
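storeBlocksForReuse stores only the first (length - 1) tokens, chopped into tokensPerBlock-sized chunks with the partial tail kept. A minimal sketch of what such a chopping helper could look like follows; the real chopVectorIntoBlocks lives elsewhere in the codebase, so this stand-in and its signature are assumptions for illustration.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical stand-in: split the first usableSize tokens into
// tokensPerBlock-sized chunks, optionally keeping the partial tail chunk.
std::vector<std::vector<int>> chopVectorIntoBlocks(
    std::vector<int> const& tokens, size_t usableSize, size_t tokensPerBlock, bool keepPartial)
{
    std::vector<std::vector<int>> blocks;
    for (size_t begin = 0; begin < usableSize; begin += tokensPerBlock)
    {
        size_t const end = std::min(begin + tokensPerBlock, usableSize);
        if (end - begin == tokensPerBlock || keepPartial)
        {
            blocks.emplace_back(tokens.begin() + begin, tokens.begin() + end);
        }
    }
    return blocks;
}

int main()
{
    std::vector<int> tokens(11);
    for (int i = 0; i < 11; ++i)
    {
        tokens[i] = i;
    }
    // Only (length - 1) tokens have their kv-state recorded, hence size() - 1.
    auto const blocks = chopVectorIntoBlocks(tokens, tokens.size() - 1, 4, /*keepPartial=*/true);
    std::printf("%zu blocks\n", blocks.size()); // 3 blocks: sizes 4, 4, 2
    return 0;
}
```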
+ auto const usableSize = static_cast(uniqueTokens.size()) - 1; + auto blockedUniqueTokens = chopVectorIntoBlocks(uniqueTokens, usableSize, mTokensPerBlock, true); + auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest); + storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx]); +} + +void WindowBlockManager::releaseBlocks(GenerationRequest& sequence) +{ + auto const requestId = sequence.getRequestId(); auto node = mAllocatedBlocksPerSeq.extract(requestId); + TLLM_CHECK(node); auto& allocatedBlocks = node.mapped(); for (auto it = allocatedBlocks.rbegin(); it != allocatedBlocks.rend(); ++it) { @@ -1177,10 +1424,18 @@ void BlockManager::releaseBlocks(GenerationRequest& sequence, OptionalRef const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, int64_t stream, - std::optional maxSequenceLength, bool enableBlockReuse, bool onboardBlocks, - CacheType cacheType, bool enablePartialReuse, bool copyOnPartialReuse) + std::optional const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, int64_t stream, std::optional maxSequenceLength, + bool enableBlockReuse, bool onboardBlocks, CacheType cacheType, bool enablePartialReuse, bool copyOnPartialReuse) : KVCacheManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, temporaryAttentionWindow, + blocksInSecondaryPool, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength, std::make_shared(reinterpret_cast(stream)), maxSequenceLength, enableBlockReuse, onboardBlocks, cacheType, std::nullopt, nullptr, false, enablePartialReuse, copyOnPartialReuse) @@ -1211,102 +1466,89 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size KVCacheManager::KVCacheManager(std::vector const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, int64_t stream, - std::optional maxSequenceLength, bool enableBlockReuse, bool onboardBlocks, - CacheType cacheType, std::optional secondaryOffloadMinPriority, + std::optional const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, int64_t stream, std::optional maxSequenceLength, + bool enableBlockReuse, bool onboardBlocks, CacheType cacheType, + std::optional secondaryOffloadMinPriority, std::shared_ptr eventManager, bool enablePartialReuse, bool copyOnPartialReuse) : KVCacheManager(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, - maxNumSequences, maxBeamWidth, maxAttentionWindowVec, temporaryAttentionWindow, sinkTokenLength, + maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength, std::make_shared(reinterpret_cast(stream)), maxSequenceLength, - enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority, std::move(eventManager), false, - enablePartialReuse, copyOnPartialReuse) + enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority, eventManager, enablePartialReuse, + copyOnPartialReuse) { } KVCacheManager::KVCacheManager(std::vector const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 
blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, CudaStreamPtr stream, - std::optional maxSequenceLength, bool enableBlockReuse, bool onboardBlocks, - CacheType cacheType, std::optional secondaryOffloadMinPriority, + std::optional const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional maxSequenceLength, + bool enableBlockReuse, bool onboardBlocks, CacheType cacheType, + std::optional secondaryOffloadMinPriority, std::shared_ptr eventManager, bool enableHashKey, bool enablePartialReuse, bool copyOnPartialReuse) : mMaxBeamWidth(maxBeamWidth) + , mDataType(dtype) , mMaxAttentionWindow(*std::max_element(maxAttentionWindowVec.begin(), maxAttentionWindowVec.end())) - , mMinAttentionWindow(*std::min_element(maxAttentionWindowVec.begin(), maxAttentionWindowVec.end())) - , mTemporaryAttentionWindow(temporaryAttentionWindow) , mTokensPerBlock(tokensPerBlock) , mSinkBubbleLength(BaseKVCacheManager::getSinkBubbleLength(sinkTokenLength, tokensPerBlock)) , mSinkBlockTokenLength(mSinkBubbleLength + sinkTokenLength) , mBlockManager(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, - maxNumSequences, std::move(stream), onboardBlocks, cacheType, secondaryOffloadMinPriority, + maxNumSequences, std::move(stream), maxSequenceLength, maxBeamWidth, maxAttentionWindowVec, + tempAttentionWindowInputs, dtype, mSinkBubbleLength, onboardBlocks, cacheType, secondaryOffloadMinPriority, std::move(eventManager), enableHashKey, enablePartialReuse, copyOnPartialReuse) // disable block reuse for sink bubble since chopVectorIntoBlocks does not match KV cache blocks in this case , mEnableBlockReuse{mSinkBubbleLength > 0 ? false : enableBlockReuse} , mEnableHashKey{enableHashKey} { + TLLM_CHECK_DEBUG(std::find(maxAttentionWindowVec.begin(), maxAttentionWindowVec.end(), mMaxAttentionWindow) + != maxAttentionWindowVec.end()); // The sink tokens are stored in blocks separate from other tokens. // If the last block of sink tokens is only partially filled, // we fill that block with a "bubble" to reach the number of tokens per block. TLLM_CHECK(mSinkBlockTokenLength % tokensPerBlock == 0); - - mMaxTokenNum = mMaxAttentionWindow + mSinkBubbleLength; - - // If maxBeamWidth > 1, use one more block for each sequence in the paged kv cache to avoid dropping the needed - // tokens, when enabling cyclic kv cache. - mUseOneMoreBlock - = maxSequenceLength.has_value() && maxSequenceLength.value() > mMaxAttentionWindow && maxBeamWidth > 1; - TLLM_CHECK_WITH_INFO(!mUseOneMoreBlock || mTemporaryAttentionWindow == 0, - "Can't support sliding window attention, paged context fmha, and beam search are used together."); - if (mUseOneMoreBlock) - { - mMaxTokenNum += tokensPerBlock; - } - - // Consider the mTemporaryAttentionWindow when allocating blocks. - mMaxBlocksPerSeq = tc::ceilDiv(mMaxTokenNum + mTemporaryAttentionWindow, tokensPerBlock); - TLLM_LOG_DEBUG("KV cache block reuse is %s", mEnableBlockReuse ? 
"enabled" : "disabled"); - TLLM_LOG_DEBUG("Max KV cache pages per sequence: %d", mMaxBlocksPerSeq); mSequences.reserve(maxNumSequences); } KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, std::vector const& maxAttentionWindowVec, - SizeType32 temporaryAttentionWindow, SizeType32 sinkTokenLength, CudaStreamPtr stream, - std::optional maxSequenceLength, bool enableBlockReuse, bool onboardBlocks, - CacheType cacheType, std::optional secondaryOffloadMinPriority, + std::optional const& tempAttentionWindowInputs, nvinfer1::DataType dtype, + SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional maxSequenceLength, + bool enableBlockReuse, bool onboardBlocks, CacheType cacheType, + std::optional secondaryOffloadMinPriority, std::shared_ptr eventManager, bool enableHashKey, bool enablePartialReuse, bool copyOnPartialReuse) : KVCacheManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, temporaryAttentionWindow, + blocksInSecondaryPool, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength, std::move(stream), maxSequenceLength, enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority, std::move(eventManager), enableHashKey, enablePartialReuse, copyOnPartialReuse) { } -void KVCacheManager::allocatePools(nvinfer1::DataType dtype, bool useUvm) +void KVCacheManager::allocatePools(bool useUvm) { - mBlockManager.allocatePools(dtype, useUvm); + mBlockManager.allocatePools(useUvm); + auto const numPools = mBlockManager.getNumPools(); if (tc::Logger::getLogger()->getLevel() == tc::Logger::INFO) { uint64_t cacheSizeBytes = 0; - auto const numPools = mBlockManager.getNumPools(); for (SizeType32 poolIdx = 0; poolIdx < numPools; poolIdx++) { auto const cacheShape = mBlockManager.getPrimaryPool(poolIdx)->getShape(); auto const cacheVolume = ITensor::volume(cacheShape); #ifdef ENABLE_FP4 - auto const isFp4 = dtype == nvinfer1::DataType::kFP4; + auto const isFp4 = mDataType == nvinfer1::DataType::kFP4; #else auto const isFp4 = false; #endif if (!isFp4) { - cacheSizeBytes += cacheVolume * BufferDataType(dtype).getSize(); + cacheSizeBytes += cacheVolume * BufferDataType(mDataType).getSize(); } else { @@ -1320,7 +1562,6 @@ void KVCacheManager::allocatePools(nvinfer1::DataType dtype, bool useUvm) cacheSizeBytes / static_cast(1 << 30), maxNumTokens); } - auto const numPools = mBlockManager.getNumPools(); auto const numKVPools = mBlockManager.getNumPools(/*include_block_scalar_pools=*/false); auto const numBlockScalePools = numPools - numKVPools; @@ -1336,13 +1577,12 @@ void KVCacheManager::allocatePools(nvinfer1::DataType dtype, bool useUvm) for (SizeType32 poolIdx = 0; poolIdx < numPools; poolIdx++) { - auto const isBlockScale = mBlockManager.containsBlockScales(poolIdx); - auto& outIdx = isBlockScale ? blockScalePoolIdx : kvPoolIdx; - auto& outRange = isBlockScale ? blockScalePtrsRange : poolPtrsRange; + auto const& pool = mBlockManager.getPool(poolIdx); + auto& outIdx = pool.containsBlockScales ? blockScalePoolIdx : kvPoolIdx; + auto& outRange = pool.containsBlockScales ? 
blockScalePtrsRange : poolPtrsRange; - outRange[outIdx * 2] = mBlockManager.getPrimaryPool(poolIdx)->data(); - auto secondaryPool = mBlockManager.getSecondaryPool(poolIdx); - outRange[outIdx * 2 + 1] = secondaryPool ? secondaryPool->data() : nullptr; + outRange[outIdx * 2] = pool.primaryPtr->data(); + outRange[outIdx * 2 + 1] = pool.secondaryPtr ? pool.secondaryPtr->data() : nullptr; ++outIdx; } @@ -1368,7 +1608,8 @@ void KVCacheManager::startScheduling() mBlockManager.startScheduling(); } -SizeType32 KVCacheManager::getNeededBlocksOneStep(LlmRequest const& req, bool twoStepsLookAhead) const +SizeType32 KVCacheManager::getNeededBlocksOneStep( + LlmRequest const& req, bool twoStepsLookAhead, SizeType32 windowSize) const { SizeType32 numRequiredBlocks = 0; SizeType32 const numDraftTokens = req.getNumDraftTokens(); @@ -1380,8 +1621,7 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(LlmRequest const& req, bool tw { // Assumes shared among beam = True auto const promptCacheLen - = std::min((isCrossKv() ? req.getEncoderOutputLen() : req.mPromptLen) + numDraftTokensPerStep, - mMaxAttentionWindow) + = std::min((isCrossKv() ? req.getEncoderOutputLen() : req.mPromptLen) + numDraftTokensPerStep, windowSize) + mSinkBubbleLength; auto const numSharedBlocks = promptCacheLen / getTokensPerBlock(); auto const numUnSharedTokens = promptCacheLen % getTokensPerBlock(); @@ -1401,7 +1641,7 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(LlmRequest const& req, bool tw auto const numPastTokens = req.mPromptLen + generatedTokens + mSinkBubbleLength - 1; auto const numNextTokens = numPastTokens + (twoStepsLookAhead ? 2 : 1) * numTokensPerStep; - if (numNextTokens > mMaxTokenNum) + if (numNextTokens > mBlockManager.getWindowSizeMetadata(windowSize).maxTokenNum) { return 0; } @@ -1413,8 +1653,9 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(LlmRequest const& req, bool tw return numRequiredBlocks; } -SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req) const +SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req, SizeType32 windowSize) const { + if (isCrossKv()) { if (req.isContextInitState() && req.getContextCurrentPosition() == 0) @@ -1424,14 +1665,15 @@ SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req) return 0; // cross KV cache doesn't grow after the initial context phase } + + auto const temporaryAttentionWindow = mBlockManager.getWindowSizeMetadata(windowSize).temporaryAttentionWindow; + SizeType32 const numContextBlocks - = (std::min(req.mPromptLen, mMaxAttentionWindow + mTemporaryAttentionWindow) + mSinkBubbleLength) - / getTokensPerBlock(); + = (std::min(req.mPromptLen, windowSize + temporaryAttentionWindow) + mSinkBubbleLength) / getTokensPerBlock(); - SizeType32 const numTotalBlocksPerBeam - = tc::ceilDiv(std::min(req.mPromptLen + req.mMaxNewTokens, mMaxAttentionWindow + mTemporaryAttentionWindow) - + mSinkBubbleLength, - getTokensPerBlock()); + SizeType32 const numTotalBlocksPerBeam = tc::ceilDiv( + std::min(req.mPromptLen + req.mMaxNewTokens, windowSize + temporaryAttentionWindow) + mSinkBubbleLength, + getTokensPerBlock()); SizeType32 const numGenBlocksPerBeam = numTotalBlocksPerBeam - numContextBlocks; @@ -1442,7 +1684,7 @@ SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req) if (seqIt != mSequences.end()) { auto const& seq = seqIt->second; - numAllocBlocksPerBeam = seq.getCacheBlockIds().at(0).size(); + numAllocBlocksPerBeam = seq.getCacheBlockIds(windowSize).at(0).size(); } 
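To make the per-window arithmetic of getRemainingBlocksToCompletion concrete, here is the computation on made-up numbers (tokensPerBlock=64, no sink bubble, no temporary attention window); a sketch, not the member function itself.

```cpp
#include <algorithm>
#include <cstdio>

int ceilDiv(int a, int b)
{
    return (a + b - 1) / b;
}

int main()
{
    // Made-up sizes for illustration only.
    int const tokensPerBlock = 64, sinkBubbleLength = 0;
    int const windowSize = 4096, temporaryAttentionWindow = 0;
    int const promptLen = 1000, maxNewTokens = 200, beamWidth = 1;
    int const numAllocBlocksPerBeam = 10; // blocks the sequence already holds

    // Context blocks: prompt clamped to the window, rounded down.
    int const numContextBlocks
        = (std::min(promptLen, windowSize + temporaryAttentionWindow) + sinkBubbleLength) / tokensPerBlock;
    // Total blocks per beam: prompt plus generation, clamped to the window, rounded up.
    int const numTotalBlocksPerBeam = ceilDiv(
        std::min(promptLen + maxNewTokens, windowSize + temporaryAttentionWindow) + sinkBubbleLength, tokensPerBlock);
    int const remaining = (numTotalBlocksPerBeam - numAllocBlocksPerBeam) * beamWidth;
    std::printf("context=%d total=%d remaining=%d\n", numContextBlocks, numTotalBlocksPerBeam, remaining);
    // prints: context=15 total=19 remaining=9
    return 0;
}
```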
} @@ -1454,10 +1696,10 @@ SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req) return (numTotalBlocksPerBeam - numAllocBlocksPerBeam) * req.mSamplingConfig.beamWidth; } -void KVCacheManager::cacheBlockOffsets(GenerationRequest& sequence) +void KVCacheManager::cacheBlockOffsets(GenerationRequest& sequence, SizeType32 windowSize) { - auto const& cacheBlocks = sequence.getCacheBlockIds(); - auto& cacheBlocksTensor = sequence.getCacheBlockIndices(); + auto const& cacheBlocks = sequence.getCacheBlockIds(windowSize); + auto& cacheBlocksTensor = sequence.getCacheBlockIndices(windowSize); auto const beamWidth = sequence.getBeamWidth(); auto* offsetsPtr = bufferCast(cacheBlocksTensor); @@ -1469,15 +1711,15 @@ void KVCacheManager::cacheBlockOffsets(GenerationRequest& sequence) for (SizeType32 blockIdx = 0; blockIdx < static_cast(beamCacheBlock.size()); ++blockIdx) { auto const blockId = beamCacheBlock.at(blockIdx); - setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId); + mBlockManager.setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId, windowSize); } } } -void KVCacheManager::cacheNewBlockOffsets(GenerationRequest& sequence) +void KVCacheManager::cacheNewBlockOffsets(GenerationRequest& sequence, SizeType32 windowSize) { - auto const& cacheBlocks = sequence.getCacheBlockIds(); - auto& cacheBlocksTensor = sequence.getCacheBlockIndices(); + auto const& cacheBlocks = sequence.getCacheBlockIds(windowSize); + auto& cacheBlocksTensor = sequence.getCacheBlockIndices(windowSize); auto const beamWidth = sequence.getBeamWidth(); auto* offsetsPtr = bufferCast(cacheBlocksTensor); @@ -1488,14 +1730,14 @@ void KVCacheManager::cacheNewBlockOffsets(GenerationRequest& sequence) auto const& beamCacheBlock = cacheBlocks[beamIdx]; auto const blockId = beamCacheBlock.back(); auto const blockIdx = static_cast(beamCacheBlock.size() - 1); - setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId); + mBlockManager.setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId, windowSize); } } -void KVCacheManager::updateNewBlockPointer(GenerationRequest& sequence, SizeType32 blockIdx) +void KVCacheManager::updateNewBlockPointer(GenerationRequest& sequence, SizeType32 windowSize, SizeType32 blockIdx) { - auto const& cacheBlocks = sequence.getCacheBlockIds(); - auto& cacheBlocksTensor = sequence.getCacheBlockIndices(); + auto const& cacheBlocks = sequence.getCacheBlockIds(windowSize); + auto& cacheBlocksTensor = sequence.getCacheBlockIndices(windowSize); auto const beamWidth = sequence.getBeamWidth(); auto* offsetsPtr = bufferCast(cacheBlocksTensor); @@ -1505,7 +1747,7 @@ void KVCacheManager::updateNewBlockPointer(GenerationRequest& sequence, SizeType { auto const& beamCacheBlock = cacheBlocks[beamIdx]; auto const blockId = beamCacheBlock.at(blockIdx); - setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId); + mBlockManager.setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId, windowSize); } } @@ -1529,39 +1771,43 @@ void KVCacheManager::updateToken(GenerationRequest& sequence, bool addToken) std::swap(currNumTokens, newNumTokens); } - SizeType32 const cyclicTokenNum = mMaxTokenNum - mSinkBlockTokenLength; - SizeType32 const nextTokenIdxInCycle = (currNumTokens - mSinkBlockTokenLength) % cyclicTokenNum; - SizeType32 const nextTokenIdxInCache = mSinkBlockTokenLength + nextTokenIdxInCycle; + for (auto const [windowSize, metadata] : mBlockManager.getWindowSizesMetadata()) + { + auto const maxTokenNum = metadata.maxTokenNum; + SizeType32 const 
cyclicTokenNum = maxTokenNum - mSinkBlockTokenLength; + SizeType32 const nextTokenIdxInCycle = (currNumTokens - mSinkBlockTokenLength) % cyclicTokenNum; + SizeType32 const nextTokenIdxInCache = mSinkBlockTokenLength + nextTokenIdxInCycle; - // (nextTokenIdxInCache - mSinkBlockTokenLength) % cyclicTokenNum == 0) - // <=> nextTokenIdxInCycle == 0 - // <=> nextTokenIdxInCache == mSinkBlockTokenLength - // => nextTokenIdxInCache % getTokensPerBlock() == 0 + // (nextTokenIdxInCache - mSinkBlockTokenLength) % cyclicTokenNum == 0) + // <=> nextTokenIdxInCycle == 0 + // <=> nextTokenIdxInCache == mSinkBlockTokenLength + // => nextTokenIdxInCache % getTokensPerBlock() == 0 - // Check if require a new block - if (nextTokenIdxInCache % getTokensPerBlock() == 0) - { - if (newNumTokens <= mMaxTokenNum) + // Check if require a new block + if (nextTokenIdxInCache % getTokensPerBlock() == 0) { - if (addToken) + if (newNumTokens <= maxTokenNum) { - mBlockManager.allocateBlock(sequence); - cacheNewBlockOffsets(sequence); + if (addToken) + { + mBlockManager.allocateBlock(sequence, windowSize); + cacheNewBlockOffsets(sequence, windowSize); + } + else + { + mBlockManager.releaseLastBlock(sequence, windowSize); + } } - else + else if (sequence.getBeamWidth() > 1) { - mBlockManager.releaseLastBlock(sequence); + TLLM_CHECK_WITH_INFO(addToken, "Remove token is not supported with beam search"); + // Get next block index + SizeType32 nextBlockIdx = nextTokenIdxInCache / getTokensPerBlock(); + // Replace the shared block with the unshared ones + mBlockManager.replaceSharedBlock(sequence, windowSize, nextBlockIdx); + updateNewBlockPointer(sequence, windowSize, nextBlockIdx); } } - else if (sequence.getBeamWidth() > 1 || mEnableBlockReuse) - { - TLLM_CHECK_WITH_INFO(addToken, "Remove token is not supported with beam search"); - // Get next block index - SizeType32 nextBlockIdx = nextTokenIdxInCache / getTokensPerBlock(); - // Replace the shared block with the unshared ones - mBlockManager.replaceSharedBlock(sequence, nextBlockIdx); - updateNewBlockPointer(sequence, nextBlockIdx); - } } } @@ -1591,78 +1837,83 @@ void KVCacheManager::addSequence( auto const [seqIt, emplaceDone] = [&] { auto lck = std::scoped_lock(mSequencesMtx); - return mSequences.emplace(requestId, - GenerationRequest(requestId, inputLength, beamWidth, mMaxBlocksPerSeq, - mMinAttentionWindow + mSinkBubbleLength, // When the request has started cycling, disable reuse. - mBlockManager.getNumPools(), kvCacheRetentionConfig)); + return mSequences.try_emplace(requestId, requestId, inputLength, beamWidth, + mBlockManager.getWindowSizesMetadata(), kvCacheRetentionConfig); }(); TLLM_CHECK(emplaceDone); auto& sequence = seqIt->second; - // Get the final token index in kv cache - SizeType32 const finalTokenKVIdx - = mSinkBlockTokenLength + ((inputLength - 1 - mSinkBlockTokenLength) % (mMaxTokenNum - mSinkBlockTokenLength)); - - // Get block index that with shareAmongBeams=False. - // For cross kv cache in encoder-decoder models, always shareAmongBeams=True. - SizeType32 unsharedBlockIdx = -1; - if ((!sequence.isCyclic() || beamWidth > 1 || finalTokenKVIdx % getTokensPerBlock() > 0) && !isCrossKv()) - { - unsharedBlockIdx = ((finalTokenKVIdx + 1) % getTokensPerBlock() == 0) - ? finalTokenKVIdx / getTokensPerBlock() + 1 - : finalTokenKVIdx / getTokensPerBlock(); - } - - // Consider the mTemporaryAttentionWindow when allocating blocks. 
- inputLength = std::min(inputLength, mMaxTokenNum + mTemporaryAttentionWindow); - auto const numContextBlocks = tc::ceilDiv(inputLength, getTokensPerBlock()); - // Get statistics for block allocations/reuse pre request. SizeType32 const numAllocTotalBlocksPreRequest = mBlockManager.getNumAllocTotalBlocks(); SizeType32 const numAllocNewBlocksPreRequest = mBlockManager.getNumAllocNewBlocks(); SizeType32 const numReusedBlocksPreRequest = mBlockManager.getNumReusedBlocks(); SizeType32 const numMissedBlocksPreRequest = mBlockManager.getNumMissedBlocks(); - if (!sequence.isCyclic() && mEnableBlockReuse) - { - mBlockManager.addSequence(sequence, inputLength, numContextBlocks, *llmRequest); - } - else + for (auto const [windowSize, metadata] : mBlockManager.getWindowSizesMetadata()) { - if (!mEnableBlockReuse && llmRequest && llmRequest->getKvCacheRetentionConfig().has_value()) + auto const maxTokenNum = metadata.maxTokenNum; + auto const temporaryAttentionWindow = metadata.temporaryAttentionWindow; + + // Get the final token index in kv cache + SizeType32 const finalTokenKVIdx = mSinkBlockTokenLength + + ((inputLength - 1 - mSinkBlockTokenLength) % (maxTokenNum - mSinkBlockTokenLength)); + + // Get block index that with shareAmongBeams=False. + // For cross kv cache in encoder-decoder models, always shareAmongBeams=True. + SizeType32 unsharedBlockIdx = -1; + if ((!sequence.isCyclic() || beamWidth > 1 || finalTokenKVIdx % getTokensPerBlock() > 0) && !isCrossKv()) { - TLLM_LOG_WARNING( - "Request %d has a retention configuration set, but block reuse is disabled. The retention config will " - "have no effect.", - llmRequest->mRequestId); + unsharedBlockIdx = ((finalTokenKVIdx + 1) % getTokensPerBlock() == 0) + ? finalTokenKVIdx / getTokensPerBlock() + 1 + : finalTokenKVIdx / getTokensPerBlock(); } - mBlockManager.addSequence(sequence, numContextBlocks, unsharedBlockIdx); - if (mEnableHashKey && llmRequest.has_value() && beamWidth == 1) + + // Consider the temporaryAttentionWindow when allocating blocks. + auto const effectiveInputLength = std::min(inputLength, maxTokenNum + temporaryAttentionWindow); + auto const numContextBlocks = tc::ceilDiv(effectiveInputLength, getTokensPerBlock()); + if (!sequence.isCyclic() && mEnableBlockReuse) { - constexpr SizeType32 beamIdx = 0; - auto const& blockIds = sequence.getCacheBlockIds().at(beamIdx); - auto const& uniqueTokens = llmRequest->getUniqueTokens(beamIdx); - auto blockedUniqueTokens - = chopVectorIntoBlocks(uniqueTokens, uniqueTokens.size() - 1, getTokensPerBlock(), true); - auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest); - auto tokensPerBlock = static_cast(getTokensPerBlock()); - for (size_t i = 0; i < blockIds.size(); i++) + mBlockManager.addSequence(sequence, effectiveInputLength, numContextBlocks, *llmRequest, windowSize); + } + else + { + if (!mEnableBlockReuse && llmRequest && llmRequest->getKvCacheRetentionConfig().has_value()) { - auto const& block = mBlockManager.getBlockById(blockIds[i]); - if (i < blockKeys.size()) - { - block->setBlockKey(blockKeys[i], blockKeys[i].uniqueTokens.size() == tokensPerBlock); - } - else + TLLM_LOG_WARNING( + "Request %d has a retention configuration set, but block reuse is disabled. 
The retention " + "config " + "will " + "have no effect.", + llmRequest->mRequestId); + } + mBlockManager.addSequence(sequence, numContextBlocks, unsharedBlockIdx, windowSize); + if (mEnableHashKey && llmRequest.has_value() && beamWidth == 1) + { + constexpr SizeType32 beamIdx = 0; + auto const& blockIds = sequence.getCacheBlockIds(windowSize).at(beamIdx); + auto const& uniqueTokens = llmRequest->getUniqueTokens(beamIdx); + auto blockedUniqueTokens = chopVectorIntoBlocks( + uniqueTokens, uniqueTokens.size() - 1, getTokensPerBlock(), true); + auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest); + auto tokensPerBlock = static_cast(getTokensPerBlock()); + for (size_t i = 0; i < blockIds.size(); i++) { - block->setBlockKey({}, false); + auto const& block = mBlockManager.getBlockById(blockIds[i], windowSize); + if (i < blockKeys.size()) + { + block->setBlockKey(blockKeys[i], blockKeys[i].uniqueTokens.size() == tokensPerBlock); + } + else + { + block->setBlockKey({}, false); + } + block->setHash(); + mBlockManager.addBlockToHashMap(block, windowSize); } - block->setHash(); - mBlockManager.addBlockToHashMap(block); } } + cacheBlockOffsets(sequence, windowSize); } - cacheBlockOffsets(sequence); if (llmRequest) { @@ -1702,7 +1953,7 @@ void KVCacheManager::removeSequence(RequestIdType requestId, OptionalRef(output); - auto const* srcPtr = bufferCast(cacheBlocksTensor); auto const& dstShape = output.getShape(); - auto const& srcShape = cacheBlocksTensor.getShape(); SizeType32 constexpr kIdx = 0; SizeType32 constexpr vIdx = 1; SizeType32 maxBlockCount{0}; // Get page table for each KV cache pool - auto const numPools = mBlockManager.getNumPools(); - - for (SizeType32 poolIdx = 0; poolIdx < numPools; poolIdx++) - { - for (SizeType32 beamIdx = 0; beamIdx < beamWidth; ++beamIdx) + SizeType32 absolutePoolIdx = 0; + for (auto const [windowSize, metadata] : mBlockManager.getWindowSizesMetadata()) + { + auto const& cacheBlocksTensor = sequence.getCacheBlockIndices(windowSize); + auto const* srcPtr = bufferCast(cacheBlocksTensor); + auto const& srcShape = cacheBlocksTensor.getShape(); + auto const& cacheBlockIds = sequence.getCacheBlockIds(windowSize); + for (SizeType32 poolIdx = 0; poolIdx < metadata.numPools; poolIdx++, absolutePoolIdx++) { - auto const beamBlockCount = sequence.getCacheBlockIds()[beamIdx].size(); - auto const copyChunkSize = beamBlockCount * sizeof(tk::KVCacheIndex); - for (auto xIdx : {kIdx, vIdx}) + for (SizeType32 beamIdx = 0; beamIdx < beamWidth; ++beamIdx) { - auto const srcIndex = tc::flat_index(srcShape.d, poolIdx, beamIdx, xIdx, 0); - auto const dstIndex = tc::flat_index(dstShape.d, poolIdx, outputSlotOffset + beamIdx, xIdx, 0); - std::memcpy(dstPtr + dstIndex, srcPtr + srcIndex, copyChunkSize); + auto const beamBlockCount = cacheBlockIds[beamIdx].size(); + auto const copyChunkSize = beamBlockCount * sizeof(tk::KVCacheIndex); + for (auto xIdx : {kIdx, vIdx}) + { + auto const srcIndex = tc::flat_index(srcShape.d, poolIdx, beamIdx, xIdx, 0); + auto const dstIndex + = tc::flat_index(dstShape.d, absolutePoolIdx, outputSlotOffset + beamIdx, xIdx, 0); + std::memcpy(dstPtr + dstIndex, srcPtr + srcIndex, copyChunkSize); + } + maxBlockCount = std::max(maxBlockCount, static_cast(beamBlockCount)); } - maxBlockCount = std::max(maxBlockCount, static_cast(beamBlockCount)); } } return maxBlockCount; @@ -1860,32 +2115,29 @@ SizeType32 BaseKVCacheManager::getSinkBubbleLength(SizeType32 sinkTokenLen, Size return sinkBubbleLength; } -bool 
KVCacheManager::schedulingHasFreeBlocks(SizeType32 numRequired) const -{ - return mBlockManager.schedulingHasFreeBlocks(numRequired); -} - -std::vector> const& KVCacheManager::getCacheBlockIds(RequestIdType requestId) const +std::vector> const& KVCacheManager::getCacheBlockIds( + RequestIdType requestId, SizeType32 windowSize) const { - return getSequence(requestId).getCacheBlockIds(); + return getSequence(requestId).getCacheBlockIds(windowSize); } std::vector>> KVCacheManager::getBatchCacheBlockIds( - std::vector const& requestIds) const + std::vector const& requestIds, SizeType32 windowSize) const { std::vector>> result{}; result.reserve(requestIds.size()); for (auto const& requestId : requestIds) { auto const& sequence = getSequence(requestId); - result.emplace_back(sequence.getCacheBlockIds()); + result.emplace_back(sequence.getCacheBlockIds(windowSize)); } return result; } -std::vector KVCacheManager::getNewlyAllocatedBlockIds(LlmRequest::RequestIdType requestId) const +std::vector KVCacheManager::getNewlyAllocatedBlockIds( + LlmRequest::RequestIdType requestId, SizeType32 windowSize) const { - return mBlockManager.getNewlyAllocatedBlockIds(mSequences.at(requestId)); + return mBlockManager.getNewlyAllocatedBlockIds(mSequences.at(requestId), windowSize); } runtime::ITensor::SharedPtr KVCacheManager::getPrimaryPool(SizeType32 layer_idx) const @@ -1895,7 +2147,7 @@ runtime::ITensor::SharedPtr KVCacheManager::getPrimaryPool(SizeType32 layer_idx) SizeType32 KVCacheManager::getMaxCapacityBatchSize(SizeType32 inputLength, SizeType32 outputLength) const { - + // TODO(nhaber): mMaxAttentionWindow -> Check this call auto const blockRequirementsPerSequence = KVCacheManager::calculateMaxBlockRequirements( inputLength, outputLength, mSinkBlockTokenLength, mMaxAttentionWindow, mMaxBeamWidth, mTokensPerBlock); @@ -1922,8 +2174,8 @@ SizeType32 KVCacheManager::calculateMaxBlockRequirements(SizeType32 inputLength, wholeSequenceLength, sinkTokenLength, maxAttentionWindow, tokensPerBlock); } - // If the whole attention window can fit in the output, then we can simply multiply the cost of a sequence of length - // max attention window by the beam width. + // If the whole attention window can fit in the output, then we can simply multiply the cost of a sequence of + // length max attention window by the beam width. if (maxAttentionWindow <= outputLength) { return KVCacheManager::calculateMaxBlockRequirementsPerBeam( @@ -1955,9 +2207,9 @@ SizeType32 KVCacheManager::calculateMaxBlockRequirements(SizeType32 inputLength, return (blockCapacity / beamWidth) * tokensPerBlock; } - // Otherwise, we need to determine how many context tokens we can fit on top of the output tokens. First, there are - // a few context tokens we might be able to fit 'for free' because the output is not a multiple of the number of - // tokens per block. + // Otherwise, we need to determine how many context tokens we can fit on top of the output tokens. First, there + // are a few context tokens we might be able to fit 'for free' because the output is not a multiple of the + // number of tokens per block. 
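// Worked example (hypothetical values, not taken from this change): with
// tokensPerBlock = 64, blockCapacity = 20, outputLength = 100 and, say,
// outputBlockRequirements = 2 (two blocks hold the 100 output tokens with 28
// slots of slack), the computation below gives
//   leftoverBlockCapacity = 20 - 2 = 18
//   result = min(100 + 18 * 64, inputLength + 100) = min(1252, inputLength + 100)
// so up to 1152 context tokens fit on top of the output for this capacity.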
auto const leftoverBlockCapacity = blockCapacity - outputBlockRequirements; return std::min(outputLength + leftoverBlockCapacity * tokensPerBlock, inputLength + outputLength); } diff --git a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp index 329147b4d0c..78badb9bd55 100644 --- a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp @@ -107,7 +107,7 @@ void MLACacheFormatter::formatOutput(LlmRequest const& llmRequest, constexpr SizeType32 beam{0}; auto const numPools = mCacheManager->getBlockManager().getNumPools(); - auto blockRange = BlockRange(*mCacheManager, llmRequest.mRequestId, beam); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mCacheManager, llmRequest.mRequestId, beam); int blockNum = 0; std::vector inputKvCacheBlocks; @@ -121,7 +121,7 @@ void MLACacheFormatter::formatOutput(LlmRequest const& llmRequest, } } TLLM_CHECK(blockNum > 0); - int deviceId = mCacheManager->getBlockManager().getBufferManager().getStream().getDevice(); + int deviceId = mCacheManager->getBlockManager().getStreamDevice(); if (common::getEnvTryZCopyForKVCacheTransfer() && destConfig.getParallelConfig().mPipelineParallelism == selfConfig.getParallelConfig().mPipelineParallelism) @@ -330,7 +330,7 @@ void MLACacheFormatter::formatInput(LlmRequest const& llmRequest, auto pickUpConnections = pickRecvConnections(connections, selfConfig, selfIdx, destConfig); // diff end constexpr SizeType32 beam{0}; - auto blockRange = BlockRange(*mCacheManager, llmRequest.mRequestId, beam); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mCacheManager, llmRequest.mRequestId, beam); std::vector recvBufferTmps; std::vector outputBuffers; auto const numPools = mCacheManager->getBlockManager().getNumPools(); diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp index b6ad1a1f585..5d3e6620081 100644 --- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp +++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp @@ -200,12 +200,6 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr(optionalParams.guidedDecodingConfig.value(), @@ -299,6 +293,31 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptrgetBlockManager(); + + TLLM_CHECK_WITH_INFO(blockManager.getNumPools() == 1, + "Rewinding KV cache blocks for models with multiple pools is not supported"); + + // Two "redundant" checks given the pool size check above, but those below don't rely on an implementation + // detail I guess. 
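// Concretely: a single pool implies both a single attention window size and a
// uniform KV-head count, so the two checks below are implied by the pool-count
// check above; they are kept because they state the requirement directly
// rather than relying on how pools happen to be laid out.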
+ TLLM_CHECK_WITH_INFO( + !blockManager.isVariableWindow(), "Rewinding KV cache blocks for variable SWA models isn't supported"); + auto const maxBlocksPerSeq = blockManager.getMaxBlockPerSeqWhenSingleWindowSize(); + auto const isUseOneMoreBlock = kv_cache_manager::BlockManager::isUseOneMoreBlock( + getMaxAttentionWindow(), getMaxSequenceLen(), getMaxBeamWidth()); + + // TODO(oargov): VGQA is not supported, assume all layers have the same num_kv_heads + TLLM_CHECK_WITH_INFO( + !blockManager.isVariableGQA(), "Rewinding KV cache blocks for variable GQA models isn't supported"); + auto const numKvHeads = mModelConfig.getNbKvHeads(0); + + mRewindInputs = RewindInputs{maxBlocksPerSeq, isUseOneMoreBlock, numKvHeads}; + } + if (mWorldConfig.isPipelineParallel()) { mAsyncSendWaitThread = std::make_unique( @@ -347,7 +366,7 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptrgetTokensPerBlock(); // If sliding window attention is used, then make sure the unit size aligns with the paged context fmha's kv // step size. - if (getMaxInputLen() > getMaxAttentionWindow()) + if (getMaxInputLen() > getMaxAttentionWindow()) // TODO(nhaber): minAttentionWindow { chunkUnitSize = std::max(/* maxKvStepSizeInFmha */ 256, chunkUnitSize); TLLM_LOG_INFO("ChunkUnitSize is set to %d as sliding window attention is used.", chunkUnitSize); @@ -484,8 +503,7 @@ void TrtGptModelInflightBatching::setupSpeculativeDecodingModule(executor::Decod } } -void TrtGptModelInflightBatching::reshapeKvTensors( - SizeType32 maxBlocksPerSeq, kv_cache_manager::CacheType kvCacheType, SizeType32 numPools) +void TrtGptModelInflightBatching::reshapeKvTensors(OffsetTableDimensions const& dims) { TLLM_CHECK(mBuffers.size() == static_cast(mNumBuffers)); auto const& manager = mRuntime->getBufferManager(); @@ -495,7 +513,7 @@ void TrtGptModelInflightBatching::reshapeKvTensors( // any method that operates on transformerBuffers must distinguish between self and cross cache, because // transformerBuffers is not managed by KVCacheManager same rule applies to kv pool pointers below buffers->transformerBuffers->reshapeKvTensors( - getMaxBatchSize(), mOperatingBeamWidth, maxBlocksPerSeq, kvCacheType, numPools, manager); + getMaxBatchSize(), mOperatingBeamWidth, dims.maxBlocksPerSeq, dims.cacheType, dims.numPools, manager); } } @@ -558,6 +576,7 @@ std::shared_ptr TrtGptModelInflightBatching::c break; } } + // Below assertion should be removed once SWA/VSWA is no longer cyclic. TLLM_CHECK_WITH_INFO( getMaxBeamWidth() == 1 || !enableCyclicKvCache, "Can't support cyclic kv cache with beam search."); @@ -575,23 +594,18 @@ std::shared_ptr TrtGptModelInflightBatching::c adjustMaxAttentionWindow(blocksInPrimaryPool, tokensPerBlock); } - auto const& maxAttentionWindowVec = kvCacheType == KvCacheType::kSELF - ? getMaxAttentionWindowVec() - : std::vector{mModelConfig.getMaxEncoderLen()}; + auto maxAttentionWindowVec = getMaxAttentionWindowVec(); - // Only needed when sliding window attention + paged context fmha are used together. - // In that case, a temporary kv cache buffer with maximum chunk size (maxNumTokens) is needed. - // TODO: There are several things that can be improved later. - // 1. a dynamic temporary kv cache allocation based on real chunk size might be needed. - // 2. reuse the same temporary kv cache buffer among all layers in the same pool. 
- SizeType32 temporaryKvCacheLength{0}; - if (mModelConfig.getPagedContextFMHA() && (getMaxInputLen() > getMaxAttentionWindow())) + if (kvCacheType != KvCacheType::kSELF) // TODO(nhaber): more foolproof way of initing cross-kvcache-manager { - TLLM_CHECK_WITH_INFO(getMaxNumTokens(), "Max number of tokens is not set in model config."); - temporaryKvCacheLength = std::min(getMaxNumTokens().value(), getMaxInputLen() - getMaxAttentionWindow()); - TLLM_LOG_INFO("TemporaryKvCacheLength for sliding window attention: %d", temporaryKvCacheLength); + maxAttentionWindowVec = std::vector{mModelConfig.getMaxEncoderLen()}; } + kv_cache_manager::TempAttentionWindowInputs tempAttentionWindowInputs; + tempAttentionWindowInputs.pagedContextFMHA = mModelConfig.getPagedContextFMHA(); + tempAttentionWindowInputs.maxInputLen = getMaxInputLen(); + tempAttentionWindowInputs.maxNumTokens = getMaxNumTokens().value(); + if (kvCacheType == KvCacheType::kCROSS && kvCacheConfig.enableBlockReuse) { TLLM_LOG_INFO( @@ -602,18 +616,16 @@ std::shared_ptr TrtGptModelInflightBatching::c auto kvCacheManager = std::make_shared(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, getMaxNumSequences(), getMaxBeamWidth(), maxAttentionWindowVec, - temporaryKvCacheLength, getSinkTokenLen(), mRuntime->getStreamPtr(), std::nullopt, enableBlockReuse, + tempAttentionWindowInputs, kvDtype, getSinkTokenLen(), mRuntime->getStreamPtr(), std::nullopt, enableBlockReuse, kvCacheConfig.onboardBlocks, kvCacheType, kvCacheConfig.secondaryOffloadMinPriority, kvCacheConfig.eventBufferMaxSize > 0 ? std::make_unique(kvCacheConfig.eventBufferMaxSize) : nullptr, false, kvCacheConfig.enablePartialReuse, kvCacheConfig.copyOnPartialReuse); - auto const& blockManager = kvCacheManager->getBlockManager(); + reshapeKvTensors(kvCacheManager->getOffsetTableDimensions()); - reshapeKvTensors(kvCacheManager->getMaxBlocksPerSeq(), blockManager.getCacheType(), blockManager.getNumPools()); - - kvCacheManager->allocatePools(kvDtype, kvCacheConfig.useUvm); + kvCacheManager->allocatePools(kvCacheConfig.useUvm); TensorMap inputBuffers; TensorPtr poolPointers = kvCacheManager->getBlockPoolPointers(); @@ -1047,11 +1059,20 @@ void TrtGptModelInflightBatching::forwardAsync(RequestList const& activeRequests mMicroBatchId = getNextMicroBatchId(mMicroBatchId); } // In case of error, we need to free the batch slot associated with those requests - catch (std::exception const& e) + catch (std::exception const&) { - for (auto const& llmReq : activeRequests) + try + { + for (auto const& llmReq : activeRequests) + { + terminateRequest(llmReq); + } + } + catch (std::exception const& e) { - terminateRequest(llmReq); + TLLM_LOG_ERROR("forwardAsync catch-all catch block that runs `terminateRequest` has failed with:"); + TLLM_LOG_EXCEPTION(e); + TLLM_LOG_ERROR("Rethrowing *outer* exception:"); } throw; } @@ -2308,8 +2329,6 @@ std::vector> TrtGptModelInflightBatching:: void TrtGptModelInflightBatching::rewindKVCacheBlocks(SizeType32 numSequences) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - TLLM_CHECK_WITH_INFO(mKvCacheManager->getBlockManager().getNumPools() == 1, - "Rewinding KV cache blocks for models with multiple pools is not supported"); auto const bufferId = getFusedBufferId(); auto& runtimeBuffers = *mBuffers.at(bufferId); @@ -2323,13 +2342,9 @@ void TrtGptModelInflightBatching::rewindKVCacheBlocks(SizeType32 numSequences) localNbLayers -= eagleModulePtr->getNumTransformerLayers(); } - // TODO: VGQA is not supported, assume all layers 
have the same num_kv_heads - auto const numKvHeads = mModelConfig.getNbKvHeads(0); auto const tokensPerBlock = mModelConfig.getTokensPerBlock(); auto const elemSize = BufferDataType(mModelConfig.getKvDataType()).getSize(); auto const sizeInBytesPerKVHead = mModelConfig.getSizePerHead() * elemSize; - auto const maxBlocksPerSeq = mKvCacheManager->getMaxBlocksPerSeq(); - auto const useOneMoreBlock = mKvCacheManager->isUseOneMoreBlock(); auto const poolPointers = mKvCacheManager->getBlockPoolPointers(); auto* const* pointerArrayPtr = bufferCast(*poolPointers); @@ -2347,9 +2362,10 @@ void TrtGptModelInflightBatching::rewindKVCacheBlocks(SizeType32 numSequences) tensorrt_llm::runtime::kernels::invokeUpdateKVBlockArrayDraftTokenLocation( *mDecoderBuffers->draftBuffers.acceptedLengthsCumSumDevice, *mDecoderBuffers->draftBuffers.acceptedPackedPathsDevice, *runtimeBuffers.sequenceLengthsDevice, - pointerArrayPtr, offsetArrayPtr, localNbLayers, numSequences, numKvHeads, sizeInBytesPerKVHead, commonRewindLen, - rewindLens, *runtimeBuffers.seqSlotRemappingDevice, *runtimeBuffers.sortedSeqSlots, getMaxAttentionWindow(), - maxBlocksPerSeq, tokensPerBlock, useOneMoreBlock, mRuntime->getStreamPtr()->get()); + pointerArrayPtr, offsetArrayPtr, localNbLayers, numSequences, mRewindInputs.numKvHeads, sizeInBytesPerKVHead, + commonRewindLen, rewindLens, *runtimeBuffers.seqSlotRemappingDevice, *runtimeBuffers.sortedSeqSlots, + getMaxAttentionWindow(), mRewindInputs.maxBlocksPerSeq, tokensPerBlock, mRewindInputs.isUseOneMoreBlock, + mRuntime->getStreamPtr()->get()); sync_check_cuda_error(mRuntime->getStream().get()); @@ -2377,15 +2393,13 @@ void TrtGptModelInflightBatching::changeBeamWidth(SizeType32 beamWidth) if (static_cast(mKvCacheManager)) { - auto const& blockManager = mKvCacheManager->getBlockManager(); - reshapeKvTensors( - mKvCacheManager->getMaxBlocksPerSeq(), blockManager.getCacheType(), blockManager.getNumPools()); + auto const dims = mKvCacheManager->getOffsetTableDimensions(); + reshapeKvTensors(dims); } if (static_cast(mCrossKvCacheManager)) { - auto const& blockManager = mCrossKvCacheManager->getBlockManager(); - reshapeKvTensors( - mCrossKvCacheManager->getMaxBlocksPerSeq(), blockManager.getCacheType(), blockManager.getNumPools()); + auto const dims = mCrossKvCacheManager->getOffsetTableDimensions(); + reshapeKvTensors(dims); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h index 91cf6d76df6..e66e0ff765b 100644 --- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h +++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h @@ -50,6 +50,7 @@ namespace tensorrt_llm::batch_manager namespace kv_cache_manager { class KVCacheManager; +struct OffsetTableDimensions; } // namespace kv_cache_manager namespace rnn_state_manager @@ -86,9 +87,17 @@ namespace utils class CudaGraphExecutorCache; } // namespace utils +struct RewindInputs +{ + SizeType32 maxBlocksPerSeq; + bool isUseOneMoreBlock; + SizeType32 numKvHeads; +}; + class TrtGptModelInflightBatching : public TrtGptModel { using BaseKVCacheManager = kv_cache_manager::BaseKVCacheManager; + using OffsetTableDimensions = kv_cache_manager::OffsetTableDimensions; using KVCacheManager = kv_cache_manager::KVCacheManager; using KvCacheType = kv_cache_manager::CacheType; using KvCacheConfig = kv_cache_manager::KvCacheConfig; @@ -330,7 +339,7 @@ class TrtGptModelInflightBatching : public TrtGptModel 
[[nodiscard]] nvinfer1::Dims getTensorShape(std::string const& name) const override; - void reshapeKvTensors(SizeType32 maxBlocksPerSeq, kv_cache_manager::CacheType kvCacheType, SizeType32 numPools); + void reshapeKvTensors(OffsetTableDimensions const& dims); [[nodiscard]] bool hasSpeculativeDecodingFastLogits() const noexcept override { @@ -533,6 +542,7 @@ class TrtGptModelInflightBatching : public TrtGptModel RequestVector mDraftRequestsWaitingToSendLogits; SizeType32 mSeamlessLADMaxDraftLen{0}; bool mUseSeamlessLookahead{false}; + RewindInputs mRewindInputs; /******************** Algorithms ********************/ // Algorithms are reentrant, they are assigned a state at diff --git a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp index 926d3733f9c..d39ca65db7c 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp @@ -55,9 +55,9 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager { public: // using BaseKVCacheManager::BaseKVCacheManager; // Inherit constructors - void allocatePools(nvinfer1::DataType dtype, bool useUvm = false) override + void allocatePools(bool useUvm = false) override { - PYBIND11_OVERLOAD_PURE(void, tbk::BaseKVCacheManager, allocatePools, dtype, useUvm); + PYBIND11_OVERLOAD_PURE(void, tbk::BaseKVCacheManager, allocatePools, useUvm); } void releasePools() override @@ -90,21 +90,6 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager PYBIND11_OVERLOAD_PURE(tbk::KvCacheStats, tbk::BaseKVCacheManager, getKvCacheStats); } - SizeType32 getMaxBlocksPerSeq() const override - { - PYBIND11_OVERLOAD_PURE(SizeType32, tbk::BaseKVCacheManager, getMaxBlocksPerSeq); - } - - SizeType32 getNeededBlocksOneStep(tb::LlmRequest const& req, bool twoStepsLookAhead) const override - { - PYBIND11_OVERLOAD_PURE(SizeType32, tbk::BaseKVCacheManager, getNeededBlocksOneStep, req, twoStepsLookAhead); - } - - SizeType32 getRemainingBlocksToCompletion(tb::LlmRequest const& req) const override - { - PYBIND11_OVERLOAD_PURE(SizeType32, tbk::BaseKVCacheManager, getRemainingBlocksToCompletion, req); - } - void addToken(tb::LlmRequest::RequestIdType requestId) override { PYBIND11_OVERLOAD_PURE(void, tbk::BaseKVCacheManager, addToken, requestId); @@ -164,11 +149,6 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager PYBIND11_OVERLOAD_PURE(bool, tbk::BaseKVCacheManager, isEnableBlockReuse); } - bool isUseOneMoreBlock() const override - { - PYBIND11_OVERLOAD_PURE(bool, tbk::BaseKVCacheManager, isUseOneMoreBlock); - } - void rewindKVCache(tb::LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override { PYBIND11_OVERLOAD_PURE(void, tbk::BaseKVCacheManager, rewindKVCache, requestId, rewindLengths); @@ -191,27 +171,25 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager PYBIND11_OVERLOAD_PURE(void, tbk::BaseKVCacheManager, storeContextBlocks, llmRequest); } - bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const override + std::vector> const& getCacheBlockIds( + tb::LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override { - PYBIND11_OVERLOAD_PURE(bool, tbk::BaseKVCacheManager, schedulingHasFreeBlocks, numRequired); - } - - std::vector> const& getCacheBlockIds(tb::LlmRequest::RequestIdType requestId) const override - { - PYBIND11_OVERLOAD_PURE( - std::vector> const&, tbk::BaseKVCacheManager, getCacheBlockIds, requestId); + PYBIND11_OVERLOAD_PURE(std::vector> const&, tbk::BaseKVCacheManager, 
getCacheBlockIds, + requestId, windowSize); } std::vector>> getBatchCacheBlockIds( - std::vector const& requestIds) const override + std::vector const& requestIds, SizeType32 windowSize) const override { PYBIND11_OVERLOAD_PURE(std::vector>>, tbk::BaseKVCacheManager, - getBatchCacheBlockIds, requestIds); + getBatchCacheBlockIds, requestIds, windowSize); } - std::vector getNewlyAllocatedBlockIds(tb::LlmRequest::RequestIdType requestId) const override + std::vector getNewlyAllocatedBlockIds( + tb::LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override { - PYBIND11_OVERLOAD_PURE(std::vector, tbk::BaseKVCacheManager, getNewlyAllocatedBlockIds, requestId); + PYBIND11_OVERLOAD_PURE( + std::vector, tbk::BaseKVCacheManager, getNewlyAllocatedBlockIds, requestId, windowSize); } SizeType32 getUsedNumBlocks() const override @@ -322,6 +300,12 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m) .def_readwrite("missed_blocks", &tbk::KvCacheStats::missedBlocks) .def_readwrite("cache_hit_rate", &tbk::KvCacheStats::cacheHitRate); + py::class_(m, "TempAttentionWindowInputs") + .def(py::init<>()) + .def_readwrite("paged_context_fmha", &tbk::TempAttentionWindowInputs::pagedContextFMHA) + .def_readwrite("max_input_len", &tbk::TempAttentionWindowInputs::maxInputLen) + .def_readwrite("max_num_tokens", &tbk::TempAttentionWindowInputs::maxNumTokens); + py::class_>(m, "KVCacheEventManager") .def(py::init(), py::arg("max_kv_event_entries")); @@ -336,7 +320,8 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m) .def_property_readonly("max_num_blocks", &BaseKVCacheManager::getMaxNumBlocks) .def_property_readonly("num_pools", &BaseKVCacheManager::getNumPools) .def("get_kv_cache_stats", &BaseKVCacheManager::getKvCacheStats) - .def_property_readonly("max_blocks_per_seq", &BaseKVCacheManager::getMaxBlocksPerSeq) + .def_property_readonly("max_blocks_per_seq", + [](tbk::BaseKVCacheManager& self) { return self.getOffsetTableDimensions().maxBlocksPerSeq; }) .def("get_needed_blocks_one_step", &BaseKVCacheManager::getNeededBlocksOneStep) .def("get_remaining_blocks_to_completion", &BaseKVCacheManager::getRemainingBlocksToCompletion) .def("add_token", &BaseKVCacheManager::addToken) @@ -414,11 +399,9 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m) }, py::arg("timeout_ms") = std::nullopt) .def_property_readonly("enable_block_reuse", &BaseKVCacheManager::isEnableBlockReuse) - .def_property_readonly("use_one_more_block", &BaseKVCacheManager::isUseOneMoreBlock) .def("rewind_kv_cache", &BaseKVCacheManager::rewindKVCache) .def_property_readonly("cross_kv", &BaseKVCacheManager::isCrossKv) .def("store_context_blocks", &BaseKVCacheManager::storeContextBlocks) - .def("scheduling_has_free_blocks", &BaseKVCacheManager::schedulingHasFreeBlocks) .def("get_cache_block_ids", &BaseKVCacheManager::getCacheBlockIds) .def("get_batch_cache_block_ids", &BaseKVCacheManager::getBatchCacheBlockIds) .def("get_newly_allocated_block_ids", &BaseKVCacheManager::getNewlyAllocatedBlockIds) @@ -431,13 +414,14 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m) py::classh(m, "KVCacheManager") .def(py::init const&, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, - SizeType32, std::vector const&, SizeType32, SizeType32, bool, int64_t, bool, bool, - tbk::CacheType, std::optional, - std::shared_ptr, bool, bool>(), + SizeType32, std::vector const&, std::optional const&, + nvinfer1::DataType, SizeType32, bool, 
int64_t, bool, bool, tbk::CacheType, + std::optional, std::shared_ptr, + bool, bool>(), py::arg("num_kv_heads_per_layer"), py::arg("size_per_head"), py::arg("tokens_per_block"), py::arg("blocks_in_primary_pool"), py::arg("blocks_in_secondary_pool"), py::arg("max_num_sequences"), - py::arg("max_beam_width"), py::arg("max_attention_window_vec"), py::arg("temporary_attention_window"), - py::arg("sink_token_length"), py::arg("stream"), py::arg("max_sequence_length"), + py::arg("max_beam_width"), py::arg("max_attention_window_vec"), py::arg("temp_attention_window_inputs"), + py::arg("dtype"), py::arg("sink_token_length"), py::arg("stream"), py::arg("max_sequence_length"), py::arg("enable_block_reuse") = false, py::arg("onboard_blocks") = true, py::arg_v("cache_type", tbk::CacheType::kSELF, "bindings.internal.batch_manager.CacheType.SELF"), py::arg("secondary_offload_min_priority") = std::nullopt, py::arg("event_manager") = nullptr, diff --git a/cpp/tensorrt_llm/runtime/gptSession.cpp b/cpp/tensorrt_llm/runtime/gptSession.cpp index 7147b0f1252..4e5732a992f 100644 --- a/cpp/tensorrt_llm/runtime/gptSession.cpp +++ b/cpp/tensorrt_llm/runtime/gptSession.cpp @@ -241,10 +241,10 @@ void GptSession::createKvCacheManager(SizeType32 maxBatchSize, SizeType32 maxBea mKvCacheManager = std::make_shared( std::vector(numKvHeadsPerLayerBegin, numKvHeadsPerLayerEnd), sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxBatchSize, maxBeamWidth, mDecoderMaxAttentionWindowVec, - /*temporaryAttentionWindow*/ 0, sinkTokenLength, mRuntime->getStreamPtr(), maxSequenceLength, enableBlockReuse, - kvCacheConfig.onboardBlocks); + /*tempAttentionWindowInputs*/ std::nullopt, kvDtype, sinkTokenLength, mRuntime->getStreamPtr(), + maxSequenceLength, enableBlockReuse, kvCacheConfig.onboardBlocks); - auto const maxBlocksPerSeq = mKvCacheManager->getMaxBlocksPerSeq(); + auto const maxBlocksPerSeq = mKvCacheManager->getOffsetTableDimensions().maxBlocksPerSeq; TLLM_CHECK(mBuffers.size() == static_cast(mMicroBatchConfig.numGenBatches)); for (auto& buffers : mBuffers) @@ -253,7 +253,7 @@ void GptSession::createKvCacheManager(SizeType32 maxBatchSize, SizeType32 maxBea buffers->transformerBuffers->reshapeKvTensors(maxBatchSize, maxBeamWidth, maxBlocksPerSeq, *mRuntime); } - mKvCacheManager->allocatePools(kvDtype, kvCacheConfig.useUvm); + mKvCacheManager->allocatePools(kvCacheConfig.useUvm); for (auto& buffers : mBuffers) { diff --git a/cpp/tests/batch_manager/cacheTransceiverTest.cpp b/cpp/tests/batch_manager/cacheTransceiverTest.cpp index 047df9a2bbc..7db3bb89c1b 100644 --- a/cpp/tests/batch_manager/cacheTransceiverTest.cpp +++ b/cpp/tests/batch_manager/cacheTransceiverTest.cpp @@ -295,7 +295,6 @@ class SymmetricalCacheTest : public ::testing::Test // NOLINT(cppcoreguidelines- auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxAttentionWindow = maxNumTokens; - auto constexpr temporaryAttentionWindow = 0; auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1; auto constexpr numSharedBlocks = inputLength / tokensPerBlock; auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth; @@ -308,9 +307,9 @@ class SymmetricalCacheTest : public ::testing::Test // NOLINT(cppcoreguidelines- auto constexpr dataType = nvinfer1::DataType::kFLOAT; mManager = std::make_unique(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, - blocksInSecondaryPool, mMaxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, - 
temporaryAttentionWindow, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks, - CacheType::kSELF, std::nullopt, nullptr, true); + blocksInSecondaryPool, mMaxNumSequences, maxBeamWidth, + std::vector{maxAttentionWindow}, std::nullopt, dataType, sinkTokenLength, stream, + std::nullopt, enableBlockReuse, onboardBlocks, CacheType::kSELF, std::nullopt, nullptr, true); mCacheState = std::make_unique( numLayers, numHeads, sizePerHead, tokensPerBlock, 1, 1, dataType); @@ -377,7 +376,7 @@ class SymmetricalCacheTest : public ::testing::Test // NOLINT(cppcoreguidelines- } // UVM seems to be incompatible with MPI, and it is continuing to investigate. bool constexpr useUvm = false; - mManager->allocatePools(dataType, useUvm); + mManager->allocatePools(useUvm); } void setUpCacheTransceiver() @@ -414,7 +413,7 @@ class SymmetricalCacheTest : public ::testing::Test // NOLINT(cppcoreguidelines- mManager->addSequence(llmRequest->mRequestId, llmRequest->getNumTokens(beamIdx), beamWidth, llmRequest); if (isSender) { - auto blockRange = BlockRange(*mManager, llmRequest->mRequestId, beamIdx, 0); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mManager, llmRequest->mRequestId); for (auto& block : blockRange) { // fill cache with tokens (= request length), for reuse test @@ -427,7 +426,7 @@ class SymmetricalCacheTest : public ::testing::Test // NOLINT(cppcoreguidelines- auto future = mRequester->requestAndReceiveAsync(*llmRequest); future.get(); TLLM_CUDA_CHECK(cudaDeviceSynchronize()); - auto blockRange = BlockRange(*mManager, llmRequest->mRequestId, beamIdx, 0); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mManager, llmRequest->mRequestId); for (auto& block : blockRange) { std::vector bytes(block.getSizeInBytes()); @@ -619,7 +618,6 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam(numLayers / mPpSize, numHeadsPerRank, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, mMaxNumSequences, maxBeamWidth, - std::vector{maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream, + std::vector{maxAttentionWindow}, std::nullopt, dataType, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks, cacheType, std::nullopt, nullptr, true); texec::kv_cache::CacheState::AttentionType attentionType = isMLA ? 
texec::kv_cache::CacheState::AttentionType::kMLA @@ -670,7 +668,7 @@ class AsymmetricalCacheTest : public ::testing::TestWithParamallocatePools(dataType, useUvm); + mManager->allocatePools(useUvm); } void setUpCacheTransceiver() @@ -830,14 +828,21 @@ class AsymmetricalCacheTest : public ::testing::TestWithParamaddSequence(llmRequest->mRequestId, llmRequest->getNumTokens(beamIdx), beamWidth, llmRequest); - auto blockRange = BlockRange(*mManager, llmRequest->mRequestId, beamIdx, 0); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mManager, llmRequest->mRequestId); int blockIdx = 0; for (auto& block : blockRange) { fillBlockData(block, blockIdx, llmRequest->getPromptLen()); blockIdx++; } - mManager->getBlockManager().getBufferManager().getStream().synchronize(); + auto const& blockManager = mManager->getBlockManager(); + if (blockManager.getNumPools() != 1) + { + throw std::runtime_error("Test assumes that just a single pool (single window size) is used"); + } + auto const onlyWindowSize = blockManager.getPoolWindowSize(0); + + blockManager.getBufferManager(onlyWindowSize).getStream().synchronize(); auto future = mResponder->respondAndSendAsync(*llmRequest); return future; } @@ -859,7 +864,7 @@ class AsymmetricalCacheTest : public ::testing::TestWithParammRequestId, beamIdx, 0); + auto blockRange = BlockRange::fromOldAllocatedBlockIds(*mManager, llmRequest->mRequestId); for (auto& block : blockRange) { verifyBlockData(block, blockIdx, llmRequest->getPromptLen()); @@ -869,8 +874,12 @@ class AsymmetricalCacheTest : public ::testing::TestWithParamgetBlockManager().getBufferManager().cpu(blockData.getShape(), blockData.getDataType()); + auto const& blockManager = mManager->getBlockManager(); + ASSERT_EQ(blockManager.getNumPools(), 1); + auto const onlyWindowSize = blockManager.getPoolWindowSize(0); + auto const& bufferManager = blockManager.getBufferManager(onlyWindowSize); + + auto hostTensor = tensorrt_llm::runtime::BufferManager::cpu(blockData.getShape(), blockData.getDataType()); int layerSizePerRank = mCacheState->getModelConfig().mNbKvHeadsPerLayer.size() / mPpSize; int startLayerId = layerSizePerRank * mPpRank; int headSizePerRank = mCacheState->getModelConfig().mNbKvHeadsPerLayer.at(0); @@ -924,13 +933,17 @@ class AsymmetricalCacheTest : public ::testing::TestWithParamgetBlockManager().getBufferManager().copy(*hostTensor, blockData); + bufferManager.copy(*hostTensor, blockData); } void verifyBlockData(tensorrt_llm::runtime::ITensor& blockData, int blockId, size_t initial) { - auto hostTensor - = mManager->getBlockManager().getBufferManager().cpu(blockData.getShape(), blockData.getDataType()); + auto const& blockManager = mManager->getBlockManager(); + ASSERT_EQ(blockManager.getNumPools(), 1); + auto const onlyWindowSize = blockManager.getPoolWindowSize(0); + auto const& bufferManager = blockManager.getBufferManager(onlyWindowSize); + + auto hostTensor = tensorrt_llm::runtime::BufferManager::cpu(blockData.getShape(), blockData.getDataType()); int layerSizePerRank = mCacheState->getModelConfig().mNbKvHeadsPerLayer.size() / mPpSize; int startLayerId = layerSizePerRank * mPpRank; int headSizePerRank = mCacheState->getModelConfig().mNbKvHeadsPerLayer.at(0); @@ -945,8 +958,8 @@ class AsymmetricalCacheTest : public ::testing::TestWithParamgetModelConfig().mSizePerHead; - mManager->getBlockManager().getBufferManager().copy(blockData, *hostTensor); - mManager->getBlockManager().getBufferManager().getStream().synchronize(); + bufferManager.copy(blockData, *hostTensor); + 
bufferManager.getStream().synchronize(); for (int layerId = 0; layerId < layerSizePerRank; layerId++) { diff --git a/cpp/tests/unit_tests/batch_manager/capacitySchedulerTest.cpp b/cpp/tests/unit_tests/batch_manager/capacitySchedulerTest.cpp index e420e8b8124..372de1b109d 100644 --- a/cpp/tests/unit_tests/batch_manager/capacitySchedulerTest.cpp +++ b/cpp/tests/unit_tests/batch_manager/capacitySchedulerTest.cpp @@ -131,8 +131,8 @@ class CapacitySchedulerTest : public ::testing::Test // NOLINT(cppcoreguidelines // init KV cache block manager return std::make_shared(numLayers, nbKvHeads, sizePerHead, tokensPerBlock, - maxNumBlocks, 0, maxNumRequests, 1, std::vector{maxNumTokensPerSeq}, 0, sinkTokenLength, - streamPtr, std::nullopt, enableReuse, onboardBlocks, cacheType); + maxNumBlocks, 0, maxNumRequests, 1, std::vector{maxNumTokensPerSeq}, std::nullopt, kvDtype, + sinkTokenLength, streamPtr, std::nullopt, enableReuse, onboardBlocks, cacheType); } static std::shared_ptr getPeftCacheManager() diff --git a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp index 8dbee68798a..470911ec9b5 100644 --- a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp +++ b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp @@ -47,6 +47,8 @@ namespace tr = tensorrt_llm::runtime; using ParamType = bool; +namespace +{ std::string generateTestName(testing::TestParamInfo const& info) { auto const homogeneousLayers = info.param; @@ -62,6 +64,14 @@ std::string generateTestName(testing::TestParamInfo const& info) return name; } +SizeType32 theOnlyWindowSize(KVCacheManager const& kvCacheManager) +{ + auto const& blockManager = kvCacheManager.getBlockManager(); + EXPECT_EQ(blockManager.getWindowSizesMetadata().size(), 1) << "Assuming a single window size"; + return blockManager.getPoolWindowSize(0); +} +} // namespace + class KVCacheManagerTest : public ::testing::Test, public ::testing::WithParamInterface // NOLINT(cppcoreguidelines-pro-type-member-init) @@ -81,14 +91,6 @@ class KVCacheManagerTest namespace { -void allocateBlocks(BlockManager& manager, GenerationRequest& sequence, std::size_t numBlocks, bool shareAmongBeams) -{ - for (std::size_t i = 0; i < numBlocks; ++i) - { - manager.allocateBlock(sequence, shareAmongBeams); - } -} - // TODO: This is really ugly. Flushing the event queue is done in a separate thread, so if we want to check the value we // need to wait for the thread to complete. It works, but it's technically not deterministic. 
std::deque getEvents(KVCacheManager& kvCacheManager) @@ -112,25 +114,27 @@ TEST_F(KVCacheManagerTest, BlockManagerTest) auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; - BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, - blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks); - blockManager.allocatePools(nvinfer1::DataType::kHALF, false); + auto constexpr beamWidth = 8; + auto constexpr numBlocksPerBeam = blocksInPrimaryPool / beamWidth; + auto constexpr numTokens = tokensPerBlock * numBlocksPerBeam; + auto constexpr maxAttentionWindow = numTokens; + + BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, + blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth, + std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, + onboardBlocks); + blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); auto constexpr requestId = 42; - auto constexpr beamWidth = 8; - auto constexpr numBlocksPerBeam = blocksInPrimaryPool / beamWidth; - auto constexpr numTokens = tokensPerBlock * numBlocksPerBeam; - auto constexpr maxAttentionWindow = numTokens; - auto const maxAttentionWindowVec = std::vector{maxAttentionWindow}; - GenerationRequest seq0{requestId, numTokens, beamWidth, numBlocksPerBeam, maxAttentionWindow}; - blockManager.addSequence(seq0, numBlocksPerBeam, numBlocksPerBeam - 1); + GenerationRequest seq0{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; + blockManager.addSequence(seq0, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow); auto constexpr occupiedBlocks = (numBlocksPerBeam - 1) + beamWidth; EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - occupiedBlocks); - auto const& ids = seq0.getCacheBlockIds(); + auto const& ids = seq0.getCacheBlockIds(maxAttentionWindow); std::set idSet{}; EXPECT_EQ(ids.size(), beamWidth); for (auto const& beam : ids) @@ -142,7 +146,7 @@ TEST_F(KVCacheManagerTest, BlockManagerTest) blockManager.releaseBlocks(seq0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); - blockManager.addSequence(seq0, numBlocksPerBeam, -1); + blockManager.addSequence(seq0, numBlocksPerBeam, -1, maxAttentionWindow); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocksPerBeam); EXPECT_EQ(ids.size(), beamWidth); for (std::size_t i = 0u; i < ids.front().size(); ++i) @@ -156,15 +160,17 @@ TEST_F(KVCacheManagerTest, BlockManagerTest) EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); // occupy 22/24 blocks - EXPECT_NO_THROW(blockManager.addSequence(seq0, numBlocksPerBeam, numBlocksPerBeam - 1)); - GenerationRequest seq1{requestId + 1, numTokens, beamWidth, numBlocksPerBeam, maxAttentionWindow}; - EXPECT_NO_THROW(blockManager.addSequence(seq1, numBlocksPerBeam, numBlocksPerBeam - 1)); + EXPECT_NO_THROW(blockManager.addSequence(seq0, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow)); + GenerationRequest seq1{requestId + 1, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; + EXPECT_NO_THROW(blockManager.addSequence(seq1, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow)); // same requestId not allowed - GenerationRequest seq2{requestId, numTokens, beamWidth, numBlocksPerBeam, maxAttentionWindow}; - 
EXPECT_THROW(blockManager.addSequence(seq2, numBlocksPerBeam, numBlocksPerBeam - 1), std::runtime_error); + GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; + EXPECT_THROW( + blockManager.addSequence(seq2, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow), std::runtime_error); // no more blocks - GenerationRequest seq3{requestId + 2, numTokens, beamWidth, numBlocksPerBeam, maxAttentionWindow}; - EXPECT_THROW(blockManager.addSequence(seq3, numBlocksPerBeam, numBlocksPerBeam - 1), std::runtime_error); + GenerationRequest seq3{requestId + 2, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; + EXPECT_THROW( + blockManager.addSequence(seq3, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow), std::runtime_error); } template @@ -195,8 +201,9 @@ void runPartialCopyTest() bool constexpr isStreaming{false}; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, - blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks); - blockManager.allocatePools(type, false); + blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth, + std::vector{maxAttentionWindow}, std::nullopt, type, 0, onboardBlocks); + blockManager.allocatePools(false); auto oneLayerBlockSize = blockManager.getBlockSize(0); EXPECT_EQ(oneLayerBlockSize, numKvHeads * sizePerHead * tokensPerBlock); @@ -224,21 +231,21 @@ void runPartialCopyTest() auto const inputLength = static_cast(inputTokens->size()); LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); - GenerationRequest seq0{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow}; + GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); - blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0); + blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); - auto cacheBlockIds = seq0.getCacheBlockIds().at(beamIdx); + auto cacheBlockIds = seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds, ::testing::ElementsAreArray({0, 1, 2})); // Offload all 3 blocks, fill with predictable pattern, onboard for (auto cacheBlockId : cacheBlockIds) { - auto block = blockManager.getBlockById(cacheBlockId); + auto block = blockManager.getBlockById(cacheBlockId, maxAttentionWindow); EXPECT_TRUE(block->isPrimary()); // offload so we can write to block in CPU code - blockManager.offloadBlock(block); + blockManager.offloadBlock(block, maxAttentionWindow); EXPECT_FALSE(block->isPrimary()); // need to sync so D2H transfer is done before accessing blocks EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); @@ -251,10 +258,10 @@ void runPartialCopyTest() rawBlockPtr[i] = i & mask; } // onboard - blockManager.onboardBlock(block); + blockManager.onboardBlock(block, maxAttentionWindow); EXPECT_TRUE(block->isPrimary()); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); - EXPECT_TRUE(blockManager.verifyQueueIntegrity()); + EXPECT_TRUE(blockManager.verifyQueueIntegrity(maxAttentionWindow)); } blockManager.releaseBlocks(seq0, llmRequest0); @@ -263,12 +270,12 @@ void runPartialCopyTest() auto const inputLength1 = 
static_cast(inputTokens1->size()); requestId = 1; auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming); - GenerationRequest seq1{requestId, inputLength1, beamWidth, maxBlocksPerSeq, maxAttentionWindow}; + GenerationRequest seq1{requestId, inputLength1, beamWidth, blockManager.getWindowSizesMetadata()}; auto promptLen1 = llmRequest1->getNumTokens(beamIdx); auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); - blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1); + blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 16); - auto cacheBlockIds1 = seq1.getCacheBlockIds().at(beamIdx); + auto cacheBlockIds1 = seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds1, ::testing::ElementsAreArray({0, 1, 6})); // store blocks 0, 1 ([0,1,2,3,4,5,6,7], [8,9,10,11,12,13,14,15]) blockManager.storeContextBlocks(seq1, *llmRequest1); @@ -277,7 +284,7 @@ void runPartialCopyTest() // Add sequence [0,1,2,3,4,5,6,7,8,9,10,11] again. // Reuse blocks 0 and 1(pc). Block 1 is partially reused, but already referenced by seq1 so must be partial copied // into new block 2. Clear block 2 so we can see what was partial copied. - auto block2 = blockManager.getBlockById(2); + auto block2 = blockManager.getBlockById(2, maxAttentionWindow); auto memoryPoolIndex2 = block2->getMemoryPoolBlockIndex(); auto block2Ptr{tr::ITensor::slice(primaryPoolPtr, memoryPoolIndex2, 1)}; EXPECT_EQ(cudaMemset(block2Ptr->data(), 0, blockSize * sizeof(T)), cudaSuccess); @@ -287,18 +294,18 @@ void runPartialCopyTest() auto const inputLength2 = static_cast(inputTokens2->size()); requestId = 2; auto llmRequest2 = std::make_shared(requestId, maxNewTokens, inputTokens2, samplingConfig, isStreaming); - GenerationRequest seq2{requestId, inputLength2, beamWidth, maxBlocksPerSeq, maxAttentionWindow}; + GenerationRequest seq2{requestId, inputLength2, beamWidth, blockManager.getWindowSizesMetadata()}; auto promptLen2 = llmRequest2->getNumTokens(beamIdx); auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock()); - blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2); + blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 11); - auto cacheBlockIds2 = seq2.getCacheBlockIds().at(beamIdx); + auto cacheBlockIds2 = seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds2, ::testing::ElementsAreArray({0, 2})); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); // Verify partial copied block 2 // Block has shape [2, numLayers, numKvHeads, tokensPerBlock, sizePerHead] - blockManager.offloadBlock(block2); + blockManager.offloadBlock(block2, maxAttentionWindow); EXPECT_FALSE(block2->isPrimary()); // need to sync so D2H transfer is done before accessing blocks EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); @@ -326,7 +333,7 @@ void runPartialCopyTest() } } EXPECT_EQ(numBad, 0); - blockManager.onboardBlock(block2); + blockManager.onboardBlock(block2, maxAttentionWindow); EXPECT_TRUE(block2->isPrimary()); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); @@ -383,6 +390,34 @@ TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyFP8) } #endif +TEST_F(KVCacheManagerTest, BlockManagerTestBlocksPerWindowSize) +{ + auto constexpr numPrimaryBlocks = 16384; + // Single window size + { + 
std::map<SizeType32, std::vector<SizeType32>> windowSizeToLayers{{1024, {0, 1, 2}}};
+        auto result = BlockManager::blocksPerWindowSize(numPrimaryBlocks, windowSizeToLayers);
+        EXPECT_EQ(result.size(), 1);
+        EXPECT_EQ(result.at(1024), numPrimaryBlocks);
+    }
+    // Variable window size
+    {
+        std::map<SizeType32, std::vector<SizeType32>> windowSizeToLayers{
+            {1024, {1}},       // contribution = 1024*1 = 1024  / 29696
+            {4096, {0, 4, 5}}, // contribution = 4096*3 = 12288 / 29696
+            {8192, {2, 3}},    // contribution = 8192*2 = 16384 / 29696
+        };
+        auto result = BlockManager::blocksPerWindowSize(numPrimaryBlocks, windowSizeToLayers);
+        EXPECT_EQ(result.size(), 3);
+        EXPECT_EQ(
+            std::accumulate(result.begin(), result.end(), 0, [](auto sum, auto cur) { return sum + cur.second; }),
+            numPrimaryBlocks);
+        // Two blocks that were lost due to rounding down were awarded in order to the smallest window sizes:
+        EXPECT_EQ(result.at(1024), 565);  // 564 + 1
+        EXPECT_EQ(result.at(4096), 6780); // 6779 + 1
+        EXPECT_EQ(result.at(8192), 9039); // 9039 + 0
+    }
+}
+
 #ifdef ENABLE_FP4
 TEST_F(KVCacheManagerTest, FP4BlockScaleManagementTest)
 {
@@ -398,11 +433,12 @@ TEST_F(KVCacheManagerTest, FP4BlockScaleManagementTest)
     auto const stream = std::make_shared<tr::CudaStream>();
     auto constexpr beamWidth = 1;
+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, false, stream, true,
-        onboardBlocks);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kFP4, false, stream, true, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kFP4, /*useUvm=*/false);
+    kvCacheManager.allocatePools(/*useUvm=*/false);
     // We should have one additional pool for the block scales.
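// (Presumably one pool holds the FP4 KV data and the additional one its
// per-block scaling factors, hence getNumPools() == 2 below.)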
 #ifdef ENABLE_FP4
 TEST_F(KVCacheManagerTest, FP4BlockScaleManagementTest)
 {
@@ -398,11 +433,12 @@ TEST_F(KVCacheManagerTest, FP4BlockScaleManagementTest)
     auto const stream = std::make_shared<tr::CudaStream>();
     auto constexpr beamWidth = 1;
+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, false, stream, true,
-        onboardBlocks);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kFP4, false, stream, true, onboardBlocks);

-    kvCacheManager.allocatePools(nvinfer1::DataType::kFP4, /*useUvm=*/false);
+    kvCacheManager.allocatePools(/*useUvm=*/false);

     // We should have one additional pool for the block scales.
     EXPECT_EQ(kvCacheManager.getBlockManager().getNumPools(), 2);
@@ -429,16 +465,19 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto constexpr onboardBlocks = true;
     auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;

-    BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock,
-        blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks);
-    blockManager.allocatePools(nvinfer1::DataType::kHALF, false);
+    auto constexpr beamWidth = 1;
+
+    BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool,
+        blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth,
+        std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0,
+        onboardBlocks);
+    blockManager.allocatePools(false);

     EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

     SizeType32 constexpr maxNewTokens{0};
-    auto constexpr beamWidth = 1;
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};
@@ -447,7 +486,7 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     LlmRequest::RequestIdType requestId{0};
     auto llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
-    GenerationRequest seq0{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};

     ///////////////////////////////////////////////////////////////////////////
     // add request and then remove it
@@ -455,9 +494,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto constexpr beamIdx = 0;
     auto promptLen0 = llmRequest0->getNumTokens(beamIdx);
     auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest0->addNewToken(9, beamIdx);  // block 2 contains [8]
     llmRequest0->addNewToken(10, beamIdx); // block 2 contains [8, 9]
     auto numTokens = llmRequest0->getNumTokens(beamIdx);
@@ -475,14 +514,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     // new request with same tokens [0, 1, 2, 3, 4, 5, 6, 7, 8] and then remove it
     requestId = 1;
     auto llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
-    GenerationRequest seq1{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 0, 1 ([0, 1, 2, 3], [4, 5, 6, 7]) and get new block 3
     auto promptLen1 = llmRequest1->getNumTokens(beamIdx);
     auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 3}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 3}));
     llmRequest1->addNewToken(9, beamIdx);  // block 3 contains [8]
     llmRequest1->addNewToken(10, beamIdx); // block 3 contains [8, 9]
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
@@ -502,9 +541,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens0, samplingConfig, isStreaming);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), promptLen0 - 1);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
     // note that seq0 is holding blocks 0, 1 and 2 until releaseBlocks is called
@@ -515,9 +554,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     llmRequest1->addNewToken(10, beamIdx); // block 4 contains [8, 9, 10]
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1);
@@ -541,13 +580,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto llmRequest2 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens2, samplingConfig, isStreaming);
     numTokens = llmRequest2->getNumTokens(beamIdx);
-    GenerationRequest seq2{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse block 0 ([0, 1, 2, 3]), get new block 5
     auto promptLen2 = llmRequest2->getNumTokens(beamIdx);
     auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2);
+    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow);
     EXPECT_EQ(llmRequest2->getContextCurrentPosition(), tokensPerBlock);
-    EXPECT_THAT(seq2.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 5}));
+    EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 5}));
     llmRequest2->addNewToken(5, beamIdx); // block 5 contains [4]
     numTokens = llmRequest2->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -562,13 +601,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto llmRequest3 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens3, samplingConfig, isStreaming);
     numTokens = llmRequest3->getNumTokens(beamIdx);
-    GenerationRequest seq3{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 0, 1, 4(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8, 9])
     auto promptLen3 = llmRequest3->getNumTokens(beamIdx);
     auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3);
+    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow);
     EXPECT_EQ(llmRequest3->getContextCurrentPosition(), numTokens - 1);
-    EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     llmRequest3->addNewToken(11, beamIdx); // block 4 contains [8, 9, 11]
     numTokens = llmRequest3->getNumTokens(beamIdx);
     // one block used by both seq2 and seq3
@@ -591,14 +630,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto llmRequest4 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens4, samplingConfig, isStreaming);
     numTokens = llmRequest4->getNumTokens(beamIdx);
-    GenerationRequest seq4{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq4{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 0, 1, 4(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8,9])
     auto promptLen4 = llmRequest4->getNumTokens(beamIdx);
     auto numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4);
+    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow);
     EXPECT_EQ(llmRequest4->getContextCurrentPosition(), promptLen4 - 1);
-    EXPECT_THAT(seq4.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     numTokens = llmRequest4->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
@@ -624,9 +663,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     llmRequest4 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens4, samplingConfig, isStreaming);
     promptLen4 = llmRequest4->getNumTokens(beamIdx);
     numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4);
+    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow);
     EXPECT_EQ(llmRequest4->getContextCurrentPosition(), promptLen4 - 2);
-    EXPECT_THAT(seq4.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     numTokens = llmRequest4->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
@@ -645,11 +684,11 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto llmRequest5 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens5, samplingConfig, isStreaming);
     numTokens = llmRequest5->getNumTokens(beamIdx);
-    GenerationRequest seq5{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq5{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, all blocks need to be freed
     auto promptLen5 = llmRequest5->getNumTokens(beamIdx);
     auto numContextBlocks5 = tc::ceilDiv(promptLen5, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq5, promptLen5, numContextBlocks5, *llmRequest5);
+    blockManager.addSequence(seq5, promptLen5, numContextBlocks5, *llmRequest5, maxAttentionWindow);
     llmRequest5->addNewToken(0, beamIdx);
     EXPECT_EQ(llmRequest5->getContextCurrentPosition(), 1); // incidental reuse
@@ -668,11 +707,11 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     auto llmRequest6 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens6, samplingConfig, isStreaming);
     numTokens = llmRequest6->getNumTokens(beamIdx);
-    GenerationRequest seq6{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq6{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, all blocks need to be freed
     auto promptLen6 = llmRequest6->getNumTokens(beamIdx);
     auto numContextBlocks6 = tc::ceilDiv(promptLen6, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq6, promptLen6, numContextBlocks6, *llmRequest6);
+    blockManager.addSequence(seq6, promptLen6, numContextBlocks6, *llmRequest6, maxAttentionWindow);
     llmRequest6->addNewToken(0, beamIdx);
     // no reuse occurs because we are unable to reuse last input token and inputLength6 == 1.
     EXPECT_EQ(llmRequest6->getContextCurrentPosition(), 0);
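Every reuse test that follows migrates the same way: GenerationRequest is constructed from the per-window metadata map instead of the old (maxBlocksPerSeq, maxAttentionWindow) pair, and block operations are parameterized by the window size they act on. A sketch of the recurring pattern, using the names from these tests (not itself part of the patch; the metadata map type is assumed from the header changes):

    // Sketch of the migrated call pattern shared by the reuse tests below.
    // GenerationRequest now takes blockManager.getWindowSizesMetadata(), and
    // addSequence/getCacheBlockIds take the window size they operate on.
    void addAndCheck(BlockManager& blockManager, LlmRequest& llmRequest, GenerationRequest& seq,
        SizeType32 promptLen, SizeType32 numContextBlocks, SizeType32 windowSize, SizeType32 beamIdx)
    {
        blockManager.addSequence(seq, promptLen, numContextBlocks, llmRequest, windowSize);
        auto const& blockIds = seq.getCacheBlockIds(windowSize).at(beamIdx);
        EXPECT_FALSE(blockIds.empty());
    }
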
@@ -703,16 +742,19 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
     auto constexpr numReturnSequences = 1;
     auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;

+    auto constexpr beamWidth = 1;
+
     BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks);
-    blockManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth,
+        std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0,
+        onboardBlocks);
+    blockManager.allocatePools(false);

     EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

     SizeType32 constexpr maxNewTokens{0};
-    auto constexpr beamWidth = 1;
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};
@@ -728,16 +770,16 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
        std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
        inputTokenExtraIds, numReturnSequences);
-    GenerationRequest seq0{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};

     ///////////////////////////////////////////////////////////////////////////
     // add request and then remove it
     auto constexpr beamIdx = 0;
     auto promptLen0 = llmRequest0->getNumTokens(beamIdx);
     auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest0->addNewToken(3, beamIdx);
     llmRequest0->addNewToken(4, beamIdx);
     auto numTokens = llmRequest0->getNumTokens(beamIdx);
@@ -760,14 +802,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
        false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5,
        std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
        inputTokenExtraIds, numReturnSequences);
-    GenerationRequest seq1{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 0, 1 and get new block 3
     auto promptLen1 = llmRequest1->getNumTokens(beamIdx);
     auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 3}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 3}));
     llmRequest1->addNewToken(3, beamIdx);
     llmRequest1->addNewToken(4, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
@@ -789,10 +831,10 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
        inputTokenExtraIds, numReturnSequences);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     llmRequest0->addNewToken(3, beamIdx);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
@@ -809,9 +851,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
        inputTokenExtraIds1, numReturnSequences);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest1->addNewToken(5, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1);
@@ -836,13 +878,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
        inputTokenExtraIds2, numReturnSequences);
     numTokens = llmRequest2->getNumTokens(beamIdx);
-    GenerationRequest seq2{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, get new block 5, 6, 7
     auto promptLen2 = llmRequest2->getNumTokens(beamIdx);
     auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2);
+    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow);
     EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq2.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
+    EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
     llmRequest2->addNewToken(3, beamIdx);
     numTokens = llmRequest2->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -861,13 +903,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
        inputTokenExtraIds3, numReturnSequences);
     numTokens = llmRequest3->getNumTokens(beamIdx);
-    GenerationRequest seq3{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse block 0, get new block 8, 9
     auto promptLen3 = llmRequest3->getNumTokens(beamIdx);
     auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3);
+    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow);
     EXPECT_EQ(llmRequest3->getContextCurrentPosition(), tokensPerBlock);
-    EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 8, 9}));
+    EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 8, 9}));
     llmRequest3->addNewToken(3, beamIdx);
     numTokens = llmRequest3->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -897,16 +939,19 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     auto constexpr onboardBlocks = true;
     auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;

+    auto constexpr beamWidth = 1;
+
     BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks);
-    blockManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth,
+        std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0,
+        onboardBlocks);
+    blockManager.allocatePools(false);

     EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

     SizeType32 constexpr maxNewTokens{0};
-    auto constexpr beamWidth = 1;
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};
@@ -918,7 +963,7 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     auto llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
        std::nullopt, std::nullopt, std::nullopt, loraTaskId);
-    GenerationRequest seq0{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};

     ///////////////////////////////////////////////////////////////////////////
     // add request and then remove it
@@ -926,9 +971,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     auto promptLen0 = llmRequest0->getNumTokens(beamIdx);
     auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
     // get new blocks 0, 1, 2 ([0,1,2,3], [4,5,6,7], [8])
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest0->addNewToken(9, beamIdx);
     llmRequest0->addNewToken(10, beamIdx);
     auto numTokens = llmRequest0->getNumTokens(beamIdx);
@@ -949,14 +994,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     auto llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
        std::nullopt, std::nullopt, std::nullopt, loraTaskId);
-    GenerationRequest seq1{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 0, 1 and get new block 3
     auto promptLen1 = llmRequest1->getNumTokens(beamIdx);
     auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 3}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 3}));
     llmRequest1->addNewToken(9, beamIdx);
     llmRequest1->addNewToken(10, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
@@ -976,7 +1021,7 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
        std::nullopt, std::nullopt, loraTaskId);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     // nb! addNewToken adds new generated token, number of input tokens stay the same.
     // calling addNewToken before addSequence potentially triggers this error message:
     // Assertion failed: prepopulatedPromptLen < promptLen
@@ -984,7 +1029,7 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     // but promptLen is number of input tokens.
     llmRequest0->addNewToken(9, beamIdx);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
@@ -996,9 +1041,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
     // reuse 0, 1, 2(p) ([0,1,2,3], [4,5,6,7], [8])
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest1->addNewToken(10, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1);
@@ -1021,14 +1066,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
        std::nullopt, std::nullopt, loraTaskId);
     numTokens = llmRequest2->getNumTokens(beamIdx);
-    GenerationRequest seq2{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, get new block 5, 6, 7
     auto promptLen2 = llmRequest2->getNumTokens(beamIdx);
     auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2);
+    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow);
     // no reuse expected. Input tokens match blocks 0 and 1, but lora task id differs.
     EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq2.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
+    EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
     llmRequest2->addNewToken(9, beamIdx);
     numTokens = llmRequest2->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -1048,13 +1093,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
        std::nullopt, std::nullopt, loraTaskId);
     numTokens = llmRequest3->getNumTokens(beamIdx);
-    GenerationRequest seq3{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 5, 6, 7(p) ([0,1,2,3], [4,5,6,7], [8])
     auto promptLen3 = llmRequest3->getNumTokens(beamIdx);
     auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3);
+    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow);
     EXPECT_EQ(llmRequest3->getContextCurrentPosition(), promptLen3 - 2);
-    EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
+    EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
     llmRequest3->addNewToken(11, beamIdx);
     numTokens = llmRequest3->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -1076,13 +1121,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
        std::nullopt, std::nullopt, loraTaskId);
     numTokens = llmRequest4->getNumTokens(beamIdx);
-    GenerationRequest seq4{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq4{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse blocks 0, get new block 8
     auto promptLen4 = llmRequest4->getNumTokens(beamIdx);
     auto numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4);
+    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow);
     EXPECT_EQ(llmRequest4->getContextCurrentPosition(), tokensPerBlock);
-    EXPECT_THAT(seq4.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 8}));
+    EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 8}));
     llmRequest4->addNewToken(5, beamIdx);
     numTokens = llmRequest4->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -1099,13 +1144,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
        std::nullopt);
     numTokens = llmRequest5->getNumTokens(beamIdx);
-    GenerationRequest seq5{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq5{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, get new block 9, 10, 11
     auto promptLen5 = llmRequest5->getNumTokens(beamIdx);
     auto numContextBlocks5 = tc::ceilDiv(promptLen5, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq5, promptLen5, numContextBlocks5, *llmRequest5);
+    blockManager.addSequence(seq5, promptLen5, numContextBlocks5, *llmRequest5, maxAttentionWindow);
     EXPECT_EQ(llmRequest5->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq5.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({9, 10, 11}));
+    EXPECT_THAT(seq5.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({9, 10, 11}));
     llmRequest5->addNewToken(9, beamIdx);
     numTokens = llmRequest5->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
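The two preceding tests check that block reuse keys on more than the token sequence: identical tokens with different extra ids or a different LoRA task id must not match, and the combined test that follows exercises both at once. Conceptually the lookup key behaves like the struct below (an illustration only, not the real BlockKey from kvCacheManager.h):

    #include <cstdint>
    #include <optional>
    #include <vector>

    // Illustrative reuse key: equal tokens alone are not enough for a match.
    struct ReuseKeySketch
    {
        std::vector<std::int32_t> tokens;
        std::optional<std::uint64_t> loraTaskId;                  // differs => no reuse
        std::vector<std::optional<std::uint64_t>> tokenExtraIds;  // differs => no reuse

        bool operator==(ReuseKeySketch const& other) const
        {
            return tokens == other.tokens && loraTaskId == other.loraTaskId
                && tokenExtraIds == other.tokenExtraIds;
        }
    };
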
@@ -1134,16 +1179,19 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
     auto constexpr onboardBlocks = true;
     auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;

+    auto constexpr beamWidth = 1;
+
     BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks);
-    blockManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth,
+        std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0,
+        onboardBlocks);
+    blockManager.allocatePools(false);

     EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

     SizeType32 constexpr maxNewTokens{0};
-    auto constexpr beamWidth = 1;
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};
@@ -1160,16 +1208,16 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
        std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
        inputTokenExtraIds);
-    GenerationRequest seq0{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};

     ///////////////////////////////////////////////////////////////////////////
     // add request with loraTaskId 1 and then remove it
     auto constexpr beamIdx = 0;
     auto promptLen0 = llmRequest0->getNumTokens(beamIdx);
     auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest0->addNewToken(3, beamIdx);
     llmRequest0->addNewToken(4, beamIdx);
     auto numTokens = llmRequest0->getNumTokens(beamIdx);
@@ -1193,14 +1241,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
        false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5,
        std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
        inputTokenExtraIds);
-    GenerationRequest seq1{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, get new block 3, 4, 5
     auto promptLen1 = llmRequest1->getNumTokens(beamIdx);
     auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({3, 4, 5}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5}));
     llmRequest1->addNewToken(3, beamIdx);
     llmRequest1->addNewToken(4, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
@@ -1222,10 +1270,10 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
     // reuse blocks 0, 1 and get new block 6
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     llmRequest0->addNewToken(3, beamIdx);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 6}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 6}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
@@ -1242,9 +1290,9 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
        inputTokenExtraIds1);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({3, 4, 5}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5}));
     llmRequest1->addNewToken(5, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 2);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 2);
@@ -1268,13 +1316,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
        inputTokenExtraIds2);
     numTokens = llmRequest2->getNumTokens(beamIdx);
-    GenerationRequest seq2{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // no reuse, get new block 7, 8, 9
     auto promptLen2 = llmRequest2->getNumTokens(beamIdx);
     auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2);
+    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow);
     EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq2.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({7, 8, 9}));
+    EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({7, 8, 9}));
     llmRequest2->addNewToken(3, beamIdx);
     numTokens = llmRequest2->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -1293,13 +1341,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
        inputTokenExtraIds3);
     numTokens = llmRequest3->getNumTokens(beamIdx);
-    GenerationRequest seq3{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse block 0, get new block 10, 11
     auto promptLen3 = llmRequest3->getNumTokens(beamIdx);
     auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3);
+    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow);
     EXPECT_EQ(llmRequest3->getContextCurrentPosition(), tokensPerBlock);
-    EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 10, 11}));
+    EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 10, 11}));
     llmRequest3->addNewToken(3, beamIdx);
     numTokens = llmRequest3->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -1317,13 +1365,13 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
        inputTokenExtraIds3);
     numTokens = llmRequest4->getNumTokens(beamIdx);
-    GenerationRequest seq4{requestId, numTokens, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq4{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
     // reuse block 3, get new block 12, 13
     auto promptLen4 = llmRequest4->getNumTokens(beamIdx);
     auto numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4);
+    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow);
     EXPECT_EQ(llmRequest4->getContextCurrentPosition(), tokensPerBlock);
-    EXPECT_THAT(seq4.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({3, 12, 13}));
+    EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 12, 13}));
     llmRequest4->addNewToken(3, beamIdx);
     numTokens = llmRequest4->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
@@ -1355,10 +1403,11 @@ TEST_F(KVCacheManagerTest, KVCacheManagerPerRequestStatsTest)
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};

+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream,
-        std::nullopt, true, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks);
+    kvCacheManager.allocatePools(false);

     auto inputTokens = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8});
     auto const inputLength = static_cast<SizeType32>(inputTokens->size());
@@ -1406,16 +1455,19 @@ TEST_F(KVCacheManagerTest, BlockManagerBlockPriorityTest)
     auto constexpr onboardBlocks = true;
     auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;

-    BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock,
-        blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks);
-    blockManager.allocatePools(nvinfer1::DataType::kHALF, false);
+    auto constexpr beamWidth = 1;
+
+    BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool,
+        blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth,
+        std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0,
+        onboardBlocks);
+    blockManager.allocatePools(false);

     EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

     SizeType32 constexpr maxNewTokens{0};
-    auto constexpr beamWidth = 1;
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};
@@ -1427,17 +1479,17 @@ TEST_F(KVCacheManagerTest, BlockManagerBlockPriorityTest)
        KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, 4, 90),
            KvCacheRetentionConfig::TokenRangeRetentionConfig(4, 8, 10)}, 20));
-    GenerationRequest seq0{0, inputLength0, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq0{0, inputLength0, beamWidth, blockManager.getWindowSizesMetadata()};
     auto numContextBlocks0 = tc::ceilDiv(inputLength0, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq0, llmRequest0->getNumTokens(0), numContextBlocks0, *llmRequest0);
+    blockManager.addSequence(seq0, llmRequest0->getNumTokens(0), numContextBlocks0, *llmRequest0, maxAttentionWindow);

     // Add another sequence with different tokens, at a low priority
     auto inputTokens1 = std::make_shared<VecTokens>(VecTokens{8, 9, 10, 11, 12, 13, 14, 15});
     auto const inputLength1 = static_cast<SizeType32>(inputTokens1->size());
     auto llmRequest1 = std::make_shared<LlmRequest>(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming);
-    GenerationRequest seq1{1, inputLength1, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq1{1, inputLength1, beamWidth, blockManager.getWindowSizesMetadata()};
     auto numContextBlocks1 = tc::ceilDiv(inputLength1, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq1, llmRequest1->getNumTokens(0), numContextBlocks1, *llmRequest1);
+    blockManager.addSequence(seq1, llmRequest1->getNumTokens(0), numContextBlocks1, *llmRequest1, maxAttentionWindow);

     // Release both sequences
     blockManager.releaseBlocks(seq0, llmRequest0);
@@ -1449,18 +1501,18 @@ TEST_F(KVCacheManagerTest, BlockManagerBlockPriorityTest)
     auto llmRequest2 = std::make_shared<LlmRequest>(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming);
     llmRequest2->setKvCacheRetentionConfig(
        KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 20)}, 20));
-    GenerationRequest seq2{2, inputLength2, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq2{2, inputLength2, beamWidth, blockManager.getWindowSizesMetadata()};
     auto numContextBlocks2 = tc::ceilDiv(inputLength2, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq2, llmRequest2->getNumTokens(0), numContextBlocks2, *llmRequest2);
+    blockManager.addSequence(seq2, llmRequest2->getNumTokens(0), numContextBlocks2, *llmRequest2, maxAttentionWindow);
     blockManager.releaseBlocks(seq2, llmRequest2);

     // Check that request 1 blocks were overwritten
     auto inputTokens3 = std::make_shared<VecTokens>(VecTokens{8, 9, 10, 11, 12, 13, 14, 15});
     auto const inputLength3 = static_cast<SizeType32>(inputTokens3->size());
     auto llmRequest3 = std::make_shared<LlmRequest>(3, maxNewTokens, inputTokens3, samplingConfig, isStreaming);
-    GenerationRequest seq3{3, inputLength3, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq3{3, inputLength3, beamWidth, blockManager.getWindowSizesMetadata()};
     auto numContextBlocks3 = tc::ceilDiv(inputLength3, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq3, llmRequest3->getNumTokens(0), numContextBlocks3, *llmRequest3);
+    blockManager.addSequence(seq3, llmRequest3->getNumTokens(0), numContextBlocks3, *llmRequest3, maxAttentionWindow);

     EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 4);
@@ -1471,9 +1523,9 @@ TEST_F(KVCacheManagerTest, BlockManagerBlockPriorityTest)
     auto inputTokens4 = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7});
     auto const inputLength4 = static_cast<SizeType32>(inputTokens4->size());
     auto llmRequest4 = std::make_shared<LlmRequest>(4, maxNewTokens, inputTokens4, samplingConfig, isStreaming);
-    GenerationRequest seq4{4, inputLength3, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq4{4, inputLength3, beamWidth, blockManager.getWindowSizesMetadata()};
     auto numContextBlocks4 = tc::ceilDiv(inputLength4, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq4, llmRequest4->getNumTokens(0), numContextBlocks4, *llmRequest4);
+    blockManager.addSequence(seq4, llmRequest4->getNumTokens(0), numContextBlocks4, *llmRequest4, maxAttentionWindow);

     EXPECT_EQ(llmRequest4->getContextCurrentPosition(), 4);
@@ -1481,9 +1533,9 @@ TEST_F(KVCacheManagerTest, BlockManagerBlockPriorityTest)
     auto inputTokens5 = std::make_shared<VecTokens>(VecTokens{16, 17, 18, 19, 20, 21, 22, 23});
     auto const inputLength5 = static_cast<SizeType32>(inputTokens5->size());
     auto llmRequest5 = std::make_shared<LlmRequest>(5, maxNewTokens, inputTokens5, samplingConfig, isStreaming);
-    GenerationRequest seq5{5, inputLength5, beamWidth, maxBlocksPerSeq, maxAttentionWindow};
+    GenerationRequest seq5{5, inputLength5, beamWidth, blockManager.getWindowSizesMetadata()};
     auto numContextBlocks5 = tc::ceilDiv(inputLength5, blockManager.getTokensPerBlock());
-    blockManager.addSequence(seq5, llmRequest5->getNumTokens(0), numContextBlocks5, *llmRequest5);
+    blockManager.addSequence(seq5, llmRequest5->getNumTokens(0), numContextBlocks5, *llmRequest5, maxAttentionWindow);

     EXPECT_EQ(llmRequest5->getContextCurrentPosition(), 0);
 }
@@ -1506,10 +1558,11 @@ TEST_F(KVCacheManagerTest, KVCacheManagerDecodeBlockPriorityTest)
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};

+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream,
-        std::nullopt, true, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks);
+    kvCacheManager.allocatePools(false);

     auto const& blockManager = kvCacheManager.getBlockManager();
@@ -1608,10 +1661,11 @@ TEST_F(KVCacheManagerTest, KVCacheManagerTimedEvictionTest)
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};

+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream,
-        std::nullopt, true, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks);
+    kvCacheManager.allocatePools(false);

     auto inputTokens0 = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
     auto const inputLength0 = static_cast<SizeType32>(inputTokens0->size());
@@ -1672,10 +1726,11 @@ TEST_F(KVCacheManagerTest, KVCacheManagerDecodeTimedEvictionTest)
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};

+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream,
-        std::nullopt, true, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks);
+    kvCacheManager.allocatePools(false);
     {
         auto inputTokens0 = std::make_shared<VecTokens>(VecTokens{1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
         auto const inputLength0 = static_cast<SizeType32>(inputTokens0->size());
@@ -1745,10 +1800,11 @@ TEST_F(KVCacheManagerTest, KVCacheManagerSecondaryBlockPrimaryChildTest)
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};

+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, false, stream, true,
-        onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, false, stream, true, onboardBlocks);
+    kvCacheManager.allocatePools(false);

     auto inputTokens0 = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
     auto const inputLength0 = static_cast<SizeType32>(inputTokens0->size());
@@ -1817,10 +1873,11 @@ TEST_F(KVCacheManagerTest, KVCacheManagerLeafBlockTest)
     tr::SamplingConfig const samplingConfig{beamWidth};
     bool constexpr isStreaming{false};

+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, false, stream, true,
-        onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, false, stream, true, onboardBlocks);
+    kvCacheManager.allocatePools(false);

     auto inputTokens0 = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3});
     auto const inputLength0 = static_cast<SizeType32>(inputTokens0->size());
@@ -1842,7 +1899,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerLeafBlockTest)
     GenerationRequest const& seq1 = kvCacheManager.getSequence(1);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 0);
     // Block 1 should NOT be reused. It was not freed even if partial.
-    EXPECT_THAT(seq1.getCacheBlockIds().at(0), ::testing::ElementsAreArray({2}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(0), ::testing::ElementsAreArray({2}));

     // Allocate the remaining 3 blocks in primary
     auto inputTokens2 = std::make_shared<VecTokens>(VecTokens{2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
@@ -1890,7 +1947,6 @@ TEST_P(KVCacheManagerTest, DISABLED_KVCacheManagerAllocationTest)
     auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq;
     auto constexpr maxAttentionWindow = maxNumTokens;
-    auto constexpr temporaryAttentionWindow = 0;
     auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1;
     auto constexpr numSharedBlocks = inputLength / tokensPerBlock;
     auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth;
@@ -1906,15 +1962,17 @@ TEST_P(KVCacheManagerTest, DISABLED_KVCacheManagerAllocationTest)
     auto const granularity = tensorrt_llm::common::getAllocationGranularity();
     KVCacheManager kvCacheManager = homogeneousLayers
         ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool,
-            maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream,
-            std::nullopt, enableBlockReuse, onboardBlocks)
+            maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
+            nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks)
         : KVCacheManager(std::vector<SizeType32>(numLayers, numHeads), sizePerHead, tokensPerBlock,
-            totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, {maxAttentionWindow},
-            temporaryAttentionWindow, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);
+            totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth,
+            std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF,
+            sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);

-    auto const& bufferManager = kvCacheManager.getBlockManager().getBufferManager();
+    auto const blockManager = kvCacheManager.getBlockManager();
+    auto const& bufferManager = blockManager.getBufferManager(theOnlyWindowSize(kvCacheManager));
     auto const memoryPoolUsedBefore = bufferManager.memoryPoolUsed();
-    kvCacheManager.allocatePools(dtype, useUvm);
+    kvCacheManager.allocatePools(useUvm);
     auto const memoryPoolUsedAfter = bufferManager.memoryPoolUsed();
     EXPECT_GT(memoryPoolUsedAfter, memoryPoolUsedBefore);
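The helper theOnlyWindowSize used above is defined elsewhere in the test file; judging by this call site it resolves the single configured window size so the per-window getBufferManager overload can be used. A plausible reconstruction, assuming the metadata map is keyed by window size (hypothetical, the actual definition is not part of this diff):

    // Hypothetical helper: valid only when exactly one window size is configured.
    SizeType32 theOnlyWindowSize(KVCacheManager const& kvCacheManager)
    {
        auto const& windowSizeToMetadata = kvCacheManager.getBlockManager().getWindowSizesMetadata();
        EXPECT_EQ(windowSizeToMetadata.size(), 1);
        return windowSizeToMetadata.begin()->first;
    }
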
@@ -1948,7 +2006,6 @@ TEST_P(KVCacheManagerTest, KVCacheManagerTest)
     auto constexpr requestId = 7;
     auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq;
     auto constexpr maxAttentionWindow = maxNumTokens;
-    auto constexpr temporaryAttentionWindow = 0;
     auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1;
     auto constexpr numSharedBlocks = inputLength / tokensPerBlock;
     auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth;
@@ -1963,14 +2020,14 @@ TEST_P(KVCacheManagerTest, KVCacheManagerTest)
     KVCacheManager kvCacheManager = homogeneousLayers
         ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool,
-            maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream,
-            std::nullopt, enableBlockReuse, onboardBlocks)
+            maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
+            nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks)
         : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool,
-            maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream,
-            std::nullopt, enableBlockReuse, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+            maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
+            nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);
+    kvCacheManager.allocatePools(false);

-    EXPECT_EQ(kvCacheManager.getMaxBlocksPerSeq(), maxBlocksPerSeq);
+    EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq);
     EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks);
@@ -2096,7 +2153,6 @@ TEST_P(KVCacheManagerTest, KVCacheManagerRewindTokensTest)
     auto constexpr requestId = 7;
     auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq;
     auto constexpr maxAttentionWindow = maxNumTokens;
-    auto constexpr temporaryAttentionWindow = 0;
     auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1;
     auto constexpr numSharedBlocks = inputLength / tokensPerBlock;
     auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth;
@@ -2110,12 +2166,13 @@ TEST_P(KVCacheManagerTest, KVCacheManagerRewindTokensTest)
     KVCacheManager kvCacheManager = homogeneousLayers
         ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool,
-            maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream,
-            std::nullopt, enableBlockReuse, onboardBlocks)
+            maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
+            nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks)
         : KVCacheManager(std::vector<SizeType32>(numLayers, numHeads), sizePerHead, tokensPerBlock,
-            totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, {maxAttentionWindow},
-            temporaryAttentionWindow, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+            totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth,
+            std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF,
+            sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);
+    kvCacheManager.allocatePools(false);

     EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks);
@@ -2181,7 +2238,6 @@ TEST_P(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowTest)
     auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1;
     // Enable cyclic kv cache for all new generated tokens.
     auto constexpr maxAttentionWindow = inputLength;
-    auto constexpr temporaryAttentionWindow = 0;
     auto constexpr numSharedBlocks = std::min(inputLength, maxAttentionWindow) / tokensPerBlock;
     auto constexpr numBlocksPerSeq = numSharedBlocks + (blockLengthPerSeq - numSharedBlocks) * maxBeamWidth;
     auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxAttentionWindow, tokensPerBlock);
@@ -2196,14 +2252,14 @@ TEST_P(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowTest)
     KVCacheManager kvCacheManager = homogeneousLayers
         ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool,
-            maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream,
-            std::nullopt, enableBlockReuse, onboardBlocks)
+            maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
+            nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks)
         : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool,
-            maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream,
-            std::nullopt, enableBlockReuse, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+            maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
+            nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);
+    kvCacheManager.allocatePools(false);

-    EXPECT_EQ(kvCacheManager.getMaxBlocksPerSeq(), maxBlocksPerSeq);
+    EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq);
     EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock);
     EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks);
@@ -2303,7 +2359,6 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
     // Enable cyclic kv cache for long input tokens.
     auto constexpr maxAttentionWindow = 16;
-    auto constexpr temporaryAttentionWindow = 0;
     auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxAttentionWindow, tokensPerBlock);
     auto constexpr blocksInPrimaryPool = 16;
@@ -2313,9 +2368,10 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
     auto constexpr onboardBlocks = true;

     KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
-        blocksInSecondaryPool, maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow,
-        sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks);
-    kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false);
+        blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow},
+        std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse,
+        onboardBlocks);
+    kvCacheManager.allocatePools(false);

     auto const& blockManager = kvCacheManager.getBlockManager();
@@ -2340,7 +2396,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
     kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
     GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId);
     EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1, 2, 3}));
+    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2, 3}));

     // add tokens to enable cyclic kv cache
     llmRequest->addNewToken(1016, beamIdx);
@@ -2348,7 +2404,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
     llmRequest->addNewToken(1017, beamIdx);
     kvCacheManager.addToken(requestId);
     auto numTokens = llmRequest->getNumTokens(beamIdx);
-    auto numBlocks = seq0.getCacheBlockIds()[beamIdx].size();
+    auto numBlocks = seq0.getCacheBlockIds(maxAttentionWindow)[beamIdx].size();
     EXPECT_EQ(numBlocks, maxBlocksPerSeq);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
@@ -2366,14 +2422,14 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
     kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
     GenerationRequest const& seq1 = kvCacheManager.getSequence(requestId);
     EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0);
-    EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({4, 5}));
+    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({4, 5}));
     llmRequest->addNewToken(1007, beamIdx);
     kvCacheManager.addToken(requestId);
     llmRequest->addNewToken(1008, beamIdx);
     kvCacheManager.addToken(requestId);
     numTokens = llmRequest->getNumTokens(beamIdx);
-    numBlocks = seq1.getCacheBlockIds()[beamIdx].size();
+    numBlocks = seq1.getCacheBlockIds(maxAttentionWindow)[beamIdx].size();
     EXPECT_EQ(numBlocks, 3);
     EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
     // store blocks 4, 5 for reuse ([1000,1001,1002,1003], [1004,1005,1006,1007])
@@ -2389,7 +2445,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
     kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
     GenerationRequest const& seq2 = kvCacheManager.getSequence(requestId);
     EXPECT_EQ(llmRequest->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq2.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({4, 5, 7}));
+
EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({4, 5, 7})); numTokens = llmRequest->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); @@ -2400,7 +2456,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest) /////////////////////////////////////////////////////////////////////////// // add a longer request within attention window and try to reuse // reuse blocks 4, 5, 7(p) and get new block 8 - // upon reached attention window, shared block 4 is replaced with unshared block 9 + // upon reaching the attention window, the block ids shouldn't change requestId = 3; inputLength = 15; inputTokens->resize(inputLength); @@ -2409,13 +2465,15 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq3 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 9); - EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({4, 5, 7, 8})); + EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({4, 5, 7, 8})); llmRequest->addNewToken(1015, beamIdx); kvCacheManager.addToken(requestId); llmRequest->addNewToken(1016, beamIdx); kvCacheManager.addToken(requestId); - EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({9, 5, 7, 8})); + // FIXME: This means that reuse will break here - the window will start writing to a reused block, and the following + // sequence that tries to reuse the block will read garbage. This will be fixed by removing the cyclic kv cache. + EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({4, 5, 7, 8})); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); /////////////////////////////////////////////////////////////////////////// @@ -2428,7 +2486,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); GenerationRequest const& seq4 = kvCacheManager.getSequence(requestId); - EXPECT_THAT(seq4.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({10, 11, 12, 13})); + EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({9, 10, 11, 12})); } TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest) @@ -2440,13 +2498,13 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest) auto constexpr maxNumSequences = 8; auto constexpr maxBeamWidth = 1; auto constexpr sinkTokenLength = 0; + auto constexpr dtype = nvinfer1::DataType::kHALF; auto const stream = std::make_shared(); // Enable cyclic kv cache for long input tokens. 
auto constexpr minAttentionWindow = 8; auto constexpr maxAttentionWindow = 16; auto const maxAttentionWindowVec = std::vector<SizeType32>{maxAttentionWindow, minAttentionWindow}; - auto constexpr temporaryAttentionWindow = 0; auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxAttentionWindow, tokensPerBlock); auto constexpr blocksInPrimaryPool = 16; @@ -2456,12 +2514,18 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest) auto constexpr onboardBlocks = true; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, temporaryAttentionWindow, + blocksInSecondaryPool, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, std::nullopt, dtype, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks); - kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false); + kvCacheManager.allocatePools(false); auto const& blockManager = kvCacheManager.getBlockManager(); + auto const allBlocksInPrimaryPools = blockManager.getNumPrimaryBlocks(); + EXPECT_EQ(allBlocksInPrimaryPools, blocksInPrimaryPool); + + ASSERT_TRUE(blockManager.isVariableWindow()); + ASSERT_FALSE(blockManager.isVariableGQA()); + SizeType32 constexpr maxNewTokens = 4; // prepare tokens with token[i] = 1000 + i @@ -2483,21 +2547,31 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); - EXPECT_THAT(seq0.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({0, 1})); - // add tokens to enable cyclic kv cache + auto const assertBlocks + = [minAttentionWindow, maxAttentionWindow, beamIdx](GenerationRequest const& seq, + std::initializer_list<SizeType32> expectedBlocksMin, std::initializer_list<SizeType32> expectedBlocksMax) + { + auto blocksMin = seq.getCacheBlockIds(minAttentionWindow).at(beamIdx); + auto blocksMax = seq.getCacheBlockIds(maxAttentionWindow).at(beamIdx); + EXPECT_THAT(blocksMin, ::testing::ElementsAreArray(expectedBlocksMin)); + EXPECT_THAT(blocksMax, ::testing::ElementsAreArray(expectedBlocksMax)); + return blocksMin.size() + blocksMax.size(); + }; + + assertBlocks(seq0, {0, 1}, {0, 1}); + + // add tokens to enable cyclic kv cache for minimum but not maximum llmRequest->addNewToken(1016, beamIdx); kvCacheManager.addToken(requestId); llmRequest->addNewToken(1017, beamIdx); kvCacheManager.addToken(requestId); - auto numTokens = llmRequest->getNumTokens(beamIdx); - auto numBlocks = seq0.getCacheBlockIds()[beamIdx].size(); - EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); - EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); + auto const numBlocks = assertBlocks(seq0, {0, 1}, {0, 1, 2}); + EXPECT_EQ(blockManager.getNumFreeBlocks(), allBlocksInPrimaryPools - numBlocks); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); // no blocks stored because cyclic KV cache was enabled EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); - EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); + EXPECT_EQ(blockManager.getNumFreeBlocks(), allBlocksInPrimaryPools); /////////////////////////////////////////////////////////////////////////// // add a short request that is between the min and max attention window @@ -2508,15 +2582,12 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest)
kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq1 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); - EXPECT_THAT(seq1.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({3, 4, 5})); - + assertBlocks(seq1, {2, 3}, {3, 4, 5}); llmRequest->addNewToken(1007, beamIdx); kvCacheManager.addToken(requestId); llmRequest->addNewToken(1008, beamIdx); kvCacheManager.addToken(requestId); - numTokens = llmRequest->getNumTokens(beamIdx); - numBlocks = seq1.getCacheBlockIds()[beamIdx].size(); - EXPECT_EQ(numBlocks, 3); + assertBlocks(seq1, {2, 3}, {3, 4, 5}); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); /////////////////////////////////////////////////////////////////////////// @@ -2529,11 +2600,10 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq2 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); - EXPECT_THAT(seq2.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({6})); + assertBlocks(seq2, {4}, {6}); - numTokens = llmRequest->getNumTokens(beamIdx); - numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); - EXPECT_EQ(numBlocks, 1); + auto const numTokens = llmRequest->getNumTokens(beamIdx); + EXPECT_EQ(tc::ceilDiv(numTokens, tokensPerBlock), 1); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); // store block 6 for reuse @@ -2547,7 +2617,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq3 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 3); - EXPECT_THAT(seq3.getCacheBlockIds().at(beamIdx), ::testing::ElementsAreArray({6})); + assertBlocks(seq3, {4}, {6}); } namespace @@ -2566,31 +2636,31 @@ KVCacheManager setupKvCacheManagerForHashTest(bool enableBlockReuse) auto constexpr maxBlocksPerSeq = 8; auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxAttentionWindow = maxNumTokens; - auto constexpr temporaryAttentionWindow = 0; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; - return {std::vector(numLayers, numHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, + return KVCacheManager(std::vector(numLayers, numHeads), sizePerHead, tokensPerBlock, + blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, + std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks, CacheType::kSELF, std::nullopt, nullptr, - /*enableHashKey*/ true}; + /*enableHashKey*/ true); } std::vector getHashAndRetrieveBlocksByHashTest( - BlockManager const& blockManager, std::vector const& blockIds) + BlockManager const& blockManager, std::vector const& blockIds, SizeType32 windowSize) { std::vector blockHashes; for (auto blockId : blockIds) { - blockHashes.emplace_back(blockManager.getBlockById(blockId)->getHash()); + blockHashes.emplace_back(blockManager.getBlockById(blockId, windowSize)->getHash()); } std::vector blockPtrs; for (auto hash : blockHashes) { - auto range = blockManager.getBlocksByHash(hash); + 
auto range = blockManager.getBlocksByHash(hash, windowSize); BlockPtr const prevBlock = blockPtrs.empty() ? nullptr : blockPtrs.back(); BlockPtr thisBlock = nullptr; for (auto it = range.first; it != range.second; ++it) @@ -2607,7 +2677,7 @@ std::vector getHashAndRetrieveBlocksByHashTest( EXPECT_EQ(blockHashes.size(), blockPtrs.size()); for (size_t i = 0; i < blockHashes.size(); i++) { - EXPECT_EQ(blockManager.getBlockById(blockIds[i]), blockPtrs[i]); + EXPECT_EQ(blockManager.getBlockById(blockIds[i], windowSize), blockPtrs[i]); } return blockHashes; } @@ -2640,18 +2710,21 @@ TEST_F(KVCacheManagerTest, KVCacheManagerHashKeyTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); - auto& blockIds = seq.getCacheBlockIds().at(beamIdx); + + auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager); + + auto& blockIds = seq.getCacheBlockIds(onlyWindowSize).at(beamIdx); EXPECT_THAT(blockIds, ::testing::ElementsAreArray({0, 1, 2, 3})); // get blocks by hash and try to retrieve them by hash - auto blockHashes = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds); + auto blockHashes = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds, onlyWindowSize); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); // blocks are all removed for (auto hash : blockHashes) { - auto range = blockManager.getBlocksByHash(hash); + auto range = blockManager.getBlocksByHash(hash, onlyWindowSize); EXPECT_EQ(range.first, range.second); } EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); @@ -2684,14 +2757,17 @@ TEST_F(KVCacheManagerTest, KVCacheManagerHashKeyWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); - auto& blockIds0 = seq0.getCacheBlockIds().at(beamIdx); + + EXPECT_EQ(blockManager.getNumPools(), 1); + auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager); + + auto& blockIds0 = seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx); EXPECT_THAT(blockIds0, ::testing::ElementsAreArray({0, 1, 2, 3})); // get blocks by hash and try to retrieve them by hash - auto blockHashes = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds0); + auto blockHashes = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds0, onlyWindowSize); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); - // store 4 blocks with total of 15 reusable tokens (last token is not stored). // TODO: Make reused blocks accessible by hash, after sequence removed. Test here. 
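Note: the hash tests above (and several earlier hunks) call a theOnlyWindowSize(kvCacheManager) helper whose definition is outside this excerpt. A minimal sketch of what such a helper presumably looks like, assuming it merely asserts that exactly one window size is configured and returns it, and that getWindowSizesMetadata() exposes the window-size-to-metadata map introduced in kvCacheManager.h (the exact signature is an assumption):

// Sketch only, not the actual helper: resolves the single configured window
// size so that the now window-size-aware APIs (getCacheBlockIds,
// getBlocksByHash, getBlockById, getBufferManager, ...) can be called
// unambiguously in single-window tests.
SizeType32 theOnlyWindowSize(KVCacheManager const& kvCacheManager)
{
    auto const& windowSizeToMetadata = kvCacheManager.getBlockManager().getWindowSizesMetadata();
    EXPECT_EQ(windowSizeToMetadata.size(), 1);
    return windowSizeToMetadata.cbegin()->first;
}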
@@ -2705,21 +2781,21 @@ TEST_F(KVCacheManagerTest, KVCacheManagerHashKeyWithReuseTest) kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq1 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 15); - auto& blockIds1 = seq1.getCacheBlockIds().at(beamIdx); + auto& blockIds1 = seq1.getCacheBlockIds(onlyWindowSize).at(beamIdx); EXPECT_THAT(blockIds1, ::testing::ElementsAreArray({0, 1, 2, 3, 4})); - std::ignore = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds1); + std::ignore = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds1, onlyWindowSize); // blocks are reused, so reused blocks are still accessible by previous hashes for (size_t i = 0; i < 4; i++) { - auto range = blockManager.getBlocksByHash(blockHashes[i]); + auto range = blockManager.getBlocksByHash(blockHashes[i], onlyWindowSize); EXPECT_NE(range.first, range.second); } // evicted block is not accessible { size_t i = 4; - auto range = blockManager.getBlocksByHash(blockHashes[i]); + auto range = blockManager.getBlocksByHash(blockHashes[i], onlyWindowSize); EXPECT_EQ(range.first, range.second); } EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 5); @@ -2744,11 +2820,12 @@ TEST_F(KVCacheManagerTest, KVCacheManagerEventStream) tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; + auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream, - std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, + blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, + std::nullopt, dtype, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, std::make_unique(1024)); - kvCacheManager.allocatePools(dtype, false); + kvCacheManager.allocatePools(false); auto events = getEvents(kvCacheManager); EXPECT_EQ(events.size(), 1); @@ -2896,11 +2973,12 @@ TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamOverflow) tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; + auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream, - std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, + blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, + std::nullopt, dtype, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, std::make_unique(1)); - kvCacheManager.allocatePools(dtype, false); + kvCacheManager.allocatePools(false); auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); auto llmRequest0 = std::make_shared(0, 0, inputTokens0, samplingConfig, true); @@ -2950,11 +3028,12 @@ TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamPriority) tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; + auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream, - std::nullopt, true, onboardBlocks, 
CacheType::kSELF, std::nullopt, + blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, + std::nullopt, dtype, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, std::make_unique(1024)); - kvCacheManager.allocatePools(dtype, false); + kvCacheManager.allocatePools(false); auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7}); auto llmRequest0 = std::make_shared(0, 0, inputTokens0, samplingConfig, true); @@ -3021,18 +3100,19 @@ TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamBlocking) tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; + auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManagerTest(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream, - std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt); + blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, + std::nullopt, dtype, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt); EXPECT_EQ(getEvents(kvCacheManagerTest).size(), 0); KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, - blocksInSecondaryPool, maxNumSequences, beamWidth, {tokensPerBlock * maxBlocksPerSeq}, 0, 0, stream, - std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, - std::make_unique(1024)); + blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, + std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, + std::nullopt, std::make_unique(1024)); - kvCacheManager.allocatePools(dtype, false); + kvCacheManager.allocatePools(false); kvCacheManager.flushIterationEvents(); auto events = kvCacheManager.getLatestEvents(std::chrono::seconds(1)); @@ -3113,7 +3193,6 @@ TEST_P(KVCacheManagerTest, KVCacheManagerSinkTokenLengthTest) auto constexpr bubbleLength = (sinkTokensInLastBlock) ? tokensPerBlock - sinkTokensInLastBlock : 0; auto constexpr inputLength = tokensPerBlock * maxBlocksPerSeq - bubbleLength - 1; auto constexpr maxAttentionWindow = inputLength - tokensPerBlock; - auto constexpr temporaryAttentionWindow = 0; auto constexpr numSharedBlocks = (sinkTokenLength + bubbleLength) / tokensPerBlock; auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth; @@ -3130,14 +3209,14 @@ TEST_P(KVCacheManagerTest, KVCacheManagerSinkTokenLengthTest) auto const maxSequenceLength = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager = homogeneousLayers ? 
KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, - maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream, - maxSequenceLength, enableBlockReuse, onboardBlocks) + maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, + nvinfer1::DataType::kHALF, sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks) : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, - maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream, - maxSequenceLength, enableBlockReuse, onboardBlocks); - kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false); + maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, + nvinfer1::DataType::kHALF, sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks); + kvCacheManager.allocatePools(false); - EXPECT_EQ(kvCacheManager.getMaxBlocksPerSeq(), maxBlocksPerSeq); + EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq); EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks); @@ -3252,7 +3331,6 @@ TEST_P(KVCacheManagerTest, KVCacheManagerBatchTest) auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxAttentionWindow = maxNumTokens; - auto constexpr temporaryAttentionWindow = 0; auto constexpr inputLength = maxNumTokens - 2; auto constexpr numBlocksPerSeq = maxBlocksPerSeq - 1 + maxBeamWidth; @@ -3266,14 +3344,14 @@ TEST_P(KVCacheManagerTest, KVCacheManagerBatchTest) KVCacheManager kvCacheManager = homogeneousLayers ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, - maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream, - std::nullopt, enableBlockReuse, onboardBlocks) + maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, + nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks) : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, - maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, stream, - std::nullopt, enableBlockReuse, onboardBlocks); - kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false); + maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, + nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks); + kvCacheManager.allocatePools(false); - EXPECT_EQ(kvCacheManager.getMaxBlocksPerSeq(), maxBlocksPerSeq); + EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq); EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks); @@ -3395,22 +3473,23 @@ void testNeededBlocksOneStep(bool kv_cache_block_reuse, int beamWidth, int draft auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq; // auto constexpr maxAttentionWindow = maxNumTokens / 2; auto constexpr maxAttentionWindow = 46; - auto constexpr temporaryAttentionWindow = 0; auto constexpr totalNumBlocks = maxNumSequences * maxBlocksPerSeq; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; KVCacheManager kvCacheManager = homogeneousLayers ? 
KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, - blocksInSecondaryPool, maxNumSequences, maxBeamWidth, {maxAttentionWindow}, - temporaryAttentionWindow, sinkTokenLength, stream, std::nullopt, kv_cache_block_reuse, - onboardBlocks) + blocksInSecondaryPool, maxNumSequences, maxBeamWidth, + std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, + sinkTokenLength, stream, std::nullopt, kv_cache_block_reuse, onboardBlocks) : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, - maxNumSequences, maxBeamWidth, {maxAttentionWindow}, temporaryAttentionWindow, sinkTokenLength, - stream, std::nullopt, kv_cache_block_reuse, onboardBlocks); - kvCacheManager.allocatePools(nvinfer1::DataType::kHALF, false); + maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, + std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, + kv_cache_block_reuse, onboardBlocks); + kvCacheManager.allocatePools(false); - EXPECT_EQ(kvCacheManager.getMaxBlocksPerSeq(), tc::ceilDiv(maxAttentionWindow, tokensPerBlock)); + EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, + tc::ceilDiv(maxAttentionWindow, tokensPerBlock)); auto inputTokens = std::make_shared(VecTokens(inputLength, 0)); @@ -3419,8 +3498,11 @@ void testNeededBlocksOneStep(bool kv_cache_block_reuse, int beamWidth, int draft = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); llmRequest->setDraftTokens(draftTokens); - auto remainingBlocksToCompletion = kvCacheManager.getRemainingBlocksToCompletion(*llmRequest); - auto neededBlocksOneStep = kvCacheManager.getNeededBlocksOneStep(*llmRequest, false); + auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager); + + auto remainingBlocksToCompletion + = kvCacheManager.getRemainingBlocksToCompletion(*llmRequest, onlyWindowSize); + auto neededBlocksOneStep = kvCacheManager.getNeededBlocksOneStep(*llmRequest, false, onlyWindowSize); EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth, llmRequest)); for (int di = 0; di < draftLen && di < maxNewTokens && (inputLength + di) < maxAttentionWindow; ++di) @@ -3445,7 +3527,7 @@ void testNeededBlocksOneStep(bool kv_cache_block_reuse, int beamWidth, int draft llmRequest->addNewToken(1, beam); } - neededBlocksOneStep = kvCacheManager.getNeededBlocksOneStep(*llmRequest, false); + neededBlocksOneStep = kvCacheManager.getNeededBlocksOneStep(*llmRequest, false, onlyWindowSize); for (int beam = 0; beam < maxBeamWidth; beam++) { @@ -3468,7 +3550,7 @@ void testNeededBlocksOneStep(bool kv_cache_block_reuse, int beamWidth, int draft // After adding all tokens, we should match remainingBlocksToCompletion EXPECT_EQ(remainingBlocksToCompletion, kvCacheManager.getUsedNumBlocks()); - EXPECT_EQ(kvCacheManager.getRemainingBlocksToCompletion(*llmRequest), 0); + EXPECT_EQ(kvCacheManager.getRemainingBlocksToCompletion(*llmRequest, onlyWindowSize), 0); } } } @@ -3587,6 +3669,8 @@ struct KvCacheManagerInstantiationParameters SizeType32 maxBeamWidth; SizeType32 maxNumTokens; bool kvCacheBlockReuse; + std::vector maxAttentionWindowVec = {maxAttentionWindow}; + nvinfer1::DataType dtype = nvinfer1::DataType::kFLOAT; }; struct GetRemainingBlocksToCompletionOneRequestParameters @@ -3608,8 +3692,8 @@ std::shared_ptr createKvCacheManager( KvCacheManagerInstantiationParameters const& kvCacheInstantiationParameters, StreamPtr stream) { auto const maxInputLength = 
kvCacheInstantiationParameters.maxNumTokens - 1; - auto const temporaryKvCacheLength = std::min(kvCacheInstantiationParameters.maxNumTokens, - maxInputLength - kvCacheInstantiationParameters.maxAttentionWindow); + auto const temporaryKvCacheInputs + = TempAttentionWindowInputs{/*pagedContextFMHA=*/true, /*maxInputLen=*/maxInputLength, /*maxNumTokens=*/kvCacheInstantiationParameters.maxNumTokens}; if (std::holds_alternative<SizeType32>(kvCacheInstantiationParameters.numHeadsPerLayer)) { @@ -3619,9 +3703,9 @@ std::shared_ptr<KVCacheManager> createKvCacheManager( return std::make_shared<KVCacheManager>(numHeadsPerLayerVec, kvCacheInstantiationParameters.sizePerHead, kvCacheInstantiationParameters.tokensPerBlock, kvCacheInstantiationParameters.numBlocksInPrimaryPool, 0, kvCacheInstantiationParameters.numBlocksInPrimaryPool, kvCacheInstantiationParameters.maxBeamWidth, - std::vector<SizeType32>{kvCacheInstantiationParameters.maxAttentionWindow}, temporaryKvCacheLength, - kvCacheInstantiationParameters.sinkTokenLength, stream, std::nullopt, - kvCacheInstantiationParameters.kvCacheBlockReuse, true); + std::vector<SizeType32>{kvCacheInstantiationParameters.maxAttentionWindow}, temporaryKvCacheInputs, + kvCacheInstantiationParameters.dtype, kvCacheInstantiationParameters.sinkTokenLength, stream, std::nullopt, + kvCacheInstantiationParameters.kvCacheBlockReuse, true, CacheType::kSELF); } if (std::holds_alternative<std::vector<SizeType32>>(kvCacheInstantiationParameters.numHeadsPerLayer)) { @@ -3630,9 +3714,9 @@ std::shared_ptr<KVCacheManager> createKvCacheManager( return std::make_shared<KVCacheManager>(numHeadsPerLayer, kvCacheInstantiationParameters.sizePerHead, kvCacheInstantiationParameters.tokensPerBlock, kvCacheInstantiationParameters.numBlocksInPrimaryPool, 0, kvCacheInstantiationParameters.numBlocksInPrimaryPool, kvCacheInstantiationParameters.maxBeamWidth, - std::vector<SizeType32>{kvCacheInstantiationParameters.maxAttentionWindow}, temporaryKvCacheLength, - kvCacheInstantiationParameters.sinkTokenLength, stream, std::nullopt, - kvCacheInstantiationParameters.kvCacheBlockReuse, true); + std::vector<SizeType32>{kvCacheInstantiationParameters.maxAttentionWindow}, temporaryKvCacheInputs, + kvCacheInstantiationParameters.dtype, kvCacheInstantiationParameters.sinkTokenLength, stream, std::nullopt, + kvCacheInstantiationParameters.kvCacheBlockReuse, true, CacheType::kSELF); } TLLM_THROW("Unhandled type of num heads per layer provided."); } @@ -3652,7 +3736,8 @@ std::vector fillKvCacheManager(KVCacheManager& kvCacheManager, SizeT // Adding enough requests to fill the kv-cache.
auto remainingFreeBlocks = kvCacheManager.getNumFreeBlocks(); auto llmRequests = std::vector{}; - auto const remainingBlocksToCompletionFromStart = kvCacheManager.getRemainingBlocksToCompletion(llmRequest); + auto const remainingBlocksToCompletionFromStart + = kvCacheManager.getRemainingBlocksToCompletion(llmRequest, theOnlyWindowSize(kvCacheManager)); do { ++requestIdStart; @@ -3682,7 +3767,7 @@ class RemainingBlocksToCompletionTest auto const stream = std::make_shared(); auto const params = GetParam(); kvCacheManager = createKvCacheManager(params.kvCacheManagerInstantiationParameters, stream); - kvCacheManager->allocatePools(nvinfer1::DataType::kFLOAT); + kvCacheManager->allocatePools(false); } void TearDown() override {} @@ -3701,7 +3786,7 @@ TEST_P(RemainingBlocksToCompletionTest, RemainingBlocksToCompletionCorrectlyEsti tensorrt_llm::runtime::SamplingConfig{params.kvCacheManagerInstantiationParameters.maxBeamWidth}, true, }; - auto const result = kvCacheManager->getRemainingBlocksToCompletion(llmRequest); + auto const result = kvCacheManager->getRemainingBlocksToCompletion(llmRequest, theOnlyWindowSize(*kvCacheManager)); ASSERT_EQ(result, params.expectedRemainingBlocksToCompletion); } @@ -3766,7 +3851,7 @@ class FillKvCacheAndCompleteRequestsTest : public ::testing::TestWithParam(); auto const params = GetParam(); kvCacheManager = createKvCacheManager(params.kvCacheManagerInstantiationParameters, stream); - kvCacheManager->allocatePools(nvinfer1::DataType::kFLOAT); + kvCacheManager->allocatePools(false); } void TearDown() override {} diff --git a/cpp/tests/unit_tests/batch_manager/kvCacheUtilsTest.cpp b/cpp/tests/unit_tests/batch_manager/kvCacheUtilsTest.cpp index 7bb9a096193..8c167e1219f 100644 --- a/cpp/tests/unit_tests/batch_manager/kvCacheUtilsTest.cpp +++ b/cpp/tests/unit_tests/batch_manager/kvCacheUtilsTest.cpp @@ -84,16 +84,21 @@ TEST_F(BlockIteratorTest, CacheManagerTest) auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; + auto constexpr beamWidth = 1; + auto constexpr numBlocksPerBeam = blocksInPrimaryPool / beamWidth; + auto constexpr maxSequenceLength = tokensPerBlock * numBlocksPerBeam; + auto const maxAttentionWindowVec = std::vector{maxAttentionWindow}; + BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, - blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, onboardBlocks); - blockManager.allocatePools(dataType, false); + blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, maxSequenceLength, beamWidth, + maxAttentionWindowVec, std::nullopt, dataType, 0, onboardBlocks); + blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); SizeType32 constexpr maxNewTokens{0}; - auto constexpr beamWidth = 1; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; @@ -102,14 +107,14 @@ TEST_F(BlockIteratorTest, CacheManagerTest) LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); - GenerationRequest seq0{requestId, inputLength, beamWidth, maxBlocksPerSeq, maxAttentionWindow}; + GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; auto constexpr beamIdx = 0; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = 
tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); - blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0); + blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); - auto const blockIds = seq0.getCacheBlockIds().at(beamIdx); + auto const blockIds = seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(blockIds, ::testing::ElementsAreArray({0, 1, 2})); auto const pool = blockManager.getPrimaryPool(0); diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index 817d030b27d..498d4ef565a 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -147,7 +147,10 @@ def __init__( if kv_cache_config.max_attention_window is None: max_attention_window = max_seq_len else: - max_attention_window = max(kv_cache_config.max_attention_window) + assert len( + kv_cache_config.max_attention_window + ) == 1, "Python KvCacheManager doesn't currently support variable window attention" + max_attention_window = kv_cache_config.max_attention_window[0] sink_token_length = (kv_cache_config.sink_token_length if kv_cache_config.sink_token_length is not None @@ -176,8 +179,7 @@ def __init__( max_attention_window = max_atten_window_upper_bound self.max_seq_len = max_atten_window_upper_bound - max_kv_cache_len = (max_attention_window if kv_cache_type - == CacheTypeCpp.SELF else self.max_seq_len) + self.max_attention_window = max_attention_window if kv_cache_type == CacheTypeCpp.SELF else self.max_seq_len # Note that this stream is unused for now. Will be used for copying to host # when that feature is enabled. @@ -190,8 +192,9 @@ def __init__( 'blocks_in_secondary_pool': self.blocks_in_secondary_pool, 'max_num_sequences': max_batch_size, 'max_beam_width': 1, # TODO: more than 1 beam? - 'max_attention_window_vec': [max_kv_cache_len], - 'temporary_attention_window': 0, + 'max_attention_window_vec': [self.max_attention_window], + 'temp_attention_window_inputs': None, + 'dtype': dtype, 'sink_token_length': sink_token_length, 'stream': self._stream.cuda_stream, 'max_sequence_length': max_seq_len, @@ -207,7 +210,7 @@ def __init__( self.impl = KVCacheManagerCpp(**kwargs) - self.impl.allocate_pools(dtype, False) + self.impl.allocate_pools(False) self.kv_cache_pool_pointers = self.impl.get_block_pool_pointers() self.kv_cache_pool_mapping = self.impl.get_layer_to_pool_mapping() self.num_pools = self.impl.num_pools @@ -407,7 +410,8 @@ def get_max_atten_window_upper_bound(self, blocks_in_primary_pool, return max_atten_window_upper_bound def get_cache_indices(self, request: LlmRequest) -> List[int]: - result = self.impl.get_cache_block_ids(request.py_request_id) + result = self.impl.get_cache_block_ids(request.py_request_id, + self.max_attention_window) assert len(result) == 1 return result[0] @@ -415,7 +419,8 @@ def get_batch_cache_indices( self, request_ids: List[int], ) -> Dict[int, List[int]]: - result = self.impl.get_batch_cache_block_ids(request_ids) + result = self.impl.get_batch_cache_block_ids(request_ids, + self.max_attention_window) for i in range(len(result)): assert (len(result[i])) == 1 result[i] = result[i][0] diff --git a/tensorrt_llm/models/gemma/config.py b/tensorrt_llm/models/gemma/config.py index d95244f79d2..e37926818ba 100644 --- a/tensorrt_llm/models/gemma/config.py +++ b/tensorrt_llm/models/gemma/config.py @@ -12,6 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from json import loads +from pathlib import Path from typing import TYPE_CHECKING, Optional, Union from tensorrt_llm.functional import PositionEmbeddingType @@ -145,6 +147,19 @@ def to_dict(self): } if self.is_gemma_3 else {}) } + @staticmethod + def get_hf_config(config_dir: "Union[str, PathLike]"): + import transformers + model_type = loads( + (Path(config_dir) / "config.json").read_text())["model_type"] + HFConfigClass = { + "gemma": transformers.GemmaConfig, + "gemma2": transformers.Gemma2Config, + "gemma3_text": transformers.Gemma3TextConfig, + }[model_type] + hf_config = HFConfigClass.from_pretrained(config_dir) + return hf_config + @classmethod def from_hugging_face( cls, @@ -158,8 +173,7 @@ def from_hugging_face( if isinstance(hf_config_or_dir, transformers.PretrainedConfig): hf_config = hf_config_or_dir else: - hf_config = transformers.GemmaConfig.from_pretrained( - hf_config_or_dir) + hf_config = cls.get_hf_config(hf_config_or_dir) dtype = infer_dtype(dtype, getattr(hf_config, 'torch_dtype', None)) diff --git a/tensorrt_llm/models/generation_mixin.py b/tensorrt_llm/models/generation_mixin.py index d3da7e46711..049f177de65 100644 --- a/tensorrt_llm/models/generation_mixin.py +++ b/tensorrt_llm/models/generation_mixin.py @@ -339,17 +339,17 @@ def prepare_attention_inputs( math.ceil(kv_cache_range[0][2] / tokens_per_block) ]] * num_profiles - num_kv_cache_pools = 1 if num_kv_heads_per_layer is None else len( - set(num_kv_heads_per_layer[num_attn_layers_lower_ranks: - num_attn_layers_lower_ranks + - len(local_attn_layers)])) + NUM_KV_CACHE_POOLS = -1 # dynamic: the number of pools depends on the number of unique window sizes (and of unique GQA group sizes), which is only known at runtime. + # dim_range: min=1, opt=1 (the usual case: no variable GQA and no variable sliding window), max=len(local_attn_layers). + # TODO(nhaber): Benchmark whether making NUM_KV_CACHE_POOLS dynamic incurs a significant performance hit.
+ + kv_pools_range = [[1, 1, len(local_attn_layers)]] * num_profiles kv_cache_block_offsets = Tensor( name=f'kv_cache_block_offsets', dtype=trt.int32, - shape=[num_kv_cache_pools, -1, 2, -1], + shape=[NUM_KV_CACHE_POOLS, -1, 2, -1], dim_range=OrderedDict([ - ('num_kv_cache_pools', - [num_kv_cache_pools] * num_profiles), + ('num_kv_cache_pools', kv_pools_range), ('batch_size_beam_width', bb_range), ('kv', [2] * num_profiles), ('max_blocks_per_seq', max_blocks_per_seq_range), @@ -357,10 +357,9 @@ def prepare_attention_inputs( host_kv_cache_block_offsets = Tensor( name=f'host_kv_cache_block_offsets', dtype=trt.int32, - shape=[num_kv_cache_pools, -1, 2, -1], + shape=[NUM_KV_CACHE_POOLS, -1, 2, -1], dim_range=OrderedDict([ - ('num_kv_cache_pools', - [num_kv_cache_pools] * num_profiles), + ('num_kv_cache_pools', kv_pools_range), ('batch_size_beam_width', bb_range), ('kv', [2] * num_profiles), ('max_blocks_per_seq', max_blocks_per_seq_range), @@ -368,10 +367,9 @@ def prepare_attention_inputs( host_kv_cache_pool_pointers = Tensor( name=f'host_kv_cache_pool_pointers', dtype=trt.int64, - shape=[num_kv_cache_pools, 2], + shape=[NUM_KV_CACHE_POOLS, 2], dim_range=OrderedDict([ - ('num_pools_layers', - [num_kv_cache_pools] * num_profiles), + ('num_pools_layers', kv_pools_range), ('num_pools_kv', [2] * num_profiles), ])) diff --git a/tests/integration/defs/common.py b/tests/integration/defs/common.py index cdeaf6eb8ac..67d2a084f08 100644 --- a/tests/integration/defs/common.py +++ b/tests/integration/defs/common.py @@ -561,6 +561,8 @@ def generate_summary_cmd(example_root, *args, **kwargs): if isinstance(value, bool): if value: summary_cmd.append(f"--{key}") + elif isinstance(value, list): # Support max_attention_window + summary_cmd.extend([f"--{key}", *map(str, value)]) else: summary_cmd.extend([f"--{key}", f"{value}"]) diff --git a/tests/integration/defs/examples/test_gemma.py b/tests/integration/defs/examples/test_gemma.py index f16bfb6deff..5194f7dfa84 100644 --- a/tests/integration/defs/examples/test_gemma.py +++ b/tests/integration/defs/examples/test_gemma.py @@ -71,19 +71,71 @@ def get_ckpt_type(model_path): return ckpt_type +GEMMA_2_9B_IT = "gemma-2-9b-it" +GEMMA_2_27B_IT = "gemma-2-27b-it" +GEMMA_3_1B_IT = "gemma-3-1b-it" +VSWA_ATTENTION = { + GEMMA_2_9B_IT: [4096, 8192], + GEMMA_2_27B_IT: [4096, 8192], + GEMMA_3_1B_IT: [512, 512, 512, 512, 512, 32768] +} +""" +* Gemma-2: (local `4096`: https://huggingface.co/google/gemma-2-9b-it/blob/main/config.json#L27, global `8192`: https://huggingface.co/google/gemma-2-9b-it/blob/main/config.json#L18) +* Gemma-3-1b: (local `512`: https://huggingface.co/google/gemma-3-1b-it/blob/main/config.json#L31, global `32768`: https://huggingface.co/google/gemma-3-1b-it/blob/9b99be8/config.json#L20) +* (global `131072`: All other gemma 3 models https://github.com/huggingface/transformers/blob/ae5ce226644c8576c9047987e6b1d2e9bdeaed24/src/transformers/models/gemma3/modular_gemma3.py#L200C33-L200C40) +""" +VSWA_MODELS = VSWA_ATTENTION.keys() + +GEMMA2_MODELS = {GEMMA_2_9B_IT, GEMMA_2_27B_IT} + + +@pytest.mark.skip(reason="untested") +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("data_type", ['bfloat16']) +@pytest.mark.parametrize("qformat", ['fp8']) +@pytest.mark.parametrize("gemma_model_root", VSWA_MODELS, indirect=True) +def test_llm_hf_gemma_quantization_1gpu_vswa(batch_size, data_type, + gemma_model_root, llm_venv, + cmodel_dir, engine_dir, + gemma_example_root, + llm_datasets_root, llm_rouge_root, + qformat): + max_attention_window = 
VSWA_ATTENTION[Path(gemma_model_root).stem] hf_gemma_quantization_1gpu(batch_size, data_type, gemma_model_root, llm_venv, cmodel_dir, engine_dir, gemma_example_root, llm_datasets_root, llm_rouge_root, qformat, max_attention_window) + + @skip_post_blackwell @skip_pre_hopper @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("data_type", ['bfloat16', 'float16']) @pytest.mark.parametrize("qformat", ['fp8', 'int4_awq', 'int8_sq']) -@pytest.mark.parametrize( - "gemma_model_root", - ["gemma-2b", "gemma-7b", "gemma-2-9b-it", "gemma-2-27b-it"], - indirect=True) +@pytest.mark.parametrize("gemma_model_root", + ["gemma-2b", "gemma-7b", *GEMMA2_MODELS], + indirect=True) def test_llm_hf_gemma_quantization_1gpu(batch_size, data_type, gemma_model_root, llm_venv, cmodel_dir, engine_dir, gemma_example_root, llm_datasets_root, llm_rouge_root, qformat): + hf_gemma_quantization_1gpu(batch_size, data_type, gemma_model_root, + llm_venv, cmodel_dir, engine_dir, + gemma_example_root, llm_datasets_root, + llm_rouge_root, qformat) + + +def hf_gemma_quantization_1gpu(batch_size, + data_type, + gemma_model_root, + llm_venv, + cmodel_dir, + engine_dir, + gemma_example_root, + llm_datasets_root, + llm_rouge_root, + qformat, + max_attention_window: list[int] | None = None): "run gemma quantization tests" print("Convert checkpoint by modelopt...") kv_cache_dtype = 'fp8' if qformat == 'fp8' else 'int8' @@ -121,6 +173,11 @@ def test_llm_hf_gemma_quantization_1gpu(batch_size, data_type, gemma_model_root, if "gemma-7b" in gemma_model_root: threshold_score = 18 + window = [ + "--max_attention_window_size", + *map(str, max_attention_window), + ] if max_attention_window is not None else [] + summary_cmd = [ f"{gemma_example_root}/../summarize.py", "--test_trt_llm", @@ -133,10 +190,27 @@ def test_llm_hf_gemma_quantization_1gpu(batch_size, data_type, gemma_model_root, f"--batch_size={batch_size}", f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}", + *window, ] venv_check_call(llm_venv, summary_cmd) +# max_seq_len=3100: one local window value that won't slide, and one that will +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("data_type", ['bfloat16']) +@pytest.mark.parametrize("test_case", ['other']) +@pytest.mark.parametrize("gemma_model_root", VSWA_MODELS, indirect=True) +def test_llm_gemma_1gpu_summary_vswa(batch_size, data_type, gemma_model_root, + llm_venv, cmodel_dir, engine_dir, + gemma_example_root, llm_datasets_root, + llm_rouge_root, test_case): + max_attention_window = VSWA_ATTENTION[Path(gemma_model_root).stem] + gemma_1gpu_summary(batch_size, data_type, gemma_model_root, llm_venv, + cmodel_dir, engine_dir, gemma_example_root, + llm_datasets_root, llm_rouge_root, test_case, + max_attention_window) + + @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("data_type", ['float16', 'bfloat16']) @pytest.mark.parametrize("test_case", [ @@ -150,13 +224,29 @@ def test_llm_hf_gemma_quantization_1gpu(batch_size, data_type, gemma_model_root, @pytest.mark.parametrize("gemma_model_root", [ "gemma-2b", "gemma-7b", "gemma-2b-torch", "gemma-7b-torch", "gemma-2b-keras", "gemma-7b-keras", "gemma-2b-it-flax", "gemma-7b-it-flax", - "gemma-2-9b-it", "gemma-2-27b-it", "gemma-3-1b-it" + *GEMMA2_MODELS ], indirect=True) def test_llm_gemma_1gpu_summary(batch_size, data_type, gemma_model_root, llm_venv, cmodel_dir, engine_dir, gemma_example_root, llm_datasets_root, llm_rouge_root, test_case): + gemma_1gpu_summary(batch_size, data_type, gemma_model_root, llm_venv, + cmodel_dir, 
engine_dir, gemma_example_root, + llm_datasets_root, llm_rouge_root, test_case) + + +def gemma_1gpu_summary(batch_size, + data_type, + gemma_model_root, + llm_venv, + cmodel_dir, + engine_dir, + gemma_example_root, + llm_datasets_root, + llm_rouge_root, + test_case, + max_attention_window: list[int] | None = None): "run gemma test on 1 gpu" skip_fp8_pre_ada(use_fp8=test_case == "fp8_kv_cache") if "smooth_quant" in test_case and "bfloat16" in data_type: @@ -215,6 +305,10 @@ def test_llm_gemma_1gpu_summary(batch_size, data_type, gemma_model_root, check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env) + window = { + 'max_attention_window_size': max_attention_window + } if max_attention_window is not None else {} + print("Run summarize...") summary_cmd = generate_summary_cmd(gemma_example_root, engine_dir=engine_dir, @@ -222,7 +316,8 @@ def test_llm_gemma_1gpu_summary(batch_size, data_type, gemma_model_root, batch_size=batch_size, tensorrt_llm_rouge1_threshold=15, dataset_dir=llm_datasets_root, - rouge_dir=llm_rouge_root) + rouge_dir=llm_rouge_root, + **window) if ckpt_type == "hf": summary_cmd.extend([ @@ -232,13 +327,6 @@ def test_llm_gemma_1gpu_summary(batch_size, data_type, gemma_model_root, else: summary_cmd.append(f"--vocab_file={vocab_file}") - os.path.basename(gemma_model_root) - if 'gemma-3-1b-it' in gemma_model_root: - max_attention_window_size = [512, 512, 512, 512, 512, 3100] - summary_cmd.append(f"--max_attention_window_size") - for window_size in max_attention_window_size: - summary_cmd.append(str(window_size)) - venv_check_call(llm_venv, summary_cmd) @@ -436,10 +524,9 @@ def test_llm_gemma_1gpu_evaltool(gemma_model_root, llm_venv, cmodel_dir, @skip_pre_hopper -@pytest.mark.parametrize( - "gemma_model_root", - ["gemma-2b", "gemma-7b", "gemma-2-9b-it", "gemma-2-27b-it"], - indirect=True) +@pytest.mark.parametrize("gemma_model_root", + ["gemma-2b", "gemma-7b", *GEMMA2_MODELS], + indirect=True) def test_hf_gemma_fp8_base_bf16_multi_lora(gemma_model_root, llm_venv, cmodel_dir, diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt index f392317cccc..e4e88d623d4 100644 --- a/tests/integration/test_lists/qa/examples_test_list.txt +++ b/tests/integration/test_lists/qa/examples_test_list.txt @@ -35,7 +35,7 @@ examples/test_exaone.py::test_llm_exaone_1gpu[disable_weight_only-exaone_3.0_7.8 examples/test_exaone.py::test_llm_exaone_1gpu[enable_weight_only-exaone_deep_2.4b-float16-nb:1] examples/test_exaone.py::test_llm_exaone_2gpu[exaone_3.0_7.8b_instruct-float16-nb:1] examples/test_gemma.py::test_llm_gemma_1gpu_summary[gemma-2-27b-it-other-bfloat16-8] -examples/test_gemma.py::test_llm_gemma_1gpu_summary[gemma-3-1b-it-other-bfloat16-8] +examples/test_gemma.py::test_llm_gemma_1gpu_summary_vswa[gemma-3-1b-it-other-bfloat16-8] examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8] examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 554dda10c74..89b64d0eb88 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -113,7 +113,7 @@ l0_h100: - examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.2-1b] - examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_1.5b_instruct] - 
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] - - examples/test_gemma.py::test_llm_gemma_1gpu_summary[gemma-3-1b-it-other-bfloat16-8] + - examples/test_gemma.py::test_llm_gemma_1gpu_summary_vswa[gemma-3-1b-it-other-bfloat16-8] - examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16] - unittest/trt/model_api/test_model_level_api.py # 9 mins on H100 - unittest/trt/model_api/test_model_api_multi_gpu.py # 0.5 mins on H100 diff --git a/tests/unittest/bindings/test_bindings_ut.py b/tests/unittest/bindings/test_bindings_ut.py index 3849018bf8c..de5a8a811a4 100644 --- a/tests/unittest/bindings/test_bindings_ut.py +++ b/tests/unittest/bindings/test_bindings_ut.py @@ -555,8 +555,10 @@ def test_KvCache_events_binding(): 'max_beam_width': 1, 'max_attention_window_vec': [10], - 'temporary_attention_window': - 0, + 'temp_attention_window_inputs': + None, + 'dtype': + _tb.DataType.FLOAT, 'sink_token_length': 0, 'stream':