From fc1d412f9763f41693f97bcd003663aef30ac0b3 Mon Sep 17 00:00:00 2001 From: yuhangh <58161490+heyuhhh@users.noreply.github.com> Date: Sun, 12 Apr 2026 14:35:07 +0000 Subject: [PATCH 1/4] add history_hint in capacity resize Signed-off-by: yuhangh <58161490+heyuhhh@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/resource_manager.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index 3e661ad838cd..c0593d15d204 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -2294,7 +2294,12 @@ def release_resources(current_request: LlmRequest, return None kv_cache.stop_committing() dummy_capacity = token_num + self.num_extra_kv_tokens + num_extra_decoding_steps - success = kv_cache.resize(dummy_capacity) + # Gen requests only attend within the sliding window, so + # treat full capacity as history to activate stale-block + # optimization and match the solver's pool budget.
+ history_hint = dummy_capacity if is_gen else None + success = kv_cache.resize(dummy_capacity, + history_length=history_hint) if not success: release_resources(req) return None From d710087598f997bfb892e6119d734bc8fcfe0409 Mon Sep 17 00:00:00 2001 From: yuhangh <58161490+heyuhhh@users.noreply.github.com> Date: Sun, 12 Apr 2026 20:56:19 -0700 Subject: [PATCH 2/4] Fix to use real meaning of token_num in gen phase Signed-off-by: yuhangh <58161490+heyuhhh@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/resource_manager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index c0593d15d204..3830e200a3ca 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -2295,9 +2295,10 @@ def release_resources(current_request: LlmRequest, kv_cache.stop_committing() dummy_capacity = token_num + self.num_extra_kv_tokens + num_extra_decoding_steps # Gen requests only attend within the sliding window, so - # treat full capacity as history to activate stale-block + # hint the committed history to activate stale-block # optimization and match the solver's pool budget. - history_hint = dummy_capacity if is_gen else None + # token_num - 1 is the past history length in generation.
+ history_hint = max(0, token_num - 1) if is_gen else None success = kv_cache.resize(dummy_capacity, history_length=history_hint) if not success: From 4727bd5defdf949aa72c407ec40048444fc02366 Mon Sep 17 00:00:00 2001 From: yuhangh <58161490+heyuhhh@users.noreply.github.com> Date: Sun, 12 Apr 2026 21:01:37 -0700 Subject: [PATCH 3/4] minor Signed-off-by: yuhangh <58161490+heyuhhh@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/resource_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index 3830e200a3ca..37cddafe427a 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -2271,6 +2271,8 @@ def release_resources(current_request: LlmRequest, # during warmup. token_num = token_nums[ i] if token_nums is not None else 1 + max_num_draft_tokens + # token_num - 1 is the past history length in generation. + history_hint = max(0, token_num - 1) if is_gen else None # TODO: support cross attention encoder_input_tokens = None # Using 1 instead of 0 prevents NaN during warmup in e.g. Deepseek @@ -2297,9 +2299,6 @@ def release_resources(current_request: LlmRequest, return None kv_cache.stop_committing() dummy_capacity = token_num + self.num_extra_kv_tokens + num_extra_decoding_steps # Gen requests only attend within the sliding window, so # hint the committed history to activate stale-block # optimization and match the solver's pool budget. - # token_num - 1 is the past history length in generation.
- history_hint = max(0, token_num - 1) if is_gen else None success = kv_cache.resize(dummy_capacity, history_length=history_hint) if not success: @@ -2326,7 +2326,7 @@ def release_resources(current_request: LlmRequest, req.py_draft_tokens = [1] * max_num_draft_tokens if prepare_resource: new_capacity = kv_cache.capacity + max_num_draft_tokens + 1 - success = kv_cache.resize(new_capacity) + success = kv_cache.resize(new_capacity, history_length=history_hint) if not success: release_resources(req) return None From c2ad9895b7b6702a50fec14c103f67615588fa64 Mon Sep 17 00:00:00 2001 From: yuhangh <58161490+heyuhhh@users.noreply.github.com> Date: Mon, 13 Apr 2026 04:25:31 +0000 Subject: [PATCH 4/4] minor fix Signed-off-by: yuhangh <58161490+heyuhhh@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/resource_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index 37cddafe427a..f67127490a52 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -2296,8 +2296,7 @@ def release_resources(current_request: LlmRequest, return None kv_cache.stop_committing() dummy_capacity = token_num + self.num_extra_kv_tokens + num_extra_decoding_steps - # Gen requests only attend within the sliding window, so - # hint the committed history to activate stale-block + # Need to hint the committed history to activate stale-block # optimization and match the solver's pool budget.
success = kv_cache.resize(dummy_capacity, history_length=history_hint) @@ -2326,7 +2325,8 @@ def release_resources(current_request: LlmRequest, req.py_draft_tokens = [1] * max_num_draft_tokens if prepare_resource: new_capacity = kv_cache.capacity + max_num_draft_tokens + 1 - success = kv_cache.resize(new_capacity, history_length=history_hint) + success = kv_cache.resize(new_capacity, + history_length=history_hint) if not success: release_resources(req) return None