From fc1d412f9763f41693f97bcd003663aef30ac0b3 Mon Sep 17 00:00:00 2001 From: yuhangh <58161490+heyuhhh@users.noreply.github.com> Date: Sun, 12 Apr 2026 14:35:07 +0000 Subject: [PATCH 1/4] add history_hint in capacity resize Signed-off-by: yuhangh <58161490+heyuhhh@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/resource_manager.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index 3e661ad838cd..c0593d15d204 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -2294,7 +2294,12 @@ def release_resources(current_request: LlmRequest, return None kv_cache.stop_committing() dummy_capacity = token_num + self.num_extra_kv_tokens + num_extra_decoding_steps - success = kv_cache.resize(dummy_capacity) + # Gen requests only attend within the sliding window, so + # treat full capacity as history to activate stale-block + # optimization and match the solver's pool budget.
+ history_hint = dummy_capacity if is_gen else None + success = kv_cache.resize(dummy_capacity, + history_length=history_hint) if not success: release_resources(req) return None From d710087598f997bfb892e6119d734bc8fcfe0409 Mon Sep 17 00:00:00 2001 From: yuhangh <58161490+heyuhhh@users.noreply.github.com> Date: Sun, 12 Apr 2026 20:56:19 -0700 Subject: [PATCH 2/4] Fix to use real meaning of token_num in gen phase Signed-off-by: yuhangh <58161490+heyuhhh@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/resource_manager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index c0593d15d204..3830e200a3ca 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -2295,9 +2295,10 @@ def release_resources(current_request: LlmRequest, kv_cache.stop_committing() dummy_capacity = token_num + self.num_extra_kv_tokens + num_extra_decoding_steps # Gen requests only attend within the sliding window, so - # treat full capacity as history to activate stale-block + # hint the committed history to activate stale-block # optimization and match the solver's pool budget. - history_hint = dummy_capacity if is_gen else None + # token_num - 1 is the past history length in generation.
+ history_hint = max(0, token_num - 1) if is_gen else None success = kv_cache.resize(dummy_capacity, history_length=history_hint) if not success: From 4727bd5defdf949aa72c407ec40048444fc02366 Mon Sep 17 00:00:00 2001 From: yuhangh <58161490+heyuhhh@users.noreply.github.com> Date: Sun, 12 Apr 2026 21:01:37 -0700 Subject: [PATCH 3/4] minor Signed-off-by: yuhangh <58161490+heyuhhh@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/resource_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index 3830e200a3ca..37cddafe427a 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -2271,6 +2271,8 @@ def release_resources(current_request: LlmRequest, # during warmup. token_num = token_nums[ i] if token_nums is not None else 1 + max_num_draft_tokens + # token_num - 1 is the past history length in generation. + history_hint = max(0, token_num - 1) if is_gen else None # TODO: support cross attention encoder_input_tokens = None # Using 1 instead of 0 prevents NaN during warmup in e.g. Deepseek @@ -2297,9 +2299,6 @@ def release_resources(current_request: LlmRequest, return None kv_cache.stop_committing() dummy_capacity = token_num + self.num_extra_kv_tokens + num_extra_decoding_steps # Gen requests only attend within the sliding window, so # hint the committed history to activate stale-block # optimization and match the solver's pool budget. - # token_num - 1 is the past history length in generation.
- history_hint = max(0, token_num - 1) if is_gen else None success = kv_cache.resize(dummy_capacity, history_length=history_hint) if not success: @@ -2326,7 +2326,7 @@ def release_resources(current_request: LlmRequest, req.py_draft_tokens = [1] * max_num_draft_tokens if prepare_resource: new_capacity = kv_cache.capacity + max_num_draft_tokens + 1 - success = kv_cache.resize(new_capacity) + success = kv_cache.resize(new_capacity, history_length=history_hint) if not success: release_resources(req) return None From c2ad9895b7b6702a50fec14c103f67615588fa64 Mon Sep 17 00:00:00 2001 From: yuhangh <58161490+heyuhhh@users.noreply.github.com> Date: Mon, 13 Apr 2026 04:25:31 +0000 Subject: [PATCH 4/4] minor fix Signed-off-by: yuhangh <58161490+heyuhhh@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/resource_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index 37cddafe427a..f67127490a52 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -2296,8 +2296,7 @@ def release_resources(current_request: LlmRequest, return None kv_cache.stop_committing() dummy_capacity = token_num + self.num_extra_kv_tokens + num_extra_decoding_steps - # Gen requests only attend within the sliding window, so - # hint the committed history to activate stale-block + # Need to hint the committed history to activate stale-block # optimization and match the solver's pool budget.
success = kv_cache.resize(dummy_capacity, history_length=history_hint) @@ -2326,7 +2325,8 @@ def release_resources(current_request: LlmRequest, req.py_draft_tokens = [1] * max_num_draft_tokens if prepare_resource: new_capacity = kv_cache.capacity + max_num_draft_tokens + 1 - success = kv_cache.resize(new_capacity, history_length=history_hint) + success = kv_cache.resize(new_capacity, + history_length=history_hint) if not success: release_resources(req) return None