diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
index e42fa249acda..010a768d7dc3 100644
--- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -2271,6 +2271,8 @@ def release_resources(current_request: LlmRequest,
                     # during warmup.
                     token_num = token_nums[
                         i] if token_nums is not None else 1 + max_num_draft_tokens
+                    # token_num - 1 is the past history length in generation.
+                    history_hint = max(0, token_num - 1) if is_gen else None
                     # TODO: support cross attention
                     encoder_input_tokens = None
                     # Using 1 instead of 0 prevents NaN during warmup in e.g. Deepseek
@@ -2294,7 +2296,10 @@ def release_resources(current_request: LlmRequest,
                         return None
                     kv_cache.stop_committing()
                     dummy_capacity = token_num + self.num_extra_kv_tokens + num_extra_decoding_steps
-                    success = kv_cache.resize(dummy_capacity)
+                    # Need to hint the committed history to activate stale-block
+                    # optimization and match the solver's pool budget.
+                    success = kv_cache.resize(dummy_capacity,
+                                              history_length=history_hint)
                     if not success:
                         release_resources(req)
                         return None
@@ -2320,7 +2325,8 @@ def release_resources(current_request: LlmRequest,
                     req.py_draft_tokens = [1] * max_num_draft_tokens
                     if prepare_resource:
                         new_capacity = kv_cache.capacity + max_num_draft_tokens + 1
-                        success = kv_cache.resize(new_capacity)
+                        success = kv_cache.resize(new_capacity,
+                                                  history_length=history_hint)
                         if not success:
                             release_resources(req)
                             return None