Skip to content
10 changes: 8 additions & 2 deletions tensorrt_llm/_torch/pyexecutor/resource_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2271,6 +2271,8 @@ def release_resources(current_request: LlmRequest,
# during warmup.
token_num = token_nums[
i] if token_nums is not None else 1 + max_num_draft_tokens
# For generation requests, the past history length is token_num - 1
# (clamped at 0); no hint is passed otherwise.
history_hint = max(0, token_num - 1) if is_gen else None
# TODO: support cross attention
encoder_input_tokens = None
# Using 1 instead of 0 prevents NaN during warmup in e.g. Deepseek
Expand All @@ -2294,7 +2296,10 @@ def release_resources(current_request: LlmRequest,
return None
kv_cache.stop_committing()
dummy_capacity = token_num + self.num_extra_kv_tokens + num_extra_decoding_steps
success = kv_cache.resize(dummy_capacity)
# Pass the committed history length as a hint: this activates the
# stale-block optimization and matches the solver's pool budget.
success = kv_cache.resize(dummy_capacity,
history_length=history_hint)
if not success:
release_resources(req)
return None
Expand All @@ -2320,7 +2325,8 @@ def release_resources(current_request: LlmRequest,
req.py_draft_tokens = [1] * max_num_draft_tokens
if prepare_resource:
new_capacity = kv_cache.capacity + max_num_draft_tokens + 1
success = kv_cache.resize(new_capacity)
success = kv_cache.resize(new_capacity,
history_length=history_hint)
if not success:
release_resources(req)
return None
Expand Down
Loading