NVIDIA · lfr-0531 · Apr 17, 2026 · Apr 12, 2026 · Apr 13, 2026 · Apr 13, 2026
@@ -2271,6 +2271,8 @@ def release_resources(current_request: LlmRequest,
             # during warmup.
             token_num = token_nums[
                 i] if token_nums is not None else 1 + max_num_draft_tokens
+            # token_num - 1 is the past history length in generation.
+            history_hint = max(0, token_num - 1) if is_gen else None
             # TODO: support cross attention
             encoder_input_tokens = None
             # Using 1 instead of 0 prevents NaN during warmup in e.g. Deepseek
@@ -2294,7 +2296,10 @@ def release_resources(current_request: LlmRequest,
                     return None
                 kv_cache.stop_committing()
                 dummy_capacity = token_num + self.num_extra_kv_tokens + num_extra_decoding_steps
-                success = kv_cache.resize(dummy_capacity)
+                # Pass the committed history length as a hint to activate the
+                # stale-block optimization and match the solver's pool budget.
+                success = kv_cache.resize(dummy_capacity,
+                                          history_length=history_hint)
                 if not success:
                     release_resources(req)
                     return None
@@ -2320,7 +2325,8 @@ def release_resources(current_request: LlmRequest,
                 req.py_draft_tokens = [1] * max_num_draft_tokens
                 if prepare_resource:
                     new_capacity = kv_cache.capacity + max_num_draft_tokens + 1
-                    success = kv_cache.resize(new_capacity)
+                    success = kv_cache.resize(new_capacity,
+                                              history_length=history_hint)
                     if not success:
                         release_resources(req)
                         return None