MoonshotAI · wbxl2000 · May 25, 2026 · May 25, 2026
diff --git a/.changeset/widen-kimi-completion-budget.md b/.changeset/widen-kimi-completion-budget.md
@@ -0,0 +1,6 @@
+---
+"@moonshot-ai/agent-core": patch
+"@moonshot-ai/kimi-code": patch
+---
+
+Let Kimi requests use the remaining context window for completion tokens by default while keeping explicit environment limits as hard caps.
diff --git a/docs/en/configuration/env-vars.md b/docs/en/configuration/env-vars.md
@@ -74,7 +74,7 @@ When neither `KIMI_CODE_OAUTH_HOST` nor `KIMI_OAUTH_HOST` is set, the OAuth auth
 | `KIMI_DISABLE_TELEMETRY` | Disable telemetry reporting | `1`, `true`, `t`, `yes`, `y` (case-insensitive) |
 | `KIMI_CODE_BACKGROUND_KEEP_ALIVE_ON_EXIT` | Override `[background].keep_alive_on_exit`, controlling whether still-running background tasks are kept when the session closes | True values: `1`, `true`, `yes`, `on`; false values: `0`, `false`, `no`, `off`; when unset, reads `config.toml`, then falls back to `true` |
 | `KIMI_SHELL_PATH` | Override the absolute path to Git Bash (`bash.exe`) on Windows; only needed when auto-detection fails on Windows | None |
-| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | Desired budget for `max_completion_tokens` in a single-step LLM request (the actual value is further clamped by the context window and input size); set to `0` or a negative value to disable clamping entirely. **Currently effective only for providers of type `kimi`**; for Anthropic and other providers, use `[models.<alias>].max_output_size` instead (see [Config files](./config-files.md#models)) | Defaults to 32000, influenced by `loop_control.reserved_context_size` |
+| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | Explicit hard cap for `max_completion_tokens` in a single-step LLM request; when the model's context size is known, the effective value is still reduced to the safe remaining context window if that is smaller. When unset, Kimi Code uses the safe remaining context window for models with a known context size. Set to `0` or a negative value to disable clamping entirely. **Currently effective only for providers of type `kimi`**; for Anthropic and other providers, use `[models.<alias>].max_output_size` instead (see [Config files](./config-files.md#models)) | Unset: computed from remaining context; unknown context falls back to `loop_control.reserved_context_size`, then 32000 |
 
 For example, to disable telemetry on a shared host:
 

diff --git a/docs/zh/configuration/env-vars.md b/docs/zh/configuration/env-vars.md
@@ -74,7 +74,7 @@ OAuth 流程默认连接 Kimi 官方的认证与托管端点，下列变量可
 | `KIMI_DISABLE_TELEMETRY` | 关闭遥测上报 | `1`、`true`、`t`、`yes`、`y`（不区分大小写） |
 | `KIMI_CODE_BACKGROUND_KEEP_ALIVE_ON_EXIT` | 覆盖 `[background].keep_alive_on_exit`，控制会话关闭时是否保留仍在运行的后台任务 | 真值：`1`、`true`、`yes`、`on`；假值：`0`、`false`、`no`、`off`；未设置时读取 `config.toml`，再回退到 `true` |
 | `KIMI_SHELL_PATH` | 覆盖 Windows 上 Git Bash (`bash.exe`) 的绝对路径，仅在 Windows 自动探测失败时需要 | 无 |
-| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | 单步 LLM 请求 `max_completion_tokens` 的期望预算（实际值按上下文窗口与输入大小再做 clamp）；设为 `0` 或负数则完全禁用 clamp。**目前只对 `kimi` 类型的供应商生效**；Anthropic 等其它供应商请改用 `[models.<alias>].max_output_size`（详见 [配置文件](./config-files.md#models)） | 默认 32000，受 `loop_control.reserved_context_size` 影响 |
+| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | 单步 LLM 请求 `max_completion_tokens` 的显式硬上限；当模型的上下文窗口已知且安全的剩余上下文窗口更小时，实际值仍会被限制到剩余窗口。未设置时，对于已知上下文窗口的模型，Kimi Code 会使用安全的剩余上下文窗口；设为 `0` 或负数则完全禁用 clamp。**目前只对 `kimi` 类型的供应商生效**；Anthropic 等其它供应商请改用 `[models.<alias>].max_output_size`（详见 [配置文件](./config-files.md#models)） | 未设置：按剩余上下文计算；未知上下文窗口时回退到 `loop_control.reserved_context_size`，再回退到 32000 |
 
 例如在共享主机上禁用遥测：
 

diff --git a/packages/agent-core/src/agent/turn/index.ts b/packages/agent-core/src/agent/turn/index.ts
@@ -366,7 +366,7 @@ export class TurnFlow {
       const model = this.agent.config.model;
       const provider = this.agent.config.provider.withThinking(this.agent.config.thinkingLevel);
       const loopControl = this.agent.providerManager?.config.loopControl;
-      const completionBudget = resolveCompletionBudget({
+      const completionBudgetConfig = resolveCompletionBudget({
         reservedContextSize: loopControl?.reservedContextSize,
       });
 
@@ -380,7 +380,7 @@ export class TurnFlow {
             systemPrompt: this.agent.config.systemPrompt,
             capability: this.agent.config.modelCapabilities,
             generate: this.agent.generate,
-            completionBudget,
+            completionBudgetConfig,
           }),
           buildMessages: () => this.agent.context.messages,
           dispatchEvent: this.buildDispatchEvent(turnId),

diff --git a/packages/agent-core/src/agent/turn/kosong-llm.ts b/packages/agent-core/src/agent/turn/kosong-llm.ts
@@ -32,7 +32,7 @@ import {
 import type { LLM, LLMChatParams, LLMChatResponse, LLMRequestLogContext } from '../../loop';
 import {
   applyCompletionBudget,
-  type CompletionBudget,
+  type CompletionBudgetConfig,
 } from '../../utils/completion-budget';
 
 export const GENERATE_REQUEST_LOG_CONTEXT = '__kimiRequestLogContext';
@@ -56,12 +56,10 @@ export interface KosongLLMConfig {
    */
   readonly generate?: GenerateFn | undefined;
   /**
-   * Per-request completion-token budget. When set, each `chat()` call
-   * clones the configured provider with a clamped `max_completion_tokens`
-   * derived from the current input size and model context window. The
-   * clone is local to the call and never replaces `this.provider`.
+   * Completion budget config resolved from agent/provider settings. The
+   * final cap is computed per request from the current messages and tools.
    */
-  readonly completionBudget?: CompletionBudget | undefined;
+  readonly completionBudgetConfig?: CompletionBudgetConfig | undefined;
 }
 
 export class KosongLLM implements LLM {
@@ -71,15 +69,15 @@ export class KosongLLM implements LLM {
 
   private readonly provider: ChatProvider;
   private readonly generate: GenerateFn;
-  private readonly completionBudget: CompletionBudget | undefined;
+  private readonly completionBudgetConfig: CompletionBudgetConfig | undefined;
 
   constructor(config: KosongLLMConfig) {
     this.provider = config.provider;
     this.modelName = config.modelName;
     this.systemPrompt = config.systemPrompt;
     this.capability = config.capability;
     this.generate = config.generate ?? kosongGenerate;
-    this.completionBudget = config.completionBudget;
+    this.completionBudgetConfig = config.completionBudgetConfig;
   }
 
   async chat(params: LLMChatParams): Promise<LLMChatResponse> {
@@ -98,7 +96,7 @@ export class KosongLLM implements LLM {
     // context can still slip past the limit.
     const effectiveProvider = applyCompletionBudget({
       provider: this.provider,
-      budget: this.completionBudget,
+      budget: this.completionBudgetConfig,
       capability: this.capability,
       messages: params.messages,
       systemPrompt: this.systemPrompt,

diff --git a/packages/agent-core/src/utils/completion-budget.ts b/packages/agent-core/src/utils/completion-budget.ts
@@ -11,61 +11,41 @@ import {
   estimateTokensForTools,
 } from './tokens';
 
-/**
- * Desired completion-token budget for the next LLM step.
- *
- * The budget is a request, not a guarantee: it is clamped against the
- * current input size and the model's context window before being applied
- * to the provider. This avoids two failure modes for Kimi reasoning
- * models:
- *   1. A small cap can return HTTP 200 with empty `content` because the
- *      whole budget was spent on `reasoning_content`.
- *   2. A large cap may exceed the remaining context window and trigger
- *      `Invalid request: Your request exceeded model token limit`.
- */
-export interface CompletionBudget {
-  /** Desired completion budget when the model context window allows it. */
-  readonly desired: number;
-  /**
-   * Safety margin reserved between current input and the context limit,
-   * to absorb tokenizer estimation error and provider-side overhead.
-   */
-  readonly safetyMargin?: number | undefined;
+/** Completion-token budget for the next LLM request. */
+export interface CompletionBudgetConfig {
+  /** Explicit user-configured maximum. */
+  readonly hardCap?: number;
+  /** Conservative cap for providers/models whose context window is unknown. */
+  readonly fallback?: number;
+  /** Tokens kept out of the output budget to absorb estimation drift. */
+  readonly safetyMargin?: number;
 }
 
 const MIN_FLOOR = 1;
 const DEFAULT_SAFETY_MARGIN = 1024;
-const DEFAULT_DESIRED_BUDGET = 32000;
+const DEFAULT_UNKNOWN_CONTEXT_FALLBACK = 32000;
 
 /**
- * Resolve the completion budget for a turn from configuration and Kimi
- * environment variables.
- *
- * Priority (first wins): `KIMI_MODEL_MAX_COMPLETION_TOKENS`,
- * `KIMI_MODEL_MAX_TOKENS` (legacy alias), `reservedContextSize`,
- * `DEFAULT_DESIRED_BUDGET` (32000, preserves pre-PR-2332 behavior).
- *
- * Operators can opt out of clamping entirely by setting the env var to
- * `0` or a negative integer; in that case this function returns
- * `undefined`, which `applyCompletionBudget` treats as a no-op.
+ * Resolve the completion budget. `KIMI_MODEL_MAX_COMPLETION_TOKENS`, then legacy `KIMI_MODEL_MAX_TOKENS`, is a
+ * hard cap (non-positive disables clamping); otherwise `reservedContextSize`, then 32000, is the fallback.
  */
 export function resolveCompletionBudget(args: {
-  readonly reservedContextSize?: number | undefined;
-  readonly env?: NodeJS.ProcessEnv | undefined;
-}): CompletionBudget | undefined {
+  readonly reservedContextSize?: number;
+  readonly env?: NodeJS.ProcessEnv;
+}): CompletionBudgetConfig | undefined {
   const env = args.env ?? process.env;
   const fromNew = parseEnvBudget(env['KIMI_MODEL_MAX_COMPLETION_TOKENS']);
   if (fromNew !== 'absent') {
-    return fromNew === 'disabled' ? undefined : { desired: fromNew };
+    return fromNew === 'disabled' ? undefined : { hardCap: fromNew };
   }
   const fromLegacy = parseEnvBudget(env['KIMI_MODEL_MAX_TOKENS']);
   if (fromLegacy !== 'absent') {
-    return fromLegacy === 'disabled' ? undefined : { desired: fromLegacy };
+    return fromLegacy === 'disabled' ? undefined : { hardCap: fromLegacy };
   }
   if (args.reservedContextSize !== undefined && args.reservedContextSize > 0) {
-    return { desired: args.reservedContextSize };
+    return { fallback: args.reservedContextSize };
   }
-  return { desired: DEFAULT_DESIRED_BUDGET };
+  return { fallback: DEFAULT_UNKNOWN_CONTEXT_FALLBACK };
 }
 
 type EnvBudget = number | 'disabled' | 'absent';
@@ -79,37 +59,23 @@ function parseEnvBudget(raw: string | undefined): EnvBudget {
 }
 
 /**
- * Compute the effective `max_completion_tokens` cap for the next request.
- *
- *   cap = clamp(desired, MIN_FLOOR, max_context_tokens - input - safetyMargin)
- *
- * `input` accounts for everything the provider will actually serialize:
- * the conversation history, the system prompt, and the tool schemas.
- * Counting only `messages` underestimates by enough to push a near-limit
- * request past the model context window.
- *
- * When the model context size is unknown, the desired value is returned
- * unchanged (floored at `MIN_FLOOR`).
- *
- * When the remaining window is non-positive (input already at or above
- * the limit), `MIN_FLOOR` is returned — we can't honor a meaningful cap
- * and the API will surface the overflow on its own.
- *
- * Note: the floor never exceeds `remaining`, so a near-full context
- * cannot be pushed past the limit by `MIN_FLOOR` itself.
+ * Compute the effective `max_completion_tokens` cap. Known-context requests use the remaining window, capped
+ * at the hard cap when one is set; unknown-context requests use the hard cap, else the fallback.
  */
 export function computeCompletionBudgetCap(args: {
-  readonly budget: CompletionBudget;
+  readonly budget: CompletionBudgetConfig;
   readonly capability: ModelCapability | undefined;
   readonly messages: readonly Message[];
-  readonly systemPrompt?: string | undefined;
-  readonly tools?: readonly Tool[] | undefined;
+  readonly systemPrompt?: string;
+  readonly tools?: readonly Tool[];
 }): number {
-  const desired = args.budget.desired;
   const safetyMargin = args.budget.safetyMargin ?? DEFAULT_SAFETY_MARGIN;
   const maxCtx = args.capability?.max_context_tokens ?? 0;
   if (maxCtx <= 0) {
-    return Math.max(MIN_FLOOR, desired);
+    return Math.max(
+      MIN_FLOOR,
+      args.budget.hardCap ?? args.budget.fallback ?? DEFAULT_UNKNOWN_CONTEXT_FALLBACK,
+    );
   }
   const input =
     estimateTokensForMessages([...args.messages]) +
@@ -119,7 +85,10 @@ export function computeCompletionBudgetCap(args: {
   if (remaining <= 0) {
     return MIN_FLOOR;
   }
-  return Math.max(MIN_FLOOR, Math.min(desired, remaining));
+  if (args.budget.hardCap === undefined) {
+    return Math.max(MIN_FLOOR, remaining);
+  }
+  return Math.max(MIN_FLOOR, Math.min(args.budget.hardCap, remaining));
 }
 
 /**
@@ -134,11 +103,11 @@ export function computeCompletionBudgetCap(args: {
  */
 export function applyCompletionBudget(args: {
   readonly provider: ChatProvider;
-  readonly budget: CompletionBudget | undefined;
+  readonly budget: CompletionBudgetConfig | undefined;
   readonly capability: ModelCapability | undefined;
   readonly messages: readonly Message[];
-  readonly systemPrompt?: string | undefined;
-  readonly tools?: readonly Tool[] | undefined;
+  readonly systemPrompt?: string;
+  readonly tools?: readonly Tool[];
 }): ChatProvider {
   if (args.budget === undefined) return args.provider;
   if (args.provider.withMaxCompletionTokens === undefined) return args.provider;