diff --git a/.changeset/widen-kimi-completion-budget.md b/.changeset/widen-kimi-completion-budget.md
new file mode 100644
index 0000000..6aa2238
--- /dev/null
+++ b/.changeset/widen-kimi-completion-budget.md
@@ -0,0 +1,6 @@
+---
+"@moonshot-ai/agent-core": patch
+"@moonshot-ai/kimi-code": patch
+---
+
+Let Kimi requests use the remaining context window for completion tokens by default while keeping explicit environment limits as hard caps.
diff --git a/docs/en/configuration/env-vars.md b/docs/en/configuration/env-vars.md
index 78a5959..5e00443 100644
--- a/docs/en/configuration/env-vars.md
+++ b/docs/en/configuration/env-vars.md
@@ -74,7 +74,7 @@ When neither `KIMI_CODE_OAUTH_HOST` nor `KIMI_OAUTH_HOST` is set, the OAuth auth
 | `KIMI_DISABLE_TELEMETRY` | Disable telemetry reporting | `1`, `true`, `t`, `yes`, `y` (case-insensitive) |
 | `KIMI_CODE_BACKGROUND_KEEP_ALIVE_ON_EXIT` | Override `[background].keep_alive_on_exit`, controlling whether still-running background tasks are kept when the session closes | True values: `1`, `true`, `yes`, `on`; false values: `0`, `false`, `no`, `off`; when unset, reads `config.toml`, then falls back to `true` |
 | `KIMI_SHELL_PATH` | Override the absolute path to Git Bash (`bash.exe`) on Windows; only needed when auto-detection fails on Windows | None |
-| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | Desired budget for `max_completion_tokens` in a single-step LLM request (the actual value is further clamped by the context window and input size); set to `0` or a negative value to disable clamping entirely. **Currently effective only for providers of type `kimi`**; for Anthropic and other providers, use `[models.<alias>].max_output_size` instead (see [Config files](./config-files.md#models)) | Defaults to 32000, influenced by `loop_control.reserved_context_size` |
+| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | Explicit hard cap for `max_completion_tokens` in a single-step LLM request; for models with a known context size, the value actually sent is still clamped to the remaining context window. When unset, Kimi Code uses the remaining context window (context size minus the estimated input size and a safety margin) for models with a known context size. Set to `0` or a negative value to disable clamping entirely. **Currently effective only for providers of type `kimi`**; for Anthropic and other providers, use `[models.<alias>].max_output_size` instead (see [Config files](./config-files.md#models)) | Unset: computed from remaining context; unknown context falls back to `loop_control.reserved_context_size`, then 32000 |
 
 For example, to disable telemetry on a shared host:
 
diff --git a/docs/zh/configuration/env-vars.md b/docs/zh/configuration/env-vars.md
index 7116ab4..e413cbe 100644
--- a/docs/zh/configuration/env-vars.md
+++ b/docs/zh/configuration/env-vars.md
@@ -74,7 +74,7 @@ OAuth 流程默认连接 Kimi 官方的认证与托管端点，下列变量可
 | `KIMI_DISABLE_TELEMETRY` | 关闭遥测上报 | `1`、`true`、`t`、`yes`、`y`（不区分大小写） |
 | `KIMI_CODE_BACKGROUND_KEEP_ALIVE_ON_EXIT` | 覆盖 `[background].keep_alive_on_exit`，控制会话关闭时是否保留仍在运行的后台任务 | 真值：`1`、`true`、`yes`、`on`；假值：`0`、`false`、`no`、`off`；未设置时读取 `config.toml`，再回退到 `true` |
 | `KIMI_SHELL_PATH` | 覆盖 Windows 上 Git Bash (`bash.exe`) 的绝对路径，仅在 Windows 自动探测失败时需要 | 无 |
-| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | 单步 LLM 请求 `max_completion_tokens` 的期望预算（实际值按上下文窗口与输入大小再做 clamp）；设为 `0` 或负数则完全禁用 clamp。**目前只对 `kimi` 类型的供应商生效**；Anthropic 等其它供应商请改用 `[models.<alias>].max_output_size`（详见 [配置文件](./config-files.md#models)） | 默认 32000，受 `loop_control.reserved_context_size` 影响 |
+| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | 单步 LLM 请求 `max_completion_tokens` 的显式硬上限；对于已知上下文窗口的模型，实际值仍会按剩余上下文窗口再做 clamp。未设置时，对于已知上下文窗口的模型，Kimi Code 会使用剩余上下文窗口（上下文窗口减去估算的输入大小与安全余量）；设为 `0` 或负数则完全禁用 clamp。**目前只对 `kimi` 类型的供应商生效**；Anthropic 等其它供应商请改用 `[models.<alias>].max_output_size`（详见 [配置文件](./config-files.md#models)） | 未设置：按剩余上下文计算；未知上下文窗口时回退到 `loop_control.reserved_context_size`，再回退到 32000 |
 
 例如在共享主机上禁用遥测：
 
diff --git a/packages/agent-core/src/agent/turn/index.ts b/packages/agent-core/src/agent/turn/index.ts
index 1b14b2a..8aecd7b 100644
--- a/packages/agent-core/src/agent/turn/index.ts
+++ b/packages/agent-core/src/agent/turn/index.ts
@@ -366,7 +366,7 @@ export class TurnFlow {
       const model = this.agent.config.model;
       const provider = this.agent.config.provider.withThinking(this.agent.config.thinkingLevel);
       const loopControl = this.agent.providerManager?.config.loopControl;
-      const completionBudget = resolveCompletionBudget({
+      const completionBudgetConfig = resolveCompletionBudget({
         reservedContextSize: loopControl?.reservedContextSize,
       });
 
@@ -380,7 +380,7 @@ export class TurnFlow {
             systemPrompt: this.agent.config.systemPrompt,
             capability: this.agent.config.modelCapabilities,
             generate: this.agent.generate,
-            completionBudget,
+            completionBudgetConfig,
           }),
           buildMessages: () => this.agent.context.messages,
           dispatchEvent: this.buildDispatchEvent(turnId),
diff --git a/packages/agent-core/src/agent/turn/kosong-llm.ts b/packages/agent-core/src/agent/turn/kosong-llm.ts
index 834fa47..d61eaa2 100644
--- a/packages/agent-core/src/agent/turn/kosong-llm.ts
+++ b/packages/agent-core/src/agent/turn/kosong-llm.ts
@@ -32,7 +32,7 @@ import {
 import type { LLM, LLMChatParams, LLMChatResponse, LLMRequestLogContext } from '../../loop';
 import {
   applyCompletionBudget,
-  type CompletionBudget,
+  type CompletionBudgetConfig,
 } from '../../utils/completion-budget';
 
 export const GENERATE_REQUEST_LOG_CONTEXT = '__kimiRequestLogContext';
@@ -56,12 +56,10 @@ export interface KosongLLMConfig {
    */
   readonly generate?: GenerateFn | undefined;
   /**
-   * Per-request completion-token budget. When set, each `chat()` call
-   * clones the configured provider with a clamped `max_completion_tokens`
-   * derived from the current input size and model context window. The
-   * clone is local to the call and never replaces `this.provider`.
+   * Completion budget config resolved from agent/provider settings. The
+   * final cap is computed per request from the current messages and tools.
    */
-  readonly completionBudget?: CompletionBudget | undefined;
+  readonly completionBudgetConfig?: CompletionBudgetConfig | undefined;
 }
 
 export class KosongLLM implements LLM {
@@ -71,7 +69,7 @@ export class KosongLLM implements LLM {
 
   private readonly provider: ChatProvider;
   private readonly generate: GenerateFn;
-  private readonly completionBudget: CompletionBudget | undefined;
+  private readonly completionBudgetConfig: CompletionBudgetConfig | undefined;
 
   constructor(config: KosongLLMConfig) {
     this.provider = config.provider;
@@ -79,7 +77,7 @@ export class KosongLLM implements LLM {
     this.systemPrompt = config.systemPrompt;
     this.capability = config.capability;
     this.generate = config.generate ?? kosongGenerate;
-    this.completionBudget = config.completionBudget;
+    this.completionBudgetConfig = config.completionBudgetConfig;
   }
 
   async chat(params: LLMChatParams): Promise<LLMChatResponse> {
@@ -98,7 +96,7 @@ export class KosongLLM implements LLM {
     // context can still slip past the limit.
     const effectiveProvider = applyCompletionBudget({
       provider: this.provider,
-      budget: this.completionBudget,
+      budget: this.completionBudgetConfig,
       capability: this.capability,
       messages: params.messages,
       systemPrompt: this.systemPrompt,
diff --git a/packages/agent-core/src/utils/completion-budget.ts b/packages/agent-core/src/utils/completion-budget.ts
index abcf027..361d654 100644
--- a/packages/agent-core/src/utils/completion-budget.ts
+++ b/packages/agent-core/src/utils/completion-budget.ts
@@ -11,61 +11,41 @@ import {
   estimateTokensForTools,
 } from './tokens';
 
-/**
- * Desired completion-token budget for the next LLM step.
- *
- * The budget is a request, not a guarantee: it is clamped against the
- * current input size and the model's context window before being applied
- * to the provider. This avoids two failure modes for Kimi reasoning
- * models:
- *   1. A small cap can return HTTP 200 with empty `content` because the
- *      whole budget was spent on `reasoning_content`.
- *   2. A large cap may exceed the remaining context window and trigger
- *      `Invalid request: Your request exceeded model token limit`.
- */
-export interface CompletionBudget {
-  /** Desired completion budget when the model context window allows it. */
-  readonly desired: number;
-  /**
-   * Safety margin reserved between current input and the context limit,
-   * to absorb tokenizer estimation error and provider-side overhead.
-   */
-  readonly safetyMargin?: number | undefined;
+/** Completion-token budget for the next LLM request. */
+export interface CompletionBudgetConfig {
+  /** Explicit user-configured maximum. */
+  readonly hardCap?: number;
+  /** Conservative cap for providers/models whose context window is unknown. */
+  readonly fallback?: number;
+  /** Tokens kept out of the output budget to absorb estimation drift. */
+  readonly safetyMargin?: number;
 }
 
 const MIN_FLOOR = 1;
 const DEFAULT_SAFETY_MARGIN = 1024;
-const DEFAULT_DESIRED_BUDGET = 32000;
+const DEFAULT_UNKNOWN_CONTEXT_FALLBACK = 32000;
 
 /**
- * Resolve the completion budget for a turn from configuration and Kimi
- * environment variables.
- *
- * Priority (first wins): `KIMI_MODEL_MAX_COMPLETION_TOKENS`,
- * `KIMI_MODEL_MAX_TOKENS` (legacy alias), `reservedContextSize`,
- * `DEFAULT_DESIRED_BUDGET` (32000, preserves pre-PR-2332 behavior).
- *
- * Operators can opt out of clamping entirely by setting the env var to
- * `0` or a negative integer; in that case this function returns
- * `undefined`, which `applyCompletionBudget` treats as a no-op.
+ * Resolve the completion budget config. Positive env values are hard caps,
+ * non-positive ones disable clamping; otherwise only a fallback is set.
  */
 export function resolveCompletionBudget(args: {
-  readonly reservedContextSize?: number | undefined;
-  readonly env?: NodeJS.ProcessEnv | undefined;
-}): CompletionBudget | undefined {
+  readonly reservedContextSize?: number;
+  readonly env?: NodeJS.ProcessEnv;
+}): CompletionBudgetConfig | undefined {
   const env = args.env ?? process.env;
   const fromNew = parseEnvBudget(env['KIMI_MODEL_MAX_COMPLETION_TOKENS']);
   if (fromNew !== 'absent') {
-    return fromNew === 'disabled' ? undefined : { desired: fromNew };
+    return fromNew === 'disabled' ? undefined : { hardCap: fromNew };
   }
   const fromLegacy = parseEnvBudget(env['KIMI_MODEL_MAX_TOKENS']);
   if (fromLegacy !== 'absent') {
-    return fromLegacy === 'disabled' ? undefined : { desired: fromLegacy };
+    return fromLegacy === 'disabled' ? undefined : { hardCap: fromLegacy };
   }
   if (args.reservedContextSize !== undefined && args.reservedContextSize > 0) {
-    return { desired: args.reservedContextSize };
+    return { fallback: args.reservedContextSize };
   }
-  return { desired: DEFAULT_DESIRED_BUDGET };
+  return { fallback: DEFAULT_UNKNOWN_CONTEXT_FALLBACK };
 }
 
 type EnvBudget = number | 'disabled' | 'absent';
@@ -79,37 +59,23 @@ function parseEnvBudget(raw: string | undefined): EnvBudget {
 }
 
 /**
- * Compute the effective `max_completion_tokens` cap for the next request.
- *
- *   cap = clamp(desired, MIN_FLOOR, max_context_tokens - input - safetyMargin)
- *
- * `input` accounts for everything the provider will actually serialize:
- * the conversation history, the system prompt, and the tool schemas.
- * Counting only `messages` underestimates by enough to push a near-limit
- * request past the model context window.
- *
- * When the model context size is unknown, the desired value is returned
- * unchanged (floored at `MIN_FLOOR`).
- *
- * When the remaining window is non-positive (input already at or above
- * the limit), `MIN_FLOOR` is returned — we can't honor a meaningful cap
- * and the API will surface the overflow on its own.
- *
- * Note: the floor never exceeds `remaining`, so a near-full context
- * cannot be pushed past the limit by `MIN_FLOOR` itself.
+ * Compute the effective `max_completion_tokens` cap. With a known context,
+ * this is the remaining window, limited further by the hard cap when set.
  */
 export function computeCompletionBudgetCap(args: {
-  readonly budget: CompletionBudget;
+  readonly budget: CompletionBudgetConfig;
   readonly capability: ModelCapability | undefined;
   readonly messages: readonly Message[];
-  readonly systemPrompt?: string | undefined;
-  readonly tools?: readonly Tool[] | undefined;
+  readonly systemPrompt?: string;
+  readonly tools?: readonly Tool[];
 }): number {
-  const desired = args.budget.desired;
   const safetyMargin = args.budget.safetyMargin ?? DEFAULT_SAFETY_MARGIN;
   const maxCtx = args.capability?.max_context_tokens ?? 0;
   if (maxCtx <= 0) {
-    return Math.max(MIN_FLOOR, desired);
+    return Math.max(
+      MIN_FLOOR,
+      args.budget.hardCap ?? args.budget.fallback ?? DEFAULT_UNKNOWN_CONTEXT_FALLBACK,
+    );
   }
   const input =
     estimateTokensForMessages([...args.messages]) +
@@ -119,7 +85,10 @@ export function computeCompletionBudgetCap(args: {
   if (remaining <= 0) {
     return MIN_FLOOR;
   }
-  return Math.max(MIN_FLOOR, Math.min(desired, remaining));
+  if (args.budget.hardCap === undefined) {
+    return Math.max(MIN_FLOOR, remaining);
+  }
+  return Math.max(MIN_FLOOR, Math.min(args.budget.hardCap, remaining));
 }
 
 /**
@@ -134,11 +103,11 @@ export function computeCompletionBudgetCap(args: {
  */
 export function applyCompletionBudget(args: {
   readonly provider: ChatProvider;
-  readonly budget: CompletionBudget | undefined;
+  readonly budget: CompletionBudgetConfig | undefined;
   readonly capability: ModelCapability | undefined;
   readonly messages: readonly Message[];
-  readonly systemPrompt?: string | undefined;
-  readonly tools?: readonly Tool[] | undefined;
+  readonly systemPrompt?: string;
+  readonly tools?: readonly Tool[];
 }): ChatProvider {
   if (args.budget === undefined) return args.provider;
   if (args.provider.withMaxCompletionTokens === undefined) return args.provider;
diff --git a/packages/agent-core/test/utils/completion-budget.test.ts b/packages/agent-core/test/utils/completion-budget.test.ts
index fe84a3b..77767c6 100644
--- a/packages/agent-core/test/utils/completion-budget.test.ts
+++ b/packages/agent-core/test/utils/completion-budget.test.ts
@@ -45,45 +45,55 @@ function makeTool(name: string, asciiCharsInDescription: number): Tool {
 }
 
 describe('computeCompletionBudgetCap', () => {
-  it('returns desired when context size is unknown', () => {
+  it('uses fallback when context size is unknown and no hard cap is set', () => {
     const cap = computeCompletionBudgetCap({
-      budget: { desired: 8192 },
+      budget: { fallback: 8192 },
       capability: undefined,
       messages: makeMessages(100),
     });
     expect(cap).toBe(8192);
   });
 
-  it('preserves a small desired when context size is unknown — no artificial floor', () => {
+  it('uses an explicit hard cap when context size is unknown', () => {
     const cap = computeCompletionBudgetCap({
-      budget: { desired: 10 },
+      budget: { hardCap: 10, fallback: 8192 },
       capability: makeCapability(0),
       messages: makeMessages(100),
     });
     expect(cap).toBe(10);
   });
 
-  it('floors at 1 when desired is zero or negative', () => {
+  it('floors at 1 when hard cap is zero or negative', () => {
     expect(
       computeCompletionBudgetCap({
-        budget: { desired: 0 },
+        budget: { hardCap: 0 },
         capability: undefined,
         messages: makeMessages(10),
       }),
     ).toBe(1);
     expect(
       computeCompletionBudgetCap({
-        budget: { desired: -100 },
+        budget: { hardCap: -100 },
         capability: undefined,
         messages: makeMessages(10),
       }),
     ).toBe(1);
   });
 
-  it('clamps desired down to the remaining context window', () => {
+  it('uses the remaining context window when no hard cap is set', () => {
+    const maxCtx = 100000;
+    const cap = computeCompletionBudgetCap({
+      budget: { fallback: 32000 },
+      capability: makeCapability(maxCtx),
+      messages: makeMessages(1000),
+    });
+    expect(cap).toBe(maxCtx - 1001 - 1024);
+  });
+
+  it('clamps explicit hard cap down to the remaining context window', () => {
     // max_context_tokens 10000, input ~ 1000, safetyMargin 1024 → remaining ~ 7976
     const cap = computeCompletionBudgetCap({
-      budget: { desired: 32000 },
+      budget: { hardCap: 32000 },
       capability: makeCapability(10000),
       messages: makeMessages(1000),
     });
@@ -93,7 +103,7 @@ describe('computeCompletionBudgetCap', () => {
 
   it('returns 1 when input already exceeds context minus margin', () => {
     const cap = computeCompletionBudgetCap({
-      budget: { desired: 32000 },
+      budget: { fallback: 32000 },
       capability: makeCapability(10000),
       messages: makeMessages(11000),
     });
@@ -105,7 +115,7 @@ describe('computeCompletionBudgetCap', () => {
     // The cap MUST stay <= remaining so the request does not overflow.
     const maxCtx = 10000;
     const cap = computeCompletionBudgetCap({
-      budget: { desired: 32000 },
+      budget: { fallback: 32000 },
       capability: makeCapability(maxCtx),
       messages: makeMessages(8900),
     });
@@ -115,7 +125,7 @@ describe('computeCompletionBudgetCap', () => {
 
   it('respects custom safetyMargin', () => {
     const cap = computeCompletionBudgetCap({
-      budget: { desired: 32000, safetyMargin: 4096 },
+      budget: { fallback: 32000, safetyMargin: 4096 },
       capability: makeCapability(20000),
       messages: makeMessages(1000),
     });
@@ -123,9 +133,9 @@ describe('computeCompletionBudgetCap', () => {
     expect(cap).toBe(14903);
   });
 
-  it('keeps desired when smaller than remaining', () => {
+  it('keeps explicit hard cap when smaller than remaining', () => {
     const cap = computeCompletionBudgetCap({
-      budget: { desired: 1024 },
+      budget: { hardCap: 1024 },
       capability: makeCapability(100000),
       messages: makeMessages(1000),
     });
@@ -137,7 +147,7 @@ describe('computeCompletionBudgetCap', () => {
     const safetyMargin = 1024;
     const systemPrompt = 'a'.repeat(2000 * 4); // ~2000 tokens
     const cap = computeCompletionBudgetCap({
-      budget: { desired: 32000, safetyMargin },
+      budget: { fallback: 32000, safetyMargin },
       capability: makeCapability(maxCtx),
       messages: makeMessages(1000),
       systemPrompt,
@@ -155,13 +165,13 @@ describe('computeCompletionBudgetCap', () => {
       makeTool('tool_b', 4000),
     ];
     const capWithTools = computeCompletionBudgetCap({
-      budget: { desired: 32000, safetyMargin },
+      budget: { fallback: 32000, safetyMargin },
       capability: makeCapability(maxCtx),
       messages: makeMessages(1000),
       tools,
     });
     const capWithoutTools = computeCompletionBudgetCap({
-      budget: { desired: 32000, safetyMargin },
+      budget: { fallback: 32000, safetyMargin },
       capability: makeCapability(maxCtx),
       messages: makeMessages(1000),
     });
@@ -210,7 +220,7 @@ describe('applyCompletionBudget', () => {
     const opaque = rest as unknown as ChatProvider;
     const result = applyCompletionBudget({
       provider: opaque,
-      budget: { desired: 8192 },
+      budget: { hardCap: 8192 },
       capability: makeCapability(10000),
       messages: makeMessages(100),
     });
@@ -220,7 +230,7 @@ describe('applyCompletionBudget', () => {
   it('clones the provider with the clamped cap when budget is configured', () => {
     const result = applyCompletionBudget({
       provider: original,
-      budget: { desired: 32000 },
+      budget: { fallback: 32000 },
       capability: makeCapability(10000),
       messages: makeMessages(1000),
     });
@@ -236,7 +246,7 @@ describe('applyCompletionBudget', () => {
     const systemPrompt = 'a'.repeat(4000); // ~1000 tokens
     applyCompletionBudget({
       provider: original,
-      budget: { desired: 32000 },
+      budget: { fallback: 32000 },
       capability: makeCapability(10000),
       messages: makeMessages(1000),
       systemPrompt,
@@ -246,7 +256,7 @@ describe('applyCompletionBudget', () => {
     withMaxCompletionTokens.mockClear();
     applyCompletionBudget({
       provider: original,
-      budget: { desired: 32000 },
+      budget: { fallback: 32000 },
       capability: makeCapability(10000),
       messages: makeMessages(1000),
     });
@@ -264,7 +274,7 @@ describe('resolveCompletionBudget', () => {
         KIMI_MODEL_MAX_TOKENS: '2048',
       },
     });
-    expect(budget?.desired).toBe(4096);
+    expect(budget?.hardCap).toBe(4096);
   });
 
   it('falls back to legacy KIMI_MODEL_MAX_TOKENS when the new var is unset', () => {
@@ -272,20 +282,22 @@ describe('resolveCompletionBudget', () => {
       reservedContextSize: 1000,
       env: { KIMI_MODEL_MAX_TOKENS: '2048' },
     });
-    expect(budget?.desired).toBe(2048);
+    expect(budget?.hardCap).toBe(2048);
   });
 
-  it('uses reservedContextSize when no env var is set', () => {
+  it('uses reservedContextSize as the unknown-context fallback when no env var is set', () => {
     const budget = resolveCompletionBudget({
       reservedContextSize: 12345,
       env: {},
     });
-    expect(budget?.desired).toBe(12345);
+    expect(budget?.hardCap).toBeUndefined();
+    expect(budget?.fallback).toBe(12345);
   });
 
-  it('falls back to the historical default 32000 when nothing is configured', () => {
+  it('falls back to 32000 only for unknown context when nothing is configured', () => {
     const budget = resolveCompletionBudget({ env: {} });
-    expect(budget?.desired).toBe(32000);
+    expect(budget?.hardCap).toBeUndefined();
+    expect(budget?.fallback).toBe(32000);
   });
 
   it('ignores reservedContextSize when it is 0', () => {
@@ -293,7 +305,8 @@ describe('resolveCompletionBudget', () => {
       reservedContextSize: 0,
       env: {},
     });
-    expect(budget?.desired).toBe(32000);
+    expect(budget?.hardCap).toBeUndefined();
+    expect(budget?.fallback).toBe(32000);
   });
 
   it('treats non-positive KIMI_MODEL_MAX_COMPLETION_TOKENS as an opt-out', () => {
@@ -327,13 +340,14 @@ describe('resolveCompletionBudget', () => {
         KIMI_MODEL_MAX_TOKENS: '-1',
       },
     });
-    expect(budget?.desired).toBe(4096);
+    expect(budget?.hardCap).toBe(4096);
   });
 
   it('falls back to defaults when the env var is non-numeric garbage', () => {
     const budget = resolveCompletionBudget({
       env: { KIMI_MODEL_MAX_COMPLETION_TOKENS: 'not-a-number' },
     });
-    expect(budget?.desired).toBe(32000);
+    expect(budget?.hardCap).toBeUndefined();
+    expect(budget?.fallback).toBe(32000);
   });
 });