diff --git a/.changeset/widen-kimi-completion-budget.md b/.changeset/widen-kimi-completion-budget.md new file mode 100644 index 0000000..6aa2238 --- /dev/null +++ b/.changeset/widen-kimi-completion-budget.md @@ -0,0 +1,6 @@ +--- +"@moonshot-ai/agent-core": patch +"@moonshot-ai/kimi-code": patch +--- + +Let Kimi requests use the remaining context window for completion tokens by default while keeping explicit environment limits as hard caps. diff --git a/docs/en/configuration/env-vars.md b/docs/en/configuration/env-vars.md index 78a5959..5e00443 100644 --- a/docs/en/configuration/env-vars.md +++ b/docs/en/configuration/env-vars.md @@ -74,7 +74,7 @@ When neither `KIMI_CODE_OAUTH_HOST` nor `KIMI_OAUTH_HOST` is set, the OAuth auth | `KIMI_DISABLE_TELEMETRY` | Disable telemetry reporting | `1`, `true`, `t`, `yes`, `y` (case-insensitive) | | `KIMI_CODE_BACKGROUND_KEEP_ALIVE_ON_EXIT` | Override `[background].keep_alive_on_exit`, controlling whether still-running background tasks are kept when the session closes | True values: `1`, `true`, `yes`, `on`; false values: `0`, `false`, `no`, `off`; when unset, reads `config.toml`, then falls back to `true` | | `KIMI_SHELL_PATH` | Override the absolute path to Git Bash (`bash.exe`) on Windows; only needed when auto-detection fails on Windows | None | -| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | Desired budget for `max_completion_tokens` in a single-step LLM request (the actual value is further clamped by the context window and input size); set to `0` or a negative value to disable clamping entirely. **Currently effective only for providers of type `kimi`**; for Anthropic and other providers, use `[models.].max_output_size` instead (see [Config files](./config-files.md#models)) | Defaults to 32000, influenced by `loop_control.reserved_context_size` | +| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | Explicit hard cap for `max_completion_tokens` in a single-step LLM request. When unset, Kimi Code uses the safe remaining context window for models with a known context size. Set to `0` or a negative value to disable clamping entirely. **Currently effective only for providers of type `kimi`**; for Anthropic and other providers, use `[models.].max_output_size` instead (see [Config files](./config-files.md#models)) | Unset: computed from remaining context; unknown context falls back to `loop_control.reserved_context_size`, then 32000 | For example, to disable telemetry on a shared host: diff --git a/docs/zh/configuration/env-vars.md b/docs/zh/configuration/env-vars.md index 7116ab4..e413cbe 100644 --- a/docs/zh/configuration/env-vars.md +++ b/docs/zh/configuration/env-vars.md @@ -74,7 +74,7 @@ OAuth 流程默认连接 Kimi 官方的认证与托管端点,下列变量可 | `KIMI_DISABLE_TELEMETRY` | 关闭遥测上报 | `1`、`true`、`t`、`yes`、`y`(不区分大小写) | | `KIMI_CODE_BACKGROUND_KEEP_ALIVE_ON_EXIT` | 覆盖 `[background].keep_alive_on_exit`,控制会话关闭时是否保留仍在运行的后台任务 | 真值:`1`、`true`、`yes`、`on`;假值:`0`、`false`、`no`、`off`;未设置时读取 `config.toml`,再回退到 `true` | | `KIMI_SHELL_PATH` | 覆盖 Windows 上 Git Bash (`bash.exe`) 的绝对路径,仅在 Windows 自动探测失败时需要 | 无 | -| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | 单步 LLM 请求 `max_completion_tokens` 的期望预算(实际值按上下文窗口与输入大小再做 clamp);设为 `0` 或负数则完全禁用 clamp。**目前只对 `kimi` 类型的供应商生效**;Anthropic 等其它供应商请改用 `[models.].max_output_size`(详见 [配置文件](./config-files.md#models)) | 默认 32000,受 `loop_control.reserved_context_size` 影响 | +| `KIMI_MODEL_MAX_COMPLETION_TOKENS` | 单步 LLM 请求 `max_completion_tokens` 的显式硬上限。未设置时,对于已知上下文窗口的模型,Kimi Code 会使用安全的剩余上下文窗口;设为 `0` 或负数则完全禁用 clamp。**目前只对 `kimi` 类型的供应商生效**;Anthropic 等其它供应商请改用 `[models.].max_output_size`(详见 [配置文件](./config-files.md#models)) | 未设置:按剩余上下文计算;未知上下文窗口时回退到 `loop_control.reserved_context_size`,再回退到 32000 | 例如在共享主机上禁用遥测: diff --git a/packages/agent-core/src/agent/turn/index.ts b/packages/agent-core/src/agent/turn/index.ts index 1b14b2a..8aecd7b 100644 --- a/packages/agent-core/src/agent/turn/index.ts +++ b/packages/agent-core/src/agent/turn/index.ts @@ -366,7 +366,7 @@ export class TurnFlow { const model = this.agent.config.model; const provider = this.agent.config.provider.withThinking(this.agent.config.thinkingLevel); const loopControl = this.agent.providerManager?.config.loopControl; - const completionBudget = resolveCompletionBudget({ + const completionBudgetConfig = resolveCompletionBudget({ reservedContextSize: loopControl?.reservedContextSize, }); @@ -380,7 +380,7 @@ export class TurnFlow { systemPrompt: this.agent.config.systemPrompt, capability: this.agent.config.modelCapabilities, generate: this.agent.generate, - completionBudget, + completionBudgetConfig, }), buildMessages: () => this.agent.context.messages, dispatchEvent: this.buildDispatchEvent(turnId), diff --git a/packages/agent-core/src/agent/turn/kosong-llm.ts b/packages/agent-core/src/agent/turn/kosong-llm.ts index 834fa47..d61eaa2 100644 --- a/packages/agent-core/src/agent/turn/kosong-llm.ts +++ b/packages/agent-core/src/agent/turn/kosong-llm.ts @@ -32,7 +32,7 @@ import { import type { LLM, LLMChatParams, LLMChatResponse, LLMRequestLogContext } from '../../loop'; import { applyCompletionBudget, - type CompletionBudget, + type CompletionBudgetConfig, } from '../../utils/completion-budget'; export const GENERATE_REQUEST_LOG_CONTEXT = '__kimiRequestLogContext'; @@ -56,12 +56,10 @@ export interface KosongLLMConfig { */ readonly generate?: GenerateFn | undefined; /** - * Per-request completion-token budget. When set, each `chat()` call - * clones the configured provider with a clamped `max_completion_tokens` - * derived from the current input size and model context window. The - * clone is local to the call and never replaces `this.provider`. + * Completion budget config resolved from agent/provider settings. The + * final cap is computed per request from the current messages and tools. */ - readonly completionBudget?: CompletionBudget | undefined; + readonly completionBudgetConfig?: CompletionBudgetConfig | undefined; } export class KosongLLM implements LLM { @@ -71,7 +69,7 @@ export class KosongLLM implements LLM { private readonly provider: ChatProvider; private readonly generate: GenerateFn; - private readonly completionBudget: CompletionBudget | undefined; + private readonly completionBudgetConfig: CompletionBudgetConfig | undefined; constructor(config: KosongLLMConfig) { this.provider = config.provider; @@ -79,7 +77,7 @@ export class KosongLLM implements LLM { this.systemPrompt = config.systemPrompt; this.capability = config.capability; this.generate = config.generate ?? kosongGenerate; - this.completionBudget = config.completionBudget; + this.completionBudgetConfig = config.completionBudgetConfig; } async chat(params: LLMChatParams): Promise { @@ -98,7 +96,7 @@ export class KosongLLM implements LLM { // context can still slip past the limit. const effectiveProvider = applyCompletionBudget({ provider: this.provider, - budget: this.completionBudget, + budget: this.completionBudgetConfig, capability: this.capability, messages: params.messages, systemPrompt: this.systemPrompt, diff --git a/packages/agent-core/src/utils/completion-budget.ts b/packages/agent-core/src/utils/completion-budget.ts index abcf027..361d654 100644 --- a/packages/agent-core/src/utils/completion-budget.ts +++ b/packages/agent-core/src/utils/completion-budget.ts @@ -11,61 +11,41 @@ import { estimateTokensForTools, } from './tokens'; -/** - * Desired completion-token budget for the next LLM step. - * - * The budget is a request, not a guarantee: it is clamped against the - * current input size and the model's context window before being applied - * to the provider. This avoids two failure modes for Kimi reasoning - * models: - * 1. A small cap can return HTTP 200 with empty `content` because the - * whole budget was spent on `reasoning_content`. - * 2. A large cap may exceed the remaining context window and trigger - * `Invalid request: Your request exceeded model token limit`. - */ -export interface CompletionBudget { - /** Desired completion budget when the model context window allows it. */ - readonly desired: number; - /** - * Safety margin reserved between current input and the context limit, - * to absorb tokenizer estimation error and provider-side overhead. - */ - readonly safetyMargin?: number | undefined; +/** Completion-token budget for the next LLM request. */ +export interface CompletionBudgetConfig { + /** Explicit user-configured maximum. */ + readonly hardCap?: number; + /** Conservative cap for providers/models whose context window is unknown. */ + readonly fallback?: number; + /** Tokens kept out of the output budget to absorb estimation drift. */ + readonly safetyMargin?: number; } const MIN_FLOOR = 1; const DEFAULT_SAFETY_MARGIN = 1024; -const DEFAULT_DESIRED_BUDGET = 32000; +const DEFAULT_UNKNOWN_CONTEXT_FALLBACK = 32000; /** - * Resolve the completion budget for a turn from configuration and Kimi - * environment variables. - * - * Priority (first wins): `KIMI_MODEL_MAX_COMPLETION_TOKENS`, - * `KIMI_MODEL_MAX_TOKENS` (legacy alias), `reservedContextSize`, - * `DEFAULT_DESIRED_BUDGET` (32000, preserves pre-PR-2332 behavior). - * - * Operators can opt out of clamping entirely by setting the env var to - * `0` or a negative integer; in that case this function returns - * `undefined`, which `applyCompletionBudget` treats as a no-op. + * Resolve configured completion budget. Env values are explicit hard caps; + * non-positive env values disable clamping. */ export function resolveCompletionBudget(args: { - readonly reservedContextSize?: number | undefined; - readonly env?: NodeJS.ProcessEnv | undefined; -}): CompletionBudget | undefined { + readonly reservedContextSize?: number; + readonly env?: NodeJS.ProcessEnv; +}): CompletionBudgetConfig | undefined { const env = args.env ?? process.env; const fromNew = parseEnvBudget(env['KIMI_MODEL_MAX_COMPLETION_TOKENS']); if (fromNew !== 'absent') { - return fromNew === 'disabled' ? undefined : { desired: fromNew }; + return fromNew === 'disabled' ? undefined : { hardCap: fromNew }; } const fromLegacy = parseEnvBudget(env['KIMI_MODEL_MAX_TOKENS']); if (fromLegacy !== 'absent') { - return fromLegacy === 'disabled' ? undefined : { desired: fromLegacy }; + return fromLegacy === 'disabled' ? undefined : { hardCap: fromLegacy }; } if (args.reservedContextSize !== undefined && args.reservedContextSize > 0) { - return { desired: args.reservedContextSize }; + return { fallback: args.reservedContextSize }; } - return { desired: DEFAULT_DESIRED_BUDGET }; + return { fallback: DEFAULT_UNKNOWN_CONTEXT_FALLBACK }; } type EnvBudget = number | 'disabled' | 'absent'; @@ -79,37 +59,23 @@ function parseEnvBudget(raw: string | undefined): EnvBudget { } /** - * Compute the effective `max_completion_tokens` cap for the next request. - * - * cap = clamp(desired, MIN_FLOOR, max_context_tokens - input - safetyMargin) - * - * `input` accounts for everything the provider will actually serialize: - * the conversation history, the system prompt, and the tool schemas. - * Counting only `messages` underestimates by enough to push a near-limit - * request past the model context window. - * - * When the model context size is unknown, the desired value is returned - * unchanged (floored at `MIN_FLOOR`). - * - * When the remaining window is non-positive (input already at or above - * the limit), `MIN_FLOOR` is returned — we can't honor a meaningful cap - * and the API will surface the overflow on its own. - * - * Note: the floor never exceeds `remaining`, so a near-full context - * cannot be pushed past the limit by `MIN_FLOOR` itself. + * Compute the effective `max_completion_tokens` cap. Known-context requests + * use the remaining window unless a hard cap is configured. */ export function computeCompletionBudgetCap(args: { - readonly budget: CompletionBudget; + readonly budget: CompletionBudgetConfig; readonly capability: ModelCapability | undefined; readonly messages: readonly Message[]; - readonly systemPrompt?: string | undefined; - readonly tools?: readonly Tool[] | undefined; + readonly systemPrompt?: string; + readonly tools?: readonly Tool[]; }): number { - const desired = args.budget.desired; const safetyMargin = args.budget.safetyMargin ?? DEFAULT_SAFETY_MARGIN; const maxCtx = args.capability?.max_context_tokens ?? 0; if (maxCtx <= 0) { - return Math.max(MIN_FLOOR, desired); + return Math.max( + MIN_FLOOR, + args.budget.hardCap ?? args.budget.fallback ?? DEFAULT_UNKNOWN_CONTEXT_FALLBACK, + ); } const input = estimateTokensForMessages([...args.messages]) + @@ -119,7 +85,10 @@ export function computeCompletionBudgetCap(args: { if (remaining <= 0) { return MIN_FLOOR; } - return Math.max(MIN_FLOOR, Math.min(desired, remaining)); + if (args.budget.hardCap === undefined) { + return Math.max(MIN_FLOOR, remaining); + } + return Math.max(MIN_FLOOR, Math.min(args.budget.hardCap, remaining)); } /** @@ -134,11 +103,11 @@ export function computeCompletionBudgetCap(args: { */ export function applyCompletionBudget(args: { readonly provider: ChatProvider; - readonly budget: CompletionBudget | undefined; + readonly budget: CompletionBudgetConfig | undefined; readonly capability: ModelCapability | undefined; readonly messages: readonly Message[]; - readonly systemPrompt?: string | undefined; - readonly tools?: readonly Tool[] | undefined; + readonly systemPrompt?: string; + readonly tools?: readonly Tool[]; }): ChatProvider { if (args.budget === undefined) return args.provider; if (args.provider.withMaxCompletionTokens === undefined) return args.provider; diff --git a/packages/agent-core/test/utils/completion-budget.test.ts b/packages/agent-core/test/utils/completion-budget.test.ts index fe84a3b..77767c6 100644 --- a/packages/agent-core/test/utils/completion-budget.test.ts +++ b/packages/agent-core/test/utils/completion-budget.test.ts @@ -45,45 +45,55 @@ function makeTool(name: string, asciiCharsInDescription: number): Tool { } describe('computeCompletionBudgetCap', () => { - it('returns desired when context size is unknown', () => { + it('uses fallback when context size is unknown and no hard cap is set', () => { const cap = computeCompletionBudgetCap({ - budget: { desired: 8192 }, + budget: { fallback: 8192 }, capability: undefined, messages: makeMessages(100), }); expect(cap).toBe(8192); }); - it('preserves a small desired when context size is unknown — no artificial floor', () => { + it('uses an explicit hard cap when context size is unknown', () => { const cap = computeCompletionBudgetCap({ - budget: { desired: 10 }, + budget: { hardCap: 10, fallback: 8192 }, capability: makeCapability(0), messages: makeMessages(100), }); expect(cap).toBe(10); }); - it('floors at 1 when desired is zero or negative', () => { + it('floors at 1 when hard cap is zero or negative', () => { expect( computeCompletionBudgetCap({ - budget: { desired: 0 }, + budget: { hardCap: 0 }, capability: undefined, messages: makeMessages(10), }), ).toBe(1); expect( computeCompletionBudgetCap({ - budget: { desired: -100 }, + budget: { hardCap: -100 }, capability: undefined, messages: makeMessages(10), }), ).toBe(1); }); - it('clamps desired down to the remaining context window', () => { + it('uses the remaining context window when no hard cap is set', () => { + const maxCtx = 100000; + const cap = computeCompletionBudgetCap({ + budget: { fallback: 32000 }, + capability: makeCapability(maxCtx), + messages: makeMessages(1000), + }); + expect(cap).toBe(maxCtx - 1001 - 1024); + }); + + it('clamps explicit hard cap down to the remaining context window', () => { // max_context_tokens 10000, input ~ 1000, safetyMargin 1024 → remaining ~ 7976 const cap = computeCompletionBudgetCap({ - budget: { desired: 32000 }, + budget: { hardCap: 32000 }, capability: makeCapability(10000), messages: makeMessages(1000), }); @@ -93,7 +103,7 @@ describe('computeCompletionBudgetCap', () => { it('returns 1 when input already exceeds context minus margin', () => { const cap = computeCompletionBudgetCap({ - budget: { desired: 32000 }, + budget: { fallback: 32000 }, capability: makeCapability(10000), messages: makeMessages(11000), }); @@ -105,7 +115,7 @@ describe('computeCompletionBudgetCap', () => { // The cap MUST stay <= remaining so the request does not overflow. const maxCtx = 10000; const cap = computeCompletionBudgetCap({ - budget: { desired: 32000 }, + budget: { fallback: 32000 }, capability: makeCapability(maxCtx), messages: makeMessages(8900), }); @@ -115,7 +125,7 @@ describe('computeCompletionBudgetCap', () => { it('respects custom safetyMargin', () => { const cap = computeCompletionBudgetCap({ - budget: { desired: 32000, safetyMargin: 4096 }, + budget: { fallback: 32000, safetyMargin: 4096 }, capability: makeCapability(20000), messages: makeMessages(1000), }); @@ -123,9 +133,9 @@ describe('computeCompletionBudgetCap', () => { expect(cap).toBe(14903); }); - it('keeps desired when smaller than remaining', () => { + it('keeps explicit hard cap when smaller than remaining', () => { const cap = computeCompletionBudgetCap({ - budget: { desired: 1024 }, + budget: { hardCap: 1024 }, capability: makeCapability(100000), messages: makeMessages(1000), }); @@ -137,7 +147,7 @@ describe('computeCompletionBudgetCap', () => { const safetyMargin = 1024; const systemPrompt = 'a'.repeat(2000 * 4); // ~2000 tokens const cap = computeCompletionBudgetCap({ - budget: { desired: 32000, safetyMargin }, + budget: { fallback: 32000, safetyMargin }, capability: makeCapability(maxCtx), messages: makeMessages(1000), systemPrompt, @@ -155,13 +165,13 @@ describe('computeCompletionBudgetCap', () => { makeTool('tool_b', 4000), ]; const capWithTools = computeCompletionBudgetCap({ - budget: { desired: 32000, safetyMargin }, + budget: { fallback: 32000, safetyMargin }, capability: makeCapability(maxCtx), messages: makeMessages(1000), tools, }); const capWithoutTools = computeCompletionBudgetCap({ - budget: { desired: 32000, safetyMargin }, + budget: { fallback: 32000, safetyMargin }, capability: makeCapability(maxCtx), messages: makeMessages(1000), }); @@ -210,7 +220,7 @@ describe('applyCompletionBudget', () => { const opaque = rest as unknown as ChatProvider; const result = applyCompletionBudget({ provider: opaque, - budget: { desired: 8192 }, + budget: { hardCap: 8192 }, capability: makeCapability(10000), messages: makeMessages(100), }); @@ -220,7 +230,7 @@ describe('applyCompletionBudget', () => { it('clones the provider with the clamped cap when budget is configured', () => { const result = applyCompletionBudget({ provider: original, - budget: { desired: 32000 }, + budget: { fallback: 32000 }, capability: makeCapability(10000), messages: makeMessages(1000), }); @@ -236,7 +246,7 @@ describe('applyCompletionBudget', () => { const systemPrompt = 'a'.repeat(4000); // ~1000 tokens applyCompletionBudget({ provider: original, - budget: { desired: 32000 }, + budget: { fallback: 32000 }, capability: makeCapability(10000), messages: makeMessages(1000), systemPrompt, @@ -246,7 +256,7 @@ describe('applyCompletionBudget', () => { withMaxCompletionTokens.mockClear(); applyCompletionBudget({ provider: original, - budget: { desired: 32000 }, + budget: { fallback: 32000 }, capability: makeCapability(10000), messages: makeMessages(1000), }); @@ -264,7 +274,7 @@ describe('resolveCompletionBudget', () => { KIMI_MODEL_MAX_TOKENS: '2048', }, }); - expect(budget?.desired).toBe(4096); + expect(budget?.hardCap).toBe(4096); }); it('falls back to legacy KIMI_MODEL_MAX_TOKENS when the new var is unset', () => { @@ -272,20 +282,22 @@ describe('resolveCompletionBudget', () => { reservedContextSize: 1000, env: { KIMI_MODEL_MAX_TOKENS: '2048' }, }); - expect(budget?.desired).toBe(2048); + expect(budget?.hardCap).toBe(2048); }); - it('uses reservedContextSize when no env var is set', () => { + it('uses reservedContextSize as the unknown-context fallback when no env var is set', () => { const budget = resolveCompletionBudget({ reservedContextSize: 12345, env: {}, }); - expect(budget?.desired).toBe(12345); + expect(budget?.hardCap).toBeUndefined(); + expect(budget?.fallback).toBe(12345); }); - it('falls back to the historical default 32000 when nothing is configured', () => { + it('falls back to 32000 only for unknown context when nothing is configured', () => { const budget = resolveCompletionBudget({ env: {} }); - expect(budget?.desired).toBe(32000); + expect(budget?.hardCap).toBeUndefined(); + expect(budget?.fallback).toBe(32000); }); it('ignores reservedContextSize when it is 0', () => { @@ -293,7 +305,8 @@ describe('resolveCompletionBudget', () => { reservedContextSize: 0, env: {}, }); - expect(budget?.desired).toBe(32000); + expect(budget?.hardCap).toBeUndefined(); + expect(budget?.fallback).toBe(32000); }); it('treats non-positive KIMI_MODEL_MAX_COMPLETION_TOKENS as an opt-out', () => { @@ -327,13 +340,14 @@ describe('resolveCompletionBudget', () => { KIMI_MODEL_MAX_TOKENS: '-1', }, }); - expect(budget?.desired).toBe(4096); + expect(budget?.hardCap).toBe(4096); }); it('falls back to defaults when the env var is non-numeric garbage', () => { const budget = resolveCompletionBudget({ env: { KIMI_MODEL_MAX_COMPLETION_TOKENS: 'not-a-number' }, }); - expect(budget?.desired).toBe(32000); + expect(budget?.hardCap).toBeUndefined(); + expect(budget?.fallback).toBe(32000); }); });