diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 6821b49..53c6b1d 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -6,41 +6,41 @@ This document explains the architecture of opencode-codebase-index, including da ``` ┌─────────────────────────────────────────────────────────────────────────────┐ -│ OpenCode Agent │ +│ OpenCode Agent │ │ │ -│ Tools: codebase_search, index_codebase, index_status, index_health_check │ +│ Tools: codebase_search, index_codebase, index_status, index_health_check │ │ Commands: /search, /find, /index, /status │ └─────────────────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────────────────┐ -│ TypeScript Layer │ +│ TypeScript Layer │ │ │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ -│ │ Indexer │ │ Embeddings │ │ Watcher │ │ Git │ │ -│ │ │ │ Provider │ │ │ │ Detector │ │ -│ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Indexer │ │ Embeddings │ │ Watcher │ │ Git │ │ +│ │ │ │ Provider │ │ │ │ Detector │ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ │ └─────────────────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────────────────┐ │ Rust Native Module (NAPI) │ │ │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ -│ │ Tree-sitter │ │ usearch │ │ SQLite │ │ BM25 │ │ -│ │ Parser │ │ Vectors │ │ Database │ │ Inverted Idx │ │ -│ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Tree-sitter │ │ usearch │ │ SQLite │ │ BM25 │ │ +│ │ Parser │ │ Vectors │ │ Database │ │ Inverted Idx │ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ │ 
└─────────────────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────────────────┐ -│ Storage Layer │ +│ Storage Layer │ │ │ │ .opencode/index/ │ -│ ├── codebase.db # SQLite: embeddings, chunks, branch catalog │ -│ ├── vectors.usearch # Vector index (uSearch) │ -│ ├── inverted-index.json # BM25 keyword index │ -│ └── file-hashes.json # File change detection │ +│ ├── codebase.db # SQLite: embeddings, chunks, branch catalog │ +│ ├── vectors.usearch # Vector index (uSearch) │ +│ ├── inverted-index.json # BM25 keyword index │ +│ └── file-hashes.json # File change detection │ └─────────────────────────────────────────────────────────────────────────────┘ ``` diff --git a/package-lock.json b/package-lock.json index 684aa98..55cde0f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "opencode-codebase-index", - "version": "0.3.0", + "version": "0.4.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "opencode-codebase-index", - "version": "0.3.0", + "version": "0.4.1", "license": "MIT", "dependencies": { "chokidar": "^5.0.0", @@ -2161,7 +2161,6 @@ "integrity": "sha512-DhGl4xMVFGVIyMwswXeyzdL4uXD5OGILGX5N8Y+f6W7LhC1Ze2poSNrkF/fedpVDHEEZ+PHFW0vL14I+mm8K3Q==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@octokit/auth-token": "^6.0.0", "@octokit/graphql": "^9.0.3", @@ -2746,7 +2745,6 @@ "integrity": "sha512-powIePYMmC3ibL0UJ2i2s0WIbq6cg6UyVFQxSCpaPxxzAaziRfimGivjdF943sSGV6RADVbk0Nvlm5P/FB44Zg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "undici-types": "~7.16.0" } @@ -2786,7 +2784,6 @@ "integrity": "sha512-npiaib8XzbjtzS2N4HlqPvlpxpmZ14FjSJrteZpPxGUaYPlvhzlzUZ4mZyABo0EFrOWnvyd0Xxroq//hKhtAWg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "8.53.0", "@typescript-eslint/types": "8.53.0", @@ -3146,7 +3143,6 @@ "integrity": 
"sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", - "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -3530,7 +3526,6 @@ "dev": true, "hasInstallScript": true, "license": "MIT", - "peer": true, "bin": { "esbuild": "bin/esbuild" }, @@ -3585,7 +3580,6 @@ "integrity": "sha512-LEyamqS7W5HB3ujJyvi0HQK/dtVINZvd5mAAp9eT5S/ujByGjiZLCzPcHVzuXbpJDJF/cxwHlfceVUDZ2lnSTw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.1", @@ -4526,7 +4520,6 @@ } ], "license": "MIT", - "peer": true, "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", @@ -4931,7 +4924,6 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, @@ -5109,7 +5101,6 @@ "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "dev": true, "license": "Apache-2.0", - "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -5179,7 +5170,6 @@ "integrity": "sha512-w+N7Hifpc3gRjZ63vYBXA56dvvRlNWRczTdmCBBa+CotUzAPf5b7YMdMR/8CQoeYE5LX3W4wj6RYTgonm1b9DA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "esbuild": "^0.27.0", "fdir": "^6.5.0", @@ -5273,7 +5263,6 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, diff --git a/src/config/constants.ts b/src/config/constants.ts new file mode 100644 index 0000000..b4453b0 --- /dev/null +++ b/src/config/constants.ts @@ -0,0 +1,103 @@ +export const DEFAULT_INCLUDE = [ + "**/*.{ts,tsx,js,jsx,mjs,cjs}", + "**/*.{py,pyi}", + "**/*.{go,rs,java,kt,scala}", + "**/*.{c,cpp,cc,h,hpp}", + "**/*.{rb,php,swift}", + "**/*.{vue,svelte,astro}", + 
"**/*.{sql,graphql,proto}", + "**/*.{yaml,yml,toml}", + "**/*.{md,mdx}", + "**/*.{sh,bash,zsh}", +]; + +export const DEFAULT_EXCLUDE = [ + "**/node_modules/**", + "**/.git/**", + "**/dist/**", + "**/build/**", + "**/*.min.js", + "**/*.bundle.js", + "**/vendor/**", + "**/__pycache__/**", + "**/target/**", + "**/coverage/**", + "**/.next/**", + "**/.nuxt/**", + "**/.opencode/**", +]; + + +export const EMBEDDING_MODELS = { + "google": { + // `text-embedding-004` is DEPRECATED - https://ai.google.dev/gemini-api/docs/deprecations + "text-embedding-005": { + provider: "google", + model: "text-embedding-005", + dimensions: 768, + maxTokens: 2048, + costPer1MTokens: 0.025, + taskAble: false, + // Note: in reality, this model allows for task-specific embeddings. See: https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/task-types + }, + "gemini-embedding-001": { + provider: "google", + model: "gemini-embedding-001", + // Native output is 3072D, but we use Matryoshka truncation via outputDimensionality + // to reduce to 1536D for better storage/search efficiency with minimal quality loss. + // Google recommends 768, 1536, or 3072. 
See: https://ai.google.dev/gemini-api/docs/embeddings + dimensions: 1536, + maxTokens: 2048, + costPer1MTokens: 0.15, + taskAble: true, + }, + }, + "openai": { + "text-embedding-3-small": { + provider: "openai", + model: "text-embedding-3-small", + dimensions: 1536, + maxTokens: 8191, + costPer1MTokens: 0.02, + }, + "text-embedding-3-large": { + provider: "openai", + model: "text-embedding-3-large", + dimensions: 3072, + maxTokens: 8191, + costPer1MTokens: 0.13, + }, + }, + "ollama": { + "nomic-embed-text": { + provider: "ollama", + model: "nomic-embed-text", + dimensions: 768, + maxTokens: 8192, + costPer1MTokens: 0.00, + }, + "mxbai-embed-large": { + provider: "ollama", + model: "mxbai-embed-large", + dimensions: 1024, + maxTokens: 512, + costPer1MTokens: 0.00, + }, + }, + "github-copilot": { + "text-embedding-3-small": { + provider: "github-copilot", + model: "text-embedding-3-small", + dimensions: 1536, + maxTokens: 8191, + costPer1MTokens: 0.00, + }, + }, +} as const; + +export const DEFAULT_PROVIDER_MODELS = { + "github-copilot": "text-embedding-3-small", + "openai": "text-embedding-3-small", + "google": "text-embedding-005", + "ollama": "nomic-embed-text", +} as const diff --git a/src/config/index.ts b/src/config/index.ts index 20cdc26..2b5945c 100644 --- a/src/config/index.ts +++ b/src/config/index.ts @@ -1 +1,2 @@ export * from "./schema.js"; +export * from "./constants.js"; diff --git a/src/config/schema.ts b/src/config/schema.ts index c504461..105e32c 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -1,6 +1,6 @@ // Config schema without zod dependency to avoid version conflicts with OpenCode SDK -export type EmbeddingProvider = "auto" | "github-copilot" | "openai" | "google" | "ollama"; +import { DEFAULT_INCLUDE, DEFAULT_EXCLUDE, EMBEDDING_MODELS, DEFAULT_PROVIDER_MODELS } from "./constants.js"; export type IndexScope = "project" | "global"; @@ -46,8 +46,8 @@ export interface DebugConfig { } export interface CodebaseIndexConfig { - 
embeddingProvider: EmbeddingProvider; - embeddingModel: string; + embeddingProvider: EmbeddingProvider | 'auto'; + embeddingModel?: EmbeddingModelName; scope: IndexScope; indexing?: Partial; search?: Partial; @@ -62,35 +62,6 @@ export type ParsedCodebaseIndexConfig = CodebaseIndexConfig & { debug: DebugConfig; }; -const DEFAULT_INCLUDE = [ - "**/*.{ts,tsx,js,jsx,mjs,cjs}", - "**/*.{py,pyi}", - "**/*.{go,rs,java,kt,scala}", - "**/*.{c,cpp,cc,h,hpp}", - "**/*.{rb,php,swift}", - "**/*.{vue,svelte,astro}", - "**/*.{sql,graphql,proto}", - "**/*.{yaml,yml,toml}", - "**/*.{md,mdx}", - "**/*.{sh,bash,zsh}", -]; - -const DEFAULT_EXCLUDE = [ - "**/node_modules/**", - "**/.git/**", - "**/dist/**", - "**/build/**", - "**/*.min.js", - "**/*.bundle.js", - "**/vendor/**", - "**/__pycache__/**", - "**/target/**", - "**/coverage/**", - "**/.next/**", - "**/.nuxt/**", - "**/.opencode/**", -]; - function getDefaultIndexingConfig(): IndexingConfig { return { autoIndex: false, @@ -130,12 +101,18 @@ function getDefaultDebugConfig(): DebugConfig { }; } -const VALID_PROVIDERS: EmbeddingProvider[] = ["auto", "github-copilot", "openai", "google", "ollama"]; const VALID_SCOPES: IndexScope[] = ["project", "global"]; const VALID_LOG_LEVELS: LogLevel[] = ["error", "warn", "info", "debug"]; function isValidProvider(value: unknown): value is EmbeddingProvider { - return typeof value === "string" && VALID_PROVIDERS.includes(value as EmbeddingProvider); + return typeof value === "string" && Object.keys(EMBEDDING_MODELS).includes(value); +} + +export function isValidModel

( + value: unknown, + provider: P +): value is ProviderModels[P] { + return typeof value === "string" && Object.keys(EMBEDDING_MODELS[provider]).includes(value); } function isValidScope(value: unknown): value is IndexScope { @@ -152,11 +129,11 @@ function isValidLogLevel(value: unknown): value is LogLevel { export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig { const input = (raw && typeof raw === "object" ? raw : {}) as Record; - + const defaultIndexing = getDefaultIndexingConfig(); const defaultSearch = getDefaultSearchConfig(); const defaultDebug = getDefaultDebugConfig(); - + const rawIndexing = (input.indexing && typeof input.indexing === "object" ? input.indexing : {}) as Record; const indexing: IndexingConfig = { autoIndex: typeof rawIndexing.autoIndex === "boolean" ? rawIndexing.autoIndex : defaultIndexing.autoIndex, @@ -171,7 +148,7 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig { gcOrphanThreshold: typeof rawIndexing.gcOrphanThreshold === "number" ? Math.max(0, rawIndexing.gcOrphanThreshold) : defaultIndexing.gcOrphanThreshold, requireProjectMarker: typeof rawIndexing.requireProjectMarker === "boolean" ? rawIndexing.requireProjectMarker : defaultIndexing.requireProjectMarker, }; - + const rawSearch = (input.search && typeof input.search === "object" ? input.search : {}) as Record; const search: SearchConfig = { maxResults: typeof rawSearch.maxResults === "number" ? rawSearch.maxResults : defaultSearch.maxResults, @@ -180,7 +157,7 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig { hybridWeight: typeof rawSearch.hybridWeight === "number" ? Math.min(1, Math.max(0, rawSearch.hybridWeight)) : defaultSearch.hybridWeight, contextLines: typeof rawSearch.contextLines === "number" ? Math.min(50, Math.max(0, rawSearch.contextLines)) : defaultSearch.contextLines, }; - + const rawDebug = (input.debug && typeof input.debug === "object" ? 
input.debug : {}) as Record; const debug: DebugConfig = { enabled: typeof rawDebug.enabled === "boolean" ? rawDebug.enabled : defaultDebug.enabled, @@ -192,10 +169,22 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig { logBranch: typeof rawDebug.logBranch === "boolean" ? rawDebug.logBranch : defaultDebug.logBranch, metrics: typeof rawDebug.metrics === "boolean" ? rawDebug.metrics : defaultDebug.metrics, }; + + let embeddingProvider: EmbeddingProvider | 'auto'; + let embeddingModel: EmbeddingModelName | undefined = undefined; + if (isValidProvider(input.embeddingProvider)) { + embeddingProvider = input.embeddingProvider; + if (input.embeddingModel) { + embeddingModel = isValidModel(input.embeddingModel, embeddingProvider) ? input.embeddingModel : DEFAULT_PROVIDER_MODELS[embeddingProvider]; + } + } else { + embeddingProvider = 'auto'; + } + return { - embeddingProvider: isValidProvider(input.embeddingProvider) ? input.embeddingProvider : "auto", - embeddingModel: typeof input.embeddingModel === "string" ? input.embeddingModel : "auto", + embeddingProvider, + embeddingModel, scope: isValidScope(input.scope) ? input.scope : "project", include: isStringArray(input.include) ? input.include : DEFAULT_INCLUDE, exclude: isStringArray(input.exclude) ? 
input.exclude : DEFAULT_EXCLUDE, @@ -205,80 +194,24 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig { }; } -export function getDefaultConfig(): CodebaseIndexConfig { - return { - embeddingProvider: "auto", - embeddingModel: "auto", - scope: "project", - include: DEFAULT_INCLUDE, - exclude: DEFAULT_EXCLUDE, - }; +export function getDefaultModelForProvider(provider: EmbeddingProvider): EmbeddingModelInfo { + const models = EMBEDDING_MODELS[provider] + const providerDefault = DEFAULT_PROVIDER_MODELS[provider] + return models[providerDefault as keyof typeof models] } -export interface EmbeddingModelInfo { - provider: EmbeddingProvider; - model: string; - dimensions: number; - maxTokens: number; - costPer1MTokens: number; +export type EmbeddingProvider = keyof typeof EMBEDDING_MODELS; + +export const availableProviders: EmbeddingProvider[] = Object.keys(EMBEDDING_MODELS) as EmbeddingProvider[] + +export type ProviderModels = { + [P in keyof typeof EMBEDDING_MODELS]: keyof (typeof EMBEDDING_MODELS)[P] } -export const EMBEDDING_MODELS: Record = { - "github-copilot/text-embedding-3-small": { - provider: "github-copilot", - model: "text-embedding-3-small", - dimensions: 1536, - maxTokens: 8191, - costPer1MTokens: 0.00, - }, - "openai/text-embedding-3-small": { - provider: "openai", - model: "text-embedding-3-small", - dimensions: 1536, - maxTokens: 8191, - costPer1MTokens: 0.02, - }, - "openai/text-embedding-3-large": { - provider: "openai", - model: "text-embedding-3-large", - dimensions: 3072, - maxTokens: 8191, - costPer1MTokens: 0.13, - }, - "google/text-embedding-004": { - provider: "google", - model: "text-embedding-004", - dimensions: 768, - maxTokens: 2048, - costPer1MTokens: 0.00, - }, - "ollama/nomic-embed-text": { - provider: "ollama", - model: "nomic-embed-text", - dimensions: 768, - maxTokens: 8192, - costPer1MTokens: 0.00, - }, - "ollama/mxbai-embed-large": { - provider: "ollama", - model: "mxbai-embed-large", - dimensions: 1024, - 
maxTokens: 512, - costPer1MTokens: 0.00, - }, -}; +export type EmbeddingModelName = ProviderModels[keyof ProviderModels] -export function getDefaultModelForProvider(provider: EmbeddingProvider): EmbeddingModelInfo { - switch (provider) { - case "github-copilot": - return EMBEDDING_MODELS["github-copilot/text-embedding-3-small"]; - case "openai": - return EMBEDDING_MODELS["openai/text-embedding-3-small"]; - case "google": - return EMBEDDING_MODELS["google/text-embedding-004"]; - case "ollama": - return EMBEDDING_MODELS["ollama/nomic-embed-text"]; - default: - return EMBEDDING_MODELS["github-copilot/text-embedding-3-small"]; - } +export type EmbeddingProviderModelInfo = { + [P in EmbeddingProvider]: (typeof EMBEDDING_MODELS)[P][keyof (typeof EMBEDDING_MODELS)[P]] } + +export type EmbeddingModelInfo = EmbeddingProviderModelInfo[EmbeddingProvider] diff --git a/src/embeddings/detector.ts b/src/embeddings/detector.ts index 1853a8c..5718674 100644 --- a/src/embeddings/detector.ts +++ b/src/embeddings/detector.ts @@ -1,4 +1,4 @@ -import { EmbeddingProvider, getDefaultModelForProvider, EmbeddingModelInfo } from "../config/schema.js"; +import { type EmbeddingProvider, getDefaultModelForProvider, isValidModel, availableProviders, EmbeddingModelName, EMBEDDING_MODELS } from "../config"; import { existsSync, readFileSync } from "fs"; import * as path from "path"; import * as os from "os"; @@ -12,18 +12,13 @@ export interface ProviderCredentials { tokenExpires?: number; } -export interface DetectedProvider { - provider: EmbeddingProvider; - credentials: ProviderCredentials; - modelInfo: EmbeddingModelInfo; -} - -const EMBEDDING_CAPABLE_PROVIDERS: EmbeddingProvider[] = [ - "github-copilot", - "openai", - "google", - "ollama", -]; +export type ConfiguredProviderInfo = { + [P in EmbeddingProvider]: { + provider: P; + credentials: ProviderCredentials; + modelInfo: (typeof EMBEDDING_MODELS)[P][keyof (typeof EMBEDDING_MODELS)[P]]; + } +}[EmbeddingProvider] interface OpenCodeAuthOAuth 
{ type: "oauth"; @@ -56,36 +51,49 @@ function loadOpenCodeAuth(): Record { return {}; } -export async function detectEmbeddingProvider( - preferredProvider?: EmbeddingProvider -): Promise { - if (preferredProvider && preferredProvider !== "auto") { - const credentials = await getProviderCredentials(preferredProvider); - if (credentials) { +export async function detectEmbeddingProvider

( + preferredProvider: P, model?: EmbeddingModelName +): Promise { + const credentials = await getProviderCredentials(preferredProvider); + if (credentials) { + if (!model) { return { provider: preferredProvider, credentials, modelInfo: getDefaultModelForProvider(preferredProvider), - }; + } as ConfiguredProviderInfo; } - throw new Error( - `Preferred provider '${preferredProvider}' is not configured or authenticated` - ); + if (!isValidModel(model, preferredProvider)) { + throw new Error( + `Model '${model}' is not supported by provider '${preferredProvider}'` + ); + } + const providerModels = EMBEDDING_MODELS[preferredProvider]; + return { + provider: preferredProvider, + credentials, + modelInfo: providerModels[model], + } as ConfiguredProviderInfo; } + throw new Error( + `Preferred provider '${preferredProvider}' is not configured or authenticated` + ); +} - for (const provider of EMBEDDING_CAPABLE_PROVIDERS) { +export async function tryDetectProvider(): Promise { + for (const provider of availableProviders) { const credentials = await getProviderCredentials(provider); if (credentials) { return { provider, credentials, modelInfo: getDefaultModelForProvider(provider), - }; + } as ConfiguredProviderInfo; } } throw new Error( - `No embedding-capable provider found. Please authenticate with OpenCode using one of: ${EMBEDDING_CAPABLE_PROVIDERS.join(", ")}.` + `No embedding-capable provider found. 
Please authenticate with OpenCode using one of: ${availableProviders.join(", ")}.` ); } diff --git a/src/embeddings/provider.ts b/src/embeddings/provider.ts index e1a5c2b..9ffacbb 100644 --- a/src/embeddings/provider.ts +++ b/src/embeddings/provider.ts @@ -1,5 +1,5 @@ -import { EmbeddingModelInfo } from "../config/schema.js"; -import { ProviderCredentials } from "./detector.js"; +import { EmbeddingModelInfo, EmbeddingProviderModelInfo } from "../config/schema.js"; +import { ConfiguredProviderInfo, ProviderCredentials } from "./detector.js"; export interface EmbeddingResult { embedding: number[]; @@ -12,34 +12,36 @@ export interface EmbeddingBatchResult { } export interface EmbeddingProviderInterface { - embed(text: string): Promise; + embedQuery(query: string): Promise; + embedDocument(document: string): Promise; embedBatch(texts: string[]): Promise; getModelInfo(): EmbeddingModelInfo; } export function createEmbeddingProvider( - credentials: ProviderCredentials, - modelInfo: EmbeddingModelInfo + configuredProviderInfo: ConfiguredProviderInfo, ): EmbeddingProviderInterface { - switch (credentials.provider) { + switch (configuredProviderInfo.provider) { case "github-copilot": - return new GitHubCopilotEmbeddingProvider(credentials, modelInfo); + return new GitHubCopilotEmbeddingProvider(configuredProviderInfo.credentials, configuredProviderInfo.modelInfo); case "openai": - return new OpenAIEmbeddingProvider(credentials, modelInfo); + return new OpenAIEmbeddingProvider(configuredProviderInfo.credentials, configuredProviderInfo.modelInfo); case "google": - return new GoogleEmbeddingProvider(credentials, modelInfo); + return new GoogleEmbeddingProvider(configuredProviderInfo.credentials, configuredProviderInfo.modelInfo); case "ollama": - return new OllamaEmbeddingProvider(credentials, modelInfo); - default: - throw new Error(`Unsupported embedding provider: ${credentials.provider}`); + return new OllamaEmbeddingProvider(configuredProviderInfo.credentials, 
configuredProviderInfo.modelInfo); + default: { + const _exhaustive: never = configuredProviderInfo; + throw new Error(`Unsupported embedding provider: ${(_exhaustive as ConfiguredProviderInfo).provider}`); + } } } class GitHubCopilotEmbeddingProvider implements EmbeddingProviderInterface { constructor( private credentials: ProviderCredentials, - private modelInfo: EmbeddingModelInfo - ) {} + private modelInfo: EmbeddingProviderModelInfo['github-copilot'] + ) { } private getToken(): string { if (!this.credentials.refreshToken) { @@ -48,8 +50,16 @@ class GitHubCopilotEmbeddingProvider implements EmbeddingProviderInterface { return this.credentials.refreshToken; } - async embed(text: string): Promise { - const result = await this.embedBatch([text]); + async embedQuery(query: string): Promise { + const result = await this.embedBatch([query]); + return { + embedding: result.embeddings[0], + tokensUsed: result.totalTokensUsed, + }; + } + + async embedDocument(document: string): Promise { + const result = await this.embedBatch([document]); return { embedding: result.embeddings[0], tokensUsed: result.totalTokensUsed, @@ -97,11 +107,19 @@ class GitHubCopilotEmbeddingProvider implements EmbeddingProviderInterface { class OpenAIEmbeddingProvider implements EmbeddingProviderInterface { constructor( private credentials: ProviderCredentials, - private modelInfo: EmbeddingModelInfo - ) {} + private modelInfo: EmbeddingProviderModelInfo['openai'] + ) { } - async embed(text: string): Promise { - const result = await this.embedBatch([text]); + async embedQuery(query: string): Promise { + const result = await this.embedBatch([query]); + return { + embedding: result.embeddings[0], + tokensUsed: result.totalTokensUsed, + }; + } + + async embedDocument(document: string): Promise { + const result = await this.embedBatch([document]); return { embedding: result.embeddings[0], tokensUsed: result.totalTokensUsed, @@ -143,13 +161,25 @@ class OpenAIEmbeddingProvider implements 
EmbeddingProviderInterface { } class GoogleEmbeddingProvider implements EmbeddingProviderInterface { + private static readonly BATCH_SIZE = 20; + constructor( private credentials: ProviderCredentials, - private modelInfo: EmbeddingModelInfo - ) {} + private modelInfo: EmbeddingProviderModelInfo['google'] + ) { } - async embed(text: string): Promise { - const result = await this.embedBatch([text]); + async embedQuery(query: string): Promise { + const taskType = this.modelInfo.taskAble ? "CODE_RETRIEVAL_QUERY" : undefined; + const result = await this.embedWithTaskType([query], taskType); + return { + embedding: result.embeddings[0], + tokensUsed: result.totalTokensUsed, + }; + } + + async embedDocument(document: string): Promise { + const taskType = this.modelInfo.taskAble ? "RETRIEVAL_DOCUMENT" : undefined; + const result = await this.embedWithTaskType([document], taskType); return { embedding: result.embeddings[0], tokensUsed: result.totalTokensUsed, @@ -157,20 +187,44 @@ class GoogleEmbeddingProvider implements EmbeddingProviderInterface { } async embedBatch(texts: string[]): Promise { - const results = await Promise.all( - texts.map(async (text) => { + const taskType = this.modelInfo.taskAble ? "RETRIEVAL_DOCUMENT" : undefined; + return this.embedWithTaskType(texts, taskType); + } + + /** + * Embeds texts using the Google embedContent API. + * Sends multiple texts as parts in batched requests (up to BATCH_SIZE per call). + * When taskType is provided (gemini-embedding-001), includes it in the request + * for task-specific embedding optimization. 
+ */ + private async embedWithTaskType( + texts: string[], + taskType?: string + ): Promise { + const batches: string[][] = []; + for (let i = 0; i < texts.length; i += GoogleEmbeddingProvider.BATCH_SIZE) { + batches.push(texts.slice(i, i + GoogleEmbeddingProvider.BATCH_SIZE)); + } + + const batchResults = await Promise.all( + batches.map(async (batch) => { + const requests = batch.map((text) => ({ + model: `models/${this.modelInfo.model}`, + content: { + parts: [{ text }], + }, + taskType, + outputDimensionality: this.modelInfo.dimensions, + })); + const response = await fetch( - `${this.credentials.baseUrl}/models/${this.modelInfo.model}:embedContent?key=${this.credentials.apiKey}`, + `${this.credentials.baseUrl}/models/${this.modelInfo.model}:batchEmbedContents?key=${this.credentials.apiKey}`, { method: "POST", headers: { "Content-Type": "application/json", }, - body: JSON.stringify({ - content: { - parts: [{ text }], - }, - }), + body: JSON.stringify({ requests }), } ); @@ -179,20 +233,20 @@ class GoogleEmbeddingProvider implements EmbeddingProviderInterface { throw new Error(`Google embedding API error: ${response.status} - ${error}`); } - const data = await response.json() as { - embedding: { values: number[] }; + const data = (await response.json()) as { + embeddings: Array<{ values: number[] }>; }; return { - embedding: data.embedding.values, - tokensUsed: Math.ceil(text.length / 4), + embeddings: data.embeddings.map((e) => e.values), + tokensUsed: batch.reduce((sum, text) => sum + Math.ceil(text.length / 4), 0), }; }) ); return { - embeddings: results.map((r) => r.embedding), - totalTokensUsed: results.reduce((sum, r) => sum + r.tokensUsed, 0), + embeddings: batchResults.flatMap((r) => r.embeddings), + totalTokensUsed: batchResults.reduce((sum, r) => sum + r.tokensUsed, 0), }; } @@ -204,39 +258,55 @@ class GoogleEmbeddingProvider implements EmbeddingProviderInterface { class OllamaEmbeddingProvider implements EmbeddingProviderInterface { constructor( 
private credentials: ProviderCredentials, - private modelInfo: EmbeddingModelInfo - ) {} + private modelInfo: EmbeddingProviderModelInfo['ollama'] + ) { } - async embed(text: string): Promise { - const response = await fetch(`${this.credentials.baseUrl}/api/embeddings`, { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ - model: this.modelInfo.model, - prompt: text, - }), - }); - - if (!response.ok) { - const error = await response.text(); - throw new Error(`Ollama embedding API error: ${response.status} - ${error}`); - } - - const data = await response.json() as { - embedding: number[]; + async embedQuery(query: string): Promise { + const result = await this.embedBatch([query]); + return { + embedding: result.embeddings[0], + tokensUsed: result.totalTokensUsed, }; + } + async embedDocument(document: string): Promise { + const result = await this.embedBatch([document]); return { - embedding: data.embedding, - tokensUsed: Math.ceil(text.length / 4), + embedding: result.embeddings[0], + tokensUsed: result.totalTokensUsed, }; } async embedBatch(texts: string[]): Promise { - const results = await Promise.all(texts.map((text) => this.embed(text))); - + const results = await Promise.all( + texts.map(async (text) => { + const response = await fetch(`${this.credentials.baseUrl}/api/embeddings`, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model: this.modelInfo.model, + prompt: text, + }), + }); + + if (!response.ok) { + const error = await response.text(); + throw new Error(`Ollama embedding API error: ${response.status} - ${error}`); + } + + const data = (await response.json()) as { + embedding: number[]; + }; + + return { + embedding: data.embedding, + tokensUsed: Math.ceil(text.length / 4), + }; + }) + ); + return { embeddings: results.map((r) => r.embedding), totalTokensUsed: results.reduce((sum, r) => sum + r.tokensUsed, 0), diff --git a/src/indexer/index.ts 
b/src/indexer/index.ts index 93b9981..5990bc4 100644 --- a/src/indexer/index.ts +++ b/src/indexer/index.ts @@ -5,7 +5,7 @@ import PQueue from "p-queue"; import pRetry from "p-retry"; import { ParsedCodebaseIndexConfig } from "../config/schema.js"; -import { detectEmbeddingProvider, DetectedProvider } from "../embeddings/detector.js"; +import { detectEmbeddingProvider, ConfiguredProviderInfo, tryDetectProvider } from "../embeddings/detector.js"; import { createEmbeddingProvider, EmbeddingProviderInterface, @@ -69,6 +69,34 @@ export interface IndexStats { failedBatchesPath?: string; } +export interface SearchResult { + filePath: string; + startLine: number; + endLine: number; + content: string; + score: number; + chunkType: string; + name?: string; +} + +export interface HealthCheckResult { + removed: number; + filePaths: string[]; + gcOrphanEmbeddings: number; + gcOrphanChunks: number; +} + +export interface StatusResult { + indexed: boolean; + vectorCount: number; + provider: string; + model: string; + indexPath: string; + currentBranch: string; + baseBranch: string; + compatibility: IndexCompatibility | null; +} + export interface IndexProgress { phase: "scanning" | "parsing" | "embedding" | "storing" | "complete"; filesProcessed: number; @@ -104,13 +132,16 @@ interface IndexMetadata { updatedAt: string; } +enum IncompatibilityCode { + DIMENSION_MISMATCH = "DIMENSION_MISMATCH", + MODEL_MISMATCH = "MODEL_MISMATCH", +} + interface IndexCompatibility { compatible: boolean; + code?: IncompatibilityCode; reason?: string; storedMetadata?: IndexMetadata; - currentProvider?: string; - currentModel?: string; - currentDimensions?: number; } const INDEX_METADATA_VERSION = "1"; @@ -123,7 +154,7 @@ export class Indexer { private invertedIndex: InvertedIndex | null = null; private database: Database | null = null; private provider: EmbeddingProviderInterface | null = null; - private detectedProvider: DetectedProvider | null = null; + private configuredProviderInfo: 
ConfiguredProviderInfo | null = null; private fileHashCache: Map = new Map(); private fileHashCachePath: string = ""; private failedBatchesPath: string = ""; @@ -201,14 +232,14 @@ export class Indexer { private async recoverFromInterruptedIndexing(): Promise { this.logger.warn("Detected interrupted indexing session, recovering..."); - + if (existsSync(this.fileHashCachePath)) { unlinkSync(this.fileHashCachePath); } - + await this.healthCheck(); this.releaseIndexingLock(); - + this.logger.info("Recovery complete, next index will re-process all files"); } @@ -227,7 +258,7 @@ export class Indexer { private saveFailedBatches(batches: FailedBatch[]): void { if (batches.length === 0) { if (existsSync(this.failedBatchesPath)) { - fsPromises.unlink(this.failedBatchesPath).catch(() => {}); + fsPromises.unlink(this.failedBatchesPath).catch(() => { }); } return; } @@ -245,11 +276,11 @@ export class Indexer { this.saveFailedBatches(existing); } - private getProviderRateLimits(provider: string): { - concurrency: number; - intervalMs: number; - minRetryMs: number; - maxRetryMs: number; + private getProviderRateLimits(provider: string): { + concurrency: number; + intervalMs: number; + minRetryMs: number; + maxRetryMs: number; } { switch (provider) { case "github-copilot": @@ -266,23 +297,25 @@ export class Indexer { } async initialize(): Promise { - this.detectedProvider = await detectEmbeddingProvider(this.config.embeddingProvider); - if (!this.detectedProvider) { + if (this.config.embeddingProvider === 'auto') { + this.configuredProviderInfo = await tryDetectProvider(); + } else { + this.configuredProviderInfo = await detectEmbeddingProvider(this.config.embeddingProvider, this.config.embeddingModel); + } + + if (!this.configuredProviderInfo) { throw new Error( "No embedding provider available. Configure GitHub, OpenAI, Google, or Ollama." 
); } this.logger.info("Initializing indexer", { - provider: this.detectedProvider.provider, - model: this.detectedProvider.modelInfo.model, + provider: this.configuredProviderInfo.provider, + model: this.configuredProviderInfo.modelInfo.model, scope: this.config.scope, }); - this.provider = createEmbeddingProvider( - this.detectedProvider.credentials, - this.detectedProvider.modelInfo - ); + this.provider = createEmbeddingProvider(this.configuredProviderInfo); await fsPromises.mkdir(this.indexPath, { recursive: true }); @@ -290,7 +323,7 @@ export class Indexer { await this.recoverFromInterruptedIndexing(); } - const dimensions = this.detectedProvider.modelInfo.dimensions; + const dimensions = this.configuredProviderInfo.modelInfo.dimensions; const storePath = path.join(this.indexPath, "vectors"); this.store = new VectorStore(storePath, dimensions); @@ -318,16 +351,12 @@ export class Indexer { this.migrateFromLegacyIndex(); } - this.indexCompatibility = this.validateIndexCompatibility(this.detectedProvider); + this.indexCompatibility = this.validateIndexCompatibility(this.configuredProviderInfo); if (!this.indexCompatibility.compatible) { this.logger.warn("Index compatibility issue detected", { reason: this.indexCompatibility.reason, - storedProvider: this.indexCompatibility.storedMetadata?.embeddingProvider, - storedModel: this.indexCompatibility.storedMetadata?.embeddingModel, - storedDimensions: this.indexCompatibility.storedMetadata?.embeddingDimensions, - currentProvider: this.indexCompatibility.currentProvider, - currentModel: this.indexCompatibility.currentModel, - currentDimensions: this.indexCompatibility.currentDimensions, + storedMetadata: this.indexCompatibility.storedMetadata, + configuredProviderInfo: this.configuredProviderInfo, }); } @@ -432,7 +461,7 @@ export class Indexer { }; } - private saveIndexMetadata(provider: DetectedProvider): void { + private saveIndexMetadata(provider: ConfiguredProviderInfo): void { if (!this.database) return; const now 
= new Date().toISOString(); @@ -449,7 +478,7 @@ export class Indexer { } } - private validateIndexCompatibility(provider: DetectedProvider): IndexCompatibility { + private validateIndexCompatibility(provider: ConfiguredProviderInfo): IndexCompatibility { const storedMetadata = this.loadIndexMetadata(); if (!storedMetadata) { @@ -463,66 +492,66 @@ export class Indexer { if (storedMetadata.embeddingDimensions !== currentDimensions) { return { compatible: false, - reason: `Dimension mismatch: index has ${storedMetadata.embeddingDimensions}D vectors (${storedMetadata.embeddingProvider}/${storedMetadata.embeddingModel}), but current provider uses ${currentDimensions}D (${currentProvider}/${currentModel}). Run "index --force" to rebuild.`, + code: IncompatibilityCode.DIMENSION_MISMATCH, + reason: `Dimension mismatch: index has ${storedMetadata.embeddingDimensions}D vectors (${storedMetadata.embeddingProvider}/${storedMetadata.embeddingModel}), but current provider uses ${currentDimensions}D (${currentProvider}/${currentModel}). Run index_codebase with force=true to rebuild.`, storedMetadata, - currentProvider, - currentModel, - currentDimensions, }; } if (storedMetadata.embeddingModel !== currentModel) { return { compatible: false, - reason: `Model mismatch: index was built with "${storedMetadata.embeddingModel}", but current model is "${currentModel}". Embeddings may be incompatible. Run "index --force" to rebuild.`, + code: IncompatibilityCode.MODEL_MISMATCH, + reason: `Model mismatch: index was built with "${storedMetadata.embeddingModel}", but current model is "${currentModel}". Embeddings are incompatible. 
Run index_codebase with force=true to rebuild.`, storedMetadata, - currentProvider, - currentModel, - currentDimensions, }; } + if (storedMetadata.embeddingProvider !== currentProvider) { + this.logger.warn("Provider changed", { + storedProvider: storedMetadata.embeddingProvider, + currentProvider, + }); + } + return { compatible: true, storedMetadata, - currentProvider, - currentModel, - currentDimensions, }; } checkCompatibility(): IndexCompatibility { - if (this.indexCompatibility) { - return this.indexCompatibility; - } - return { compatible: true }; - } + if (!this.indexCompatibility) { + if (!this.configuredProviderInfo) { + throw new Error('No embedding provider info, you must initialize the indexer first.'); + } - requiresRebuild(): boolean { - return !this.checkCompatibility().compatible; + this.indexCompatibility = this.validateIndexCompatibility(this.configuredProviderInfo); + } + return this.indexCompatibility; } private async ensureInitialized(): Promise<{ store: VectorStore; provider: EmbeddingProviderInterface; invertedIndex: InvertedIndex; - detectedProvider: DetectedProvider; + configuredProviderInfo: ConfiguredProviderInfo; database: Database; }> { - if (!this.store || !this.provider || !this.invertedIndex || !this.detectedProvider || !this.database) { + if (!this.store || !this.provider || !this.invertedIndex || !this.configuredProviderInfo || !this.database) { await this.initialize(); } return { store: this.store!, provider: this.provider!, invertedIndex: this.invertedIndex!, - detectedProvider: this.detectedProvider!, + configuredProviderInfo: this.configuredProviderInfo!, database: this.database!, }; } async estimateCost(): Promise { - const { detectedProvider } = await this.ensureInitialized(); + const { configuredProviderInfo } = await this.ensureInitialized(); const { files } = await collectFiles( this.projectRoot, @@ -531,11 +560,18 @@ export class Indexer { this.config.indexing.maxFileSize ); - return createCostEstimate(files, 
detectedProvider); + return createCostEstimate(files, configuredProviderInfo); } async index(onProgress?: ProgressCallback): Promise { - const { store, provider, invertedIndex, database, detectedProvider } = await this.ensureInitialized(); + const { store, provider, invertedIndex, database, configuredProviderInfo } = await this.ensureInitialized(); + + if (!this.indexCompatibility?.compatible) { + throw new Error( + `${this.indexCompatibility?.reason} ` + + `Run index_codebase with force=true to rebuild the index.` + ); + } this.acquireIndexingLock(); this.logger.recordIndexingStart(); @@ -588,7 +624,7 @@ export class Indexer { for (const f of files) { const currentHash = hashFile(f.path); currentFileHashes.set(f.path, currentHash); - + if (this.fileHashCache.get(f.path) === currentHash) { unchangedFilePaths.add(f.path); this.logger.recordCacheHit(); @@ -615,7 +651,7 @@ export class Indexer { const parseStartTime = performance.now(); const parsedFiles = parseFiles(changedFiles); const parseMs = performance.now() - parseStartTime; - + this.logger.recordFilesParsed(parsedFiles.length); this.logger.recordParseDuration(parseMs); this.logger.debug("Parsed changed files", { parsedCount: parsedFiles.length, parseMs: parseMs.toFixed(2) }); @@ -647,27 +683,27 @@ export class Indexer { for (const parsed of parsedFiles) { currentFilePaths.add(parsed.path); - + if (parsed.chunks.length === 0) { const relativePath = path.relative(this.projectRoot, parsed.path); stats.parseFailures.push(relativePath); } - + let fileChunkCount = 0; for (const chunk of parsed.chunks) { if (fileChunkCount >= this.config.indexing.maxChunksPerFile) { break; } - + if (this.config.indexing.semanticOnly && chunk.chunkType === "other") { continue; } - + const id = generateChunkId(parsed.path, chunk); const contentHash = generateChunkHash(chunk); currentChunkIds.add(id); - const chunkData: ChunkData = { + chunkDataBatch.push({ chunkId: id, contentHash, filePath: parsed.path, @@ -676,8 +712,7 @@ export 
class Indexer { nodeType: chunk.chunkType, name: chunk.name, language: chunk.language, - }; - chunkDataBatch.push(chunkData); + }); if (existingChunks.get(id) === contentHash) { fileChunkCount++; @@ -771,7 +806,7 @@ export class Indexer { const allContentHashes = pendingChunks.map((c) => c.contentHash); const missingHashes = new Set(database.getMissingEmbeddings(allContentHashes)); - + const chunksNeedingEmbedding = pendingChunks.filter((c) => missingHashes.has(c.contentHash)); const chunksWithExistingEmbedding = pendingChunks.filter((c) => !missingHashes.has(c.contentHash)); @@ -792,11 +827,11 @@ export class Indexer { } } - const providerRateLimits = this.getProviderRateLimits(detectedProvider.provider); - const queue = new PQueue({ - concurrency: providerRateLimits.concurrency, - interval: providerRateLimits.intervalMs, - intervalCap: providerRateLimits.concurrency + const providerRateLimits = this.getProviderRateLimits(configuredProviderInfo.provider); + const queue = new PQueue({ + concurrency: providerRateLimits.concurrency, + interval: providerRateLimits.intervalMs, + intervalCap: providerRateLimits.concurrency }); const dynamicBatches = createDynamicBatches(chunksNeedingEmbedding); let rateLimitBackoffMs = 0; @@ -806,7 +841,7 @@ export class Indexer { if (rateLimitBackoffMs > 0) { await new Promise(resolve => setTimeout(resolve, rateLimitBackoffMs)); } - + try { const result = await pRetry( async () => { @@ -836,7 +871,7 @@ export class Indexer { }, } ); - + if (rateLimitBackoffMs > 0) { rateLimitBackoffMs = Math.max(0, rateLimitBackoffMs - 2000); } @@ -848,12 +883,12 @@ export class Indexer { })); store.addBatch(items); - + const embeddingBatchItems = batch.map((chunk, i) => ({ contentHash: chunk.contentHash, embedding: float32ArrayToBuffer(result.embeddings[i]), chunkText: chunk.text, - model: detectedProvider.modelInfo.model, + model: configuredProviderInfo.modelInfo.model, })); database.upsertEmbeddingsBatch(embeddingBatchItems); @@ -861,7 +896,7 @@ 
export class Indexer { invertedIndex.removeChunk(chunk.id); invertedIndex.addChunk(chunk.id, chunk.content); } - + stats.indexedChunks += batch.length; stats.tokensUsed += result.totalTokensUsed; @@ -915,8 +950,8 @@ export class Indexer { } stats.durationMs = Date.now() - startTime; - - this.saveIndexMetadata(detectedProvider); + + this.saveIndexMetadata(configuredProviderInfo); this.indexCompatibility = { compatible: true }; this.logger.recordIndexingEnd(); @@ -949,16 +984,16 @@ export class Indexer { private async getQueryEmbedding(query: string, provider: EmbeddingProviderInterface): Promise { const now = Date.now(); const cached = this.queryEmbeddingCache.get(query); - + if (cached && (now - cached.timestamp) < this.queryCacheTtlMs) { this.logger.cache("debug", "Query embedding cache hit (exact)", { query: query.slice(0, 50) }); this.logger.recordQueryCacheHit(); return cached.embedding; } - + const similarMatch = this.findSimilarCachedQuery(query, now); if (similarMatch) { - this.logger.cache("debug", "Query embedding cache hit (similar)", { + this.logger.cache("debug", "Query embedding cache hit (similar)", { query: query.slice(0, 50), similarTo: similarMatch.key.slice(0, 50), similarity: similarMatch.similarity.toFixed(3), @@ -966,45 +1001,45 @@ export class Indexer { this.logger.recordQueryCacheSimilarHit(); return similarMatch.embedding; } - + this.logger.cache("debug", "Query embedding cache miss", { query: query.slice(0, 50) }); this.logger.recordQueryCacheMiss(); - const { embedding, tokensUsed } = await provider.embed(query); + const { embedding, tokensUsed } = await provider.embedQuery(query); this.logger.recordEmbeddingApiCall(tokensUsed); - + if (this.queryEmbeddingCache.size >= this.maxQueryCacheSize) { const oldestKey = this.queryEmbeddingCache.keys().next().value; if (oldestKey) { this.queryEmbeddingCache.delete(oldestKey); } } - + this.queryEmbeddingCache.set(query, { embedding, timestamp: now }); return embedding; } private 
findSimilarCachedQuery( - query: string, + query: string, now: number ): { key: string; embedding: number[]; similarity: number } | null { const queryTokens = this.tokenize(query); if (queryTokens.size === 0) return null; - + let bestMatch: { key: string; embedding: number[]; similarity: number } | null = null; - + for (const [cachedQuery, { embedding, timestamp }] of this.queryEmbeddingCache) { if ((now - timestamp) >= this.queryCacheTtlMs) continue; - + const cachedTokens = this.tokenize(cachedQuery); const similarity = this.jaccardSimilarity(queryTokens, cachedTokens); - + if (similarity >= this.querySimilarityThreshold) { if (!bestMatch || similarity > bestMatch.similarity) { bestMatch = { key: cachedQuery, embedding, similarity }; } } } - + return bestMatch; } @@ -1021,12 +1056,12 @@ export class Indexer { private jaccardSimilarity(a: Set, b: Set): number { if (a.size === 0 && b.size === 0) return 1; if (a.size === 0 || b.size === 0) return 0; - + let intersection = 0; for (const token of a) { if (b.has(token)) intersection++; } - + const union = a.size + b.size - intersection; return intersection / union; } @@ -1043,24 +1078,18 @@ export class Indexer { filterByBranch?: boolean; metadataOnly?: boolean; } - ): Promise< - Array<{ - filePath: string; - startLine: number; - endLine: number; - content: string; - score: number; - chunkType: string; - name?: string; - }> - > { + ): Promise { + const { store, provider, database } = await this.ensureInitialized(); + const compatibility = this.checkCompatibility(); if (!compatibility.compatible) { - throw new Error(compatibility.reason ?? "Index is incompatible with current embedding provider"); + throw new Error( + `${compatibility.reason ?? 
"Index is incompatible with current embedding provider."} ` + + `A possible solution is to run index_codebase with force=true to rebuild the index.` + ); } const searchStartTime = performance.now(); - const { store, provider, database } = await this.ensureInitialized(); if (store.count() === 0) { this.logger.search("debug", "Search on empty index", { query }); @@ -1111,8 +1140,8 @@ export class Indexer { if (options?.directory) { const normalizedDir = options.directory.replace(/^\/|\/$/g, ""); - if (!r.metadata.filePath.includes(`/${normalizedDir}/`) && - !r.metadata.filePath.includes(`${normalizedDir}/`)) return false; + if (!r.metadata.filePath.includes(`/${normalizedDir}/`) && + !r.metadata.filePath.includes(`${normalizedDir}/`)) return false; } if (options?.chunkType) { @@ -1146,7 +1175,7 @@ export class Indexer { let content = ""; let contextStartLine = r.metadata.startLine; let contextEndLine = r.metadata.endLine; - + if (!metadataOnly && this.config.search.includeContext) { try { const fileContent = await fsPromises.readFile( @@ -1155,10 +1184,10 @@ export class Indexer { ); const lines = fileContent.split("\n"); const contextLines = options?.contextLines ?? 
this.config.search.contextLines; - + contextStartLine = Math.max(1, r.metadata.startLine - contextLines); contextEndLine = Math.min(lines.length, r.metadata.endLine + contextLines); - + content = lines .slice(contextStartLine - 1, contextEndLine) .join("\n"); @@ -1186,7 +1215,7 @@ export class Indexer { ): Promise> { const { store, invertedIndex } = await this.ensureInitialized(); const scores = invertedIndex.search(query); - + if (scores.size === 0) { return []; } @@ -1246,25 +1275,18 @@ export class Indexer { return results.slice(0, limit); } - async getStatus(): Promise<{ - indexed: boolean; - vectorCount: number; - provider: string; - model: string; - indexPath: string; - currentBranch: string; - baseBranch: string; - }> { - const { store, detectedProvider } = await this.ensureInitialized(); + async getStatus(): Promise { + const { store, configuredProviderInfo } = await this.ensureInitialized(); return { indexed: store.count() > 0, vectorCount: store.count(), - provider: detectedProvider.provider, - model: detectedProvider.modelInfo.model, + provider: configuredProviderInfo.provider, + model: configuredProviderInfo.modelInfo.model, indexPath: this.indexPath, currentBranch: this.currentBranch, baseBranch: this.baseBranch, + compatibility: this.indexCompatibility, }; } @@ -1275,16 +1297,27 @@ export class Indexer { store.save(); invertedIndex.clear(); invertedIndex.save(); - + // Clear file hash cache so all files are re-parsed this.fileHashCache.clear(); this.saveFileHashCache(); - + // Clear branch catalog database.clearBranch(this.currentBranch); + + // Clear index metadata so compatibility is re-evaluated from scratch + database.deleteMetadata("index.version"); + database.deleteMetadata("index.embeddingProvider"); + database.deleteMetadata("index.embeddingModel"); + database.deleteMetadata("index.embeddingDimensions"); + database.deleteMetadata("index.createdAt"); + database.deleteMetadata("index.updatedAt"); + + // Re-validate compatibility (no stored 
metadata = compatible) + this.indexCompatibility = this.validateIndexCompatibility(this.configuredProviderInfo!); } - async healthCheck(): Promise<{ removed: number; filePaths: string[]; gcOrphanEmbeddings: number; gcOrphanChunks: number }> { + async healthCheck(): Promise { const { store, invertedIndex, database } = await this.ensureInitialized(); this.logger.gc("info", "Starting health check"); @@ -1334,7 +1367,7 @@ export class Indexer { async retryFailedBatches(): Promise<{ succeeded: number; failed: number; remaining: number }> { const { store, provider, invertedIndex } = await this.ensureInitialized(); - + const failedBatches = this.loadFailedBatches(); if (failedBatches.length === 0) { return { succeeded: 0, failed: 0, remaining: 0 }; @@ -1372,7 +1405,7 @@ export class Indexer { this.logger.recordChunksEmbedded(batch.chunks.length); this.logger.recordEmbeddingApiCall(result.totalTokensUsed); - + succeeded += batch.chunks.length; } catch (error) { failed += batch.chunks.length; @@ -1387,7 +1420,7 @@ export class Indexer { } this.saveFailedBatches(stillFailing); - + if (succeeded > 0) { store.save(); invertedIndex.save(); @@ -1426,7 +1459,7 @@ export class Indexer { async findSimilar( code: string, - limit?: number, + limit: number = this.config.search.maxResults, options?: { fileType?: string; directory?: string; @@ -1434,41 +1467,39 @@ export class Indexer { excludeFile?: string; filterByBranch?: boolean; } - ): Promise< - Array<{ - filePath: string; - startLine: number; - endLine: number; - content: string; - score: number; - chunkType: string; - name?: string; - }> - > { - const searchStartTime = performance.now(); + ): Promise { const { store, provider, database } = await this.ensureInitialized(); + + const compatibility = this.checkCompatibility(); + if (!compatibility.compatible) { + throw new Error( + `${compatibility.reason ?? 
"Index is incompatible with current embedding provider."} ` + + `Run index_codebase with force=true to rebuild the index.` + ); + } + + const searchStartTime = performance.now(); if (store.count() === 0) { this.logger.search("debug", "Find similar on empty index"); return []; } - const maxResults = limit ?? this.config.search.maxResults; const filterByBranch = options?.filterByBranch ?? true; this.logger.search("debug", "Starting find similar", { codeLength: code.length, - maxResults, + limit, filterByBranch, }); const embeddingStartTime = performance.now(); - const { embedding, tokensUsed } = await provider.embed(code); + const { embedding, tokensUsed } = await provider.embedDocument(code); const embeddingMs = performance.now() - embeddingStartTime; this.logger.recordEmbeddingApiCall(tokensUsed); const vectorStartTime = performance.now(); - const semanticResults = store.search(embedding, maxResults * 2); + const semanticResults = store.search(embedding, limit * 2); const vectorMs = performance.now() - vectorStartTime; let branchChunkIds: Set | null = null; @@ -1492,8 +1523,8 @@ export class Indexer { if (options?.directory) { const normalizedDir = options.directory.replace(/^\/|\/$/g, ""); - if (!r.metadata.filePath.includes(`/${normalizedDir}/`) && - !r.metadata.filePath.includes(`${normalizedDir}/`)) return false; + if (!r.metadata.filePath.includes(`/${normalizedDir}/`) && + !r.metadata.filePath.includes(`${normalizedDir}/`)) return false; } if (options?.chunkType) { @@ -1501,7 +1532,7 @@ export class Indexer { } return true; - }).slice(0, maxResults); + }).slice(0, limit); const totalSearchMs = performance.now() - searchStartTime; this.logger.recordSearch(totalSearchMs, { @@ -1521,7 +1552,7 @@ export class Indexer { return Promise.all( filtered.map(async (r) => { let content = ""; - + if (this.config.search.includeContext) { try { const fileContent = await fsPromises.readFile( diff --git a/src/tools/index.ts b/src/tools/index.ts index 3704782..3b2d3c0 100644 
--- a/src/tools/index.ts +++ b/src/tools/index.ts @@ -1,23 +1,22 @@ import { tool, type ToolDefinition } from "@opencode-ai/plugin"; -import { Indexer, IndexStats, IndexProgress } from "../indexer/index.js"; +import { Indexer } from "../indexer/index.js"; import { ParsedCodebaseIndexConfig } from "../config/schema.js"; import { formatCostEstimate } from "../utils/cost.js"; import type { LogLevel } from "../config/schema.js"; +import { + formatProgressTitle, + formatIndexStats, + formatStatus, + calculatePercentage, + formatCodebasePeek, + formatHealthCheck, + formatLogs, + formatSearchResults, +} from "./utils.js"; const z = tool.schema; -const MAX_CONTENT_LINES = 30; - -function truncateContent(content: string): string { - const lines = content.split("\n"); - if (lines.length <= MAX_CONTENT_LINES) return content; - return ( - lines.slice(0, MAX_CONTENT_LINES).join("\n") + - `\n// ... (${lines.length - MAX_CONTENT_LINES} more lines)` - ); -} - let sharedIndexer: Indexer | null = null; export function initializeTools(projectRoot: string, config: ParsedCodebaseIndexConfig): void { @@ -31,42 +30,6 @@ function getIndexer(): Indexer { return sharedIndexer; } -export const codebase_search: ToolDefinition = tool({ - description: - "Search codebase by MEANING, not keywords. Returns full code content. Use when you need to see actual implementation. For just finding WHERE code is (saves ~90% tokens), use codebase_peek instead. For known identifiers like 'validateToken', use grep - it's faster.", - args: { - query: z.string().describe("Natural language description of what code you're looking for. 
Describe behavior, not syntax."), - limit: z.number().optional().default(5).describe("Maximum number of results to return"), - fileType: z.string().optional().describe("Filter by file extension (e.g., 'ts', 'py', 'rs')"), - directory: z.string().optional().describe("Filter by directory path (e.g., 'src/utils', 'lib')"), - chunkType: z.enum(["function", "class", "method", "interface", "type", "enum", "struct", "impl", "trait", "module", "other"]).optional().describe("Filter by code chunk type"), - contextLines: z.number().optional().describe("Number of extra lines to include before/after each match (default: 0)"), - }, - async execute(args) { - const indexer = getIndexer(); - const results = await indexer.search(args.query, args.limit ?? 5, { - fileType: args.fileType, - directory: args.directory, - chunkType: args.chunkType, - contextLines: args.contextLines, - }); - - if (results.length === 0) { - return "No matching code found. Try a different query or run index_codebase first."; - } - - const formatted = results.map((r, idx) => { - const header = r.name - ? `[${idx + 1}] ${r.chunkType} "${r.name}" in ${r.filePath}:${r.startLine}-${r.endLine}` - : `[${idx + 1}] ${r.chunkType} in ${r.filePath}:${r.startLine}-${r.endLine}`; - - return `${header} (score: ${r.score.toFixed(2)})\n\`\`\`\n${truncateContent(r.content)}\n\`\`\``; - }); - - return `Found ${results.length} results for "${args.query}":\n\n${formatted.join("\n\n")}`; - }, -}); - export const codebase_peek: ToolDefinition = tool({ description: "Quick lookup of code locations by meaning. Returns only metadata (file, line, name, type) WITHOUT code content. Use this first to find WHERE code is, then use Read tool to examine specific files. Saves tokens by not returning full code blocks. Best for: discovery, navigation, finding multiple related locations.", @@ -86,17 +49,7 @@ export const codebase_peek: ToolDefinition = tool({ metadataOnly: true, }); - if (results.length === 0) { - return "No matching code found. 
Try a different query or run index_codebase first."; - } - - const formatted = results.map((r, idx) => { - const location = `${r.filePath}:${r.startLine}-${r.endLine}`; - const name = r.name ? `"${r.name}"` : "(anonymous)"; - return `[${idx + 1}] ${r.chunkType} ${name} at ${location} (score: ${r.score.toFixed(2)})`; - }); - - return `Found ${results.length} locations for "${args.query}":\n\n${formatted.join("\n")}\n\nUse Read tool to examine specific files.`; + return formatCodebasePeek(results, args.query); }, }); @@ -156,29 +109,7 @@ export const index_health_check: ToolDefinition = tool({ const indexer = getIndexer(); const result = await indexer.healthCheck(); - if (result.removed === 0 && result.gcOrphanEmbeddings === 0 && result.gcOrphanChunks === 0) { - return "Index is healthy. No stale entries found."; - } - - const lines = [`Health check complete:`]; - - if (result.removed > 0) { - lines.push(` Removed stale entries: ${result.removed}`); - } - - if (result.gcOrphanEmbeddings > 0) { - lines.push(` Garbage collected orphan embeddings: ${result.gcOrphanEmbeddings}`); - } - - if (result.gcOrphanChunks > 0) { - lines.push(` Garbage collected orphan chunks: ${result.gcOrphanChunks}`); - } - - if (result.filePaths.length > 0) { - lines.push(` Cleaned paths: ${result.filePaths.join(", ")}`); - } - - return lines.join("\n"); + return formatHealthCheck(result); }, }); @@ -227,14 +158,7 @@ export const index_logs: ToolDefinition = tool({ logs = logger.getLogs(args.limit); } - if (logs.length === 0) { - return "No logs recorded yet. Logs are captured during indexing and search operations."; - } - - return logs.map(l => { - const dataStr = l.data ? 
` ${JSON.stringify(l.data)}` : ""; - return `[${l.timestamp}] [${l.level.toUpperCase()}] [${l.category}] ${l.message}${dataStr}`; - }).join("\n"); + return formatLogs(logs); }, }); @@ -251,7 +175,7 @@ export const find_similar: ToolDefinition = tool({ }, async execute(args) { const indexer = getIndexer(); - const results = await indexer.findSimilar(args.code, args.limit ?? 10, { + const results = await indexer.findSimilar(args.code, args.limit, { fileType: args.fileType, directory: args.directory, chunkType: args.chunkType, @@ -262,132 +186,34 @@ export const find_similar: ToolDefinition = tool({ return "No similar code found. Try a different snippet or run index_codebase first."; } - const formatted = results.map((r, idx) => { - const header = r.name - ? `[${idx + 1}] ${r.chunkType} "${r.name}" in ${r.filePath}:${r.startLine}-${r.endLine}` - : `[${idx + 1}] ${r.chunkType} in ${r.filePath}:${r.startLine}-${r.endLine}`; - - return `${header} (similarity: ${(r.score * 100).toFixed(1)}%)\n\`\`\`\n${truncateContent(r.content)}\n\`\`\``; - }); - - return `Found ${results.length} similar code blocks:\n\n${formatted.join("\n\n")}`; + return `Found ${results.length} similar code blocks:\n\n${formatSearchResults(results)}`; }, }); -function formatIndexStats(stats: IndexStats, verbose: boolean = false): string { - const lines: string[] = []; - - if (stats.indexedChunks === 0 && stats.removedChunks === 0) { - lines.push(`Indexed. ${stats.totalFiles} files processed, ${stats.existingChunks} code chunks already up to date.`); - } else if (stats.indexedChunks === 0) { - lines.push(`Indexed. ${stats.totalFiles} files, removed ${stats.removedChunks} stale chunks, ${stats.existingChunks} chunks remain.`); - } else { - let main = `Indexed. 
${stats.totalFiles} files processed, ${stats.indexedChunks} new chunks embedded.`; - if (stats.existingChunks > 0) { - main += ` ${stats.existingChunks} unchanged chunks skipped.`; - } - lines.push(main); - - if (stats.removedChunks > 0) { - lines.push(`Removed ${stats.removedChunks} stale chunks.`); - } - - if (stats.failedChunks > 0) { - lines.push(`Failed: ${stats.failedChunks} chunks.`); - } - - lines.push(`Tokens: ${stats.tokensUsed.toLocaleString()}, Duration: ${(stats.durationMs / 1000).toFixed(1)}s`); - } - - if (verbose) { - if (stats.skippedFiles.length > 0) { - const tooLarge = stats.skippedFiles.filter(f => f.reason === "too_large"); - const excluded = stats.skippedFiles.filter(f => f.reason === "excluded"); - const gitignored = stats.skippedFiles.filter(f => f.reason === "gitignore"); - - lines.push(""); - lines.push(`Skipped files: ${stats.skippedFiles.length}`); - if (tooLarge.length > 0) { - lines.push(` Too large (${tooLarge.length}): ${tooLarge.slice(0, 5).map(f => f.path).join(", ")}${tooLarge.length > 5 ? "..." : ""}`); - } - if (excluded.length > 0) { - lines.push(` Excluded (${excluded.length}): ${excluded.slice(0, 5).map(f => f.path).join(", ")}${excluded.length > 5 ? "..." : ""}`); - } - if (gitignored.length > 0) { - lines.push(` Gitignored (${gitignored.length}): ${gitignored.slice(0, 5).map(f => f.path).join(", ")}${gitignored.length > 5 ? "..." : ""}`); - } - } +export const codebase_search: ToolDefinition = tool({ + description: + "Search codebase by MEANING, not keywords. Returns full code content. Use when you need to see actual implementation. For just finding WHERE code is (saves ~90% tokens), use codebase_peek instead. For known identifiers like 'validateToken', use grep - it's faster.", + args: { + query: z.string().describe("Natural language description of what code you're looking for. 
Describe behavior, not syntax."), + limit: z.number().optional().default(5).describe("Maximum number of results to return"), + fileType: z.string().optional().describe("Filter by file extension (e.g., 'ts', 'py', 'rs')"), + directory: z.string().optional().describe("Filter by directory path (e.g., 'src/utils', 'lib')"), + chunkType: z.enum(["function", "class", "method", "interface", "type", "enum", "struct", "impl", "trait", "module", "other"]).optional().describe("Filter by code chunk type"), + contextLines: z.number().optional().describe("Number of extra lines to include before/after each match (default: 0)"), + }, + async execute(args) { + const indexer = getIndexer(); + const results = await indexer.search(args.query, args.limit ?? 5, { + fileType: args.fileType, + directory: args.directory, + chunkType: args.chunkType, + contextLines: args.contextLines, + }); - if (stats.parseFailures.length > 0) { - lines.push(""); - lines.push(`Files with no extractable chunks (${stats.parseFailures.length}): ${stats.parseFailures.slice(0, 10).join(", ")}${stats.parseFailures.length > 10 ? "..." : ""}`); + if (results.length === 0) { + return "No matching code found. Try a different query or run index_codebase first."; } - } - - return lines.join("\n"); -} - -function formatStatus(status: { - indexed: boolean; - vectorCount: number; - provider: string; - model: string; - indexPath: string; - currentBranch: string; - baseBranch: string; -}): string { - if (!status.indexed) { - return "Codebase is not indexed. 
Run index_codebase to create an index."; - } - - const lines = [ - `Index status:`, - ` Indexed chunks: ${status.vectorCount.toLocaleString()}`, - ` Provider: ${status.provider}`, - ` Model: ${status.model}`, - ` Location: ${status.indexPath}`, - ]; - if (status.currentBranch !== "default") { - lines.push(` Current branch: ${status.currentBranch}`); - lines.push(` Base branch: ${status.baseBranch}`); - } - - return lines.join("\n"); -} - -function formatProgressTitle(progress: IndexProgress): string { - switch (progress.phase) { - case "scanning": - return "Scanning files..."; - case "parsing": - return `Parsing: ${progress.filesProcessed}/${progress.totalFiles} files`; - case "embedding": - return `Embedding: ${progress.chunksProcessed}/${progress.totalChunks} chunks`; - case "storing": - return "Storing index..."; - case "complete": - return "Indexing complete"; - default: - return "Indexing..."; - } -} - -function calculatePercentage(progress: IndexProgress): number { - if (progress.phase === "scanning") return 0; - if (progress.phase === "complete") return 100; - - if (progress.phase === "parsing") { - if (progress.totalFiles === 0) return 5; - return Math.round(5 + (progress.filesProcessed / progress.totalFiles) * 15); - } - - if (progress.phase === "embedding") { - if (progress.totalChunks === 0) return 20; - return Math.round(20 + (progress.chunksProcessed / progress.totalChunks) * 70); - } - - if (progress.phase === "storing") return 95; - - return 0; -} + return `Found ${results.length} results for "${args.query}":\n\n${formatSearchResults(results, "score")}`; + }, +}); diff --git a/src/tools/utils.ts b/src/tools/utils.ts new file mode 100644 index 0000000..c81f3d1 --- /dev/null +++ b/src/tools/utils.ts @@ -0,0 +1,206 @@ +import { IndexStats, IndexProgress, SearchResult, HealthCheckResult, StatusResult } from "../indexer/index.js"; +import type { LogEntry } from "../utils/logger.js"; + +const MAX_CONTENT_LINES = 30; + +function truncateContent(content: 
string): string { + const lines = content.split("\n"); + if (lines.length <= MAX_CONTENT_LINES) return content; + return ( + lines.slice(0, MAX_CONTENT_LINES).join("\n") + + `\n// ... (${lines.length - MAX_CONTENT_LINES} more lines)` + ); +} + +export function formatIndexStats(stats: IndexStats, verbose: boolean = false): string { + const lines: string[] = []; + + if (stats.indexedChunks === 0 && stats.removedChunks === 0) { + lines.push(`Indexed. ${stats.totalFiles} files processed, ${stats.existingChunks} code chunks already up to date.`); + } else if (stats.indexedChunks === 0) { + lines.push(`Indexed. ${stats.totalFiles} files, removed ${stats.removedChunks} stale chunks, ${stats.existingChunks} chunks remain.`); + } else { + let main = `Indexed. ${stats.totalFiles} files processed, ${stats.indexedChunks} new chunks embedded.`; + if (stats.existingChunks > 0) { + main += ` ${stats.existingChunks} unchanged chunks skipped.`; + } + lines.push(main); + + if (stats.removedChunks > 0) { + lines.push(`Removed ${stats.removedChunks} stale chunks.`); + } + + if (stats.failedChunks > 0) { + lines.push(`Failed: ${stats.failedChunks} chunks.`); + } + + lines.push(`Tokens: ${stats.tokensUsed.toLocaleString()}, Duration: ${(stats.durationMs / 1000).toFixed(1)}s`); + } + + if (verbose) { + if (stats.skippedFiles.length > 0) { + const tooLarge = stats.skippedFiles.filter(f => f.reason === "too_large"); + const excluded = stats.skippedFiles.filter(f => f.reason === "excluded"); + const gitignored = stats.skippedFiles.filter(f => f.reason === "gitignore"); + + lines.push(""); + lines.push(`Skipped files: ${stats.skippedFiles.length}`); + if (tooLarge.length > 0) { + lines.push(` Too large (${tooLarge.length}): ${tooLarge.slice(0, 5).map(f => f.path).join(", ")}${tooLarge.length > 5 ? "..." : ""}`); + } + if (excluded.length > 0) { + lines.push(` Excluded (${excluded.length}): ${excluded.slice(0, 5).map(f => f.path).join(", ")}${excluded.length > 5 ? "..." 
: ""}`); + } + if (gitignored.length > 0) { + lines.push(` Gitignored (${gitignored.length}): ${gitignored.slice(0, 5).map(f => f.path).join(", ")}${gitignored.length > 5 ? "..." : ""}`); + } + } + + if (stats.parseFailures.length > 0) { + lines.push(""); + lines.push(`Files with no extractable chunks (${stats.parseFailures.length}): ${stats.parseFailures.slice(0, 10).join(", ")}${stats.parseFailures.length > 10 ? "..." : ""}`); + } + } + + return lines.join("\n"); +} + +export function formatStatus(status: StatusResult): string { + if (!status.indexed) { + return "Codebase is not indexed. Run index_codebase to create an index."; + } + + const lines = [ + `Index status:`, + ` Indexed chunks: ${status.vectorCount.toLocaleString()}`, + ` Provider: ${status.provider}`, + ` Model: ${status.model}`, + ` Location: ${status.indexPath}`, + ]; + + if (status.currentBranch !== "default") { + lines.push(` Current branch: ${status.currentBranch}`); + lines.push(` Base branch: ${status.baseBranch}`); + } + + if (status.compatibility && !status.compatibility.compatible) { + lines.push(""); + lines.push(`COMPATIBILITY WARNING: ${status.compatibility.reason}`); + if (status.compatibility.storedMetadata) { + const stored = status.compatibility.storedMetadata; + lines.push(` Index was built with: ${stored.embeddingProvider}/${stored.embeddingModel} (${stored.embeddingDimensions}D)`); + lines.push(` Current config: ${status.provider}/${status.model}`); + } + } else if (!status.compatibility) { + lines.push(` Compatibility: No compatibility information found. 
Maybe the index is not initialized yet, try running index_codebase.`); + } else { + lines.push(` Compatibility: Index is compatible with the current provider and model.`); + } + + return lines.join("\n"); +} + +export function formatProgressTitle(progress: IndexProgress): string { + switch (progress.phase) { + case "scanning": + return "Scanning files..."; + case "parsing": + return `Parsing: ${progress.filesProcessed}/${progress.totalFiles} files`; + case "embedding": + return `Embedding: ${progress.chunksProcessed}/${progress.totalChunks} chunks`; + case "storing": + return "Storing index..."; + case "complete": + return "Indexing complete"; + default: + return "Indexing..."; + } +} + +export function calculatePercentage(progress: IndexProgress): number { + if (progress.phase === "scanning") return 0; + if (progress.phase === "complete") return 100; + + if (progress.phase === "parsing") { + if (progress.totalFiles === 0) return 5; + return Math.round(5 + (progress.filesProcessed / progress.totalFiles) * 15); + } + + if (progress.phase === "embedding") { + if (progress.totalChunks === 0) return 20; + return Math.round(20 + (progress.chunksProcessed / progress.totalChunks) * 70); + } + + if (progress.phase === "storing") return 95; + + return 0; +} + +export function formatCodebasePeek(results: SearchResult[], query: string): string { + if (results.length === 0) { + return "No matching code found. Try a different query or run index_codebase first."; + } + + const formatted = results.map((r, idx) => { + const location = `${r.filePath}:${r.startLine}-${r.endLine}`; + const name = r.name ? 
`"${r.name}"` : "(anonymous)"; + return `[${idx + 1}] ${r.chunkType} ${name} at ${location} (score: ${r.score.toFixed(2)})`; + }); + + return `Found ${results.length} locations for "${query}":\n\n${formatted.join("\n")}\n\nUse Read tool to examine specific files.`; +} + +export function formatHealthCheck(result: HealthCheckResult): string { + if (result.removed === 0 && result.gcOrphanEmbeddings === 0 && result.gcOrphanChunks === 0) { + return "Index is healthy. No stale entries found."; + } + + const lines = [`Health check complete:`]; + + if (result.removed > 0) { + lines.push(` Removed stale entries: ${result.removed}`); + } + + if (result.gcOrphanEmbeddings > 0) { + lines.push(` Garbage collected orphan embeddings: ${result.gcOrphanEmbeddings}`); + } + + if (result.gcOrphanChunks > 0) { + lines.push(` Garbage collected orphan chunks: ${result.gcOrphanChunks}`); + } + + if (result.filePaths.length > 0) { + lines.push(` Cleaned paths: ${result.filePaths.join(", ")}`); + } + + return lines.join("\n"); +} + +export function formatLogs(logs: LogEntry[]): string { + if (logs.length === 0) { + return "No logs recorded yet. Logs are captured during indexing and search operations."; + } + + return logs.map(l => { + const dataStr = l.data ? ` ${JSON.stringify(l.data)}` : ""; + return `[${l.timestamp}] [${l.level.toUpperCase()}] [${l.category}] ${l.message}${dataStr}`; + }).join("\n"); +} + +export type ScoreFormat = "score" | "similarity"; + +export function formatSearchResults(results: SearchResult[], scoreFormat: ScoreFormat = "similarity"): string { + const formatted = results.map((r, idx) => { + const header = r.name + ? `[${idx + 1}] ${r.chunkType} "${r.name}" in ${r.filePath}:${r.startLine}-${r.endLine}` + : `[${idx + 1}] ${r.chunkType} in ${r.filePath}:${r.startLine}-${r.endLine}`; + + const scoreLabel = scoreFormat === "similarity" + ? 
`(similarity: ${(r.score * 100).toFixed(1)}%)` + : `(score: ${r.score.toFixed(2)})`; + + return `${header} ${scoreLabel}\n\`\`\`\n${truncateContent(r.content)}\n\`\`\``; + }); + + return formatted.join("\n\n"); +} diff --git a/src/utils/cost.ts b/src/utils/cost.ts index 58c176a..311e1d3 100644 --- a/src/utils/cost.ts +++ b/src/utils/cost.ts @@ -1,5 +1,5 @@ import { EmbeddingModelInfo } from "../config/schema.js"; -import { getProviderDisplayName, DetectedProvider } from "../embeddings/detector.js"; +import { getProviderDisplayName, ConfiguredProviderInfo } from "../embeddings/detector.js"; export interface CostEstimate { filesCount: number; @@ -39,7 +39,7 @@ export function estimateCost( export function createCostEstimate( files: Array<{ path: string; size: number }>, - provider: DetectedProvider + provider: ConfiguredProviderInfo ): CostEstimate { const filesCount = files.length; const totalSizeBytes = files.reduce((sum, f) => sum + f.size, 0); diff --git a/tests/config.test.ts b/tests/config.test.ts index 88cf388..c12af9e 100644 --- a/tests/config.test.ts +++ b/tests/config.test.ts @@ -1,10 +1,13 @@ import { describe, it, expect } from "vitest"; import { parseConfig, - getDefaultConfig, getDefaultModelForProvider, - EMBEDDING_MODELS, + isValidModel, } from "../src/config/schema.js"; +import { + EMBEDDING_MODELS, + DEFAULT_PROVIDER_MODELS, +} from "../src/config/constants.js"; describe("config schema", () => { describe("parseConfig", () => { @@ -12,7 +15,7 @@ describe("config schema", () => { const config = parseConfig(undefined); expect(config.embeddingProvider).toBe("auto"); - expect(config.embeddingModel).toBe("auto"); + expect(config.embeddingModel).toBeUndefined(); expect(config.scope).toBe("project"); expect(config.include).toHaveLength(10); expect(config.exclude).toHaveLength(13); @@ -36,10 +39,10 @@ describe("config schema", () => { expect(parseConfig({ embeddingProvider: "google" }).embeddingProvider).toBe("google"); expect(parseConfig({ 
embeddingProvider: "ollama" }).embeddingProvider).toBe("ollama"); expect(parseConfig({ embeddingProvider: "github-copilot" }).embeddingProvider).toBe("github-copilot"); - expect(parseConfig({ embeddingProvider: "auto" }).embeddingProvider).toBe("auto"); }); it("should fallback to auto for invalid embeddingProvider", () => { + expect(parseConfig({ embeddingProvider: "auto" }).embeddingProvider).toBe("auto"); expect(parseConfig({ embeddingProvider: "invalid" }).embeddingProvider).toBe("auto"); expect(parseConfig({ embeddingProvider: 123 }).embeddingProvider).toBe("auto"); expect(parseConfig({ embeddingProvider: null }).embeddingProvider).toBe("auto"); @@ -55,13 +58,51 @@ describe("config schema", () => { expect(parseConfig({ scope: 123 }).scope).toBe("project"); }); - it("should parse embeddingModel as string", () => { - expect(parseConfig({ embeddingModel: "custom-model" }).embeddingModel).toBe("custom-model"); - }); + describe("embeddingModel parsing", () => { + it("should be undefined when no provider and no model given", () => { + expect(parseConfig({}).embeddingModel).toBeUndefined(); + }); + + it("should be undefined when valid provider but no model given", () => { + expect(parseConfig({ embeddingProvider: "openai" }).embeddingModel).toBeUndefined(); + }); + + it("should keep valid model for matching provider", () => { + const config = parseConfig({ embeddingProvider: "openai", embeddingModel: "text-embedding-3-large" }); + expect(config.embeddingModel).toBe("text-embedding-3-large"); + }); + + it("should fallback to provider default for invalid model", () => { + const config = parseConfig({ embeddingProvider: "openai", embeddingModel: "nonexistent-model" }); + expect(config.embeddingModel).toBe(DEFAULT_PROVIDER_MODELS["openai"]); + }); + + it("should fallback to provider default for model belonging to different provider", () => { + const config = parseConfig({ embeddingProvider: "openai", embeddingModel: "nomic-embed-text" }); + 
expect(config.embeddingModel).toBe(DEFAULT_PROVIDER_MODELS["openai"]); + }); + + it("should be undefined when provider is invalid even if model is specified", () => { + const config = parseConfig({ embeddingProvider: "invalid", embeddingModel: "text-embedding-3-small" }); + expect(config.embeddingProvider).toBe("auto"); + expect(config.embeddingModel).toBeUndefined(); + }); + + it("should fallback to provider default for non-string embeddingModel when truthy", () => { + expect(parseConfig({ embeddingProvider: "openai", embeddingModel: 123 }).embeddingModel).toBe(DEFAULT_PROVIDER_MODELS["openai"]); + }); + + it("should be undefined for falsy non-string embeddingModel", () => { + expect(parseConfig({ embeddingProvider: "openai", embeddingModel: null }).embeddingModel).toBeUndefined(); + expect(parseConfig({ embeddingProvider: "openai", embeddingModel: 0 }).embeddingModel).toBeUndefined(); + expect(parseConfig({ embeddingProvider: "openai", embeddingModel: "" }).embeddingModel).toBeUndefined(); + }); - it("should fallback to auto for non-string embeddingModel", () => { - expect(parseConfig({ embeddingModel: 123 }).embeddingModel).toBe("auto"); - expect(parseConfig({ embeddingModel: null }).embeddingModel).toBe("auto"); + it("should handle each provider with its valid model", () => { + expect(parseConfig({ embeddingProvider: "github-copilot", embeddingModel: "text-embedding-3-small" }).embeddingModel).toBe("text-embedding-3-small"); + expect(parseConfig({ embeddingProvider: "google", embeddingModel: "gemini-embedding-001" }).embeddingModel).toBe("gemini-embedding-001"); + expect(parseConfig({ embeddingProvider: "ollama", embeddingModel: "mxbai-embed-large" }).embeddingModel).toBe("mxbai-embed-large"); + }); }); it("should parse include as string array", () => { @@ -190,18 +231,6 @@ describe("config schema", () => { }); }); - describe("getDefaultConfig", () => { - it("should return expected default values", () => { - const config = getDefaultConfig(); - - 
expect(config.embeddingProvider).toBe("auto"); - expect(config.embeddingModel).toBe("auto"); - expect(config.scope).toBe("project"); - expect(config.include).toContain("**/*.{ts,tsx,js,jsx,mjs,cjs}"); - expect(config.exclude).toContain("**/node_modules/**"); - }); - }); - describe("getDefaultModelForProvider", () => { it("should return correct model for github-copilot", () => { const model = getDefaultModelForProvider("github-copilot"); @@ -219,7 +248,7 @@ describe("config schema", () => { it("should return correct model for google", () => { const model = getDefaultModelForProvider("google"); expect(model.provider).toBe("google"); - expect(model.model).toBe("text-embedding-004"); + expect(model.model).toBe("text-embedding-005"); expect(model.dimensions).toBe(768); }); @@ -228,32 +257,97 @@ describe("config schema", () => { expect(model.provider).toBe("ollama"); expect(model.model).toBe("nomic-embed-text"); }); + }); - it("should return github-copilot model for auto (default case)", () => { - const model = getDefaultModelForProvider("auto"); - expect(model.provider).toBe("github-copilot"); + describe("isValidModel", () => { + it("should return true for valid model of a provider", () => { + expect(isValidModel("text-embedding-3-small", "openai")).toBe(true); + expect(isValidModel("text-embedding-3-large", "openai")).toBe(true); + expect(isValidModel("text-embedding-3-small", "github-copilot")).toBe(true); + expect(isValidModel("nomic-embed-text", "ollama")).toBe(true); + expect(isValidModel("mxbai-embed-large", "ollama")).toBe(true); + expect(isValidModel("text-embedding-005", "google")).toBe(true); + expect(isValidModel("gemini-embedding-001", "google")).toBe(true); + }); + + it("should return false for model belonging to a different provider", () => { + expect(isValidModel("nomic-embed-text", "openai")).toBe(false); + expect(isValidModel("text-embedding-3-small", "ollama")).toBe(false); + expect(isValidModel("gemini-embedding-001", "openai")).toBe(false); + }); + + 
it("should return false for non-existent model", () => { + expect(isValidModel("nonexistent-model", "openai")).toBe(false); + expect(isValidModel("gpt-4", "openai")).toBe(false); + }); + + it("should return false for non-string values", () => { + expect(isValidModel(123, "openai")).toBe(false); + expect(isValidModel(null, "openai")).toBe(false); + expect(isValidModel(undefined, "openai")).toBe(false); + expect(isValidModel(true, "openai")).toBe(false); }); }); describe("EMBEDDING_MODELS", () => { - it("should have expected models defined", () => { - expect(EMBEDDING_MODELS).toHaveProperty("github-copilot/text-embedding-3-small"); - expect(EMBEDDING_MODELS).toHaveProperty("openai/text-embedding-3-small"); - expect(EMBEDDING_MODELS).toHaveProperty("openai/text-embedding-3-large"); - expect(EMBEDDING_MODELS).toHaveProperty("google/text-embedding-004"); - expect(EMBEDDING_MODELS).toHaveProperty("ollama/nomic-embed-text"); - expect(EMBEDDING_MODELS).toHaveProperty("ollama/mxbai-embed-large"); + it("should have all expected providers", () => { + expect(EMBEDDING_MODELS).toHaveProperty("github-copilot"); + expect(EMBEDDING_MODELS).toHaveProperty("openai"); + expect(EMBEDDING_MODELS).toHaveProperty("google"); + expect(EMBEDDING_MODELS).toHaveProperty("ollama"); + }); + + it("should have expected models per provider", () => { + expect(EMBEDDING_MODELS["github-copilot"]).toHaveProperty("text-embedding-3-small"); + expect(EMBEDDING_MODELS["openai"]).toHaveProperty("text-embedding-3-small"); + expect(EMBEDDING_MODELS["openai"]).toHaveProperty("text-embedding-3-large"); + expect(EMBEDDING_MODELS["google"]).toHaveProperty("text-embedding-005"); + expect(EMBEDDING_MODELS["google"]).toHaveProperty("gemini-embedding-001"); + expect(EMBEDDING_MODELS["ollama"]).toHaveProperty("nomic-embed-text"); + expect(EMBEDDING_MODELS["ollama"]).toHaveProperty("mxbai-embed-large"); }); it("should have correct cost for free providers", () => { - 
expect(EMBEDDING_MODELS["github-copilot/text-embedding-3-small"].costPer1MTokens).toBe(0); - expect(EMBEDDING_MODELS["google/text-embedding-004"].costPer1MTokens).toBe(0); - expect(EMBEDDING_MODELS["ollama/nomic-embed-text"].costPer1MTokens).toBe(0); + expect(EMBEDDING_MODELS["github-copilot"]["text-embedding-3-small"].costPer1MTokens).toBe(0); + expect(EMBEDDING_MODELS["ollama"]["nomic-embed-text"].costPer1MTokens).toBe(0); + expect(EMBEDDING_MODELS["ollama"]["mxbai-embed-large"].costPer1MTokens).toBe(0); }); it("should have non-zero cost for paid providers", () => { - expect(EMBEDDING_MODELS["openai/text-embedding-3-small"].costPer1MTokens).toBeGreaterThan(0); - expect(EMBEDDING_MODELS["openai/text-embedding-3-large"].costPer1MTokens).toBeGreaterThan(0); + expect(EMBEDDING_MODELS["openai"]["text-embedding-3-small"].costPer1MTokens).toBeGreaterThan(0); + expect(EMBEDDING_MODELS["openai"]["text-embedding-3-large"].costPer1MTokens).toBeGreaterThan(0); + expect(EMBEDDING_MODELS["google"]["text-embedding-005"].costPer1MTokens).toBeGreaterThan(0); + expect(EMBEDDING_MODELS["google"]["gemini-embedding-001"].costPer1MTokens).toBeGreaterThan(0); + }); + + it("should have taskAble property on google models", () => { + expect(EMBEDDING_MODELS["google"]["text-embedding-005"].taskAble).toBe(false); + expect(EMBEDDING_MODELS["google"]["gemini-embedding-001"].taskAble).toBe(true); + }); + + it("should have valid dimensions for all models", () => { + for (const [_provider, models] of Object.entries(EMBEDDING_MODELS)) { + for (const [_modelName, info] of Object.entries(models)) { + expect(info.dimensions).toBeGreaterThan(0); + expect(info.maxTokens).toBeGreaterThan(0); + } + } + }); + }); + + describe("DEFAULT_PROVIDER_MODELS", () => { + it("should reference models that exist in EMBEDDING_MODELS", () => { + for (const [provider, model] of Object.entries(DEFAULT_PROVIDER_MODELS)) { + const providerModels = EMBEDDING_MODELS[provider as keyof typeof EMBEDDING_MODELS]; + 
expect(providerModels).toBeDefined(); + expect(providerModels).toHaveProperty(model); + } + }); + + it("should have an entry for every provider in EMBEDDING_MODELS", () => { + const providers = Object.keys(EMBEDDING_MODELS); + const defaultProviders = Object.keys(DEFAULT_PROVIDER_MODELS); + expect(defaultProviders.sort()).toEqual(providers.sort()); }); }); }); diff --git a/tests/cost.test.ts b/tests/cost.test.ts index e252fc1..654ade4 100644 --- a/tests/cost.test.ts +++ b/tests/cost.test.ts @@ -106,12 +106,12 @@ describe("cost utilities", () => { describe("estimateCost", () => { it("should calculate cost for free provider", () => { const modelInfo = { - provider: "github-copilot" as const, + provider: "github-copilot", model: "text-embedding-3-small", dimensions: 1536, maxTokens: 8191, costPer1MTokens: 0, - }; + } as const; const cost = estimateCost(1000000, modelInfo); expect(cost).toBe(0); @@ -119,12 +119,12 @@ describe("cost utilities", () => { it("should calculate cost for paid provider", () => { const modelInfo = { - provider: "openai" as const, + provider: "openai", model: "text-embedding-3-small", dimensions: 1536, maxTokens: 8191, costPer1MTokens: 0.02, - }; + } as const; const cost = estimateCost(1000000, modelInfo); expect(cost).toBe(0.02); @@ -132,12 +132,12 @@ describe("cost utilities", () => { it("should calculate proportional cost", () => { const modelInfo = { - provider: "openai" as const, + provider: "openai", model: "text-embedding-3-small", dimensions: 1536, maxTokens: 8191, costPer1MTokens: 0.02, - }; + } as const; const cost = estimateCost(500000, modelInfo); expect(cost).toBe(0.01); @@ -145,12 +145,12 @@ describe("cost utilities", () => { it("should handle zero tokens", () => { const modelInfo = { - provider: "openai" as const, + provider: "openai", model: "text-embedding-3-small", dimensions: 1536, maxTokens: 8191, costPer1MTokens: 0.02, - }; + } as const; const cost = estimateCost(0, modelInfo); expect(cost).toBe(0); diff --git 
a/tests/embeddings.test.ts b/tests/embeddings.test.ts index b49ef3f..ee0af10 100644 --- a/tests/embeddings.test.ts +++ b/tests/embeddings.test.ts @@ -19,8 +19,10 @@ describe("embeddings detector", () => { expect(getProviderDisplayName("ollama")).toBe("Ollama (Local)"); }); - it("should return the provider name as-is for auto", () => { - expect(getProviderDisplayName("auto")).toBe("auto"); + it("should return the provider name as-is for unknown provider (default branch)", () => { + // "auto" is no longer a valid EmbeddingProvider, but the default branch + // still returns the input string for forward-compatibility + expect(getProviderDisplayName("some-future-provider" as never)).toBe("some-future-provider"); }); }); }); diff --git a/tests/tools-utils.test.ts b/tests/tools-utils.test.ts new file mode 100644 index 0000000..1f41e10 --- /dev/null +++ b/tests/tools-utils.test.ts @@ -0,0 +1,605 @@ +import { describe, it, expect } from "vitest"; +import { + formatIndexStats, + formatStatus, + formatProgressTitle, + calculatePercentage, + formatCodebasePeek, + formatHealthCheck, + formatLogs, + formatSearchResults, +} from "../src/tools/utils.js"; +import type { IndexStats, IndexProgress, SearchResult, HealthCheckResult, StatusResult } from "../src/indexer/index.js"; +import type { LogEntry } from "../src/utils/logger.js"; + +function createBaseStats(overrides: Partial = {}): IndexStats { + return { + totalFiles: 0, + totalChunks: 0, + indexedChunks: 0, + failedChunks: 0, + tokensUsed: 0, + durationMs: 0, + existingChunks: 0, + removedChunks: 0, + skippedFiles: [], + parseFailures: [], + ...overrides, + }; +} + +describe("tools utils", () => { + describe("formatIndexStats", () => { + it("should show up-to-date message when nothing changed", () => { + const stats = createBaseStats({ totalFiles: 50, existingChunks: 200 }); + const result = formatIndexStats(stats); + + expect(result).toContain("50 files processed"); + expect(result).toContain("200 code chunks already up to 
date"); + }); + + it("should show removal message when only chunks removed", () => { + const stats = createBaseStats({ totalFiles: 10, removedChunks: 5, existingChunks: 15 }); + const result = formatIndexStats(stats); + + expect(result).toContain("removed 5 stale chunks"); + expect(result).toContain("15 chunks remain"); + }); + + it("should show new chunks embedded", () => { + const stats = createBaseStats({ + totalFiles: 20, + indexedChunks: 30, + tokensUsed: 5000, + durationMs: 2500, + }); + const result = formatIndexStats(stats); + + expect(result).toContain("30 new chunks embedded"); + expect(result).toContain("5,000"); + expect(result).toContain("2.5s"); + }); + + it("should show existing chunks skipped alongside new chunks", () => { + const stats = createBaseStats({ totalFiles: 20, indexedChunks: 10, existingChunks: 40, tokensUsed: 1000, durationMs: 1000 }); + const result = formatIndexStats(stats); + + expect(result).toContain("10 new chunks embedded"); + expect(result).toContain("40 unchanged chunks skipped"); + }); + + it("should show removed chunks when new chunks were also embedded", () => { + const stats = createBaseStats({ totalFiles: 20, indexedChunks: 5, removedChunks: 3, tokensUsed: 500, durationMs: 500 }); + const result = formatIndexStats(stats); + + expect(result).toContain("Removed 3 stale chunks"); + }); + + it("should show failed chunks", () => { + const stats = createBaseStats({ totalFiles: 10, indexedChunks: 5, failedChunks: 2, tokensUsed: 500, durationMs: 500 }); + const result = formatIndexStats(stats); + + expect(result).toContain("Failed: 2 chunks"); + }); + + it("should not include verbose details by default", () => { + const stats = createBaseStats({ + totalFiles: 10, + indexedChunks: 5, + tokensUsed: 500, + durationMs: 500, + skippedFiles: [{ path: "big.js", reason: "too_large" }], + parseFailures: ["empty.ts"], + }); + const result = formatIndexStats(stats); + + expect(result).not.toContain("Skipped files"); + 
expect(result).not.toContain("big.js"); + expect(result).not.toContain("no extractable chunks"); + }); + + it("should include verbose skipped file details", () => { + const stats = createBaseStats({ + totalFiles: 10, + indexedChunks: 5, + tokensUsed: 500, + durationMs: 500, + skippedFiles: [ + { path: "big.js", reason: "too_large" }, + { path: "vendor.js", reason: "excluded" }, + { path: ".env", reason: "gitignore" }, + ], + }); + const result = formatIndexStats(stats, true); + + expect(result).toContain("Skipped files: 3"); + expect(result).toContain("Too large (1)"); + expect(result).toContain("big.js"); + expect(result).toContain("Excluded (1)"); + expect(result).toContain("vendor.js"); + expect(result).toContain("Gitignored (1)"); + expect(result).toContain(".env"); + }); + + it("should include verbose parse failures", () => { + const stats = createBaseStats({ + totalFiles: 5, + indexedChunks: 3, + tokensUsed: 300, + durationMs: 300, + parseFailures: ["empty.ts", "broken.js"], + }); + const result = formatIndexStats(stats, true); + + expect(result).toContain("no extractable chunks (2)"); + expect(result).toContain("empty.ts"); + expect(result).toContain("broken.js"); + }); + }); + + describe("formatStatus", () => { + it("should return not-indexed message when not indexed", () => { + const status: StatusResult = { + indexed: false, + vectorCount: 0, + provider: "openai", + model: "text-embedding-3-small", + indexPath: "/tmp/index", + currentBranch: "default", + baseBranch: "default", + compatibility: null, + }; + const result = formatStatus(status); + + expect(result).toContain("not indexed"); + expect(result).toContain("Run index_codebase"); + }); + + it("should show basic status for indexed codebase on default branch", () => { + const status: StatusResult = { + indexed: true, + vectorCount: 500, + provider: "openai", + model: "text-embedding-3-small", + indexPath: "/tmp/index", + currentBranch: "default", + baseBranch: "default", + compatibility: { compatible: 
true }, + }; + const result = formatStatus(status); + + expect(result).toContain("500"); + expect(result).toContain("openai"); + expect(result).toContain("text-embedding-3-small"); + expect(result).toContain("/tmp/index"); + expect(result).not.toContain("Current branch"); + expect(result).toContain("compatible"); + }); + + it("should show branch info when not on default branch", () => { + const status: StatusResult = { + indexed: true, + vectorCount: 100, + provider: "github-copilot", + model: "text-embedding-3-small", + indexPath: "/tmp/index", + currentBranch: "feature-x", + baseBranch: "main", + compatibility: { compatible: true }, + }; + const result = formatStatus(status); + + expect(result).toContain("Current branch: feature-x"); + expect(result).toContain("Base branch: main"); + }); + + it("should show compatibility warning when incompatible", () => { + const status: StatusResult = { + indexed: true, + vectorCount: 100, + provider: "openai", + model: "text-embedding-3-small", + indexPath: "/tmp/index", + currentBranch: "default", + baseBranch: "default", + compatibility: { + compatible: false, + reason: "Dimension mismatch", + storedMetadata: { + indexVersion: "1", + embeddingProvider: "google", + embeddingModel: "text-embedding-004", + embeddingDimensions: 768, + createdAt: "2025-01-01", + updatedAt: "2025-01-01", + }, + }, + }; + const result = formatStatus(status); + + expect(result).toContain("COMPATIBILITY WARNING"); + expect(result).toContain("Dimension mismatch"); + expect(result).toContain("google/text-embedding-004"); + expect(result).toContain("768D"); + }); + + it("should show no-compatibility-info message when compatibility is null", () => { + const status: StatusResult = { + indexed: true, + vectorCount: 100, + provider: "openai", + model: "text-embedding-3-small", + indexPath: "/tmp/index", + currentBranch: "default", + baseBranch: "default", + compatibility: null, + }; + const result = formatStatus(status); + + expect(result).toContain("No 
compatibility information found"); + }); + }); + + describe("formatProgressTitle", () => { + it("should format scanning phase", () => { + expect(formatProgressTitle({ phase: "scanning", filesProcessed: 0, totalFiles: 0, chunksProcessed: 0, totalChunks: 0 })).toBe("Scanning files..."); + }); + + it("should format parsing phase with counts", () => { + expect(formatProgressTitle({ phase: "parsing", filesProcessed: 5, totalFiles: 20, chunksProcessed: 0, totalChunks: 0 })).toBe("Parsing: 5/20 files"); + }); + + it("should format embedding phase with counts", () => { + expect(formatProgressTitle({ phase: "embedding", filesProcessed: 20, totalFiles: 20, chunksProcessed: 30, totalChunks: 100 })).toBe("Embedding: 30/100 chunks"); + }); + + it("should format storing phase", () => { + expect(formatProgressTitle({ phase: "storing", filesProcessed: 20, totalFiles: 20, chunksProcessed: 100, totalChunks: 100 })).toBe("Storing index..."); + }); + + it("should format complete phase", () => { + expect(formatProgressTitle({ phase: "complete", filesProcessed: 20, totalFiles: 20, chunksProcessed: 100, totalChunks: 100 })).toBe("Indexing complete"); + }); + }); + + describe("calculatePercentage", () => { + const progress = (phase: IndexProgress["phase"], opts: Partial = {}): IndexProgress => ({ + phase, + filesProcessed: 0, + totalFiles: 0, + chunksProcessed: 0, + totalChunks: 0, + ...opts, + }); + + it("should return 0 for scanning", () => { + expect(calculatePercentage(progress("scanning"))).toBe(0); + }); + + it("should return 100 for complete", () => { + expect(calculatePercentage(progress("complete"))).toBe(100); + }); + + it("should return 5 for parsing with zero total files", () => { + expect(calculatePercentage(progress("parsing", { totalFiles: 0 }))).toBe(5); + }); + + it("should calculate parsing percentage in 5-20 range", () => { + const result = calculatePercentage(progress("parsing", { filesProcessed: 5, totalFiles: 10 })); + expect(result).toBeGreaterThanOrEqual(5); + 
expect(result).toBeLessThanOrEqual(20); + }); + + it("should return 20 at end of parsing", () => { + expect(calculatePercentage(progress("parsing", { filesProcessed: 10, totalFiles: 10 }))).toBe(20); + }); + + it("should return 20 for embedding with zero total chunks", () => { + expect(calculatePercentage(progress("embedding", { totalChunks: 0 }))).toBe(20); + }); + + it("should calculate embedding percentage in 20-90 range", () => { + const result = calculatePercentage(progress("embedding", { chunksProcessed: 50, totalChunks: 100 })); + expect(result).toBeGreaterThanOrEqual(20); + expect(result).toBeLessThanOrEqual(90); + }); + + it("should return 90 at end of embedding", () => { + expect(calculatePercentage(progress("embedding", { chunksProcessed: 100, totalChunks: 100 }))).toBe(90); + }); + + it("should return 95 for storing", () => { + expect(calculatePercentage(progress("storing"))).toBe(95); + }); + }); + + describe("formatCodebasePeek", () => { + it("should return empty message for no results", () => { + const result = formatCodebasePeek([], "test query"); + + expect(result).toContain("No matching code found"); + }); + + it("should format results with names", () => { + const results: SearchResult[] = [{ + filePath: "src/index.ts", + startLine: 10, + endLine: 20, + content: "", + score: 0.85, + chunkType: "function", + name: "initialize", + }]; + const result = formatCodebasePeek(results, "init function"); + + expect(result).toContain("1 locations"); + expect(result).toContain('"initialize"'); + expect(result).toContain("src/index.ts:10-20"); + expect(result).toContain("0.85"); + expect(result).toContain("function"); + expect(result).toContain("Use Read tool"); + }); + + it("should format results without names as anonymous", () => { + const results: SearchResult[] = [{ + filePath: "src/utils.ts", + startLine: 1, + endLine: 5, + content: "", + score: 0.70, + chunkType: "other", + }]; + const result = formatCodebasePeek(results, "utils"); + + 
expect(result).toContain("(anonymous)"); + }); + + it("should include query in output", () => { + const results: SearchResult[] = [{ + filePath: "a.ts", + startLine: 1, + endLine: 2, + content: "", + score: 0.5, + chunkType: "function", + name: "foo", + }]; + const result = formatCodebasePeek(results, "my search query"); + + expect(result).toContain('"my search query"'); + }); + }); + + describe("formatHealthCheck", () => { + it("should return healthy message when nothing to clean", () => { + const result = formatHealthCheck({ + removed: 0, + filePaths: [], + gcOrphanEmbeddings: 0, + gcOrphanChunks: 0, + }); + + expect(result).toBe("Index is healthy. No stale entries found."); + }); + + it("should show removed stale entries", () => { + const result = formatHealthCheck({ + removed: 5, + filePaths: ["src/old.ts", "src/deleted.ts"], + gcOrphanEmbeddings: 0, + gcOrphanChunks: 0, + }); + + expect(result).toContain("Removed stale entries: 5"); + expect(result).toContain("src/old.ts"); + expect(result).toContain("src/deleted.ts"); + }); + + it("should show orphan embeddings", () => { + const result = formatHealthCheck({ + removed: 0, + filePaths: [], + gcOrphanEmbeddings: 10, + gcOrphanChunks: 0, + }); + + expect(result).toContain("orphan embeddings: 10"); + }); + + it("should show orphan chunks", () => { + const result = formatHealthCheck({ + removed: 0, + filePaths: [], + gcOrphanEmbeddings: 0, + gcOrphanChunks: 3, + }); + + expect(result).toContain("orphan chunks: 3"); + }); + + it("should show all fields when all have values", () => { + const result = formatHealthCheck({ + removed: 2, + filePaths: ["a.ts"], + gcOrphanEmbeddings: 5, + gcOrphanChunks: 3, + }); + + expect(result).toContain("Removed stale entries: 2"); + expect(result).toContain("orphan embeddings: 5"); + expect(result).toContain("orphan chunks: 3"); + expect(result).toContain("a.ts"); + }); + }); + + describe("formatLogs", () => { + it("should return empty message for no logs", () => { + const result = 
formatLogs([]); + + expect(result).toContain("No logs recorded yet"); + }); + + it("should format log entries with timestamp, level, category, and message", () => { + const logs: LogEntry[] = [{ + timestamp: "2025-01-15T10:00:00Z", + level: "info", + category: "search", + message: "Query completed", + }]; + const result = formatLogs(logs); + + expect(result).toContain("[2025-01-15T10:00:00Z]"); + expect(result).toContain("[INFO]"); + expect(result).toContain("[search]"); + expect(result).toContain("Query completed"); + }); + + it("should include data as JSON when present", () => { + const logs: LogEntry[] = [{ + timestamp: "2025-01-15T10:00:00Z", + level: "debug", + category: "embedding", + message: "Batch sent", + data: { batchSize: 10, tokensUsed: 500 }, + }]; + const result = formatLogs(logs); + + expect(result).toContain("[DEBUG]"); + expect(result).toContain('"batchSize":10'); + expect(result).toContain('"tokensUsed":500'); + }); + + it("should format multiple log entries on separate lines", () => { + const logs: LogEntry[] = [ + { timestamp: "T1", level: "info", category: "search", message: "First" }, + { timestamp: "T2", level: "warn", category: "gc", message: "Second" }, + ]; + const result = formatLogs(logs); + const lines = result.split("\n"); + + expect(lines).toHaveLength(2); + expect(lines[0]).toContain("First"); + expect(lines[1]).toContain("Second"); + }); + }); + + describe("formatSearchResults", () => { + it("should format results with names", () => { + const results: SearchResult[] = [{ + filePath: "src/auth.ts", + startLine: 10, + endLine: 25, + content: "function validateToken() {\n return true;\n}", + score: 0.92, + chunkType: "function", + name: "validateToken", + }]; + const result = formatSearchResults(results); + + expect(result).toContain('[1] function "validateToken" in src/auth.ts:10-25'); + expect(result).toContain("92.0%"); + expect(result).toContain("```"); + expect(result).toContain("function validateToken()"); + }); + + it("should 
format results without names", () => { + const results: SearchResult[] = [{ + filePath: "src/config.ts", + startLine: 1, + endLine: 3, + content: "const x = 1;", + score: 0.50, + chunkType: "other", + }]; + const result = formatSearchResults(results); + + expect(result).toContain("[1] other in src/config.ts:1-3"); + expect(result).not.toContain('"null"'); + }); + + it("should truncate content longer than 30 lines", () => { + const longContent = Array.from({ length: 50 }, (_, i) => `line ${i + 1}`).join("\n"); + const results: SearchResult[] = [{ + filePath: "src/big.ts", + startLine: 1, + endLine: 50, + content: longContent, + score: 0.80, + chunkType: "function", + name: "bigFunction", + }]; + const result = formatSearchResults(results); + + expect(result).toContain("line 1"); + expect(result).toContain("line 30"); + expect(result).not.toContain("line 31"); + expect(result).toContain("20 more lines"); + }); + + it("should not truncate content with exactly 30 lines", () => { + const content = Array.from({ length: 30 }, (_, i) => `line ${i + 1}`).join("\n"); + const results: SearchResult[] = [{ + filePath: "src/exact.ts", + startLine: 1, + endLine: 30, + content, + score: 0.75, + chunkType: "function", + name: "exactFunction", + }]; + const result = formatSearchResults(results); + + expect(result).toContain("line 30"); + expect(result).not.toContain("more lines"); + }); + + it("should format multiple results with numbered indices", () => { + const results: SearchResult[] = [ + { filePath: "a.ts", startLine: 1, endLine: 2, content: "a", score: 0.9, chunkType: "function", name: "first" }, + { filePath: "b.ts", startLine: 3, endLine: 4, content: "b", score: 0.8, chunkType: "class", name: "second" }, + { filePath: "c.ts", startLine: 5, endLine: 6, content: "c", score: 0.7, chunkType: "method", name: "third" }, + ]; + const result = formatSearchResults(results); + + expect(result).toContain("[1]"); + expect(result).toContain("[2]"); + expect(result).toContain("[3]"); + 
expect(result).toContain('"first"'); + expect(result).toContain('"second"'); + expect(result).toContain('"third"'); + }); + + it("should use raw score format when scoreFormat is 'score'", () => { + const results: SearchResult[] = [{ + filePath: "src/auth.ts", + startLine: 10, + endLine: 25, + content: "function validateToken() {\n return true;\n}", + score: 0.85, + chunkType: "function", + name: "validateToken", + }]; + const result = formatSearchResults(results, "score"); + + expect(result).toContain("(score: 0.85)"); + expect(result).not.toContain("similarity"); + expect(result).not.toContain("%"); + }); + + it("should use similarity percentage format when scoreFormat is 'similarity'", () => { + const results: SearchResult[] = [{ + filePath: "src/auth.ts", + startLine: 10, + endLine: 25, + content: "function validateToken() {\n return true;\n}", + score: 0.92, + chunkType: "function", + name: "validateToken", + }]; + const result = formatSearchResults(results, "similarity"); + + expect(result).toContain("(similarity: 92.0%)"); + expect(result).not.toContain("(score:"); + }); + }); +}); diff --git a/tests/watcher.test.ts b/tests/watcher.test.ts index b70c0f0..aed82bd 100644 --- a/tests/watcher.test.ts +++ b/tests/watcher.test.ts @@ -7,7 +7,7 @@ import { ParsedCodebaseIndexConfig } from "../src/config/schema.js"; const createTestConfig = (overrides: Partial = {}): ParsedCodebaseIndexConfig => ({ embeddingProvider: "auto", - embeddingModel: "auto", + embeddingModel: undefined, scope: "project", include: ["**/*.ts", "**/*.js"], exclude: [], @@ -22,6 +22,7 @@ const createTestConfig = (overrides: Partial = {}): P autoGc: true, gcIntervalDays: 7, gcOrphanThreshold: 100, + requireProjectMarker: true, }, search: { maxResults: 20, @@ -30,6 +31,16 @@ const createTestConfig = (overrides: Partial = {}): P hybridWeight: 0.5, contextLines: 0, }, + debug: { + enabled: false, + logLevel: "info", + logSearch: true, + logEmbedding: true, + logCache: true, + logGc: true, + 
logBranch: true, + metrics: true, + }, ...overrides, });