15 changes: 9 additions & 6 deletions ingesters/__tests__/vectorStoreUtils.test.ts
@@ -87,7 +87,7 @@ describe('findChunksToUpdateAndRemove', () => {

const result = findChunksToUpdateAndRemove(freshChunks, storedChunkHashes);

expect(result.chunksToUpdate).toEqual([
expect(result.contentChanged).toEqual([
{
metadata: {
name: '2',
@@ -113,6 +113,7 @@ describe('findChunksToUpdateAndRemove', () => {
pageContent: 'Some Content 3',
},
]);
expect(result.metadataOnlyChanged).toEqual([]);
expect(result.chunksToRemove).toEqual(['3']);
});

@@ -173,14 +174,15 @@ describe('findChunksToUpdateAndRemove', () => {

const result = findChunksToUpdateAndRemove(freshChunks, storedChunkHashes);

expect(result.chunksToUpdate).toEqual([]);
expect(result.contentChanged).toEqual([]);
expect(result.metadataOnlyChanged).toEqual([]);
expect(result.chunksToRemove).toEqual([]);
});

it('should handle empty inputs correctly', () => {
const result = findChunksToUpdateAndRemove([], []);

expect(result.chunksToUpdate).toEqual([]);
expect(result.contentChanged).toEqual([]);
expect(result.metadataOnlyChanged).toEqual([]);
expect(result.chunksToRemove).toEqual([]);
});

@@ -217,8 +219,8 @@ describe('findChunksToUpdateAndRemove', () => {

const result = findChunksToUpdateAndRemove(freshChunks, storedChunkHashes);

// Should update because metadata changed (sourceLink and title)
expect(result.chunksToUpdate).toEqual([
// Should be metadata-only: content is identical but metadata (sourceLink and title) changed
expect(result.metadataOnlyChanged).toEqual([
{
metadata: {
name: '1',
Expand All @@ -232,6 +234,7 @@ describe('findChunksToUpdateAndRemove', () => {
pageContent: 'Some Content 1',
},
]);
expect(result.contentChanged).toEqual([]);
expect(result.chunksToRemove).toEqual([]);
});
});
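
For orientation before the implementation diffs below: the tests now assert a three-way split instead of a single chunksToUpdate array. A minimal sketch of the new result shape, with the Document and BookChunk types assumed to come from the ingesters package (this interface name is illustrative, not part of the PR):

// Sketch of the return shape exercised by the tests above.
interface ChunkDiffResult {
  contentChanged: Document<BookChunk>[];      // new chunks, or content hash differs: re-embed
  metadataOnlyChanged: Document<BookChunk>[]; // identical content, but a metadata field differs
  chunksToRemove: string[];                   // uniqueIds present in storage but not upstream
}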
47 changes: 47 additions & 0 deletions ingesters/src/db/postgresVectorStore.ts
@@ -357,6 +357,53 @@ export class VectorStore {
}
}

/**
* Update only the metadata (and source column for consistency) for existing documents.
* Does NOT modify content, embedding, or contentHash.
*/
async updateDocumentsMetadata(
documents: DocumentInterface[],
options?: { ids?: string[] },
): Promise<void> {
if (documents.length === 0) return;

logger.info(`Updating metadata for ${documents.length} documents`);

try {
const client = await this.pool.connect();
try {
await client.query('BEGIN');

const updates = documents.map((doc, i) => {
const uniqueId = options?.ids?.[i] || doc.metadata.uniqueId || null;
const source = doc.metadata.source || null;
const query = `
UPDATE ${this.tableName}
SET metadata = $2,
source = $3
WHERE uniqueId = $1
`;
return client.query(query, [
uniqueId,
JSON.stringify(doc.metadata),
source,
]);
});

await Promise.all(updates);
await client.query('COMMIT');
} catch (error) {
await client.query('ROLLBACK');
throw error;
} finally {
client.release();
}
} catch (error) {
logger.error('Error updating document metadata:', error);
throw DatabaseError.handlePgError(error as PgError);
}
}

/**
* Find a specific book chunk by name
* @param name - Name of the book chunk
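A hypothetical call site for the new method, assuming an initialized VectorStore (here `store`) with an open pool and documents whose metadata.uniqueId matches existing rows; this mirrors how updateVectorStore uses it further down:

// Hypothetical usage sketch: refresh metadata for chunks whose content is unchanged.
await store.updateDocumentsMetadata(metadataOnlyChanged, {
  ids: metadataOnlyChanged.map((d) => d.metadata.uniqueId), // optional; falls back to doc.metadata.uniqueId
});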
6 changes: 3 additions & 3 deletions ingesters/src/ingesters/CairoBookIngester.ts
@@ -33,7 +33,7 @@ export class CairoBookIngester extends MarkdownIngester {
chunkOverlap: 512,
baseUrl: 'https://book.cairo-lang.org',
urlSuffix: '.html',
useUrlMapping: false,
useUrlMapping: true,
};

super(config, DocumentSource.CAIRO_BOOK);
@@ -71,7 +71,7 @@ export class CairoBookIngester extends MarkdownIngester {
maxChars: 2048,
minChars: 500,
overlap: 256,
headerLevels: [1, 2], // Split on H1 and H2 headers
headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
preserveCodeBlocks: true,
idPrefix: 'cairo-book',
trim: true,
@@ -97,7 +97,7 @@ export class CairoBookIngester extends MarkdownIngester {
chunkNumber: chunk.meta.chunkNumber, // Already 0-based
contentHash: contentHash,
uniqueId: chunk.meta.uniqueId,
sourceLink: this.config.baseUrl,
sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
source: this.source,
},
});
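The flip to useUrlMapping: true works together with the per-chunk sourceLink fallback above: each chunk can now link to its own page rather than the book root. The mapping helper itself is not part of this diff; a rough illustration of the kind of URL it presumably produces, with all names hypothetical:

// Illustrative only: the real mapping lives in the shared chunker, not in this PR.
function buildSourceLink(baseUrl: string, pagePath: string, urlSuffix: string): string {
  // e.g. ('https://book.cairo-lang.org', 'ch01-getting-started', '.html')
  //   -> 'https://book.cairo-lang.org/ch01-getting-started.html'
  return `${baseUrl}/${pagePath}${urlSuffix}`;
}

The same headerLevels and sourceLink changes are applied to the two ingesters below.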
4 changes: 2 additions & 2 deletions ingesters/src/ingesters/CoreLibDocsIngester.ts
@@ -75,7 +75,7 @@ export class CoreLibDocsIngester extends MarkdownIngester {
maxChars: 2048,
minChars: 500,
overlap: 256,
headerLevels: [1, 2], // Split on H1 and H2 headers
headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
preserveCodeBlocks: true,
idPrefix: 'corelib',
trim: true,
@@ -101,7 +101,7 @@ export class CoreLibDocsIngester extends MarkdownIngester {
chunkNumber: chunk.meta.chunkNumber, // Already 0-based
contentHash: contentHash,
uniqueId: chunk.meta.uniqueId,
sourceLink: this.config.baseUrl,
sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
source: this.source,
},
});
4 changes: 2 additions & 2 deletions ingesters/src/ingesters/OpenZeppelinDocsIngester.ts
@@ -75,7 +75,7 @@ export class OpenZeppelinDocsIngester extends MarkdownIngester {
maxChars: 2048,
minChars: 500,
overlap: 256,
headerLevels: [1, 2], // Split on H1 and H2 headers
headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
preserveCodeBlocks: true,
idPrefix: 'openzeppelin-docs',
trim: true,
@@ -101,7 +101,7 @@ export class OpenZeppelinDocsIngester extends MarkdownIngester {
chunkNumber: chunk.meta.chunkNumber, // Already 0-based
contentHash: contentHash,
uniqueId: chunk.meta.uniqueId,
sourceLink: this.config.baseUrl,
sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
source: this.source,
},
});
83 changes: 56 additions & 27 deletions ingesters/src/utils/vectorStoreUtils.ts
@@ -23,7 +23,8 @@ export function findChunksToUpdateAndRemove(
metadata: BookChunk;
}[],
): {
chunksToUpdate: Document<BookChunk>[];
contentChanged: Document<BookChunk>[];
metadataOnlyChanged: Document<BookChunk>[];
chunksToRemove: string[];
} {
const storedDataMap = new Map(
@@ -33,31 +34,51 @@
freshChunks.map((chunk) => [chunk.metadata.uniqueId, chunk]),
);

// Find chunks that need to be updated (content or metadata has changed)
const chunksToUpdate = freshChunks.filter((chunk) => {
const storedMetadata = storedDataMap.get(chunk.metadata.uniqueId);
if (!storedMetadata) {
// New chunk that doesn't exist in storage
return true;
const contentChanged: Document<BookChunk>[] = [];
const metadataOnlyChanged: Document<BookChunk>[] = [];

for (const fresh of freshChunks) {
const stored = storedDataMap.get(fresh.metadata.uniqueId);
if (!stored) {
// New doc: requires full insert + embedding
contentChanged.push(fresh);
continue;
}

const storedHash = stored.contentHash;
const freshHash = fresh.metadata.contentHash;
if (storedHash !== freshHash) {
// Content changed: re-embed and upsert fully
contentChanged.push(fresh);
continue;
}
// Update if content hash changed or any metadata field changed
for (const key in chunk.metadata) {
if (
storedMetadata[key as keyof BookChunk] !==
chunk.metadata[key as keyof BookChunk]
) {
return true;

// Content is unchanged; check whether any metadata field differs
const keys = new Set<keyof BookChunk>([
...(Object.keys(stored) as (keyof BookChunk)[]),
...(Object.keys(fresh.metadata) as (keyof BookChunk)[]),
]);

let metaDiffers = false;
for (const key of keys) {
// Ignore contentHash here since we already know it's equal
if (key === 'contentHash') continue;
if (stored[key] !== fresh.metadata[key]) {
metaDiffers = true;
break;
}
}
return false;
});
if (metaDiffers) {
metadataOnlyChanged.push(fresh);
}
}

// Find chunks that need to be removed (no longer exist in fresh chunks)
const chunksToRemove = storedChunkHashes
.filter((stored) => !freshChunksMap.has(stored.uniqueId))
.map((stored) => stored.uniqueId);

return { chunksToUpdate, chunksToRemove };
return { contentChanged, metadataOnlyChanged, chunksToRemove };
}

/**
@@ -80,16 +101,18 @@ export async function updateVectorStore(
await vectorStore.getStoredBookPagesMetadata(source);

// Find chunks to update and remove
const { chunksToUpdate, chunksToRemove } = findChunksToUpdateAndRemove(
chunks,
storedChunkHashes,
);
const { contentChanged, metadataOnlyChanged, chunksToRemove } =
findChunksToUpdateAndRemove(chunks, storedChunkHashes);

logger.info(
`Found ${storedChunkHashes.length} stored chunks for source: ${source}. ${chunksToUpdate.length} chunks to update and ${chunksToRemove.length} chunks to remove`,
`Found ${storedChunkHashes.length} stored chunks for source: ${source}. ${contentChanged.length} content changes, ${metadataOnlyChanged.length} metadata-only changes, and ${chunksToRemove.length} removals`,
);

if (chunksToUpdate.length === 0 && chunksToRemove.length === 0) {
if (
contentChanged.length === 0 &&
metadataOnlyChanged.length === 0 &&
chunksToRemove.length === 0
) {
logger.info('No changes to update or remove');
return;
}
@@ -129,13 +152,19 @@ export async function updateVectorStore(
}

// Update chunks that have changed
if (chunksToUpdate.length > 0) {
await vectorStore.addDocuments(chunksToUpdate, {
ids: chunksToUpdate.map((chunk) => chunk.metadata.uniqueId),
if (contentChanged.length > 0) {
await vectorStore.addDocuments(contentChanged, {
ids: contentChanged.map((chunk) => chunk.metadata.uniqueId),
});
}

if (metadataOnlyChanged.length > 0) {
await vectorStore.updateDocumentsMetadata(metadataOnlyChanged, {
ids: metadataOnlyChanged.map((chunk) => chunk.metadata.uniqueId),
});
}

logger.info(
`Updated ${chunksToUpdate.length} chunks and removed ${chunksToRemove.length} chunks for source: ${source}.`,
`Updated ${contentChanged.length} content chunks, ${metadataOnlyChanged.length} metadata-only chunks, and removed ${chunksToRemove.length} chunks for source: ${source}.`,
);
}
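
Putting the pieces together, the sync path now looks roughly like this (a condensed sketch of the function above; logging and error handling omitted, and the removal helper's name is assumed since that hunk is collapsed in this diff):

// Condensed sketch of updateVectorStore's new flow.
const stored = await vectorStore.getStoredBookPagesMetadata(source);
const { contentChanged, metadataOnlyChanged, chunksToRemove } =
  findChunksToUpdateAndRemove(freshChunks, stored);

if (chunksToRemove.length > 0) {
  await vectorStore.removeBookPages(chunksToRemove, source); // helper name assumed; hunk not shown
}
if (contentChanged.length > 0) {
  // Full upsert: content, embedding, and contentHash are all rewritten.
  await vectorStore.addDocuments(contentChanged, {
    ids: contentChanged.map((c) => c.metadata.uniqueId),
  });
}
if (metadataOnlyChanged.length > 0) {
  // Cheap path: only metadata (and source) are touched; no re-embedding.
  await vectorStore.updateDocumentsMetadata(metadataOnlyChanged, {
    ids: metadataOnlyChanged.map((c) => c.metadata.uniqueId),
  });
}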