Skip to content
This repository was archived by the owner on Apr 29, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions backend/endpoints/v1/documents/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ const { validEmbedding } = require("../../../utils/tokenizer");
const { documentDeletedJob } = require("../../../utils/jobs/documentDeleteJob");
const { cloneDocumentJob } = require("../../../utils/jobs/cloneDocumentJob");
const { selectConnector } = require("../../../utils/vectordatabases/providers");
const {
documentEmbeddingSearch,
} = require("../../../utils/search/documentEmbeddings");

process.env.NODE_ENV === "development"
? require("dotenv").config({ path: `.env.${process.env.NODE_ENV}` })
Expand Down Expand Up @@ -325,6 +328,41 @@ function documentEndpoints(app) {
}
}
);

// GET /v1/documents/:documentId/search-embeddings?method=<method>&q=<query>
// Runs an embedding search against a single workspace document.
// Responses:
//   403 - no user could be resolved from the session.
//   200 - `{ fragments, error }`; `fragments` is empty and `error` is set when the
//         document cannot be found or the search itself reports a problem.
//   500 - an unexpected exception was thrown while handling the request.
// NOTE(review): nothing here checks that the resolved user may access the requested
// document - confirm whether `validSessionForUser` already covers that.
app.get(
  "/v1/documents/:documentId/search-embeddings",
  [validSessionForUser],
  async function (request, response) {
    try {
      const { documentId } = request.params;
      const { method, q: query } = request.query;
      const user = await userFromSession(request);
      if (!user) {
        response.sendStatus(403).end();
        return;
      }

      // The route param is untrusted text that is interpolated into the clause string
      // given to WorkspaceDocument.get, so only a plain run of digits is let through.
      // Anything else is treated exactly like an unknown document.
      const document = /^\d+$/.test(String(documentId))
        ? await WorkspaceDocument.get(`id = ${documentId}`)
        : null;
      if (!document) {
        response.status(200).json({
          fragments: [],
          error: "No document found.",
        });
        return;
      }

      const { fragments, error } = await documentEmbeddingSearch(
        document,
        method,
        query
      );
      response.status(200).json({ fragments, error });
    } catch (e) {
      console.log(e.message, e);
      response.sendStatus(500).end();
    }
  }
);
}

module.exports = { documentEndpoints };
40 changes: 40 additions & 0 deletions backend/utils/search/documentEmbeddings/exactText.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
const { fuzzyMatch } = require("..");
const { DocumentVectors } = require("../../../models/documentVectors");
const { WorkspaceDocument } = require("../../../models/workspaceDocument");
const { readJSON } = require("../../storage");

// Reads the cached vector file for `wsDoc` and returns the `vectorDbId` of every chunk
// that has at least one metadata value fuzzy-matching `query`.
// Each id is reported once, even when several metadata values of the same chunk match.
// Any failure (unreadable file, malformed data) is logged and yields an empty list.
async function findTextInDoc(wsDoc, query) {
  try {
    const data = await readJSON(WorkspaceDocument.vectorFilepath(wsDoc));
    const fragmentIds = new Set();

    for (const chunk of data) {
      const metadata = chunk?.metadata;
      // Skip chunks without usable metadata rather than letting one bad chunk throw
      // and discard every match found so far.
      if (metadata === null || typeof metadata !== "object") continue;

      const hasMatch = Object.values(metadata).some((value) =>
        fuzzyMatch(query, String(value))
      );
      if (hasMatch) fragmentIds.add(chunk.vectorDbId);
    }

    return Array.from(fragmentIds);
  } catch (e) {
    console.error(e);
    return [];
  }
}

// Text search over a document's chunk metadata.
// Resolves the matching vector ids via findTextInDoc, then loads the corresponding
// DocumentVectors rows (second argument to `where` is 100, presumably a row limit).
// Always resolves to `{ fragments, error }` with `error` set to null.
async function exactTextSearch(document, query) {
  const vectorIds = await findTextInDoc(document, query);
  if (vectorIds.length === 0) {
    return { fragments: [], error: null };
  }

  const quotedIds = [];
  for (const id of vectorIds) {
    quotedIds.push(`'${id}'`);
  }

  const clause = `vectorId IN (${quotedIds.join(",")})`;
  const fragments = await DocumentVectors.where(clause, 100);
  return { fragments, error: null };
}

module.exports = {
exactTextSearch,
};
32 changes: 32 additions & 0 deletions backend/utils/search/documentEmbeddings/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
const { Telemetry } = require("../../../models/telemetry");
const { exactTextSearch } = require("./exactText");
const { metadataSearch } = require("./metadata");
const { semanticSearch } = require("./semantic");
const { vectorIdSearch } = require("./vectorId");

// Dispatch table mapping the search method name to the function that implements it.
// Each implementation is invoked by documentEmbeddingSearch as `(document, query)`.
const SEARCH_METHODS = {
semantic: semanticSearch,
exactText: exactTextSearch,
metadata: metadataSearch,
vectorId: vectorIdSearch,
};

// True only when `method` is strictly equal to one of the SEARCH_METHODS keys.
function validSearchMethod(method) {
  const supportedMethods = Object.keys(SEARCH_METHODS);
  return supportedMethods.indexOf(method) !== -1;
}

// Entry point for searching a document's embeddings.
//   document - the workspace document record to search within.
//   method   - one of the SEARCH_METHODS keys.
//   query    - URI-component-encoded search text; it is decoded before being handed
//              to the selected search implementation.
// Never throws: every failure is logged and reported as `{ fragments: [], error }`.
async function documentEmbeddingSearch(document, method, query) {
  try {
    if (!validSearchMethod(method))
      throw new Error(`Invalid search method ${method}`);
    // decodeURIComponent stringifies its argument, so a missing query would otherwise
    // be searched for as the literal text "undefined".
    if (typeof query !== "string") throw new Error("Invalid search query");

    await Telemetry.sendTelemetry("search_executed", { searchMethod: method });
    return await SEARCH_METHODS[method](document, decodeURIComponent(query));
  } catch (e) {
    console.error("Workspace document search", e.message);
    return { fragments: [], error: e.message };
  }
}

module.exports = {
documentEmbeddingSearch,
};
43 changes: 43 additions & 0 deletions backend/utils/search/documentEmbeddings/metadata.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
const { fuzzyMatch } = require("..");
const { DocumentVectors } = require("../../../models/documentVectors");
const { WorkspaceDocument } = require("../../../models/workspaceDocument");
const { readJSON } = require("../../storage");

// Reads the cached vector file for `wsDoc` and returns the `vectorDbId` of every chunk
// that has a metadata entry whose key fuzzy-matches the key part of `query` and whose
// value fuzzy-matches the value part.
// `query` has the form "key:value". Only the FIRST colon separates key from value, so
// values that themselves contain colons (URLs, timestamps) are kept intact.
// A query without a colon matches nothing. Each id is reported at most once.
// Any failure is logged and yields an empty list.
async function findKeyValueInDoc(wsDoc, query) {
  try {
    const separatorAt = query.indexOf(":");
    if (separatorAt === -1) return [];
    const keyToFind = query.slice(0, separatorAt);
    const valueToFind = query.slice(separatorAt + 1);

    const data = await readJSON(WorkspaceDocument.vectorFilepath(wsDoc));
    const fragmentIds = new Set();

    for (const chunk of data) {
      const metadata = chunk?.metadata;
      // Skip chunks without usable metadata rather than letting one bad chunk throw
      // and discard every match found so far.
      if (metadata === null || typeof metadata !== "object") continue;

      const hasMatch = Object.entries(metadata).some(
        ([key, value]) =>
          fuzzyMatch(keyToFind, key) && fuzzyMatch(valueToFind, String(value))
      );
      if (hasMatch) fragmentIds.add(chunk.vectorDbId);
    }

    return Array.from(fragmentIds);
  } catch (e) {
    console.error(e);
    return [];
  }
}

// "key:value" metadata search for a document.
// Uses findKeyValueInDoc to collect matching vector ids and then fetches the matching
// DocumentVectors rows (second argument to `where` is 100, presumably a row limit).
// Always resolves to `{ fragments, error }` with `error` set to null.
async function metadataSearch(document, query) {
  const ids = await findKeyValueInDoc(document, query);
  const nothingMatched = ids.length === 0;
  if (nothingMatched) return { fragments: [], error: null };

  const idList = ids.map((vid) => "'" + vid + "'").join(",");
  const whereClause = "vectorId IN (" + idList + ")";
  return {
    fragments: await DocumentVectors.where(whereClause, 100),
    error: null,
  };
}

module.exports = {
metadataSearch,
};
53 changes: 53 additions & 0 deletions backend/utils/search/documentEmbeddings/semantic.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
const { DocumentVectors } = require("../../../models/documentVectors");
const {
OrganizationConnection,
} = require("../../../models/organizationConnection");
const {
OrganizationWorkspace,
} = require("../../../models/organizationWorkspace");
const { SystemSettings } = require("../../../models/systemSettings");
const { OpenAi } = require("../../openAi");
const { selectConnector } = require("../../vectordatabases/providers");

// Semantic (vector similarity) search.
// Steps:
//   1. Load the document's workspace and its organization's connection record.
//   2. Read the OpenAI key from system settings and embed `query` with it.
//   3. Ask the selected vector database connector for similar vectors in the
//      workspace (addressed by `workspace.slug`).
//   4. Load the DocumentVectors rows for the returned vector ids (second argument to
//      `where` is 100, presumably a row limit).
// Resolves to `{ fragments, error }`; `error` is a message string when a prerequisite
// is missing or the query could not be embedded, otherwise null.
// NOTE(review): the similarity lookup is keyed on the workspace slug only, so results
// do not look restricted to `document` - confirm that is intended.
async function semanticSearch(document, query) {
  const workspace = await OrganizationWorkspace.get(
    `id = ${document.workspace_id}`
  );
  const connector = await OrganizationConnection.get(
    `organization_id = ${document.organization_id}`
  );
  if (!connector)
    return { fragments: [], error: "No connector found for org." };

  const openAiKey = (await SystemSettings.get(`label = 'open_ai_api_key'`))
    ?.value;
  if (!openAiKey)
    return { fragments: [], error: "No OpenAI key available to embed query." };

  // Without a workspace there is no slug to search; fail with a readable message
  // before spending an embedding call instead of throwing on `workspace.slug` later.
  if (!workspace)
    return { fragments: [], error: "No workspace found for document." };

  const vectorDb = selectConnector(connector);
  const openai = new OpenAi(openAiKey);

  const queryVector = await openai.embedTextChunk(query);
  if (!queryVector) return { fragments: [], error: "Failed to embed query." };

  // Execute Similarity search for vector DB provider so we can find inferred documents.
  const searchResults = await vectorDb.similarityResponse(
    workspace.slug,
    queryVector
  );

  // Same empty-result guard the exactText and metadata searches use: avoids building
  // an `IN ()` clause with nothing in it.
  const vectorIds = searchResults?.vectorIds ?? [];
  if (vectorIds.length === 0) return { fragments: [], error: null };

  // From similarity search we can find all document vector DB items to infer their
  // associated document record.
  const searchString = vectorIds.map((vid) => `'${vid}'`).join(",");
  const fragments = await DocumentVectors.where(
    `vectorId IN (${searchString})`,
    100
  );
  return { fragments, error: null };
}

module.exports = {
semanticSearch,
};
12 changes: 12 additions & 0 deletions backend/utils/search/documentEmbeddings/vectorId.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
const { DocumentVectors } = require("../../../models/documentVectors");

// Looks up a single DocumentVectors record by its vector id.
//   _document - unused; accepted so the signature matches the other search methods.
//   query     - the vector id to look for (user supplied).
// Resolves to `{ fragments: [record], error: null }` on a hit, otherwise to an empty
// `fragments` list with an explanatory `error`.
// NOTE(review): because `_document` is ignored, the lookup is not limited to the
// document named in the request - confirm whether it should be.
async function vectorIdSearch(_document, query) {
  // `query` comes from the request and is embedded in a quoted literal inside the
  // clause string, so single quotes are doubled to keep the value inside that literal.
  const safeId = String(query).replace(/'/g, "''");
  const documentVector = await DocumentVectors.get(`vectorId = '${safeId}'`);
  if (!documentVector)
    return { fragments: [], error: "No document vector found with that id." };
  return { fragments: [documentVector], error: null };
}

module.exports = {
vectorIdSearch,
};
11 changes: 11 additions & 0 deletions backend/utils/search/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Loose, case-sensitive subsequence matcher: returns true when every character of
// `pattern` appears in `str` in the same order, with any run of characters (other than
// line breaks) allowed in between. An empty pattern matches any string.
// Dirty, but works fast for most cases. Wont be perfect but also not something we should
// rely heavily on for exact text searching.
function fuzzyMatch(pattern, str) {
  // Escape regex metacharacters so each pattern character is matched literally.
  // Unescaped, input such as "(" or "c++" makes the RegExp constructor throw and "."
  // silently matches any character.
  const escapedChars = pattern
    .split("")
    .map((char) => char.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
  const re = new RegExp(".*" + escapedChars.join(".*") + ".*");
  return re.test(str);
}

module.exports = {
fuzzyMatch,
};
9 changes: 1 addition & 8 deletions backend/utils/search/workspaceDocuments/exactText.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,7 @@
const { fuzzyMatch } = require("..");
const { WorkspaceDocument } = require("../../../models/workspaceDocument");
const { readJSON } = require("../../storage");

// Loose, case-sensitive subsequence matcher: returns true when every character of
// `pattern` appears in `str` in the same order, with any run of characters (other than
// line breaks) allowed in between. An empty pattern matches any string.
// Dirty, but works fast for most cases. Wont be perfect but also not something we should
// rely heavily on for exact text searching.
function fuzzyMatch(pattern, str) {
  // Escape regex metacharacters so each pattern character is matched literally.
  // Unescaped, input such as "(" or "c++" makes the RegExp constructor throw and "."
  // silently matches any character.
  const escapedChars = pattern
    .split("")
    .map((char) => char.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
  const re = new RegExp(".*" + escapedChars.join(".*") + ".*");
  return re.test(str);
}

async function findTextInDoc(wsDoc, query) {
try {
const data = await readJSON(WorkspaceDocument.vectorFilepath(wsDoc));
Expand Down
9 changes: 1 addition & 8 deletions backend/utils/search/workspaceDocuments/metadata.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,7 @@
const { fuzzyMatch } = require("..");
const { WorkspaceDocument } = require("../../../models/workspaceDocument");
const { readJSON } = require("../../storage");

// Loose, case-sensitive subsequence matcher: returns true when every character of
// `pattern` appears in `str` in the same order, with any run of characters (other than
// line breaks) allowed in between. An empty pattern matches any string.
// Dirty, but works fast for most cases. Wont be perfect but also not something we should
// rely heavily on for exact text searching.
function fuzzyMatch(pattern, str) {
  // Escape regex metacharacters so each pattern character is matched literally.
  // Unescaped, input such as "(" or "c++" makes the RegExp constructor throw and "."
  // silently matches any character.
  const escapedChars = pattern
    .split("")
    .map((char) => char.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
  const re = new RegExp(".*" + escapedChars.join(".*") + ".*");
  return re.test(str);
}

async function findKeyValueInDoc(wsDoc, query) {
try {
const data = await readJSON(WorkspaceDocument.vectorFilepath(wsDoc));
Expand Down
22 changes: 22 additions & 0 deletions frontend/src/models/document.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { ISearchTypes } from '../pages/DocumentView/FragmentList/SearchView';
import { API_BASE } from '../utils/constants';
import { baseHeaders } from '../utils/request';

Expand Down Expand Up @@ -106,6 +107,27 @@ const Document = {
return { success: false, error: e.message };
});
},
/**
 * Searches the embeddings of one document through
 * `GET /v1/documents/:documentId/search-embeddings`.
 *
 * @param documentId - id of the document whose embeddings are searched.
 * @param method - search strategy, sent as the `method` query parameter.
 * @param query - search text, sent as the `q` query parameter.
 * @returns the `fragments` array from the response body, or an empty array when the
 * body has none or the request/JSON parsing fails (the failure is logged, not thrown).
 */
searchEmbeddings: async (
  documentId: number,
  method: ISearchTypes,
  query: string
): Promise<object[]> => {
  const searchEndpoint = new URL(
    `${API_BASE}/v1/documents/${documentId}/search-embeddings`
  );
  searchEndpoint.searchParams.append('method', method);
  // `searchParams.append` percent-encodes on its own; the extra encodeURIComponent is
  // kept on purpose because the backend search helper calls decodeURIComponent on the
  // value it receives.
  searchEndpoint.searchParams.append('q', encodeURIComponent(query));

  return fetch(searchEndpoint, {
    method: 'GET',
    headers: baseHeaders(),
  })
    .then((res) => res.json())
    .then((res) => res?.fragments || [])
    .catch((e) => {
      console.error(e.message);
      return [];
    });
},
};

export default Document;
Loading