From 4004cd3a4b31d6b2e9513d59f3ba04d105790a14 Mon Sep 17 00:00:00 2001 From: timothycarambat Date: Wed, 7 Jun 2023 21:29:47 -0700 Subject: [PATCH] Implement Chroma Support --- README.md | 4 +- frontend/src/components/DefaultChat/index.jsx | 51 +-- frontend/src/components/Modals/Keys.jsx | 44 ++- .../src/components/Modals/ManageWorkspace.jsx | 2 +- .../ChatHistory/PromptReply/index.jsx | 1 + package.json | 1 + server/.env.example | 13 +- server/.nvmrc | 2 +- server/endpoints/chat.js | 17 +- server/endpoints/system.js | 56 +-- server/endpoints/workspaces.js | 73 ++-- server/index.js | 62 +-- server/models/documents.js | 110 ++++-- server/models/vectors.js | 72 ++-- server/models/workspace.js | 82 ++-- server/models/workspaceChats.js | 90 +++-- server/package.json | 11 +- server/utils/chats/commands/reset.js | 8 +- server/utils/chats/index.js | 94 +++-- server/utils/chroma/CHROMA_SETUP.md | 24 ++ server/utils/chroma/index.js | 361 ++++++++++++++++++ server/utils/files/index.js | 88 +++-- server/utils/helpers/index.js | 18 + server/utils/http/index.js | 2 +- server/utils/middleware/validatedRequest.js | 14 +- server/utils/openAi/index.js | 94 +++-- server/utils/pinecone/index.js | 231 ++++++----- 27 files changed, 1116 insertions(+), 509 deletions(-) create mode 100644 server/utils/chroma/CHROMA_SETUP.md create mode 100644 server/utils/chroma/index.js create mode 100644 server/utils/helpers/index.js diff --git a/README.md b/README.md index 5ed5259d90..4aebf2635b 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Twitter](https://img.shields.io/twitter/url/https/twitter.com/tim.svg?style=social&label=Follow%20%40Timothy%20Carambat)](https://twitter.com/tcarambat) [![](https://dcbadge.vercel.app/api/server/6UyHPeGZAC?compact=true&style=flat)](https://discord.gg/6UyHPeGZAC) -A full-stack application and tool suite that enables you to turn any document, resource, or piece of content into a piece of data that any LLM can use as reference during chatting. This application runs with very minimal overhead as by default the LLM and vectorDB are hosted remotely, but can be swapped for local instances. Currently this project supports Pinecone and OpenAI. +A full-stack application and tool suite that enables you to turn any document, resource, or piece of content into a piece of data that any LLM can use as reference during chatting. This application runs with very minimal overhead as by default the LLM and vectorDB are hosted remotely, but can be swapped for local instances. Currently this project supports Pinecone & ChromaDB for vector storage and OpenAI for chatting. ![Chatting](/images/screenshots/chat.png) [view more screenshots](/images/screenshots/SCREENSHOTS.md) @@ -38,7 +38,7 @@ This monorepo consists of three main sections: - `yarn` and `node` on your machine - `python` 3.8+ for running scripts in `collector/`. - access to an LLM like `GPT-3.5`, `GPT-4`*. -- a [Pinecone.io](https://pinecone.io) free account*. +- a [Pinecone.io](https://pinecone.io) free account* **or** Local Chroma instance running. *you can use drop in replacements for these. This is just the easiest to get up and running fast. ### How to get started diff --git a/frontend/src/components/DefaultChat/index.jsx b/frontend/src/components/DefaultChat/index.jsx index 4342299d16..d85cfd1c41 100644 --- a/frontend/src/components/DefaultChat/index.jsx +++ b/frontend/src/components/DefaultChat/index.jsx @@ -16,9 +16,8 @@ export default function DefaultChatContainer() { const MESSAGES = [

@@ -34,9 +33,8 @@ export default function DefaultChatContainer() {

@@ -51,17 +49,16 @@ export default function DefaultChatContainer() {

AnythingLLM can run totally locally on your machine with little overhead you wont even notice it's there! No GPU needed. Cloud and - on-premises installtion is available as well. + on-premises installation is available as well.
- The AI tooling ecosytem gets more powerful everyday. AnythingLLM + The AI tooling ecosystem gets more powerful everyday. AnythingLLM makes it easy to use.

@@ -93,9 +89,8 @@ export default function DefaultChatContainer() {

@@ -122,14 +117,13 @@ export default function DefaultChatContainer() {

Is this like an AI dropbox or something? What about chatting? It is - a chatbot isnt it? + a chatbot isn't it?

@@ -137,9 +131,8 @@ export default function DefaultChatContainer() {

@@ -168,9 +161,8 @@ export default function DefaultChatContainer() {

@@ -182,9 +174,8 @@ export default function DefaultChatContainer() {

diff --git a/frontend/src/components/Modals/Keys.jsx b/frontend/src/components/Modals/Keys.jsx index 24cb2efb51..2ef8445929 100644 --- a/frontend/src/components/Modals/Keys.jsx +++ b/frontend/src/components/Modals/Keys.jsx @@ -74,20 +74,38 @@ export default function KeysModal({ hideModal = noop }) { />

- - + {settings?.VectorDB === "pinecone" && ( + <> + + + + + )} + {settings?.VectorDB === "chroma" && ( + <> + + + )}
)}
diff --git a/frontend/src/components/Modals/ManageWorkspace.jsx b/frontend/src/components/Modals/ManageWorkspace.jsx index 4cc52d1f0b..67dfcf1a82 100644 --- a/frontend/src/components/Modals/ManageWorkspace.jsx +++ b/frontend/src/components/Modals/ManageWorkspace.jsx @@ -41,7 +41,7 @@ export default function ManageWorkspace({ hideModal = noop, workspace }) { const deleteWorkspace = async () => { if ( !window.confirm( - `You are about to delete your entire ${workspace.name} workspace. This will remove all vector embeddings on your vector database.\n\nThe original source files will remiain untouched. This action is irreversible.` + `You are about to delete your entire ${workspace.name} workspace. This will remove all vector embeddings on your vector database.\n\nThe original source files will remain untouched. This action is irreversible.` ) ) return false; diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/PromptReply/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/PromptReply/index.jsx index 3cbe8957c5..ed0c603067 100644 --- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/PromptReply/index.jsx +++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/PromptReply/index.jsx @@ -2,6 +2,7 @@ import { memo, useEffect, useRef, useState } from "react"; import { AlertTriangle } from "react-feather"; import Jazzicon from "../../../../UserIcon"; import { decode as HTMLDecode } from "he"; +import { v4 } from "uuid"; function PromptReply({ uuid, diff --git a/package.json b/package.json index fab5326038..bab164aca0 100644 --- a/package.json +++ b/package.json @@ -6,6 +6,7 @@ "author": "Timothy Carambat (Mintplex Labs)", "license": "MIT", "scripts": { + "lint": "cd server && yarn lint && cd .. && cd frontend && yarn lint", "setup": "cd server && yarn && cd .. && yarn setup:envs && echo \"Please run yarn dev:server and yarn dev:frontend in separate terminal tabs.\"", "setup:envs": "cd server && cp -n .env.example .env.development && cd ../collector && cp -n .env.example .env && cd ..", "dev:server": "cd server && yarn dev", diff --git a/server/.env.example b/server/.env.example index 723964cece..17a2e5afd3 100644 --- a/server/.env.example +++ b/server/.env.example @@ -1,8 +1,15 @@ SERVER_PORT=5000 OPEN_AI_KEY= OPEN_MODEL_PREF='gpt-3.5-turbo' +# AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting. +CACHE_VECTORS="true" + +# Enable all below if you are using vector database: Chroma. +# VECTOR_DB="chroma" +# CHROMA_ENDPOINT='http://localhost:8000' + +# Enable all below if you are using vector database: Pinecone. +VECTOR_DB="pinecone" PINECONE_ENVIRONMENT= PINECONE_API_KEY= -PINECONE_INDEX= -AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting. -CACHE_VECTORS="true" \ No newline at end of file +PINECONE_INDEX= \ No newline at end of file diff --git a/server/.nvmrc b/server/.nvmrc index 95c758cad6..59f4a2f3ab 100644 --- a/server/.nvmrc +++ b/server/.nvmrc @@ -1 +1 @@ -v18.12.1 \ No newline at end of file +v18.13.0 \ No newline at end of file diff --git a/server/endpoints/chat.js b/server/endpoints/chat.js index 75242c97d9..dcc81fb5dc 100644 --- a/server/endpoints/chat.js +++ b/server/endpoints/chat.js @@ -1,13 +1,13 @@ -const { reqBody } = require('../utils/http'); -const { Workspace } = require('../models/workspace'); -const { chatWithWorkspace } = require('../utils/chats'); +const { reqBody } = require("../utils/http"); +const { Workspace } = require("../models/workspace"); +const { chatWithWorkspace } = require("../utils/chats"); function chatEndpoints(app) { if (!app) return; - app.post('/workspace/:slug/chat', async (request, response) => { - const { slug } = request.params - const { message, mode = 'query' } = reqBody(request) + app.post("/workspace/:slug/chat", async (request, response) => { + const { slug } = request.params; + const { message, mode = "query" } = reqBody(request); const workspace = await Workspace.get(`slug = '${slug}'`); if (!workspace) { response.sendStatus(400).end(); @@ -16,8 +16,7 @@ function chatEndpoints(app) { const result = await chatWithWorkspace(workspace, message, mode); response.status(200).json({ ...result }); - }) - + }); } -module.exports = { chatEndpoints } \ No newline at end of file +module.exports = { chatEndpoints }; diff --git a/server/endpoints/system.js b/server/endpoints/system.js index fcc6d2685f..b86af9f094 100644 --- a/server/endpoints/system.js +++ b/server/endpoints/system.js @@ -1,34 +1,46 @@ -require('dotenv').config({ path: `.env.${process.env.NODE_ENV}` }) -const { Pinecone } = require('../utils/pinecone'); -const { viewLocalFiles } = require('../utils/files'); +require("dotenv").config({ path: `.env.${process.env.NODE_ENV}` }); +const { viewLocalFiles } = require("../utils/files"); +const { getVectorDbClass } = require("../utils/helpers"); function systemEndpoints(app) { if (!app) return; - app.get('/ping', (_, response) => { + app.get("/ping", (_, response) => { response.sendStatus(200); - }) + }); - app.get('/setup-complete', (_, response) => { + app.get("/setup-complete", (_, response) => { + const vectorDB = process.env.VECTOR_DB || "pinecone"; const results = { + VectorDB: vectorDB, OpenAiKey: !!process.env.OPEN_AI_KEY, - OpenAiModelPref: process.env.OPEN_MODEL_PREF || 'gpt-3.5-turbo', - PineConeEnvironment: process.env.PINECONE_ENVIRONMENT, - PineConeKey: !!process.env.PINECONE_API_KEY, - PinceConeIndex: process.env.PINECONE_INDEX, - } - response.status(200).json({ results }) - }) + OpenAiModelPref: process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo", + ...(vectorDB === "pinecone" + ? { + PineConeEnvironment: process.env.PINECONE_ENVIRONMENT, + PineConeKey: !!process.env.PINECONE_API_KEY, + PineConeIndex: process.env.PINECONE_INDEX, + } + : {}), + ...(vectorDB === "chroma" + ? { + ChromaEndpoint: process.env.CHROMA_ENDPOINT, + } + : {}), + }; + response.status(200).json({ results }); + }); - app.get('/system-vectors', async (_, response) => { - const vectorCount = await Pinecone.totalIndicies(); - response.status(200).json({ vectorCount }) - }) + app.get("/system-vectors", async (_, response) => { + const VectorDb = getVectorDbClass(); + const vectorCount = await VectorDb.totalIndicies(); + response.status(200).json({ vectorCount }); + }); - app.get('/local-files', async (_, response) => { - const localFiles = await viewLocalFiles() - response.status(200).json({ localFiles }) - }) + app.get("/local-files", async (_, response) => { + const localFiles = await viewLocalFiles(); + response.status(200).json({ localFiles }); + }); } -module.exports = { systemEndpoints } \ No newline at end of file +module.exports = { systemEndpoints }; \ No newline at end of file diff --git a/server/endpoints/workspaces.js b/server/endpoints/workspaces.js index 37fbe79214..d61ef3bedf 100644 --- a/server/endpoints/workspaces.js +++ b/server/endpoints/workspaces.js @@ -1,21 +1,21 @@ -const { Pinecone } = require('../utils/pinecone'); -const { reqBody } = require('../utils/http'); -const { Workspace } = require('../models/workspace'); -const { Document } = require('../models/documents'); -const { DocumentVectors } = require('../models/vectors'); -const { WorkspaceChats } = require('../models/workspaceChats'); -const { convertToChatHistory } = require('../utils/chats'); +const { reqBody } = require("../utils/http"); +const { Workspace } = require("../models/workspace"); +const { Document } = require("../models/documents"); +const { DocumentVectors } = require("../models/vectors"); +const { WorkspaceChats } = require("../models/workspaceChats"); +const { convertToChatHistory } = require("../utils/chats"); +const { getVectorDbClass } = require("../utils/helpers"); function workspaceEndpoints(app) { if (!app) return; - app.post('/workspace/new', async (request, response) => { + app.post("/workspace/new", async (request, response) => { const { name = null } = reqBody(request); const { workspace, message } = await Workspace.new(name); - response.status(200).json({ workspace, message }) - }) + response.status(200).json({ workspace, message }); + }); - app.post('/workspace/:slug/update-embeddings', async (request, response) => { + app.post("/workspace/:slug/update-embeddings", async (request, response) => { const { slug = null } = request.params; const { adds = [], deletes = [] } = reqBody(request); const currWorkspace = await Workspace.get(`slug = '${slug}'`); @@ -28,11 +28,12 @@ function workspaceEndpoints(app) { await Document.removeDocuments(currWorkspace, deletes); await Document.addDocuments(currWorkspace, adds); const updatedWorkspace = await Workspace.get(`slug = '${slug}'`); - response.status(200).json({ workspace: updatedWorkspace }) - }) + response.status(200).json({ workspace: updatedWorkspace }); + }); - app.delete('/workspace/:slug', async (request, response) => { - const { slug = '' } = request.params + app.delete("/workspace/:slug", async (request, response) => { + const VectorDb = getVectorDbClass(); + const { slug = "" } = request.params; const workspace = await Workspace.get(`slug = '${slug}'`); if (!workspace) { @@ -42,34 +43,38 @@ function workspaceEndpoints(app) { await Workspace.delete(`slug = '${slug.toLowerCase()}'`); await DocumentVectors.deleteForWorkspace(workspace.id); - await Document.delete(`workspaceId = ${Number(workspace.id)}`) - await WorkspaceChats.delete(`workspaceId = ${Number(workspace.id)}`) - try { await Pinecone['delete-namespace']({ namespace: slug }) } catch (e) { console.error(e.message) } - response.sendStatus(200).end() - }) + await Document.delete(`workspaceId = ${Number(workspace.id)}`); + await WorkspaceChats.delete(`workspaceId = ${Number(workspace.id)}`); + try { + await VectorDb["delete-namespace"]({ namespace: slug }); + } catch (e) { + console.error(e.message); + } + response.sendStatus(200).end(); + }); - app.get('/workspaces', async (_, response) => { + app.get("/workspaces", async (_, response) => { const workspaces = await Workspace.where(); - response.status(200).json({ workspaces }) - }) + response.status(200).json({ workspaces }); + }); - app.get('/workspace/:slug', async (request, response) => { - const { slug } = request.params + app.get("/workspace/:slug", async (request, response) => { + const { slug } = request.params; const workspace = await Workspace.get(`slug = '${slug}'`); - response.status(200).json({ workspace }) - }) + response.status(200).json({ workspace }); + }); - app.get('/workspace/:slug/chats', async (request, response) => { - const { slug } = request.params + app.get("/workspace/:slug/chats", async (request, response) => { + const { slug } = request.params; const workspace = await Workspace.get(`slug = '${slug}'`); if (!workspace) { - response.sendStatus(400).end() + response.sendStatus(400).end(); return; } - const history = await WorkspaceChats.forWorkspace(workspace.id) - response.status(200).json({ history: convertToChatHistory(history) }) - }) + const history = await WorkspaceChats.forWorkspace(workspace.id); + response.status(200).json({ history: convertToChatHistory(history) }); + }); } -module.exports = { workspaceEndpoints } \ No newline at end of file +module.exports = { workspaceEndpoints }; diff --git a/server/index.js b/server/index.js index d014dc150c..a8d22b44f0 100644 --- a/server/index.js +++ b/server/index.js @@ -1,54 +1,62 @@ -require('dotenv').config({ path: `.env.${process.env.NODE_ENV}` }) -const express = require('express') -const bodyParser = require('body-parser') -const cors = require('cors'); -const { validatedRequest } = require('./utils/middleware/validatedRequest'); -const { Pinecone } = require('./utils/pinecone'); -const { reqBody } = require('./utils/http'); -const { systemEndpoints } = require('./endpoints/system'); -const { workspaceEndpoints } = require('./endpoints/workspaces'); -const { chatEndpoints } = require('./endpoints/chat'); +require("dotenv").config({ path: `.env.${process.env.NODE_ENV}` }); +const express = require("express"); +const bodyParser = require("body-parser"); +const cors = require("cors"); +const { validatedRequest } = require("./utils/middleware/validatedRequest"); +const { reqBody } = require("./utils/http"); +const { systemEndpoints } = require("./endpoints/system"); +const { workspaceEndpoints } = require("./endpoints/workspaces"); +const { chatEndpoints } = require("./endpoints/chat"); +const { getVectorDbClass } = require("./utils/helpers"); const app = express(); app.use(cors({ origin: true })); app.use(validatedRequest); app.use(bodyParser.text()); app.use(bodyParser.json()); -app.use(bodyParser.urlencoded({ - extended: true -})); +app.use( + bodyParser.urlencoded({ + extended: true, + }) +); systemEndpoints(app); workspaceEndpoints(app); chatEndpoints(app); -app.post('/v/:command', async (request, response) => { - const { command } = request.params - if (!Object.getOwnPropertyNames(Pinecone).includes(command)) { - response.status(500).json({ message: 'invalid interface command', commands: Object.getOwnPropertyNames(Pinecone.prototype) }); - return +app.post("/v/:command", async (request, response) => { + const VectorDb = getVectorDbClass(); + const { command } = request.params; + if (!Object.getOwnPropertyNames(VectorDb).includes(command)) { + response.status(500).json({ + message: "invalid interface command", + commands: Object.getOwnPropertyNames(VectorDb), + }); + return; } try { const body = reqBody(request); - const resBody = await Pinecone[command](body) + const resBody = await VectorDb[command](body); response.status(200).json({ ...resBody }); } catch (e) { // console.error(e) - console.error(JSON.stringify(e)) + console.error(JSON.stringify(e)); response.status(500).json({ error: e.message }); } return; -}) - +}); -app.all('*', function (_, response) { +app.all("*", function (_, response) { response.sendStatus(404); }); -app.listen(process.env.SERVER_PORT || 5000, () => { - console.log(`Example app listening on port ${process.env.SERVER_PORT || 5000}`) -}) +app + .listen(process.env.SERVER_PORT || 5000, () => { + console.log( + `Example app listening on port ${process.env.SERVER_PORT || 5000}` + ); + }) .on("error", function (err) { process.once("SIGUSR2", function () { process.kill(process.pid, "SIGUSR2"); @@ -56,4 +64,4 @@ app.listen(process.env.SERVER_PORT || 5000, () => { process.on("SIGINT", function () { process.kill(process.pid, "SIGINT"); }); - }); \ No newline at end of file + }); diff --git a/server/models/documents.js b/server/models/documents.js index 1b280f7627..148ae8c4ec 100644 --- a/server/models/documents.js +++ b/server/models/documents.js @@ -1,8 +1,9 @@ -const { fileData } = require('../utils/files'); -const { v4: uuidv4 } = require('uuid'); +const { fileData } = require("../utils/files"); +const { v4: uuidv4 } = require("uuid"); +const { getVectorDbClass } = require("../utils/helpers"); const Document = { - tablename: 'workspace_documents', + tablename: "workspace_documents", colsInit: ` id INTEGER PRIMARY KEY AUTOINCREMENT, docId TEXT NOT NULL UNIQUE, @@ -14,64 +15,82 @@ const Document = { lastUpdatedAt TEXT DEFAULT CURRENT_TIMESTAMP `, db: async function () { - const sqlite3 = require('sqlite3').verbose(); - const { open } = require('sqlite'); + const sqlite3 = require("sqlite3").verbose(); + const { open } = require("sqlite"); const db = await open({ - filename: 'anythingllm.db', - driver: sqlite3.Database - }) + filename: "anythingllm.db", + driver: sqlite3.Database, + }); - await db.exec(`CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})`); - db.on('trace', (sql) => console.log(sql)) - return db + await db.exec( + `CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})` + ); + db.on("trace", (sql) => console.log(sql)); + return db; }, forWorkspace: async function (workspaceId = null) { if (!workspaceId) return []; return await this.where(`workspaceId = ${workspaceId}`); }, - delete: async function (clause = '') { - const db = await this.db() - await db.get(`DELETE FROM ${this.tablename} WHERE ${clause}`) - db.close() - return true + delete: async function (clause = "") { + const db = await this.db(); + await db.get(`DELETE FROM ${this.tablename} WHERE ${clause}`); + db.close(); + return true; }, - where: async function (clause = '', limit = null) { - const db = await this.db() - const results = await db.all(`SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ''} ${!!limit ? `LIMIT ${limit}` : ''}`) + where: async function (clause = "", limit = null) { + const db = await this.db(); + const results = await db.all( + `SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ""} ${ + !!limit ? `LIMIT ${limit}` : "" + }` + ); - db.close() - return results + db.close(); + return results; }, - firstWhere: async function (clause = '') { + firstWhere: async function (clause = "") { const results = await this.where(clause); - return results.length > 0 ? results[0] : null + return results.length > 0 ? results[0] : null; }, addDocuments: async function (workspace, additions = []) { - const { Pinecone } = require('../utils/pinecone'); + const VectorDb = getVectorDbClass(); if (additions.length === 0) return; - const db = await this.db() - const stmt = await db.prepare(`INSERT INTO ${this.tablename} (docId, filename, docpath, workspaceId, metadata) VALUES (?,?,?,?,?)`) + const db = await this.db(); + const stmt = await db.prepare( + `INSERT INTO ${this.tablename} (docId, filename, docpath, workspaceId, metadata) VALUES (?,?,?,?,?)` + ); for (const path of additions) { const data = await fileData(path); if (!data) continue; const docId = uuidv4(); - const { pageContent, ...metadata } = data + const { pageContent, ...metadata } = data; const newDoc = { docId, - filename: path.split('/')[1], + filename: path.split("/")[1], docpath: path, workspaceId: Number(workspace.id), - metadata: JSON.stringify(metadata) - } - const vectorized = await Pinecone.addDocumentToNamespace(workspace.slug, { ...data, docId }, path); + metadata: JSON.stringify(metadata), + }; + const vectorized = await VectorDb.addDocumentToNamespace( + workspace.slug, + { ...data, docId }, + path + ); if (!vectorized) { - console.error('Failed to vectorize', path) + console.error("Failed to vectorize", path); continue; } - stmt.run([docId, newDoc.filename, newDoc.docpath, newDoc.workspaceId, newDoc.metadata]) + stmt.run([ + docId, + newDoc.filename, + newDoc.docpath, + newDoc.workspaceId, + newDoc.metadata, + ]); } stmt.finalize(); db.close(); @@ -79,21 +98,28 @@ const Document = { return; }, removeDocuments: async function (workspace, removals = []) { - const { Pinecone } = require('../utils/pinecone'); + const VectorDb = getVectorDbClass(); if (removals.length === 0) return; - const db = await this.db() - const stmt = await db.prepare(`DELETE FROM ${this.tablename} WHERE docpath = ? AND workspaceId = ?`); + const db = await this.db(); + const stmt = await db.prepare( + `DELETE FROM ${this.tablename} WHERE docpath = ? AND workspaceId = ?` + ); for (const path of removals) { - const document = await this.firstWhere(`docPath = '${path}' AND workspaceId = ${workspace.id}`) + const document = await this.firstWhere( + `docPath = '${path}' AND workspaceId = ${workspace.id}` + ); if (!document) continue; - await Pinecone.deleteDocumentFromNamespace(workspace.slug, document.docId); - stmt.run([path, workspace.id]) + await VectorDb.deleteDocumentFromNamespace( + workspace.slug, + document.docId + ); + stmt.run([path, workspace.id]); } stmt.finalize(); db.close(); return true; - } -} + }, +}; -module.exports = { Document } \ No newline at end of file +module.exports = { Document }; diff --git a/server/models/vectors.js b/server/models/vectors.js index 39c9e00f5d..d3ad9cc878 100644 --- a/server/models/vectors.js +++ b/server/models/vectors.js @@ -1,10 +1,10 @@ -const { Document } = require('./documents'); +const { Document } = require("./documents"); // TODO: Do we want to store entire vectorized chunks in here // so that we can easily spin up temp-namespace clones for threading // const DocumentVectors = { - tablename: 'document_vectors', + tablename: "document_vectors", colsInit: ` id INTEGER PRIMARY KEY AUTOINCREMENT, docId TEXT NOT NULL, @@ -13,51 +13,63 @@ const DocumentVectors = { lastUpdatedAt TEXT DEFAULT CURRENT_TIMESTAMP `, db: async function () { - const sqlite3 = require('sqlite3').verbose(); - const { open } = require('sqlite'); + const sqlite3 = require("sqlite3").verbose(); + const { open } = require("sqlite"); const db = await open({ - filename: 'anythingllm.db', - driver: sqlite3.Database - }) + filename: "anythingllm.db", + driver: sqlite3.Database, + }); - await db.exec(`CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})`); - db.on('trace', (sql) => console.log(sql)) - return db + await db.exec( + `CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})` + ); + db.on("trace", (sql) => console.log(sql)); + return db; }, bulkInsert: async function (vectorRecords = []) { if (vectorRecords.length === 0) return; const db = await this.db(); - const stmt = await db.prepare(`INSERT INTO ${this.tablename} (docId, vectorId) VALUES (?, ?)`); + const stmt = await db.prepare( + `INSERT INTO ${this.tablename} (docId, vectorId) VALUES (?, ?)` + ); for (const record of vectorRecords) { - const { docId, vectorId } = record - stmt.run([docId, vectorId]) + const { docId, vectorId } = record; + stmt.run([docId, vectorId]); } - stmt.finalize() - db.close() + stmt.finalize(); + db.close(); return { documentsInserted: vectorRecords.length }; }, deleteForWorkspace: async function (workspaceId) { const documents = await Document.forWorkspace(workspaceId); - const docIds = [...(new Set(documents.map((doc) => doc.docId)))]; - const ids = (await this.where(`docId IN (${docIds.map((id) => `'${id}'`).join(',')})`)).map((doc) => doc.id) - await this.deleteIds(ids) + const docIds = [...new Set(documents.map((doc) => doc.docId))]; + const ids = ( + await this.where(`docId IN (${docIds.map((id) => `'${id}'`).join(",")})`) + ).map((doc) => doc.id); + await this.deleteIds(ids); return true; }, - where: async function (clause = '', limit = null) { - const db = await this.db() - const results = await db.all(`SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ''} ${!!limit ? `LIMIT ${limit}` : ''}`) + where: async function (clause = "", limit = null) { + const db = await this.db(); + const results = await db.all( + `SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ""} ${ + !!limit ? `LIMIT ${limit}` : "" + }` + ); - db.close() - return results + db.close(); + return results; }, deleteIds: async function (ids = []) { - const db = await this.db() - await db.get(`DELETE FROM ${this.tablename} WHERE id IN (${ids.join(', ')}) `) - db.close() - return true - } -} + const db = await this.db(); + await db.get( + `DELETE FROM ${this.tablename} WHERE id IN (${ids.join(", ")}) ` + ); + db.close(); + return true; + }, +}; -module.exports = { DocumentVectors } \ No newline at end of file +module.exports = { DocumentVectors }; diff --git a/server/models/workspace.js b/server/models/workspace.js index db2e532f3d..ddb8faa93d 100644 --- a/server/models/workspace.js +++ b/server/models/workspace.js @@ -1,8 +1,8 @@ -const slugify = require('slugify'); -const { Document } = require('./documents'); +const slugify = require("slugify"); +const { Document } = require("./documents"); const Workspace = { - tablename: 'workspaces', + tablename: "workspaces", colsInit: ` id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL UNIQUE, @@ -12,52 +12,66 @@ const Workspace = { lastUpdatedAt TEXT DEFAULT CURRENT_TIMESTAMP `, db: async function () { - const sqlite3 = require('sqlite3').verbose(); - const { open } = require('sqlite'); + const sqlite3 = require("sqlite3").verbose(); + const { open } = require("sqlite"); const db = await open({ - filename: 'anythingllm.db', - driver: sqlite3.Database - }) + filename: "anythingllm.db", + driver: sqlite3.Database, + }); - await db.exec(`CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})`); - db.on('trace', (sql) => console.log(sql)) - return db + await db.exec( + `CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})` + ); + db.on("trace", (sql) => console.log(sql)); + return db; }, new: async function (name = null) { - if (!name) return { result: null, message: 'name cannot be null' }; + if (!name) return { result: null, message: "name cannot be null" }; - const db = await this.db() - const { id, success, message } = await db.run(`INSERT INTO ${this.tablename} (name, slug) VALUES (?, ?)`, [name, slugify(name, { lower: true })]) + const db = await this.db(); + const { id, success, message } = await db + .run(`INSERT INTO ${this.tablename} (name, slug) VALUES (?, ?)`, [ + name, + slugify(name, { lower: true }), + ]) .then((res) => { - return { id: res.lastID, success: true, message: null } + return { id: res.lastID, success: true, message: null }; }) .catch((error) => { - return { id: null, success: false, message: error.message } - }) - if (!success) return { workspace: null, message } + return { id: null, success: false, message: error.message }; + }); + if (!success) return { workspace: null, message }; - const workspace = await db.get(`SELECT * FROM ${this.tablename} WHERE id = ${id}`) - return { workspace, message: null } + const workspace = await db.get( + `SELECT * FROM ${this.tablename} WHERE id = ${id}` + ); + return { workspace, message: null }; }, - get: async function (clause = '') { - const db = await this.db() - const result = await db.get(`SELECT * FROM ${this.tablename} WHERE ${clause}`).then((res) => res || null) + get: async function (clause = "") { + const db = await this.db(); + const result = await db + .get(`SELECT * FROM ${this.tablename} WHERE ${clause}`) + .then((res) => res || null); if (!result) return null; const documents = await Document.forWorkspace(result.id); - return { ...result, documents } + return { ...result, documents }; }, - delete: async function (clause = '') { - const db = await this.db() - await db.get(`DELETE FROM ${this.tablename} WHERE ${clause}`) - return true + delete: async function (clause = "") { + const db = await this.db(); + await db.get(`DELETE FROM ${this.tablename} WHERE ${clause}`); + return true; }, - where: async function (clause = '', limit = null) { - const db = await this.db() - const results = await db.all(`SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ''} ${!!limit ? `LIMIT ${limit}` : ''}`) - return results + where: async function (clause = "", limit = null) { + const db = await this.db(); + const results = await db.all( + `SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ""} ${ + !!limit ? `LIMIT ${limit}` : "" + }` + ); + return results; }, -} +}; -module.exports = { Workspace } \ No newline at end of file +module.exports = { Workspace }; diff --git a/server/models/workspaceChats.js b/server/models/workspaceChats.js index 711b5cd201..8a545b15c1 100644 --- a/server/models/workspaceChats.js +++ b/server/models/workspaceChats.js @@ -1,6 +1,5 @@ - const WorkspaceChats = { - tablename: 'workspace_chats', + tablename: "workspace_chats", colsInit: ` id INTEGER PRIMARY KEY AUTOINCREMENT, workspaceId INTEGER NOT NULL, @@ -11,58 +10,79 @@ const WorkspaceChats = { lastUpdatedAt TEXT DEFAULT CURRENT_TIMESTAMP `, db: async function () { - const sqlite3 = require('sqlite3').verbose(); - const { open } = require('sqlite'); + const sqlite3 = require("sqlite3").verbose(); + const { open } = require("sqlite"); const db = await open({ - filename: 'anythingllm.db', - driver: sqlite3.Database - }) + filename: "anythingllm.db", + driver: sqlite3.Database, + }); - await db.exec(`CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})`); - db.on('trace', (sql) => console.log(sql)) - return db + await db.exec( + `CREATE TABLE IF NOT EXISTS ${this.tablename} (${this.colsInit})` + ); + db.on("trace", (sql) => console.log(sql)); + return db; }, new: async function ({ workspaceId, prompt, response = {} }) { - const db = await this.db() - const { id, success, message } = await db.run(`INSERT INTO ${this.tablename} (workspaceId, prompt, response) VALUES (?, ?, ?)`, [workspaceId, prompt, JSON.stringify(response)]) + const db = await this.db(); + const { id, success, message } = await db + .run( + `INSERT INTO ${this.tablename} (workspaceId, prompt, response) VALUES (?, ?, ?)`, + [workspaceId, prompt, JSON.stringify(response)] + ) .then((res) => { - return { id: res.lastID, success: true, message: null } + return { id: res.lastID, success: true, message: null }; }) .catch((error) => { - return { id: null, success: false, message: error.message } - }) - if (!success) return { chat: null, message } + return { id: null, success: false, message: error.message }; + }); + if (!success) return { chat: null, message }; - const chat = await db.get(`SELECT * FROM ${this.tablename} WHERE id = ${id}`) - return { chat, message: null } + const chat = await db.get( + `SELECT * FROM ${this.tablename} WHERE id = ${id}` + ); + return { chat, message: null }; }, forWorkspace: async function (workspaceId = null) { if (!workspaceId) return []; - return await this.where(`workspaceId = ${workspaceId} AND include = true`, null, 'ORDER BY id ASC') + return await this.where( + `workspaceId = ${workspaceId} AND include = true`, + null, + "ORDER BY id ASC" + ); }, markHistoryInvalid: async function (workspaceId = null) { if (!workspaceId) return; - const db = await this.db() - await db.run(`UPDATE ${this.tablename} SET include = false WHERE workspaceId = ?`, [workspaceId]); + const db = await this.db(); + await db.run( + `UPDATE ${this.tablename} SET include = false WHERE workspaceId = ?`, + [workspaceId] + ); return; }, - get: async function (clause = '') { - const db = await this.db() - const result = await db.get(`SELECT * FROM ${this.tablename} WHERE ${clause}`).then((res) => res || null) + get: async function (clause = "") { + const db = await this.db(); + const result = await db + .get(`SELECT * FROM ${this.tablename} WHERE ${clause}`) + .then((res) => res || null); if (!result) return null; - return result + return result; }, - delete: async function (clause = '') { - const db = await this.db() - await db.get(`DELETE FROM ${this.tablename} WHERE ${clause}`) - return true + delete: async function (clause = "") { + const db = await this.db(); + await db.get(`DELETE FROM ${this.tablename} WHERE ${clause}`); + return true; }, - where: async function (clause = '', limit = null, order = null) { - const db = await this.db() - const results = await db.all(`SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ''} ${!!limit ? `LIMIT ${limit}` : ''} ${!!order ? order : ''}`) - return results + where: async function (clause = "", limit = null, order = null) { + const db = await this.db(); + const results = await db.all( + `SELECT * FROM ${this.tablename} ${clause ? `WHERE ${clause}` : ""} ${ + !!limit ? `LIMIT ${limit}` : "" + } ${!!order ? order : ""}` + ); + return results; }, -} +}; -module.exports = { WorkspaceChats } \ No newline at end of file +module.exports = { WorkspaceChats }; diff --git a/server/package.json b/server/package.json index 8886b3307b..5810cabd27 100644 --- a/server/package.json +++ b/server/package.json @@ -10,17 +10,19 @@ "node": ">=18.12.1" }, "scripts": { - "dev": "NODE_ENV=development nodemon --ignore documents index.js", - "start": "NODE_ENV=production node index.js" + "dev": "NODE_ENV=development nodemon --ignore documents --ignore vector-cache --trace-warnings index.js", + "start": "NODE_ENV=production node index.js", + "lint": "yarn prettier --write ./endpoints ./models ./utils index.js" }, "dependencies": { "@googleapis/youtube": "^9.0.0", "@pinecone-database/pinecone": "^0.1.6", "body-parser": "^1.20.2", + "chromadb": "^1.5.2", "cors": "^2.8.5", "dotenv": "^16.0.3", "express": "^4.18.2", - "langchain": "^0.0.81", + "langchain": "^0.0.90", "moment": "^2.29.4", "openai": "^3.2.1", "pinecone-client": "^1.1.0", @@ -30,6 +32,7 @@ "uuid": "^9.0.0" }, "devDependencies": { - "nodemon": "^2.0.22" + "nodemon": "^2.0.22", + "prettier": "^2.4.1" } } \ No newline at end of file diff --git a/server/utils/chats/commands/reset.js b/server/utils/chats/commands/reset.js index 39c11b3d32..59f9448e77 100644 --- a/server/utils/chats/commands/reset.js +++ b/server/utils/chats/commands/reset.js @@ -4,8 +4,8 @@ async function resetMemory(workspace, _message, msgUUID) { await WorkspaceChats.markHistoryInvalid(workspace.id); return { uuid: msgUUID, - type: 'textResponse', - textResponse: 'Workspace chat memory was reset!', + type: "textResponse", + textResponse: "Workspace chat memory was reset!", sources: [], close: true, error: false, @@ -13,5 +13,5 @@ async function resetMemory(workspace, _message, msgUUID) { } module.exports = { - resetMemory -} \ No newline at end of file + resetMemory, +}; diff --git a/server/utils/chats/index.js b/server/utils/chats/index.js index a080ea34eb..7459e37e26 100644 --- a/server/utils/chats/index.js +++ b/server/utils/chats/index.js @@ -1,50 +1,49 @@ -const { v4: uuidv4 } = require('uuid'); -const { OpenAi } = require('../openAi'); -const { Pinecone } = require('../pinecone'); -const { WorkspaceChats } = require('../../models/workspaceChats'); +const { v4: uuidv4 } = require("uuid"); +const { OpenAi } = require("../openAi"); +const { WorkspaceChats } = require("../../models/workspaceChats"); const { resetMemory } = require("./commands/reset"); -const moment = require('moment') +const moment = require("moment"); +const { getVectorDbClass } = require("../helpers"); function convertToChatHistory(history = []) { - const formattedHistory = [] + const formattedHistory = []; history.forEach((history) => { - const { prompt, response, createdAt } = history + const { prompt, response, createdAt } = history; const data = JSON.parse(response); formattedHistory.push([ { - role: 'user', + role: "user", content: prompt, sentAt: moment(createdAt).unix(), }, { - role: 'assistant', + role: "assistant", content: data.text, sources: data.sources || [], sentAt: moment(createdAt).unix(), }, - ]) - }) + ]); + }); - return formattedHistory.flat() + return formattedHistory.flat(); } function convertToPromptHistory(history = []) { - const formattedHistory = [] + const formattedHistory = []; history.forEach((history) => { - const { prompt, response } = history + const { prompt, response } = history; const data = JSON.parse(response); formattedHistory.push([ - { role: 'user', content: prompt }, - { role: 'assistant', content: data.text }, - ]) - }) - return formattedHistory.flat() + { role: "user", content: prompt }, + { role: "assistant", content: data.text }, + ]); + }); + return formattedHistory.flat(); } - const VALID_COMMANDS = { - '/reset': resetMemory, -} + "/reset": resetMemory, +}; function grepCommand(message) { const availableCommands = Object.keys(VALID_COMMANDS); @@ -57,52 +56,63 @@ function grepCommand(message) { } } - return null + return null; } -async function chatWithWorkspace(workspace, message, chatMode = 'query') { +async function chatWithWorkspace(workspace, message, chatMode = "query") { const uuid = uuidv4(); const openai = new OpenAi(); + const VectorDb = getVectorDbClass(); + const command = grepCommand(message); - const command = grepCommand(message) if (!!command && Object.keys(VALID_COMMANDS).includes(command)) { return await VALID_COMMANDS[command](workspace, message, uuid); } - const { safe, reasons = [] } = await openai.isSafe(message) + const { safe, reasons = [] } = await openai.isSafe(message); if (!safe) { return { id: uuid, - type: 'abort', + type: "abort", textResponse: null, sources: [], close: true, - error: `This message was moderated and will not be allowed. Violations for ${reasons.join(', ')} found.` + error: `This message was moderated and will not be allowed. Violations for ${reasons.join( + ", " + )} found.`, }; } - const hasVectorizedSpace = await Pinecone.hasNamespace(workspace.slug); + const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug); if (!hasVectorizedSpace) { - const rawHistory = await WorkspaceChats.forWorkspace(workspace.id) + const rawHistory = await WorkspaceChats.forWorkspace(workspace.id); const chatHistory = convertToPromptHistory(rawHistory); const response = await openai.sendChat(chatHistory, message); - const data = { text: response, sources: [], type: 'chat' } + const data = { text: response, sources: [], type: "chat" }; - await WorkspaceChats.new({ workspaceId: workspace.id, prompt: message, response: data }) + await WorkspaceChats.new({ + workspaceId: workspace.id, + prompt: message, + response: data, + }); return { id: uuid, - type: 'textResponse', + type: "textResponse", textResponse: response, sources: [], close: true, error: null, }; } else { - const { response, sources, message: error } = await Pinecone[chatMode]({ namespace: workspace.slug, input: message }); + const { + response, + sources, + message: error, + } = await VectorDb[chatMode]({ namespace: workspace.slug, input: message }); if (!response) { return { id: uuid, - type: 'abort', + type: "abort", textResponse: null, sources: [], close: true, @@ -110,11 +120,15 @@ async function chatWithWorkspace(workspace, message, chatMode = 'query') { }; } - const data = { text: response, sources, type: chatMode } - await WorkspaceChats.new({ workspaceId: workspace.id, prompt: message, response: data }) + const data = { text: response, sources, type: chatMode }; + await WorkspaceChats.new({ + workspaceId: workspace.id, + prompt: message, + response: data, + }); return { id: uuid, - type: 'textResponse', + type: "textResponse", textResponse: response, sources, close: true, @@ -124,5 +138,5 @@ async function chatWithWorkspace(workspace, message, chatMode = 'query') { } module.exports = { convertToChatHistory, - chatWithWorkspace -} \ No newline at end of file + chatWithWorkspace, +}; diff --git a/server/utils/chroma/CHROMA_SETUP.md b/server/utils/chroma/CHROMA_SETUP.md new file mode 100644 index 0000000000..396ee348ea --- /dev/null +++ b/server/utils/chroma/CHROMA_SETUP.md @@ -0,0 +1,24 @@ +# How to setup a local (or remote) Chroma Vector Database + +[Official Chroma Docs](https://docs.trychroma.com/usage-guide#running-chroma-in-clientserver-mode) for reference. + +### How to get started + +**Requirements** + +- Docker +- `git` available in your CLI/terminal + +**Instructions** + +- `git clone git@github.com:chroma-core/chroma.git` to somewhere on computer. +- `cd chroma` +- `docker-compose up -d --build` +- set the `CHROMA_ENDPOINT=` .env variable in `server` and also set `VECTOR_DB=` to `chroma`. + +eg: `server/.env.development` + +``` +VECTOR_DB="chroma" +CHROMA_ENDPOINT='http://localhost:8000' +``` diff --git a/server/utils/chroma/index.js b/server/utils/chroma/index.js new file mode 100644 index 0000000000..92dcec928c --- /dev/null +++ b/server/utils/chroma/index.js @@ -0,0 +1,361 @@ +const { ChromaClient, OpenAIEmbeddingFunction } = require("chromadb"); +const { Chroma: ChromaStore } = require("langchain/vectorstores/chroma"); +const { OpenAI } = require("langchain/llms/openai"); +const { ChatOpenAI } = require("langchain/chat_models/openai"); +const { + VectorDBQAChain, + LLMChain, + RetrievalQAChain, + ConversationalRetrievalQAChain, +} = require("langchain/chains"); +const { OpenAIEmbeddings } = require("langchain/embeddings/openai"); +// const { VectorStoreRetrieverMemory, BufferMemory } = require("langchain/memory"); +// const { PromptTemplate } = require("langchain/prompts"); +const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); +const { storeVectorResult, cachedVectorInformation } = require("../files"); +const { Configuration, OpenAIApi } = require("openai"); +const { v4: uuidv4 } = require("uuid"); + +const toChunks = (arr, size) => { + return Array.from({ length: Math.ceil(arr.length / size) }, (_v, i) => + arr.slice(i * size, i * size + size) + ); +}; + +function curateSources(sources = []) { + const knownDocs = []; + const documents = []; + for (const source of sources) { + const { metadata = {} } = source; + if ( + Object.keys(metadata).length > 0 && + !knownDocs.includes(metadata.title) + ) { + documents.push({ ...metadata }); + knownDocs.push(metadata.title); + } + } + + return documents; +} + +const Chroma = { + name: 'Chroma', + connect: async function () { + const client = new ChromaClient({ + path: process.env.CHROMA_ENDPOINT, // if not set will fallback to localhost:8000 + }); + + const isAlive = await client.heartbeat(); + if (!isAlive) + throw new Error( + "ChromaDB::Invalid Heartbeat received - is the instance online?" + ); + return { client }; + }, + heartbeat: async function () { + const { client } = await this.connect(); + return { heartbeat: await client.heartbeat() }; + }, + totalIndicies: async function () { + const { client } = await this.connect(); + const collections = await client.listCollections(); + var totalVectors = 0; + for (const collectionObj of collections) { + const collection = await client + .getCollection({ name: collectionObj.name }) + .catch(() => null); + if (!collection) continue; + totalVectors += await collection.count(); + } + return totalVectors; + }, + embeddingFunc: function () { + return new OpenAIEmbeddingFunction({ + openai_api_key: process.env.OPEN_AI_KEY, + }); + }, + embedder: function () { + return new OpenAIEmbeddings({ openAIApiKey: process.env.OPEN_AI_KEY }); + }, + openai: function () { + const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY }); + const openai = new OpenAIApi(config); + return openai; + }, + llm: function () { + const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo"; + return new OpenAI({ + openAIApiKey: process.env.OPEN_AI_KEY, + temperature: 0.7, + modelName: model, + }); + }, + chatLLM: function () { + const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo"; + return new ChatOpenAI({ + openAIApiKey: process.env.OPEN_AI_KEY, + temperature: 0.7, + modelName: model, + }); + }, + embedChunk: async function (openai, textChunk) { + const { + data: { data }, + } = await openai.createEmbedding({ + model: "text-embedding-ada-002", + input: textChunk, + }); + return data.length > 0 && data[0].hasOwnProperty("embedding") + ? data[0].embedding + : null; + }, + namespace: async function (client, namespace = null) { + if (!namespace) throw new Error("No namespace value provided."); + const collection = await client + .getCollection({ name: namespace }) + .catch(() => null); + if (!collection) return null; + + return { + ...collection, + vectorCount: await collection.count(), + }; + }, + hasNamespace: async function (namespace = null) { + if (!namespace) return false; + const { client } = await this.connect(); + return await this.namespaceExists(client, namespace); + }, + namespaceExists: async function (client, namespace = null) { + if (!namespace) throw new Error("No namespace value provided."); + const collection = await client + .getCollection({ name: namespace }) + .catch((e) => { + console.error("ChromaDB::namespaceExists", e.message); + return null; + }); + return !!collection; + }, + deleteVectorsInNamespace: async function (client, namespace = null) { + await client.deleteCollection({ name: namespace }); + return true; + }, + addDocumentToNamespace: async function ( + namespace, + documentData = {}, + fullFilePath = null + ) { + const { DocumentVectors } = require("../../models/vectors"); + try { + const { pageContent, docId, ...metadata } = documentData; + if (!pageContent || pageContent.length == 0) return false; + + console.log("Adding new vectorized document into namespace", namespace); + const cacheResult = await cachedVectorInformation(fullFilePath); + if (cacheResult.exists) { + const { client } = await this.connect(); + const collection = await client.getOrCreateCollection({ + name: namespace, + metadata: { "hnsw:space": "cosine" }, + embeddingFunction: this.embeddingFunc(), + }); + const { chunks } = cacheResult; + const documentVectors = []; + + for (const chunk of chunks) { + const submission = { + ids: [], + embeddings: [], + metadatas: [], + documents: [], + }; + + // Before sending to Chroma and saving the records to our db + // we need to assign the id of each chunk that is stored in the cached file. + chunk.forEach((chunk) => { + const id = uuidv4(); + const { id: _id, ...metadata } = chunk.metadata; + documentVectors.push({ docId, vectorId: id }); + submission.ids.push(id); + submission.embeddings.push(chunk.values); + submission.metadatas.push(metadata); + submission.documents.push(metadata.text); + }); + + const additionResult = await collection.add(submission); + if (!additionResult) + throw new Error("Error embedding into ChromaDB", additionResult); + } + + await DocumentVectors.bulkInsert(documentVectors); + return true; + } + + // If we are here then we are going to embed and store a novel document. + // We have to do this manually as opposed to using LangChains `Chroma.fromDocuments` + // because we then cannot atomically control our namespace to granularly find/remove documents + // from vectordb. + const textSplitter = new RecursiveCharacterTextSplitter({ + chunkSize: 1000, + chunkOverlap: 20, + }); + const textChunks = await textSplitter.splitText(pageContent); + + console.log("Chunks created from document:", textChunks.length); + const documentVectors = []; + const vectors = []; + const openai = this.openai(); + + const submission = { + ids: [], + embeddings: [], + metadatas: [], + documents: [], + }; + + for (const textChunk of textChunks) { + const vectorValues = await this.embedChunk(openai, textChunk); + + if (!!vectorValues) { + const vectorRecord = { + id: uuidv4(), + values: vectorValues, + // [DO NOT REMOVE] + // LangChain will be unable to find your text if you embed manually and dont include the `text` key. + // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L64 + metadata: { ...metadata, text: textChunk }, + }; + + submission.ids.push(vectorRecord.id); + submission.embeddings.push(vectorRecord.values); + submission.metadatas.push(metadata); + submission.documents.push(textChunk); + + vectors.push(vectorRecord); + documentVectors.push({ docId, vectorId: vectorRecord.id }); + } else { + console.error( + "Could not use OpenAI to embed document chunk! This document will not be recorded." + ); + } + } + + const { client } = await this.connect(); + const collection = await client.getOrCreateCollection({ + name: namespace, + metadata: { "hnsw:space": "cosine" }, + embeddingFunction: this.embeddingFunc(), + }); + + if (vectors.length > 0) { + const chunks = []; + + console.log("Inserting vectorized chunks into Chroma collection."); + for (const chunk of toChunks(vectors, 500)) chunks.push(chunk); + + const additionResult = await collection.add(submission); + if (!additionResult) + throw new Error("Error embedding into ChromaDB", additionResult); + + await storeVectorResult(chunks, fullFilePath); + } + + await DocumentVectors.bulkInsert(documentVectors); + return true; + } catch (e) { + console.error("addDocumentToNamespace", e.message); + return false; + } + }, + deleteDocumentFromNamespace: async function (namespace, docId) { + const { DocumentVectors } = require("../../models/vectors"); + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) return; + const collection = await client.getCollection({ + name: namespace, + embeddingFunction: this.embeddingFunc(), + }); + + const knownDocuments = await DocumentVectors.where(`docId = '${docId}'`); + if (knownDocuments.length === 0) return; + + const vectorIds = knownDocuments.map((doc) => doc.vectorId); + await collection.delete({ ids: vectorIds }); + + const indexes = knownDocuments.map((doc) => doc.id); + await DocumentVectors.deleteIds(indexes); + return true; + }, + query: async function (reqBody = {}) { + const { namespace = null, input } = reqBody; + if (!namespace || !input) throw new Error("Invalid request body"); + + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) { + return { + response: null, + sources: [], + message: "Invalid query - no documents found for workspace!", + }; + } + + // const collection = await client.getCollection({ name: namespace, embeddingFunction: this.embeddingFunc() }) + // const results = await collection.get({ + // where: { + // description: 'a custom file uploaded by the user.' + // }, + // includes: ['ids'] + // }) + // console.log(results) + // return { response: null, sources: [], } + + const vectorStore = await ChromaStore.fromExistingCollection( + this.embedder(), + { collectionName: namespace, url: process.env.CHROMA_ENDPOINT } + ); + const model = this.llm(); + const chain = VectorDBQAChain.fromLLM(model, vectorStore, { + k: 5, + returnSourceDocuments: true, + }); + const response = await chain.call({ query: input }); + return { + response: response.text, + sources: curateSources(response.sourceDocuments), + message: false, + }; + }, + "namespace-stats": async function (reqBody = {}) { + const { namespace = null } = reqBody; + if (!namespace) throw new Error("namespace required"); + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) + throw new Error("Namespace by that name does not exist."); + const stats = await this.namespace(client, namespace); + return stats + ? stats + : { message: "No stats were able to be fetched from DB for namespace" }; + }, + "delete-namespace": async function (reqBody = {}) { + const { namespace = null } = reqBody; + const { client } = await this.connect(); + if (!(await this.namespaceExists(client, namespace))) + throw new Error("Namespace by that name does not exist."); + + const details = await this.namespace(client, namespace); + await this.deleteVectorsInNamespace(client, namespace); + return { + message: `Namespace ${namespace} was deleted along with ${details?.vectorCount} vectors.`, + }; + }, + reset: async function () { + const { client } = await this.connect(); + await client.reset(); + return { reset: true }; + }, +}; + +module.exports = { + Chroma, +}; diff --git a/server/utils/files/index.js b/server/utils/files/index.js index 8fc8a3258d..21021f004a 100644 --- a/server/utils/files/index.js +++ b/server/utils/files/index.js @@ -1,21 +1,24 @@ -const fs = require("fs") -const path = require('path'); -const { v5: uuidv5 } = require('uuid'); +const fs = require("fs"); +const path = require("path"); +const { v5: uuidv5 } = require("uuid"); async function collectDocumentData(folderName = null) { - if (!folderName) throw new Error('No docPath provided in request'); - const folder = path.resolve(__dirname, `../../documents/${folderName}`) + if (!folderName) throw new Error("No docPath provided in request"); + const folder = path.resolve(__dirname, `../../documents/${folderName}`); const dirExists = fs.existsSync(folder); - if (!dirExists) throw new Error(`No documents folder for ${folderName} - did you run collector/main.py for this element?`); + if (!dirExists) + throw new Error( + `No documents folder for ${folderName} - did you run collector/main.py for this element?` + ); const files = fs.readdirSync(folder); const fileData = []; - files.forEach(file => { - if (path.extname(file) === '.json') { + files.forEach((file) => { + if (path.extname(file) === ".json") { const filePath = path.join(folder, file); - const data = fs.readFileSync(filePath, 'utf8'); + const data = fs.readFileSync(filePath, "utf8"); console.log(`Parsing document: ${file}`); - fileData.push(JSON.parse(data)) + fileData.push(JSON.parse(data)); } }); return fileData; @@ -24,75 +27,78 @@ async function collectDocumentData(folderName = null) { // Should take in a folder that is a subfolder of documents // eg: youtube-subject/video-123.json async function fileData(filePath = null) { - if (!filePath) throw new Error('No docPath provided in request'); - const fullPath = path.resolve(__dirname, `../../documents/${filePath}`) + if (!filePath) throw new Error("No docPath provided in request"); + const fullPath = path.resolve(__dirname, `../../documents/${filePath}`); const fileExists = fs.existsSync(fullPath); if (!fileExists) return null; - const data = fs.readFileSync(fullPath, 'utf8'); - return JSON.parse(data) + const data = fs.readFileSync(fullPath, "utf8"); + return JSON.parse(data); } async function viewLocalFiles() { - const folder = path.resolve(__dirname, `../../documents`) + const folder = path.resolve(__dirname, `../../documents`); const dirExists = fs.existsSync(folder); - if (!dirExists) return {} + if (!dirExists) return {}; const directory = { name: "documents", type: "folder", items: [], - } + }; for (const file of fs.readdirSync(folder)) { - if (path.extname(file) === '.md') continue; - const folderPath = path.resolve(__dirname, `../../documents/${file}`) - const isFolder = fs.lstatSync(folderPath).isDirectory() + if (path.extname(file) === ".md") continue; + const folderPath = path.resolve(__dirname, `../../documents/${file}`); + const isFolder = fs.lstatSync(folderPath).isDirectory(); if (isFolder) { const subdocs = { name: file, type: "folder", items: [], - } + }; const subfiles = fs.readdirSync(folderPath); for (const subfile of subfiles) { - if (path.extname(subfile) !== '.json') continue; + if (path.extname(subfile) !== ".json") continue; const filePath = path.join(folderPath, subfile); - const rawData = fs.readFileSync(filePath, 'utf8'); - const cachefilename = `${file}/${subfile}` - const { pageContent, ...metadata } = JSON.parse(rawData) + const rawData = fs.readFileSync(filePath, "utf8"); + const cachefilename = `${file}/${subfile}`; + const { pageContent, ...metadata } = JSON.parse(rawData); subdocs.items.push({ name: subfile, type: "file", ...metadata, - cached: await cachedVectorInformation(cachefilename, true) - }) + cached: await cachedVectorInformation(cachefilename, true), + }); } - directory.items.push(subdocs) + directory.items.push(subdocs); } - }; + } - return directory + return directory; } // Searches the vector-cache folder for existing information so we dont have to re-embed a // document and can instead push directly to vector db. async function cachedVectorInformation(filename = null, checkOnly = false) { - if (!process.env.CACHE_VECTORS) return checkOnly ? false : { exists: false, chunks: [] }; + if (!process.env.CACHE_VECTORS) + return checkOnly ? false : { exists: false, chunks: [] }; if (!filename) return checkOnly ? false : { exists: false, chunks: [] }; const digest = uuidv5(filename, uuidv5.URL); const file = path.resolve(__dirname, `../../vector-cache/${digest}.json`); const exists = fs.existsSync(file); - if (checkOnly) return exists - if (!exists) return { exists, chunks: [] } + if (checkOnly) return exists; + if (!exists) return { exists, chunks: [] }; - console.log(`Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`) - const rawData = fs.readFileSync(file, 'utf8'); - return { exists: true, chunks: JSON.parse(rawData) } + console.log( + `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.` + ); + const rawData = fs.readFileSync(file, "utf8"); + return { exists: true, chunks: JSON.parse(rawData) }; } // vectorData: pre-chunked vectorized data for a given file that includes the proper metadata and chunk-size limit so it can be iterated and dumped into Pinecone, etc @@ -100,14 +106,16 @@ async function cachedVectorInformation(filename = null, checkOnly = false) { async function storeVectorResult(vectorData = [], filename = null) { if (!process.env.CACHE_VECTORS) return; if (!filename) return; - console.log(`Caching vectorized results of ${filename} to prevent duplicated embedding.`) + console.log( + `Caching vectorized results of ${filename} to prevent duplicated embedding.` + ); const folder = path.resolve(__dirname, `../../vector-cache`); if (!fs.existsSync(folder)) fs.mkdirSync(folder); const digest = uuidv5(filename, uuidv5.URL); const writeTo = path.resolve(folder, `${digest}.json`); - fs.writeFileSync(writeTo, JSON.stringify(vectorData), 'utf8'); + fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8"); return; } @@ -116,5 +124,5 @@ module.exports = { collectDocumentData, viewLocalFiles, storeVectorResult, - fileData -} \ No newline at end of file + fileData, +}; diff --git a/server/utils/helpers/index.js b/server/utils/helpers/index.js new file mode 100644 index 0000000000..aa4cf3e89d --- /dev/null +++ b/server/utils/helpers/index.js @@ -0,0 +1,18 @@ +const { Pinecone } = require("../pinecone"); +const { Chroma } = require("../chroma"); + +function getVectorDbClass() { + const vectorSelection = process.env.VECTOR_DB || "pinecone"; + switch (vectorSelection) { + case "pinecone": + return Pinecone; + case "chroma": + return Chroma; + default: + return Pinecone; + } +} + +module.exports = { + getVectorDbClass, +}; diff --git a/server/utils/http/index.js b/server/utils/http/index.js index 719d96c216..e4f2813d47 100644 --- a/server/utils/http/index.js +++ b/server/utils/http/index.js @@ -1,5 +1,5 @@ function reqBody(request) { - return typeof request.body === 'string' + return typeof request.body === "string" ? JSON.parse(request.body) : request.body; } diff --git a/server/utils/middleware/validatedRequest.js b/server/utils/middleware/validatedRequest.js index ab93f5d07e..a1d9bf3b6c 100644 --- a/server/utils/middleware/validatedRequest.js +++ b/server/utils/middleware/validatedRequest.js @@ -1,30 +1,30 @@ function validatedRequest(request, response, next) { // When in development passthrough auth token for ease of development. - if (process.env.NODE_ENV === 'development' || !process.env.AUTH_TOKEN) { + if (process.env.NODE_ENV === "development" || !process.env.AUTH_TOKEN) { next(); return; } if (!process.env.AUTH_TOKEN) { response.status(403).json({ - error: "You need to set an AUTH_TOKEN environment variable." + error: "You need to set an AUTH_TOKEN environment variable.", }); return; } - const auth = request.header('Authorization'); - const token = auth ? auth.split(' ')[1] : null; + const auth = request.header("Authorization"); + const token = auth ? auth.split(" ")[1] : null; if (!token) { response.status(403).json({ - error: "No auth token found." + error: "No auth token found.", }); return; } if (token !== process.env.AUTH_TOKEN) { response.status(403).json({ - error: "Invalid auth token found." + error: "Invalid auth token found.", }); return; } @@ -34,4 +34,4 @@ function validatedRequest(request, response, next) { module.exports = { validatedRequest, -}; \ No newline at end of file +}; diff --git a/server/utils/openAi/index.js b/server/utils/openAi/index.js index 8103ea839a..a1629e2f90 100644 --- a/server/utils/openAi/index.js +++ b/server/utils/openAi/index.js @@ -1,64 +1,76 @@ -const { Configuration, OpenAIApi } = require('openai') +const { Configuration, OpenAIApi } = require("openai"); class OpenAi { constructor() { - const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY, organization: 'org-amIuvAIIcdUmN5YCiwRayVfb' }) + const config = new Configuration({ + apiKey: process.env.OPEN_AI_KEY, + organization: "org-amIuvAIIcdUmN5YCiwRayVfb", + }); const openai = new OpenAIApi(config); - this.openai = openai + this.openai = openai; } - isValidChatModel(modelName = '') { - const validModels = ['gpt-4', 'gpt-3.5-turbo'] - return validModels.includes(modelName) + isValidChatModel(modelName = "") { + const validModels = ["gpt-4", "gpt-3.5-turbo"]; + return validModels.includes(modelName); } - async isSafe(input = '') { - const { flagged = false, categories = {} } = await this.openai.createModeration({ input }) + async isSafe(input = "") { + const { flagged = false, categories = {} } = await this.openai + .createModeration({ input }) .then((json) => { const res = json.data; - if (!res.hasOwnProperty('results')) throw new Error('OpenAI moderation: No results!'); - if (res.results.length === 0) throw new Error('OpenAI moderation: No results length!'); - return res.results[0] - }) + if (!res.hasOwnProperty("results")) + throw new Error("OpenAI moderation: No results!"); + if (res.results.length === 0) + throw new Error("OpenAI moderation: No results length!"); + return res.results[0]; + }); if (!flagged) return { safe: true, reasons: [] }; - const reasons = Object.keys(categories).map((category) => { - const value = categories[category] - if (value === true) { - return category.replace('/', ' or '); - } else { - return null; - } - }).filter((reason) => !!reason) + const reasons = Object.keys(categories) + .map((category) => { + const value = categories[category]; + if (value === true) { + return category.replace("/", " or "); + } else { + return null; + } + }) + .filter((reason) => !!reason); - return { safe: false, reasons } + return { safe: false, reasons }; } async sendChat(chatHistory = [], prompt) { - const model = process.env.OPEN_MODEL_PREF - if (!this.isValidChatModel(model)) throw new Error(`OpenAI chat: ${model} is not valid for chat completion!`); + const model = process.env.OPEN_MODEL_PREF; + if (!this.isValidChatModel(model)) + throw new Error( + `OpenAI chat: ${model} is not valid for chat completion!` + ); - const textResponse = await this.openai.createChatCompletion({ - model, - temperature: 0.7, - n: 1, - messages: [ - { role: 'system', content: '' }, - ...chatHistory, - { role: 'user', content: prompt }, - ] - }) - .then((json) => { - const res = json.data - if (!res.hasOwnProperty('choices')) throw new Error('OpenAI chat: No results!'); - if (res.choices.length === 0) throw new Error('OpenAI chat: No results length!'); - return res.choices[0].message.content + const textResponse = await this.openai + .createChatCompletion({ + model, + temperature: 0.7, + n: 1, + messages: [ + { role: "system", content: "" }, + ...chatHistory, + { role: "user", content: prompt }, + ], }) + .then((json) => { + const res = json.data; + if (!res.hasOwnProperty("choices")) + throw new Error("OpenAI chat: No results!"); + if (res.choices.length === 0) + throw new Error("OpenAI chat: No results length!"); + return res.choices[0].message.content; + }); - return textResponse + return textResponse; } } module.exports = { OpenAi, }; - - diff --git a/server/utils/pinecone/index.js b/server/utils/pinecone/index.js index 4c3d353344..be217854e8 100644 --- a/server/utils/pinecone/index.js +++ b/server/utils/pinecone/index.js @@ -1,30 +1,41 @@ const { PineconeClient } = require("@pinecone-database/pinecone"); const { PineconeStore } = require("langchain/vectorstores/pinecone"); const { OpenAI } = require("langchain/llms/openai"); -const { ChatOpenAI } = require('langchain/chat_models/openai'); -const { VectorDBQAChain, LLMChain, RetrievalQAChain, ConversationalRetrievalQAChain } = require("langchain/chains"); +const { ChatOpenAI } = require("langchain/chat_models/openai"); +const { + VectorDBQAChain, + LLMChain, + RetrievalQAChain, + ConversationalRetrievalQAChain, +} = require("langchain/chains"); const { OpenAIEmbeddings } = require("langchain/embeddings/openai"); -const { VectorStoreRetrieverMemory, BufferMemory } = require("langchain/memory"); +const { + VectorStoreRetrieverMemory, + BufferMemory, +} = require("langchain/memory"); const { PromptTemplate } = require("langchain/prompts"); const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); -const { storeVectorResult, cachedVectorInformation } = require('../files'); -const { Configuration, OpenAIApi } = require('openai') -const { v4: uuidv4 } = require('uuid'); +const { storeVectorResult, cachedVectorInformation } = require("../files"); +const { Configuration, OpenAIApi } = require("openai"); +const { v4: uuidv4 } = require("uuid"); const toChunks = (arr, size) => { return Array.from({ length: Math.ceil(arr.length / size) }, (_v, i) => arr.slice(i * size, i * size + size) ); -} +}; function curateSources(sources = []) { const knownDocs = []; - const documents = [] + const documents = []; for (const source of sources) { - const { metadata = {} } = source - if (Object.keys(metadata).length > 0 && !knownDocs.includes(metadata.title)) { - documents.push({ ...metadata }) - knownDocs.push(metadata.title) + const { metadata = {} } = source; + if ( + Object.keys(metadata).length > 0 && + !knownDocs.includes(metadata.title) + ) { + documents.push({ ...metadata }); + knownDocs.push(metadata.title); } } @@ -32,6 +43,7 @@ function curateSources(sources = []) { } const Pinecone = { + name: 'Pinecone', connect: async function () { const client = new PineconeClient(); await client.init({ @@ -39,91 +51,112 @@ const Pinecone = { environment: process.env.PINECONE_ENVIRONMENT, }); const pineconeIndex = client.Index(process.env.PINECONE_INDEX); - const { status } = await client.describeIndex({ indexName: process.env.PINECONE_INDEX }); + const { status } = await client.describeIndex({ + indexName: process.env.PINECONE_INDEX, + }); - if (!status.ready) throw new Error("Pinecode::Index not ready.") + if (!status.ready) throw new Error("Pinecode::Index not ready."); return { client, pineconeIndex, indexName: process.env.PINECONE_INDEX }; }, embedder: function () { return new OpenAIEmbeddings({ openAIApiKey: process.env.OPEN_AI_KEY }); }, openai: function () { - const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY }) + const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY }); const openai = new OpenAIApi(config); - return openai + return openai; }, embedChunk: async function (openai, textChunk) { - const { data: { data } } = await openai.createEmbedding({ - model: 'text-embedding-ada-002', - input: textChunk - }) - return data.length > 0 && data[0].hasOwnProperty('embedding') ? data[0].embedding : null + const { + data: { data }, + } = await openai.createEmbedding({ + model: "text-embedding-ada-002", + input: textChunk, + }); + return data.length > 0 && data[0].hasOwnProperty("embedding") + ? data[0].embedding + : null; }, llm: function () { - const model = process.env.OPEN_MODEL_PREF || 'gpt-3.5-turbo' - return new OpenAI({ openAIApiKey: process.env.OPEN_AI_KEY, temperature: 0.7, modelName: model }); + const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo"; + return new OpenAI({ + openAIApiKey: process.env.OPEN_AI_KEY, + temperature: 0.7, + modelName: model, + }); }, chatLLM: function () { - const model = process.env.OPEN_MODEL_PREF || 'gpt-3.5-turbo' - return new ChatOpenAI({ openAIApiKey: process.env.OPEN_AI_KEY, temperature: 0.7, modelName: model }); + const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo"; + return new ChatOpenAI({ + openAIApiKey: process.env.OPEN_AI_KEY, + temperature: 0.7, + modelName: model, + }); }, totalIndicies: async function () { const { pineconeIndex } = await this.connect(); const { namespaces } = await pineconeIndex.describeIndexStats1(); - return Object.values(namespaces).reduce((a, b) => a + (b?.vectorCount || 0), 0) + return Object.values(namespaces).reduce( + (a, b) => a + (b?.vectorCount || 0), + 0 + ); }, namespace: async function (index, namespace = null) { if (!namespace) throw new Error("No namespace value provided."); const { namespaces } = await index.describeIndexStats1(); - return namespaces.hasOwnProperty(namespace) ? namespaces[namespace] : null + return namespaces.hasOwnProperty(namespace) ? namespaces[namespace] : null; }, hasNamespace: async function (namespace = null) { if (!namespace) return false; const { pineconeIndex } = await this.connect(); - return await this.namespaceExists(pineconeIndex, namespace) + return await this.namespaceExists(pineconeIndex, namespace); }, namespaceExists: async function (index, namespace = null) { if (!namespace) throw new Error("No namespace value provided."); const { namespaces } = await index.describeIndexStats1(); - return namespaces.hasOwnProperty(namespace) + return namespaces.hasOwnProperty(namespace); }, deleteVectorsInNamespace: async function (index, namespace = null) { - await index.delete1({ namespace, deleteAll: true }) - return true + await index.delete1({ namespace, deleteAll: true }); + return true; }, - addDocumentToNamespace: async function (namespace, documentData = {}, fullFilePath = null) { + addDocumentToNamespace: async function ( + namespace, + documentData = {}, + fullFilePath = null + ) { const { DocumentVectors } = require("../../models/vectors"); try { - const { pageContent, docId, ...metadata } = documentData + const { pageContent, docId, ...metadata } = documentData; if (!pageContent || pageContent.length == 0) return false; console.log("Adding new vectorized document into namespace", namespace); - const cacheResult = await cachedVectorInformation(fullFilePath) + const cacheResult = await cachedVectorInformation(fullFilePath); if (cacheResult.exists) { const { pineconeIndex } = await this.connect(); - const { chunks } = cacheResult - const documentVectors = [] + const { chunks } = cacheResult; + const documentVectors = []; for (const chunk of chunks) { // Before sending to Pinecone and saving the records to our db // we need to assign the id of each chunk that is stored in the cached file. const newChunks = chunk.map((chunk) => { - const id = uuidv4() + const id = uuidv4(); documentVectors.push({ docId, vectorId: id }); - return { ...chunk, id } - }) + return { ...chunk, id }; + }); // Push chunks with new ids to pinecone. await pineconeIndex.upsert({ upsertRequest: { vectors: [...newChunks], namespace, - } - }) + }, + }); } - await DocumentVectors.bulkInsert(documentVectors) - return true + await DocumentVectors.bulkInsert(documentVectors); + return true; } // If we are here then we are going to embed and store a novel document. @@ -131,13 +164,16 @@ const Pinecone = { // because we then cannot atomically control our namespace to granularly find/remove documents // from vectordb. // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L167 - const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 20 }); - const textChunks = await textSplitter.splitText(pageContent) + const textSplitter = new RecursiveCharacterTextSplitter({ + chunkSize: 1000, + chunkOverlap: 20, + }); + const textChunks = await textSplitter.splitText(pageContent); - console.log('Chunks created from document:', textChunks.length) - const documentVectors = [] - const vectors = [] - const openai = this.openai() + console.log("Chunks created from document:", textChunks.length); + const documentVectors = []; + const vectors = []; + const openai = this.openai(); for (const textChunk of textChunks) { const vectorValues = await this.embedChunk(openai, textChunk); @@ -149,87 +185,97 @@ const Pinecone = { // LangChain will be unable to find your text if you embed manually and dont include the `text` key. // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L64 metadata: { ...metadata, text: textChunk }, - } + }; vectors.push(vectorRecord); documentVectors.push({ docId, vectorId: vectorRecord.id }); } else { - console.error('Could not use OpenAI to embed document chunk! This document will not be recorded.') + console.error( + "Could not use OpenAI to embed document chunk! This document will not be recorded." + ); } } if (vectors.length > 0) { - const chunks = [] + const chunks = []; const { pineconeIndex } = await this.connect(); - console.log('Inserting vectorized chunks into Pinecone.') + console.log("Inserting vectorized chunks into Pinecone."); for (const chunk of toChunks(vectors, 100)) { - chunks.push(chunk) + chunks.push(chunk); await pineconeIndex.upsert({ upsertRequest: { vectors: [...chunk], namespace, - } - }) + }, + }); } - await storeVectorResult(chunks, fullFilePath) + await storeVectorResult(chunks, fullFilePath); } - await DocumentVectors.bulkInsert(documentVectors) + await DocumentVectors.bulkInsert(documentVectors); return true; } catch (e) { - console.error('addDocumentToNamespace', e.message) + console.error("addDocumentToNamespace", e.message); return false; } }, deleteDocumentFromNamespace: async function (namespace, docId) { const { DocumentVectors } = require("../../models/vectors"); const { pineconeIndex } = await this.connect(); - if (!await this.namespaceExists(pineconeIndex, namespace)) return; + if (!(await this.namespaceExists(pineconeIndex, namespace))) return; - const knownDocuments = await DocumentVectors.where(`docId = '${docId}'`) + const knownDocuments = await DocumentVectors.where(`docId = '${docId}'`); if (knownDocuments.length === 0) return; const vectorIds = knownDocuments.map((doc) => doc.vectorId); await pineconeIndex.delete1({ ids: vectorIds, namespace, - }) + }); const indexes = knownDocuments.map((doc) => doc.id); - await DocumentVectors.deleteIds(indexes) + await DocumentVectors.deleteIds(indexes); return true; }, - 'namespace-stats': async function (reqBody = {}) { - const { namespace = null } = reqBody + "namespace-stats": async function (reqBody = {}) { + const { namespace = null } = reqBody; if (!namespace) throw new Error("namespace required"); const { pineconeIndex } = await this.connect(); - if (!await this.namespaceExists(pineconeIndex, namespace)) throw new Error('Namespace by that name does not exist.'); - const stats = await this.namespace(pineconeIndex, namespace) - return stats ? stats : { message: 'No stats were able to be fetched from DB' } + if (!(await this.namespaceExists(pineconeIndex, namespace))) + throw new Error("Namespace by that name does not exist."); + const stats = await this.namespace(pineconeIndex, namespace); + return stats + ? stats + : { message: "No stats were able to be fetched from DB" }; }, - 'delete-namespace': async function (reqBody = {}) { - const { namespace = null } = reqBody + "delete-namespace": async function (reqBody = {}) { + const { namespace = null } = reqBody; const { pineconeIndex } = await this.connect(); - if (!await this.namespaceExists(pineconeIndex, namespace)) throw new Error('Namespace by that name does not exist.'); + if (!(await this.namespaceExists(pineconeIndex, namespace))) + throw new Error("Namespace by that name does not exist."); const details = await this.namespace(pineconeIndex, namespace); await this.deleteVectorsInNamespace(pineconeIndex, namespace); - return { message: `Namespace ${namespace} was deleted along with ${details.vectorCount} vectors.` } + return { + message: `Namespace ${namespace} was deleted along with ${details.vectorCount} vectors.`, + }; }, query: async function (reqBody = {}) { const { namespace = null, input } = reqBody; if (!namespace || !input) throw new Error("Invalid request body"); const { pineconeIndex } = await this.connect(); - if (!await this.namespaceExists(pineconeIndex, namespace)) { + if (!(await this.namespaceExists(pineconeIndex, namespace))) { return { - response: null, sources: [], message: 'Invalid query - no documents found for workspace!' - } + response: null, + sources: [], + message: "Invalid query - no documents found for workspace!", + }; } - const vectorStore = await PineconeStore.fromExistingIndex( - this.embedder(), - { pineconeIndex, namespace } - ); + const vectorStore = await PineconeStore.fromExistingIndex(this.embedder(), { + pineconeIndex, + namespace, + }); const model = this.llm(); const chain = VectorDBQAChain.fromLLM(model, vectorStore, { @@ -237,7 +283,11 @@ const Pinecone = { returnSourceDocuments: true, }); const response = await chain.call({ query: input }); - return { response: response.text, sources: curateSources(response.sourceDocuments), message: false } + return { + response: response.text, + sources: curateSources(response.sourceDocuments), + message: false, + }; }, // This implementation of chat also expands the memory of the chat itself // and adds more tokens to the PineconeDB instance namespace @@ -246,12 +296,15 @@ const Pinecone = { if (!namespace || !input) throw new Error("Invalid request body"); const { pineconeIndex } = await this.connect(); - if (!await this.namespaceExists(pineconeIndex, namespace)) throw new Error("Invalid namespace - has it been collected and seeded yet?"); + if (!(await this.namespaceExists(pineconeIndex, namespace))) + throw new Error( + "Invalid namespace - has it been collected and seeded yet?" + ); - const vectorStore = await PineconeStore.fromExistingIndex( - this.embedder(), - { pineconeIndex, namespace } - ); + const vectorStore = await PineconeStore.fromExistingIndex(this.embedder(), { + pineconeIndex, + namespace, + }); const memory = new VectorStoreRetrieverMemory({ vectorStoreRetriever: vectorStore.asRetriever(1), @@ -270,10 +323,10 @@ const Pinecone = { const chain = new LLMChain({ llm: model, prompt, memory }); const response = await chain.call({ input }); - return { response: response.text, sources: [], message: false } + return { response: response.text, sources: [], message: false }; }, -} +}; module.exports = { - Pinecone -} \ No newline at end of file + Pinecone, +};