In [1]:
from dotenv import load_dotenv
load_dotenv()

from os import getenv
from os import makedirs
from random import Random
from random import random
import re
from glob import glob

from orjson import loads
from tqdm import tqdm
import numpy as np
import faiss
from supabase import create_client, Client

In [2]:
# Supabase client; URL and key are read from the process environment.
supabase: Client = create_client(
	getenv("SUPABASE_URL"),
	getenv("SUPABASE_KEY"),
)

In [3]:
# Length of each embedding vector and number of inverted lists in the IVF index.
EMBEDDING_DIM = 3072
NLIST = 1024

# Flat L2 index handed to the IVF index as its coarse quantizer.
quantizer = faiss.IndexFlatL2(EMBEDDING_DIM)

# IVF index configured with the 8-bit scalar quantizer type.
ivf_index = faiss.IndexIVFScalarQuantizer(quantizer, EMBEDDING_DIM, NLIST, faiss.ScalarQuantizer.QT_8bit)

# ID-map wrapper so vectors can later be added with explicit ids.
index = faiss.IndexIDMap(ivf_index)

In [4]:
# Every JSONL file in the embeddings folder, in sorted path order.
files = glob("/Volumes/Vault/OpenAI Embeddings/*.jsonl")
files.sort()
len(files)

99

In [5]:
# Download every row of the "documents" table, one page per request.
PAGE_SIZE = 1000

table = supabase.table("documents")

documents = []
offset = 0
while True:
	# range() bounds are inclusive at both ends, so a page of PAGE_SIZE rows
	# ends at offset + PAGE_SIZE - 1. Ordering by "id" keeps the page
	# boundaries stable from one request to the next.
	response = (
		table.select("*")
		.order("id")
		.range(offset, offset + PAGE_SIZE - 1)
		.execute()
	)
	if not response.data:
		break

	documents.extend(response.data)
	# Advance by the rows actually received so nothing is skipped if the
	# server returns fewer rows than requested.
	offset += len(response.data)

# Lookup from each row's "document" value to its "id".
documentIndexMap = {row["document"]: row["id"] for row in documents}

In [6]:
def train_index(skip_probability=0.7, seed=None):
	"""Train the global FAISS ``index`` on a random sample of the stored embeddings.

	Every line of every path in ``files`` is visited. Each line is skipped with
	probability ``skip_probability``; from each remaining line the first
	embedding of the response is collected, and the collected vectors are
	passed to ``index.train``.

	Args:
		skip_probability: Chance in [0, 1] that a line is left out of the
			sample. The default of 0.7 keeps roughly 30% of the lines.
		seed: Optional seed for a private random generator, making the sample
			repeatable. ``None`` draws from the shared module-level generator.

	Raises:
		ValueError: If no embedding was sampled, so there is nothing to train on.
	"""
	draw = random if seed is None else Random(seed).random
	embeddings = []

	for filename in tqdm(files):
		# Read bytes: orjson parses them directly, so the result does not
		# depend on the platform's default text encoding.
		with open(filename, "rb") as f:
			for line in f:
				if draw() < skip_probability:
					continue

				data = loads(line)
				body = data["response"]["body"]
				# Responses without a "data" entry carry no embedding
				# (presumably failed requests - TODO confirm).
				if "data" not in body:
					continue

				embedding = np.array(body["data"][0]["embedding"], dtype=np.float32)
				embeddings.append(embedding)

	if not embeddings:
		raise ValueError("no embeddings were sampled; cannot train the index")

	index.train(np.array(embeddings, dtype=np.float32))

train_index()

100%|██████████| 99/99 [04:28<00:00,  2.71s/it]


In [7]:
def process_dense_vectors(filename):
	"""Add the embeddings stored in one JSONL file to the global FAISS ``index``.

	For each line that has a "data" entry in its response body, the first
	embedding is added under the id that ``documentIndexMap`` holds for the
	line's ``custom_id`` with its trailing ``_<digits>_<digits>`` suffix removed.
	Because the suffix is stripped, several vectors can be stored under the
	same id.

	Args:
		filename: Path of the JSONL file to read.

	Raises:
		KeyError: If a stripped ``custom_id`` is not a key of ``documentIndexMap``.
	"""
	suffix = re.compile(r'_\d+_\d+$')
	identifiers = []
	embeddings = []

	# Read bytes: orjson parses them directly, so the result does not depend
	# on the platform's default text encoding.
	with open(filename, "rb") as f:
		for line in f:
			data = loads(line)
			body = data["response"]["body"]
			# Responses without a "data" entry carry no embedding.
			if "data" not in body:
				continue

			identifiers.append(documentIndexMap[suffix.sub('', data["custom_id"])])
			embeddings.append(np.array(body["data"][0]["embedding"], dtype=np.float32))

	# An empty list would become a 1-D array, which is not a valid
	# (n, dim) batch; a file with no usable rows simply adds nothing.
	if not embeddings:
		return

	index.add_with_ids(
		np.array(embeddings, dtype=np.float32),
		np.array(identifiers, dtype=np.int64),
	)

# NOTE: every run of this loop adds all vectors to ``index`` again, so
# execute it once per freshly trained index.
for filename in tqdm(files):
	process_dense_vectors(filename)

index.ntotal

100%|██████████| 99/99 [13:08<00:00,  7.97s/it]


4942142

In [8]:
# Create the output folder first so the save cannot fail on a missing
# directory after the long training and indexing steps.
OUTPUT_DIR = "output"
makedirs(OUTPUT_DIR, exist_ok=True)
faiss.write_index(index, f"{OUTPUT_DIR}/dense_index.faiss")