In [1]:
from dotenv import load_dotenv
load_dotenv()

from os import getenv
import numpy as np
from openai import OpenAI
import faiss
from supabase import create_client, Client

In [2]:
# Supabase client built from the SUPABASE_URL / SUPABASE_KEY environment
# variables (load_dotenv() has already run in the first cell).
supabase = create_client(
	getenv("SUPABASE_URL"),
	getenv("SUPABASE_KEY"),
)

In [3]:
# OpenAI API client; search_index uses it to embed query text.
# NOTE(review): constructed with no arguments — presumably credentials come
# from the environment populated by load_dotenv(); confirm.
client = OpenAI()

In [4]:
table = supabase.table("documents")

# Rows requested per round trip. The server may cap a response below this,
# which is why the loop advances by the number of rows actually received.
PAGE_SIZE = 1000

documents = []
offset = 0
while True:
	# range() bounds are inclusive on both ends, so a PAGE_SIZE-row window
	# ends at offset + PAGE_SIZE - 1 (offset + PAGE_SIZE would ask for one row
	# too many and re-fetch it on the next page). Ordering by "id" keeps the
	# windows stable between requests; without an explicit order the rows a
	# given range maps to are not guaranteed to be the same on every call.
	response = (
		table.select("*")
		.order("id")
		.range(offset, offset + PAGE_SIZE - 1)
		.execute()
	)
	if not response.data:
		break

	documents.extend(response.data)
	offset += len(response.data)

# Lookup from a row's "id" to its "document" value; search_index resolves the
# identifiers returned by the FAISS index through this map.
indexDocumentMap = {row["id"]: row["document"] for row in documents}

In [5]:
# Load the FAISS index from disk; search_index queries this object.
index = faiss.read_index("output/dense_index.faiss")
# Last expression of the cell: displays the index's ntotal as a quick sanity
# check on what was loaded.
index.ntotal

4942142

In [6]:
def search_index(query, k):
	"""Embed ``query`` and print its nearest neighbours from the FAISS index.

	Args:
		query: Text sent to the OpenAI embeddings endpoint.
		k: Number of neighbours requested from ``index.search``.

	Returns:
		None. One ``<document>: <distance>`` line is printed per hit, in the
		order ``index.search`` returns them. Fewer than ``k`` lines are
		printed when the index cannot supply ``k`` neighbours. The distance is
		whatever score the index reports (its metric depends on how the index
		was built).
	"""
	response = client.embeddings.create(
		input=query,
		model="text-embedding-3-large"
	)
	# index.search takes a 2-D float32 array with one row per query vector.
	embedding = np.array(response.data[0].embedding, dtype=np.float32).reshape(1, -1)

	distances, identifiers = index.search(embedding, k)

	for identifier, neighbor_distance in zip(identifiers[0], distances[0]):
		# Convert the numpy integer to a plain int for the dict lookup and message.
		identifier = int(identifier)
		if identifier < 0:
			# FAISS fills unused result slots with -1 when it finds fewer than
			# k neighbours; those slots are padding, not real hits.
			continue
		# An id present in the index but absent from the fetched table would
		# otherwise abort the whole listing with a KeyError.
		document = indexDocumentMap.get(identifier, f"<no document for id {identifier}>")
		print(f"{document}: {neighbor_distance:.4f}")

# Try the search with a sample query, requesting the 5 nearest neighbours.
search_index("state of the art image segmentation", k=5)

Tt9GSaDaHyVLN1oMDrnj0A: 0.4451
kMaAZU7oMBrunMCHZqfVFQ: 0.4870
UXDJP0bThAmEqOhRYGpJvw: 0.4875
tg-pd-uYVcy0a4iZFF4_Gg: 0.5673
t3fWAVVd7Q_9TQAz4RjuIw: 0.5948
