In [1]:
from dotenv import load_dotenv
load_dotenv()

from os import getenv
from json import dump
import sqlite3
import numpy as np
from openai import OpenAI
import faiss

In [2]:
# OpenAI API client; search_index() uses it to request query embeddings.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment populated by load_dotenv() in the imports cell — confirm.
client = OpenAI()

In [3]:
# Location of the SQLite database that holds the `documents` table queried below.
DB_PATH = "./output/sparse_index.db"
# check_same_thread=False allows this connection to be used from threads other
# than the one that created it.
# NOTE(review): sqlite3.connect() creates an empty database file when the path
# does not exist, in which case the SELECT in the next cell fails with
# "no such table" rather than a missing-file error.
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
# Module-level cursor reused by the following cell.
cursor = conn.cursor()

In [4]:
# Fetch every (id, filename) row from the documents table in one go.
documents = cursor.execute("SELECT id, filename FROM documents").fetchall()

# Lookup table: document id -> filename (used by search_index to label hits).
indexDocumentMap = dict(documents)

In [5]:
# Load the FAISS index from disk; search_index() queries it.
index = faiss.read_index("output/dense_index.faiss")
# Display how many vectors the loaded index contains.
index.ntotal

4942142

In [6]:
def search_index(query, k):
	"""Embed `query` and return up to `k` distinct documents from the FAISS index.

	The query is embedded with the "text-embedding-3-large" model, the index is
	searched for `k * 4` nearest vectors, and the hits are collapsed so that each
	document appears once (its first, i.e. best-ranked, hit wins).

	Args:
		query: Text to search for.
		k: Maximum number of distinct documents to return.

	Returns:
		A list of at most `k` `(document, score)` tuples in the order FAISS
		returned them. `score` is `1 / (distance + 1e-8)`, so a smaller distance
		gives a larger score. Fewer than `k` tuples are returned when the
		over-fetched hits contain fewer than `k` distinct documents. An empty
		list is returned when `k <= 0`.

	Raises:
		KeyError: If the index returns an identifier that is not a key of
			`indexDocumentMap`.
	"""
	# Nothing to return; also avoids a paid embedding request and an invalid
	# FAISS search size.
	if k <= 0:
		return []

	response = client.embeddings.create(
		input=query,
		model="text-embedding-3-large"
	)
	# FAISS expects a 2-D float32 array of shape (n_queries, dim).
	embedding = np.array(response.data[0].embedding, dtype=np.float32).reshape(1, -1)

	# Over-fetch by 4x because several hits may map to the same document.
	distances, identifiers = index.search(embedding, k * 4)

	seen = set()
	results = []
	for distance, identifier in zip(distances[0], identifiers[0]):
		# FAISS pads the result with label -1 when it finds fewer hits than
		# requested; those slots do not refer to any document.
		if identifier < 0:
			continue

		document = indexDocumentMap[int(identifier)]
		if document in seen:
			continue
		seen.add(document)

		# Small epsilon guards against division by zero on an exact match.
		results.append((document, float(1 / (distance + 0.00000001))))
		if len(results) == k:
			break

	return results

In [7]:
# Smoke test: top 5 documents for a sample query.
# The second tuple element is the inverse-distance score from search_index
# (larger means closer), not a raw distance.
results = search_index("state of the art image segmentation", k=5)
for name, score in results:
    print(f"{name}: {score:.4f}")

Tt9GSaDaHyVLN1oMDrnj0A: 2.2462
UXDJP0bThAmEqOhRYGpJvw: 2.0511
kMaAZU7oMBrunMCHZqfVFQ: 2.0508
tg-pd-uYVcy0a4iZFF4_Gg: 1.7629
xxe1EcEz25CACxXxd36mYQ: 1.6825
