In [26]:
from endee import Endee, Precision

import os

# Index settings, named once so the create and get calls below cannot drift apart.
INDEX_NAME = "regulatory_docs"
EMBEDDING_DIM = 384  # presumably the output size of the embedding model — TODO confirm

# Read the API key from the environment so a real credential never has to be
# committed with the notebook. "ENDEE_API_KEY" is a variable name chosen here;
# the original placeholder literal is kept as the fallback.
client = Endee(os.environ.get("ENDEE_API_KEY", "apikey"))

# NOTE(review): re-running this cell calls create_index again for the same name;
# confirm how the SDK behaves when the index already exists.
client.create_index(
    name=INDEX_NAME,
    dimension=EMBEDDING_DIM,
    space_type="cosine",
    precision=Precision.INT8D
)

index = client.get_index(name=INDEX_NAME)


In [27]:
import json

# Path of the JSON file holding the chunk records to upload.
EMBEDDINGS_FILE = "chunks_with_embeddings.json"

with open(EMBEDDINGS_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# Validate the overall shape first; an empty list is rejected explicitly so the
# per-record checks below never index into nothing.
assert isinstance(data, list), f"{EMBEDDINGS_FILE} must be a list"
assert data, f"{EMBEDDINGS_FILE} contains no records"

# Check every record, not just the first, so a malformed entry is reported here
# with its position rather than failing later while building upsert items.
for _pos, _record in enumerate(data):
    assert isinstance(_record, dict), f"Record {_pos} is not a JSON object"
    assert "embedding" in _record, f"Missing embedding field (record {_pos})"
    assert "metadata" in _record, f"Missing metadata field (record {_pos})"


In [None]:
def build_upsert_items(data):
    """Convert loaded chunk records into the item dicts handed to ``index.upsert``.

    Args:
        data: Iterable of dicts, each providing ``"metadata"`` (a dict),
            ``"embedding"`` and ``"text"``.

    Returns:
        A list with one dict per record, containing:
        ``"id"`` (``"chunk_<position>"``), ``"vector"`` (the record's embedding),
        ``"meta"`` (the metadata plus the stripped text under ``"text"``) and
        ``"filter"`` (``clause_id``, ``document`` and ``section`` taken from the
        metadata; a missing key yields ``None``).

    Raises:
        ValueError: If a record's text is missing, ``None`` or only whitespace.
        KeyError: If a record has no ``"metadata"`` or ``"embedding"`` entry.
    """
    items = []

    for i, item in enumerate(data):
        metadata = item["metadata"]
        # `or ""` covers an explicit null in the JSON ("text": None), which would
        # otherwise crash on .strip() instead of raising the ValueError below.
        text = (item.get("text") or "").strip()

        if not text:
            raise ValueError(f"Chunk {i} has empty text — fix chunking before insert")

        vector_item = {
            "id": f"chunk_{i}",
            "vector": item["embedding"],
            "meta": {
                **metadata,
                # Placed after the unpacking so the stripped chunk text wins over
                # any "text" key already present in the metadata.
                "text": text
            },
            "filter": {
                "clause_id": metadata.get("clause_id"),
                "document": metadata.get("source"),
                "section": metadata.get("section")
            }
        }

        items.append(vector_item)

    return items


In [None]:
# Default number of items sent to the index per upsert call.
UPSERT_BATCH_SIZE = 100


def batched_upsert(index, items, batch_size=UPSERT_BATCH_SIZE):
    """Upload ``items`` to ``index`` in consecutive slices of ``batch_size``.

    Prints one progress line per batch before calling ``index.upsert`` on it.
    An empty ``items`` sequence results in no calls and no output.

    Args:
        index: Object exposing an ``upsert(list_of_items)`` method.
        items: Sliceable sequence of items to upload.
        batch_size: Maximum number of items per ``upsert`` call; defaults to
            ``UPSERT_BATCH_SIZE``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Without this check a
            zero would fail with ZeroDivisionError and a negative value would
            silently upload nothing.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total = len(items)
    batches = (total + batch_size - 1) // batch_size  # ceiling division

    for batch_no, start in enumerate(range(0, total, batch_size), start=1):
        end = min(start + batch_size, total)

        print(f"⬆️ Upserting batch {batch_no}/{batches} ({start}–{end})")

        index.upsert(items[start:end])


In [None]:
# Turn the loaded chunk records into upsert-ready items.
items = build_upsert_items(data)

# Upload them to the index in batches of UPSERT_BATCH_SIZE.
batched_upsert(index, items, UPSERT_BATCH_SIZE)

# Report how many items were built and handed to the uploader.
print(f"\n Inserted {len(items)} vectors into Endee")


⬆️ Upserting batch 1/5 (0–100)
⬆️ Upserting batch 2/5 (100–200)
⬆️ Upserting batch 3/5 (200–300)
⬆️ Upserting batch 4/5 (300–400)
⬆️ Upserting batch 5/5 (400–415)

✅ Inserted 415 vectors into Endee


In [31]:
# Sanity check: query the index with the first record's own embedding and list
# the three nearest hits (the recorded output shows chunk_0 ranked first).
test_vector = data[0]["embedding"]
results = index.query(vector=test_vector, top_k=3)

for r in results:
    hit_id = r["id"]
    score = r["similarity"]
    clause = r["meta"].get("clause_id")
    print("ID:", hit_id, "| similarity:", score, "| clause:", clause)


ID: chunk_0 | similarity: 0.9998679757118225 | clause: 1.1.1.1
ID: chunk_1 | similarity: 0.6635578870773315 | clause: 1.1.1.3
ID: chunk_20 | similarity: 0.6333188414573669 | clause: 1.2.5


In [32]:
# Inspect the structure of the first record. The embedding is replaced by a short
# length summary so the cell output is not flooded with the full vector.
first_record = data[0]
print(first_record.keys())
print({
    key: (
        f"<vector of length {len(value)}>"
        if key == "embedding" and isinstance(value, list)
        else value
    )
    for key, value in first_record.items()
})


dict_keys(['text', 'metadata', 'embedding'])
{'text': 'Permanent Account Number (PAN) to be the sole identification number for\n\nall transactions in the securities market1 \nWith effect from July 02, 2007, PAN is the sole identification number for all \ntransactions in the securities market, irrespective of the amount of transaction. \nA copy of the PAN card with photograph may be accepted as Proof of Identity. \nIn this regard, intermediaries shall:- \na. Put necessary systems in place so that the databases of the clients and their \ntransactions are linked to the PAN details of the client.  \nb. Build necessary infrastructure to enable accessibility and query based on \nPAN thereby enabling retrieval of all the details of the clients.  \nc. Collect copies of PAN cards issued to the existing as well as new clients by \nthe Income Tax Department and maintain the same in their record after \nverifying with the original. \nd. Cross-check the aforesaid details collected from their client