In [None]:
# import libraries and helpers
import json
import os

import numpy as np
import weaviate
from dotenv import find_dotenv, load_dotenv
from tqdm.notebook import tqdm
from weaviate.classes.config import Configure, Property, DataType
from weaviate.util import generate_uuid5

In [None]:
# Locate the ".env" file and load the variables it defines into the environment.
load_dotenv(dotenv_path=find_dotenv(filename=".env"))

True

In [None]:
# Weaviate connection settings, taken from the process environment.
# Either value is None when its variable is not set.
WEAVIATE_URL = os.environ.get("SUNMARKE_WEAVIATE_URL")
WEAVIATE_API_KEY = os.environ.get("SUNMARKE_WEAVIATE_API_KEY")

In [None]:
# Open a client connection to the Weaviate Cloud cluster, authenticating with
# the URL and API key held in WEAVIATE_URL / WEAVIATE_API_KEY.
weav_client = weaviate.connect_to_weaviate_cloud(cluster_url=WEAVIATE_URL, auth_credentials=WEAVIATE_API_KEY)

In [None]:
# Create the "sunmarke_data" collection for precomputed embeddings, unless it
# already exists.
#
# Creating a collection whose name is already taken is rejected by Weaviate, so
# an unconditional create() makes this cell fail on every re-run of the notebook.
# The existence check keeps the cell idempotent.
# NOTE(review): an existing collection is reused as-is; its schema is not
# compared against the definition below.
if weav_client.collections.exists("sunmarke_data"):
    collection = weav_client.collections.get("sunmarke_data")
else:
    collection = weav_client.collections.create(
        name="sunmarke_data",

        # Precomputed embeddings: vectors are supplied at insert time, so no
        # vectorizer module is configured.
        vectorizer_config=Configure.Vectorizer.none(),

        # Every property is plain text; only name and description differ.
        properties=[
            Property(name=prop_name, data_type=DataType.TEXT, description=prop_description)
            for prop_name, prop_description in (
                ("chunk_id", "Unique chunk identifier"),
                ("category", "Top-level category"),
                ("page_name", "Page identifier"),
                ("subpage", "Subpage if any"),
                ("url", "Source URL"),
                ("content", "Main chunk content"),
            )
        ],
    )

# Display the collection handle as the cell output.
collection

<weaviate.collections.collection.sync.Collection at 0x1ba3b202150>

In [None]:
# Load the precomputed chunk embeddings from disk into `data`.
# The encoding is pinned to UTF-8: without it, open() decodes with the platform's
# locale encoding, which can corrupt or reject non-ASCII text in the chunk content.
with open("chunks_embeddings.json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [None]:
# Batch-insert every chunk in `data`, with its precomputed vector, into the
# "sunmarke_data" collection.
collection = weav_client.collections.get("sunmarke_data")

# chunk_ids of records that could not even be queued (e.g. a missing key).
skipped_chunk_ids = []

with collection.batch.dynamic() as batch:
    for item in data:
        try:
            batch.add_object(
                properties={
                    "chunk_id": item["chunk_id"],
                    "category": item["category"],
                    "page_name": item["page_name"],
                    "subpage": item["subpage"],
                    "url": item["url"],
                    "content": item["content"],
                },
                # Deterministic UUID derived from chunk_id, so re-running this cell
                # overwrites the same objects instead of inserting duplicates.
                # Assumes chunk_id is unique per chunk, as the schema description states.
                uuid=generate_uuid5(item["chunk_id"]),
                vector=np.array(item["embedding"], dtype=np.float32),
            )
        except Exception as e:
            # Best effort: a malformed record is reported and skipped, not fatal.
            skipped_chunk_ids.append(item.get("chunk_id"))
            print(f"Failed chunk {item.get('chunk_id')}: {e}")

# Objects rejected by the server do not raise inside the loop above; the client
# collects them and exposes them once the batch context has exited.
failed_objects = collection.batch.failed_objects
for failed in failed_objects:
    failed_properties = failed.object_.properties or {}
    print(f"Failed chunk {failed_properties.get('chunk_id')}: {failed.message}")

if failed_objects or skipped_chunk_ids:
    print(
        f"Batch import finished with {len(failed_objects)} failed and "
        f"{len(skipped_chunk_ids)} skipped object(s)"
    )

In [None]:
# Report how many objects the "sunmarke_data" collection holds.
collection = weav_client.collections.get("sunmarke_data")
count = collection.aggregate.over_all(total_count=True)  # request only the total count
print(count)

AggregateReturn(properties={}, total_count=81)
