In [1]:
# Cell 1: Imports and configuration

import os
import json
from typing import Any, Dict, List, Optional

import requests
import numpy as np
import pandas as pd

from sklearn.preprocessing import normalize
from openai import OpenAI

import umap
import hdbscan

from dotenv import load_dotenv

# ----- load env -----
# Populate os.environ from a local .env file, if one exists.
load_dotenv()

# Weaviate connection settings. A trailing slash is stripped so that building
# f"{WEAVIATE_URL}/v1/..." never yields a double slash.
WEAVIATE_URL = (os.getenv("WEAVIATE_URL") or "").rstrip("/")   # e.g. https://xxx.weaviate.network
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "MyDocs")

OPENAI_API_KEY_CHAT = os.getenv("OPENAI_API_KEY_CHAT")   # API key for the chat model used for labeling
OPENAI_MODEL = os.getenv("OPENAI_MODEL")                 # chat model *name* used for labeling

# Fail fast on missing configuration instead of part-way through the pipeline
# (OPENAI_MODEL would otherwise only be noticed at the first labeling call).
assert WEAVIATE_URL and WEAVIATE_API_KEY, "Weaviate env vars missing"
assert OPENAI_API_KEY_CHAT, "OPENAI_API_KEY_CHAT missing"
assert OPENAI_MODEL, "OPENAI_MODEL missing"

# Optional cap while experimenting; set to None for full ~50k
MAX_OBJECTS = 5000    # e.g. 5000 for testing, then None

# Weaviate fetch settings
PAGE_LIMIT = 500      # objects per page

# UMAP settings (clustering space)
UMAP_N_COMPONENTS = 5
UMAP_N_NEIGHBORS = 15
UMAP_MIN_DIST = 0.0

# HDBSCAN settings
HDBSCAN_MIN_CLUSTER_SIZE = 50   # tune for your corpus size
HDBSCAN_MIN_SAMPLES = 10

# Shared OpenAI client used by the cluster-labeling helper.
client = OpenAI(api_key=OPENAI_API_KEY_CHAT)

# Headers sent with every Weaviate request (JSON body, bearer-token auth).
weaviate_headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {WEAVIATE_API_KEY}",
}


In [2]:
# Cell 2: Helper to fetch pages from Weaviate (GraphQL)

def fetch_page(cursor: Optional[str] = None, limit: int = PAGE_LIMIT) -> List[Dict[str, Any]]:
    """
    Fetch one page of objects (with embeddings) from Weaviate using GraphQL.

    Args:
        cursor: Object id to continue after; ``None`` (or empty) requests the
            first page, in which case no ``after`` argument is sent.
        limit: Maximum number of objects to request for this page.

    Returns:
        A list of objects with the ``chunk_id``, ``doc_id`` and ``text``
        properties plus the ``_additional`` fields (``id`` and the ``default``
        named vector). An empty list is returned when the response carries no
        rows for the collection.

    Raises:
        requests.HTTPError: If the HTTP status indicates failure.
        RuntimeError: If the GraphQL response reports errors.
    """
    # json.dumps produces a correctly quoted and escaped string literal, so an
    # unexpected character in the cursor cannot break the query document.
    after_clause = f"after: {json.dumps(cursor)}" if cursor else ""
    query = {
        "query": f"""
        {{
          Get {{
            {COLLECTION_NAME}(
              limit: {limit}
              {after_clause}
            ) {{
              chunk_id
              doc_id
              text
              _additional {{
                id
                vectors {{
                  default
                }}
              }}
            }}
          }}
        }}
        """
    }

    resp = requests.post(
        # Tolerate a trailing slash in the configured base URL.
        f"{WEAVIATE_URL.rstrip('/')}/v1/graphql",
        json=query,
        headers=weaviate_headers,
        timeout=60,
    )
    resp.raise_for_status()
    data = resp.json()

    if data.get("errors"):
        raise RuntimeError(f"Weaviate GraphQL error: {data['errors']}")

    # Walk the response defensively: a missing or null level yields an empty
    # page rather than a KeyError/TypeError or a None return value.
    rows = ((data.get("data") or {}).get("Get") or {}).get(COLLECTION_NAME)
    return rows or []


In [3]:
# Cell 3: Pull all chunks + embeddings into DataFrame

all_rows = []
cursor = None

while True:
    # Request only what is still needed so the final page does not overshoot
    # MAX_OBJECTS (and then get thrown away).
    page_size = PAGE_LIMIT
    if MAX_OBJECTS is not None:
        page_size = min(PAGE_LIMIT, MAX_OBJECTS - len(all_rows))
        if page_size <= 0:
            break

    rows = fetch_page(cursor, page_size)
    if not rows:
        break

    all_rows.extend(rows)
    # The id of the last object on this page is the cursor for the next page.
    cursor = rows[-1]["_additional"]["id"]

    print(f"Fetched {len(all_rows)} objects so far...")

    if MAX_OBJECTS is not None and len(all_rows) >= MAX_OBJECTS:
        all_rows = all_rows[:MAX_OBJECTS]
        print(f"Reached MAX_OBJECTS={MAX_OBJECTS}, stopping fetch.")
        break

print(f"Total fetched from Weaviate: {len(all_rows)}")

# Flatten each GraphQL object into one record. Objects without a usable
# "default" vector are skipped: a single null/empty embedding would otherwise
# make the float32 matrix construction below fail.
records = []
skipped_without_vector = 0
for r in all_rows:
    add = r.get("_additional") or {}
    vector = (add.get("vectors") or {}).get("default")
    if not vector:
        skipped_without_vector += 1
        continue
    records.append(
        {
            "_id": add.get("id"),
            "embedding": vector,
            "chunk_id": r.get("chunk_id"),
            "doc_id": r.get("doc_id"),
            "text": r.get("text"),
        }
    )

if skipped_without_vector:
    print(f"Skipped {skipped_without_vector} objects without a 'default' vector.")

if not records:
    raise RuntimeError("No objects with embeddings were fetched from Weaviate; nothing to cluster.")

df = pd.DataFrame(records)
print(df.head())
print("DataFrame shape:", df.shape)

# Row i of X is the embedding of row i of df (df has a fresh default index here).
X = np.array(df["embedding"].tolist(), dtype=np.float32)
assert X.ndim == 2 and X.shape[0] == len(df), "Embeddings must form an (n_objects, dim) matrix"
print("Embedding matrix shape:", X.shape)


Fetched 500 objects so far...
Fetched 1000 objects so far...
Fetched 1500 objects so far...
Fetched 2000 objects so far...
Fetched 2500 objects so far...
Fetched 3000 objects so far...
Fetched 3500 objects so far...
Fetched 4000 objects so far...
Fetched 4500 objects so far...
Fetched 5000 objects so far...
Reached MAX_OBJECTS=5000, stopping fetch.
Total fetched from Weaviate: 5000
                                    _id  \
0  000203a2-f584-43f1-8527-e167b0bf8c6e   
1  00046a59-5d95-4bdd-9bef-2acc2feead61   
2  0008b585-596a-42b9-8d0a-9be00a32b994   
3  00091260-d45d-4f44-bb4e-8100c280de0e   
4  00099043-8c80-4015-8d4c-6912225c5d60   

                                           embedding     chunk_id  doc_id  \
0  [-0.006832002, 0.058254257, 0.033376772, 0.006...  016697_c937  016697   
1  [-0.0023835688, 0.040449306, 0.038565286, 0.02...  023731_c153  023731   
2  [-0.028839508, -0.014247406, -0.013418496, 0.0...  027333_c015  027333   
3  [0.042262483, 0.022982152, 0.054571044, 0.060

In [4]:
# Cell 4: Normalize embeddings (for cosine-like behavior)

# Scale every embedding row to unit L2 length; Euclidean distance between
# unit vectors then behaves like cosine distance.
X_norm = normalize(
    X,
    axis=1,
    norm="l2",
)
print("Done normalization.")


Done normalization.


In [5]:
# Cell 5: UMAP projection to low-dimensional clustering space

# Reducer configured from the constants in the config cell; the fixed
# random_state keeps the projection repeatable between runs.
umap_model = umap.UMAP(
    metric="euclidean",      # on normalized vectors this approximates cosine
    min_dist=UMAP_MIN_DIST,
    n_neighbors=UMAP_N_NEIGHBORS,
    n_components=UMAP_N_COMPONENTS,
    random_state=42,
)

# Fit on the normalized embeddings and project them in a single step.
X_umap = umap_model.fit_transform(X_norm)
print("UMAP output shape:", X_umap.shape)

# Optional: keep each UMAP coordinate as its own df column for later visualization
for i, component_values in enumerate(X_umap.T):
    df[f"umap_{i}"] = component_values


  warn(


UMAP output shape: (5000, 5)


In [6]:
# Cell 6: HDBSCAN clustering

# Density-based clustering in the reduced UMAP space; points that fit no
# cluster receive the label -1.
clusterer = hdbscan.HDBSCAN(
    metric="euclidean",
    cluster_selection_method="eom",
    min_samples=HDBSCAN_MIN_SAMPLES,
    min_cluster_size=HDBSCAN_MIN_CLUSTER_SIZE,
)

labels = clusterer.fit_predict(X_umap)
df["cluster_id"] = labels

# Summary: the 20 largest groups, the number of real clusters, and the noise count.
is_noise = labels == -1
cluster_sizes = df["cluster_id"].value_counts()
print(cluster_sizes.head(20))
print("Number of clusters (excluding noise):", len(np.unique(labels[~is_noise])))
print("Number of noise points (label -1):", np.sum(is_noise))


cluster_id
-1     752
 17    477
 1     326
 10    309
 5     281
 27    254
 21    250
 2     186
 22    176
 3     143
 24    119
 26    114
 13    110
 28    109
 4     109
 15    107
 20    104
 25    102
 18    100
 11     98
Name: count, dtype: int64
Number of clusters (excluding noise): 30
Number of noise points (label -1): 752




In [7]:
# Cell 7: Helper to get representative texts per cluster

def get_representatives_for_cluster(cluster_id: int, top_k: int = 30, frame=None, coords=None) -> List[str]:
    """
    Return up to top_k representative texts from a given cluster,
    ordered by closeness to the cluster centroid in UMAP space.

    Closeness is the Euclidean distance to the mean of the cluster's points
    (smallest first). A raw dot product with the centroid is not used because
    it rewards points that lie far from the origin rather than points that lie
    near the centroid.

    Args:
        cluster_id: Cluster label to sample from.
        top_k: Maximum number of texts to return.
        frame: DataFrame with ``cluster_id`` and ``text`` columns. Defaults to
            the notebook-level ``df``.
        coords: 2-D array whose row ``i`` holds the coordinates of row ``i`` of
            ``frame`` (by position). Defaults to the notebook-level ``X_umap``.

    Returns:
        A list of strings, nearest to the centroid first. Rows whose text is
        missing are skipped. The list is empty when the cluster has no members
        or ``top_k`` is not positive.
    """
    frame = df if frame is None else frame
    coords = X_umap if coords is None else coords

    # Positional (not label) indices of the cluster's rows, so the lookup is
    # correct whatever index the DataFrame carries.
    member_pos = np.flatnonzero((frame["cluster_id"] == cluster_id).to_numpy())
    if member_pos.size == 0 or top_k <= 0:
        return []

    member_vecs = np.asarray(coords)[member_pos]
    centroid = member_vecs.mean(axis=0)

    # Smallest distance first; a stable sort keeps ties in original row order.
    distances = np.linalg.norm(member_vecs - centroid, axis=1)
    ranked_pos = member_pos[np.argsort(distances, kind="stable")]

    # Drop missing texts before stringifying so no literal "None" is returned.
    texts = frame["text"].iloc[ranked_pos].dropna().astype(str)
    return texts.head(top_k).tolist()


In [8]:
# Cell 8: LLM helper to label clusters

def llm_label_cluster(cluster_id: int, example_texts, max_chars: int = 4000, llm_client=None, model=None):
    """
    Ask OpenAI to propose a short name and summary for a cluster.

    Args:
        cluster_id: Cluster label, used in the prompt and in the fallback name.
        example_texts: Representative texts for the cluster.
        max_chars: The joined examples are cut to this many characters, so
            earlier examples take priority over later ones.
        llm_client: Object exposing ``chat.completions.create``. Defaults to
            the notebook-level ``client``.
        model: Chat model name. Defaults to the notebook-level ``OPENAI_MODEL``.

    Returns:
        ``(cluster_name, cluster_summary)`` as strings. When the reply is
        missing, is not valid JSON, or lacks a usable name, the name falls back
        to ``"Cluster <id>"`` and the summary to ``""`` rather than raising, so
        one bad reply does not abort labeling of the remaining clusters.
    """
    default_name = f"Cluster {cluster_id}"
    if not example_texts:
        return default_name, "No representative examples available."

    # str() guards against non-string entries (e.g. None) breaking the join.
    combined = "\n\n".join(str(t) for t in example_texts)
    combined = combined[:max_chars]

    system_msg = (
        "You are given example paragraphs that belong to the same semantic cluster from a larger corpus. \n"
        "Your task:\n"
        "1) Provide a short descriptive cluster name (max 8 words)\n"
        "2) Provide a concise 3 sentence summary of the main theme\n"
        "Respond ONLY in JSON with keys: cluster_name, cluster_summary."
    )

    user_msg = f"Cluster ID: {cluster_id}\n\nExample paragraphs:\n\n{combined}"

    active_client = client if llm_client is None else llm_client
    active_model = OPENAI_MODEL if model is None else model

    resp = active_client.chat.completions.create(
        model=active_model,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        response_format={"type": "json_object"},
    )

    # The message content may be absent or not valid JSON; treat both as
    # "no usable answer" instead of letting json.loads raise.
    raw = resp.choices[0].message.content if resp.choices else None
    try:
        data = json.loads(raw) if raw else {}
    except (TypeError, ValueError):
        data = {}
    if not isinstance(data, dict):
        data = {}

    def _as_text(value) -> str:
        # Flatten a list answer (e.g. one sentence per item) into a single string.
        if isinstance(value, (list, tuple)):
            value = " ".join(str(part) for part in value)
        return str(value or "").strip()

    cluster_name = _as_text(data.get("cluster_name")) or default_name
    cluster_summary = _as_text(data.get("cluster_summary"))
    return cluster_name, cluster_summary


In [9]:
# Cell 9: Label all clusters with OpenAI and merge

max_examples_per_cluster = 25  # tune for cost / quality

# Real cluster ids in ascending order; the noise label (-1) is handled separately below.
unique_clusters = [c for c in sorted(df["cluster_id"].unique()) if c != -1]

cluster_labels = []
for cid in unique_clusters:
    examples = get_representatives_for_cluster(cid, top_k=max_examples_per_cluster)

    if examples:
        cluster_name, cluster_summary = llm_label_cluster(cid, examples)
    else:
        # Nothing to show the model: use a placeholder label.
        cluster_name, cluster_summary = f"Cluster {cid}", "No representative examples available."

    cluster_labels.append(
        dict(
            cluster_id=cid,
            cluster_name=cluster_name,
            cluster_summary=cluster_summary,
            num_examples=len(examples),
        )
    )
    print(cid, "->", cluster_name)

# The noise group gets a fixed default label instead of an LLM-generated one;
# its num_examples holds the number of noise rows.
noise_count = int((df["cluster_id"] == -1).sum())
if noise_count > 0:
    cluster_labels.append(
        dict(
            cluster_id=-1,
            cluster_name="Noise / Misc",
            cluster_summary="Outlier or weakly clustered chunks.",
            num_examples=noise_count,
        )
    )

cluster_labels_df = pd.DataFrame(cluster_labels)
cluster_labels_df.head()


0 -> Cooperative Property Transactions
1 -> Condominium Sales and Valuations
2 -> Residential Property Identifiers and Details
3 -> Political Discussions and Economic Views
4 -> Real Estate Transactions and Investments
5 -> Real Estate Transaction Records
6 -> U.S. Anti-Terrorism Laws and Enforcement
7 -> Snowden, NSA, and Media Collaboration
8 -> Reality TV Shows and Media
9 -> Political News and Developments
10 -> Military Operations and Tactical Planning
11 -> Culinary Disasters and Life Lessons
12 -> Victims' Rights in Presentence Reports
13 -> FCPA Compliance and Successor Liability
14 -> Safety and Violence in Urban India
15 -> Sexual Consent and Communication
16 -> U.S. Tax Implications for Investment Funds
17 -> Government Fiscal Policies and Medicare Liabilities
18 -> Dictatorship, Democracy, and Social Change
19 -> Analysis of Warfare and Disease Propagation
20 -> Behavioral Responses to Psychological Changes
21 -> Nonlinear Dynamics in Neuroscience and Physiology
22 -> Legal

Unnamed: 0,cluster_id,cluster_name,cluster_summary,num_examples
0,0,Cooperative Property Transactions,This cluster contains details about various tr...,25
1,1,Condominium Sales and Valuations,This cluster provides a detailed accounting of...,25
2,2,Residential Property Identifiers and Details,This cluster contains a series of residential ...,25
3,3,Political Discussions and Economic Views,The messages focus on discussions surrounding ...,25
4,4,Real Estate Transactions and Investments,This cluster highlights numerous real estate t...,25


In [10]:
# Cell 10: Join labels onto df and save

# Drop label columns left over from a previous run of this cell before merging.
# Without this, re-running the cell would produce suffixed duplicates
# (cluster_name_x / cluster_name_y) and the column selection below would fail.
label_columns = [c for c in cluster_labels_df.columns if c != "cluster_id"]
df = df.drop(columns=label_columns, errors="ignore").merge(
    cluster_labels_df,
    on="cluster_id",
    how="left",
    # Each chunk must match at most one label row; a duplicated cluster_id in
    # the label table would otherwise silently multiply chunk rows.
    validate="many_to_one",
)

print(df[["chunk_id", "doc_id", "cluster_id", "cluster_name"]].head())

OUT_CSV = "chunks_with_hdbscan_clusters_and_labels_from_weaviate_HDBSCAN.csv"
df.to_csv(OUT_CSV, index=False)
print("Saved CSV:", OUT_CSV)


      chunk_id  doc_id  cluster_id  \
0  016697_c937  016697           2   
1  023731_c153  023731          11   
2  027333_c015  027333           3   
3  025231_c005  025231          22   
4  017088_c153  017088          23   

                                   cluster_name  
0  Residential Property Identifiers and Details  
1           Culinary Disasters and Life Lessons  
2      Political Discussions and Economic Views  
3             Legal Communication and Oversight  
4          Freedom of Speech and Privacy Rights  
Saved CSV: chunks_with_hdbscan_clusters_and_labels_from_weaviate_HDBSCAN.csv
