In [1]:
# Cell 1: Imports and configuration

import os
import json
import math
from typing import List, Dict, Any
from dotenv import load_dotenv

import requests
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from openai import OpenAI

# Configure from env   
load_dotenv()
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")
OPENAI_API_KEY_CHAT = os.getenv("OPENAI_API_KEY_CHAT")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "MyDocs")
OPENAI_MODEL = os.getenv("OPENAI_MODEL") 

# KMeans settings
N_CLUSTERS = 60  # tune this
MAX_OBJECTS = None  # or an int to cap total objects fetched

# Weaviate fetch settings
PAGE_LIMIT = 200  # objects per page

# OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY_CHAT)

weaviate_headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {WEAVIATE_API_KEY}",
}


In [2]:
def fetch_page(cursor: str = None, limit: int = PAGE_LIMIT) -> List[Dict[str, Any]]:
    after_clause = f'after: "{cursor}"' if cursor else ""
    query = {
        "query": f"""
        {{
          Get {{
            {COLLECTION_NAME}(
              limit: {limit}
              {after_clause}
            ) {{
              chunk_id
              doc_id
              text
              _additional {{
                id
                vectors {{
                  default
                }}
              }}
            }}
          }}
        }}
        """
    }

    resp = requests.post(
        f"{WEAVIATE_URL}/v1/graphql",
        json=query,
        headers=weaviate_headers,
        timeout=60,
    )
    resp.raise_for_status()
    data = resp.json()

    if "errors" in data:
        raise RuntimeError(f"Weaviate GraphQL error: {data['errors']}")

    objects = data["data"]["Get"][COLLECTION_NAME]
    return objects


In [3]:
# Cell 3: Pull all chunks + embeddings into a DataFrame 
all_rows = []
cursor = None

while True:
    rows = fetch_page(cursor, PAGE_LIMIT)
    if not rows:
        break

    all_rows.extend(rows)
    cursor = rows[-1]["_additional"]["id"]  # use last id as cursor

    print(f"Fetched {len(all_rows)} objects so far...")

    if MAX_OBJECTS is not None and len(all_rows) >= MAX_OBJECTS:
        all_rows = all_rows[:MAX_OBJECTS]
        print(f"Reached MAX_OBJECTS={MAX_OBJECTS}, stopping fetch.")
        break

print(f"Total fetched from Weaviate: {len(all_rows)}")

# Convert to DataFrame
records = []
for r in all_rows:
    add = r["_additional"]
    props = {
        "_id": add["id"],
        "embedding": add["vectors"]["default"],
        "chunk_id": r.get("chunk_id"),
        "doc_id": r.get("doc_id"),
        "text": r.get("text"),
    }
    records.append(props)

df = pd.DataFrame(records)
print(df.head())
print("DataFrame shape:", df.shape)

# Build embedding matrix
X = np.array(df["embedding"].tolist(), dtype=np.float32)
print("Embeddings shape:", X.shape)


RuntimeError: Weaviate GraphQL error: [{'locations': [{'column': 17, 'line': 13}], 'message': 'Cannot query field "vectors" on type "MyDocsCleanedAdditional". Did you mean "vector"?', 'path': None}]

In [4]:
# Cell 4: Normalize embeddings for cosine-like distance

# L2 normalize rows so dot product approximates cosine similarity
X_norm = normalize(X, norm="l2", axis=1)
print("Normalized embeddings shape:", X_norm.shape)

Normalized embeddings shape: (51088, 1536)


In [5]:
# Cell 5: KMeans clustering

kmeans = KMeans(
    n_clusters=N_CLUSTERS,
    random_state=42,
    n_init=10,
)

cluster_ids = kmeans.fit_predict(X_norm)
df["cluster_id"] = cluster_ids

print(df["cluster_id"].value_counts().head())
print("Number of clusters with at least one point:", df["cluster_id"].nunique())

cluster_id
7     3141
57    1787
22    1554
0     1428
45    1387
Name: count, dtype: int64
Number of clusters with at least one point: 60


In [6]:
# Cell 6: Helper to get representative texts per cluster

def get_representatives_for_cluster(cluster_id: int, top_k: int = 30) -> List[str]:
    """
    Return up to top_k representative texts from a given cluster, ordered by
    similarity to the cluster centroid.
    """
    mask = df["cluster_id"] == cluster_id
    idx = np.where(mask.values)[0]

    if len(idx) == 0:
        return []

    cluster_vectors = X_norm[idx]
    centroid = kmeans.cluster_centers_[cluster_id]

    # similarity scores = dot product with centroid
    scores = cluster_vectors @ centroid
    order = np.argsort(-scores)
    top_idx = idx[order[:top_k]]

    reps = df.loc[top_idx, "text"].astype(str).tolist()
    return reps

In [7]:
# Cell 7: OpenAI helper to label clusters

def llm_label_cluster(cluster_id: int, example_texts: List[str], max_chars: int = 4000):
    """
    Ask OpenAI to propose a short name and summary for a cluster, given example texts.
    Returns (cluster_name, cluster_summary).
    """
    if not example_texts:
        return "Unknown", "No examples available for this cluster."

    combined = "\n\n".join(example_texts)
    combined = combined[:max_chars]

    system_msg = (
        "You receive multiple paragraphs that belong to one semantic cluster produced by KMeans.\n"
        "Task:\n"
        "1. Produce a short descriptive cluster name, max 8 words.\n"
        "2. Produce a concise summary of the main theme, exactly 3 sentences.\n"
        "3. Output must be valid JSON with keys: cluster_name, cluster_summary."
        "\n"
        "Rules:\n"
        "- Base your name and summary only on the provided paragraphs.\n"
        "- Prefer specificity over vague themes.\n"
    )

    user_msg = f"Cluster ID: {cluster_id}\n\nExample paragraphs:\n\n{combined}"

    resp = client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        response_format={"type": "json_object"},
    )

    data = json.loads(resp.choices[0].message.content)
    cluster_name = data.get("cluster_name", f"Cluster {cluster_id}")
    cluster_summary = data.get("cluster_summary", "")
    return cluster_name, cluster_summary

In [8]:
# Cell 8: Label all clusters with OpenAI

cluster_labels = []
max_examples_per_cluster = 25  # tune for cost / quality

for cid in sorted(df["cluster_id"].unique()):
    examples = get_representatives_for_cluster(cid, top_k=max_examples_per_cluster)
    if not examples:
        cluster_name = f"Cluster {cid}"
        cluster_summary = "No representative examples available."
    else:
        cluster_name, cluster_summary = llm_label_cluster(cid, examples)

    cluster_labels.append(
        {
            "cluster_id": cid,
            "cluster_name": cluster_name,
            "cluster_summary": cluster_summary,
            "num_examples": len(examples),
        }
    )
    print(cid, "->", cluster_name)

cluster_labels_df = pd.DataFrame(cluster_labels)
cluster_labels_df.head()

0 -> Confidential Communication and Planning
1 -> Real Estate Transaction Records
2 -> Exploitation and Abuse at Epstein's Ranch
3 -> Israeli-Palestinian Conflict Dynamics
4 -> Confidential Communication and Privilege Notice
5 -> Market Sentiment on Financial Stocks
6 -> Power Dynamics in Digital Networks
7 -> Condominium Sales Data
8 -> Victims' Rights in Criminal Justice
9 -> Social Connections and Beliefs
10 -> Global Regulatory Landscape of Cannabis
11 -> China's Global Influence Strategies
12 -> Residential Data Transactions
13 -> Challenges and Reflections on Artificial Intelligence
14 -> Sexual Misconduct Allegations Against Krauss
15 -> Prominent Leaders in Business and Academia
16 -> Underage Massage Encounters
17 -> Youth Disillusionment and Parental Conflict
18 -> Epstein's Legal Proceedings and Victims' Rights
19 -> Taboo-Breaking Comedy and Social Commentary
20 -> Exploring BDSM and Sexual Consent
21 -> Tax Policy and Economic Trends
22 -> Email Campaign Tracking and Engag

Unnamed: 0,cluster_id,cluster_name,cluster_summary,num_examples
0,0,Confidential Communication and Planning,These paragraphs include personal communicatio...,25
1,1,Real Estate Transaction Records,This cluster contains data outlining various r...,25
2,2,Exploitation and Abuse at Epstein's Ranch,The paragraphs detail Jeffrey Epstein's system...,25
3,3,Israeli-Palestinian Conflict Dynamics,The text discusses the evolving threats faced ...,25
4,4,Confidential Communication and Privilege Notice,The paragraphs emphasize the confidentiality o...,25


In [9]:
# Cell 9: Join labels back to df and optionally save locally

df = df.merge(cluster_labels_df, on="cluster_id", how="left")

print(df[["chunk_id", "doc_id", "cluster_id", "cluster_name"]].head())

# Optional: save to CSV for inspection
df.to_csv("chunks_with_kmeans_clusters_and_labels_from_weaviate_FULL.csv", index=False)
print("Saved CSV: chunks_with_kmeans_clusters_and_labels_from_weaviate_FULL.csv")

      chunk_id  doc_id  cluster_id  \
0  016552_c680  016552          33   
1  023361_c103  023361          25   
2  028621_c044  028621          15   
3  016221_c286  016221           9   
4  016697_c495  016697          12   

                                   cluster_name  
0  Cooperative Apartments and Property Listings  
1    Legal Actions Against Terrorism Supporters  
2    Prominent Leaders in Business and Academia  
3                Social Connections and Beliefs  
4                 Residential Data Transactions  
Saved CSV: chunks_with_kmeans_clusters_and_labels_from_weaviate_FULL.csv
