In [2]:
import numpy as np
import uuid
import json
from typing import Optional, List, Dict, Any
from qdrant_client import QdrantClient, models
from sklearn.decomposition import PCA
from langchain_huggingface import HuggingFaceEmbeddings
import hdbscan

In [3]:
# Connection settings for the Qdrant instance that stores the tree nodes.
QDRANT_HOST, QDRANT_PORT = "localhost", 6333
COLLECTION_NAME = "raptor_collection"

qdrant = QdrantClient(port=QDRANT_PORT, host=QDRANT_HOST)
print(f"Connected to Qdrant at {QDRANT_HOST}:{QDRANT_PORT}")
# Ask the server which collections exist and show their names.
print(f"Collections: {[coll.name for coll in qdrant.get_collections().collections]}")

Connected to Qdrant at localhost:6333
Collections: ['raptor_collection', 'papers_collection']


  qdrant = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)


In [4]:
# Embedding model used further down to embed the generated cluster summaries.
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    encode_kwargs=dict(normalize_embeddings=True),
    model_kwargs=dict(device="cpu"),
)
print("Embedding model loaded")

  from .autonotebook import tqdm as notebook_tqdm


Embedding model loaded


In [5]:
def get_nodes_by_level(
    level: int,
    limit: Optional[int] = 10000,
    batch_size: int = 1000,
) -> List[Dict[str, Any]]:
    """
    Fetch the nodes of one tree level from Qdrant, following scroll pagination.

    Args:
        level: Value that ``metadata.level`` must match.
        limit: Maximum number of nodes to return. ``None`` returns every
            matching node. A warning is printed when the limit is reached
            while the server still reports further pages, so truncation is
            never silent.
        batch_size: Number of points requested per scroll page.

    Returns:
        A list of dicts with the keys "node_id", "text", "metadata" and "vector".

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    level_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.level",
                match=models.MatchValue(value=level)
            )
        ]
    )

    nodes: List[Dict[str, Any]] = []
    next_offset = None

    while True:
        # Never request more than what is still allowed by ``limit``.
        page_size = batch_size if limit is None else min(batch_size, limit - len(nodes))

        points, next_offset = qdrant.scroll(
            collection_name=COLLECTION_NAME,
            scroll_filter=level_filter,
            limit=page_size,
            offset=next_offset,
            with_payload=True,
            with_vectors=True
        )

        for point in points:
            # A point may carry no payload at all; treat that as an empty one.
            payload = point.payload or {}
            nodes.append({
                "node_id": point.id,
                "text": payload.get("text", ""),
                "metadata": payload.get("metadata", {}),
                "vector": point.vector
            })

        # A missing next-page offset (or an empty page) means the scroll is done.
        if next_offset is None or not points:
            break

        if limit is not None and len(nodes) >= limit:
            print(
                f"Warning: stopped at limit={limit}; "
                f"more level {level} nodes remain in Qdrant"
            )
            break

    return nodes

In [23]:
def reduce_dimensions(
    embeddings: np.ndarray,
    n_components: int = 10,
    random_state: Optional[int] = 42,
) -> np.ndarray:
    """
    Project embeddings onto their leading principal components with PCA.

    Args:
        embeddings: 2-D array with one row per sample.
        n_components: Requested number of output dimensions. It is capped at
            the number of samples and the number of features.
        random_state: Seed handed to PCA so that solvers that use randomness
            give the same projection on every run. Pass ``None`` for unseeded
            behaviour.

    Returns:
        The reduced array, or ``embeddings`` itself (unchanged, same object)
        when fewer than 2 components would remain.
    """
    n_samples = embeddings.shape[0]
    n_features = embeddings.shape[1]

    # PCA cannot produce more components than min(n_samples, n_features).
    target_components = min(n_components, n_samples, n_features)

    if target_components < 2:
        return embeddings

    pca = PCA(n_components=target_components, random_state=random_state)
    return pca.fit_transform(embeddings)


def cluster_embeddings_hdbscan(
    embeddings: np.ndarray,
    min_cluster_size: int = 5,
    min_samples: int = 2,
    reduce_dims: bool = True,
    n_components: int = 50
) -> np.ndarray:
    """
    Label every embedding with an HDBSCAN cluster id.

    The number of clusters is chosen by HDBSCAN. Points it marks as noise
    (label -1) are handed to the nearest cluster afterwards. If there are
    fewer samples than ``min_cluster_size``, or HDBSCAN finds no cluster at
    all, every sample gets label 0.
    """
    total = embeddings.shape[0]

    # Not enough points to form even one cluster of the requested size.
    if total < min_cluster_size:
        return np.zeros(total, dtype=int)

    # Optionally shrink the feature space before clustering.
    features = embeddings
    if reduce_dims and embeddings.shape[1] > n_components:
        features = reduce_dimensions(embeddings, n_components=min(n_components, total - 1))

    labels = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        cluster_selection_method='eom'
    ).fit_predict(features)

    noise_total = list(labels).count(-1)
    # The noise label is not a cluster, so leave it out of the count.
    cluster_total = len(set(labels)) - (1 if noise_total else 0)
    print(f"HDBSCAN found {cluster_total} clusters, {noise_total} noise points")

    if cluster_total == 0:
        print("No clusters found, treating all as one cluster")
        return np.zeros(total, dtype=int)

    if noise_total > 0:
        labels = assign_noise_to_nearest_cluster(features, labels)

    return labels


def assign_noise_to_nearest_cluster(embeddings: np.ndarray, labels: np.ndarray) -> np.ndarray:
    """
    Assign noise points (label=-1) to their nearest cluster centroid.

    Args:
        embeddings: 2-D array with one row per point.
        labels: Cluster label per point; -1 marks noise.

    Returns:
        A new label array (the input is not modified) in which every noise
        point carries the id of the cluster whose centroid is closest in
        Euclidean distance. If there is no noise, or no cluster to assign to,
        the labels are returned unchanged.
    """
    embeddings = np.asarray(embeddings)
    labels = np.array(labels, copy=True)

    noise_indices = np.flatnonzero(labels == -1)
    cluster_ids = np.unique(labels[labels != -1])

    # Nothing to move, or nowhere to move it to (all points are noise).
    if noise_indices.size == 0 or cluster_ids.size == 0:
        print("Reassigned 0 noise points to nearest clusters")
        return labels

    noise_points = embeddings[noise_indices]

    # One vectorized distance computation per centroid; the result has one row
    # per cluster and one column per noise point, which keeps memory small.
    distances = np.stack([
        np.linalg.norm(noise_points - embeddings[labels == cluster_id].mean(axis=0), axis=1)
        for cluster_id in cluster_ids
    ])

    labels[noise_indices] = cluster_ids[distances.argmin(axis=0)]

    print(f"Reassigned {len(noise_indices)} noise points to nearest clusters")
    return labels


def group_nodes_by_cluster(nodes: List[Dict], labels: np.ndarray) -> Dict[int, List[Dict]]:
    """Bucket nodes by cluster label; keys are plain ints in first-seen order."""
    clusters: Dict[int, List[Dict]] = {}
    for member, raw_label in zip(nodes, labels):
        # int() turns numpy integer labels into plain Python ints.
        clusters.setdefault(int(raw_label), []).append(member)
    return clusters

In [7]:
def mock_summarize(texts: List[str]) -> str:
    """Mock summarizer: joins the first 200 chars of up to three texts, capped at 500 chars."""
    snippets = (text[:200] for text in texts[:3])
    joined = " ".join(snippets)
    return "[MOCK SUMMARY] " + joined[:500] + "..."

In [14]:
# In-memory container for the tree built in this notebook:
#   levels           -> level number -> list of node dicts
#   parent_child_map -> summary node id -> list of child node ids
#   stats            -> level number -> {"count", "node_type"}
tree_storage = dict(levels={}, parent_child_map={}, stats={})

In [15]:
# Level 0 holds the chunk nodes that are already stored in Qdrant.
level0_nodes = get_nodes_by_level(0)
print(f"Found {len(level0_nodes)} level 0 nodes in Qdrant")

# Register the chunks as the base level of the local tree.
tree_storage["stats"][0] = dict(count=len(level0_nodes), node_type="chunk")
tree_storage["levels"][0] = level0_nodes

Found 10000 level 0 nodes in Qdrant


In [10]:
# Inspect the first level-0 node as a sanity check of the fetched data.
if level0_nodes:
    sample = level0_nodes[0]
    print("\n--- Sample Level 0 Node ---")
    print(f"Node ID: {sample['node_id']}")
    print(f"Text (first 300 chars): {sample['text'][:300]}...")
    print(f"Metadata: {json.dumps(sample['metadata'], indent=2)}")
    # Converting to an array only to report the vector's dimensions.
    print(f"Vector shape: {np.asarray(sample['vector']).shape}")


--- Sample Level 0 Node ---
Node ID: 0000b40c-138b-4d17-8b15-9e22432ee915
Text (first 300 chars): Medium : statement that accomplishes one of
these elements. (iii)High : statement that clearly
accomplishes at least two speciﬁcity elements.
Even though we do not explicitly use labels
for the four speciﬁcity elements, we found that
explicitly breaking down speciﬁcity into multiple
components helpe...
Metadata: {
  "filename": "64/article.pdf",
  "paper_id": "64",
  "level": 0,
  "node_type": "chunk",
  "node_id": "0000b40c-138b-4d17-8b15-9e22432ee915",
  "children_ids": [],
  "parent_ids": [],
  "cluster_id": null,
  "chunk_index": 23,
  "total_chunks": 47,
  "token_count": 121
}
Vector shape: (384,)


In [21]:
def build_next_level_local(
    current_nodes: List[Dict],
    current_level: int,
    summarize_fn,
    min_cluster_size: int = 5
) -> Dict[str, Any]:
    """
    Build the next tree level in memory: cluster the nodes, then summarize each cluster.

    Args:
        current_nodes: Nodes of the current level. Each node dict must provide
            "node_id", "text" and "vector".
        current_level: Level number of ``current_nodes``; the new nodes are
            created at ``current_level + 1``.
        summarize_fn: Callable that receives the list of texts of one cluster
            and returns the summary text.
        min_cluster_size: Minimum cluster size passed to the clustering step.
            With fewer nodes than this, no level is built.

    Returns:
        A dict that always contains "level", "nodes", "parent_child_map" and
        "stopped". When "stopped" is True, "nodes" and "parent_child_map" are
        empty and "reason" explains why.
    """
    next_level = current_level + 1

    if len(current_nodes) < min_cluster_size:
        print(f"Not enough nodes ({len(current_nodes)}) to cluster")
        return {
            "level": next_level,
            "nodes": [],
            # Same keys as the success result, so callers can read it unconditionally.
            "parent_child_map": {},
            "stopped": True,
            "reason": "insufficient_nodes"
        }

    embeddings = np.array([node["vector"] for node in current_nodes])
    print(f"Embeddings shape: {embeddings.shape}")

    labels = cluster_embeddings_hdbscan(
        embeddings,
        min_cluster_size=min_cluster_size,
        min_samples=2
    )
    n_clusters = len(set(labels))
    print(f"Final cluster count: {n_clusters}")

    groups = group_nodes_by_cluster(current_nodes, labels)

    print(f"\nCluster sizes:")
    for cluster_id, nodes in groups.items():
        print(f"  Cluster {cluster_id}: {len(nodes)} nodes")

    new_nodes = []
    parent_child_map = {}

    # One summary node per cluster; its children are the cluster's members.
    for cluster_id, cluster_nodes in groups.items():
        texts = [node["text"] for node in cluster_nodes]
        children_ids = [node["node_id"] for node in cluster_nodes]

        summary = summarize_fn(texts)

        summary_embedding = embedding_model.embed_query(summary)

        node_id = str(uuid.uuid4())

        new_node = {
            "node_id": node_id,
            "text": summary,
            "vector": summary_embedding,
            "metadata": {
                "level": next_level,
                "node_type": "summary",
                "children_ids": children_ids,
                "parent_ids": [],
                "cluster_id": int(cluster_id),
                "children_count": len(children_ids),
                # Rough estimate: about four characters per token.
                "token_count": len(summary) // 4,
            }
        }

        new_nodes.append(new_node)
        parent_child_map[node_id] = children_ids

    print(f"\nBuilt {len(new_nodes)} summary nodes for level {next_level}")

    return {
        "level": next_level,
        "nodes": new_nodes,
        "parent_child_map": parent_child_map,
        "stopped": False
    }

In [22]:
# Level 1: cluster the level-0 chunks and summarize each cluster.
print("=" * 50, "Building Level 1 from Level 0", "=" * 50, sep="\n")

result_level1 = build_next_level_local(
    tree_storage["levels"][0],
    0,
    mock_summarize,
)

# Record the new level only when the builder did not stop early.
if not result_level1["stopped"]:
    tree_storage["levels"][1] = result_level1["nodes"]
    tree_storage["stats"][1] = dict(
        count=len(result_level1["nodes"]),
        node_type="summary",
    )
    tree_storage["parent_child_map"].update(result_level1["parent_child_map"])

Building Level 1 from Level 0
Embeddings shape: (10000, 384)




HDBSCAN found 199 clusters, 7756 noise points
Reassigned 7756 noise points to nearest clusters
Final cluster count: 199

Cluster sizes:
  Cluster 10: 37 nodes
  Cluster 138: 92 nodes
  Cluster 95: 40 nodes
  Cluster 145: 69 nodes
  Cluster 137: 107 nodes
  Cluster 119: 57 nodes
  Cluster 157: 46 nodes
  Cluster 0: 17 nodes
  Cluster 128: 39 nodes
  Cluster 163: 49 nodes
  Cluster 42: 61 nodes
  Cluster 156: 51 nodes
  Cluster 170: 122 nodes
  Cluster 19: 61 nodes
  Cluster 60: 161 nodes
  Cluster 139: 257 nodes
  Cluster 125: 138 nodes
  Cluster 44: 72 nodes
  Cluster 55: 29 nodes
  Cluster 165: 140 nodes
  Cluster 173: 148 nodes
  Cluster 191: 59 nodes
  Cluster 182: 118 nodes
  Cluster 121: 19 nodes
  Cluster 198: 52 nodes
  Cluster 164: 27 nodes
  Cluster 150: 25 nodes
  Cluster 91: 39 nodes
  Cluster 195: 146 nodes
  Cluster 77: 46 nodes
  Cluster 78: 55 nodes
  Cluster 68: 46 nodes
  Cluster 106: 22 nodes
  Cluster 146: 48 nodes
  Cluster 58: 46 nodes
  Cluster 123: 110 nodes
  Cl

In [None]:
# Inspect the first level-1 summary node, if level 1 was built and is non-empty.
if tree_storage["levels"].get(1):
    sample = tree_storage["levels"][1][0]
    print("\n--- Sample Level 1 Node ---")
    print(f"Node ID: {sample['node_id']}")
    print(f"Summary: {sample['text'][:500]}")
    print(f"Children count: {sample['metadata']['children_count']}")
    # Only the first three child ids are shown.
    print(f"Children IDs: {sample['metadata']['children_ids'][:3]}...")

In [None]:
# Level 2: attempted only when level 1 exists and has at least 3 nodes.
# The builder itself still stops when there are fewer nodes than its
# min_cluster_size (default 5).
if len(tree_storage["levels"].get(1, [])) >= 3:
    print("=" * 50, "Building Level 2 from Level 1", "=" * 50, sep="\n")

    result_level2 = build_next_level_local(
        tree_storage["levels"][1],
        1,
        mock_summarize,
    )

    # Record the new level only when the builder did not stop early.
    if not result_level2["stopped"]:
        tree_storage["levels"][2] = result_level2["nodes"]
        tree_storage["stats"][2] = dict(
            count=len(result_level2["nodes"]),
            node_type="summary",
        )
        tree_storage["parent_child_map"].update(result_level2["parent_child_map"])
else:
    print("Not enough level 1 nodes to build level 2")

In [None]:
# Level 3: attempted only when level 2 exists and has at least 3 nodes.
# The builder itself still stops when there are fewer nodes than its
# min_cluster_size (default 5).
if len(tree_storage["levels"].get(2, [])) >= 3:
    print("=" * 50, "Building Level 3 from Level 2", "=" * 50, sep="\n")

    result_level3 = build_next_level_local(
        tree_storage["levels"][2],
        2,
        mock_summarize,
    )

    # Record the new level only when the builder did not stop early.
    if not result_level3["stopped"]:
        tree_storage["levels"][3] = result_level3["nodes"]
        tree_storage["stats"][3] = dict(
            count=len(result_level3["nodes"]),
            node_type="summary",
        )
        tree_storage["parent_child_map"].update(result_level3["parent_child_map"])
else:
    print("Not enough level 2 nodes to build level 3")

In [None]:
# Summary of what was built: per-level node counts and overall totals.
print("\n" + "=" * 50, "FINAL TREE STATISTICS", "=" * 50, sep="\n")

for level, stats in tree_storage["stats"].items():
    print(f"Level {level}: {stats['count']} nodes ({stats['node_type']})")

total_nodes = sum(entry["count"] for entry in tree_storage["stats"].values())

print(f"\nTotal nodes: {total_nodes}")
print(f"Max level: {max(tree_storage['stats'])}")
print(f"Parent-child relationships: {len(tree_storage['parent_child_map'])}")

In [None]:
def visualize_tree_sample(storage: Dict, max_children: int = 3):
    """
    Print one sample branch of the tree, starting at the first top-level node.

    Args:
        storage: Tree container whose "levels" entry maps a level number to
            the list of node dicts at that level.
        max_children: Maximum number of children followed per node.

    Returns:
        None. The branch is written to stdout. Nothing is drawn when the
        storage has no levels, only level 0, or an empty top level.
    """
    levels = storage["levels"]

    # max() raises on an empty mapping, so handle that case explicitly.
    if not levels:
        print("No levels in storage, nothing to visualize")
        return

    max_level = max(levels.keys())

    if max_level == 0:
        print("Only level 0 exists, no tree structure to visualize")
        return

    # Without a top-level node there is no root to start the branch from.
    if not levels[max_level]:
        print(f"Level {max_level} has no nodes, nothing to visualize")
        return

    print("\n" + "=" * 50)
    print("SAMPLE TREE BRANCH")
    print("=" * 50)

    top_node = levels[max_level][0]

    def print_node(node, indent=0):
        """Print ``node`` and, recursively, up to ``max_children`` of its children."""
        prefix = "  " * indent
        level = node["metadata"]["level"]
        node_type = node["metadata"]["node_type"]
        text_preview = node["text"][:80].replace("\n", " ")
        print(f"{prefix}[L{level}:{node_type}] {text_preview}...")

        if "children_ids" in node["metadata"] and node["metadata"]["children_ids"]:
            children_ids = node["metadata"]["children_ids"][:max_children]
            # Children are looked up one level below their parent.
            child_level = level - 1

            if child_level in levels:
                for child_id in children_ids:
                    child_node = next(
                        (n for n in levels[child_level] if n["node_id"] == child_id),
                        None
                    )
                    # Ids that cannot be resolved in the level below are skipped.
                    if child_node:
                        print_node(child_node, indent + 1)

    print_node(top_node)

# Print one sample branch of the tree held in tree_storage.
visualize_tree_sample(tree_storage)

In [None]:
import pickle
from pathlib import Path

# Persist the locally built tree so it can be reloaded later.
tree_pickle_path = Path("data/test_tree_storage.pkl")
# Create the target directory first; open() fails if it does not exist.
tree_pickle_path.parent.mkdir(parents=True, exist_ok=True)

with open(tree_pickle_path, "wb") as f:
    pickle.dump(tree_storage, f)

print("Tree storage saved to data/test_tree_storage.pkl")

In [None]:
# Reload and verify
# Pickle can execute code on load, so only read files this notebook wrote itself.
with open("data/test_tree_storage.pkl", "rb") as pkl_in:
    loaded_storage = pickle.load(pkl_in)

print(f"Loaded tree with {len(loaded_storage['levels'])} levels")
print(f"Stats: {loaded_storage['stats']}")