In [14]:
import json

import numpy as np
import tqdm as notebook_tqdm
from sentence_transformers import SentenceTransformer
from sklearn.cluster import HDBSCAN

In [None]:
def _signature_to_text(signature: dict) -> str:
    """Render a signature's field names as one canonical, order-independent string."""
    # '_unparsed' is the catch-all field that swallows everything, so it is ignored;
    # sorting makes the text identical regardless of the order fields were logged in.
    field_names = sorted(name for name in signature if name != "_unparsed")
    if not field_names:
        return "schema:empty_blob"
    # 'field:' prefix to define the role of the tokens
    return " ".join(f"field:{name}" for name in field_names)


def conjoin_signatures_semantic(
    registry_output: dict,
    embedding_model: str = "all-MiniLM-L6-v2",
    cache_dir: str = "./.models",
    outlier_id_offset: int = 400,
) -> dict:
    """Cluster signatures whose field-name sets are semantically similar.

    Every signature is turned into a text string of sorted ``field:<name>``
    tokens, embedded with a ``SentenceTransformer`` model and clustered with
    HDBSCAN using the cosine metric.

    Args:
        registry_output: Mapping of signature hash to an entry that has a
            ``"signature"`` mapping (field name -> value).
        embedding_model: Model name handed to ``SentenceTransformer``.
        cache_dir: Passed to ``SentenceTransformer`` as ``cache_folder``.
        outlier_id_offset: Lowest ID that may be assigned to an outlier. The
            effective starting ID is raised when needed so that outlier IDs
            never overlap the cluster labels produced by HDBSCAN.

    Returns:
        Mapping of signature hash to a dict with:
            ``"cluster_id"`` (int): the HDBSCAN label, or a unique per-entry ID
                for outliers so they do not share a single ``-1`` bucket.
            ``"keys"`` (list): the signature's field names, ``_unparsed`` included.
            ``"is_outlier"`` (bool): True when HDBSCAN labelled the entry ``-1``.
        All values are plain Python types, so the result is JSON-serialisable.
        An empty registry yields an empty dict.
    """
    hashes = list(registry_output)
    if not hashes:
        return {}

    if len(hashes) == 1:
        # A lone signature can never satisfy min_cluster_size=2, so report it
        # as an outlier without loading the embedding model at all.
        labels = [-1]
    else:
        signatures_as_text = [
            _signature_to_text(registry_output[h]["signature"]) for h in hashes
        ]

        model = SentenceTransformer(embedding_model, cache_folder=cache_dir)
        embeddings = model.encode(signatures_as_text)
        X = np.ascontiguousarray(embeddings, dtype=np.float64)

        clusterer = HDBSCAN(
            min_cluster_size=2,
            min_samples=1,
            metric="cosine",
            cluster_selection_epsilon=0.08,
            cluster_selection_method="eom",
            allow_single_cluster=True,
        )
        # Cast away the numpy integer type so downstream values are plain ints/bools.
        labels = [int(label) for label in clusterer.fit_predict(X)]

    # Cluster labels occupy 0..max(labels); start outlier IDs past that range
    # so an outlier can never be mistaken for a member of a real cluster.
    first_outlier_id = max(outlier_id_offset, max(labels) + 1)

    conjoined_map = {}
    for i, (h, label) in enumerate(zip(hashes, labels)):
        is_outlier = label == -1
        conjoined_map[h] = {
            # unique IDs to outliers so they don't group into one '-1' bucket
            "cluster_id": first_outlier_id + i if is_outlier else label,
            "keys": list(registry_output[h]["signature"].keys()),
            "is_outlier": is_outlier,
        }

    return conjoined_map

In [43]:
# Read the sample registry. The encoding is pinned to UTF-8 so the load does not
# depend on the platform's locale default when the logs contain non-ASCII text.
with open("sample_registry.json", encoding="utf-8") as f:
    registry = json.load(f)

registry

{'fd116cd512d5ecd2e59edf12fc258b32': {'signature': {'order': 'str',
   'buyer': 'str',
   'location': 'str',
   'total': 'str',
   'items': 'str'},
  'records': [{'raw': 'Order 1001: Buyer=John Davis, Location=Columbus, OH, Total=$742.10, Items: laptop, hdmi cable',
    'parsed': {'order': '1001',
     'buyer': 'John Davis',
     'location': 'Columbus, OH',
     'total': '$742.10',
     'items': 'laptop, hdmi cable'}},
   {'raw': 'Order 1004:   Buyer=  AMANDA SMITH ,Location=Seattle, WA,Total=$50.00, Items: desk lamp',
    'parsed': {'order': '1004',
     'buyer': 'AMANDA SMITH',
     'location': 'Seattle, WA',
     'total': '$50.00',
     'items': 'desk lamp'}},
   {'raw': 'Order 1006: total=$89.99, location=Miami, FL, buyer=Elena Rossi, Items: keyboard',
    'parsed': {'order': '1006',
     'total': '$89.99',
     'location': 'Miami, FL',
     'buyer': 'Elena Rossi',
     'items': 'keyboard'}},
   {'raw': 'Order 1007: Buyer=Chris P., Location=Denver, CO, Total=$12.00, Items: stickers

In [44]:
# Cluster the registry's signatures, then invert the per-hash result into
# cluster_id -> list of member signatures for display.
cluster_mapping = conjoin_signatures_semantic(registry)

group_by_cluster = {}
for signature_hash, info in cluster_mapping.items():
    member = {
        "signature_hash": signature_hash,
        "fields": info["keys"],
        # plain bool so json.dumps accepts the value
        "is_outlier": bool(info["is_outlier"]),
    }
    group_by_cluster.setdefault(info["cluster_id"], []).append(member)


print(f"Semantic Clusters: \n{json.dumps(group_by_cluster, indent=4)}\n")

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 2016.83it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Semantic Clusters: 
{
    "0": [
        {
            "signature_hash": "fd116cd512d5ecd2e59edf12fc258b32",
            "fields": [
                "order",
                "buyer",
                "location",
                "total",
                "items"
            ],
            "is_outlier": false
        },
        {
            "signature_hash": "50eb97a85647221ecc7f65f74d68d156",
            "fields": [
                "order",
                "buyer",
                "total",
                "items"
            ],
            "is_outlier": false
        },
        {
            "signature_hash": "28d9f3b14d0e5516a186062212502d0c",
            "fields": [
                "order",
                "buyer",
                "locadtion",
                "total",
                "items"
            ],
            "is_outlier": false
        }
    ],
    "1": [
        {
            "signature_hash": "6f2b720d18e351508e6a8b520ae97f92",
            "fields": [
                "maple

  warn(
