In [16]:
import json

import tqdm as notebook_tqdm
from sentence_transformers import SentenceTransformer
from sklearn.cluster import HDBSCAN

In [17]:
def conjoin_signatures_semantic(registry_output: dict):
    """Cluster schema signatures by the semantic similarity of their field names.

    The field names of each signature are joined into one comma-separated
    string, embedded with the ``all-MiniLM-L6-v2`` sentence-transformer model,
    and the embeddings are clustered with HDBSCAN using cosine distance.

    Args:
        registry_output: Mapping of signature hash -> entry. Every entry must
            contain a ``"signature"`` mapping whose keys are the field names.

    Returns:
        dict: signature hash -> ``{"cluster_id": int, "keys": list[str],
        "is_outlier": bool}``. ``cluster_id`` is ``-1`` (and ``is_outlier``
        is ``True``) for signatures that were not assigned to any cluster.
        All values are native Python types, so the result can be passed
        straight to ``json.dumps``.
    """
    hashes = list(registry_output)
    # Extract the field names once; they feed both the embedding text and the output.
    field_names = {h: list(registry_output[h]["signature"].keys()) for h in hashes}

    # With min_cluster_size=2 a cluster needs at least two signatures, so zero
    # or one signature can only ever be noise. Short-circuit instead of loading
    # the model and handing HDBSCAN an input it cannot cluster.
    if len(hashes) < 2:
        return {
            h: {"cluster_id": -1, "keys": field_names[h], "is_outlier": True}
            for h in hashes
        }

    signatures_as_text = [", ".join(field_names[h]) for h in hashes]

    # NOTE(review): the model is re-loaded on every call; fine for a one-off
    # notebook run, worth hoisting if this is called repeatedly.
    model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder="./.models")
    embeddings = model.encode(signatures_as_text)

    clusterer = HDBSCAN(
        min_cluster_size=2,
        min_samples=1,
        metric="cosine",
        cluster_selection_epsilon=0.18,
        cluster_selection_method="eom",
        copy=True,
    )
    labels = clusterer.fit_predict(embeddings.astype("float64"))

    conjoined_map = {}
    for h, label in zip(hashes, labels):
        # Convert the numpy integer label first so that both the id and the
        # derived outlier flag are plain Python values (JSON-serialisable).
        cluster_id = int(label)
        conjoined_map[h] = {
            "cluster_id": cluster_id,
            "keys": field_names[h],
            "is_outlier": cluster_id == -1,
        }

    return conjoined_map

In [18]:
# Load the sample registry. JSON text is UTF-8, so pin the encoding rather
# than relying on the platform default.
with open("sample_registry.json", encoding="utf-8") as f:
    registry = json.load(f)

# Display the loaded registry as the cell output.
registry

{'fd116cd512d5ecd2e59edf12fc258b32': {'signature': {'buyer': 'str',
   'items': 'str',
   'location': 'str',
   'order': 'str',
   'total': 'str'},
  'records': [{'raw': 'Order 1001: Buyer=John Davis, Location=Columbus, OH, Total=$742.10, Items: laptop, hdmi cable',
    'parsed': {'Buyer': 'John Davis',
     'Items': 'laptop, hdmi cable',
     'Location': 'Columbus, OH',
     'Order': '1001',
     'Total': '$742.10'}},
   {'raw': 'Order 1004:   Buyer=  AMANDA SMITH ,Location=Seattle, WA,Total=$50.00, Items: desk lamp',
    'parsed': {'Buyer': 'AMANDA SMITH',
     'Items': 'desk lamp',
     'Location': 'Seattle, WA',
     'Order': '1004',
     'Total': '$50.00'}},
   {'raw': 'Order 1006: total=$89.99, location=Miami, FL, buyer=Elena Rossi, Items: keyboard',
    'parsed': {'Items': 'keyboard',
     'Order': '1006',
     'buyer': 'Elena Rossi',
     'location': 'Miami, FL',
     'total': '$89.99'}},
   {'raw': 'Order 1007: Buyer=Chris P., Location=Denver, CO, Total=$12.00, Items: stickers

In [19]:
# Cluster the registry signatures, then invert the per-hash mapping into
# cluster id -> list of member signatures for display.
cluster_mapping = conjoin_signatures_semantic(registry)

group_by_cluster = {}
for signature_hash, info in cluster_mapping.items():
    member = {
        "signature_hash": signature_hash,
        "fields": info["keys"],
        # Coerce to a plain bool so json.dumps can serialise the flag.
        "is_outlier": bool(info["is_outlier"]),
    }
    group_by_cluster.setdefault(info["cluster_id"], []).append(member)


print(f"Semantic Clusters: \n{json.dumps(group_by_cluster, indent=4)}\n")

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1707.45it/s, Materializing param=pooler.dense.weight]                             


Semantic Clusters: 
{
    "0": [
        {
            "signature_hash": "fd116cd512d5ecd2e59edf12fc258b32",
            "fields": [
                "buyer",
                "items",
                "location",
                "order",
                "total"
            ],
            "is_outlier": false
        },
        {
            "signature_hash": "50eb97a85647221ecc7f65f74d68d156",
            "fields": [
                "buyer",
                "items",
                "order",
                "total"
            ],
            "is_outlier": false
        },
        {
            "signature_hash": "28d9f3b14d0e5516a186062212502d0c",
            "fields": [
                "buyer",
                "items",
                "locadtion",
                "order",
                "total"
            ],
            "is_outlier": false
        },
        {
            "signature_hash": "6e1f3e68f259f916acfb5dec4affefdd",
            "fields": [
                "user.id",
            