In [1]:
import json

from sklearn.cluster import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def conjoin_signatures(registry_output: dict):
    hashes = list(registry_output.keys())

    signatures_as_text = [
        " ".join(registry_output[h]["signature"].keys()) for h in hashes
    ]

    vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(3, 5))
    matrix = vectorizer.fit_transform(signatures_as_text)
    clusterer = HDBSCAN(
        min_cluster_size=2,
        metric="euclidean",
        copy=True,
    )
    labels = clusterer.fit_predict(matrix.toarray())
    conjoined_map = {}
    for i, cluster_id in enumerate(labels):
        h = hashes[i]
        conjoined_map[h] = {
            "cluster_id": int(cluster_id),
            "keys": list(registry_output[h]["signature"].keys()),
            "is_outlier": cluster_id == -1,
        }

    return conjoined_map

In [3]:
with open("sample_registry.json") as f:
    registry = json.load(f)

registry

{'fd116cd512d5ecd2e59edf12fc258b32': {'signature': {'order': 'str',
   'buyer': 'str',
   'location': 'str',
   'total': 'str',
   'items': 'str'},
  'records': [{'raw': 'Order 1001: Buyer=John Davis, Location=Columbus, OH, Total=$742.10, Items: laptop, hdmi cable',
    'parsed': {'order': '1001',
     'buyer': 'John Davis',
     'location': 'Columbus, OH',
     'total': '$742.10',
     'items': 'laptop, hdmi cable'}},
   {'raw': 'Order 1004:   Buyer=  AMANDA SMITH ,Location=Seattle, WA,Total=$50.00, Items: desk lamp',
    'parsed': {'order': '1004',
     'buyer': 'AMANDA SMITH',
     'location': 'Seattle, WA',
     'total': '$50.00',
     'items': 'desk lamp'}},
   {'raw': 'Order 1006: total=$89.99, location=Miami, FL, buyer=Elena Rossi, Items: keyboard',
    'parsed': {'order': '1006',
     'total': '$89.99',
     'location': 'Miami, FL',
     'buyer': 'Elena Rossi',
     'items': 'keyboard'}},
   {'raw': 'Order 1007: Buyer=Chris P., Location=Denver, CO, Total=$12.00, Items: stickers

In [4]:
conjoin_signatures(registry)

{'fd116cd512d5ecd2e59edf12fc258b32': {'cluster_id': 1,
  'keys': ['order', 'buyer', 'location', 'total', 'items'],
  'is_outlier': np.False_},
 '50eb97a85647221ecc7f65f74d68d156': {'cluster_id': 1,
  'keys': ['order', 'buyer', 'total', 'items'],
  'is_outlier': np.False_},
 '28d9f3b14d0e5516a186062212502d0c': {'cluster_id': 1,
  'keys': ['order', 'buyer', 'locadtion', 'total', 'items'],
  'is_outlier': np.False_},
 '6f2b720d18e351508e6a8b520ae97f92': {'cluster_id': 0,
  'keys': ['maples', 'name'],
  'is_outlier': np.False_},
 '3340e11ee417dd9f9cab0fd70836ccb4': {'cluster_id': 0,
  'keys': ['name', 'hobby', 'id'],
  'is_outlier': np.False_},
 'ce59f3f30262af34c2a4b11cec9950dd': {'cluster_id': -1,
  'keys': ['version', 'product'],
  'is_outlier': np.True_},
 '3baae1f59cac077e89e2f0b7d47a36cf': {'cluster_id': 3,
  'keys': ['user_id', '_unparsed'],
  'is_outlier': np.False_},
 'df87355cb94200d773396f5befa867d4': {'cluster_id': 3,
  'keys': ['_unparsed'],
  'is_outlier': np.False_},
 'c2aeb