In [1]:
import logging

import dedupe

# Route the dedupe package's log records (DEBUG and above) to a stream handler
# so they show up in the notebook.  Handlers already attached to this logger
# are removed first, so re-running the cell does not stack up handlers.
dedupe_logger = logging.getLogger(dedupe.__name__)
dedupe_logger.handlers.clear()
dedupe_logger.addHandler(logging.StreamHandler())
dedupe_logger.setLevel(logging.DEBUG)

In [2]:
import pandas as pd

from neo4j_app import ROOT_DIR

# Every file this notebook reads or writes lives in the project's "data" folder.
DATA_PATH = ROOT_DIR.joinpath("data")

# Files that are read: raw records, trained model settings, ids to exclude.
records_path = DATA_PATH.joinpath("person_records.csv")
trained_model_path = DATA_PATH.joinpath("person_model.pickle")
excluded_set_path = DATA_PATH.joinpath("excluded.txt")

# Files that are written: the cluster cache (also read back when it already
# exists) and the final CSV export.
clusters_path = DATA_PATH.joinpath("person_clusters.json")
dedupe_path = DATA_PATH.joinpath("person_deduped.csv")

In [3]:
from neo4j_app.ml.graph_dedupe import HardStaticDedupe

# Build the deduper from the previously trained settings file.
# NOTE(review): the ".pickle" suffix suggests the settings are unpickled while
# loading -- only point this at model files from a trusted source (confirm
# against HardStaticDedupe).
with trained_model_path.open("rb") as settings_fh:
    deduper = HardStaticDedupe(settings_file=settings_fh, doc_key="docId")

Predicate set:
TfidfTextCanopyPredicate: (0.6, mentionNorm)
PartialPredicate: (sameFiveCharStartPredicate, mentionNorm, Surname)
(PartialPredicate: (sameSevenCharStartPredicate, mentionNorm, CorporationName), PartialIndexLevenshteinCanopyPredicate: (1, mentionNorm, Surname))
SimplePredicate: (oneGramFingerprint, mentionNorm)
(PartialPredicate: (commonSixGram, mentionNorm, Surname), TfidfNGramCanopyPredicate: (0.6, mentionNorm))
PartialPredicate: (sortedAcronym, mentionNorm, CorporationName)
(LevenshteinCanopyPredicate: (2, mentionNorm), SimplePredicate: (suffixArray, mentionNorm))


In [4]:
from neo4j_app.constants import NE_MENTION_NORM
from neo4j_app.ml.graph_dedupe import read_records

# One id per line in the exclusion file; surrounding whitespace is stripped and
# blank lines are ignored.
with excluded_set_path.open() as excluded_file:
    invalid_ids = {line.strip() for line in excluded_file}
invalid_ids.discard("")

# Load the records, handing the exclusion set to read_records as `invalid_ids`
# and using the normalised mention as the id column.
with records_path.open() as records_file:
    data = read_records(
        records_file, invalid_ids=invalid_ids, id_column=NE_MENTION_NORM
    )

In [5]:
# TODO: load the full data and not just the subset

In [6]:
from typing import Dict, List


def compute_membership(partition: List) -> Dict:
    """Turn a list of clusters into a per-record lookup table.

    Args:
        partition: iterable of ``(records, scores)`` pairs, one pair per
            cluster.  ``records`` and ``scores`` are walked in parallel, so
            any surplus items in the longer of the two are ignored.

    Returns:
        A dict mapping each record id to
        ``{"cluster_id": <position of its cluster in partition>,
        "cluster_confidence": <its score converted to float>}``.
        If a record id shows up more than once, the last occurrence wins.
    """
    return {
        member: {"cluster_id": idx, "cluster_confidence": float(confidence)}
        for idx, (members, confidences) in enumerate(partition)
        for member, confidence in zip(members, confidences)
    }

In [7]:
import json

# Cluster membership is cached on disk as JSON.  An existing cache file is
# reused as-is -- it is NOT invalidated when THRESHOLD or the input data
# change, so delete the file to force a fresh partition.
THRESHOLD = 0.4
clusters = None
if not clusters_path.exists():
    clustered_dupes = deduper.partition(data, threshold=THRESHOLD)
    clusters = compute_membership(clustered_dupes)
    clusters_path.write_text(json.dumps(clusters))
else:
    clusters = json.loads(clusters_path.read_text())

In [8]:
# Copy each record's cluster fields into the record itself (updates `data` in
# place).  A record id missing from `clusters` raises KeyError.
for rec_id, record in data.items():
    record.update(clusters[rec_id])

In [9]:
# One row per record, keyed by the record id.  The id becomes the index (named
# "mentionNorm"), so the "mentionNorm" column carried by the records is dropped.
data_df = (
    pd.DataFrame.from_dict(data, orient="index")
    .rename_axis("mentionNorm")
    .drop(columns=["mentionNorm"])
)

In [10]:
# Sort by document, then cluster, then mention, then confidence.  "mentionNorm"
# is no longer a column at this point, so it refers to the index level of that
# name.  Afterwards keep only the exported columns, in export order.
order_cols = ["docId", "cluster_id", "mentionNorm", "cluster_confidence"]
data_df = data_df.sort_values(by=order_cols)[
    ["cluster_id", "cluster_confidence", "docId"]
]

In [17]:
# Export the clustered records; the "mentionNorm" index is written as the
# first CSV column (to_csv keeps the index by default).
data_df.to_csv(dedupe_path)

In [12]:
# First value_counts: number of records in each cluster.
# Second value_counts: how many clusters have each size, so
# count[k] is the number of clusters containing exactly k records.
cluster_sizes = data_df["cluster_id"].value_counts()
count = cluster_sizes.value_counts()
count

count
1     62727
2      2751
3       286
4        63
5        24
6        15
7        11
9         7
8         6
15        2
10        1
33        1
11        1
13        1
21        1
24        1
31        1
54        1
Name: count, dtype: int64

In [69]:
# Summary statistics of the cluster-size histogram computed above.
# NOTE(review): the saved output of this cell (13 rows, max 63727) does not
# match the table shown for `count` (18 rows, top value 62727); it looks like
# it comes from an earlier run -- re-run the notebook top to bottom to refresh.
count.describe()

count       13.000000
mean      5120.230769
std      17622.829181
min          1.000000
25%          1.000000
50%          5.000000
75%         34.000000
max      63727.000000
Name: count, dtype: float64

In [None]:
import plotly.express as px

# Bar chart of the cluster-size histogram held in `count`.
fig = px.bar(count)
fig.show()

In [11]:
# Number of entries in `data`, i.e. records returned by read_records.
len(data)

69964