In [1]:
import logging

import dedupe

# Send the dedupe library's DEBUG-level log records to a stream handler so they
# show up in the notebook output.
dedupe_logger = logging.getLogger(dedupe.__name__)
# Reset the handler list first: re-running this cell would otherwise attach one
# more StreamHandler each time.
dedupe_logger.handlers = []
dedupe_logger.addHandler(logging.StreamHandler())
dedupe_logger.setLevel(logging.DEBUG)

In [10]:
import pandas as pd

from neo4j_app import ROOT_DIR

# All files used by this notebook live under <ROOT_DIR>/data.
DATA_PATH = ROOT_DIR / "data"

# Files read by the cells below.
graph_level_trained_model_path = DATA_PATH / "graph_level_person_model.pickle"
graph_level_records_path = DATA_PATH / "graph_level_person_records.csv"
graph_level_excluded_set_path = DATA_PATH / "graph_level_excluded.txt"

# Training file path (not referenced by the cells visible in this part of the notebook).
graph_level_training_path = DATA_PATH / "graph_level_person_training.json"

# Files written by the cells below (the clusters file is also read back as a cache).
graph_level_clusters_path = DATA_PATH / "graph_level_clusters.json"
graph_level_dedupe_path = DATA_PATH / "graph_level_person_dedupe.csv"

In [5]:
from dedupe import StaticDedupe

# Build the static (already trained) deduper from the saved settings file.
# NOTE(review): the file name ends in ".pickle" - only load settings files that
# come from a trusted source.
with graph_level_trained_model_path.open("rb") as settings_file:
    deduper = StaticDedupe(settings_file=settings_file)

Predicate set:
(SimplePredicate: (wholeFieldPredicate, docFilename), SimplePredicate: (sameThreeCharStartPredicate, mentionNorm))
(SimplePredicate: (wholeFieldPredicate, docFilename), PartialPredicate: (commonSixGram, mentionNorm, Surname))
(PartialIndexTfidfTextCanopyPredicate: (0.4, mentionNorm, CorporationName), TfidfTextCanopyPredicate: (0.8, docDirname))
(SimplePredicate: (wholeFieldPredicate, docId), TfidfNGramCanopyPredicate: (0.4, mentionNorm))
(SimplePredicate: (wholeFieldPredicate, docId), SimplePredicate: (wholeFieldPredicate, neMentionClusterID))
(SimplePredicate: (commonThreeTokens, mentionNorm), PartialIndexTfidfNGramCanopyPredicate: (0.8, mentionNorm, Surname))
(PartialPredicate: (firstTwoTokensPredicate, mentionNorm, Surname), SimplePredicate: (oneGramFingerprint, docDirname))
(SimplePredicate: (commonThreeTokens, mentionNorm), PartialIndexTfidfNGramCanopyPredicate: (0.6, mentionNorm, CorporationName))


In [6]:
from neo4j_app.ml.graph_dedupe import NE_MENTION_NORM_DOC_ID, read_records

# One excluded id per line; surrounding whitespace is stripped and blank lines
# are ignored.
with graph_level_excluded_set_path.open() as excluded_file:
    invalid_ids = {line.strip() for line in excluded_file} - {""}

# Load the records, passing the excluded ids to read_records.
with graph_level_records_path.open() as records_file:
    data = read_records(
        records_file,
        id_column=NE_MENTION_NORM_DOC_ID,
        invalid_ids=invalid_ids,
    )

In [5]:
# TODO: load the full data and not just the subset

In [7]:
import json
from neo4j_app.ml.graph_dedupe import compute_membership

THRESHOLD = 0.4
clusters = None
# NOTE: the cache check below does not look at THRESHOLD; delete the clusters
# file to force a recomputation after changing the threshold.
if not graph_level_clusters_path.exists():
    # Nothing cached on disk yet: partition the records, compute the membership
    # mapping and persist it as JSON for later runs.
    clustered_dupes = deduper.partition(data, threshold=THRESHOLD)
    clusters = compute_membership(
        clustered_dupes,
        cluster_key="graph_cluster_id",
        confidence_key="graph_cluster_confidence",
    )
    serialized_clusters = json.dumps(clusters)
    graph_level_clusters_path.write_text(serialized_clusters)
else:
    # Reuse the result saved by a previous run.
    cached_clusters = graph_level_clusters_path.read_text()
    clusters = json.loads(cached_clusters)

Removing stop word de
Canopy: PartialIndexTfidfTextCanopyPredicate: (0.4, mentionNorm, CorporationName)
Removing stop word he
Removing stop word om
Removing stop word ro
Removing stop word th
Removing stop word ai
Removing stop word in
Removing stop word nt
Removing stop word sa
Removing stop word st
Removing stop word  g
Removing stop word an
Removing stop word au
Removing stop word co
Removing stop word e 
Removing stop word ea
Removing stop word et
Removing stop word il
Removing stop word je
Removing stop word la
Removing stop word ll
Removing stop word ma
Removing stop word me
Removing stop word mo
Removing stop word n 
Removing stop word li
Removing stop word t 
Removing stop word te
Removing stop word er
Removing stop word na
Removing stop word nd
Removing stop word ni
Removing stop word on
Removing stop word  p
Removing stop word al
Removing stop word at
Removing stop word en
Removing stop word ge
Removing stop word ic
Removing stop word le
Removing stop word pa
Removing stop wo

In [8]:
# Merge each record's cluster entry (looked up by record id) into the record itself.
for rec_id, record in data.items():
    cluster_fields = clusters[rec_id]
    record.update(cluster_fields)

In [9]:
# One row per record, indexed by record id. The id also exists as a regular
# column in each record, so that duplicate column is dropped once the index
# carries the name.
data_df = (
    pd.DataFrame.from_dict(data, orient="index")
    .rename_axis(NE_MENTION_NORM_DOC_ID)
    .drop(columns=[NE_MENTION_NORM_DOC_ID])
)

# Analyze precision

In [21]:
from neo4j_app.ml.graph_dedupe import NE_DEBUG_DOC_URL

# Order rows by cluster and then by document, keep the cluster id, document id,
# NE_DEBUG_DOC_URL and confidence columns, and export the result to CSV.
order_cols = ["graph_cluster_id", "docId"]
export_cols = [*order_cols, NE_DEBUG_DOC_URL, "graph_cluster_confidence"]
data_df = data_df.sort_values(by=order_cols)[export_cols]
data_df.to_csv(graph_level_dedupe_path)

In [23]:
# Number of records assigned to each cluster id, largest clusters first.
cluster_ids = data_df["graph_cluster_id"]
count = cluster_ids.value_counts()
count

graph_cluster_id
160       1664
2403       682
198        679
166        621
2552       539
          ... 
94948        1
94949        1
94950        1
94951        1
219828       1
Name: count, Length: 219827, dtype: int64

In [28]:
# `count` has one entry per distinct cluster id, so its length is the number of clusters.
count.shape

(219827,)

In [None]:
# Distribution of cluster sizes: index = cluster size (records per cluster),
# value = number of clusters having that size.
# Computed from data_df rather than from `count` itself so the cell is
# idempotent: re-running it no longer applies value_counts() one more time to
# its own previous result.
count = data_df["graph_cluster_id"].value_counts().value_counts()

In [20]:
import plotly.express as px

# Bar chart of `count`. Assumes the preceding cell has replaced `count` with the
# cluster-size distribution (index = cluster size, value = number of clusters).
fig = px.bar(count)
# Title and axis labels so the figure can be understood on its own.
fig.update_layout(
    title="Distribution of graph-level cluster sizes",
    xaxis_title="Cluster size (records per cluster)",
    yaxis_title="Number of clusters",
)
fig.show()

# Analyze recall