In [11]:
import logging

import dedupe

# Surface dedupe's DEBUG-level log records in the notebook output.
dedupe_logger = logging.getLogger(dedupe.__name__)
dedupe_logger.setLevel(logging.DEBUG)
# Detach any handler already attached (e.g. by a previous run of this cell) so each
# message is emitted once. Go through removeHandler() on a snapshot of the list
# instead of assigning to `handlers`, which the logging docs describe as read-only.
for stale_handler in list(dedupe_logger.handlers):
    dedupe_logger.removeHandler(stale_handler)
dedupe_logger.addHandler(logging.StreamHandler())

In [12]:
from neo4j_app import ROOT_DIR

# Every file used by the notebook lives under <ROOT_DIR>/data.
DATA_PATH = ROOT_DIR / "data"

# Inputs read by the loading cells below.
training_set_path = DATA_PATH / "person_training.csv"
excluded_set_path = DATA_PATH / "excluded.txt"
records_path = DATA_PATH / "person_records.csv"
dedupe_path = DATA_PATH / "person_deduped.csv"

# Trained-model and cluster file locations.
trained_model_path = DATA_PATH / "person_model.pickle"
clusters_path = DATA_PATH / "person_clusters.json"

In [13]:
from dedupe import read_training

# Load the existing training set with dedupe's `read_training`.
with training_set_path.open() as training_file:
    training_set = read_training(training_file)

# Load a sample of the data

In [14]:
from neo4j_app.constants import NE_MENTION_NORM
from neo4j_app.ml.graph_dedupe import read_records

# Ids to exclude: one per line, surrounding whitespace stripped, blank lines ignored.
with excluded_set_path.open() as excluded_file:
    invalid_ids = {line.strip() for line in excluded_file}
invalid_ids.discard("")

# Read the records, using NE_MENTION_NORM as id column and handing the exclusion
# set to the reader.
with records_path.open() as records_file:
    data = read_records(
        records_file, id_column=NE_MENTION_NORM, invalid_ids=invalid_ids
    )

# Downsample the data a bit

In [21]:
from typing import Sequence
from dedupe._typing import Data


def sample_data(dataset: Data, n_samples: int, sort_keys: Sequence[str]) -> Data:
    """Keep the first ``n_samples`` records of ``dataset`` once ordered by ``sort_keys``.

    The selection is deterministic, not random: records are ranked by the tuple of
    their ``sort_keys`` values (ties keep the iteration order of ``dataset``) and the
    head of that ranking is returned.

    Args:
        dataset: mapping of record id to record; every record must expose each of
            ``sort_keys``.
        n_samples: size of the head slice to keep.
        sort_keys: record fields to order by, most significant first.

    Returns:
        A new dict holding the selected ``(record id, record)`` pairs in ranked
        order. The records themselves are not copied.
    """

    def _rank(entry):
        _, record = entry
        return tuple(record[field] for field in sort_keys)

    ranked = list(dataset.items())
    ranked.sort(key=_rank)
    return dict(ranked[:n_samples])

# Compute graph-level features

In [25]:
from copy import deepcopy
from dedupe._typing import RecordDict
from typing import Dict, Generator, Iterable

# Record field under which the cluster value is stored when this constant is passed
# as `cluster_field_name` to add_mention_cluster_field.
NE_MENTION_CLUSTER = "neMentionClusterID"


def add_mention_cluster_field(
    records: Iterable[RecordDict],
    clusters: Dict,
    *,
    id_field: str,
    cluster_field_name: str
) -> Generator[RecordDict, None, None]:
    """Lazily yield a copy of every record, extended with its cluster value.

    Args:
        records: records to enrich; they are deep-copied, never modified.
        clusters: lookup from a record's ``id_field`` value to the value to attach.
        id_field: record field whose value is used as key into ``clusters``.
        cluster_field_name: field under which the looked-up value is stored.

    Yields:
        One enriched copy per input record, in input order.

    Raises:
        KeyError: while iterating, if a record has no ``id_field`` or its value is
            missing from ``clusters``.
    """

    def _enrich(record):
        enriched = deepcopy(record)
        enriched[cluster_field_name] = clusters[enriched[id_field]]
        return enriched

    yield from map(_enrich, records)

In [18]:
# Map each record id of the deduplicated file to that record's NE_MENTION_NORM value.
with dedupe_path.open() as deduped_file:
    doc_level_clusters = {
        rec_id: rec[NE_MENTION_NORM]
        for rec_id, rec in read_records(
            deduped_file, id_column=NE_MENTION_NORM, invalid_ids=invalid_ids
        ).items()
    }

In [None]:
# Number of records to keep.
N_SAMPLES = 20_000
# NOTE: rebinds `data`; after this cell the full record set is only recoverable by
# re-running the loading cell.
data = sample_data(
    data,
    n_samples=N_SAMPLES,
    sort_keys=["docId"],
)

In [27]:
# Split ids and records into aligned sequences. Reading the dict's keys and values
# directly (same iteration order) also works when `data` is empty, where
# `zip(*data.items())` has nothing to unpack and raises ValueError.
data_keys, data_values = tuple(data), tuple(data.values())
# Re-attach each id to its enriched copy; add_mention_cluster_field deep-copies the
# records, so the originals in `data` stay untouched.
new_training_set = dict(
    zip(
        data_keys,
        add_mention_cluster_field(
            data_values,
            clusters=doc_level_clusters,
            id_field=NE_MENTION_NORM,
            cluster_field_name=NE_MENTION_CLUSTER,
        ),
    )
)
len(new_training_set)

20000

# Make the most of the already annotated data

Keep the training set and compute the new features.