In [1]:
import logging

import dedupe

# Show dedupe's DEBUG logs in the notebook output. The handler list is replaced
# rather than extended so that re-running this cell never stacks handlers.
dedupe_logger = logging.getLogger(dedupe.__name__)
dedupe_logger.handlers = [logging.StreamHandler()]
dedupe_logger.setLevel(logging.DEBUG)

In [2]:
from neo4j_app import ROOT_DIR

# Every file used by this notebook lives in the "data" directory under ROOT_DIR.
DATA_PATH = ROOT_DIR / "data"

# Document-level artefacts.
records_path = DATA_PATH.joinpath("person_records.csv")
excluded_set_path = DATA_PATH.joinpath("excluded.txt")
clusters_path = DATA_PATH.joinpath("person_clusters.json")
dedupe_path = DATA_PATH.joinpath("person_deduped.csv")
training_set_path = DATA_PATH.joinpath("person_training.csv")

# Graph-level artefacts (records and training set augmented with cluster ids,
# and the model path handed to the training run).
graph_level_trained_model_path = DATA_PATH.joinpath("graph_level_person_model.pickle")
graph_level_records_path = DATA_PATH.joinpath("graph_level_person_records.csv")
graph_level_training_path = DATA_PATH.joinpath("graph_level_person_training.json")

# Load a sample of the data

In [3]:
from neo4j_app.constants import NE_MENTION_NORM
from neo4j_app.ml.graph_dedupe import read_records

# One excluded id per line; surrounding whitespace and blank lines are ignored.
with excluded_set_path.open() as f:
    invalid_ids = {stripped for stripped in map(str.strip, f) if stripped}

# Records are indexed by their normalized mention; excluded ids are passed
# to the reader so that it can leave them out.
with records_path.open() as f:
    data = read_records(f, id_column=NE_MENTION_NORM, invalid_ids=invalid_ids)

In [4]:
from copy import deepcopy
from typing import Set
from dedupe._typing import TrainingData
from dedupe import read_training


def filter_training_set(labeled_pairs: TrainingData, invalid: Set[str]) -> TrainingData:
    """Return a copy of the training set without pairs involving invalid records.

    A pair is kept only when neither of its two records has its
    ``NE_MENTION_NORM`` value in ``invalid``.

    :param labeled_pairs: annotated pairs, under the "distinct" and "match" keys
    :param invalid: identifiers of the records to discard
    :return: a deep copy of ``labeled_pairs`` with the offending pairs removed;
        the input is left untouched
    """
    filtered = deepcopy(labeled_pairs)
    # Both labels are filtered with the exact same rule
    for label in ("distinct", "match"):
        kept = []
        for left, right in filtered[label]:
            if left[NE_MENTION_NORM] in invalid or right[NE_MENTION_NORM] in invalid:
                continue
            kept.append((left, right))
        filtered[label] = kept
    return filtered


# Read the annotated pairs first, then discard those involving excluded ids.
with training_set_path.open() as f:
    annotated_pairs = read_training(f)
training_set = filter_training_set(annotated_pairs, invalid=invalid_ids)

# Downsample the data a bit

In [5]:
from typing import Sequence
from dedupe._typing import Data


def sample_data(dataset: Data, n_samples: int, sort_keys: Sequence[str]) -> Data:
    """Keep the first ``n_samples`` records of ``dataset`` once sorted.

    The selection is deterministic: records are ordered by the tuple of their
    ``sort_keys`` values, ties keep the original iteration order.

    :param dataset: records indexed by their id
    :param n_samples: number of records to keep (used as a slice bound)
    :param sort_keys: record fields used, in order, to sort the records
    :return: a new dict holding the selected ``(id, record)`` items
    """

    def _ordering(item):
        _, record = item
        return tuple(record[key] for key in sort_keys)

    ordered = list(dataset.items())
    ordered.sort(key=_ordering)
    return dict(ordered[:n_samples])

# Compute graph-level features

In [6]:
from dedupe._typing import RecordDict, TrainingData
from typing import Callable, Dict


def add_mention_cluster_field(
    record: RecordDict, clusters: Dict, *, id_field: str, cluster_field_name: str
) -> RecordDict:
    """Return a copy of ``record`` enriched with the cluster of its mention.

    :param record: the record to augment, left untouched
    :param clusters: mapping from record id to cluster
    :param id_field: record field holding the id to look up in ``clusters``
    :param cluster_field_name: name of the field receiving the cluster
    :return: a deep copy of ``record`` with ``cluster_field_name`` set
    :raises KeyError: when the record id is missing from ``clusters``
    """
    augmented = deepcopy(record)
    mention_id = augmented[id_field]
    augmented[cluster_field_name] = clusters[mention_id]
    return augmented


def augment_training_set(
    labeled_pairs: TrainingData, augment_fn: Callable[[RecordDict], RecordDict]
) -> TrainingData:
    """Apply ``augment_fn`` to both records of every annotated pair.

    :param labeled_pairs: annotated pairs, under the "distinct" and "match" keys
    :param augment_fn: function computing the augmented version of a record
    :return: a new training set made of the augmented pairs, "distinct" pairs
        being processed before "match" ones
    """

    def _augment_pairs(pairs):
        augmented = []
        for lhs, rhs in pairs:
            augmented.append((augment_fn(lhs), augment_fn(rhs)))
        return augmented

    return TrainingData(
        distinct=_augment_pairs(labeled_pairs["distinct"]),
        match=_augment_pairs(labeled_pairs["match"]),
    )

In [7]:
# Map each record id to the "cluster_id" found in the deduped records file.
with dedupe_path.open() as f:
    doc_level_clusters = {
        rec_id: rec["cluster_id"]
        for rec_id, rec in read_records(
            f, id_column=NE_MENTION_NORM, invalid_ids=invalid_ids
        ).items()
    }

In [8]:
from neo4j_app.ml.graph_dedupe import NE_MENTION_CLUSTER

# Attach its document-level cluster to every record. Records are copied by
# add_mention_cluster_field, so the previous dict content is not modified.
data = dict(
    (
        rec_id,
        add_mention_cluster_field(
            rec,
            doc_level_clusters,
            cluster_field_name=NE_MENTION_CLUSTER,
            id_field=NE_MENTION_NORM,
        ),
    )
    for rec_id, rec in data.items()
)

In [9]:
from neo4j_app.ml.graph_dedupe import NE_FIELDNAMES, write_dataset

# Set N_SAMPLES to an int to only keep the first N records (sorted by docId);
# None keeps the full dataset.
N_SAMPLES = None
NEW_FIELDNAMES = [*NE_FIELDNAMES, NE_MENTION_CLUSTER]
data = (
    data
    if N_SAMPLES is None
    else sample_data(data, n_samples=N_SAMPLES, sort_keys=["docId"])
)
# NOTE(review): if write_dataset relies on the csv module, the file should
# probably be opened with newline="" - confirm against its implementation.
with graph_level_records_path.open("w") as f:
    write_dataset(data.values(), fieldnames=NEW_FIELDNAMES, dataset_f=f)

In [10]:
import functools
from dedupe import write_training

# The augmented training set is only computed when its file is missing; an
# existing file is reused as is (delete it to force a refresh).
if not graph_level_training_path.exists():
    cluster_field_kwargs = dict(
        clusters=doc_level_clusters,
        id_field=NE_MENTION_NORM,
        cluster_field_name=NE_MENTION_CLUSTER,
    )
    add_mention_cluster_field_fn = functools.partial(
        add_mention_cluster_field, **cluster_field_kwargs
    )
    # Augment before opening the file, so that a failure leaves no empty file
    new_training_set = augment_training_set(training_set, add_mention_cluster_field_fn)
    with graph_level_training_path.open("w") as f:
        write_training(new_training_set, f)

# Make the most of the already annotated data

Keep the training set, compute the new features

In [11]:
from dedupe import Dedupe
import functools
from neo4j_app.ml.graph_dedupe import (
    ConfigurableClassifierDedupe,
    person_fields,
    run_training,
)

# TODO: increase
training_sample_size = 50_000
target_recall = 0.8

# Keyword arguments forwarded to ConfigurableClassifierDedupe as clf_args
clf_args = dict(max_iter=100_000)

# Train on the graph-level records and training set written by the cells above
model = run_training(
    graph_level_records_path,
    dedupe_getter=functools.partial(ConfigurableClassifierDedupe, clf_args=clf_args),
    fields_getter=functools.partial(person_fields, inside_docs=False),
    excluded_path=excluded_set_path,
    model_path=graph_level_trained_model_path,
    training_path=graph_level_training_path,
    sample_size=training_sample_size,
    id_column=NE_MENTION_NORM,
    recall=target_recall,
)

reading training from file
Canopy: TfidfNGramCanopyPredicate: (0.8, mentionNorm)
Canopy: TfidfNGramCanopyPredicate: (0.2, mentionNorm)
Canopy: TfidfNGramCanopyPredicate: (0.6, mentionNorm)
Canopy: TfidfNGramCanopyPredicate: (0.4, mentionNorm)
Canopy: LevenshteinCanopyPredicate: (1, mentionNorm)
Canopy: LevenshteinCanopyPredicate: (2, mentionNorm)
Canopy: LevenshteinCanopyPredicate: (3, mentionNorm)
Canopy: LevenshteinCanopyPredicate: (4, mentionNorm)
Canopy: PartialIndexTfidfNGramCanopyPredicate: (0.2, mentionNorm, Surname)
Canopy: PartialIndexTfidfNGramCanopyPredicate: (0.6, mentionNorm, CorporationName)
Canopy: PartialIndexTfidfNGramCanopyPredicate: (0.2, mentionNorm, CorporationName)
Canopy: PartialIndexTfidfNGramCanopyPredicate: (0.4, mentionNorm, CorporationName)
Canopy: PartialIndexTfidfNGramCanopyPredicate: (0.8, mentionNorm, Surname)
Canopy: PartialIndexTfidfNGramCanopyPredicate: (0.6, mentionNorm, Surname)
Canopy: PartialIndexTfidfNGramCanopyPredicate: (0.4, mentionNorm, Surna