In [1]:
from __future__ import annotations

import ibis
from ibis import _
from ibis.expr.types import BooleanValue, Table

from mismo.block import Blocking, block_on_arrays, block
from mismo.datasets import load_patents

# Turn on ibis interactive mode for the whole notebook.
ibis.options.interactive = True

In [2]:
# Load the patents example dataset through mismo.datasets and display it.
patents = load_patents()
patents

In [3]:
from mismo.clean.strings import norm_whitespace  # noqa: E402


def clean_names(names):
    """Return an upper-cased, alphanumeric-only, whitespace-normalized name.

    Steps, in order: whitespace normalization via ``norm_whitespace``,
    upper-casing, removal of every character that is not a digit, an
    upper-case ASCII letter or a space, and a final whitespace normalization.
    """
    upper = norm_whitespace(names).upper()
    # Keep digits, A-Z and spaces only; everything else is dropped.
    alnum_only = upper.re_replace("[^0-9A-Z ]", "")
    # Dropping characters can change the spacing, so normalize once more.
    return norm_whitespace(alnum_only)


def bin_coordinate(coord):
    """Bucket a coordinate into 0.1-degree bins (roughly 6 miles wide).

    The result is the coordinate rounded to one decimal place, expressed as
    an int16 count of tenths of a degree:

    38.5323 -> 385
    38.4923 -> 385
    """
    nearest_tenth = coord.round(1)
    # Scale so that each bin is identified by a whole number.
    tenths = nearest_tenth * 10
    return tenths.cast("int16")


# Build the cleaned table in a single chain: first pick and normalize the raw
# columns, then derive extra feature columns from the normalized ones.
cleaned = patents.select(
    "record_id",
    "label_true",
    "name_true",
    "name",
    name_cleaned=clean_names(_.name),
    # A coordinate of exactly 0 becomes NULL.
    # NOTE(review): presumably 0 marks a missing coordinate in this dataset - confirm.
    latitude=_.latitude.nullif(0),
    longitude=_.longitude.nullif(0),
    # "**"-delimited strings become sorted arrays of upper-cased,
    # whitespace-normalized items.
    coauthors=_.coauthors.upper().split("**").map(norm_whitespace).sort(),
    classes=_.classes.upper().split("**").map(norm_whitespace).sort(),
).mutate(
    # Sorted tokens of the cleaned name, split on single spaces.
    name_tokens=_.name_cleaned.split(" ").map(norm_whitespace).sort(),
    # First three characters of the cleaned name.
    name_first3=_.name_cleaned[0:3],
    # Coordinates bucketed to tenths of a degree.
    latitude_binned=bin_coordinate(_.latitude),
    longitude_binned=bin_coordinate(_.longitude),
)
cleaned

In [4]:
def coords_match(left: Table, right: Table) -> list[BooleanValue]:
    """Build the blocking conditions for pairs whose binned coordinates agree.

    Parameters
    ----------
    left, right:
        The two tables being blocked. Each must have ``latitude_binned`` and
        ``longitude_binned`` columns.

    Returns
    -------
    list[BooleanValue]
        Three alternative conditions, in this order:

        1. both binned latitude and binned longitude are equal;
        2. binned latitudes are equal and both longitudes are NULL;
        3. both latitudes are NULL and binned longitudes are equal.

        NOTE(review): mismo's ``block`` appears to treat a list returned by a
        rule as alternatives that are OR-ed together - confirm against the
        mismo documentation.
    """
    lat_l, lat_r = left.latitude_binned, right.latitude_binned
    lon_l, lon_r = left.longitude_binned, right.longitude_binned
    # Each equality is used by two of the three conditions, so build it once.
    same_lat = lat_l == lat_r
    same_lon = lon_l == lon_r
    return [
        same_lat & same_lon,
        same_lat & lon_l.isnull() & lon_r.isnull(),
        lat_l.isnull() & lat_r.isnull() & same_lon,
    ]


# Blocking rules, one entry per rule:
#   1. coords_match: the binned-coordinate conditions
#   2. equality of the cleaned name on both sides
#   3. block_on_arrays over the `coauthors` and `classes` columns
rules = [
    coords_match,
    lambda left, right: left.name_cleaned == right.name_cleaned,
    block_on_arrays("coauthors", "classes"),
]


# Cache the cleaned table before it is used on both sides of the blocking.
cleaned = cleaned.cache()
left = cleaned
# The right side is a view of the same table, i.e. the table is blocked against itself.
right = cleaned.view()
blocking = block(left, right, rules)
# Cache the blocked pairs; later cells count them and compare them.
blocked = blocking.blocked.cache()
blocked

In [5]:
# Inspect the inner blockings that make up the combined blocking.
blocking.inners

(<mismo.block._blocking.OrBlocking at 0x11dd8c5d0>,
 <mismo.block._blocking.ConditionBlocking at 0x11dff8290>,
 <mismo.block._blocking.IdsBlocking at 0x11def2fd0>)

In [6]:
# Number of rows in the blocked table.
blocked.count()

[1;36m398527[0m

In [7]:
from mismo.compare import (  # noqa: E402
    Comparison,
    ComparisonLevel,
    exact_level,
    jaccard,
)
from mismo.compare.fs import FellegiSunterComparer  # noqa: E402

# Name comparison levels: exact match on `name`, then exact match on
# `name_first3` (the first three characters of the cleaned name).
# NOTE(review): the first level uses the raw `name` column rather than
# `name_cleaned` - confirm this is intended.
exact = exact_level("name")
almost_level = exact_level("name_first3")
name_comparison = Comparison(name="Name", levels=[exact, almost_level])

# Classes comparison levels: exact match on `classes`, then a Jaccard score of
# at least 0.5 between the left and right `classes` values.
classes_exact_match = exact_level("classes")
classes_jaccard_50 = ComparisonLevel(
    name="Classes 50% Jaccard",
    condition=lambda table: jaccard(table.classes_l, table.classes_r) >= 0.5,
)
classes_comparison = Comparison(
    name="Classes",
    levels=[classes_exact_match, classes_jaccard_50],
)

# No weights are supplied here; a trained comparer is derived from `fs` later.
fs = FellegiSunterComparer([name_comparison, classes_comparison], weights=None)
fs.comparisons

[Comparison(name=Name, levels=[ComparisonLevel(name=exact_name, description=Exact match on `name`), ComparisonLevel(name=exact_name_first3, description=Exact match on `name_first3`)]),
 Comparison(name=Classes, levels=[ComparisonLevel(name=exact_classes, description=Exact match on `classes`), ComparisonLevel(name=Classes 50% Jaccard)])]

In [8]:
# Derive a trained comparer from the two input tables, with a fixed seed.
fs_trained = fs.trained(left, right, max_pairs=10_000, seed=42)
# Override the trained prior with 3 / (number of records in `left`).
# NOTE(review): the constant 3 is unexplained - presumably an assumed number
# of matching records per record; confirm and consider naming it.
fs_trained.weights.prior = 3 / left.count().execute()
fs_trained

FellegiSunterComparer([Comparison(name=Name, levels=[ComparisonLevel(name=exact_name, description=Exact match on `name`), ComparisonLevel(name=exact_name_first3, description=Exact match on `name_first3`)]), Comparison(name=Classes, levels=[ComparisonLevel(name=exact_classes, description=Exact match on `classes`), ComparisonLevel(name=Classes 50% Jaccard)])], <mismo.compare.fs._base.Weights object at 0x11e030bd0>)

In [9]:
# Run the trained comparer over the blocked pairs.
comparisons = fs_trained.compare(blocked)
comparisons

Look at the different combinations of comparisons that we found, and see which ones
led to the best match, and which ones to the worst.

Unsurprisingly, the exact-match levels have the highest Bayes factor, and the
ELSE levels have the lowest. The other levels fall somewhere in between.

In [10]:
# One row per distinct (Name_cmp, Classes_cmp) combination, ordered by `bf`.
comparisons.distinct(on=["Name_cmp", "Classes_cmp"]).order_by("bf")

Let's be really picky and only take the most likely matches as true matches, and
then perform connected components to label each patent with its inventor:

In [11]:
# Keep only the pairs whose `bf` equals the maximum `bf` over all comparisons.
links = comparisons[_.bf == _.bf.max()]
links = links.cache()
links.count()

[1;36m2656[0m

In [12]:
from mismo.cluster import connected_components  # noqa: E402

# Run connected components over the links, then rename the resulting
# `component` column to `inventor_id`.
labels = connected_components(links)
# NOTE(review): newer ibis releases deprecate `relabel` in favor of `rename`
# (whose mapping is new-name -> old-name) - check against the pinned ibis version.
labels = labels.relabel({"component": "inventor_id"})
labels

In [13]:
# Left join on `record_id` so that every patent row is kept, labeled or not.
with_labels = patents.left_join(labels, "record_id")
with_labels

In [14]:
# Some records don't appear in the labeling because they didn't appear in the links.
# connected_components() should be updated so you can supply nodes as well,
# so that every node gets a component.
with_labels[_.inventor_id.isnull()]

In [15]:
# Show every link that involves record 317966 on either side.
links[(_.record_id_l == 317966) | (_.record_id_r == 317966)]