In [1]:
from __future__ import annotations

import ibis
from ibis import _
from ibis.expr.types import BooleanValue, Table

from mismo.examples import load_patents
from mismo.block import Blocking, block_on_arrays

# Turn on ibis interactive mode so that a table expression left as the last
# line of a cell is executed and rendered instead of shown as an unevaluated
# expression.
ibis.options.interactive = True

In [2]:
# Load the example patents table shipped in mismo.examples and display it.
patents = load_patents()
patents

In [3]:
from mismo.clean.strings import norm_whitespace


def clean_names(names):
    """Normalize a string column of names so equal names compare equal.

    Steps, in order: normalize whitespace with ``norm_whitespace``, uppercase,
    delete every character that is not ``0-9``, ``A-Z`` or a space, then
    normalize whitespace a second time.

    Parameters
    ----------
    names
        String expression exposing ``.upper()`` and ``.re_replace()``.

    Returns
    -------
    The cleaned string expression.
    """
    uppercased = norm_whitespace(names).upper()
    # Keep only digits, uppercase ASCII letters and spaces.
    alnum_only = uppercased.re_replace("[^0-9A-Z ]", "")
    # Deleting characters can leave stray spaces behind
    # (e.g. "A & B" -> "A  B"), so whitespace is normalized again.
    return norm_whitespace(alnum_only)


def hash_coord_to(coord):
    """Bucket a numeric coordinate into 0.1-wide bins encoded as ``int16``.

    The value is rounded to one decimal place, multiplied by 10 and cast to
    ``int16``, so inputs that round to the same tenth share a bucket.

    Parameters
    ----------
    coord
        Numeric expression exposing ``.round()``, ``*`` and ``.cast()``.

    Returns
    -------
    The bucketed coordinate as an ``int16`` expression.
    """
    rounded = coord.round(1)
    tenths = rounded * 10
    return tenths.cast("int16")


# Build the cleaned table in a single chained expression: first pick and
# normalize the columns used for blocking, then derive the token and
# coordinate-bucket columns from the normalized ones.
cleaned = patents.select(
    "record_id",
    "label_true",
    "name_true",
    "name",
    name_cleaned=clean_names(_.name),
    # A coordinate of exactly 0 is replaced with NULL.
    latitude=_.latitude.nullif(0),
    longitude=_.longitude.nullif(0),
    # Uppercase, split on the "**" separator, then whitespace-normalize
    # each element of the resulting array.
    coauthors=_.coauthors.upper().split("**").map(norm_whitespace),
    classes=_.classes.upper().split("**").map(norm_whitespace),
).mutate(
    name_tokens=_.name_cleaned.split(" ").map(norm_whitespace),
    # These read the latitude/longitude produced by the select above,
    # i.e. after the 0 -> NULL replacement.
    latitude_hash=hash_coord_to(_.latitude),
    longitude_hash=hash_coord_to(_.longitude),
)
cleaned

In [4]:
def name_rule(left: Table, right: Table) -> BooleanValue:
    """Blocking condition: the two records have the same ``name_cleaned``.

    Parameters
    ----------
    left, right
        The two tables being paired; each must have a ``name_cleaned`` column.

    Returns
    -------
    The result of comparing ``left.name_cleaned`` with ``right.name_cleaned``
    for equality.
    """
    left_name = left.name_cleaned
    right_name = right.name_cleaned
    return left_name == right_name


def coord_rules(left: Table, right: Table) -> list[BooleanValue]:
    """Blocking conditions based on the bucketed coordinates.

    Three alternative conditions are returned as a list (not a single
    boolean):

    1. both ``latitude_hash`` and ``longitude_hash`` are equal;
    2. ``latitude_hash`` is equal and ``longitude_hash`` is null on both sides;
    3. ``longitude_hash`` is equal and ``latitude_hash`` is null on both sides.

    Conditions 2 and 3 exist because an equality test against a null value
    does not evaluate to true, so condition 1 alone would never pair records
    that are missing one coordinate.

    Parameters
    ----------
    left, right
        The two tables being paired; each must have ``latitude_hash`` and
        ``longitude_hash`` columns.

    Returns
    -------
    A list with the three conditions, in the order given above.
    """
    same_lat = left.latitude_hash == right.latitude_hash
    same_lon = left.longitude_hash == right.longitude_hash
    return [
        same_lat & same_lon,
        same_lat & left.longitude_hash.isnull() & right.longitude_hash.isnull(),
        left.latitude_hash.isnull() & right.latitude_hash.isnull() & same_lon,
    ]


# Blocking rules handed to Blocking below: the two rule functions defined in
# this notebook plus two array-based rules built with block_on_arrays.
rules = [
    name_rule,
    coord_rules,
    block_on_arrays("name_tokens", "name_tokens"),
    # NOTE(review): this pairs "coauthors" with "classes", whereas the rule
    # above pairs a column with itself. If the two arguments are the left and
    # right column names, this compares coauthors values against classes
    # values -- confirm that is intended rather than one rule per column.
    block_on_arrays("coauthors", "classes"),
]


# NOTE: `cleaned` is rebound to its cached version here, so re-running this
# cell caches the already-cached table again.
cleaned = cleaned.cache()
left = cleaned
# The same table is used on both sides; the right side is taken as a view,
# presumably so the two sides can be told apart when pairing -- TODO confirm.
right = cleaned.view()
blocking = Blocking(left, right, rules)
# Cache the blocked result and display it.
blocked = blocking.blocked.cache()
blocked

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [5]:
# Row count of the blocked table (presumably one row per candidate record
# pair produced by the rules -- confirm against the Blocking API).
blocked.count()

2337943