In [2]:
from __future__ import annotations

import ibis
from ibis import _
from ibis.expr.types import BooleanValue, Table

from mismo.examples import load_patents
from mismo.block import block
from mismo import DedupeDatasetPair

# Interactive mode: table expressions left as a cell's last line (such as
# `raw` or `cleaned` below) are executed and shown as data previews instead of
# as unevaluated expressions.
ibis.options.interactive = True

In [3]:
# Load the patents example dataset shipped in mismo.examples and keep a handle
# on its `.table` attribute, which is the input to all cleaning below.
patents = load_patents()
raw = patents.table
# Last expression of the cell: display the raw table.
raw

In [4]:
from mismo.clean.strings import norm_whitespace


def clean_names(names):
    """Return a normalized version of a name string expression.

    The steps, in order: normalize whitespace, upper-case, delete every
    character that is not a digit, a capital A-Z letter, or a space, and
    finally normalize whitespace once more.

    Parameters
    ----------
    names
        A string expression that supports ``.upper()`` and ``.re_replace()``
        and can be passed to ``norm_whitespace``.

    Returns
    -------
    The cleaned string expression.
    """
    uppercased = norm_whitespace(names).upper()
    # Anything outside digits, capital letters, and spaces is dropped.
    alnum_only = uppercased.re_replace("[^0-9A-Z ]", "")
    # Dropping characters can leave extra spaces behind (e.g. "A & B" becomes
    # "A  B"), so the whitespace fixup runs a second time.
    return norm_whitespace(alnum_only)


def hash_coord_to(coord):
    """Encode a coordinate expression as an integer number of tenths.

    The value is rounded to one decimal place, multiplied by 10, and cast to
    ``int16`` (for example, 12.34 becomes 123), so that nearby coordinates
    share the same integer key.

    Parameters
    ----------
    coord
        A numeric expression supporting ``.round()``, multiplication, and
        ``.cast()``.

    Returns
    -------
    The ``int16``-cast expression.
    """
    tenths = coord.round(1) * 10
    return tenths.cast("int16")


# Build the cleaned table from `raw` in a single chained expression:
#   1. select: keep the identifier/label/name columns and derive cleaned
#      versions of the name, coordinates, coauthors, and classes;
#   2. mutate: add columns derived from the output of step 1.
cleaned = raw.select(
    "record_id",
    "label_true",
    "name_true",
    "name",
    name_cleaned=clean_names(_.name),
    # nullif(0) turns a coordinate of exactly 0 into NULL
    # (presumably 0 is a placeholder for "missing" in this dataset — confirm).
    latitude=_.latitude.nullif(0),
    longitude=_.longitude.nullif(0),
    # coauthors and classes are "**"-delimited strings; upper-case them, split
    # into arrays, and normalize whitespace within each element.
    coauthors=_.coauthors.upper().split("**").map(norm_whitespace),
    classes=_.classes.upper().split("**").map(norm_whitespace),
).mutate(
    # These refer to name_cleaned / latitude / longitude as produced by the
    # select above, not to the raw columns.
    name_tokens=_.name_cleaned.split(" ").map(norm_whitespace),
    latitude_hash=hash_coord_to(_.latitude),
    longitude_hash=hash_coord_to(_.longitude),
)
# Last expression of the cell: display the cleaned table.
cleaned

In [10]:
def blocker(left: Table, right: Table) -> BooleanValue:
    """Blocking rule: pair two records when their cleaned names are equal.

    Parameters
    ----------
    left, right
        The two tables being compared; each must have a ``name_cleaned``
        column.

    Returns
    -------
    A single boolean expression, ``left.name_cleaned == right.name_cleaned``.
    Only name equality is used; ``latitude_hash`` is not part of the rule.
    """
    # Return the one condition directly: the function has always produced a
    # single boolean expression, not a list of rules.
    return left.name_cleaned == right.name_cleaned


# Wrap the cleaned table in a DedupeDatasetPair and run the `blocker` rule
# over it with mismo's `block`.
cleaned_dataset = DedupeDatasetPair(cleaned)
blocking = block(cleaned_dataset, blocker)
# Last expression of the cell: display the blocked data.
blocking.blocked_data

In [7]:
# Print the SQL that ibis compiles for the blocked data.
# NOTE(review): the output recorded below this cell is a CompileError
# ("Multiple, unrelated CTEs found with the same name: 't9'"), so in the
# environment that produced this notebook the cell raises and would stop a
# Restart & Run All here.
ibis.show_sql(blocking.blocked_data)

CompileError: Multiple, unrelated CTEs found with the same name: 't9'

In [7]:
# Size of the blocking result (presumably the number of candidate record
# pairs that satisfied the blocking rule — confirm against mismo's API).
len(blocking)

212511