In [1]:
from __future__ import annotations

import ibis
from ibis import _
from ibis.expr.types import BooleanValue, Table

from mismo.block import Blocking, block_on_arrays
from mismo.examples import load_patents

ibis.options.interactive = True

In [2]:
patents = load_patents()
patents

In [3]:
from mismo.clean.strings import norm_whitespace


def clean_names(names):
    names = norm_whitespace(names)
    names = names.upper()
    # Only want to keep letters, numbers, and spaces
    names = names.re_replace("[^0-9A-Z ]", "")
    # Now have to do whitespace fixup again
    names = norm_whitespace(names)
    return names


def bin_coordinate(coord):
    return (coord.round(1) * 10).cast("int16")


cleaned = patents
cleaned = cleaned.select(
    "record_id",
    "label_true",
    "name_true",
    "name",
    name_cleaned=clean_names(_.name),
    latitude=_.latitude.nullif(0),
    longitude=_.longitude.nullif(0),
    coauthors=_.coauthors.upper().split("**").map(norm_whitespace).sort(),
    classes=_.classes.upper().split("**").map(norm_whitespace).sort(),
)
cleaned = cleaned.mutate(
    name_tokens=_.name_cleaned.split(" ").map(norm_whitespace).sort(),
    name_first3=_.name_cleaned[0:3],
    latitude_binned=bin_coordinate(_.latitude),
    longitude_binned=bin_coordinate(_.longitude),
)
cleaned

In [4]:
def name_rule(left: Table, right: Table) -> BooleanValue:
    return left.name_cleaned == right.name_cleaned


def coord_rules(left: Table, right: Table) -> BooleanValue:
    return [
        (
            (left.latitude_binned == right.latitude_binned)
            & (left.longitude_binned == right.longitude_binned)
        ),
        (
            (left.latitude_binned == right.latitude_binned)
            & left.longitude_binned.isnull()
            & right.longitude_binned.isnull()
        ),
        (
            left.latitude_binned.isnull()
            & right.latitude_binned.isnull()
            & (left.longitude_binned == right.longitude_binned)
        ),
    ]


rules = [
    name_rule,
    coord_rules,
    block_on_arrays("name_tokens", "name_tokens"),
    block_on_arrays("coauthors", "classes"),
]


cleaned = cleaned.cache()
left = cleaned
right = cleaned.view()
blocking = Blocking(left, right, rules)
blocked = blocking.blocked.cache()
blocked

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [5]:
blocked.count()

[1;36m2337943[0m

In [6]:
from mismo.compare import Comparison, ComparisonLevel, array_overlap_norm, exact_level
from mismo.compare.fs import FellegiSunterComparer

exact = exact_level("name")
almost_level = exact_level("name_first3")
name_comparison = Comparison(name="Name", levels=[exact, almost_level])

classes_exact_match = exact_level("classes")
classes_50_percent_overlap = ComparisonLevel(
    name="Classes 50% overlap",
    condition=lambda table: array_overlap_norm(table["classes_l"], table["classes_r"])
    >= 0.5,
)
classes_comparison = Comparison(
    name="Classes",
    levels=[classes_exact_match, classes_50_percent_overlap],
)

guess_n_matches_per_record = 3
prior = guess_n_matches_per_record / patents.count().execute()
fs = FellegiSunterComparer([name_comparison, classes_comparison], prior=prior)
fs.comparisons

[FSComparison(comparison=Comparison(name='Name', levels=[ComparisonLevel(name='exact_name', condition=<function exact.<locals>.equals at 0x1326c28e0>, description='Exact match on `name`'), ComparisonLevel(name='exact_name_first3', condition=<function exact.<locals>.equals at 0x10a849440>, description='Exact match on `name_first3`')], description=None), weights=None),
 FSComparison(comparison=Comparison(name='Classes', levels=[ComparisonLevel(name='exact_classes', condition=<function exact.<locals>.equals at 0x1332480e0>, description='Exact match on `classes`'), ComparisonLevel(name='Classes 50% overlap', condition=<function <lambda> at 0x13324b380>, description='Class 50% overlap')], description=None), weights=None)]

In [7]:
# fs_trained = fs.trained(left, right, seed=42)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [8]:
# comparisons = fs_trained.compare(blocked)
# comparisons

In [10]:
# comparison_vectors = comparisons.distinct(on=["Name_cmp", "Classes_cmp"])
# comparison_vectors = comparison_vectors.cache()
# comparison_vectors

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [11]:
# comparison_vectors.order_by("bf")