In [1]:
# uncomment and run if mismo is not installed
# %pip install -q git+https://github.com/NickCrews/mismo@main

In [1]:
from __future__ import annotations

import ibis
from ibis import _
from ibis.backends.duckdb import Backend as DuckDBBackend

import mismo

ibis.options.interactive = True

In [2]:
conn: DuckDBBackend = ibis.duckdb.connect()
clean = conn.read_parquet("../../data/alaska-addresses-clean.parquet")
dirty = conn.read_parquet("../../data/alaska-addresses-dirty.parquet")
print(clean.count().execute())
clean

315659


In [3]:
print(dirty.count().execute())
dirty

992329


In [4]:
def norm_string(s: ibis.ir.StringValue) -> ibis.ir.StringValue:
    return mismo.text.norm_whitespace(s).upper().nullif("")


normed_strings = ibis.selectors.across(ibis.selectors.of_type(str), norm_string)

clean = clean.mutate(normed_strings).distinct().cache()
dirty = dirty.mutate(normed_strings).cache()

In [7]:
exact_linker = mismo.KeyLinker(("city", "postcode", "street", "unit"))
exact_linker.pair_counts(clean, dirty).chart()

AttributeError: 'Table' object has no attribute '_KEY_COUNTS_SPEC'

In [10]:
many_linkages = [
    mismo.KeyLinker(("city", "street", "unit"))(clean, dirty),
    mismo.KeyLinker(("postcode", "street", "unit"))(clean, dirty),
]
linkage = mismo.linkage.intersect(many_linkages)
linkage

<mismo.linkage._combine.IntersectionLinkage at 0x1296f80b0>

In [11]:
linkage.links

In [12]:
linkage.__sublinkages__()

(Linkage<left=315_659, right=992_329, links=102>,
 Linkage<left=315_659, right=992_329, links=102>)

In [14]:
linkage2 = mismo.linkage.union(
    [linkage, mismo.KeyLinker(("city", _.street[:5], "unit"))(clean, dirty)]
)
linkage2

TypeError: ('received some non-Linkages: [<mismo.linkage._combine.IntersectionLinkage object at 0x1296f80b0>]', [<mismo.linkage._combine.IntersectionLinkage object at 0x1296f80b0>])

In [10]:
linkage2.__sublinkages__()

(<mismo.linkage._combine.AndJoinConditionsLinkage at 0x128866bd0>,
 KeyLinkage(
     keys=(city, _.street[slice(None, 5, None)], unit)
     nleft=315_659
     nright=992_329
     nlinks=3_417
 ))

In [None]:
linkage = exact_linker.linkage(clean, dirty)
linkage

KeyLinkage(
    keys=(city, postcode, street, unit)
    nleft=315_659
    nright=992_329
    nlinks=102
)

In [7]:
is_laurel = _.street.contains("4228 LAUREL")
dirty.filter(is_laurel), clean.filter(is_laurel)

(┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━┓
 ┃[1m [0m[1mrecord_id[0m[1m [0m┃[1m [0m[1mstate[0m[1m [0m[1m [0m┃[1m [0m[1mcity[0m[1m     [0m[1m [0m┃[1m [0m[1mpostcode[0m[1m [0m┃[1m [0m[1mstreet[0m[1m        [0m[1m [0m┃[1m [0m[1munit[0m[1m  [0m[1m [0m┃
 ┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━┩
 │ [2mint64[0m     │ [2mstring[0m │ [2mstring[0m    │ [2mstring[0m   │ [2mstring[0m         │ [2mstring[0m │
 ├───────────┼────────┼───────────┼──────────┼────────────────┼────────┤
 │    [1;36m354101[0m │ [32mAK    [0m │ [32mANCHORAGE[0m │ [32m99508   [0m │ [32m4228 LAUREL ST[0m │ [2mNULL[0m   │
 │    [1;36m354102[0m │ [32mAK    [0m │ [32mANCHORAGE[0m │ [32m99508   [0m │ [32m4228 LAUREL ST[0m │ [2mNULL[0m   │
 │    [1;36m354103[0m │ [32mAK    [0m │ [32mANCHORAGE[0m │ [32m99508   [0m │ [32m4228 LAUREL ST[0m │ [2mNULL[0m   │
 │    [1;36m354104[0m │ 

In [8]:
def value_trigrams(s: ibis.ir.StringValue) -> ibis.ir.ArrayColumn:
    return (
        mismo.text.tokenize(s).map(lambda token: mismo.text.ngrams(token, 3)).flatten()
    )


def trigrams(t: ibis.Table) -> ibis.ir.ArrayColumn:
    return (
        ibis.array(
            [
                value_trigrams(t.city),
                value_trigrams(t.postcode),
                value_trigrams(t.street),
            ]
        )
        .flatten()
        .unique()
        .name("trigrams")
    )


clean = clean.mutate(trigrams(clean)).cache()
dirty = dirty.mutate(trigrams(dirty)).cache()
clean

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [35]:
easy_linkage = mismo.KeyLinker("state", "city", "postcode", "street")
easy_linkage.pair_counts(clean, dirty)

In [16]:
clean.group_by(
    "state",
    "city",
    "postcode",
    "street",
).agg(
    n=_.count(),
    units=_.unit.collect(),
).order_by(_.n.desc())

In [10]:
def trigram_to_record_ids(t: ibis.Table) -> ibis.Table:
    counts_table = (
        t.select("record_id", t.trigrams.unnest().name("trigram"))
        .group_by("trigram")
        .agg(
            n_records=_.count(),
            record_ids=_.record_id.collect(),
        )
    )
    counts_table = counts_table.cache()
    return counts_table


trigrams_clean = trigram_to_record_ids(clean)
trigrams_dirty = trigram_to_record_ids(dirty)
trigrams_clean

In [19]:
def rare_trigrams(t: ibis.Table, threshold: int) -> ibis.Table:
    return t.filter(_.n_records < threshold)


trigrams_clean_rare = rare_trigrams(trigrams_clean, 100)
trigrams_dirty_rare = rare_trigrams(trigrams_dirty, 100)
trigrams_clean_rare

In [20]:
join_table = ibis.join(
    trigrams_clean_rare,
    trigrams_dirty_rare,
    how="inner",
    predicates="trigram",
    lname="{name}_l",
    rname="{name}_r",
)
join_table

In [26]:
links = join_table.select(
    "trigram",
    record_id_l=_.record_ids_l.unnest(),
    record_id_r=_.record_ids_r.unnest(),
)
print(links.count())
links

┌───────┐
│ [1;36m56920[0m │
└───────┘


In [97]:
trigram_linkage = mismo.linkage.LinkTableLinkage(clean, dirty, links)
trigram_linkage = trigram_linkage.cache()
trigram_linkage

LinkTableLinkage(
    left=315_659,
    right=992_329,
    links=56_920,
)

In [98]:
trigram_linkage.links.with_left().with_right()

TypeError: LinksTable._swap_perspective() takes 1 positional argument but 2 were given

In [91]:
clean.count(), trigram_linkage.left.count(), trigram_linkage.left.with_n_links().count()

(┌────────┐
 │ [1;36m315659[0m │
 └────────┘,
 ┌────────┐
 │ [1;36m315659[0m │
 └────────┘,
 ┌────────┐
 │ [1;36m315659[0m │
 └────────┘)

In [93]:
trigram_linkage.left.with_n_links()

In [94]:
trigram_linkage.left.with_n_links().n_links.topk(10)

In [89]:
trigram_linkage.left.count()

┌────────┐
│ [1;36m315659[0m │
└────────┘

In [96]:
trigram_linkage.left.link_counts()

28164


In [50]:
links_full = trigram_linkage.links.with_left().with_right()
links_full = links_full.select(sorted(links_full.columns))
links_full

In [46]:
links_full.mutate(jaccard=mismo.sets.jaccard(_.trigrams_l, _.trigrams_r)).order_by(
    _.jaccard.desc()
).filter(
    _.jaccard < 1,
    _.jaccard > 0.5,
)

In [43]:
linkage.link_counts_chart()

In [8]:
linkage.links.count()

┌────────┐
│ [1;36m429374[0m │
└────────┘