# Setup

In [1]:
from splink import (
    DuckDBAPI,
    block_on,
    splink_datasets
)

from splink.blocking_analysis import (
    count_comparisons_from_blocking_rule,
    n_largest_blocks
)

# Load Data

In [2]:
df = splink_datasets.fake_1000

# Counting Comparisons for Blocking Rules

In [3]:
db_api = DuckDBAPI()

In [4]:
def count_comparisons(blocking_rule):
    return count_comparisons_from_blocking_rule(
        table_or_tables=df,
        blocking_rule=blocking_rule,
        link_type="dedupe_only",
        db_api=db_api
    )

In [5]:
def find_largest_blocks(blocking_rule, n=3):
    return 

## Initial and Surname

In [6]:
br_fname_sname = block_on("substr(first_name, 1,1)", "surname")
count_comparisons(br_fname_sname)

{'number_of_comparisons_generated_pre_filter_conditions': 1632,
 'number_of_comparisons_to_be_scored_post_filter_conditions': 473,
 'filter_conditions_identified': '',
 'equi_join_conditions_identified': 'SUBSTRING(l.first_name, 1, 1) = SUBSTRING(r.first_name, 1, 1) AND l."surname" = r."surname"',
 'link_type_join_condition': 'where l."unique_id" < r."unique_id"'}

## Forename and Fuzzy Surname

In [7]:
br_fname_fuzz_sname = "l.first_name = r.first_name and levenshtein(l.surname, r.surname) < 2"
count_comparisons(br_fname_sname)

{'number_of_comparisons_generated_pre_filter_conditions': 1632,
 'number_of_comparisons_to_be_scored_post_filter_conditions': 473,
 'filter_conditions_identified': '',
 'equi_join_conditions_identified': 'SUBSTRING(l.first_name, 1, 1) = SUBSTRING(r.first_name, 1, 1) AND l."surname" = r."surname"',
 'link_type_join_condition': 'where l."unique_id" < r."unique_id"'}

# Worst Blocking Variables

In [8]:
br_city_fname = block_on("city", "first_name")
largest_city_fname = n_largest_blocks(
    table_or_tables=df,
    blocking_rule=br_city_fname,
    link_type="dedupe_only",
    db_api=db_api,
    n_largest=3
)

In [9]:
largest_city_fname.as_pandas_dataframe()

Unnamed: 0,key_0,key_1,count_l,count_r,block_count
0,Birmingham,Theodore,7,7,49
1,London,Oliver,7,7,49
2,London,James,6,6,36
