# Setup

In [11]:
from splink import (
  DuckDBAPI,
  block_on,
  splink_datasets
)

from splink.blocking_analysis import (
  count_comparisons_from_blocking_rule,
  n_largest_blocks
)

# Load Data

In [2]:
# Input records for the blocking analysis: the `fake_1000` dataset exposed by
# `splink_datasets` (presumably ~1,000 synthetic person records - TODO confirm).
df = splink_datasets.fake_1000

# Counting Comparisons for Blocking Rules

In [3]:
# Backend handle passed as `db_api` to every blocking-analysis call below.
db_api = DuckDBAPI()

## Initial and Surname

In [4]:
# Blocking rule: records are compared when they share the first character of
# `first_name` and have the same `surname`.
br_initial_surname = block_on(
    "substr(first_name, 1,1)",
    "surname",
)

In [5]:
# Count the record comparisons the initial + surname rule generates when
# de-duplicating `df` (link_type="dedupe_only").
counts_initial_surname = count_comparisons_from_blocking_rule(
    db_api=db_api,
    link_type="dedupe_only",
    table_or_tables=df,
    blocking_rule=br_initial_surname,
)

In [6]:
# Show the comparison counts and the join/filter conditions identified for the rule.
counts_initial_surname

{'number_of_comparisons_generated_pre_filter_conditions': 1632,
 'number_of_comparisons_to_be_scored_post_filter_conditions': 473,
 'filter_conditions_identified': '',
 'equi_join_conditions_identified': 'SUBSTRING(l.first_name, 1, 1) = SUBSTRING(r.first_name, 1, 1) AND l."surname" = r."surname"',
 'link_type_join_condition': 'where l."unique_id" < r."unique_id"'}

### Worst Offending Values

In [None]:
# Find the blocking-key values that create the largest blocks for the
# initial + surname rule analysed in this section. Reuse `br_initial_surname`
# so these "worst offenders" correspond to the comparison counts reported
# for the same rule, rather than to an unrelated rule.
result = n_largest_blocks(
  table_or_tables=df,
  blocking_rule=br_initial_surname,
  link_type="dedupe_only",
  db_api=db_api,
  n_largest=3
)

# n_largest_blocks returns a SplinkDataFrame; convert it to pandas as the
# cell's last expression so the largest blocks are rendered as a table.
result.as_pandas_dataframe()

## Exact Forename and Fuzzy Surname

In [7]:
# Blocking rule written as raw SQL: exact match on first_name combined with a
# Levenshtein distance below 2 between the two surnames.
br_forename_fuzz_surname = (
    "l.first_name = r.first_name "
    "and levenshtein(l.surname, r.surname) < 2"
)

In [8]:
# Count the record comparisons the exact-forename / fuzzy-surname rule
# generates when de-duplicating `df` (link_type="dedupe_only").
counts_forename_fuzz_surname = count_comparisons_from_blocking_rule(
    db_api=db_api,
    link_type="dedupe_only",
    table_or_tables=df,
    blocking_rule=br_forename_fuzz_surname,
)


In [9]:
# Show the comparison counts and the join/filter conditions identified for the rule.
counts_forename_fuzz_surname

{'number_of_comparisons_generated_pre_filter_conditions': 4827,
 'number_of_comparisons_to_be_scored_post_filter_conditions': 372,
 'filter_conditions_identified': 'LEVENSHTEIN(l.surname, r.surname) < 2',
 'equi_join_conditions_identified': 'l.first_name = r.first_name',
 'link_type_join_condition': 'where l."unique_id" < r."unique_id"'}

### Worst Offending Values

In [None]:
# TODO: use n_largest_blocks to list the largest blocks produced by
# br_forename_fuzz_surname (the worst offending values for this rule).