# Splink basic example

In [1]:
import pandas as pd
from splink import DuckDBAPI, Linker, SettingsCreator, block_on

# Hypothetical datasets
data1 = {
    'Name': ['Davide Smith', 'Bill Johnson', 'Charles Brown'],
    'DateOfBirth': ['1990-01-01', '1985-05-12', '1980-07-23'],
    'Gender': ['F', 'M', 'M'],
    'Address': ['123 Main St', '456 Elm St', '789 Oak St'],
    'Email': ['david@smith.net', 'billjohnson@smith.net', 'charles-brown@smith.net']
}

data2 = {
    'Name': ['David Smith', 'Robert Johnson', 'Chad Brade'],
    'DateOfBirth': ['1990-01-01', '1985-05-12', '1980-07-25'],
    'Gender': ['F', 'M', 'M'],
    'Address': ['123 Main Street', '456 Elm Street', '789 Oak Street'],
    'Email': ['david@smith.net', 'billyjohnson@smith.net', 'charles-brwon@smith.net']
}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Add unique ID and source_dataset columns
df1['unique_id'] = range(1, len(df1) + 1)
df1['source_dataset'] = 'df1'

df2['unique_id'] = range(1, len(df2) + 1)
df2['source_dataset'] = 'df2'

df_combined = pd.concat([df1, df2], ignore_index=True)

<jemalloc>: Out-of-range conf value: narenas:0


In [2]:
df1

Unnamed: 0,Name,DateOfBirth,Gender,Address,Email,unique_id,source_dataset
0,Davide Smith,1990-01-01,F,123 Main St,david@smith.net,1,df1
1,Bill Johnson,1985-05-12,M,456 Elm St,billjohnson@smith.net,2,df1
2,Charles Brown,1980-07-23,M,789 Oak St,charles-brown@smith.net,3,df1


In [3]:
df2

Unnamed: 0,Name,DateOfBirth,Gender,Address,Email,unique_id,source_dataset
0,David Smith,1990-01-01,F,123 Main Street,david@smith.net,1,df2
1,Robert Johnson,1985-05-12,M,456 Elm Street,billyjohnson@smith.net,2,df2
2,Chad Brade,1980-07-25,M,789 Oak Street,charles-brwon@smith.net,3,df2


In [4]:
db_api = DuckDBAPI(connection=":memory:")

In [5]:
import splink.comparison_library as cl

settings = SettingsCreator(
    link_type="link_only",
    blocking_rules_to_generate_predictions=[
        block_on("Name"),
        block_on("DateOfBirth"),
        block_on("Gender"),
        block_on("Address"),
        block_on("Email")
    ],
    comparisons=[
        cl.ForenameSurnameComparison("Name", "Gender"),
        cl.DateOfBirthComparison(
            "DateOfBirth",
            input_is_string=True,
        ),
        cl.EmailComparison("Email"),
    ],
    retain_intermediate_calculation_columns=True,
)

In [6]:
linker = Linker(df_combined, settings, db_api=db_api)


In [7]:
deterministic_rules = [
    "l.Name = r.Name and levenshtein(r.DateOfBirth, l.DateOfBirth) <= 1",
    "l.Email = r.Email",
]

linker.training.estimate_probability_two_random_records_match(
    deterministic_rules, recall=0.7
)

Probability two random records match is estimated to be  0.159.
This means that amongst all possible pairwise record comparisons, one in 6.30 are expected to match.  With 9 total possible comparisons, we expect a total of around 1.43 matching pairs


In [8]:
df_predictions = linker.inference.predict(threshold_match_probability=0.2)
df_predictions.as_pandas_dataframe(limit=5)

Blocking time: 0.01 seconds
Predict time: 0.28 seconds

You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'Name_Gender':
    m values not fully trained
Comparison: 'Name_Gender':
    u values not fully trained
Comparison: 'DateOfBirth':
    m values not fully trained
Comparison: 'DateOfBirth':
    u values not fully trained
Comparison: 'Email':
    m values not fully trained
Comparison: 'Email':
    u values not fully trained


Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,Name_l,Name_r,Gender_l,Gender_r,...,Email_l,Email_r,gamma_Email,tf_Email_l,tf_Email_r,bf_Email,bf_tf_adj_Email,Address_l,Address_r,match_key
0,10.50497,0.999312,df1,df2,1,1,Davide Smith,David Smith,F,F,...,david@smith.net,david@smith.net,4,0.333333,0.333333,1024.0,0.002783,123 Main St,123 Main Street,1
1,1.605413,0.752648,df1,df2,2,2,Bill Johnson,Robert Johnson,M,M,...,billjohnson@smith.net,billyjohnson@smith.net,2,0.166667,0.166667,1.259921,1.0,456 Elm St,456 Elm Street,1
