In [3]:
#!pip install splink


In [1]:
import duckdb
from tqdm.auto import tqdm 
import pandas as pd

connection_source = duckdb.connect(database="../dbt/database_name.duckdb", read_only = True)

In [None]:
# Filtrer les propriétaires ayant un logements vacant dans la prod (lovac 2024)
# Logement est le lien entre un propriétaire national et départemental 
# Aller chercher les logements via les ID locaux dans les fichiers fonciers
# Aller chercher les logements via les ID locaux dans la prod 

In [20]:
# Queries pour charger les DataFrames
query_prod_owners = """ 
SELECT
    CAST(id AS VARCHAR) AS unique_id,
    full_name AS owner_fullname,
    birth_date AS owner_birth_date,
    list_aggregate(address_dgfip, 'string_agg', ' ') as owner_address,
    kind_class AS owner_category_detail,
    CAST(idpersonne AS VARCHAR) AS owner_idpersonne,
    city AS owner_city,
    postal_code as owner_postal_code
FROM main_stg.owners
JOIN main_stg.ban_addresses ba
JOIN main_stg.stg_production_owners_housing
JOIN main_stg.stg_production_housing ON stg_production_owners_housing.housing_id = stg_production_housing.id
    WHERE occupancy_source = 'V' OR occupancy = 'V'
ON ba.address_kind = 'Owner' AND ba.ref_id = owners.id;
"""
query_ff_owners = """SELECT 
    CAST(ff_owner_idpersonne AS VARCHAR) AS unique_id,
    CAST(ff_owner_idpersonne AS VARCHAR) AS owner_idpersonne, 
    ff_owner_address_1  || ' ' || ff_owner_address_2 || ' ' || ff_owner_address_3 || ' ' || ff_owner_address_4 AS owner_address,
    ff_owner_postal_code AS owner_postal_code, 
    ff_owner_birth_date AS owner_birth_date, 
    ff_owner_lastname AS owner_lastname, 
    ff_owner_firstname AS owner_firstname, 
    ff_owner_fullname AS owner_fullname, 
    ff_owner_category_text AS owner_category_detail,
    ff_owner_city AS owner_city
FROM main_int.int_ff_owners_dedup;
"""

df_ff_owners = connection_source.execute(query_ff_owners).fetchdf()
df_prod_owners = connection_source.execute(query_prod_owners).fetchdf()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [21]:
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on

# Connexion à DuckDB
db_api = DuckDBAPI()

# Création des settings Splink
settings = SettingsCreator(
    link_type="link_only",  # Comparaison entre deux datasets
    comparisons=[
        cl.JaroWinklerAtThresholds(
            "owner_fullname", 
            [0.9, 0.8], 
        ),
        cl.DateOfBirthComparison(
            "owner_birth_date", 
            input_is_string=False, 
            datetime_metrics=["year", "month"], 
            datetime_thresholds=[1, 2],
        ),
        cl.LevenshteinAtThresholds("owner_address"),
        cl.ExactMatch("owner_postal_code"),
        cl.ExactMatch("owner_city"),
        cl.ExactMatch("owner_idpersonne"),
        cl.ExactMatch("owner_category_detail"),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("owner_postal_code"),
        block_on("owner_city"),
        block_on("owner_category_detail"),
    ],
    retain_intermediate_calculation_columns=True,
)

# Instanciation du linker
linker = Linker(
    input_table_or_tables= [df_prod_owners, df_ff_owners],
    settings=settings,
    db_api=db_api
)

# Estimation de la probabilité de correspondance aléatoire
linker.training.estimate_probability_two_random_records_match(
    deterministic_matching_rules=[
        block_on("owner_postal_code","owner_birth_date"),
    ],
    recall=0.8,
)

Probability two random records match is estimated to be  3.43e-09.
This means that amongst all possible pairwise record comparisons, one in 291,705,781.58 are expected to match.  With 74,165,830,334,910 total possible comparisons, we expect a total of around 254,248.75 matching pairs


In [22]:
pairwise_predictions = linker.inference.predict(threshold_match_weight=-5)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

SplinkException: Error executing the following sql for table `__splink__blocked_id_pairs`(__splink__blocked_id_pairs_004cfdde1):
CREATE TABLE __splink__blocked_id_pairs_004cfdde1 AS
WITH __splink__df_concat_with_tf AS (
  SELECT
    *
  FROM __splink__df_concat_with_tf_439d0e2ff
), __splink__df_concat_with_tf_left AS (
  SELECT
    *
  FROM __splink__df_concat_with_tf
  WHERE
    source_dataset = (
      SELECT
        MIN(source_dataset)
      FROM __splink__df_concat_with_tf
    )
), __splink__df_concat_with_tf_right AS (
  SELECT
    *
  FROM __splink__df_concat_with_tf
  WHERE
    source_dataset = (
      SELECT
        MAX(source_dataset)
      FROM __splink__df_concat_with_tf
    )
)
SELECT
  '0' AS match_key,
  l."source_dataset" || '-__-' || l."unique_id" AS join_key_l,
  r."source_dataset" || '-__-' || r."unique_id" AS join_key_r
FROM __splink__df_concat_with_tf_left AS l
INNER JOIN __splink__df_concat_with_tf_right AS r
  ON (
    l."owner_postal_code" = r."owner_postal_code"
  )
WHERE
  1 = 1
UNION ALL
SELECT
  '1' AS match_key,
  l."source_dataset" || '-__-' || l."unique_id" AS join_key_l,
  r."source_dataset" || '-__-' || r."unique_id" AS join_key_r
FROM __splink__df_concat_with_tf_left AS l
INNER JOIN __splink__df_concat_with_tf_right AS r
  ON (
    l."owner_city" = r."owner_city"
  )
WHERE
  1 = 1
  AND NOT (
    COALESCE((
      l."owner_postal_code" = r."owner_postal_code"
    ), FALSE)
  )
UNION ALL
SELECT
  '2' AS match_key,
  l."source_dataset" || '-__-' || l."unique_id" AS join_key_l,
  r."source_dataset" || '-__-' || r."unique_id" AS join_key_r
FROM __splink__df_concat_with_tf_left AS l
INNER JOIN __splink__df_concat_with_tf_right AS r
  ON (
    l."owner_category_detail" = r."owner_category_detail"
  )
WHERE
  1 = 1
  AND NOT (
    COALESCE((
      l."owner_postal_code" = r."owner_postal_code"
    ), FALSE)
    OR COALESCE((
      l."owner_city" = r."owner_city"
    ), FALSE)
  )

Error was: Out of Memory Error: failed to offload data block of size 256.0 KiB (293.6 GiB/293.6 GiB used).
This limit was set by the 'max_temp_directory_size' setting.
By default, this setting utilizes the available disk space on the drive where the 'temp_directory' is located.
You can adjust this setting, by using (for example) PRAGMA max_temp_directory_size='10GiB'

In [17]:
for col in df_ff_owners.columns:
    try:
        if df_ff_owners[col].str.contains("76MBK6P6").any():
            print(col)
    except Exception as e:
        print(e)

        print(col)
        pass

unique_id
owner_idpersonne
Can only use .str accessor with string values!
owner_birth_date


In [19]:
df_ff_owners.owner_idpersonne

0          14MCFH2S
1          14MCKPQW
2          14MCGRC9
3          14MCGK4P
4          14MCKRNN
             ...   
4195449    97MBC2KQ
4195450    97MBDFQS
4195451    97MBC3R5
4195452    97MBC42N
4195453    97MBDXTH
Name: owner_idpersonne, Length: 4195454, dtype: object

In [18]:

df_ff_owners.query("unique_id == '76MBK6P6'")

Unnamed: 0,unique_id,owner_idpersonne,owner_address,owner_postal_code,owner_birth_date,owner_lastname,owner_firstname,owner_fullname,owner_category_detail,owner_city
614400,76MBK6P6,76MBK6P6,,76290,1973-03-07,CAUVIGNY,BENOIT,CAUVIGNY/BENOIT PASCAL JEROME,Particulier,76290 MONTIVILLIERS


In [None]:
df_prod_owners.dtypes

unique_id                                      object
owner_fullname                                 object
owner_birth_date         datetime64[us, Europe/Paris]
owner_address                                  object
owner_category_detail                          object
owner_idpersonne                               object
owner_city                                     object
owner_postal_code                              object
dtype: object

In [12]:
≈.dtypes

unique_id                        object
owner_idpersonne                 object
owner_address                    object
owner_postal_code                object
owner_birth_date         datetime64[us]
owner_lastname                   object
owner_firstname                  object
owner_fullname                   object
owner_category_detail            object
owner_city                       object
dtype: object

In [9]:
df_ff_owners.columns

Index(['ff_owner_idpersonne', 'ff_owner_idprodroit', 'ff_owner_idprocpte',
       'ff_owner_address_1', 'ff_owner_address_2', 'ff_owner_address_3',
       'ff_owner_address_4', 'ff_owner_postal_code', 'ff_owner_birth_date',
       'ff_owner_lastname', 'ff_owner_firstname', 'ff_owner_city',
       'ff_owner_fullname', 'ff_owner_category', 'ff_owner_category_text'],
      dtype='object')

In [10]:
df_ff_owners.head()

Unnamed: 0,ff_owner_idpersonne,ff_owner_idprodroit,ff_owner_idprocpte,ff_owner_address_1,ff_owner_address_2,ff_owner_address_3,ff_owner_address_4,ff_owner_postal_code,ff_owner_birth_date,ff_owner_lastname,ff_owner_firstname,ff_owner_city,ff_owner_fullname,ff_owner_category,ff_owner_category_text
0,11PBBJN2,[11069+0815602],[11069+08156],CITE ADMINIST BT E,0000 BD ARMAND DUPORTAL,,31000 TOULOUSE,31000,NaT,,,31000 TOULOUSE,D R DES SERVICES PENITENTIAIRES,P1b,Etat et collectivité territoriale
1,11PBDTG2,[11069+0820001],[11069+08200],,0067 RUE DE VERDUN,,11000 CARCASSONNE,11000,NaT,,,11000 CARCASSONNE,SCI DES JARDINS DE CERES,G1a,"SCI, Copropriété, Autres personnes morales"
2,11PBDTNZ,[11069+0821801],[11069+08218],,0024 BD JEAN JAURES,,11620 VILLEMOUSTAUSSOU,11620,NaT,,,11620 VILLEMOUSTAUSSOU,ARTEAEDIFICANDI,F7b,"Promoteur, Investisseur privé"
3,11PBDVLB,[11069+0829601],[11069+08296],,0008 RUE DU TRENCAVEL,,11600 MALVES EN MINERVOIS,11600,NaT,,,11600 MALVES EN MINERVOIS,AQUILA SUD IMMO,G1a,"SCI, Copropriété, Autres personnes morales"
4,11PBDVL9,[11069+0830001],[11069+08300],,0000 DOM DU SIESTOU,,11800 LAURE MINERVOIS,11800,NaT,,,11800 LAURE MINERVOIS,SCI DU MOULIN,G1a,"SCI, Copropriété, Autres personnes morales"
