In [13]:
import duckdb
from tqdm.auto import tqdm 
import pandas as pd

from splink import DuckDBAPI, Linker, SettingsCreator, block_on
import splink.comparison_library as cl
import splink.comparison_level_library as cll
from splink.exploratory import completeness_chart, profile_columns
connection_source = duckdb.connect(database="../../dbt/database_name.duckdb")

In [14]:
histo_path = "/Users/raphaelcourivaud/Downloads/5000-signalement.csv"

df_histo = pd.read_csv(histo_path)[["ban_id_occupant", "nb_pieces_logement",
           "superficie", "prenom_proprio", "nom_proprio", 
           "code_postal_proprio", "ville_proprio", "adresse_proprio"
           ]]

In [15]:
# Création d'un ID unique pour la base externe
df_histo['unique_id'] = df_histo.index + 1000000

# Renommage de ban_id_occupant pour l'alignement
df_histo = df_histo.rename(columns={'ban_id_occupant': 'ban_id'})

# Conversion des types pour la base externe
df_histo['nb_pieces_logement'] = pd.to_numeric(df_histo['nb_pieces_logement'], errors='coerce')
df_histo['superficie'] = pd.to_numeric(df_histo['superficie'], errors='coerce')
df_histo['code_postal_proprio'] = df_histo['code_postal_proprio'].astype(str)


In [16]:
query_prepare_housing = """
CREATE OR REPLACE TABLE housing_prepared AS
SELECT 
    local_id AS unique_id,
    ban_id_housing AS ban_id,
    full_name,
    CAST(rooms_count AS INTEGER) AS nb_pieces_logement,
    CAST(living_area AS FLOAT) AS superficie,
    CASE 
        WHEN full_name IS NOT NULL THEN SPLIT_PART(full_name, ' ', 1) 
        ELSE NULL 
    END AS prenom_proprio,
    CASE 
        WHEN full_name IS NOT NULL AND LENGTH(full_name) - LENGTH(REPLACE(full_name, ' ', '')) > 0 
        THEN SUBSTRING(full_name FROM POSITION(' ' IN full_name) + 1) 
        ELSE full_name 
    END AS nom_proprio,
    owner_postal_code AS code_postal_proprio,
    CASE 
        WHEN address_dgfip IS NOT NULL THEN array_to_string(address_dgfip, ' ')
        ELSE NULL
    END AS adresse_proprio,
    CASE 
        WHEN address_dgfip IS NOT NULL THEN 
            (CASE 
                WHEN array_length(address_dgfip) > 0 AND REGEXP_MATCHES(address_dgfip[array_length(address_dgfip)-1], '([0-9]{5})') 
                THEN REGEXP_EXTRACT(address_dgfip[array_length(address_dgfip)-1], '([0-9]{5})')
                ELSE NULL 
            END)
        ELSE NULL
    END AS ville_proprio
FROM explore.housing_histolog
WHERE
    owner_postal_code IS NOT NULL AND housing_postal_code IS NOT NULL
"""

In [9]:
import uuid 
df_histo["unique_id"] = df_histo.index + 10000000
df_histo["full_name"] = df_histo["prenom_proprio"].fillna("") + " " + df_histo["nom_proprio"].fillna("")

In [10]:
connection_source.execute("CREATE OR REPLACE TABLE histo_data AS SELECT * FROM df_histo")
connection_source.execute(query_prepare_housing)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x10409f930>

In [11]:
df_histo_to_link = connection_source.execute("SELECT * FROM histo_data").fetchdf()
df_housing_to_link = connection_source.execute("""
                                                SELECT housing_prepared.*
                                                FROM housing_prepared
                                                JOIN histo_data ON housing_prepared.ban_id = histo_data.ban_id
                                                """).fetchdf()

In [56]:
query_v2 = """

WITH matching_scores AS (
  SELECT 
    h.unique_id AS unique_id_housing,
    h.ban_id,
    d.unique_id AS unique_id_histo,
    
    -- Score pour superficie (max 10 points)
    CASE
      WHEN h.superficie IS NULL OR d.superficie IS NULL THEN 0
      WHEN h.superficie = d.superficie THEN 10
      WHEN h.superficie = 0 AND d.superficie = 0 THEN 10
      WHEN GREATEST(h.superficie, d.superficie) = 0 THEN 0  -- Éviter division par zéro
      WHEN ABS(h.superficie - d.superficie) / GREATEST(h.superficie, d.superficie) < 0.01 THEN 9
      WHEN ABS(h.superficie - d.superficie) / GREATEST(h.superficie, d.superficie) < 0.03 THEN 8
      WHEN ABS(h.superficie - d.superficie) / GREATEST(h.superficie, d.superficie) < 0.1 THEN 6
      WHEN ABS(h.superficie - d.superficie) / GREATEST(h.superficie, d.superficie) < 0.3 THEN 3
      ELSE 1
    END AS superficie_score,
    
    -- Score pour nom complet (max 10 points) - Version améliorée
    CASE
      WHEN h.full_name IS NULL OR d.full_name IS NULL THEN 0
      -- Correspondance exacte
      WHEN h.full_name = d.full_name THEN 10
      -- Normaliser en supprimant les préfixes courants
      WHEN trim(replace(replace(replace(lower(h.full_name), 'm ', ''), 'mme ', ''), 'sci ', '')) = 
           trim(replace(replace(replace(lower(d.full_name), 'm ', ''), 'mme ', ''), 'sci ', '')) THEN 9
      -- Cas spécifique pour l'inversion nom/prénom
      WHEN (
          -- Les deux mots principaux du nom h sont dans d
          (
            split_part(lower(h.full_name), ' ', 2) != '' AND 
            split_part(lower(h.full_name), ' ', 3) != '' AND
            lower(d.full_name) LIKE '%' || split_part(lower(h.full_name), ' ', 2) || '%' AND 
            lower(d.full_name) LIKE '%' || split_part(lower(h.full_name), ' ', 3) || '%' AND
            length(split_part(h.full_name, ' ', 2)) > 2 AND
            length(split_part(h.full_name, ' ', 3)) > 2
          )
          OR
          -- Les deux mots principaux du nom d sont dans h
          (
            split_part(lower(d.full_name), ' ', 1) != '' AND 
            split_part(lower(d.full_name), ' ', 2) != '' AND
            lower(h.full_name) LIKE '%' || split_part(lower(d.full_name), ' ', 1) || '%' AND 
            lower(h.full_name) LIKE '%' || split_part(lower(d.full_name), ' ', 2) || '%' AND
            length(split_part(d.full_name, ' ', 1)) > 2 AND
            length(split_part(d.full_name, ' ', 2)) > 2
          )
        ) THEN 8
      -- Similarité élevée
      WHEN jaro_winkler_similarity(lower(h.full_name), lower(d.full_name)) >= 0.92 THEN 7
      -- Un nom complet contient l'autre
      WHEN lower(h.full_name) LIKE '%' || lower(d.full_name) || '%' OR
           lower(d.full_name) LIKE '%' || lower(h.full_name) || '%' THEN 6
      -- Similarité moyenne mais significative
      WHEN jaro_winkler_similarity(lower(h.full_name), lower(d.full_name)) >= 0.85 THEN 5
      -- Similarité plus faible mais toujours pertinente
      WHEN jaro_winkler_similarity(lower(h.full_name), lower(d.full_name)) >= 0.7 THEN 3
      ELSE 0
    END AS full_name_score,
    
    -- Score pour prénom (max 10 points) - Version améliorée
    CASE
      WHEN h.prenom_proprio IS NULL OR d.prenom_proprio IS NULL THEN 0
      -- Correspondance exacte
      WHEN lower(trim(h.prenom_proprio)) = lower(trim(d.prenom_proprio)) THEN 10
      -- Similarité très élevée
      WHEN jaro_winkler_similarity(lower(trim(h.prenom_proprio)), lower(trim(d.prenom_proprio))) >= 0.92 THEN 8
      -- Prénom présent dans le nom complet de l'autre
      WHEN lower(trim(d.full_name)) LIKE '%' || lower(trim(h.prenom_proprio)) || '%' OR
           lower(trim(h.full_name)) LIKE '%' || lower(trim(d.prenom_proprio)) || '%' THEN 7
      -- Similarité élevée
      WHEN jaro_winkler_similarity(lower(trim(h.prenom_proprio)), lower(trim(d.prenom_proprio))) >= 0.85 THEN 6
      -- Première lettre identique et similarité correcte
      WHEN left(lower(trim(h.prenom_proprio)), 1) = left(lower(trim(d.prenom_proprio)), 1) AND
           jaro_winkler_similarity(lower(trim(h.prenom_proprio)), lower(trim(d.prenom_proprio))) >= 0.7 THEN 5
      -- Similarité moyenne
      WHEN jaro_winkler_similarity(lower(trim(h.prenom_proprio)), lower(trim(d.prenom_proprio))) >= 0.7 THEN 3
      -- Forme abrégée (ex: J. pour Jean)
      WHEN left(lower(trim(h.prenom_proprio)), 1) = left(lower(trim(d.prenom_proprio)), 1) THEN 2
      ELSE 0
    END AS prenom_score,
    
    -- Score pour nom (max 10 points) - Version améliorée
    CASE
      WHEN h.nom_proprio IS NULL OR d.nom_proprio IS NULL THEN 0
      -- Correspondance exacte
      WHEN lower(trim(h.nom_proprio)) = lower(trim(d.nom_proprio)) THEN 10
      -- Nom présent dans le nom complet et significatif
      WHEN (lower(trim(d.full_name)) LIKE '%' || lower(trim(h.nom_proprio)) || '%' OR
            lower(trim(h.full_name)) LIKE '%' || lower(trim(d.nom_proprio)) || '%') AND
           length(trim(h.nom_proprio)) > 3 AND length(trim(d.nom_proprio)) > 3 THEN 8
      -- Similarité très élevée
      WHEN jaro_winkler_similarity(lower(trim(h.nom_proprio)), lower(trim(d.nom_proprio))) >= 0.92 THEN 8
      -- Nom présent dans le nom complet
      WHEN lower(trim(d.full_name)) LIKE '%' || lower(trim(h.nom_proprio)) || '%' OR
           lower(trim(h.full_name)) LIKE '%' || lower(trim(d.nom_proprio)) || '%' THEN 7
      -- Similarité élevée
      WHEN jaro_winkler_similarity(lower(trim(h.nom_proprio)), lower(trim(d.nom_proprio))) >= 0.85 THEN 6
      -- Nom sans particules nobiliaires
      WHEN replace(replace(replace(replace(replace(lower(trim(h.nom_proprio)), 'de ', ''), 'le ', ''), 'la ', ''), 'du ', ''), 'van ', '') = 
           replace(replace(replace(replace(replace(lower(trim(d.nom_proprio)), 'de ', ''), 'le ', ''), 'la ', ''), 'du ', ''), 'van ', '') THEN 6
      -- Similarité moyenne
      WHEN jaro_winkler_similarity(lower(trim(h.nom_proprio)), lower(trim(d.nom_proprio))) >= 0.7 THEN 3
      -- Premier mot du nom correspond (pour les noms composés)
      WHEN split_part(lower(trim(h.nom_proprio)), ' ', 1) = split_part(lower(trim(d.nom_proprio)), ' ', 1) AND
           length(split_part(lower(trim(h.nom_proprio)), ' ', 1)) > 2 THEN 2
      ELSE 0
    END AS nom_score,
    
    -- Score pour code postal (max 10 points)
    CASE
      WHEN h.code_postal_proprio IS NULL OR d.code_postal_proprio IS NULL THEN 0
      WHEN h.code_postal_proprio = d.code_postal_proprio THEN 10
      -- Mêmes premiers chiffres (département/zone)
      WHEN left(h.code_postal_proprio, 3) = left(d.code_postal_proprio, 3) THEN 7
      WHEN left(h.code_postal_proprio, 2) = left(d.code_postal_proprio, 2) THEN 5
      ELSE 0
    END AS code_postal_score,
    
    -- Score pour ville (max 10 points)
    CASE
      WHEN h.ville_proprio IS NULL OR d.ville_proprio IS NULL THEN 0
      WHEN lower(trim(h.ville_proprio)) = lower(trim(d.ville_proprio)) THEN 10
      -- Petites différences textuelles (fautes de frappe, accents)
      WHEN levenshtein(lower(trim(h.ville_proprio)), lower(trim(d.ville_proprio))) <= 2 THEN 8
      -- Ville similaire mais avec plus de différences
      WHEN levenshtein(lower(trim(h.ville_proprio)), lower(trim(d.ville_proprio))) <= 6 THEN 5
      -- Différences plus importantes
      WHEN levenshtein(lower(trim(h.ville_proprio)), lower(trim(d.ville_proprio))) <= 10 THEN 2
      ELSE 0
    END AS ville_score,
    
    -- Score pour adresse (max 10 points)
    CASE
      WHEN h.adresse_proprio IS NULL OR d.adresse_proprio IS NULL THEN 0
      WHEN lower(trim(h.adresse_proprio)) = lower(trim(d.adresse_proprio)) THEN 10
      -- Petites différences textuelles
      WHEN levenshtein(lower(trim(h.adresse_proprio)), lower(trim(d.adresse_proprio))) <= 2 THEN 8
      -- Adresse similaire mais avec plus de différences
      WHEN levenshtein(lower(trim(h.adresse_proprio)), lower(trim(d.adresse_proprio))) <= 6 THEN 5
      -- Différences plus importantes
      WHEN levenshtein(lower(trim(h.adresse_proprio)), lower(trim(d.adresse_proprio))) <= 10 THEN 2
      ELSE 0
    END AS adresse_score,
    
    -- Score pour nombre de pièces (max 10 points)
    CASE
      WHEN h.nb_pieces_logement IS NULL OR d.nb_pieces_logement IS NULL THEN 0
      WHEN h.nb_pieces_logement = d.nb_pieces_logement THEN 10
      WHEN h.nb_pieces_logement = 0 AND d.nb_pieces_logement = 0 THEN 10  -- Cas spécial quand les deux sont zéro
      WHEN GREATEST(h.nb_pieces_logement, d.nb_pieces_logement) = 0 THEN 0  -- Éviter division par zéro
      WHEN ABS(h.nb_pieces_logement - d.nb_pieces_logement) / GREATEST(h.nb_pieces_logement, d.nb_pieces_logement) < 0.01 THEN 9
      WHEN ABS(h.nb_pieces_logement - d.nb_pieces_logement) / GREATEST(h.nb_pieces_logement, d.nb_pieces_logement) < 0.03 THEN 8
      WHEN ABS(h.nb_pieces_logement - d.nb_pieces_logement) / GREATEST(h.nb_pieces_logement, d.nb_pieces_logement) < 0.1 THEN 6
      WHEN ABS(h.nb_pieces_logement - d.nb_pieces_logement) / GREATEST(h.nb_pieces_logement, d.nb_pieces_logement) < 0.3 THEN 3
      -- Si la différence est d'exactement 1 pièce (important pour les petits logements)
      WHEN ABS(h.nb_pieces_logement - d.nb_pieces_logement) = 1 THEN 5
      ELSE 0
    END AS nb_pieces_score,
    
    -- Bonus pour le même ban_id (20 points supplémentaires)
    CASE
      WHEN h.ban_id = d.ban_id AND h.ban_id IS NOT NULL AND d.ban_id IS NOT NULL THEN 20
      ELSE 0
    END AS ban_id_score
    
  FROM housing_prepared h
  INNER JOIN histo_data d
  -- Blocage amélioré pour optimiser les performances tout en maximisant les correspondances
  ON (
      -- Même code postal ou code postal NULL
      (h.code_postal_proprio = d.code_postal_proprio 
        OR h.code_postal_proprio IS NULL 
        OR d.code_postal_proprio IS NULL)
      
      -- OU Même ban_id
      OR (h.ban_id = d.ban_id AND h.ban_id IS NOT NULL)
      
      -- OU Correspondance forte sur le nom et prénom
      OR (
          (lower(trim(h.nom_proprio)) = lower(trim(d.nom_proprio)) OR
           jaro_winkler_similarity(lower(trim(h.nom_proprio)), lower(trim(d.nom_proprio))) >= 0.9)
          AND
          (lower(trim(h.prenom_proprio)) = lower(trim(d.prenom_proprio)) OR
           jaro_winkler_similarity(lower(trim(h.prenom_proprio)), lower(trim(d.prenom_proprio))) >= 0.9)
      )
      
      -- OU Correspondance forte sur le nom complet
      OR (
          jaro_winkler_similarity(
              replace(replace(replace(lower(trim(h.full_name)), 'm ', ''), 'mme ', ''), 'sci ', ''),
              replace(replace(replace(lower(trim(d.full_name)), 'm ', ''), 'mme ', ''), 'sci ', '')
          ) >= 0.9
      )
  )
)

SELECT 
  matching_scores.unique_id_housing,
  matching_scores.ban_id,
  matching_scores.unique_id_histo,
  
  -- Colonnes de la table housing_prepared
  h.full_name AS full_name_housing,
  h.nb_pieces_logement AS nb_pieces_logement_housing,
  h.superficie AS superficie_housing,
  h.prenom_proprio AS prenom_proprio_housing,
  h.nom_proprio AS nom_proprio_housing,
  h.code_postal_proprio AS code_postal_proprio_housing,
  h.adresse_proprio AS adresse_proprio_housing,
  h.ville_proprio AS ville_proprio_housing,
  
  -- Colonnes de la table histo_data
  d.full_name AS full_name_histo,
  d.nb_pieces_logement AS nb_pieces_logement_histo,
  d.superficie AS superficie_histo,
  d.prenom_proprio AS prenom_proprio_histo,
  d.nom_proprio AS nom_proprio_histo,
  d.code_postal_proprio AS code_postal_proprio_histo,
  d.adresse_proprio AS adresse_proprio_histo,
  d.ville_proprio AS ville_proprio_histo,
  
  -- Scores unitaires
  superficie_score,
  prenom_score,
  full_name_score,
  nom_score,
  code_postal_score,
  ville_score,
  adresse_score,
  nb_pieces_score,
  ban_id_score,
  
  -- Calcul du score total (somme pondérée)
  (
    superficie_score * 0.10 +
    prenom_score * 0.15 +
    full_name_score * 0.10 +
    nom_score * 0.20 +
    code_postal_score * 0.15 +
    ville_score * 0.05 +
    adresse_score * 0.05 +
    nb_pieces_score * 0.10 +
    ban_id_score * 0.10
  ) AS match_score  -- Score final sur 100

FROM matching_scores
LEFT JOIN housing_prepared h ON matching_scores.unique_id_housing = h.unique_id
LEFT JOIN histo_data d ON matching_scores.unique_id_histo = d.unique_id

-- Option: Filtrer pour ne garder que les scores supérieurs à un seuil
-- WHERE match_score >= 40

ORDER BY match_score DESC
"""

In [57]:
query = """
WITH matching_scores AS (
  SELECT 
    h.unique_id AS unique_id_housing,
    h.ban_id,
    d.unique_id AS unique_id_histo,
    -- Score pour superficie (max 10 points)
    CASE
      WHEN h.superficie IS NULL OR d.superficie IS NULL THEN 0
      WHEN h.superficie = d.superficie THEN 10
      -- WHEN h.superficie = 0 AND d.superficie = 0 THEN 10
      WHEN ABS(h.superficie - d.superficie) / GREATEST(h.superficie, d.superficie) < 0.01 THEN 9
      WHEN ABS(h.superficie - d.superficie) / GREATEST(h.superficie, d.superficie) < 0.03 THEN 8
      WHEN ABS(h.superficie - d.superficie) / GREATEST(h.superficie, d.superficie) < 0.1 THEN 6
      WHEN ABS(h.superficie - d.superficie) / GREATEST(h.superficie, d.superficie) < 0.3 THEN 3
      ELSE 1
    END AS superficie_score,
-- Score pour nom complet (max 10 points)
CASE
  WHEN h.full_name IS NULL OR d.full_name IS NULL THEN 0
  
  -- Correspondance exacte
  WHEN h.full_name = d.full_name THEN 10
  
  -- Normaliser en supprimant les préfixes courants (version simple)
  WHEN trim(replace(replace(replace(lower(h.full_name), 'm ', ''), 'mme ', ''), 'sci ', '')) = 
       trim(replace(replace(replace(lower(d.full_name), 'm ', ''), 'mme ', ''), 'sci ', '')) THEN 9
  
  -- Cas spécifique pour l'inversion nom/prénom (BERTRANDIE HENRI vs Henri BERTRANDIE)
  WHEN 
    -- Vérifier si les deux premiers mots du premier nom sont présents dans le second
    (
      lower(d.full_name) LIKE '%' || split_part(lower(h.full_name), ' ', 2) || '%' AND
      lower(d.full_name) LIKE '%' || split_part(lower(h.full_name), ' ', 3) || '%' AND
      length(split_part(h.full_name, ' ', 2)) > 2 AND
      length(split_part(h.full_name, ' ', 3)) > 2
    )
    OR
    -- Vérifier si les deux premiers mots du second nom sont présents dans le premier
    (
      lower(h.full_name) LIKE '%' || split_part(lower(d.full_name), ' ', 1) || '%' AND
      lower(h.full_name) LIKE '%' || split_part(lower(d.full_name), ' ', 2) || '%' AND
      length(split_part(d.full_name, ' ', 1)) > 2 AND
      length(split_part(d.full_name, ' ', 2)) > 2
    )
  THEN 8
  
  -- Similarité élevée
  WHEN jaro_winkler_similarity(lower(h.full_name), lower(d.full_name)) >= 0.92 THEN 7
  
  -- Vérifier si un nom est contenu dans l'autre
  WHEN 
    lower(h.full_name) LIKE '%' || lower(d.full_name) || '%' OR
    lower(d.full_name) LIKE '%' || lower(h.full_name) || '%'
  THEN 6
  
  -- Similarité moyenne mais significative
  WHEN jaro_winkler_similarity(lower(h.full_name), lower(d.full_name)) >= 0.85 THEN 5
  
  -- Similarité plus faible mais toujours pertinente
  WHEN jaro_winkler_similarity(lower(h.full_name), lower(d.full_name)) >= 0.7 THEN 3
  
  ELSE 0

END AS full_name_score,
    
    -- Score pour nom (max 10 points)
  CASE
    WHEN h.prenom_proprio IS NULL OR d.prenom_proprio IS NULL THEN 0
    -- Correspondance exacte
    WHEN lower(trim(h.prenom_proprio)) = lower(trim(d.prenom_proprio)) THEN 10
    -- Similarité très élevée (diminution des exigences pour les accents/caractères spéciaux)
    WHEN jaro_winkler_similarity(lower(trim(h.prenom_proprio)), lower(trim(d.prenom_proprio))) >= 0.92 THEN 8
    -- Prénom présent dans le nom complet de l'autre enregistrement
    WHEN 
      lower(trim(d.full_name)) LIKE '%' || lower(trim(h.prenom_proprio)) || '%' OR
      lower(trim(h.full_name)) LIKE '%' || lower(trim(d.prenom_proprio)) || '%'
    THEN 7
    -- Diminution des niveaux de similarité pour être plus inclusif
    WHEN jaro_winkler_similarity(lower(trim(h.prenom_proprio)), lower(trim(d.prenom_proprio))) >= 0.85 THEN 6
    -- Première lettre identique et similarité correcte 
    WHEN 
      left(lower(trim(h.prenom_proprio)), 1) = left(lower(trim(d.prenom_proprio)), 1) AND
      jaro_winkler_similarity(lower(trim(h.prenom_proprio)), lower(trim(d.prenom_proprio))) >= 0.7
    THEN 5
    -- Similarité plus faible mais toujours pertinente
    WHEN jaro_winkler_similarity(lower(trim(h.prenom_proprio)), lower(trim(d.prenom_proprio))) >= 0.7 THEN 3
    -- Première lettre identique (forme abrégée possible)
    WHEN left(lower(trim(h.prenom_proprio)), 1) = left(lower(trim(d.prenom_proprio)), 1) THEN 2
    ELSE 0
  END AS prenom_score,
  CASE
  WHEN h.nom_proprio IS NULL OR d.nom_proprio IS NULL THEN 0
  -- Correspondance exacte
  WHEN lower(trim(h.nom_proprio)) = lower(trim(d.nom_proprio)) THEN 10
  -- Nom présent dans le nom complet de l'autre enregistrement et similarité élevée
  WHEN 
    (lower(trim(d.full_name)) LIKE '%' || lower(trim(h.nom_proprio)) || '%' OR
     lower(trim(h.full_name)) LIKE '%' || lower(trim(d.nom_proprio)) || '%') AND
    length(trim(h.nom_proprio)) > 3 AND length(trim(d.nom_proprio)) > 3  -- éviter les faux positifs avec des noms très courts
  THEN 8
  -- Similarité très élevée
  WHEN jaro_winkler_similarity(lower(trim(h.nom_proprio)), lower(trim(d.nom_proprio))) >= 0.92 THEN 8
  -- Nom présent dans le nom complet de l'autre enregistrement
  WHEN 
    lower(trim(d.full_name)) LIKE '%' || lower(trim(h.nom_proprio)) || '%' OR
    lower(trim(h.full_name)) LIKE '%' || lower(trim(d.nom_proprio)) || '%'
  THEN 7
  -- Similarité élevée
  WHEN jaro_winkler_similarity(lower(trim(h.nom_proprio)), lower(trim(d.nom_proprio))) >= 0.85 THEN 6
  -- Nom sans particules (ex: "DE", "VAN", "LE") correspond
  WHEN 
    regexp_replace(lower(trim(h.nom_proprio)), '^(de |le |la |du |des |von |van |d\')', '') = 
    regexp_replace(lower(trim(d.nom_proprio)), '^(de |le |la |du |des |von |van |d\')', '')
  THEN 6
  -- Similarité plus faible mais toujours pertinente
  WHEN jaro_winkler_similarity(lower(trim(h.nom_proprio)), lower(trim(d.nom_proprio))) >= 0.7 THEN 3
  -- Premier mot du nom correspond (pour les noms composés)
  WHEN 
    split_part(lower(trim(h.nom_proprio)), ' ', 1) = split_part(lower(trim(d.nom_proprio)), ' ', 1) AND
    length(split_part(lower(trim(h.nom_proprio)), ' ', 1)) > 2  -- éviter les particules
  THEN 2
  ELSE 0
END AS nom_score,
    
    -- Score pour code postal (max 10 points)
    CASE
      WHEN h.code_postal_proprio IS NULL OR d.code_postal_proprio IS NULL THEN 0
      WHEN h.code_postal_proprio = d.code_postal_proprio THEN 10
      -- Simplification de la vérification du secteur pour les codes postaux français
      WHEN LEFT(h.code_postal_proprio, 3) = LEFT(d.code_postal_proprio, 3) THEN 7
      WHEN LEFT(h.code_postal_proprio, 2) = LEFT(d.code_postal_proprio, 2) THEN 5
      ELSE 0
    END AS code_postal_score,
    
    -- Score pour ville (max 10 points)
    CASE
      WHEN h.ville_proprio IS NULL OR d.ville_proprio IS NULL THEN 0
      WHEN h.ville_proprio = d.ville_proprio THEN 10
      WHEN levenshtein(h.ville_proprio, d.ville_proprio) <= 2 THEN 8
      WHEN levenshtein(h.ville_proprio, d.ville_proprio) <= 6 THEN 5
      WHEN levenshtein(h.ville_proprio, d.ville_proprio) <= 10 THEN 2
      ELSE 0
    END AS ville_score,
    
    -- Score pour adresse (max 10 points)
    CASE
      WHEN h.adresse_proprio IS NULL OR d.adresse_proprio IS NULL THEN 0
      WHEN h.adresse_proprio = d.adresse_proprio THEN 10
      WHEN levenshtein(h.adresse_proprio, d.adresse_proprio) <= 2 THEN 8
      WHEN levenshtein(h.adresse_proprio, d.adresse_proprio) <= 6 THEN 5
      WHEN levenshtein(h.adresse_proprio, d.adresse_proprio) <= 10 THEN 2
      ELSE 0
    END AS adresse_score,
    
    -- Score pour nombre de pièces (max 10 points)
    CASE
      WHEN h.nb_pieces_logement IS NULL OR d.nb_pieces_logement IS NULL THEN 0
      WHEN h.nb_pieces_logement = d.nb_pieces_logement THEN 10
      WHEN ABS(h.nb_pieces_logement - d.nb_pieces_logement) / GREATEST(h.nb_pieces_logement, d.nb_pieces_logement) < 0.01 THEN 9
      WHEN ABS(h.nb_pieces_logement - d.nb_pieces_logement) / GREATEST(h.nb_pieces_logement, d.nb_pieces_logement) < 0.03 THEN 8
      WHEN ABS(h.nb_pieces_logement - d.nb_pieces_logement) / GREATEST(h.nb_pieces_logement, d.nb_pieces_logement) < 0.1 THEN 6
      WHEN ABS(h.nb_pieces_logement - d.nb_pieces_logement) / GREATEST(h.nb_pieces_logement, d.nb_pieces_logement) < 0.3 THEN 3
      ELSE 0
    END AS nb_pieces_score,
    
    -- Bonus pour le même ban_id (20 points supplémentaires)
    CASE
      WHEN h.ban_id = d.ban_id THEN 20
      ELSE 0
    END AS ban_id_score
    
  FROM housing_prepared h
  INNER JOIN histo_data d
  -- Utiliser un blocage pour améliorer les performances
  ON (
       -- Condition 1: Même code postal ou code postal NULL
       (h.code_postal_proprio = d.code_postal_proprio 
        OR h.code_postal_proprio IS NULL 
        OR d.code_postal_proprio IS NULL)
       
       -- Condition 2: Même ban_id
       OR h.ban_id = d.ban_id
     )
)

SELECT 
  matching_scores.unique_id_housing,
  matching_scores.ban_id,
  matching_scores.unique_id_histo,
  -- Colonnes de la table housing_prepared
  h.full_name AS full_name_housing,
  h.nb_pieces_logement AS nb_pieces_logement_housing,
  h.superficie AS superficie_housing,
  h.prenom_proprio AS prenom_proprio_housing,
  h.nom_proprio AS nom_proprio_housing,
  h.code_postal_proprio AS code_postal_proprio_housing,
  h.adresse_proprio AS adresse_proprio_housing,
  h.ville_proprio AS ville_proprio_housing,
  
  -- Colonnes de la table histo_data
  d.full_name AS full_name_histo,
  d.nb_pieces_logement AS nb_pieces_logement_histo,
  d.superficie AS superficie_histo,
  d.prenom_proprio AS prenom_proprio_histo,
  d.nom_proprio AS nom_proprio_histo,
  d.code_postal_proprio AS code_postal_proprio_histo,
  d.adresse_proprio AS adresse_proprio_histo,
  d.ville_proprio AS ville_proprio_histo,
  
  -- Scores unitaires
  superficie_score,
  prenom_score,
  full_name_score,
  nom_score,
  code_postal_score,
  ville_score,
  adresse_score,
  nb_pieces_score,
  ban_id_score,
  
  -- Calculer le score total (somme pondérée des scores individuels)
  (
    superficie_score * 0.10 +
    prenom_score * 0.15 +
    full_name_score * 0.10 +
    nom_score * 0.20 +
    code_postal_score * 0.15 +
    ville_score * 0.05 +
    adresse_score * 0.05 +
    nb_pieces_score * 0.10 +
    ban_id_score * 0.10
  ) / 1.0 AS match_score  -- Score final sur 100
FROM matching_scores
-- Joindre à nouveau les tables pour récupérer toutes les colonnes
LEFT JOIN housing_prepared h ON unique_id_housing = h.unique_id
LEFT JOIN histo_data d ON unique_id_histo = d.unique_id
-- Filtrer pour ne garder que les scores supérieurs à un seuil
ORDER BY match_score DESC
"""

In [None]:
#df_housing_to_link = connection_source.execute("SELECT * FROM housing_prepared").fetchdf()
df_scores = connection_source.execute(query_v2).fetchdf()

In [None]:
df_scores.filter(regex="full_name").head(100)

Unnamed: 0,full_name_housing,full_name_histo,full_name_score
0,M LAMOURI BOUAMEUR,HALIM LAMOURI BOUAMEUR,5
1,PAZZONI ISABELLE PAULETTE ANNA,HALIM LAMOURI BOUAMEUR,5
2,M BERTRANDIE HENRI,Henri BERTRANDIE,3
3,BRUNOIS ANNIE MARIE GENEVIEVE,Henri BERTRANDIE,3
4,SCI CHARLEMAGNE,SCI CHARLEMAGNE,9
...,...,...,...
95,SCI KADEM,SCI KADEM - M. BELBACHIR,6
96,SCI KADEM,SCI KADEM - M. BELBACHIR,6
97,MME ANDRE JACQUELINE,JACQUELINE ANDRE,3
98,ANDRE MAX LOUIS,JACQUELINE ANDRE,3


In [22]:
df_scores.filter(regex="full_name")

Unnamed: 0,full_name_housing,full_name_histo,full_name_score
0,M LAMOURI BOUAMEUR,HALIM LAMOURI BOUAMEUR,5
1,PAZZONI ISABELLE PAULETTE ANNA,HALIM LAMOURI BOUAMEUR,5
2,M BERTRANDIE HENRI,Henri BERTRANDIE,3
3,BRUNOIS ANNIE MARIE GENEVIEVE,Henri BERTRANDIE,3
4,SCI CHARLEMAGNE,SCI CHARLEMAGNE,9
...,...,...,...
27616,EUN,Maria WAGNER,0
27617,FEDERATION AGIRC-ARRCO,FONCIA,0
27618,FEDERATION AGIRC-ARRCO,FONCIA,0
27619,FONCIA SYNDIC,;;;;;;;;;,0


In [14]:
df_scores.filter(regex="full_name").head(100).to_clipboard(index=False)

In [162]:
df_histo.superficie.isnull().sum()

439

In [17]:
df_histo_to_link = connection_source.execute("SELECT * FROM histo_data").fetchdf()
df_housing_to_link = connection_source.execute("""
                                               SELECT * FROM housing_prepared
                                               INNER JOIN histo_data ON housing_prepared.ban_id = histo_data.ban_id
                                               """).fetchdf()

In [10]:
cols = list(set(df_histo_to_link.columns).intersection(set(df_housing_to_link.columns)) )

df_histo = df_histo_to_link[cols]
df_housing = df_housing_to_link[cols]

In [11]:
db_api = DuckDBAPI()

In [178]:
profile_columns(
    table_or_tables=[df_housing_to_link, df_histo_to_link],
    db_api=db_api,
    column_expressions=[
        "nb_pieces_logement",
        "superficie",
        "prenom_proprio",
        "nom_proprio",
    ],
)

In [39]:
# Définir les règles de blocage
blocking_rules = [
    block_on("ban_id"),
]

In [40]:
comparison_superficie = {
    "output_column_name": "superficie",
    "comparison_levels": [
        cll.NullLevel("superficie"),
        cll.ExactMatchLevel("superficie"),
        cll.PercentageDifferenceLevel("superficie", 0.01),
        cll.PercentageDifferenceLevel("superficie", 0.03),
        cll.PercentageDifferenceLevel("superficie", 0.1),
        cll.PercentageDifferenceLevel("superficie", 0.3),
        cll.ElseLevel(),
    ],
    "comparison_description": "Superficie percentage difference",
}
comparison_nb_pieces_logement = {
    "output_column_name": "nb_pieces_logement",
    "comparison_levels": [
        cll.NullLevel("nb_pieces_logement"),
        cll.ExactMatchLevel("nb_pieces_logement"),
        cll.PercentageDifferenceLevel("nb_pieces_logement", 0.01),
        cll.PercentageDifferenceLevel("nb_pieces_logement", 0.03),
        cll.PercentageDifferenceLevel("nb_pieces_logement", 0.1),
        cll.PercentageDifferenceLevel("nb_pieces_logement", 0.3),
        cll.ElseLevel(),
    ],
    "comparison_description": "Nombre de pièces percentage difference",
}

In [7]:
!open .

In [41]:
settings = SettingsCreator(
    link_type="link_only",
    probability_two_random_records_match=1 / len(df_housing_to_link),
    blocking_rules_to_generate_predictions=blocking_rules,
    comparisons=[
        comparison_superficie,
        cl.NameComparison("prenom_proprio"),
        cl.NameComparison("full_name", ),
        cl.NameComparison("nom_proprio"),
        cl.PostcodeComparison("code_postal_proprio"),
        cl.LevenshteinAtThresholds("ville_proprio", [2, 6, 10]),
        cl.LevenshteinAtThresholds("adresse_proprio", [2, 6, 10]),
        comparison_nb_pieces_logement,
    ],
    retain_intermediate_calculation_columns=True,
)

In [2]:
db_api = DuckDBAPI()
linker = Linker(
    [df_housing_to_link, df_histo_to_link],
    settings,
    input_table_aliases=["_housing", "_histo"],
    db_api=db_api,
)

NameError: name 'DuckDBAPI' is not defined

In [1]:
linker.training.estimate_u_using_random_sampling()

NameError: name 'linker' is not defined

In [44]:
df_predict = linker.inference.predict(threshold_match_probability=0.001)

Blocking time: 0.01 seconds
Predict time: 1.24 seconds

You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'superficie':
    m values not fully trained
Comparison: 'prenom_proprio':
    m values not fully trained
Comparison: 'full_name':
    m values not fully trained
Comparison: 'full_name':
    u values not fully trained
Comparison: 'nom_proprio':
    m values not fully trained
Comparison: 'nom_proprio':
    u values not fully trained
Comparison: 'code_postal_proprio':
    m values not fully trained
Comparison: 'code_postal_proprio':
    u values not fully trained
Comparison: 'ville_proprio':
    m values not fully trained
Comparison: 'ville_proprio':
    u values not fully trained
Comparison: 'adresse_proprio':
    m values not fully trained
Comparison: 'adresse_proprio':
    u values not fu

In [46]:
df_predict.as_pandas_dataframe()

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,superficie_l,superficie_r,gamma_superficie,bf_superficie,...,adresse_proprio_l,adresse_proprio_r,gamma_adresse_proprio,bf_adresse_proprio,nb_pieces_logement_l,nb_pieces_logement_r,gamma_nb_pieces_logement,bf_nb_pieces_logement,ban_id_l,ban_id_r
0,-8.812968,0.002219,_histo,_housing,10003952,26910231273,33.0,37.0,1,0.036275,...,,22 RUE LEON BLUM 02840 ATHIES SOUS LAON,-1,1.0,2,2,5,4.379041,02691_4710_00052,02691_4710_00052
1,-7.676267,0.004865,_histo,_housing,10003613,621080616337,20.0,20.0,5,79.207316,...,2 Rue du Grand Marais,226 RUE DE L IMPERATRICE 62600 BERCK,0,0.0125,1,1,5,4.379041,62108_0860_00226,62108_0860_00226
2,-7.676267,0.004865,_histo,_housing,10003613,621080616337,20.0,20.0,5,79.207316,...,2 Rue du Grand Marais,226 RUE DE L IMPERATRICE 62600 BERCK,0,0.0125,1,1,5,4.379041,62108_0860_00226,62108_0860_00226
3,-6.667805,0.00974,_histo,_housing,10001213,720890036530,84.0,84.0,5,79.207316,...,,3 SAINT CHERON 72240 MEZIERES SOUS LAVARDIN,-1,1.0,1,5,0,0.015209,72089_0360_00002,72089_0360_00002
4,-5.865826,0.016859,_histo,_housing,10001868,600570217055,30.0,30.0,5,79.207316,...,,0002 RUE DU FROMENTAL 95530 FRETTE SUR SEINE (LA),-1,1.0,1,1,5,4.379041,60057_1870_00010,60057_1870_00010
5,-5.865826,0.016859,_histo,_housing,10001868,600570217054,30.0,30.0,5,79.207316,...,,0002 RUE DU FROMENTAL 95530 FRETTE SUR SEINE (LA),-1,1.0,1,1,5,4.379041,60057_1870_00010,60057_1870_00010
6,-4.820183,0.034188,_histo,_housing,10001232,644450178589,60.0,60.0,5,79.207316,...,18 avenue Fouchet,45 BD D ALSACE LORRAINE 64000 PAU,0,0.0125,3,3,5,4.379041,64445_2690_00002,64445_2690_00002
7,-7.01023,0.007698,_histo,_housing,10000386,13220336001,102.0,101.0,4,103.386392,...,,0001 AV GEORGES POMPIDOU 69003 LYON,-1,1.0,4,4,5,4.379041,01322_0545_00111,01322_0545_00111
8,-7.01023,0.007698,_histo,_housing,10000386,13220335995,102.0,101.0,4,103.386392,...,,0001 AV GEORGES POMPIDOU 69003 LYON,-1,1.0,4,4,5,4.379041,01322_0545_00111,01322_0545_00111
9,-7.394571,0.005908,_histo,_housing,10000386,13220335984,102.0,102.0,5,79.207316,...,,0001 AV GEORGES POMPIDOU 69003 LYON,-1,1.0,4,4,5,4.379041,01322_0545_00111,01322_0545_00111


In [19]:
df = df_housing_to_link.merge(df_histo_to_link, on="ban_id", how="inner", suffixes=("_housing", "_histo"))

In [22]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from tqdm import tqdm


# Définir les colonnes à comparer et leur poids
comparison_columns = [
    # Colonnes textuelles (nom, prénom, adresse)
    {'housing': 'prenom_proprio_housing', 'histo': 'prenom_proprio_histo', 'weight': 0.15, 'type': 'text'},
    {'housing': 'nom_proprio_housing', 'histo': 'nom_proprio_histo', 'weight': 0.20, 'type': 'text'},
    {'housing': 'full_name_housing', 'histo': 'full_name_histo', 'weight': 0.10, 'type': 'text'},
    {'housing': 'adresse_proprio_housing', 'histo': 'adresse_proprio_histo', 'weight': 0.10, 'type': 'text'},
    {'housing': 'ville_proprio_housing', 'histo': 'ville_proprio_histo', 'weight': 0.10, 'type': 'text'},
    {'housing': 'code_postal_proprio_housing', 'histo': 'code_postal_proprio_histo', 'weight': 0.15, 'type': 'text'},
    
    # Colonnes numériques (superficie, nombre de pièces)
    {'housing': 'superficie_housing', 'histo': 'superficie_histo', 'weight': 0.10, 'type': 'numeric'},
    {'housing': 'nb_pieces_logement_housing', 'histo': 'nb_pieces_logement_histo', 'weight': 0.10, 'type': 'numeric'}
]

# Fonction pour calculer la similarité entre deux valeurs numériques
def numeric_similarity(val1, val2):
    if pd.isna(val1) or pd.isna(val2):
        return 0.0
    
    # Trouver la valeur maximale pour normaliser
    max_val = max(abs(val1), abs(val2))
    if max_val == 0:
        return 100.0  # Si les deux valeurs sont 0, elles sont identiques
    
    # Calculer la différence en pourcentage
    diff_percent = abs(val1 - val2) / max_val
    
    # Convertir en score de similarité (100 = identique, 0 = très différent)
    similarity = max(0, 100 * (1 - min(diff_percent, 1)))
    return similarity

# Fonction pour calculer la similarité entre deux chaînes de caractères
def text_similarity(str1, str2):
    if pd.isna(str1) or pd.isna(str2):
        return 0.0
    
    # Convertir en chaînes de caractères
    str1 = str(str1).lower().strip()
    str2 = str(str2).lower().strip()
    
    if str1 == '' or str2 == '':
        return 0.0
    
    # Utiliser fuzzywuzzy pour calculer la similarité
    return fuzz.token_sort_ratio(str1, str2)

# Fonction pour calculer le score de similarité global entre deux lignes
def calculate_similarity_score(row_housing, row_histo):
    total_score = 0
    total_weight = 0
    full_scores = {}
    
    for col_info in comparison_columns:
        col_housing = col_info['housing']
        col_histo = col_info['histo']
        weight = col_info['weight']
        col_type = col_info['type']
        
        # Vérifier si les colonnes existent dans les deux dataframes
        if col_housing in row_housing and col_histo in row_histo:
            housing_value = row_housing[col_housing]
            histo_value = row_histo[col_histo]
            
            # Calculer la similarité selon le type de colonne
            if col_type == 'numeric':
                similarity = numeric_similarity(housing_value, histo_value)
            else:  # text
                similarity = text_similarity(housing_value, histo_value)

            full_scores[col_housing.replace("housing", "score")] = similarity
            
            # Ajouter au score total
            total_score += similarity * weight
            total_weight += weight
    
    # Normaliser le score final
    if total_weight > 0:
        return total_score / total_weight, full_scores
    else:
        return 0, {}

# Créer une liste pour stocker les résultats
results = []

# Pour chaque ligne dans le dataframe housing
for idx_housing, row in tqdm(df.iterrows(), desc="Matching records", total=len(df)):
    # Pour chaque ligne dans le dataframe histo (ici on utilise le même dataframe car les colonnes sont déjà présentes)

    row_housing = row.filter(regex='_housing')
    row_histo = row.filter(regex='_histo')
    similarity_score, score_description  = calculate_similarity_score(row_housing, row_histo)
    
    # Ajouter à la liste des résultats si le score dépasse un seuil (optionnel)
    if similarity_score > 30:  # Seuil de 50%
        results.append({
            **{
            'unique_id_housing': row_housing['unique_id_housing'],
            'ban_id': row['ban_id'],
            'unique_id_histo': row_histo['unique_id_histo'],
            'match_score': similarity_score, 
            'full_name_housing': row_housing['full_name_housing'],
            'full_name_histo': row_histo['full_name_histo'],
            'superficie_housing': row_housing['superficie_housing'],
            'superficie_histo': row_histo['superficie_histo'],
            'prenom_proprio_housing': row_housing['prenom_proprio_housing'],
            'prenom_proprio_histo': row_histo['prenom_proprio_histo'],
            'nom_proprio_housing': row_housing['nom_proprio_housing'],
            'nom_proprio_histo': row_histo['nom_proprio_histo'],
        }, 
            **score_description
        })

# Convertir la liste des résultats en dataframe
results_df = pd.DataFrame(results)

# Trier par score de correspondance décroissant
results_df = results_df.sort_values(by='match_score', ascending=False)

# Supprimer les doublons en gardant le score de correspondance le plus élevé pour chaque pair unique_id_housing/unique_id_histo
results_df = results_df.drop_duplicates(subset=['unique_id_housing', 'unique_id_histo'], keep='first')

# Afficher les premiers résultats
print(results_df.head(10))

# Exporter vers un fichier CSV
results_df.to_csv('matching_results.csv', index=False)

Matching records: 100%|██████████| 19853/19853 [00:04<00:00, 3981.08it/s]

     unique_id_housing              ban_id  unique_id_histo  match_score  \
3949      330670828582    33067_0185_00002         10001137    71.618116   
2966      240370246554    24037_0670_00024         10003316    71.105556   
2965      240370246555    24037_0670_00024         10003316    70.105556   
6246      330630240712  33063_qxp5af_00003         10002502    66.250000   
4584      930660686898    93066_4775_00006         10000819    65.750000   
7255      243220059976    24322_2430_00108         10002745    65.302326   
7253      243220059977    24322_2430_00108         10002745    65.302326   
5477      060290182577    06029_1370_00007         10002952    65.300000   
6248      021100221587    02110_0220_00005         10003009    65.250000   
3759      674820184164    67482_0472_00034         10004348    65.164286   

                  full_name_housing                       full_name_histo  \
3949                SCI DU DISTRICT               SCI Soci�t� du District   
2966     




In [1]:
!open .