## DEMO: Match Natural Persons

In [None]:
%pip install pandas faker

In [None]:
import uuid
import random

import pandas as pd
from faker import Faker

from fuzzy_matching.match_multi import MultiMatcher
from fuzzy_matching.encryption import AESGCM4Encryptor

In [None]:
# Set up a Faker instance for the "nl-NL" locale and seed it with a fixed
# value so that repeated runs draw the same pseudo-random sequence from it.
# NOTE(review): only this Faker instance is seeded. generate_ids draws from the
# global `random` module, which is not seeded anywhere in this notebook, so the
# generated national IDs differ between runs — confirm that is intended.
faker = Faker(locale="nl-NL")
faker.seed_instance(42)

In [None]:
def generate_ids(n):
    """Build ``n`` random strings shaped like national ID numbers.

    Every string consists of the fixed prefix ``NLD``, a zero-padded
    7-digit number, a gender letter (``M`` or ``F``) and a zero-padded
    16-digit number, in that order.
    """
    prefix = "NLD"

    def _single_id():
        # The draws happen in the order gender, 7-digit part, 16-digit part,
        # so a seeded ``random`` module yields a stable sequence of IDs.
        letter = random.choice(["M", "F"])
        head = str(random.randint(0, 9999999)).zfill(7)
        tail = str(random.randint(0, 9999999999999999)).zfill(16)
        return prefix + head + letter + tail

    return [_single_id() for _ in range(n)]


def generate_names(n):
    """Return a pandas Series holding ``n`` names from the module-level ``faker``."""
    fake_names = [faker.name() for _ in range(n)]
    return pd.Series(fake_names)


def generate_birthdates(n):
    """Return a pandas Series of ``n`` fake birthdates as ``DD-MM-YYYY`` strings.

    The dates are requested from the module-level ``faker`` instance with a
    minimum age of 18 and a maximum age of 100.
    """
    date_format = "%d-%m-%Y"
    # Lazy generator: the dates are still drawn one by one, in order, when the
    # list below is built.
    birthdays = (
        faker.date_of_birth(minimum_age=18, maximum_age=100) for _ in range(n)
    )
    return pd.Series([day.strftime(date_format) for day in birthdays])

def make_ids(n: int) -> list[str]:
    """Generate ``n`` random UUID4 identifiers.

    Args:
        n: Number of identifiers to create.

    Returns:
        A list of ``n`` strings, each the 32-character hexadecimal form
        (no dashes) of a fresh ``uuid.uuid4()``.
    """
    return [uuid.uuid4().hex for _ in range(n)]


In [None]:
# Assemble the demo dataset: one row per synthetic person. The generators are
# called in the order uuid, name, birthdate, national_id.
n = 50_000
data = pd.DataFrame({"uuid": make_ids(n)}).assign(
    name=generate_names(n),
    birthdate=generate_birthdates(n),
    national_id=generate_ids(n),
)
# Display five random rows as a quick look at the result.
data.sample(5)

In [None]:
# Count the distinct values in each column of the generated data.
data.nunique()

In [None]:
# Ask the encryptor class for a freshly generated key ...
encryption_key = AESGCM4Encryptor.generate_key()
# ... and immediately replace it with a fixed 16-byte key, so the generated
# key above is never used.
# NOTE(review): presumably the key is pinned so that data written in an earlier
# run stays readable — confirm. A key hard-coded in a notebook should be for
# demo use only.
encryption_key = b"\x0e\x84\xa1\x01\xd0\xed\x932\xb5\x1dt\x11\x05\xe5j\xf8"

In [None]:
# Per-field matching configuration. Each top-level key is the name of a column
# in `data` and maps to the settings for that field; the three weights sum
# to 1.0.
# NOTE(review): the settings key is spelled "algoritm" (not "algorithm") for
# every field; presumably that is the spelling MultiMatcher expects — confirm
# before changing it.
config = {
    "name": {
        "algoritm": "vector",
        "weight": 0.2,
    },
    "birthdate": {
        "algoritm": "timedelta",
        # Same date layout that generate_birthdates produces.
        "format": "%d-%m-%Y",
        "weight": 0.2,
    },
    "national_id": {
        "algoritm": "alignment",
        "weight": 0.6
    },
}

In [None]:
# Build the matcher from the field configuration and the encryption key.
# NOTE(review): the meaning of the first argument (10) and of "storage" is not
# visible in this notebook — presumably a size/result limit and a storage
# location; confirm against MultiMatcher's signature.
matcher = MultiMatcher(10, config, encryption_key, "storage")

In [None]:
# Hand the generated records to the matcher; "uuid" is passed as the name of
# the identifier column.
matcher.create(data, id_column="uuid")

In [None]:
# Query the matcher with a single person description.
# The query differs in form from the generated data: the birthdate is not
# zero-padded and the national ID is lower-case, whereas generate_ids emits
# upper-case "NLD" and "M"/"F".
# NOTE(review): presumably this is intentional, to show that matching tolerates
# such differences — confirm.
result = matcher.get({
    "name": "Sepp Ketting",
    "birthdate": "6-5-1924",
    "national_id": "nld4011381f3815034902574046"
})
result

## Dedupe testing

In [None]:
import uuid
import random

import pandas as pd

In [None]:
def generate_ids(n):
    """Build ``n`` random strings shaped like national ID numbers.

    Every string consists of the fixed prefix ``NLD``, a zero-padded
    7-digit number, a gender letter (``M`` or ``F``) and a zero-padded
    16-digit number, in that order.
    """
    prefix = "NLD"

    def _single_id():
        # The draws happen in the order gender, 7-digit part, 16-digit part,
        # so a seeded ``random`` module yields a stable sequence of IDs.
        letter = random.choice(["M", "F"])
        head = str(random.randint(0, 9999999)).zfill(7)
        tail = str(random.randint(0, 9999999999999999)).zfill(16)
        return prefix + head + letter + tail

    return [_single_id() for _ in range(n)]

def make_id() -> str:
    """Return a new random UUID4 as a 32-character hex string (no dashes)."""
    fresh = uuid.uuid4()
    return fresh.hex


In [None]:
# Base table for the dedupe experiment: random ID-like values, each paired
# with its own freshly generated uuid.
n = 3_000
values = pd.DataFrame({"value": generate_ids(n)})
values["uuid"] = [make_id() for _ in range(n)]

len(values)

In [None]:
# Add duplicates in three rounds. Each round copies values sampled from the
# previous round's rows (the first round samples the original rows), so some
# values end up duplicated more than once. Every copy gets a fresh uuid.
previous = values
for ndupes in 800, 300, 100:
    dupes = pd.DataFrame({
        "value": previous["value"].sample(ndupes).values,
        "uuid": [make_id() for _ in range(ndupes)]
    })

    previous = dupes
    # ignore_index=True renumbers the rows; without it each `dupes` frame would
    # contribute its own 0..ndupes-1 labels and `values` would end up with
    # repeated index labels.
    values = pd.concat([values, dupes], ignore_index=True)

len(values)

In [None]:
# For every row, record the uuid of the first row that carries the same value,
# so all rows of one duplicate group share a single identifier.
values["duplicate_uuid"] = values.groupby(by="value", as_index=False)["uuid"].transform("first")

In [None]:
# Collapse to one row per distinct value, collecting the uuids of all rows
# with that value into a list.
mapped = values.groupby(by="value", as_index=False).agg(uuid=("uuid", list))
mapped

In [None]:
from rapidfuzz.distance.OSA import normalized_similarity
from rapidfuzz.process import cdist

In [None]:
# Score one query ID against every distinct value in `mapped`, then expand the
# uuid lists so each uuid gets its own row carrying the score of its value.
(
    mapped.assign(
        # `assign` calls the lambda with the frame being built; row 0 of the
        # cdist result holds the scores for the single query string.
        score=lambda frame: cdist(
            ["NLD0007781F8760032249488398"],
            frame["value"],
            scorer=normalized_similarity,
            workers=-1,
        )[0]
    )
    .explode("uuid")
)

In [None]:
def flatten(items):
    """Yield the leaf elements of an arbitrarily nested list/tuple structure.

    Only ``list`` and ``tuple`` instances are descended into; every other
    element (including strings) is yielded unchanged, in left-to-right order.

    The traversal keeps its own stack of iterators instead of recursing, so
    deeply nested input does not hit Python's recursion limit.

    Args:
        items: Any iterable whose elements may themselves be lists or tuples.

    Yields:
        The non-list, non-tuple elements in depth-first order.
    """
    stack = [iter(items)]
    while stack:
        for item in stack[-1]:
            if isinstance(item, (list, tuple)):
                # Descend into the nested sequence; the parent iterator stays
                # on the stack and resumes where it left off afterwards.
                stack.append(iter(item))
                break
            yield item
        else:
            # The innermost iterator is exhausted: go back to its parent.
            stack.pop()

In [None]:
# Quick check: nested lists are expanded in place while strings stay whole,
# so this evaluates to ["111", "222", "333", "444"].
list(flatten(["111", ["222", "333"], "444"]))

In [None]:
# Two small frames for trying out the group-and-merge logic.
# df1 contains the value "a" twice (ids "a" and "b").
df1 = pd.DataFrame({
    "id": ["a", "b", "c", "d"],
    "value": ["a", "a", "b", "c"],
})

# df2 shares the values "a" and "c" with df1 and adds the new values "d" and "e".
df2 = pd.DataFrame({
    "id": ["e", "f", "g", "h"],
    "value": ["a", "c", "d", "e"],
})

In [None]:
# Group df1 by value and gather the ids of each group into one flat list.
grp = df1.groupby("value", as_index=False).agg(lambda ids: list(flatten(ids)))
grp

In [None]:
# Append df2 to the grouped frame and regroup by value. At this point the "id"
# column mixes lists (from grp) with plain strings (from df2); flatten merges
# both into one flat list of ids per value.
pd.concat([grp, df2]).groupby("value", as_index=False).agg(lambda ids: list(flatten(ids)))