In [1]:
import random

import pandas as pd
from faker import Faker

from match_person import PersonMatcher
from encryption import AESGCM4Encryptor

In [2]:
# Set up Faker
faker = Faker(locale="nl-NL")
faker.seed_instance(42)

In [3]:
def generate_ids(n):
    """Generate ID-like strings."""
    country_code = "NLD"

    ids = []
    for _ in range(n):
        gender = random.choice(["M", "F"])
        first = f"{random.randint(0, 9999999):07d}"
        second = f"{random.randint(0, 9999999999999999):016d}"
        ids.append(f"{country_code}{first}{gender}{second}")
    return ids


def generate_names(n):
    """Generate dummy names."""
    return pd.Series([faker.name() for _ in range(n)])


def generate_birthdates(n):
    """Generate dummy birthdates."""
    return pd.Series(
        [
            faker.date_of_birth(minimum_age=18, maximum_age=100).strftime("%d-%m-%Y")
            for _ in range(n)
        ]
    )


In [4]:
n = 5_000
data = pd.DataFrame({
    "name": generate_names(n),
    "birthdate": generate_birthdates(n),
    "national_id": generate_ids(n),
})
data.sample(5)

Unnamed: 0,name,birthdate,national_id
1316,Berat van Laar-van Gent,28-07-2000,NLD2702794M6207063988425502
4405,Milo de Korte,29-02-1988,NLD3256212F9479598424320481
3272,Mads Pasman,17-08-1973,NLD0458953M6329519089602426
2695,Sebastiaan Janssen-de Boer,26-08-1938,NLD1752561M8304894656603334
952,Cas Klein,18-04-1966,NLD6586475F2421850998576461


In [5]:
encryption_key = AESGCM4Encryptor.generate_key()
encryption_key

b'\x94\xf1\x88\xa7o\x7f,(TF\xa5\x8a\xa2\xcd\xa1]'

In [6]:
config = {
    "name": {
        "algoritm": "vector",
        "weight": 0.2,
    },
    "birthdate": {
        "algoritm": "distance",
        "weight": 0.2,
    },
    "national_id": {
        "algoritm": "distance",
        "weight": 0.6
    },
}

In [7]:
matcher = PersonMatcher(config, encryption_key, "storage")

In [8]:
matcher.create(data)

In [13]:
result = matcher.get({
    "name": "Weijters, Hanna",
    "birthdate": "16-11-2000",
    "national_id": "nld9622792f4887410313628145"
})
result

Unnamed: 0,uuid,name,similarity_name,birthdate,similarity_birthdate,national_id,similarity_national_id,similarity
0,ba4dd24444bd4c00b8df4318629098fc,ali schellekens,0.000000,03 11 1998,0.4,nld3653744f4262821759363174,0.370370,0.302222
1,653d522a90cc485cac8aae86b38f33fd,finn jansdr goyaerts van waderle,0.000000,13 12 1977,0.4,nld7925249f3254059400218553,0.333333,0.280000
2,db1dbe80729a4137a6633c98232f199c,melle van brenen,0.000000,24 05 1995,0.2,nld8280615f6526040440452708,0.185185,0.151111
3,ae799dccea2f449b91812da7d19b916c,amin gellemeyer,0.000000,28 08 1977,0.2,nld1125775m0944850719978605,0.333333,0.240000
4,038ba69c37ed47f3b5823517f15a5766,floris van de elzas blonk,-0.063628,05 01 1954,0.3,nld6721726f7520047093650932,0.333333,0.247274
...,...,...,...,...,...,...,...,...
4995,fe10bf145f364f86adfa0d16717c7e4d,luke groenendaal,0.000000,04 10 1929,0.3,nld0912389m2219088377454267,0.222222,0.193333
4996,544d83c6f3cf4bc0b172f8a9b465c3e7,hamza van der wal,0.074125,07 11 1945,0.4,nld9251192f6488885430651466,0.444444,0.361492
4997,4d6d2ffbaba5463394a7437f625acd91,rik nollee,0.000000,31 05 2003,0.5,nld4708109m9767063211160647,0.222222,0.233333
4998,feaf209a50044f4880ce8a7539d132d8,kyara hemma van allemanie,-0.059131,13 06 1996,0.3,nld7658421f7821618347163268,0.333333,0.248174


In [14]:
result.sort_values(by="similarity", ascending=False)

Unnamed: 0,uuid,name,similarity_name,birthdate,similarity_birthdate,national_id,similarity_national_id,similarity
3569,a0820f40836c4b9e8e9da76e3c4636c3,hanna weijters,1.000000,17 11 2000,0.9,nld9622792f4887410313628145,1.000000,0.980000
746,8f55d4bff59f4f6dac524062c765ea6d,lot buijs pieters,0.214834,25 11 2003,0.7,nld9681095m9774808196203793,0.407407,0.427411
822,6ba959a370224383a431c5db952e5525,luc weijters,0.668994,13 06 1993,0.3,nld9691975f3592017364396501,0.370370,0.416021
3517,f63da54702f14bc0a7f1e289262fa0e5,aimee lagerweij,0.148250,26 12 2001,0.7,nld7125765f4261039312283062,0.407407,0.414094
1205,9c5e3b561e5a4ca4ad55ebbcdd27f651,mirthe schellekens,0.000000,25 11 2001,0.7,nld6379581f8751041393619188,0.444444,0.406667
...,...,...,...,...,...,...,...,...
4807,8b752979339f4422bf8d1876ba36bcfc,ravi bastiaanse,0.000000,21 06 1947,0.2,nld0445233m9160521598119630,0.148148,0.128889
4264,d74a262151a34ecabe429428ad94889d,oscar van gemert perrono,0.000000,24 09 1951,0.2,nld2106767m5319802878717069,0.148148,0.128889
4297,7e152d069b404e4b83c77685f728b061,ise van de plas,0.000000,08 08 1973,0.2,nld4170166m1525645199856336,0.148148,0.128889
3852,d5abba22872947359d64510f3ae3a57f,florian kuit,0.000000,24 02 1926,0.2,nld4530019m8366077699557380,0.148148,0.128889
