In [1]:
import random

import pandas as pd
from faker import Faker

from match_person import PersonMatcher
from encryption import AESGCM4Encryptor

In [2]:
# Set up Faker
faker = Faker(locale="nl-NL")
faker.seed_instance(42)

In [3]:
def generate_ids(n):
    """Generate ID-like strings."""
    country_code = "NLD"

    ids = []
    for _ in range(n):
        gender = random.choice(["M", "F"])
        first = f"{random.randint(0, 9999999):07d}"
        second = f"{random.randint(0, 9999999999999999):016d}"
        ids.append(f"{country_code}{first}{gender}{second}")
    return ids


def generate_names(n):
    """Generate dummy names."""
    return pd.Series([faker.name() for _ in range(n)])


def generate_birthdates(n):
    """Generate dummy birthdates."""
    return pd.Series(
        [
            faker.date_of_birth(minimum_age=18, maximum_age=100).strftime("%d-%m-%Y")
            for _ in range(n)
        ]
    )


In [4]:
n = 5_000
data = pd.DataFrame({
    "name": generate_names(n),
    "birthdate": generate_birthdates(n),
    "national_id": generate_ids(n),
})
data.sample(5)

Unnamed: 0,name,birthdate,national_id
4028,Sepp de Backer-van den Eerenbeemt,31-01-1998,NLD4098180F5178118263332397
1482,Timo Jonker,17-05-2002,NLD5315580M1913799088775485
123,Olivia van Allemanië,20-04-1969,NLD7937841F2790921242937279
4895,Catharina Schatteleijn,28-05-1993,NLD9532725F7932653706905667
2944,Jari Oversteeg,29-03-1976,NLD4715842F5052210478716120


In [5]:
encryption_key = AESGCM4Encryptor.generate_key()
encryption_key

b'?\xda\x940\x02_\xfdP\x0br\xd7\xa5\x02,\xd7\xb0'

In [6]:
config = {
    "name": {
        "algoritm": "vector",
        "weight": 0.2,
    },
    "birthdate": {
        "algoritm": "distance",
        "weight": 0.2,
    },
    "national_id": {
        "algoritm": "distance",
        "weight": 0.6
    },
}

In [7]:
matcher = PersonMatcher(10, config, encryption_key, "storage")

In [8]:
matcher.create(data)

In [9]:
result = matcher.get({
    "name": "Weijters, Hanna",
    "birthdate": "16-11-2000",
    "national_id": "nld9622792f4887410313628145"
})
result

Unnamed: 0,uuid,name,similarity_name,birthdate,similarity_birthdate,national_id,similarity_national_id,similarity
3569,c033c79a38424fa59bdbeaf576808f8c,hanna weijters,1.0,17 11 2000,0.9,nld2059966f2355650535694724,0.222222,0.513333
1802,f35f4bf236f6426f9e729fc405b758a3,frederique van gelder,0.0,26 10 2000,0.8,nld4642783f0894747639913213,0.444444,0.426667
512,fbc7b33d04564d4d9e84a3a2917c8c4a,wessel mulders,0.230769,11 11 2000,0.9,nld2692486f8378986105640428,0.333333,0.426154
2335,aea908cad02745bf9bdec4cc653e27a6,kim hanegraaff,0.153846,16 12 1960,0.6,nld9839759m1435710843200651,0.407407,0.395214
1065,747f78c68b694cb38d7177b0ebb8a0c3,stef muller,0.0,12 01 2002,0.7,nld7324606f6294553436218385,0.407407,0.384444
4706,c1cfca518256421dbb0464c380a2bf9a,dina smit,0.098058,08 12 2001,0.6,nld8682763f1785496305127957,0.407407,0.384056
3443,0ad7a522b06e4847b948492a2d5fc481,dylano westerbeek,0.138675,12 06 2004,0.6,nld2622137f9468423891205974,0.37037,0.369957
3517,aced32ed50eb41e1a1e54fbd2b5d332e,aimee lagerweij,0.14825,26 12 2001,0.7,nld9181732f9941395976457101,0.333333,0.36965
1764,72873a7b9eeb4eb383752add7d5137c2,jens hazenveld,0.153846,26 11 2005,0.8,nld4766691m7895813956650950,0.296296,0.368547
3112,8873590c80324db1b5c3cdd8b51fc14f,floortje hofman,0.0,31 08 2002,0.5,nld1662012f2817981212686804,0.444444,0.366667
