In [1]:
import random

import pandas as pd
from rapidfuzz import fuzz as rapid_fuzz
from thefuzz import fuzz

In [2]:
def generate_ids(rng=None):
    """Yield an endless stream of random ID-like strings.

    Each ID is 27 characters long: the fixed country code ``NLD``, a
    zero-padded 7-digit number, a gender letter (``M`` or ``F``) and a
    zero-padded 16-digit number, e.g. ``NLD5636495M4393533685427962``.

    Args:
        rng: Optional ``random.Random`` instance to draw from. Pass a seeded
            instance to get a reproducible stream. When omitted, the
            module-level ``random`` functions (global state) are used.

    Yields:
        str: The next generated ID. The generator never terminates, so pull
        values with ``next()`` or slice it; do not iterate it to exhaustion.
    """
    # The ``random`` module exposes the same ``choice``/``randint`` API as a
    # ``random.Random`` instance, so either can act as the source.
    source = random if rng is None else rng
    country_code = "NLD"  # constant, so set once outside the loop

    while True:
        # Keep the draw order (gender, first, second) fixed: for a given seed
        # the produced sequence depends on it.
        gender = source.choice(["M", "F"])
        first = f"{source.randint(0, 9999999):07d}"
        second = f"{source.randint(0, 9999999999999999):016d}"

        yield f"{country_code}{first}{gender}{second}"


def similarity(target_id: str, search_ids: pd.Series) -> pd.Series:
    """Score each ID in ``search_ids`` against ``target_id`` using ``thefuzz``.

    ``fuzz.ratio`` is applied element-wise, so the result holds one
    similarity ratio per ID (a score, not an edit distance) and keeps the
    index of ``search_ids``.
    """

    def score(candidate):
        return fuzz.ratio(target_id, candidate)

    return search_ids.map(score)

def rapid_similarity(target_id: str, search_ids: pd.Series) -> pd.Series:
    """Score each ID in ``search_ids`` against ``target_id`` using ``rapidfuzz``.

    ``rapid_fuzz.ratio`` is applied element-wise, so the result holds one
    similarity ratio per ID (a score, not an edit distance) and keeps the
    index of ``search_ids``. An ID identical to ``target_id`` scores 100.
    """

    def score(candidate):
        return rapid_fuzz.ratio(target_id, candidate)

    return search_ids.map(score)

def equality(target_id: str, search_ids: pd.Series) -> pd.Series:
    """Flag which IDs in ``search_ids`` are exactly equal to ``target_id``.

    The comparison runs element-wise through ``Series.map`` (the same
    mechanism the similarity helpers use) and the result keeps the index of
    ``search_ids``.
    """

    def is_match(candidate):
        return candidate == target_id

    return search_ids.map(is_match)

In [3]:
# Create a single shared generator; later cells pull IDs from it with next(),
# so its position persists across cells for the lifetime of the kernel.
id_generator = generate_ids()

In [4]:
# Number of synthetic IDs to generate for the benchmark.
n = 5_000_000
# Pull n IDs from the shared generator into a Series. Re-running this cell
# advances the same generator, so it produces a different set of IDs.
ids = pd.Series([next(id_generator) for _ in range(n)])
# Display five randomly chosen rows as a quick check of the ID format.
ids.sample(5)

3095785    NLD5636495M4393533685427962
72707      NLD6751968M5662473946840749
4985477    NLD9921820M8853521226065694
1940558    NLD1360201F1889821708676287
2205037    NLD9851286F1361158292033152
dtype: object

In [None]:
# Score every ID against the ID labelled 0 and list the scores from highest to lowest.
# NOTE(review): the saved output for this cell is ordered lowest-to-highest and
# the cell has no execution count, so that output looks stale relative to this
# code. Re-run the cell to confirm.
rapid_similarity(ids[0], ids).sort_values(ascending=False)

4449134     22.222222
4350803     22.222222
4277187     22.222222
1122943     22.222222
4135562     22.222222
              ...    
804267      70.370370
941431      70.370370
3078976     74.074074
1459938     74.074074
0          100.000000
Length: 5000000, dtype: float64

In [6]:
# Time the thefuzz-based scorer across the whole `ids` Series.
%timeit similarity(ids[0], ids)

5.14 s ± 69.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Time the rapidfuzz-based scorer on the same input for comparison.
%timeit rapid_similarity(ids[0], ids)

2.83 s ± 65.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# Time plain equality through the same Series.map mechanism as a reference point.
%timeit equality(ids[0], ids)

1.04 s ± 16.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
