In [25]:
import random

import pandas as pd

from thefuzz import fuzz

from rapidfuzz import fuzz as rapid_fuzz
from rapidfuzz.process import cdist
from rapidfuzz.distance.OSA import normalized_distance


In [26]:
def generate_ids():
    """Generate ID-like strings."""
    while True:
        country_code = "NLD"
        gender = random.choice(["M", "F"])
        first = f"{random.randint(0, 9999999):07d}"
        second = f"{random.randint(0, 9999999999999999):016d}"

        yield f"{country_code}{first}{gender}{second}"


def similarity(target_id: str, search_ids: pd.Series) -> pd.Series:
    """Compute Levenhstein distance between IDs."""
    return search_ids.map(lambda search_id: fuzz.ratio(target_id, search_id))


def rapid_similarity(target_id: str, search_ids: pd.Series) -> pd.Series:
    """Compute Levenhstein distance between IDs."""
    return pd.Series(
        cdist([target_id], search_ids, scorer=rapid_fuzz.ratio, workers=-1)[0]
    )


def rapid_distance(target_id: str, search_ids: pd.Series) -> pd.Series:
    """Compute Optimal String Alignment distance between IDs."""
    return pd.Series(
        cdist([target_id], search_ids, scorer=normalized_distance, workers=-1)[0]
    )

def equality(target_id: str, search_ids: pd.Series) -> pd.Series:
    """Compare an ID to a list of IDs."""
    return search_ids.map(lambda search_id: search_id == target_id)

In [5]:
id_generator = generate_ids()

In [6]:
n = 5_000_000
ids = pd.Series([next(id_generator) for _ in range(n)])
ids.sample(5)

3965952    NLD7862985M9584249290578852
2219008    NLD2620976M3543559410513773
1392104    NLD4190742F2322235162041885
1109549    NLD1127711F7430815707386695
2412081    NLD0493995M0682563977057378
dtype: object

In [None]:
top_n = 10

In [29]:
similarities = rapid_similarity(ids[0], ids).sort_values(ascending=False)
similarities.head(5)

0          100.000000
2808724     74.074074
749630      70.370369
4055081     70.370369
3743367     70.370369
dtype: float32

In [31]:
ids.iloc[similarities.index[0:5]]

0          NLD6979241F8477934648316939
2808724    NLD8679541F8875279148163939
749630     NLD8195179F7693444648126939
4055081    NLD6297242M7946413100639739
3743367    NLD4099141F8047744981926699
dtype: object

In [32]:
distances = rapid_distance(ids[0], ids).sort_values(ascending=True)
distances.head(5)

0          0.000000
1257044    0.370370
2379465    0.370370
1716543    0.407407
2952155    0.407407
dtype: float32

In [33]:
ids.iloc[distances.index[0:5]]

0          NLD6979241F8477934648316939
1257044    NLD6298243F4772934348316482
2379465    NLD0972341F8777994483968869
1716543    NLD5879261F8417365831868879
2952155    NLD6479011F3540794098716909
dtype: object

In [22]:
%timeit similarity(ids[0], ids)

5.3 s ± 129 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
%timeit rapid_similarity(ids[0], ids)

834 ms ± 16.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%timeit equality(ids[0], ids)

1.04 s ± 16.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
