In [2]:
import random

import pandas as pd

from thefuzz import fuzz

from rapidfuzz import fuzz as rapid_fuzz
from rapidfuzz.process import cdist
from rapidfuzz.distance.OSA import normalized_distance


In [3]:
def generate_ids(country_code: str = "NLD", rng=None):
    """Yield an endless stream of random ID-like strings.

    Each ID has the layout ``<country_code><7 digits><M|F><16 digits>``.
    With the default three-letter country code that is 27 characters,
    e.g. ``NLD0692483F3900925858137983``.

    Args:
        country_code: Prefix placed at the start of every generated ID.
        rng: Optional ``random.Random`` instance to draw from. Pass a seeded
            instance to get a reproducible stream. When omitted, the
            module-level ``random`` functions are used, so ``random.seed(...)``
            still controls the output.

    Yields:
        One ID string per iteration. The generator never terminates, so the
        caller has to bound consumption (e.g. with ``next`` in a counted loop).
    """
    # The `random` module exposes the same `choice`/`randint` API as a
    # `random.Random` instance, so it can serve as the default source.
    source = random if rng is None else rng

    while True:
        # Draw order (gender, first, second) is kept stable so a given seed
        # always maps to the same sequence of IDs.
        gender = source.choice(["M", "F"])
        first = f"{source.randint(0, 9999999):07d}"
        second = f"{source.randint(0, 9999999999999999):016d}"

        yield f"{country_code}{first}{gender}{second}"


def similarity(target_id: str, search_ids: pd.Series) -> pd.Series:
    """Score every ID in ``search_ids`` against ``target_id`` with thefuzz.

    The scorer is ``thefuzz.fuzz.ratio``, which yields a similarity ratio on
    a 0-100 scale (higher means more alike) rather than an edit distance.
    It is invoked once per element through ``Series.map``.

    Args:
        target_id: The ID every candidate is compared with.
        search_ids: Candidate IDs to score.

    Returns:
        A Series of ratio scores carrying the same index as ``search_ids``.
    """

    def score_against_target(candidate):
        return fuzz.ratio(target_id, candidate)

    return search_ids.map(score_against_target)


def rapid_similarity(target_id: str, search_ids: pd.Series) -> pd.Series:
    """Score every ID in ``search_ids`` against ``target_id`` with RapidFuzz.

    The scorer is ``rapidfuzz.fuzz.ratio``, a similarity ratio on a 0-100
    scale (100 for an identical string), not an edit distance. All
    comparisons are evaluated in a single batched ``cdist`` call with
    ``workers=-1`` (use all available cores).

    Args:
        target_id: The ID every candidate is compared with.
        search_ids: Candidate IDs to score.

    Returns:
        A Series of ratio scores that carries the same index as
        ``search_ids``, so the result aligns with the input for boolean
        masks and ``.loc`` lookups (matching what ``similarity`` returns).
    """
    # cdist produces a (queries x choices) score matrix; with one query the
    # only row holds the score of every candidate.
    scores = cdist([target_id], search_ids, scorer=rapid_fuzz.ratio, workers=-1)[0]

    # Re-attach the caller's index instead of letting pandas invent 0..n-1.
    return pd.Series(scores, index=search_ids.index)


def rapid_distance(target_id: str, search_ids: pd.Series) -> pd.Series:
    """Compute the normalized OSA distance from ``target_id`` to every ID.

    The scorer is ``rapidfuzz.distance.OSA.normalized_distance`` (Optimal
    String Alignment), which is 0.0 for identical strings and grows towards
    1.0 as the strings diverge. All comparisons are evaluated in a single
    batched ``cdist`` call with ``workers=-1`` (use all available cores).

    Args:
        target_id: The ID every candidate is compared with.
        search_ids: Candidate IDs to measure.

    Returns:
        A Series of normalized distances that carries the same index as
        ``search_ids``, so the result aligns with the input for boolean
        masks and ``.loc`` lookups.
    """
    # cdist produces a (queries x choices) matrix; with one query the only
    # row holds the distance to every candidate.
    distances = cdist(
        [target_id], search_ids, scorer=normalized_distance, workers=-1
    )[0]

    # Re-attach the caller's index instead of letting pandas invent 0..n-1.
    return pd.Series(distances, index=search_ids.index)

def equality(target_id: str, search_ids: pd.Series) -> pd.Series:
    """Flag which entries of ``search_ids`` are exactly equal to ``target_id``.

    The comparison is applied one element at a time through ``Series.map``.

    NOTE(review): ``search_ids == target_id`` would be the vectorized form;
    the element-wise ``map`` is presumably kept as a per-element timing
    baseline for the fuzzy scorers - confirm before changing it.

    Args:
        target_id: The ID to look for.
        search_ids: Candidate IDs to test.

    Returns:
        A Series with the index of ``search_ids`` whose values are ``True``
        where the candidate equals ``target_id`` and ``False`` elsewhere.
    """

    def is_exact_match(candidate):
        return candidate == target_id

    return search_ids.map(is_exact_match)

In [4]:
# Seed the stdlib RNG before creating the generator so that a fresh
# "Restart Kernel -> Run All" regenerates exactly the same synthetic IDs.
random.seed(42)
id_generator = generate_ids()

In [5]:
# Number of synthetic IDs to materialise from the (endless) generator.
n = 5_000_000
ids = pd.Series([next(id_generator) for _ in range(n)])
# Fixed random_state so the spot-check rows shown below are the same on
# every run instead of a different random sample each time.
ids.sample(5, random_state=0)

3752767    NLD0692483F3900925858137983
3831479    NLD5856613M7312244015824872
2880786    NLD9058402M3025470748439810
3359251    NLD1606522F6870474771725584
4501183    NLD8753790F7431314311308451
dtype: object

In [6]:
# How many of the best-ranked matches to display in the cells below.
top_n = 10

In [7]:
# Score every ID (including the query itself) against the first generated ID
# and rank best-first: a higher ratio means a closer match.
similarities = rapid_similarity(ids[0], ids).sort_values(ascending=False)
similarities.head(top_n)

0          100.000000
1743341     70.370369
359561      70.370369
4998098     70.370369
4512947     70.370369
4763983     70.370369
4450562     70.370369
2578677     70.370369
2279892     70.370369
3266551     70.370369
dtype: float32

In [8]:
# Show the IDs behind the top_n scores. The sorted scores keep their original
# index labels, which here are row positions in `ids`.
ids.iloc[similarities.index[0:top_n]]

0          NLD5138782M8106782233209304
1743341    NLD4535882M8978237695352090
359561     NLD3591847M8101738922553309
4998098    NLD0387820M1623392936063450
4512947    NLD5137862M2808200338109193
4763983    NLD1238828M8036470862240230
4450562    NLD6589828F1107126233720730
2578677    NLD5348782M0496053329385046
2279892    NLD7873872M5810718283393901
3266551    NLD3561878M1267487233208549
dtype: object

In [9]:
# Measure the distance from the first generated ID to every ID (including
# itself) and rank best-first: a smaller distance means a closer match.
distances = rapid_distance(ids[0], ids).sort_values(ascending=True)
distances.head(top_n)

0          0.000000
272586     0.407407
2152479    0.407407
2161794    0.407407
1442101    0.407407
4003177    0.407407
1131164    0.407407
4791544    0.407407
4224757    0.407407
126786     0.407407
dtype: float32

In [10]:
# Show the IDs behind the top_n smallest distances. The sorted distances keep
# their original index labels, which here are row positions in `ids`.
ids.iloc[distances.index[0:top_n]]

0          NLD5138782M8106782233209304
272586     NLD5113822M7636742235509006
2152479    NLD5190078M1029978263320974
2161794    NLD5387758F0037814233509354
1442101    NLD8138733M8360702833587204
4003177    NLD8719078M2815622335209357
1131164    NLD6183382M8106292236074749
4791544    NLD5195142M8906783260286204
4224757    NLD2638780F8113902233069324
126786     NLD5336682M8104762836950931
dtype: object

In [11]:
# Timing: thefuzz ratio, called once per ID through Series.map.
%timeit similarity(ids[0], ids)

5.51 s ± 92.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# Timing: RapidFuzz ratio, one batched cdist call with workers=-1.
%timeit rapid_similarity(ids[0], ids)

836 ms ± 1.38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# Timing: RapidFuzz normalized OSA distance, one batched cdist call with workers=-1.
%timeit rapid_distance(ids[0], ids)

1.19 s ± 3.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Timing: plain string equality, evaluated once per ID through Series.map.
%timeit equality(ids[0], ids)

969 ms ± 5.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
