In [1]:
import numpy as np
import pandas as pd
from faker import Faker

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import HashingVectorizer

In [2]:
# Generate dummy names.
n = 5_000_000
faker = Faker(locale="nl-NL")
faker.seed_instance(42)
names = pd.Series([faker.name() for _ in range(n)])
names.head(3)

0                     Ali Schellekens
1    Finn Jansdr-Goyaerts van Waderle
2                    Melle van Brenen
dtype: object

In [3]:
import uuid
from pathlib import Path

import pandas as pd
from cryptography.fernet import Fernet
from scipy import sparse
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity


class EncryptedMatcher:
    """Fuzzy matching for large sets of encrypted names."""

    def __init__(
        self,
        storage_path: str,
        topn: int = 10,
        encoding: str = "utf8",
        n_features: int = 2**20,
        # batch_size: int = 500_000,
    ):
        self._topn = -topn
        self._encoding = encoding
        self._storage_path = Path(storage_path)
        self._storage_path.mkdir(exist_ok=True)
        # self._batch_size = batch_size

        self._vectorizer = HashingVectorizer(
            encoding=encoding,
            n_features=n_features,
            ngram_range=(3, 3),
            analyzer="char_wb",
            lowercase=True,
            strip_accents="ascii",
        )

    def add_names(self, names: pd.Series, encryption_key: bytes):
        """Store encrypted and vectorized names."""
        encryptor = Fernet(encryption_key)
        encrypted = names.map(lambda n: encryptor.encrypt(n.encode(self._encoding)))
        encrypted = encrypted.str.decode("utf8")
        vectorized = self._vectorizer.fit_transform(names)

        # Store data.
        file_id = uuid.uuid1()
        sparse.save_npz(self._storage_path / f"{file_id}.npz", vectorized)
        encrypted.to_csv(self._storage_path / f"{file_id}.crypt", index=False)

    def search(self, names: str, encryption_key: bytes):
        """Search names in the vector space."""
        encryptor = Fernet(encryption_key)
        search_vectors = self._vectorizer.fit_transform(names)

        results = []
        vector_files = self._storage_path.glob("*.npz")
        for vector_file in vector_files:
            crypt_file = self._storage_path / (vector_file.stem + ".crypt")
            encrypted = pd.read_csv(crypt_file)
            vectors = sparse.load_npz(vector_file)

            similarities = cosine_similarity(search_vectors, vectors)
            top_matches = np.argpartition(similarities, self._topn)[:, self._topn:]

            for idx, matches in enumerate(top_matches):
                for match in matches:
                    matched_crypt = encrypted.iloc[match, 0]
                    results.append({
                        "name": names[idx],
                        "target": encryptor.decrypt(matched_crypt).decode(self._encoding),
                        "encrypted": matched_crypt,
                        "similarity": float(similarities[idx, match]),
                    })

        return results


In [4]:
matcher = EncryptedMatcher("vector_store", n_features=100_000)

In [5]:
encryption_key = Fernet.generate_key()
matcher.add_names(names, encryption_key)

In [6]:
matcher.search(["Ali Schellekens", "Alicia Schellekens"], encryption_key)

[{'name': 'Ali Schellekens',
  'target': 'Alicia Schellekens',
  'encrypted': 'gAAAAABnNvrA_nMpNOpyUAWpbqJ1ps3Ib63V3xnfoWfa6VzcJb-CrAwIbSuFs_SevlqAcHxRJnh5S3XlZ-aBkFvVfVJx5GlhN7HtketZAMjgO7YgnJECfdo=',
  'similarity': 0.8426648406178137},
 {'name': 'Ali Schellekens',
  'target': 'Alicia Schellekens',
  'encrypted': 'gAAAAABnNvqEGiGwvWXZQZXO58xot9mIwJLRz0QstyUMe8KBJpYCRRHfOLSX62yqBKUyn8b-aHBvT-4OBkd_12SPat1TpsPEWW5o7BMImlrbxwULjxYQYN8=',
  'similarity': 0.8426648406178137},
 {'name': 'Ali Schellekens',
  'target': 'Alicia Schellekens',
  'encrypted': 'gAAAAABnNvrn-44aRm0zIya_umc_pgXY9OgnUG3APNaputj4j9ib_ZOXIx0fa4XXIJrd2JoV4QiilmRrh1V1xmLjIy0vMNFnAbAcUzpT4HBgHE4t0CenZ7I=',
  'similarity': 0.8426648406178137},
 {'name': 'Ali Schellekens',
  'target': 'Alicia Schellekens',
  'encrypted': 'gAAAAABnNvrlxK58mXPSNbFvWmVEWISo2WdiTsX-SyEGhbVVtMHZopULLp3f9SLIFTp9Mp3asKYr57eLlfU-ZTSSDt9VlhZs775GmQb5mSUgcLEks-eqh8g=',
  'similarity': 0.8426648406178137},
 {'name': 'Ali Schellekens',
  'target': 'Al