In [None]:
from cryptography.fernet import Fernet

import pandas as pd
from faker import Faker

In [None]:
# Widen the column display limit so long encrypted tokens are shown in full.
pd.set_option("display.max_colwidth", 400)

In [None]:
# Generate dummy names.
# A Faker instance for the `nl-NL` locale is seeded with 42 so that every run
# produces the same 25 names; the first three are previewed below.
n = 25
faker = Faker(locale="nl-NL")
faker.seed_instance(42)
names = pd.Series([faker.name() for _ in range(n)])
names.head(3)

In [None]:
# Generate a Fernet encryption key.
# Note: Should be stored away from the matcher, e.g. in a KeyVault.
encryption_key = Fernet.generate_key()

In [None]:
from cryptography.fernet import Fernet

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors


class EncryptedMatcher:
    """Match names against a database whose entries are stored encrypted.

    Names are vectorized (character n-gram TF-IDF by default) and indexed
    with a nearest-neighbour model. Only Fernet-encrypted tokens are kept in
    the lookup table, so a match is reported as an encrypted token together
    with its distance to the query.

    Args:
        matches: Number of neighbours requested from the nearest-neighbour
            model for a query.
        threshold: Exclusive upper bound on the distance of a returned match.
        encoding: Text encoding passed to the default vectorizer and used to
            turn names into bytes before encryption.
        vectorizer: Optional replacement for the default ``TfidfVectorizer``;
            it must provide ``fit`` and ``transform``.
    """

    def __init__(
        self, matches=3, threshold=0.5, encoding: str = "utf8", vectorizer=None
    ):
        self._encoding = encoding
        self._threshold = threshold

        # Compare against None explicitly: a caller-supplied vectorizer that
        # happens to evaluate as falsy must not be silently replaced.
        if vectorizer is None:
            vectorizer = TfidfVectorizer(
                encoding=encoding, ngram_range=(2, 3), analyzer="char_wb"
            )
        self._vectorizer = vectorizer
        self._model = NearestNeighbors(n_neighbors=matches, n_jobs=-1)
        # Filled by train() with the encrypted names, in training order.
        self._lookup = None

    def encrypt_names(self, names, encryption_key) -> pd.Series:
        """Encrypt names using Fernet encryption.

        Args:
            names: Series of plain-text names.
            encryption_key: Fernet key used to build the encryptor.

        Returns:
            A Series with the same index as ``names`` holding one Fernet
            token per name.
        """
        encryptor = Fernet(encryption_key)
        return names.map(
            lambda name: encryptor.encrypt(name.encode(self._encoding))
        )

    def _vectorize(self, names, train=False):
        """Vectorize names, fitting the vectorizer first when ``train`` is set."""
        if train:
            self._vectorizer.fit(names)

        return self._vectorizer.transform(names)

    def train(self, names, encryption_key):
        """Train the model on unencrypted names.

        Args:
            names: Series of plain-text names to index.
            encryption_key: Fernet key used to encrypt the lookup table.
        """
        # Fit KNN on the vectorized data.
        # Note: Could encrypt the model for more security.
        vectorized = self._vectorize(names, train=True)
        self._model.fit(vectorized)

        # Store encrypted names as lookup table. The order of ``names`` is
        # preserved, so row i of the fitted data corresponds to position i.
        self._lookup = self.encrypt_names(names, encryption_key)

    def predict(self, names):
        """Match names against the encrypted names.

        Only the first query in ``names`` is evaluated: the neighbours of any
        further entries are computed but not returned.

        Args:
            names: Iterable of plain-text query names.

        Returns:
            A list of ``(encrypted_name, distance)`` tuples for the neighbours
            of the first query whose distance is below the threshold.
        """
        vectorized = self._vectorize(names)
        distances, indices = self._model.kneighbors(vectorized)

        # The neighbour indices are row positions in the training data, so
        # the lookup must be positional (.iloc). Label-based indexing breaks
        # when the training Series does not carry a default 0..n-1 index.
        return [
            (self._lookup.iloc[index], distance)
            for index, distance in zip(indices[0], distances[0])
            if distance < self._threshold
        ]

In [None]:
# Create a matcher with the constructor's default settings.
matcher = EncryptedMatcher()


In [None]:
# Fit the matcher on the plain-text names; it stores them only in encrypted form.
matcher.train(names, encryption_key)

In [None]:
# Match the first name exactly.
# `names[[0]]` selects a one-element Series, so a single query is passed in.
matches = matcher.predict(names[[0]])
pd.DataFrame(matches, columns=["encrypted", "distance"])

In [None]:
# Match the first name with first and last name swapped.
# Note: The distance value has increased significantly.
# NOTE(review): the literal assumes the seeded Faker run yields this person as
# the first generated name - confirm against the preview of `names`.
matches = matcher.predict(["Schellekens, Ali"])
pd.DataFrame(matches, columns=["encrypted", "distance"])

In [None]:
# Check results against the original names.
# Fernet tokens embed a timestamp and a random IV, so encrypting a name again
# never reproduces the token stored in the matcher. Show the stored tokens and
# decrypt them with the key instead, so the tokens returned by predict() can
# be traced back to a name.
encryptor = Fernet(encryption_key)
pd.DataFrame({
    "name": names,
    "encrypted": matcher._lookup,
    "decrypted": matcher._lookup.map(
        lambda token: encryptor.decrypt(token).decode("utf8")
    ),
})

In [None]:
# Model lookup table only contains the encrypted names.
# Without the encryption key, you cannot decrypt these.
matcher._lookup[0:5]

In [None]:
# No PII data in the vectorizer.
# With the default vectorizer settings the vocabulary consists of character
# 2- and 3-grams rather than whole names.
matcher._vectorizer.vocabulary_

In [None]:
# Fitted NN model is not interpretable.
# Prints the model's private `_fit_X` attribute - presumably the vectorized
# training matrix kept by scikit-learn; verify against the installed version.
print(matcher._model._fit_X)

In [None]:
# Dump every attribute of the fitted model, except double-underscore names,
# to inspect what it stores.
for attrib in dir(matcher._model):
    if not attrib.startswith("__"):
        print(attrib)
        print(getattr(matcher._model, attrib))
        print("---")
