## DEMO: MinHash Vectorizing

In [None]:
%pip install numpy pandas faker sklearn skrub

In [None]:
import numpy as np
import pandas as pd
from faker import Faker
from sklearn.metrics.pairwise import cosine_similarity
from skrub import MinHashEncoder

In [None]:
# Build a reproducible pool of synthetic names (nl-NL locale) to encode.
n = 50_000
faker = Faker(locale="nl-NL")
faker.seed_instance(42)  # fixed seed so reruns generate the same names
names = pd.Series(
    data=[faker.name() for _ in range(n)],
    name="Name",
)
# Preview the first few generated names.
names.head(3)

In [None]:
# Configure the MinHash encoder used for every vectorization below.
# NOTE(review): presumably n_components is the length of each output vector
# and ngram_range selects character 2- and 3-grams — confirm in skrub docs.
encoder = MinHashEncoder(
    n_components=70,
    ngram_range=(2, 3),
)

In [None]:
# Fit the encoder on the full name pool and encode every name in one step.
# NOTE(review): presumably one row per name and one column per MinHash
# component — confirm against the displayed output.
vectors = encoder.fit_transform(names)
# Bare expression: the notebook displays the encoded result.
vectors

In [None]:
# Pick three names (positions 10, 11 and 12) to use as search queries.
targets = names.iloc[10:13]
targets


In [None]:
# Encode the query names with the encoder fitted above; only transform() is
# called here, so the encoder is not refitted on the targets.
target_vectors = encoder.transform(targets)
# Bare expression: the notebook displays the encoded targets.
target_vectors

In [None]:
# Pairwise cosine similarity: one row per target, one column per encoded name.
similarities = cosine_similarity(target_vectors, vectors)

# Keep at most 10 candidates per target (fewer if the pool is smaller).
top_k = min(10, similarities.shape[1])

# argpartition is cheaper than a full sort but only guarantees that the
# top_k largest values land in the last top_k slots, in arbitrary order.
candidates = np.argpartition(similarities, -top_k, axis=1)[:, -top_k:]

# Rank just those candidates so each row of top_matches is ordered from the
# most similar name to the least similar one.
candidate_scores = np.take_along_axis(similarities, candidates, axis=1)
ranking = np.argsort(-candidate_scores, axis=1)
top_matches = np.take_along_axis(candidates, ranking, axis=1)
top_matches

In [None]:
# Print one report per target: the target name, then each matched name with
# its cosine similarity, in the order the matches are stored in top_matches.
separator = "-" * 60
# targets, top_matches and similarities are aligned row by row, so iterate
# them in lockstep instead of re-indexing each one by position.
for target_name, matches, scores in zip(targets, top_matches, similarities):
    print(f"Results for: {target_name}")
    print(separator)
    for match in matches:
        # match is a positional index into names and into this target's scores.
        print(f"{names.iloc[match]:50s} -- {scores[match]:.3f}")
    print(separator)