## MinHashEncoder

In [1]:
import numpy as np
import pandas as pd
from faker import Faker
from sklearn.metrics.pairwise import cosine_similarity
from skrub import MinHashEncoder

In [2]:
# Generate dummy names.
n = 50_000
faker = Faker(locale="nl-NL")
faker.seed_instance(42)
names = pd.Series([faker.name() for _ in range(n)], name="Name")
names.head(3)

0                     Ali Schellekens
1    Finn Jansdr-Goyaerts van Waderle
2                    Melle van Brenen
Name: Name, dtype: object

In [3]:
encoder = MinHashEncoder(n_components=70, ngram_range=(2, 3))

In [4]:
vectors = encoder.fit_transform(names)
vectors

Unnamed: 0,Name_00,Name_01,Name_02,Name_03,Name_04,Name_05,Name_06,Name_07,Name_08,Name_09,...,Name_60,Name_61,Name_62,Name_63,Name_64,Name_65,Name_66,Name_67,Name_68,Name_69
0,-1.716479e+09,-1.778316e+09,-2.101514e+09,-1.730974e+09,-2.128262e+09,-1.825160e+09,-1.894875e+09,-1.666884e+09,-2.015478e+09,-1.992387e+09,...,-1.930386e+09,-1.925960e+09,-1.879779e+09,-1.862996e+09,-2.132644e+09,-2.107010e+09,-1.815470e+09,-2.004183e+09,-1.939591e+09,-1.748682e+09
1,-2.085421e+09,-1.929279e+09,-2.009561e+09,-2.118021e+09,-2.110896e+09,-1.944646e+09,-2.139137e+09,-2.063006e+09,-1.842104e+09,-1.975331e+09,...,-2.050375e+09,-1.902111e+09,-1.976323e+09,-2.103074e+09,-1.806967e+09,-2.107010e+09,-1.619958e+09,-2.096737e+09,-1.939591e+09,-1.679643e+09
2,-2.071476e+09,-2.022347e+09,-2.042901e+09,-1.912757e+09,-1.270586e+09,-1.653409e+09,-2.002096e+09,-1.905811e+09,-2.015478e+09,-1.730845e+09,...,-1.930386e+09,-1.762132e+09,-1.879779e+09,-2.052574e+09,-1.390767e+09,-2.062283e+09,-1.815470e+09,-1.628268e+09,-1.543910e+09,-1.740218e+09
3,-2.085421e+09,-1.525464e+09,-2.089005e+09,-2.100772e+09,-1.623990e+09,-2.054841e+09,-2.002096e+09,-2.063006e+09,-2.015478e+09,-1.945066e+09,...,-1.930386e+09,-1.737145e+09,-2.061460e+09,-1.142761e+09,-1.351528e+09,-1.623835e+09,-1.919925e+09,-1.506340e+09,-2.064106e+09,-2.095129e+09
4,-2.071476e+09,-2.022347e+09,-2.110942e+09,-2.042778e+09,-2.099561e+09,-1.716888e+09,-2.002096e+09,-1.950354e+09,-1.984807e+09,-1.814161e+09,...,-2.061668e+09,-2.054715e+09,-1.551275e+09,-2.125856e+09,-2.134989e+09,-1.907864e+09,-2.138275e+09,-1.773842e+09,-1.971879e+09,-1.901315e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,-2.071476e+09,-1.539906e+09,-1.331059e+09,-1.694582e+09,-1.317381e+09,-1.962738e+09,-2.002096e+09,-1.905811e+09,-2.075504e+09,-1.730845e+09,...,-4.943174e+08,-1.491757e+09,-9.713734e+08,-1.562898e+09,-1.727266e+09,-1.878095e+09,-1.460454e+09,-1.955254e+09,-2.048436e+09,-8.456241e+08
49996,-1.793608e+09,-2.022347e+09,-1.805620e+09,-1.869686e+09,-2.124242e+09,-1.948611e+09,-1.777688e+09,-1.810457e+09,-1.984807e+09,-2.134754e+09,...,-1.766762e+09,-2.054715e+09,-2.146597e+09,-2.125856e+09,-2.134989e+09,-1.992745e+09,-1.849636e+09,-2.121201e+09,-1.847702e+09,-1.967887e+09
49997,-2.071476e+09,-1.588270e+09,-2.042901e+09,-1.557503e+09,-2.108886e+09,-1.653409e+09,-2.002096e+09,-1.905811e+09,-2.075504e+09,-1.998387e+09,...,-1.930386e+09,-1.739066e+09,-1.960902e+09,-1.726070e+09,-2.092182e+09,-2.062283e+09,-1.815470e+09,-1.632134e+09,-2.048436e+09,-1.901315e+09
49998,-2.085421e+09,-2.086339e+09,-2.146331e+09,-2.066705e+09,-2.128262e+09,-1.481777e+09,-2.139137e+09,-1.817268e+09,-1.859980e+09,-1.975331e+09,...,-2.099368e+09,-1.551034e+09,-2.142583e+09,-2.103074e+09,-1.910035e+09,-1.984347e+09,-1.815470e+09,-2.068406e+09,-2.007890e+09,-2.073922e+09


In [47]:
targets = names[10:13]
targets


10          Jonathan Gerritsen
11    Sanne Waltrade Walderade
12              Puk Broekhoven
Name: Name, dtype: object

In [48]:
target_vectors = encoder.transform(targets)
target_vectors

Unnamed: 0,Name_00,Name_01,Name_02,Name_03,Name_04,Name_05,Name_06,Name_07,Name_08,Name_09,...,Name_60,Name_61,Name_62,Name_63,Name_64,Name_65,Name_66,Name_67,Name_68,Name_69
10,-2126502000.0,-1929279000.0,-1749492000.0,-2036166000.0,-2103581000.0,-2054841000.0,-2002096000.0,-1650127000.0,-2096258000.0,-1945066000.0,...,-1957371000.0,-2080558000.0,-1879779000.0,-1906129000.0,-1910035000.0,-1991826000.0,-1815470000.0,-1663117000.0,-2064106000.0,-1922522000.0
11,-2085421000.0,-2022347000.0,-1915650000.0,-1895533000.0,-2141608000.0,-1926938000.0,-2139137000.0,-1663749000.0,-1835063000.0,-1975331000.0,...,-2050375000.0,-1893664000.0,-1976323000.0,-2103074000.0,-1766505000.0,-2001144000.0,-1849636000.0,-1760975000.0,-1543910000.0,-1967887000.0
12,-1522335000.0,-2008402000.0,-1543230000.0,-1574347000.0,-2108886000.0,-1492517000.0,-2030232000.0,-1414334000.0,-1845287000.0,-2087433000.0,...,-1560271000.0,-1925960000.0,-2100014000.0,-1820899000.0,-1701193000.0,-2062283000.0,-1815470000.0,-2004183000.0,-1819948000.0,-1740218000.0


In [49]:
similarities = cosine_similarity(target_vectors, vectors)
top_matches = np.argpartition(similarities, -10)[:, -10:]
top_matches

array([[13526,  2321,  6098, 41628, 17551, 42753, 34691,  3587, 15185,
           10],
       [ 9128, 36191, 41872, 32582,  9960, 42722, 16765, 24219, 49048,
           11],
       [20628,  1500, 12423, 42352, 27165, 31026,  8915, 15083, 42481,
           12]])

In [51]:
for target_idx, matches in enumerate(top_matches):
    print(f"Results for: {targets.iloc[target_idx]}")
    print("-" * 60)
    for match in top_matches[target_idx]:
        print(f"{names.iloc[match]:50s} -- {similarities[target_idx][match]:.3f}")
    print("-" * 60)

Results for: Jonathan Gerritsen
------------------------------------------------------------
Elisabeth Oostveen-van 't Riet                     -- 0.996
Jonathan Verboom                                   -- 0.996
Jonathan Fortuyn-Stange                            -- 0.996
Owen Beourgeois-Gerritse                           -- 0.996
Indy Risma-van der Berg                            -- 0.996
Gerrit van Brunswijk-Bruijne van der Veen          -- 0.996
Ayden Wever-Gerritsen                              -- 0.996
Megan Heyne-van den Nieuwenhuijsen                 -- 0.996
Jonathan Blees-Blom                                -- 0.996
Jonathan Gerritsen                                 -- 1.000
------------------------------------------------------------
Results for: Sanne Waltrade Walderade
------------------------------------------------------------
Seth Waltrade Walderade                            -- 0.995
Bente Waltrade Walderade                           -- 0.995
Nathan Waltrade Walderade  