## DEMO: Hashing Trick and Cosine Similarity

In [None]:
%pip install mmh3 pandas sklearn

In [None]:
import sys

import mmh3
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Vectorizing names

In [None]:
# Build a vectorizer that counts character trigrams.
# "char_wb" only creates n-grams from text inside word boundaries.
vectorizer = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

In [None]:
# Learn the trigram vocabulary from three example names and encode each name
# as a row of trigram counts (sparse matrix; densified only for display).
vectors = vectorizer.fit_transform(["mark jansen", "jip jansen", "mark vos"])
vectors.todense()

In [None]:
# Show the generated vectors with one labelled column per learned trigram.
# Note: trigrams at the edges of a word are padded with a space, so they
# contain fewer letters than trigrams from the middle of a word.
pd.DataFrame(vectors.todense(), columns=vectorizer.get_feature_names_out())

In [None]:
# Compute the pairwise cosine similarities between the three name vectors.
# Entry (i, j) of the result compares name i with name j.
cosine_similarity(vectors)

In [None]:
# Fitting the CountVectorizer learned a vocabulary from the input data:
# a mapping from each trigram to its column index in the vectors.
vectorizer.vocabulary_

In [None]:
# Vocabulary can get large...
# sys.getsizeof is shallow: on a dict it measures only the dict's own table,
# not the trigram strings and column indices stored in it. Add those sizes
# explicitly to get an (approximate) total footprint in bytes.
sys.getsizeof(vectorizer.vocabulary_) + sum(
    sys.getsizeof(ngram) + sys.getsizeof(column)
    for ngram, column in vectorizer.vocabulary_.items()
)

### Hashing trick

In [None]:
# Hash a single trigram to an integer with MurmurHash3 (mmh3).
mmh3.hash("mar")

In [None]:
# Take the hash modulo the number of columns (10 here) to turn it into a
# column index in range(10); Python's % is non-negative for a positive modulus.
mmh3.hash("mar") % 10

In [None]:
# Use the hashing trick to determine columns: each n-gram is hashed and the
# hash is reduced modulo the number of columns, so no vocabulary is needed.
n_columns = 10
ngrams = ["mar", "ark", "vos", "jan", "ans", "nse", "sen", "jip"]

for token in ngrams:
    hashed = mmh3.hash(token)  # hash once, reuse for both printed values
    print("Token:       ", token)
    print("Hashed:      ", hashed)
    print("Column:      ", hashed % n_columns)
    print("-" * 30)

In [None]:
# Create a HashingVectorizer with the same trigram settings as the
# CountVectorizer, but with a fixed number of output columns.
vectorizer = HashingVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 3),
    n_features=20,  # fixed output width, independent of the input data
    alternate_sign=False,  # keep every count non-negative
    norm=None,  # keep raw counts instead of normalized rows
)

In [None]:
# Hash the same three names into fixed-width count vectors.
# HashingVectorizer is stateless, so the "fit" step learns nothing here.
result = vectorizer.fit_transform(
    ["mark jansen", "jip jansen", "mark vos"]
)
result.todense()

In [None]:
# Compute the pairwise cosine similarities between the hashed name vectors.
# Note: the pattern is similar to the one obtained with the CountVectorizer.
cosine_similarity(result)