In [3]:
%pip install datasketch

Collecting datasketch
  Downloading datasketch-1.8.0-py3-none-any.whl.metadata (9.2 kB)
Downloading datasketch-1.8.0-py3-none-any.whl (96 kB)
Installing collected packages: datasketch
Successfully installed datasketch-1.8.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
from sklearn.neighbors import NearestNeighbors
from vertexai.language_models import TextEmbeddingModel
from datasketch import MinHash, MinHashLSH
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

In [3]:
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    print(outputs.last_hidden_state.shape)
    print(outputs.last_hidden_state.mean(dim=1).shape)
    print(outputs.last_hidden_state.mean(dim=1).squeeze().shape)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [4]:
texts = [
 "The earth is spherical.",
 "The earth is a planet.",
 "I like to eat at a restaurant."
]
query = "earth is flat"
embedding_text = np.array([get_embedding(text) for text in texts])
embedding_query = get_embedding(query)


torch.Size([1, 7, 384])
torch.Size([1, 384])
torch.Size([384])
torch.Size([1, 8, 384])
torch.Size([1, 384])
torch.Size([384])
torch.Size([1, 10, 384])
torch.Size([1, 384])
torch.Size([384])
torch.Size([1, 5, 384])
torch.Size([1, 384])
torch.Size([384])


In [5]:
nn = NearestNeighbors(n_neighbors=3, algorithm="brute", metric="cosine")
nn.fit(embedding_text)

0,1,2
,n_neighbors,3
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


# p -> metric="minkowski"

## p = 1 → Manhattan distance (L1 norm)

## p = 2 → Euclidean distance (L2 norm)

In [6]:
nn.leaf_size.as_integer_ratio()

(30, 1)

In [7]:
naive_distances, naive_indices = nn.kneighbors(np.expand_dims(embedding_query, axis = 0))
naive_distances, naive_indices

(array([[0.28763735, 0.37929606, 0.9398545 ]], dtype=float32),
 array([[0, 1, 2]]))

In [8]:
nn = NearestNeighbors(n_neighbors=3, algorithm='kd_tree')
nn.fit(embedding_text)

0,1,2
,n_neighbors,3
,radius,1.0
,algorithm,'kd_tree'
,leaf_size,30
,metric,'minkowski'
,p,2
,metric_params,
,n_jobs,


In [9]:
naive_distances, naive_indices = nn.kneighbors(np.expand_dims(embedding_query, axis = 0))
naive_distances, naive_indices

(array([[5.5001827 , 6.13302296, 9.08085319]]), array([[0, 1, 2]]))

In [10]:
nn = NearestNeighbors(n_neighbors=3, algorithm='ball_tree')
nn.fit(embedding_text)

0,1,2
,n_neighbors,3
,radius,1.0
,algorithm,'ball_tree'
,leaf_size,30
,metric,'minkowski'
,p,2
,metric_params,
,n_jobs,


In [11]:
naive_distances, naive_indices = nn.kneighbors(np.expand_dims(embedding_query, axis = 0))
naive_distances, naive_indices

(array([[5.5001827 , 6.13302296, 9.08085319]]), array([[0, 1, 2]]))

In [13]:
def get_minhash(text, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for word in text.split():
        m.update(word.encode('utf8'))
    return m

In [17]:
minhashes = [get_minhash(t) for t in texts]
query_minhash = get_minhash(query)
query_minhash.generator

<bound method MinHash.generator of <class 'datasketch.minhash.MinHash'>>

In [18]:
lsh = MinHashLSH(threshold=0.3, num_perm=128)
for i, mh in enumerate(minhashes):
    lsh.insert(f"text_{i}", mh)

In [19]:
result = lsh.query(query_minhash)
print(result)

[]
