In [1]:
%load_ext autoreload
%autoreload all
import gensim.downloader
import numpy as np
from algorithms.RandomProjectionsLsh import RandomProjectionsLsh
from algorithms.PcaBasedLsh import PcaBasedLsh
from algorithms.MinkowskiLsh import L1Lsh, L2Lsh
from algorithms.PcaMinkowskiLsh import PcaL1Lsh, PcaL2Lsh
from algorithms.SuperBitLsh import SuperBitLsh
from algorithms.HierarchicalLsh import HierarchicalLsh
from algorithms.HierarchicalHyperplaneLsh import HierarchicalHyperplaneLsh

# Load vocabulary

In [2]:
import urllib.request


def load_common_words():
    """Download the common-English-words CSV and return its entries as a set.

    The first line of the file is treated as a header and skipped. Every
    remaining line is decoded as UTF-8 and stripped of surrounding whitespace;
    lines that are empty after stripping are ignored so that the empty string
    never ends up in the vocabulary.

    Returns:
        set[str]: the distinct words found in the file.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` when the download fails.
    """
    print('Loading words...')
    words = set()
    target_url = 'https://raw.githubusercontent.com/pkLazer/password_rank/master/4000-most-common-english-words-csv.csv'
    # The context manager closes the HTTP response even if decoding fails part-way.
    with urllib.request.urlopen(target_url) as response:
        lines = iter(response)
        next(lines, None)  # skip the header line (no-op on an empty response)
        for raw_line in lines:
            word = raw_line.decode('utf-8').strip()
            if word:
                words.add(word)
    print(f'Words loaded: {len(words)}')

    return words

# Module-level vocabulary (a set of words). Later cells read it as a global:
# compare_vocabulary_hashes hashes it and print_gensim_word_neighbourhood filters by it.
vocabulary = load_common_words()

Loading words...
Words loaded: 4319


# Embedding Models

Note: the first download of each model is very slow, and the first load of a model into memory is still slow; after that, each loaded model is cached and reused.

In [3]:
class EmbeddingsModel:
    """Wrapper around a pretrained gensim embeddings model, cached per model name.

    Obtain instances through ``EmbeddingsModel.get(model_name)`` so that each
    model is loaded at most once per kernel session; calling the constructor
    directly always triggers a fresh ``gensim.downloader.load``.
    """

    # Wrappers that have already been constructed, keyed by model name.
    _instances = {}

    @classmethod
    def get(cls, model_name):
        """Return the cached wrapper for ``model_name``, creating it on first use."""
        if model_name not in cls._instances:
            cls._instances[model_name] = cls(model_name)
        return cls._instances[model_name]

    def __init__(self, model_name):
        """Load the model named ``model_name`` via ``gensim.downloader.load``."""
        print(f'Loading {model_name} embeddings model')
        self.embeddings_model = gensim.downloader.load(model_name)

    def embed_word(self, word):
        """Return the embedding stored for ``word`` (the model's own lookup error propagates)."""
        return self.embeddings_model[word]

    def embed_words(self, words):
        """Return a numpy array with the embeddings of the words the model knows.

        Words for which ``word_exists`` is false are silently dropped.
        """
        existing_words = self._existing_words(words)
        return np.array(self.embeddings_model.vectors_for_all(existing_words).vectors)

    def embed_words_as_dict(self, words):
        """Return ``{word: embedding}`` for every word the model knows.

        Repeated words are collapsed (keeping first-seen order) before the
        lookup, so that the word list and the rows returned by
        ``vectors_for_all`` have the same length and order even when
        ``words`` contains duplicates.
        """
        unique_words = list(dict.fromkeys(self._existing_words(words)))
        embeddings_for_words = self.embeddings_model.vectors_for_all(unique_words).vectors
        return dict(zip(unique_words, embeddings_for_words))

    def word_exists(self, word):
        """Tell whether ``word`` is a key of the model's ``key_to_index`` mapping."""
        return word in self.embeddings_model.key_to_index

    def _existing_words(self, words):
        """Return, in input order, the items of ``words`` that the model knows."""
        return [word for word in words if self.word_exists(word)]

In [4]:
# Names of the models listed under the 'models' key of gensim.downloader.info().
possible_models = [*gensim.downloader.info()['models']]
print(f'All possible models: {possible_models}')

# Model name passed to every comparison cell below.
EMBEDDING_MODEL = 'glove-wiki-gigaword-100'

All possible models: ['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


# Comparisons

In [5]:
def _hash_vocabulary(vocabulary, embeddings_model_name, hasher):
    """Embed a vocabulary, fit ``hasher`` on the embeddings and hash each one.

    Args:
        vocabulary: collection of words to embed.
        embeddings_model_name: name handed to ``EmbeddingsModel.get``.
        hasher: object providing ``fit(array)`` and ``hash_vector(embedding)``.

    Returns:
        dict mapping every embedded word to ``hasher.hash_vector`` of its embedding.
    """
    model = EmbeddingsModel.get(embeddings_model_name)

    print(f'Embedding vocabulary of {len(vocabulary)} words')
    vocabulary_embeddings = model.embed_words_as_dict(vocabulary)

    print(f'Training hasher with all {len(vocabulary_embeddings)} vocabulary embeddings')
    training_matrix = np.array(list(vocabulary_embeddings.values()))
    hasher.fit(training_matrix)

    print(f'Hashing all {len(vocabulary_embeddings)} vocabulary embeddings')
    hashes_by_word = {}
    for word, embedding in vocabulary_embeddings.items():
        hashes_by_word[word] = hasher.hash_vector(embedding)
    return hashes_by_word

def _get_hash_words(hash, vocabulary_hashes):
    words = []
    for word, word_hash in vocabulary_hashes.items():
        if word_hash == hash:
            words.append(word)
    return words

def _print_word_neighbourhood(word, vocabulary_hashes):
    hash = vocabulary_hashes[word]
    neighbourhood = _get_hash_words(hash, vocabulary_hashes)
    print(f'Words for word {word}, hash {hash}: {neighbourhood}')

def compare_vocabulary_hashes(embeddings_model_name, hasher, search_terms):
    """Hash the global ``vocabulary`` with ``hasher`` and print each search term's bucket.

    Args:
        embeddings_model_name: name of the embeddings model to use.
        hasher: object providing ``to_string()``, ``fit`` and ``hash_vector``.
        search_terms: iterable of words whose hash neighbourhoods are printed.
    """
    print(f'Comparing vocabulary hashes for {embeddings_model_name} embeddings model with {hasher.to_string()}')
    # `vocabulary` is the module-level set, not a parameter of this function.
    hashes_by_word = _hash_vocabulary(vocabulary, embeddings_model_name, hasher)
    for search_term in search_terms:
        _print_word_neighbourhood(search_term, hashes_by_word)

In [6]:
# Words whose neighbourhoods are printed by every comparison cell below.
search_terms = [
    'apple',
    'orange',
    'banana',
    'grape',
    'fruit',
    'cheese',
    'plane',
    'train',
    'universe',
    'love',
    'anger',
]

In [7]:
# Bucket the vocabulary with RandomProjectionsLsh (hash_length=11) and print each search term's bucket.
# NOTE(review): no random seed is set in the visible notebook; if this hasher draws its projections
# randomly, the buckets recorded in the output may not reproduce on a re-run — confirm.
compare_vocabulary_hashes(EMBEDDING_MODEL, RandomProjectionsLsh(hash_length=11), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-100 embeddings model with RandomProjectionsLsh (hash_length=11)
Loading glove-wiki-gigaword-100 embeddings model
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 1717: ['programming', 'apple']
Words for word orange, hash 708: ['fever', 'pink', 'hazard', 'brake', 'shock', 'orange', 'cloud', 'headache']
Words for word banana, hash 1007: ['vertical', 'chop', 'shell', 'operation', 'snake', 'banana', 'pasta', 'tooth']
Words for word grape, hash 750: ['rat', 'grape', 'cup', 'additional', 'nest', 'brush']
Words for word fruit, hash 646: ['laboratory', 'bacteria', 'fruit', 'cancer', 'sleep', 'supply']
Words for word cheese, hash 1773: ['cheese']
Words for word plane, hash 983: ['scream', 'plane', 'out', 'killer', 'off']
Words for word train, hash 963: ['spot', 'blade', 'heaven', 'hit', 'train']
Words for word universe, hash 2038: ['possibly

In [8]:
# Same comparison using PcaBasedLsh with hash_length=14.
compare_vocabulary_hashes(EMBEDDING_MODEL, PcaBasedLsh(hash_length=14), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-100 embeddings model with PcaBasedLsh (hash_length=14)
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 10352: ['giant', 'apple', 'buy', 'energy']
Words for word orange, hash 2393: ['fur', 'leaf', 'orange']
Words for word banana, hash 2136: ['meat', 'shrimp', 'sugar', 'fruit', 'pie', 'chicken', 'banana', 'bean', 'egg']
Words for word grape, hash 3160: ['grape']
Words for word fruit, hash 2136: ['meat', 'shrimp', 'sugar', 'fruit', 'pie', 'chicken', 'banana', 'bean', 'egg']
Words for word cheese, hash 2138: ['lemon', 'tablespoon', 'tomato', 'soup', 'onion', 'salt', 'milk', 'corn', 'butter', 'flour', 'peanut', 'honey', 'salad', 'dessert', 'sauce', 'pour', 'chocolate', 'vegetable', 'pasta', 'pepper', 'garlic', 'dried', 'juice', 'cheese', 'mixture', 'potato']
Words for word plane, hash 10672: ['nuclear', 'radar', 'plane', 'ship', 'missil

In [9]:
# Same comparison using L1Lsh.
# NOTE(review): dimensions_count=100 presumably has to match the vector size of
# EMBEDDING_MODEL ('glove-wiki-gigaword-100') — confirm before switching models.
compare_vocabulary_hashes(EMBEDDING_MODEL, L1Lsh(dimensions_count=100, bucket_width=0.01, count_hash_tables=200), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-100 embeddings model with L1Lsh (dimensions_count=100, bucket_width=0.01, count_hash_tables=200
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 1533: ['silence', 'jungle', 'step', 'definition', 'apple']
Words for word orange, hash 1225: ['specify', 'processing', 'voter', 'assess', 'orange', 'crazy', 'astronomer']
Words for word banana, hash 1417: ['dissolve', 'climb', 'directly', 'fierce', 'strong', 'banana', 'proportion']
Words for word grape, hash 1722: ['grape', 'law']
Words for word fruit, hash 1614: ['extend', 'fruit', 'tragic']
Words for word cheese, hash 1394: ['practice', 'asleep', 'husband', 'donor', 'hunger', 'toilet', 'finding', 'cheese']
Words for word plane, hash 1690: ['plane', 'try', 'boss']
Words for word train, hash 1229: ['suspicious', 'pizza', 'roof', 'commercial', 'bathroom', 'skirt', 'train', 'effect', 'puzzle'

In [10]:
# Same comparison using L2Lsh, with the same parameters as the L1Lsh cell.
# NOTE(review): dimensions_count=100 presumably has to match the vector size of
# EMBEDDING_MODEL ('glove-wiki-gigaword-100') — confirm before switching models.
compare_vocabulary_hashes(EMBEDDING_MODEL, L2Lsh(dimensions_count=100, bucket_width=0.01, count_hash_tables=200), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-100 embeddings model with L2Lsh (dimensions_count=100, bucket_width=0.01, count_hash_tables=200
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 314: ['task', 'margin', 'careful', 'apple', 'there']
Words for word orange, hash 736: ['special', 'orange', 'accountability', 'direct']
Words for word banana, hash 834: ['very', 'poster', 'banana']
Words for word grape, hash 1808: ['grape']
Words for word fruit, hash 1325: ['fruit', 'image', 'awareness', 'chamber']
Words for word cheese, hash 1159: ['percentage', 'cheese', 'empty']
Words for word plane, hash 1864: ['plane']
Words for word train, hash 741: ['visual', 'grip', 'anywhere', 'fewer', 'courage', 'yours', 'train', 'hidden', 'supportive', 'increase', 'occasional']
Words for word universe, hash 578: ['universe', 'the', 'suspicion', 'artifact']
Words for word love, hash 672: ['love', 

In [11]:
# Same comparison using PcaL1Lsh (bucket_width=0.005, count_hash_tables=50).
compare_vocabulary_hashes(EMBEDDING_MODEL, PcaL1Lsh(bucket_width=0.005, count_hash_tables=50), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-100 embeddings model with PcaL1Lsh (bucket_width=0.005, count_hash_tables=50)
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 376: ['arrange', 'lots', 'workout', 'graduation', 'root', 'craft', 'enact', 'reserve', 'existing', 'instead', 'apple', 'vote']
Words for word orange, hash 416: ['someone', 'thus', 'blend', 'time', 'household', 'radical', 'orange', 'alternative', 'defend', 'writer', 'scheme', 'while', 'order', 'refer', 'discover']
Words for word banana, hash 405: ['significance', 'bacteria', 'introduce', 'link', 'banana', 'walking', 'upon']
Words for word grape, hash 386: ['grape', 'acknowledge', 'pilot', 'editor', 'compound', 'below', 'daughter', 'lobby', 'judgment', 'rape', 'season']
Words for word fruit, hash 529: ['the', 'sustainable', 'fruit', 'pepper', 'territory', 'contribution', 'fuel']
Words for word cheese, hash 591

In [12]:
# Same comparison using PcaL2Lsh, with the same parameters as the PcaL1Lsh cell.
compare_vocabulary_hashes(EMBEDDING_MODEL, PcaL2Lsh(bucket_width=0.005, count_hash_tables=50), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-100 embeddings model with PcaL2Lsh (bucket_width=0.005, count_hash_tables=50)
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 376: ['exact', 'arrange', 'lots', 'workout', 'graduation', 'root', 'craft', 'enact', 'instead', 'apple', 'full', 'duty']
Words for word orange, hash 416: ['someone', 'thus', 'blend', 'time', 'being', 'household', 'radical', 'maintenance', 'orange', 'alternative', 'defend', 'writer', 'scheme', 'while', 'refer', 'now', 'professional', 'discover']
Words for word banana, hash 405: ['goal', 'significance', 'considerable', 'brake', 'introduce', 'animal', 'link', 'stomach', 'banana', 'achievement', 'land', 'essay', 'walking', 'cord', 'upon']
Words for word grape, hash 386: ['colorful', 'grape', 'acknowledge', 'pilot', 'editor', 'offensive', 'compound', 'below', 'daughter', 'lobby', 'connection', 'judgment', 'season

In [13]:
# Same comparison using SuperBitLsh (hash_length=14, num_bits_per_batch=4).
# NOTE(review): the recorded output of this cell contains stray source lines such as
# `vectors = vectors / norms` and `vectors[i] -= proj`, which look like the tail of numeric
# runtime warnings, and 'apple' lands in a very large bucket — check the hasher for
# zero-norm / invalid-value problems before trusting these results.
compare_vocabulary_hashes(EMBEDDING_MODEL, SuperBitLsh(hash_length=14, num_bits_per_batch=4), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-100 embeddings model with SuperBitLsh(hash_length=14, num_bits_per_batch=4)
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings


  s = (x.conj() * x).real
  proj = np.dot(vectors[j], vectors[i]) * vectors[j]
  vectors = vectors / norms
  vectors[i] -= proj
  return sqrt(add.reduce(s, axis=axis, keepdims=keepdims))


Words for word apple, hash 8224: ['compare', 'casino', 'weather', 'globe', 'galaxy', 'plunge', 'mirror', 'year', 'sensation', 'rise', 'breeze', 'fall', 'elsewhere', 'sunny', 'immune', 'builder', 'terrorist', 'sound', 'ah', 'birthday', 'flesh', 'vs', 'radiation', 'apartment', 'climate', 'damage', 'retailer', 'ecosystem', 'likelihood', 'coast', 'warehouse', 'universe', 'ship', 'inventory', 'reflect', 'recover', 'wall', 'organism', 'delight', 'bulb', 'air', 'grandparent', 'sail', 'holiday', 'afternoon', 'exposure', 'crystal', 'display', 'dealer', 'convention', 'lion', 'visual', 'its', 'computer', 'their', 'distant', 'pack', 'hotel', 'earnings', 'room', 'annual', 'cage', 'screen', 'world', 'quarter', 'mall', 'island', 'market', 'advance', 'label', 'fate', 'shell', 'glass', 'weaken', 'visitor', 'sunlight', 'ocean', 'late', 'well-being', 'battery', 'catalog', 'inherit', 'decline', 'store', 'initial', 'solar', 'sale', 'tropical', 'darkness', 'expected', 'unit', 'crew', 'developer', 'porch', '

In [31]:
# Same comparison using HierarchicalLsh with num_levels=10.
# The recorded output reports that a random subset of 1000 samples is used for the clustering.
compare_vocabulary_hashes(EMBEDDING_MODEL, HierarchicalLsh(num_levels=10), search_terms)
# NOTE: this cell takes about 60 seconds to run.

Comparing vocabulary hashes for glove-wiki-gigaword-100 embeddings model with HierarchicalLsh (num_levels=10)
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Using a random subset of 1000 samples for hierarchical clustering
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 1111110011: ['telephone', 'cable', 'computer', 'digital', 'mobile', 'chip', 'phone', 'virtual', 'software', 'apple', 'instant', 'communication', 'electronic', 'technology']
Words for word orange, hash 1011101111: ['fur', 'shade', 'dark', 'light', 'patch', 'pink', 'dense', 'orange', 'purple', 'red', 'blue', 'bright', 'yellow', 'pale']
Words for word banana, hash 1000111110: ['herb', 'peanut', 'palm', 'hay', 'banana', 'bean', 'potato']
Words for word grape, hash 10000111: ['leaf', 'grape', 'fruit', 'organic', 'vegetable']
Words for word fruit, hash 10000111: ['leaf', 'grape', 'fruit', 'organic', 'vegetable']
Words for word cheese, hash 10000000: ['butter', 'candy',

In [40]:
# Same comparison using HierarchicalHyperplaneLsh with num_levels=10.
# The recorded output reports that all 4253 embedded items are used as leaves; presumably
# because maximum_sample_size=10000 exceeds that count — confirm against the hasher.
compare_vocabulary_hashes(EMBEDDING_MODEL, HierarchicalHyperplaneLsh(num_levels=10, maximum_sample_size=10000), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-100 embeddings model with HierarchicalHyperplaneLsh(num_levels=10)
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Using full 4253 data items as leaves in hierarchical clustering
Building hierarchy over 4253 leaves
Enriching clusters with parents and levels and hyperplanes
10 hierarchical hyperplanes calculated
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 1111011110: ['used', 'cell', 'learning', 'determine', 'consequently', 'same', 'possibly', 'uniform', 'actually', 'common', 'weapon', 'artificial', 'convert', 'suspicious', 'contain', 'protective', 'suitable', 'very', 'assume', 'therefore', 'detail', 'aspect', 'builder', 'striking', 'exact', 'active', 'reference', 'programming', 'comparable', 'discount', 'thus', 'detailed', 'life', 'employ', 'surrounding', 'attraction', 'vary', 'journey', 'differ', 'indigenous', 'basic', 'incorporate', 'encounter', 'norm', 'standard', 'struct

# Gensim similarity comparison

In [16]:
def print_gensim_word_neighbourhood(word, embeddings_model_wrapper):
    """Print those of gensim's 20 most similar words to ``word`` that are in ``vocabulary``.

    Args:
        word: the search term.
        embeddings_model_wrapper: object whose ``embeddings_model`` attribute
            provides ``most_similar(word, topn=...)``.

    The module-level ``vocabulary`` is used as the filter; the similarity
    scores returned by ``most_similar`` are discarded.
    """
    nearest = embeddings_model_wrapper.embeddings_model.most_similar(word, topn=20)
    neighbourhood = [candidate for candidate, _ in nearest if candidate in vocabulary]
    print(f'Gensim words for word {word}: {neighbourhood}')

def compare_gensim_word_neighbourhood(embeddings_model_name, search_terms):
    """Print the gensim nearest-neighbour words for every search term.

    Args:
        embeddings_model_name: name handed to ``EmbeddingsModel.get``.
        search_terms: iterable of words to look up.

    The progress message names what this function actually does; no hashing
    is involved here, only the model's own similarity search.
    """
    embeddings_model_wrapper = EmbeddingsModel.get(embeddings_model_name)
    print(f'Comparing gensim word neighbourhoods for {embeddings_model_name} embeddings model')
    for term in search_terms:
        print_gensim_word_neighbourhood(term, embeddings_model_wrapper)

In [17]:
# Reference run: gensim's own most-similar words for the same search terms, filtered to the vocabulary.
compare_gensim_word_neighbourhood(EMBEDDING_MODEL, search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-100 embeddings model
Gensim words for word apple: ['software', 'computer']
Gensim words for word orange: ['yellow', 'red', 'blue', 'green', 'pink', 'purple', 'black', 'lemon', 'brown', 'juice', 'white', 'gray', 'bright', 'cream', 'dark']
Gensim words for word banana: ['potato', 'fruit', 'peanut', 'sugar', 'nut', 'tomato', 'coffee', 'bean']
Gensim words for word grape: ['wine', 'fruit', 'tomato']
Gensim words for word fruit: ['vegetable', 'coffee', 'juice', 'flavor', 'banana', 'grape', 'tomato', 'tea']
Gensim words for word cheese: ['cream', 'butter', 'chocolate', 'bread', 'sandwich', 'sauce', 'pasta', 'potato', 'tomato', 'goat']
Gensim words for word plane: ['airplane', 'flight', 'jet', 'crash', 'helicopter', 'flying', 'aircraft', 'pilot', 'landing', 'crew', 'cargo', 'fly']
Gensim words for word train: ['bus', 'rail', 'passenger', 'traffic', 'truck', 'ride', 'car', 'plane', 'route']
Gensim words for word universe: ['planet', 'earth', 