In [1]:
import gensim.downloader
import numpy as np
from algorithms.RandomProjectionsLsh import RandomProjectionsLsh
from algorithms.PcaBasedLsh import PcaBasedLsh
from algorithms.MinkowskiLsh import L1Lsh, L2Lsh
from algorithms.PcaMinkowskiLsh import PcaL1Lsh, PcaL2Lsh
from algorithms.SuperBitLsh import SuperBitLsh
from algorithms.HierarchicalLsh import HierarchicalLsh
from algorithms.HierarchicalHyperplaneLsh import HierarchicalHyperplaneLsh

# Load vocabulary

In [2]:
import urllib.request


def load_common_words():
    """Download the list of common English words and return it as a set.

    The first line of the remote CSV is treated as a header and skipped.
    Every other line is decoded as UTF-8 and stripped of surrounding
    whitespace; lines that are empty after stripping are ignored so that the
    vocabulary never contains the empty string.

    Returns:
        set[str]: the unique words found in the file.

    Any error raised by ``urllib.request.urlopen`` (e.g. no network) is
    propagated to the caller unchanged.
    """
    print('Loading words...')
    target_url = 'https://raw.githubusercontent.com/pkLazer/password_rank/master/4000-most-common-english-words-csv.csv'
    words = set()
    # Use the response as a context manager so the HTTP connection is always
    # closed, even when decoding one of the lines fails.
    with urllib.request.urlopen(target_url) as response:
        lines = iter(response)
        next(lines, None)  # skip the CSV header line (no-op on an empty file)
        for line in lines:
            word = line.decode('utf-8').strip()
            if word:  # ignore blank lines, e.g. a trailing newline
                words.add(word)
    print(f'Words loaded: {len(words)}')

    return words

# Module-level vocabulary: read as a global by compare_vocabulary_hashes and
# print_gensim_word_neighbourhood further down.
vocabulary = load_common_words()

Loading words...
Words loaded: 4319


# Embedding Models

Note: downloading a model for the first time is very slow, and loading it into memory for the first time in a session is still slow; after that, each loaded model is cached and reused.

In [3]:
class EmbeddingsModel:
    """Per-name cached wrapper around a gensim embeddings model.

    Use ``EmbeddingsModel.get(model_name)`` rather than the constructor so
    that every model is loaded at most once per kernel session.
    """

    # Cache of already constructed wrappers, keyed by model name.
    _instances = {}

    @classmethod
    def get(cls, model_name):
        """Return the cached wrapper for ``model_name``, creating it on first use."""
        if model_name not in cls._instances:
            cls._instances[model_name] = cls(model_name)
        return cls._instances[model_name]

    def __init__(self, model_name):
        """Load the model called ``model_name`` through ``gensim.downloader.load``."""
        print(f'Loading {model_name} embeddings model')
        self.embeddings_model = gensim.downloader.load(model_name)

    def embed_word(self, word):
        """Return the embedding of a single word.

        The lookup is delegated to the underlying model; unknown words are
        not handled here.
        """
        return self.embeddings_model[word]

    def embed_words(self, words):
        """Return a 2-D array with one row per known word in ``words``.

        Words that the model does not know are silently dropped.
        NOTE(review): gensim's ``vectors_for_all`` appears to drop repeated
        keys, so the row count may be smaller than ``len(words)`` when
        ``words`` contains duplicates - confirm for the gensim version in use.
        """
        return np.array(self._vectors_for_existing(words).vectors)

    def embed_words_as_dict(self, words):
        """Return a ``{word: embedding}`` dict for the known words in ``words``.

        Words that the model does not know are silently dropped.
        """
        keyed_vectors = self._vectors_for_existing(words)
        # Pair each vector with the key reported by the result itself rather
        # than with the position in the input list: the two only line up when
        # the input has no repeated words.
        return dict(zip(keyed_vectors.index_to_key, keyed_vectors.vectors))

    def word_exists(self, word):
        """Return True when ``word`` is a key of the underlying model."""
        return word in self.embeddings_model.key_to_index

    def _vectors_for_existing(self, words):
        """Return ``vectors_for_all`` of the model for the known subset of ``words``."""
        existing_words = [word for word in words if self.word_exists(word)]
        return self.embeddings_model.vectors_for_all(existing_words)

In [4]:
# Names of every model listed under 'models' by gensim's downloader.
possible_models = [model_name for model_name in gensim.downloader.info()['models']]
print(f'All possible models: {possible_models}')

All possible models: ['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


# Comparisons

In [5]:
def _hash_vocabulary(vocabulary, embeddings_model_name, hasher):
    """Embed the vocabulary, fit ``hasher`` on the embeddings and hash every word.

    Args:
        vocabulary: collection of words to embed.
        embeddings_model_name: model name passed to ``EmbeddingsModel.get``.
        hasher: object providing ``fit(matrix)`` and ``hash_vector(vector)``.

    Returns:
        dict mapping each embedded word to the value of ``hasher.hash_vector``
        for its embedding.
    """
    model = EmbeddingsModel.get(embeddings_model_name)
    print(f'Embedding vocabulary of {len(vocabulary)} words')
    embeddings_by_word = model.embed_words_as_dict(vocabulary)
    embeddings_count = len(embeddings_by_word)

    print(f'Training hasher with all {embeddings_count} vocabulary embeddings')
    training_matrix = np.array(list(embeddings_by_word.values()))
    hasher.fit(training_matrix)

    print(f'Hashing all {embeddings_count} vocabulary embeddings')
    hashes_by_word = {}
    for word, embedding in embeddings_by_word.items():
        hashes_by_word[word] = hasher.hash_vector(embedding)
    return hashes_by_word

def _get_hash_words(hash, vocabulary_hashes):
    words = []
    for word, word_hash in vocabulary_hashes.items():
        if word_hash == hash:
            words.append(word)
    return words

def _print_word_neighbourhood(word, vocabulary_hashes):
    hash = vocabulary_hashes[word]
    neighbourhood = _get_hash_words(hash, vocabulary_hashes)
    print(f'Words for word {word}, hash {hash}: {neighbourhood}')

def compare_vocabulary_hashes(embeddings_model_name, hasher, search_terms):
    """Hash the global ``vocabulary`` with ``hasher`` and print each term's bucket.

    Args:
        embeddings_model_name: model name forwarded to ``_hash_vocabulary``.
        hasher: hasher to fit and apply; its ``to_string()`` is used in the log line.
        search_terms: words whose hash neighbourhoods are printed.
    """
    hasher_description = hasher.to_string()
    print(f'Comparing vocabulary hashes for {embeddings_model_name} embeddings model with {hasher_description}')
    hashes_by_word = _hash_vocabulary(vocabulary, embeddings_model_name, hasher)
    for search_term in search_terms:
        _print_word_neighbourhood(search_term, hashes_by_word)

In [6]:
# Probe words: every comparison cell below prints the neighbourhood found for each of these.
search_terms = ['apple', 'orange', 'banana', 'grape', 'fruit', 'cheese', 'plane', 'train', 'universe', 'love', 'anger']

In [7]:
# RandomProjectionsLsh with hash_length=13. In the recorded run this first call also loads
# the embeddings model; later cells reuse the instance cached by EmbeddingsModel.get.
compare_vocabulary_hashes('glove-wiki-gigaword-50', RandomProjectionsLsh(hash_length=13), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-50 embeddings model with RandomProjectionsLsh (hash_length=13)
Loading glove-wiki-gigaword-50 embeddings model
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 2667: ['milk', 'beer', 'apple', 'quit']
Words for word orange, hash 6734: ['which', 'seat', 'orange', 'giant', 'hotel', 'shed']
Words for word banana, hash 1611: ['crush', 'banana']
Words for word grape, hash 2637: ['grape']
Words for word fruit, hash 2922: ['fruit', 'coffee']
Words for word cheese, hash 1576: ['cheese', 'flour', 'oven']
Words for word plane, hash 4206: ['plane', 'investigate', 'service']
Words for word train, hash 4222: ['reservation', 'train']
Words for word universe, hash 2941: ['universe']
Words for word love, hash 2687: ['blond', 'crazy', 'love', 'stranger', 'lover', 'stuff', 'kid', 'guy']
Words for word anger, hash 4715: ['anger', 'urge']


In [8]:
# PcaBasedLsh with hash_length=14 on the same embeddings model and search terms.
compare_vocabulary_hashes('glove-wiki-gigaword-50', PcaBasedLsh(hash_length=14), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-50 embeddings model with PcaBasedLsh (hash_length=14)
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 11497: ['medium', 'mix', 'herb', 'apple']
Words for word orange, hash 10412: ['fold', 'blue', 'tightly', 'orange', 'grass', 'pink', 'plain', 'cloth', 'thin']
Words for word banana, hash 10413: ['banana', 'slice']
Words for word grape, hash 11501: ['grape', 'wine', 'fiber']
Words for word fruit, hash 10477: ['dried', 'seed', 'leaf', 'milk', 'pound', 'pot', 'fruit', 'goat', 'salt', 'sugar', 'rabbit']
Words for word cheese, hash 10473: ['tablespoon', 'lemon', 'potato', 'teaspoon', 'cheese', 'juice', 'butter', 'liquid', 'vegetable', 'raw', 'beer', 'bean', 'pepper']
Words for word plane, hash 2373: ['plane', 'ship', 'fatal']
Words for word train, hash 332: ['civilian', 'train', 'heart', 'left', 'shooting', 'apparently', 'contact']
Words

In [9]:
# L1Lsh from algorithms.MinkowskiLsh.
# NOTE(review): dimensions_count=50 presumably has to match the embedding size of
# glove-wiki-gigaword-50 - confirm and update it if the model name is changed.
compare_vocabulary_hashes('glove-wiki-gigaword-50', L1Lsh(dimensions_count=50, bucket_width=0.01, count_hash_tables=200), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-50 embeddings model with L1Lsh (dimensions_count=50, bucket_width=0.01, count_hash_tables=200
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 1243: ['yourself', 'laugh', 'safety', 'dig', 'toll', 'apple', 'wing', 'perfectly', 'grief', 'fit', 'aesthetic', 'slice', 'tackle']
Words for word orange, hash 1361: ['town', 'element', 'beg', 'wrong', 'instructional', 'orange', 'complaint', 'daily']
Words for word banana, hash 1022: ['account', 'entire', 'list', 'banana']
Words for word grape, hash 1767: ['grape', 'abortion']
Words for word fruit, hash 1513: ['columnist', 'group', 'apartment', 'journalism', 'fruit']
Words for word cheese, hash 1647: ['cheese', 'marine', 'stomach']
Words for word plane, hash 1189: ['vacuum', 'plane', 'technical', 'tide', 'search', 'send', 'particular', 'breathe', 'ratio']
Words for word train, hash 1264: ['dis

In [10]:
# L2Lsh from algorithms.MinkowskiLsh, with the same parameters as the L1Lsh cell.
# NOTE(review): dimensions_count=50 presumably has to match the embedding size of
# glove-wiki-gigaword-50 - confirm and update it if the model name is changed.
compare_vocabulary_hashes('glove-wiki-gigaword-50', L2Lsh(dimensions_count=50, bucket_width=0.01, count_hash_tables=200), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-50 embeddings model with L2Lsh (dimensions_count=50, bucket_width=0.01, count_hash_tables=200
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 1588: ['apple', 'hell']
Words for word orange, hash 1133: ['insurance', 'orange', 'return']
Words for word banana, hash 1354: ['water', 'recovery', 'male', 'banana']
Words for word grape, hash 562: ['grape', 'monster', 'withdrawal', 'develop', 'officer']
Words for word fruit, hash 1307: ['agent', 'fruit', 'camp']
Words for word cheese, hash 877: ['create', 'trauma', 'cheese', 'sky', 'cigarette']
Words for word plane, hash 1115: ['plane', 'strictly', 'literary', 'gross', 'index', 'traffic']
Words for word train, hash 1202: ['train']
Words for word universe, hash 899: ['universe', 'begin', 'wide', 'cold', 'both', 'jeans', 'relationship']
Words for word love, hash 1359: ['love']
Words for word a

In [11]:
# PcaL1Lsh: constructed without dimensions_count, with bucket_width=0.005 and 50 hash tables.
compare_vocabulary_hashes('glove-wiki-gigaword-50', PcaL1Lsh(bucket_width=0.005, count_hash_tables=50), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-50 embeddings model with PcaL1Lsh (bucket_width=0.005, count_hash_tables=50)
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 140: ['heart', 'sometimes', 'disappointed', 'first', 'foster', 'apple', 'oxygen']
Words for word orange, hash 213: ['alter', 'likelihood', 'bend', 'after', 'girl', 'orange', 'elevator', 'controversy', 'unemployment', 'complexity', 'additional', 'builder', 'reporter']
Words for word banana, hash 438: ['projection', 'medium', 'village', 'banana']
Words for word grape, hash 446: ['grape', 'availability', 'particle', 'sibling']
Words for word fruit, hash 428: ['fruit', 'improve', 'bake']
Words for word cheese, hash 456: ['agriculture', 'cheese', 'heat', 'ecosystem']
Words for word plane, hash 96: ['aid', 'plane', 'unexpected', 'vital', 'tune', 'recall', 'inquiry', 'terrific', 'him']
Words for word train, hash 72:

In [12]:
# PcaL2Lsh with the same bucket_width and count_hash_tables as the PcaL1Lsh cell.
compare_vocabulary_hashes('glove-wiki-gigaword-50', PcaL2Lsh(bucket_width=0.005, count_hash_tables=50), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-50 embeddings model with PcaL2Lsh (bucket_width=0.005, count_hash_tables=50)
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 202: ['credibility', 'norm', 'trace', 'cut', 'lose', 'blank', 'overlook', 'popular', 'publicity', 'plunge', 'influence', 'apple', 'aim', 'disappear']
Words for word orange, hash 277: ['unable', 'factor', 'signal', 'speech', 'motor', 'apply', 'area', 'lightning', 'orange', 'commonly', 'donor', 'queen', 'recipient', 'file', 'sustain', 'ocean', 'wedding', 'emotional', 'anonymous', 'sheer', 'migration', 'decent', 'fix', 'hunting']
Words for word banana, hash 438: ['projection', 'medium', 'village', 'banana', 'hip', 'clinic']
Words for word grape, hash 446: ['grape', 'corn', 'availability', 'particle', 'sibling', 'aesthetic']
Words for word fruit, hash 428: ['therapist', 'jurisdiction', 'fruit', 'improve']
Words f

In [13]:
# SuperBitLsh with hash_length=14 and num_bits_per_batch=4.
# NOTE(review): the recorded output of this cell contains numpy warning snippets
# (vector normalisation / projection lines) and a very large bucket for 'apple';
# this may point at a numerical problem (e.g. a zero norm) inside SuperBitLsh - investigate.
compare_vocabulary_hashes('glove-wiki-gigaword-50', SuperBitLsh(hash_length=14, num_bits_per_batch=4), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-50 embeddings model with SuperBitLsh(hash_length=14, num_bits_per_batch=4)
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Hashing all 4253 vocabulary embeddings


  s = (x.conj() * x).real
  vectors = vectors / norms
  vectors[i] -= proj
  proj = np.dot(vectors[j], vectors[i]) * vectors[j]
  return sqrt(add.reduce(s, axis=axis, keepdims=keepdims))


Words for word apple, hash 32: ['one', 'increased', 'sink', 'hear', 'alone', 'taxpayer', 'should', 'forgive', 'surprise', 'food', 'quickly', 'expansion', 'for', 'patent', 'consistently', 'room', 'poverty', 'enforce', 'openly', 'next', 'regard', 'advise', 'limit', 'yourself', 'especially', 'positive', 'anything', 'move', 'myself', 'exercise', 'besides', 'liberty', 'bonus', 'reflection', 'increase', 'allow', 'previous', 'approval', 'unhappy', 'fact', 'conscience', 'expand', 'fall', 'frankly', 'secure', 'legally', 'swear', 'provided', 'assumption', 'civic', 'reject', 'many', 'unprecedented', 'somewhat', 'expected', 'unlikely', 'sometime', 'unusual', 'about', 'proposal', 'interested', 'tremendous', 'unexpected', 'determination', 'year', 'outline', 'rent', 'stay', 'standing', 'intent', 'appearance', 'amount', 'wonder', 'publication', 'deadline', 'officially', 'world', 'strategy', 'credibility', 'seldom', 'doubt', 'collection', 'large', 'requirement', 'acquisition', 'tend', 'attendance', 'co

In [14]:
compare_vocabulary_hashes('glove-wiki-gigaword-50', HierarchicalLsh(num_levels=10), search_terms)
# Note: this cell takes 60 seconds to run.

Comparing vocabulary hashes for glove-wiki-gigaword-50 embeddings model with HierarchicalLsh (num_levels=10)
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Using 850 samples for hierarchical clustering
Maximum HierarchicalClustering hash length: 10
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 011011001: ['application', 'software', 'user', 'electronic', 'computer', 'apple', 'digital', 'hardware', 'database']
Words for word orange, hash 110111001: ['orange', 'pale']
Words for word banana, hash 1101111110: ['pig', 'grape', 'cattle', 'potato', 'cow', 'mushroom', 'nut', 'banana', 'feather', 'goat', 'duck', 'salmon', 'sheep', 'rabbit']
Words for word grape, hash 1101111110: ['pig', 'grape', 'cattle', 'potato', 'cow', 'mushroom', 'nut', 'banana', 'feather', 'goat', 'duck', 'salmon', 'sheep', 'rabbit']
Words for word fruit, hash 110111010: ['tree', 'patch', 'leaf', 'flower', 'oak', 'herb', 'fruit', 'pine', 'ribbon']
Words for word ch

In [15]:
# HierarchicalHyperplaneLsh with the same num_levels=10 as the HierarchicalLsh cell.
# The recorded output reports clustering over all 4253 embeddings and a maximum hash length of 13.
compare_vocabulary_hashes('glove-wiki-gigaword-50', HierarchicalHyperplaneLsh(num_levels=10), search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-50 embeddings model with HierarchicalHyperplaneLsh(num_levels=10)
Embedding vocabulary of 4319 words
Training hasher with all 4253 vocabulary embeddings
Using 4253 samples for hierarchical clustering
Building hierarchy over 4253 data points
Maximum HierarchicalClustering hash length: 13
Enriching clusters with parents and levels and hyperplanes
Hierarchical hyperplanes calculated
Hashing all 4253 vocabulary embeddings
Words for word apple, hash 000101110: ['peasant', 'corn', 'native', 'flower', 'lean', 'bean', 'apple', 'sugar']
Words for word orange, hash 010110111: ['neck', 'blue', 'band', 'orange', 'queen', 'ugly', 'signature', 'label', 'outfit', 'flag', 'parade', 'red']
Words for word banana, hash 000110110: ['banana', 'duck', 'salmon']
Words for word grape, hash 0001000: ['grape', 'agriculture', 'spill', 'crop', 'grow', 'output', 'mineral', 'plant', 'natural', 'subsidy', 'supply', 'production', 'oil', 'strain', 'species']
Words fo

# Gensim similarity comparison

In [20]:
def print_gensim_word_neighbourhood(word, embeddings_model_wrapper):
    """Print the in-vocabulary words among the model's 20 most similar to ``word``.

    Args:
        word: the query word.
        embeddings_model_wrapper: object whose ``embeddings_model`` attribute
            provides ``most_similar(word, topn=...)``.

    Only candidates that are members of the global ``vocabulary`` are kept,
    so fewer than 20 words may be printed.
    """
    similar_entries = embeddings_model_wrapper.embeddings_model.most_similar(word, topn=20)
    neighbourhood = [candidate for candidate, _ in similar_entries if candidate in vocabulary]
    print(f'Gensim words for word {word}: {neighbourhood}')

def compare_gensim_word_neighbourhood(embeddings_model_name, search_terms):
    """Print gensim's own similarity neighbourhood for every search term.

    Args:
        embeddings_model_name: model name passed to ``EmbeddingsModel.get``.
        search_terms: words passed one by one to
            ``print_gensim_word_neighbourhood``.
    """
    embeddings_model_wrapper = EmbeddingsModel.get(embeddings_model_name)
    # No hashing happens here, so the log line names what is actually compared.
    print(f"Comparing gensim word neighbourhoods for {embeddings_model_name} embeddings model")
    for term in search_terms:
        print_gensim_word_neighbourhood(term, embeddings_model_wrapper)

In [21]:
# Reference run: gensim's most_similar results (filtered to the vocabulary) for the same search terms.
compare_gensim_word_neighbourhood('glove-wiki-gigaword-50', search_terms)

Comparing vocabulary hashes for glove-wiki-gigaword-50 embeddings model
Gensim words for word apple: ['software', 'processor', 'product']
Gensim words for word orange: ['blue', 'red', 'yellow', 'black', 'purple', 'pink', 'green', 'white', 'cream', 'leaf', 'coat', 'juice']
Gensim words for word banana: ['fruit', 'sugar', 'peanut', 'shrimp', 'potato', 'corn']
Gensim words for word grape: ['wine', 'fruit']
Gensim words for word fruit: ['vegetable', 'flower', 'juice', 'tomato', 'honey', 'corn', 'dried', 'milk', 'coffee']
Gensim words for word cheese: ['butter', 'chocolate', 'cream', 'tomato', 'bread', 'sandwich', 'potato', 'pie', 'pasta', 'sauce', 'soup']
Gensim words for word plane: ['airplane', 'flight', 'jet', 'helicopter', 'crash', 'landing', 'aircraft', 'flying', 'crew', 'passenger']
Gensim words for word train: ['bus', 'passenger', 'traffic', 'car', 'rail', 'truck', 'boat', 'station', 'vehicle']
Gensim words for word universe: ['planet', 'reality', 'earth', 'realm', 'dimension', 'fan