In [1]:
# Locations of the artefacts this notebook reads and writes.
vocab_path = '../output/vocab.2018-05-10-7d764e7.pkl'
model_path = '../output/model-h2048p512/lstm-wsd-gigaword-google'
index_path = '../output/monosemous.bin'

# Keyword arguments handed to nmslib.init(), both when the index is built and
# when it is reloaded from index_path.
nmslib_params = dict(method='hnsw', space='cosinesimil')

## Testing NMSLib

In [16]:
import numpy as np

# Open the .npz archive of monosemous-word context embeddings; the individual
# arrays are read out of `monos` by key in the following cell.
monos = np.load(
    '../output/monosemous-context-embeddings.2018-05-26-3130513.npz'
)

In [17]:
# Pull the two arrays out of the archive by key. mono_words is later used to
# look words up through id2word; mono_embs is the matrix that gets indexed.
mono_words = monos['mono_words']
mono_embs = monos['mono_embs']
# Cell output: dimensions of the embedding matrix.
mono_embs.shape

(904288, 512)

In [10]:
from sklearn.utils import resample

# Draw 10 (word, embedding) pairs without replacement to serve as test queries.
# Both arrays go through the same resample() call so that row i of some_emds
# still belongs to some_words[i].
# A fixed random_state makes the sample -- and every timing / neighbour result
# derived from it in later cells -- identical on each re-run of the notebook.
some_words, some_emds = resample(
    mono_words,
    mono_embs,
    replace=False,
    n_samples=10,
    random_state=0,
)
some_emds.shape

(10, 512)

In [20]:
%%time
from sklearn.metrics.pairwise import cosine_similarity
s = cosine_similarity(some_emds, mono_embs)

CPU times: user 4.59 s, sys: 3.42 s, total: 8.01 s
Wall time: 12.7 s


In [23]:
%%time
import nmslib
# initialize a new index, using a HNSW index on Cosine Similarity
index = nmslib.init(**nmslib_params)
index.addDataPointBatch(mono_embs)
index.createIndex({'post': 2}, print_progress=True)

In [32]:
# Look up the 10 nearest indexed embeddings for all sampled queries in one
# call, using 4 worker threads.
neighbors = index.knnQueryBatch(queries=some_emds, k=10, num_threads=4)
# Cell output: number of result entries (one per query row).
len(neighbors)

10

In [34]:
# Result for the first query, unpacked into neighbour ids and their distances.
ids, distances = neighbors[0]

In [26]:
# Write the built index to index_path; a later cell reloads it from there.
index.saveIndex(index_path)

In [2]:
import nmslib
# Create an empty index with the same nmslib_params that were used to build it ...
index = nmslib.init(**nmslib_params)
# ... then populate it from the file previously written by saveIndex().
index.loadIndex(index_path)

## Querying by sentences

In [3]:
from model import LSTMLanguageModel

  from ._conv import register_converters as _register_converters


In [4]:
import tensorflow as tf
# Session that is handed to the language model constructor and to
# get_embeddings_sentence() in the cells below.
sess = tf.InteractiveSession()

In [5]:
# Instantiate the LSTM language model on the session, pointing it at the
# checkpoint (model_path) and the vocabulary file (vocab_path). The log output
# shows the parameters being restored from model_path.
lm = LSTMLanguageModel(sess, model_path, vocab_path)

INFO:tensorflow:Restoring parameters from ../output/model-h2048p512/lstm-wsd-gigaword-google


In [25]:
# Whitespace-tokenised example sentence.
sent = 'I study computer science'.split()
# Position of the word to disambiguate: token 3 is 'science'.
target_index = 3
# Embedding computed by the language model for the sentence and target position.
# NOTE(review): the exact shape/semantics of the return value are defined in
# model.py, which is not visible here -- confirm before relying on them.
embs = lm.get_embeddings_sentence(sess, sent, target_index)

In [7]:
# Ten nearest indexed embeddings to the target-word embedding. Note that
# `distances` is rebound here and no longer holds the batch-query value.
similar_word_ids, distances = index.knnQuery(vector=embs, k=10)

In [8]:
# Display the distances returned by the query above (the index was created
# with space='cosinesimil', see nmslib_params).
distances

array([0.7118224 , 0.7206353 , 0.72069556, 0.7294294 , 0.7306201 ,
       0.73099047, 0.7333642 , 0.7354343 , 0.7356905 , 0.73921835],
      dtype=float32)

In [9]:
import pickle

import numpy as np

# The vocabulary is a plain pickle (.pkl), not an .npy/.npz file. np.load()
# can only read such a file through its pickle fallback, which is refused
# when allow_pickle is False (the default in current NumPy releases), so the
# file is unpickled directly instead.
# NOTE: unpickling can execute arbitrary code -- only load vocab files you trust.
with open(vocab_path, 'rb') as vocab_file:
    word2id = pickle.load(vocab_file)

# Invert the word -> id mapping so that ids can be turned back into words.
id2word = {i: w for w, i in word2id.items()}

In [18]:
# Translate neighbour positions into words in two hops:
# index position -> vocabulary id (mono_words) -> word string (id2word).
similar_words = [id2word[word_id] for word_id in mono_words[similar_word_ids]]
similar_words

['slaw',
 'beachwear',
 'fern',
 'borrower',
 'countdown',
 'diorama',
 'tailcoat',
 'immolation',
 'scrapbook',
 'choppiness']

In [19]:
from nltk.corpus import wordnet as wn

In [26]:
# Noun senses that WordNet lists for the target word of the sentence.
wn.synsets(sent[target_index], pos=wn.NOUN)

[Synset('science.n.01'), Synset('skill.n.02')]

In [22]:
# First noun sense of the nearest neighbour word (entry 0 of similar_words).
synset = wn.synsets(similar_words[0], pos=wn.NOUN)[0]

In [24]:
# Names of the synsets along the first hypernym path of `synset`; the output
# below runs from the root ('entity.n.01') down to the synset itself.
[ancestor.name() for ancestor in synset.hypernym_paths()[0]]

['entity.n.01',
 'physical_entity.n.01',
 'matter.n.03',
 'substance.n.07',
 'food.n.01',
 'nutriment.n.01',
 'dish.n.02',
 'salad.n.01',
 'coleslaw.n.01']