Wikipedia2vec is too slow

In [47]:
from wikipedia2vec import Wikipedia2Vec

class Wiki2Vec:
    def __init__(self, model_path):
        self.model = Wikipedia2Vec.load(model_path)

    def get_similar_words(self, word, result_count):
        return self.model.most_similar(self.model.get_word(word), result_count)

In [50]:
model_path = "../wikipedia2vec/enwiki_20180420_win10_300d.pkl"
wiki2vec = Wiki2Vec(model_path)

In [52]:
print("Similar words:\n", wiki2vec.get_similar_words("love", 5))

Similar words:
 [(<Word love>, 1.0000001), (<Word madly>, 0.74789935), (<Word loves>, 0.716889), (<Word unrequited>, 0.71679133), (<Word lover>, 0.69387823)]


Fasttext python api is broken

In [44]:
import fasttext

class FastText:
    def __init__(self, model_path):
        self.model = fasttext.load_model(model_path)

    def get_similar_words(self, word, result_count):
        # return self.model.get_nearest_neighbors(word, result_count) does not work
        return self.model.f.getNN(word, result_count) # does not work either

In [45]:
model_path = "cc.en.300.bin"
fasttext_model = FastText(model_path)




In [46]:
fasttext_model.get_similar_words("trump", 10)

AttributeError: 'fasttext_pybind.fasttext' object has no attribute 'getNN'

Python Implementation by calling the terminal
https://github.com/facebookresearch/fastText/issues/384#issuecomment-356413756

In [24]:
FASTTEXT_PATH = "fastText/fasttext"
REVIEWS_MODEL_PATH = "cc.en.300.bin"
NUM_NEIGHBORS = 10


class NNLookup:
    """Class for using the command-line interface to fasttext nn to lookup neighbours.
    It's rather fiddly and depends on exact text strings. But it is at least short and simple."""
    def __init__(self, model_path):
        self.nn_process = pexpect.spawn('%s nn %s %d' % (FASTTEXT_PATH, model_path, NUM_NEIGHBORS))
        self.nn_process.expect('Query word?')  # Flush the first prompt out.

    def get_nn(self, word):
        self.nn_process.sendline(word)
        self.nn_process.expect('Query word?')
        output = self.nn_process.before
        print(output)
        return [word] + [line.strip().split()[0] for line in output.strip().split('\n')[1:]]

In [25]:
nn_lookup = NNLookup(REVIEWS_MODEL_PATH)

In [26]:
nn_lookup.get_nn("trump")

b'? trump\r\ntrumps 0.713961\r\ntrumping 0.640264\r\ntrumph 0.601567\r\ntrump. 0.570519\r\nTrumps 0.549689\r\ndrumpf 0.529858\r\ndonald 0.523064\r\nivanka 0.521932\r\ntrumped 0.51556\r\nanti-trump 0.515227\r\n'


TypeError: a bytes-like object is required, not 'str'

Real Python Implementation https://github.com/facebookresearch/fastText/issues/384#issuecomment-399755873

In [35]:
import numpy as np

In [36]:
class FastTextNN:
    
    def __init__(self, ft_model, ft_matrix=None):
        self.ft_model = ft_model        
        self.ft_words = ft_model.get_words()
        self.word_frequencies = dict(zip(*ft_model.get_words(include_freq=True)))
        self.ft_matrix = ft_matrix
        if self.ft_matrix is None:
            self.ft_matrix = np.empty((len(self.ft_words), ft_model.get_dimension()))
            for i, word in enumerate(self.ft_words):
                self.ft_matrix[i,:] = ft_model.get_word_vector(word)
    
    def find_nearest_neighbor(self, query, vectors, n=10,  cossims=None):
        """
        query is a 1d numpy array corresponding to the vector to which you want to
        find the closest vector
        vectors is a 2d numpy array corresponding to the vectors you want to consider

        cossims is a 1d numpy array of size len(vectors), which can be passed for efficiency
        returns the index of the closest n matches to query within vectors and the cosine similarity (cosine the angle between the vectors)

        """
        if cossims is None:
            cossims = np.matmul(vectors, query, out=cossims)

        norms = np.sqrt((query**2).sum() * (vectors**2).sum(axis=1))
        cossims = cossims/norms
        result_i = np.argpartition(-cossims, range(n+1))[1:n+1]
        return list(zip(result_i, cossims[result_i]))

    def nearest_words(self, word, n=10, word_freq=None):
        result = self.find_nearest_neighbor(self.ft_model.get_word_vector(word), self.ft_matrix, n=n)
        if word_freq:
            return [(self.ft_words[r[0]], r[1]) for r in result if self.word_frequencies[self.ft_words[r[0]]] >= word_freq]
        else:
            return [(self.ft_words[r[0]], r[1]) for r in result]

In [37]:
model_path = "cc.en.300.bin"
ft_model = fasttext.load_model(model_path)
fasttext_nn = FastTextNN(ft_model)




In [56]:
fasttext_nn.nearest_words("thedressupmom")

[('CrônicasEsdrasNeemiasEsterJóSalmosProvérbiosEclesiastesCânticosIsaíasJeremiasLamentaçõesEzequielDanielOséiasJoelAmósObadiasJonasMiquéiasNaumHabacuqueSofoniasAgeuZacariasMalaquiasNovo',
  0.4190836671062928),
 ('DEky4M0BSpUOTPnSpkuL5I0GTSnRI4jMepcaFAoxIoFnX5kmJQk1aYvr2odGBAAIfkECQoABAAsCQAAABAAEgAACGcAARAYSLCgQQEABBokkFAhAQEQHQ4EMKCiQogRCVKsOOAiRocbLQ7EmJEhR4cfEWoUOTFhRIUNE44kGZOjSIQfG9rsyDCnzp0AaMYMyfNjS6JFZWpEKlDiUqALJ0KNatKmU4NDBwYEACH5BAkKAAQALAkAAAAQABIAAAhpAAEQGEiQIICDBAUgLEgAwICHAgkImBhxoMOHAyJOpGgQY8aBGxV2hJgwZMWLFTcCUIjwoEuLBym69PgxJMuDNAUqVDkz50qZLi',
  0.41522681783772486),
 ('DEky4M0BSpUOTPnSpkuL5I0GTSnRI4jMepcaFAoxIoFnX5kmJQk1aYvr2odGBAAIfkECQoABAAsCQAAABAAEgAACGcAARAYSLCgQQEABBokkFAhAQEQHQ4EMKCiQogRCVKsOOAiRocbLQ7EmJEhR4cfEWoUOTFhRIUNE44kGZOjSIQfG9rsyDCnzp0AaMYMyfNjS6JFZWpEKlDiUqALJ0KNatKmU4NDBwYEACH5BAUKAAQALAkAAAAQABIAAAhpAAEQGEiQIICDBAUgLEgAwICHAgkImBhxoMOHAyJOpGgQY8aBGxV2hJgwZMWLFTcCUIjwoEuLBym69PgxJMuDNAUqVDkz50qZLi',
  0.41358755668836666),
 ('ESTATERETAILCONSUMER