In [1]:
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath
from pprint import pprint as print

In [2]:
# Resolve the path of the training corpus (only one corpus file is used here)
corpus_file = datapath('lee_background.cor')

# Create an untrained FastText model; size=100 is the only non-default setting
model = FastText(size=100)

# First pass over the corpus: build the vocabulary
model.build_vocab(corpus_file=corpus_file)

# Second step: train on the same corpus file, taking the epoch count and the
# corpus statistics from attributes already stored on the model
model.train(
    corpus_file=corpus_file,
    total_examples=model.corpus_count,
    total_words=model.corpus_total_words,
    epochs=model.epochs,
)

print(model)

<gensim.models.fasttext.FastText object at 0x7f6805bc0910>


In [3]:
# saving a model trained via Gensim's fastText implementation
import os
import tempfile

# Reserve a unique file name, then close the handle before gensim writes to it:
# re-opening a NamedTemporaryFile by name while it is still open is not portable.
# delete=False keeps the file on disk, so it has to be removed explicitly below.
with tempfile.NamedTemporaryFile(prefix='saved_model_gensim-', delete=False) as tmp:
    saved_model_path = tmp.name

try:
    # NOTE(review): separately=[] presumably keeps everything in this one file,
    # so the single unlink below cleans up completely - confirm against gensim docs.
    model.save(saved_model_path, separately=[])

    loaded_model = FastText.load(saved_model_path)
    print(loaded_model)
finally:
    # Always remove the temporary file, even if saving or loading failed
    os.unlink(saved_model_path)

<gensim.models.fasttext.FastText object at 0x7f6805bc0430>


In [4]:
# Membership test against the trained vocabulary
night_in_vocab = 'night' in model.wv.vocab
print(night_in_vocab)

True


In [5]:
# Same membership test for the plural form
nights_in_vocab = 'nights' in model.wv.vocab
print(nights_in_vocab)

False


In [6]:
# Look the vector up through model.wv; indexing the model object directly
# is deprecated and emits a DeprecationWarning.
print(model.wv['night'])

array([ 0.09328159,  0.00293368, -0.5825525 ,  0.4804867 ,  0.60628265,
       -0.32440963, -0.1900126 , -0.04720529,  0.4296245 ,  0.33883592,
       -0.64138836, -0.01311536, -0.6803271 ,  0.41426086,  0.28293076,
       -0.0627832 , -0.19809733,  0.18880153,  0.24111313, -0.38077462,
       -0.24081866,  0.27846235, -0.3800002 ,  0.02428199, -0.8321537 ,
        0.73555607,  0.12400806,  0.18025942,  0.41238812,  0.01032294,
       -0.6513453 ,  0.23436972,  0.08221383, -0.48725444,  0.47087866,
        0.10562417, -0.16273548, -0.06727337,  0.4205416 ,  0.21674827,
       -0.01219405, -0.0714284 ,  0.39351365, -0.06149524,  0.12587579,
        0.18490611, -0.1297693 ,  0.21808794, -0.01785759, -0.3867093 ,
       -0.5744982 , -0.55261075,  0.06789429,  0.01309222,  0.40188548,
       -0.80584663, -0.10843118, -0.18335408,  0.0093728 , -0.03013261,
        0.21745446, -0.11712731, -0.49495208, -0.10418411, -0.5365404 ,
        0.34694734,  0.00929465,  0.15118392,  0.02868891,  0.47

  print(model['night'])


In [7]:
# 'nights' was reported as not in the vocabulary above, yet a vector is still
# returned. Use model.wv: indexing the model object directly is deprecated.
print(model.wv['nights'])

array([ 0.08200456,  0.00311313, -0.5067949 ,  0.41726106,  0.52656895,
       -0.28365764, -0.1659721 , -0.04026311,  0.3734566 ,  0.29544747,
       -0.56008923, -0.01283315, -0.59213215,  0.36180642,  0.24214585,
       -0.0543477 , -0.17448288,  0.16253269,  0.2081355 , -0.3317107 ,
       -0.20949489,  0.24317653, -0.33130634,  0.0217941 , -0.72467506,
        0.63971657,  0.107851  ,  0.15729423,  0.35894045,  0.00855084,
       -0.5645802 ,  0.20512997,  0.07021042, -0.4259228 ,  0.40937784,
        0.09158499, -0.1401106 , -0.0598051 ,  0.36575964,  0.18934657,
       -0.00970295, -0.06229048,  0.34186652, -0.05180154,  0.10844268,
        0.16112216, -0.11646864,  0.19153272, -0.01617219, -0.33500212,
       -0.5023316 , -0.47996855,  0.06021847,  0.01235424,  0.35026821,
       -0.7012407 , -0.09444979, -0.15837988,  0.00583872, -0.02591816,
        0.1887479 , -0.10276058, -0.43040594, -0.09123224, -0.46725813,
        0.30125707,  0.00819573,  0.1314019 ,  0.02672475,  0.41

  print(model['nights'])


# Similarity operations

Similarity operations work the same way as in Word2Vec. **Out-of-vocabulary words can also be used, provided they have at least one character n-gram present in the training data.**

In [8]:
# Similarity between the two words, called on model.wv because the
# model-level similarity() wrapper is deprecated.
print(model.wv.similarity("night", "nights"))

0.99999285


  print(model.similarity("night", "nights"))


In [9]:
# Nearest neighbours of 'nights', called on model.wv because the
# model-level most_similar() wrapper is deprecated.
print(model.wv.most_similar("nights"))

  print(model.most_similar("nights"))


[('study', 0.998267650604248),
 ('Arafat', 0.9982653856277466),
 ('boat', 0.9982650876045227),
 ('often', 0.9982650876045227),
 ('"That', 0.9982621669769287),
 ('Arafat,', 0.9982602596282959),
 ('Endeavour', 0.9982555508613586),
 ('north.', 0.998253583908081),
 ('details', 0.9982484579086304),
 ('stage', 0.9982419013977051)]


In [10]:
# Similarity between two sets of words, called on model.wv because the
# model-level n_similarity() wrapper is deprecated.
print(model.wv.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']))

0.9999513


  print(model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']))


Syntactically similar words generally have high similarity in fastText models, since a large number of the component char-ngrams will be the same. As a result, fastText generally does better at syntactic tasks than Word2Vec. A detailed comparison is provided [here](https://radimrehurek.com/gensim/auto_examples/tutorials/Word2Vec_FastText_Comparison.ipynb).

## Other similarity operations

The example training corpus is a toy corpus, so the results are not expected to be good; the examples below are for proof of concept only.

In [11]:
# Pick the odd one out of the four words, called on model.wv because the
# model-level doesnt_match() wrapper is deprecated.
print(model.wv.doesnt_match("breakfast cereal dinner lunch".split()))

'breakfast'


  print(model.doesnt_match("breakfast cereal dinner lunch".split()))
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [12]:
# Evaluate the model on the analogy questions file resolved via datapath().
# Called on model.wv because the model-level accuracy() wrapper is deprecated.
# NOTE(review): newer gensim releases appear to favour evaluate_word_analogies(),
# which returns a different structure - confirm before switching.
print(model.wv.accuracy(questions=datapath('questions-words.txt')))

[{'correct': [], 'incorrect': [], 'section': 'capital-common-countries'},
 {'correct': [], 'incorrect': [], 'section': 'capital-world'},
 {'correct': [], 'incorrect': [], 'section': 'currency'},
 {'correct': [], 'incorrect': [], 'section': 'city-in-state'},
 {'correct': [],
  'incorrect': [('HE', 'SHE', 'HIS', 'HER'), ('HIS', 'HER', 'HE', 'SHE')],
  'section': 'family'},
 {'correct': [], 'incorrect': [], 'section': 'gram1-adjective-to-adverb'},
 {'correct': [], 'incorrect': [], 'section': 'gram2-opposite'},
 {'correct': [('GOOD', 'BETTER', 'GREAT', 'GREATER'),
              ('GREAT', 'GREATER', 'LOW', 'LOWER'),
              ('LONG', 'LONGER', 'GREAT', 'GREATER')],
  'incorrect': [('GOOD', 'BETTER', 'LONG', 'LONGER'),
                ('GOOD', 'BETTER', 'LOW', 'LOWER'),
                ('GREAT', 'GREATER', 'LONG', 'LONGER'),
                ('GREAT', 'GREATER', 'GOOD', 'BETTER'),
                ('LONG', 'LONGER', 'LOW', 'LOWER'),
                ('LONG', 'LONGER', 'GOOD', 'BETTER'),
  

  print(model.accuracy(questions=datapath('questions-words.txt')))


# Word Mover's Distance

Let’s start with two sentences:

In [13]:
# Lower-case each sentence and split it on whitespace into a list of tokens
sentence_obama, sentence_president = (
    raw_sentence.lower().split()
    for raw_sentence in (
        'Obama speaks to the media in Illinois',
        'The president greets the press in Chicago',
    )
)

Remove their stopwords.

In [14]:
from nltk.corpus import stopwords

# Store the English stopword list under its own name instead of rebinding the
# imported `stopwords` object, and use a set so each membership test below is
# a hash lookup rather than a scan of the whole list.
# NOTE(review): this presumably requires the NLTK stopwords corpus to be
# available locally (e.g. via nltk.download('stopwords')) - confirm.
english_stopwords = set(stopwords.words('english'))

# Drop every token that is an English stopword
sentence_obama = [w for w in sentence_obama if w not in english_stopwords]
sentence_president = [w for w in sentence_president if w not in english_stopwords]

Compute the Word Mover's Distance (WMD) between the two sentences.

In [16]:
# Word Mover's Distance between the two token lists, called on model.wv
# because the model-level wmdistance() wrapper is deprecated.
distance = model.wv.wmdistance(sentence_obama, sentence_president)
print(distance)

1.3940237954608796


  distance = model.wmdistance(sentence_obama, sentence_president)
