In [None]:
#
# Word2vec
#

In [None]:
# full docs available at
# https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html
# Gensim has a gensim.downloader module for programmatically accessing data.
# This module leverages a local cache (in user’s home folder, by default)
# that ensures data is downloaded at most once.
#
import logging
import gensim.downloader as api

# Emit timestamped, INFO-level log records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Download the text8 corpus (the downloader keeps a local cache, so the data
# is fetched at most once) and load it as an object that supports streamed access.
corpus = api.load('text8')

In [None]:
# The loaded corpus is an iterable object.
# Print the source code of its class to see how it is defined.
import inspect

corpus_class = corpus.__class__
print(inspect.getsource(corpus_class))

In [None]:
# For more details, look in the file that defines the Dataset class
# for this particular resource; its path is printed below.

dataset_source_file = inspect.getfile(corpus.__class__)
print(dataset_source_file)

In [None]:
# Now that the corpus has been downloaded and loaded,
# use it to TRAIN a word2vec model.

from gensim.models.word2vec import Word2Vec

# The corpus is handed over as the `sentences` argument.
model = Word2Vec(sentences=corpus)

In [None]:
# With a word2vec model available, query its word vectors (model.wv)
# for the terms most similar to a given word.

house_neighbours = model.wv.most_similar('house')
print(house_neighbours)

In [None]:
# The downloader API can also list every resource (corpus or model)
# available in gensim-data. Dump the whole catalogue as indented JSON.
import json

info = api.info()
catalogue_json = json.dumps(info, indent=4)
print(catalogue_json)

In [None]:
# Data resources come in two types: corpora and models.
# The top-level keys of the catalogue show them.

resource_types = info.keys()
print(resource_types)

In [None]:
# Print one summary line per available corpus, ordered by name.
for corpus_name, corpus_data in sorted(info['corpora'].items()):
    # -1 stands in when the entry carries no 'num_records' field.
    corpus_records = corpus_data.get('num_records', -1)
    # Show only the first 40 characters of the description.
    corpus_blurb = corpus_data['description'][:40] + '...'
    print('%s (%d records): %s' % (corpus_name, corpus_records, corpus_blurb))

In [None]:
# Print one summary line per available model, ordered by name.

for model_name, model_data in sorted(info['models'].items()):
    # -1 stands in when the entry carries no 'num_records' field.
    model_records = model_data.get('num_records', -1)
    # Show only the first 40 characters of the description.
    model_blurb = model_data['description'][:40] + '...'
    print('%s (%d records): %s' % (model_name, model_records, model_blurb))


In [None]:
# For detailed information about a single model or corpus,
# pass its name to api.info():

fake_news_info = api.info('fake-news')
fake_news_json = json.dumps(fake_news_info, indent=4)
print(fake_news_json)

In [None]:
# Often you do not want to load a model into memory.
# With return_path=True, api.load() gives back just the
# filesystem path to the model instead.

glove_path = api.load('glove-wiki-gigaword-50', return_path=True)
print(glove_path)

In [None]:
# To load the model itself into memory, call api.load() without return_path.
# NOTE(review): this rebinds `model`, which earlier held the trained Word2Vec
# instance; the earlier `model.wv...` cell may not work if re-run after this one.

model = api.load("glove-wiki-gigaword-50")

# Keep the query result as the final expression so the notebook displays it.
teacher_neighbours = model.most_similar("teacher")
teacher_neighbours