https://medium.com/cityai/deep-learning-for-natural-language-processing-part-i-8369895ffb98

In [1]:
import nltk
import pandas as pd
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
from bokeh.io import output_notebook
from bokeh.plotting import show, figure

### load model and data

In [2]:
# Fetch the NLTK resources this notebook relies on:
#   'punkt'     - the punctuation and tokenizer module
#   'gutenberg' - the Gutenberg dataset, a set of 18 books to train on
for resource in ('punkt', 'gutenberg'):
    nltk.download(resource)
from nltk.corpus import gutenberg

[nltk_data] Downloading package punkt to /home/huachen/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/huachen/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


Take a look at the books inside the Gutenberg corpus:

In [3]:
# File ids of every book bundled with the Gutenberg corpus.
corpus_file_ids = gutenberg.fileids()
print(corpus_file_ids)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


### load sentences

In [4]:
# Only five of the 18 Gutenberg books are loaded here to keep resource
# usage low. With more resources available, omit the `fileids` argument
# and all 18 books will be loaded.
selected_fileids = [
    'bible-kjv.txt',
    'austen-emma.txt',
    'austen-persuasion.txt',
    'austen-sense.txt',
    'carroll-alice.txt',
]
gberg_sents = gutenberg.sents(fileids=selected_fileids)

How many sentences are there in the set we loaded:

In [6]:
# Number of sentences in the subset loaded above. Reusing `gberg_sents`
# (instead of repeating the list of file ids) guarantees that the count
# always describes exactly the data the model is trained on.
print(len(gberg_sents))

48304


### hardcore stuff
Run the word2vec model.

In [13]:
# Train a word2vec model on the sentences loaded above.
# vector dimensionality is NOT passed here, so gensim's default (100)
#   applies, not 64. Pass vector_size=64 (gensim >= 4; `size=64` on
#   gensim 3.x) if 64-dimensional vectors are wanted.
# sg = 1, use Skip-Gram. If zero, it will use CBOW
# window = 10, context words (10 to the left and 10 to the right)
# min_count = 5, ignore words with frequency lower than that
# seed = 42, seed for the random number generator. NOTE: per the gensim
#   docs the seed alone does not make a multi-worker run reproducible;
#   that also needs workers=1 (and a fixed PYTHONHASHSEED).
# workers = 2, number of worker threads.
model = Word2Vec(sentences=gberg_sents,  sg=1,
                 window=10, min_count=5, seed=42,
                 workers=2)
# Shows the coordinates of the word 'house' in the vector space.
print(model.wv['house'])
# Nearest neighbours in the vector space for a few probe words.
for probe_word in ('house', 'day', 'father'):
    print(model.wv.most_similar(probe_word))
# The word that fits least well with the rest of the list.
print(model.wv.doesnt_match('mother father daughter house'.split()))

[ 0.26663274 -0.21826315 -0.1348525  -0.4016196  -0.52962965  0.03369028
  0.09285289 -0.01911975 -0.06512941 -0.12755392  0.21094711  0.04094973
 -0.0474777  -0.09716108 -0.03971877 -0.01073876 -0.15421328 -0.3395195
 -0.08078054 -0.12460786 -0.1376849  -0.07534534  0.21138014 -0.3128426
 -0.0471597  -0.13127126 -0.11695661  0.32264823 -0.32070845 -0.23523596
 -0.46685365  0.02009837 -0.01898701  0.1418586  -0.21016406 -0.05596342
 -0.3043874  -0.16800652 -0.20094025  0.2239185  -0.06167073 -0.14520763
  0.14730707 -0.06095502  0.42102984  0.08698533 -0.4569305   0.13173863
  0.10498156 -0.32804     0.03641748  0.16695778 -0.03016772 -0.07994103
  0.37708336  0.4199998  -0.06383915  0.39205605  0.1604757   0.39924726
 -0.08539315 -0.06614196 -0.7299913  -0.33521625 -0.1659048   0.11154507
  0.3070444  -0.40802354  0.3381951  -0.15096235  0.1470864   0.07025782
  0.41416436  0.23269054  0.15718628 -0.19619706  0.2164799  -0.83695626
  0.08877262 -0.01090841  0.2640706  -0.07118145 -0.2