# Creating Word Vectors with word2vec

In [1]:
import gensim
import nltk
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
from gensim.models.word2vec import Word2Vec
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import gutenberg
from sklearn.manifold import TSNE

In [2]:
# Fetch the Punkt data for the English-language sentence tokenizer
# (not all periods end sentences; not all sentences start with a capital letter).
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead -- confirm
# against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Fetch NLTK's Project Gutenberg corpus package used throughout this notebook
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [4]:
# Number of texts bundled in the Gutenberg corpus.
# (`gutenberg` is imported with the other dependencies in the first cell; NLTK corpus
# objects are loaded lazily, so importing before the download above is safe.)
len(gutenberg.fileids())

18

In [5]:
# List the identifier of every text in the corpus
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

### Tokenization of text

In [6]:
# Split the raw corpus text into a list of sentence strings
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [7]:
# Peek at the first six sentence strings
gberg_sent_tokens[:6]

['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
 "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.",
 'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.',
 "Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.",
 'Between _them_ it was more the intimacy\nof sisters.',
 "Even before Miss Taylor had ceased to hold the nominal

In [8]:
# A single sentence: still one string, with the original line breaks embedded
gberg_sent_tokens[1]

"She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."

In [9]:
# Break that sentence into word and punctuation tokens
word_tokenize(gberg_sent_tokens[1])

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [10]:
# Individual tokens are addressable by position within the tokenized sentence
word_tokenize(gberg_sent_tokens[1])[14]

'father'

In [11]:
# a convenient method that handles newlines, as well as tokenizing sentences and words in one shot:
# each sentence comes back as a list of tokens. Its tokenization is not identical to
# word_tokenize() -- e.g. the possessive is split into "'" and "s" rather than kept as "'s".
gberg_sents = gutenberg.sents()

In [13]:
# Peek at the first six tokenized sentences
gberg_sents[:6]

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'],
 ['VOLUME', 'I'],
 ['CHAPTER', 'I'],
 ['Emma',
  'Woodhouse',
  ',',
  'handsome',
  ',',
  'clever',
  ',',
  'and',
  'rich',
  ',',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  ',',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  ';',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  '-',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her',
  '.'],
 ['She',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  ',',
  'indulgent',
  'father',
  ';',
  'and',
  'had',
  ',',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  "'",
  's',
  'marriage',
  ',',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period',
  '.'],
 ['Her',
  'mother',
  'h

In [14]:
# Same sentence as gberg_sent_tokens[1]; the index differs because sents() returns the
# title line and the VOLUME / CHAPTER headings as separate sentences
gberg_sents[4]

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [15]:
# Sentence index first, then token index within that sentence
gberg_sents[4][14]

'father'

In [16]:
# another convenient method that we don't immediately need; it exposes the corpus
# as one flat sequence of tokens rather than as sentences:
gutenberg.words()

# gutenberg.words() is analogous to the following line, which need not be run:
# word_tokenize(gutenberg.raw())

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [17]:
# our Gutenberg corpus is ~2.6 million tokens in length
# (the count includes punctuation tokens such as '[' and ']'):
len(gutenberg.words())

2621613

### Running word2vec

In [18]:
# Train word embeddings on the tokenized Gutenberg sentences:
#   size=64      -> dimensionality of each word vector
#   sg=1         -> skip-gram architecture (0 would select CBOW)
#   window=10    -> maximum distance between a target word and its context words
#   min_count=5  -> words occurring fewer than 5 times are left out of the vocabulary
#   workers=8    -> number of training threads
# NOTE(review): no seed is passed and training is multi-threaded, so similarity scores
# will presumably differ from run to run -- confirm if reproducibility matters.
# NOTE(review): `size` is the gensim 3.x spelling; gensim 4 renamed it to `vector_size`.
model = Word2Vec(sentences=gberg_sents, size=64, sg=1, window=10, min_count=5, workers=8)
# Persist the trained model so it can be reloaded later without retraining
model.save('raw_gutenberg_model.w2v')


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Evaluate

In [19]:
# reload the saved model from disk, which lets you skip the training cell:
model = Word2Vec.load('raw_gutenberg_model.w2v')



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [20]:
# Look up the learned vector for a single word
model.wv['dog']

array([ 3.8860184e-01, -1.3514456e-01,  9.3909360e-02,  2.9134524e-01,
        3.4199722e-02,  9.7620912e-02, -6.0048245e-02, -1.7240748e-01,
       -5.4713517e-01,  5.1955181e-01, -5.3144950e-01,  2.6515886e-02,
       -2.3254623e-01,  1.5099242e-01,  1.5213323e-01,  1.8493994e-01,
       -1.3945773e-01, -3.5197276e-01, -1.3518563e-01, -5.8792078e-01,
        1.7958093e-01, -3.5033777e-01,  2.8343424e-01, -1.3415277e-01,
       -3.2894412e-01, -2.0980074e-01,  2.1197915e-01, -1.4676289e-01,
       -3.7010437e-01, -4.8135701e-01, -3.9316875e-01, -2.1858032e-01,
       -2.5742960e-01,  2.9314622e-02,  1.0151608e-01, -8.2955301e-01,
       -3.3397490e-01,  7.5473614e-02, -1.6446263e-01,  1.8014897e-01,
        9.7646676e-02, -3.8634649e-01, -3.6173075e-01,  1.1698970e-01,
       -3.7834194e-01, -1.4834623e-01, -1.2280551e-01,  9.6468106e-03,
        5.4638531e-02,  5.8814698e-01, -1.7105897e-01,  1.5666448e-01,
       -3.8211527e-03, -4.8711514e-05, -6.3591234e-02, -3.2564472e-02,
      

In [21]:
# The vector length matches the size=64 requested at training time
len(model.wv['dog'])

64

In [22]:
# Nearest neighbours of 'dog' as (word, score) pairs; the score is a cosine
# similarity, so higher means closer (it is not a distance)
model.wv.most_similar('dog')

  if np.issubdtype(vec.dtype, np.int):


[('puppy', 0.8399801254272461),
 ('chimney', 0.7804919481277466),
 ('boy', 0.7711700797080994),
 ('broth', 0.7704296112060547),
 ('sweeper', 0.7677549123764038),
 ('kick', 0.7673530578613281),
 ('thief', 0.7639141082763672),
 ('cage', 0.7617539167404175),
 ('whip', 0.7564784288406372),
 ('cow', 0.7515280842781067)]

In [23]:
# Nearest neighbours of a verb
model.wv.most_similar('think')

  if np.issubdtype(vec.dtype, np.int):


[('manage', 0.8418099880218506),
 ('contradict', 0.8380868434906006),
 ('suppose', 0.836441159248352),
 ('believe', 0.8199832439422607),
 ('behave', 0.8155148029327393),
 ('know', 0.8114057779312134),
 ('Mamma', 0.8086732625961304),
 ('really', 0.8075456023216248),
 ('mean', 0.8071227073669434),
 ('guess', 0.8015608787536621)]

In [24]:
# Nearest neighbours of a time-related noun
model.wv.most_similar('day')

  if np.issubdtype(vec.dtype, np.int):


[('morning', 0.778294026851654),
 ('time', 0.7724676728248596),
 ('night', 0.7368128895759583),
 ('month', 0.72005295753479),
 ('week', 0.693356990814209),
 ('Saturday', 0.6790335774421692),
 ('sabbath', 0.675177812576294),
 ('evening', 0.6677111387252808),
 ('Adar', 0.6607083082199097),
 ('fourteenth', 0.6517543792724609)]

In [25]:
# Nearest neighbours of a kinship term
model.wv.most_similar('father')

  if np.issubdtype(vec.dtype, np.int):


[('mother', 0.8628281950950623),
 ('brother', 0.8480645418167114),
 ('sister', 0.8077663779258728),
 ('daughter', 0.7768992185592651),
 ('wife', 0.7691270112991333),
 ('Amnon', 0.7391806244850159),
 ('uncle', 0.7216702103614807),
 ('younger', 0.7145200967788696),
 ('servant', 0.713840126991272),
 ('Tamar', 0.709930419921875)]

In [26]:
# Ask the model which of the four words fits least well with the others
model.wv.doesnt_match(['mother', 'father', 'daughter', 'dog'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'dog'

In [27]:
# Cosine similarity between two specific words
model.wv.similarity('father', 'dog')

  if np.issubdtype(vec.dtype, np.int):


0.46994475

In [28]:
# analogy arithmetic: father - man + woman = ?
# In the run shown below 'mother' ranks first, narrowly ahead of other family terms.
# NOTE(review): the earlier remark "close, but not quite; distinctly in female direction"
# presumably described a different training run -- rankings can shift between runs.
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

  if np.issubdtype(vec.dtype, np.int):


[('mother', 0.7833986282348633),
 ('daughter', 0.7813690900802612),
 ('husband', 0.7790541648864746),
 ('sister', 0.7786932587623596),
 ('wife', 0.7725012898445129),
 ('brother', 0.7363234758377075),
 ('daughters', 0.7005206346511841),
 ('Rachel', 0.6896263360977173),
 ('Sarai', 0.6845165491104126),
 ('Sarah', 0.6740219593048096)]

In [29]:
# son - man + woman = ?  more confident about this one ('daughter' ranks first):
model.wv.most_similar(positive=['son', 'woman'], negative=['man'])

  if np.issubdtype(vec.dtype, np.int):


[('daughter', 0.7486582398414612),
 ('wife', 0.7452919483184814),
 ('Sarai', 0.7249553203582764),
 ('Leah', 0.718904972076416),
 ('Sarah', 0.7184145450592041),
 ('Hagar', 0.7135589718818665),
 ('Bethuel', 0.7135344743728638),
 ('Abram', 0.7004900574684143),
 ('Jephunneh', 0.6973567605018616),
 ('Bilhah', 0.6933270692825317)]

In [30]:
# husband - man + woman = ?  ('wife' ranks first in the run shown)
model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

  if np.issubdtype(vec.dtype, np.int):


[('wife', 0.7276437282562256),
 ('sister', 0.6854567527770996),
 ('daughter', 0.6776511669158936),
 ('mother', 0.6615833044052124),
 ('widow', 0.6540002822875977),
 ('child', 0.6526877880096436),
 ('conceived', 0.6513186693191528),
 ('maid', 0.6462736129760742),
 ('Rachel', 0.619889497756958),
 ('nurse', 0.6193939447402954)]

In [31]:
# king - man + woman = ?  topn is widened to 30 results: in the run shown, 'queen'
# only appears at rank 23, behind many biblical proper names
model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=30)

  if np.issubdtype(vec.dtype, np.int):


[('Sarah', 0.7277158498764038),
 ('Rachel', 0.7238438725471497),
 ('Abram', 0.7036274671554565),
 ('Leah', 0.692919135093689),
 ('Sarai', 0.6917165517807007),
 ('Babylon', 0.687897264957428),
 ('David', 0.6863441467285156),
 ('Judah', 0.6834956407546997),
 ('Bethuel', 0.6801999807357788),
 ('daughter', 0.6733150482177734),
 ('Hagar', 0.6710746884346008),
 ('Padanaram', 0.6705467700958252),
 ('Laban', 0.6703552007675171),
 ('Pharaoh', 0.659987211227417),
 ('Jerusalem', 0.6561363935470581),
 ('Rebekah', 0.6541107892990112),
 ('Rahab', 0.6522243022918701),
 ('Bilhah', 0.6510846614837646),
 ('magicians', 0.6491096615791321),
 ('Ephron', 0.6475814580917358),
 ('Hanun', 0.6469735503196716),
 ('tribute', 0.6447017192840576),
 ('queen', 0.6439679265022278),
 ('Solomon', 0.6430158615112305),
 ('Lot', 0.6419060230255127),
 ('Onan', 0.6415507197380066),
 ('damsel', 0.6407134532928467),
 ('Jerubbaal', 0.6405037641525269),
 ('Zilpah', 0.6399076581001282),
 ('Cain', 0.6363469362258911)]

### Reduce word vector dimensionality with t-SNE

In [32]:
# Number of distinct tokens the model kept (training used min_count=5)
len(model.wv.vocab)

17011

In [33]:
# Stack the vector of every vocabulary token into one matrix; rows follow the
# iteration order of model.wv.vocab, so tokens can be matched back by position.
X = model.wv[model.wv.vocab]
# n_iter: scikit-learn asks for at least 250 iterations; 1000 is its default.
# random_state pins t-SNE's stochastic optimisation so that the 2-D layout is
# repeatable for a given set of input vectors.
tsne = TSNE(n_components=2, n_iter=1000, random_state=42)
X_2d = tsne.fit_transform(X)

In [34]:
# First five 2-D coordinates produced by t-SNE
X_2d[:5]

array([[ 26.159676 ,  48.444515 ],
       [ 61.802086 ,  -5.59553  ],
       [ 12.030325 , -30.90867  ],
       [ 62.730694 ,  -7.9478903],
       [ 26.197147 ,  48.381195 ]], dtype=float32)

In [35]:
# gather the t-SNE coordinates in a DataFrame for storage and plotting,
# then label each row with its token (same order as the rows of X_2d)
coords_df = pd.DataFrame({'x': X_2d[:, 0], 'y': X_2d[:, 1]})
coords_df['token'] = model.wv.vocab.keys()

In [36]:
# Sanity-check the first few rows: two coordinates plus the token they belong to
coords_df.head()

Unnamed: 0,x,y,token
0,26.159676,48.444515,[
1,61.802086,-5.59553,Emma
2,12.030325,-30.90867,by
3,62.730694,-7.94789,Jane
4,26.197147,48.381195,]


In [37]:
# Save the coordinates so the t-SNE step need not be re-run;
# index=False leaves the row index out of the file
coords_df.to_csv('raw_gutenberg_tsne.csv', index=False)

### Visualize 2D Representation

In [38]:
# Reload the saved coordinates. keep_default_na=False makes read_csv keep every token
# as the literal string that was written: with the defaults, any vocabulary entry that
# happens to match one of pandas' NA markers (e.g. 'null', 'nan', 'NA') would be
# converted to a missing value (NaN) and lose its label.
coords_df = pd.read_csv('raw_gutenberg_tsne.csv', keep_default_na=False)

In [39]:
# Configure Bokeh to render plots inline in the notebook
output_notebook()

In [43]:
# Draw a random sample of 200 tokens to plot; random_state is fixed so that the same
# tokens are selected every time the notebook is run.
subset_df = coords_df.sample(n=200, random_state=42)

In [44]:
# Create an 800 x 800 figure and draw each sampled token as a text label at its
# t-SNE coordinates.
# NOTE(review): plot_width / plot_height are the older Bokeh spellings; newer releases
# use width / height -- confirm against the installed version.
p = figure(plot_width=800, plot_height=800)
# assigning to `_` keeps the call's return value from being echoed as cell output
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)

In [45]:
# Render the figure
show(p)