# Creating Word Vectors with word2vec

In [1]:
import os

import gensim
import nltk
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
from gensim.models.word2vec import Word2Vec
from nltk import word_tokenize, sent_tokenize
from sklearn.manifold import TSNE

In [2]:
# 'punkt' is the English-language sentence tokenizer used by sent_tokenize below.
# A trained tokenizer is needed because not all periods end sentences and
# not all sentences start with a capital letter.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kkomb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [3]:
# Fetch the NLTK 'gutenberg' corpus, the collection of texts the word vectors are trained on.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\kkomb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


True

In [4]:
from nltk.corpus import gutenberg

# Number of individual texts (file ids) in the corpus.
len(gutenberg.fileids())

18

In [5]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

### Tokenization of text

In [6]:
# Split the raw corpus text into a list of sentence strings.
# Each sentence is still one untokenized string and keeps its embedded '\n' characters.
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [7]:
gberg_sent_tokens[0:6]

['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
 "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.",
 'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.',
 "Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.",
 'Between _them_ it was more the intimacy\nof sisters.',
 "Even before Miss Taylor had ceased to hold the nominal

In [8]:
gberg_sent_tokens[1]

"She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."

In [9]:
word_tokenize(gberg_sent_tokens[1])

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [10]:
word_tokenize(gberg_sent_tokens[1])[14]

'father'

In [11]:
# A convenient corpus method that handles newlines and tokenizes sentences and words in one shot.
# It does not give exactly the same result as sent_tokenize + word_tokenize (see the outputs):
#   - headings such as 'VOLUME I' become their own sentences, so the "She was the youngest..."
#     sentence is index 4 here but index 1 in gberg_sent_tokens;
#   - the possessive is split into "'" and "s" instead of the single token "'s".
gberg_sents = gutenberg.sents()

In [12]:
gberg_sents[0:6]

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'],
 ['VOLUME', 'I'],
 ['CHAPTER', 'I'],
 ['Emma',
  'Woodhouse',
  ',',
  'handsome',
  ',',
  'clever',
  ',',
  'and',
  'rich',
  ',',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  ',',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  ';',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  '-',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her',
  '.'],
 ['She',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  ',',
  'indulgent',
  'father',
  ';',
  'and',
  'had',
  ',',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  "'",
  's',
  'marriage',
  ',',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period',
  '.'],
 ['Her',
  'mother',
  'h

In [13]:
gberg_sents[4]

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [14]:
gberg_sents[4][14]

'father'

In [15]:
# another convenient method that we don't immediately need: 
gutenberg.words()

# gutenberg.words() is analogous to the following line, which need not be run: 
# word_tokenize(gutenberg.raw())

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [16]:
# our Gutenberg corpus is 2.6m words in length: 
len(gutenberg.words())

2621613

### Running word2vec

In [17]:
# Train a skip-gram word2vec model on the tokenized Gutenberg sentences.
#   size=64      dimensionality of each word vector (gensim >= 4.0 renames this to `vector_size`)
#   sg=1         skip-gram architecture; 0 would select continuous bag of words
#   window=10    context window size
#   min_count=5  minimum number of times a word must occur across the corpus
#                in order to be fitted into the vector space
#   workers=8    number of processing cores used for training
#   iter         left at its default of 5
model = Word2Vec(sentences=gberg_sents, size=64, sg=1, window=10, min_count=5, workers=8)

# Make sure the output folder exists so the save cannot fail on a fresh checkout.
os.makedirs('model_output', exist_ok=True)
model.save('model_output/raw_gutenberg_model.w2v')


### Evaluate

In [18]:
# Reload the saved model from disk, so the notebook can be resumed from this
# point without re-running the training cell above.
model = Word2Vec.load('model_output/raw_gutenberg_model.w2v')



In [19]:
model.wv['dog']

array([ 0.4226558 ,  0.02448606, -0.41807085,  0.26955914,  0.10553136,
        0.25590342, -0.12657648, -0.02339899,  0.29244262,  0.0510579 ,
        0.24887173,  0.09414656, -0.12491327, -0.27467686, -0.093724  ,
       -0.33949256,  0.07559001,  0.43766394,  0.47325373,  0.02063425,
       -0.20066582,  0.07053802, -0.09124026,  0.30558187,  0.6162438 ,
       -0.15669328,  0.04338922,  0.57369167, -0.32228184,  0.4124832 ,
       -0.1545553 , -0.1358651 ,  0.7119419 , -0.5747579 , -0.26627415,
       -0.22414848,  0.013143  ,  0.2873723 , -0.00465772, -0.15452005,
        0.32250834,  0.12563957, -0.08129893, -0.08432494, -0.08771995,
       -0.1944579 , -0.07339533,  0.24133126,  0.04558555, -0.3826227 ,
        0.2764877 , -0.36965927,  0.00703222,  0.04124278, -0.17772743,
        0.49824077, -0.04227719, -0.27596867, -0.4161617 ,  0.02790228,
       -0.22703338,  0.76119006,  0.45782053,  0.17657396], dtype=float32)

In [20]:
len(model.wv['dog'])

64

In [21]:
model.wv.most_similar('dog') # returns (word, cosine similarity) pairs: higher score = more similar (a similarity, not a distance)

[('puppy', 0.8123878836631775),
 ('chimney', 0.7941816449165344),
 ('sweeper', 0.7764820456504822),
 ('cage', 0.7646546363830566),
 ('thief', 0.7599992752075195),
 ('broth', 0.7570700645446777),
 ('boy', 0.7502454519271851),
 ('whip', 0.7441377639770508),
 ('gallon', 0.7417463064193726),
 ('wid', 0.740312933921814)]

In [22]:
model.wv.most_similar('think')

[('contradict', 0.8400367498397827),
 ('suppose', 0.8337477445602417),
 ('manage', 0.829731822013855),
 ('Mamma', 0.8158707022666931),
 ('really', 0.8141055703163147),
 ('believe', 0.8100804686546326),
 ('behave', 0.8042991161346436),
 ('guess', 0.7992029786109924),
 ('happen', 0.7978112697601318),
 ('NOW', 0.796760618686676)]

In [23]:
model.wv.most_similar('day')

[('morning', 0.7836750745773315),
 ('time', 0.7652457356452942),
 ('night', 0.7573159337043762),
 ('month', 0.7365054488182068),
 ('week', 0.7267901301383972),
 ('evening', 0.6919419765472412),
 ('sabbath', 0.6876345276832581),
 ('Saturday', 0.6808834075927734),
 ('Adar', 0.6795975565910339),
 ('afternoon', 0.675356388092041)]

In [24]:
model.wv.most_similar('father')

[('mother', 0.8632395267486572),
 ('brother', 0.8439583778381348),
 ('sister', 0.8127918839454651),
 ('wife', 0.7884070873260498),
 ('daughter', 0.7847610116004944),
 ('Amnon', 0.7597274780273438),
 ('servant', 0.7320384979248047),
 ('younger', 0.7281097769737244),
 ('David', 0.7275477647781372),
 ('uncle', 0.726576566696167)]

In [25]:
model.wv.doesnt_match("mother father daughter dog".split())

'dog'

In [26]:
model.wv.similarity('father', 'dog')

0.4857121

In [27]:
# close, but not quite; distinctly in female direction: 
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

[('daughter', 0.7914818525314331),
 ('mother', 0.7896489500999451),
 ('sister', 0.7877634763717651),
 ('husband', 0.7875680923461914),
 ('wife', 0.7837554812431335),
 ('brother', 0.7397383451461792),
 ('Rachel', 0.7329663634300232),
 ('Sarai', 0.7137974500656128),
 ('daughters', 0.7071865797042847),
 ('Sarah', 0.6965440511703491)]

In [28]:
# more confident about this one: 
model.wv.most_similar(positive=['son', 'woman'], negative=['man'])

[('daughter', 0.7628356218338013),
 ('Leah', 0.7462682723999023),
 ('Sarai', 0.7392925024032593),
 ('wife', 0.7346237897872925),
 ('Rachel', 0.7345350384712219),
 ('Hagar', 0.7269033193588257),
 ('Bethuel', 0.7259371876716614),
 ('Sarah', 0.7173603773117065),
 ('Bilhah', 0.7109167575836182),
 ('Abram', 0.7043097019195557)]

In [29]:
model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

[('wife', 0.731613278388977),
 ('sister', 0.7058153748512268),
 ('conceived', 0.6886899471282959),
 ('daughter', 0.6862990856170654),
 ('mother', 0.6789271831512451),
 ('child', 0.675650954246521),
 ('maid', 0.6623833775520325),
 ('Rachel', 0.6582338809967041),
 ('widow', 0.6521735191345215),
 ('whoredoms', 0.6426436901092529)]

In [30]:
# king - man + woman: on this corpus 'queen' is not among the ten nearest words,
# so ask for the top 30 (it appears at rank 17 in the recorded output below).
model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=30)

[('Sarah', 0.7233039140701294),
 ('Rachel', 0.7136567234992981),
 ('Sarai', 0.6866680383682251),
 ('Leah', 0.6775226593017578),
 ('Abram', 0.6749584078788757),
 ('Padanaram', 0.6722350120544434),
 ('daughter', 0.6673407554626465),
 ('Laban', 0.6667613983154297),
 ('Bethuel', 0.666467547416687),
 ('Cain', 0.6645205020904541),
 ('Hagar', 0.6643502712249756),
 ('Babylon', 0.6641674637794495),
 ('Hanun', 0.6588125228881836),
 ('Judah', 0.6540241241455078),
 ('tribute', 0.6522761583328247),
 ('David', 0.6496819257736206),
 ('queen', 0.6477495431900024),
 ('household', 0.6465251445770264),
 ('Rahab', 0.6429450511932373),
 ('Solomon', 0.6417278051376343),
 ('Onan', 0.6369689702987671),
 ('Bilhah', 0.6316032409667969),
 ('Pharaoh', 0.6302520036697388),
 ('Vashti', 0.6297912001609802),
 ('Rebekah', 0.6291931867599487),
 ('Esther', 0.6262664198875427),
 ('Hamor', 0.625645637512207),
 ('magicians', 0.6234855651855469),
 ('captivity', 0.6212238669395447),
 ('Zilpah', 0.6205405592918396)]

### Reduce word vector dimensionality with t-SNE
- Collapses the 64-dimensional space down to 2 dimensions

In [43]:
# Vocabulary size of the trained model, i.e. the words that passed the min_count=5 cut-off.
print('words that got through:', len(model.wv.vocab))

words that got through: 17011


In [32]:
# Stack one word vector per vocabulary entry; rows follow the iteration order of model.wv.vocab.
X = model.wv[model.wv.vocab]

# t-SNE is stochastic: pin the seed so the 2-D layout is reproducible across runs.
TSNE_SEED = 42
tsne = TSNE(n_components=2, n_iter=1000, random_state=TSNE_SEED) # scikit-learn requires n_iter >= 250; default is 1000
X_2d = tsne.fit_transform(X)

In [None]:
X_2d[0:5]

In [35]:
# create DataFrame for storing results and plotting
coords_df = pd.DataFrame(X_2d, columns=['x','y'])
coords_df['token'] = model.wv.vocab.keys()

In [36]:
coords_df.head()

Unnamed: 0,x,y,token
0,-51.602955,25.0156,[
1,-52.998913,-33.808945,Emma
2,2.438241,-32.696819,by
3,-52.478035,-36.89204,Jane
4,-51.523075,25.036707,]


In [37]:
# Cache the t-SNE coordinates on disk so the visualization section can reload them
# without re-running t-SNE. Create the output folder first so the write cannot fail
# on a fresh checkout.
os.makedirs('model_output', exist_ok=True)
coords_df.to_csv('model_output/raw_gutenberg_tsne.csv', index=False)

### Visualize 2D Representation

In [38]:
# Reload the t-SNE coordinates from the CSV cache. This replaces any in-memory coords_df,
# so the file must already exist on disk and correspond to the current model.
coords_df = pd.read_csv('model_output/raw_gutenberg_tsne.csv')

In [39]:
output_notebook() # output bokeh plots inline in notebook

In [40]:
# Plot only a random sample of 200 tokens (presumably to keep the text labels legible).
# No random_state is passed, so every run draws a different subset.
subset_df = coords_df.sample(n=200)

In [41]:
# Draw each sampled token as a text glyph at its t-SNE (x, y) position.
# NOTE(review): plot_width/plot_height are the older Bokeh keyword names; newer Bokeh
# releases expect width/height — confirm against the installed version.
p = figure(plot_width=800, plot_height=800)
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)  # assign to _ to suppress the glyph renderer repr

In [42]:
show(p)