**Load Dependencies**

In [None]:
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure

In [None]:
nltk.download('punkt') # English-language sentence tokenizer (not all periods end sentences; not all sentences start with a capital letter)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Load Data**

In [None]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [None]:
from nltk.corpus import gutenberg

In [None]:
len(gutenberg.fileids())

18

In [None]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

**Tokenize text**

In [None]:
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [None]:
gberg_sent_tokens[0:12]

['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
 "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.",
 'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.',
 "Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.",
 'Between _them_ it was more the intimacy\nof sisters.',
 "Even before Miss Taylor had ceased to hold the nominal

In [None]:
gberg_sent_tokens[1]

"She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."

In [None]:
word_tokenize(gberg_sent_tokens[0])

['[',
 'Emma',
 'by',
 'Jane',
 'Austen',
 '1816',
 ']',
 'VOLUME',
 'I',
 'CHAPTER',
 'I',
 'Emma',
 'Woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich',
 ',',
 'with',
 'a',
 'comfortable',
 'home',
 'and',
 'happy',
 'disposition',
 ',',
 'seemed',
 'to',
 'unite',
 'some',
 'of',
 'the',
 'best',
 'blessings',
 'of',
 'existence',
 ';',
 'and',
 'had',
 'lived',
 'nearly',
 'twenty-one',
 'years',
 'in',
 'the',
 'world',
 'with',
 'very',
 'little',
 'to',
 'distress',
 'or',
 'vex',
 'her',
 '.']

In [None]:
word_tokenize(gberg_sent_tokens[1])[2]

'the'

In [None]:
# A convenient method that handles newlines, as well as tokenizing sentences and words in one shot
gberg_sents = gutenberg.sents()

In [None]:
gberg_sents[0:6]

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'],
 ['VOLUME', 'I'],
 ['CHAPTER', 'I'],
 ['Emma',
  'Woodhouse',
  ',',
  'handsome',
  ',',
  'clever',
  ',',
  'and',
  'rich',
  ',',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  ',',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  ';',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  '-',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her',
  '.'],
 ['She',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  ',',
  'indulgent',
  'father',
  ';',
  'and',
  'had',
  ',',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  "'",
  's',
  'marriage',
  ',',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period',
  '.'],
 ['Her',
  'mother',
  'h

In [None]:
gberg_sents[4]

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [None]:
gberg_sents[4][14]

'father'

In [None]:
# Another convenient method that we don't immediately need: 
gutenberg.words()

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [None]:
# gutenberg.words() is analogous to the following line, which need not be run: 
#word_tokenize(gutenberg.raw())
# as it would list the entire corpus

In [None]:
# Size of the raw corpus in characters; the ~2.6M word count comes from len(gutenberg.words()) in the next cell.
len(gutenberg.raw())

11793318

In [None]:
len(gutenberg.words())

2621613

**Run word2vec**

In [None]:
# Train the embedding model on the pre-tokenized Gutenberg sentences.
# NOTE(review): the keyword names match the pre-4.0 gensim API (as do the
# `model.wv.vocab` calls later on); gensim 4 is understood to rename `size`
# to `vector_size` -- confirm before upgrading.
model = Word2Vec(
    sentences=gberg_sents,
    size=64,       # length of each word vector
    sg=1,          # 1 selects skip-gram (0 would be CBOW)
    window=10,     # context words considered on each side of the target
    min_count=5,   # ignore words that occur fewer than 5 times
    workers=8,     # training threads
)

In [None]:
model.save('raw_gutenberg_model.w2v')

**Explore model**

In [None]:
# Reload the model saved above so that training does not have to be repeated;
# when resuming in a fresh session, start from this cell instead of re-training.
model = Word2Vec.load('raw_gutenberg_model.w2v')

In [None]:
model.wv['dog']

array([ 0.2758564 ,  0.19389503,  0.09921485,  0.18718605,  0.25579908,
        0.3511914 ,  0.04934984, -0.1275554 ,  0.29673687,  0.24426503,
       -0.15590529,  1.2551665 ,  0.16555962, -0.03253528, -0.25612152,
       -0.18227388,  0.08835025,  0.12748653,  0.19124743,  0.21050847,
       -0.05364254,  0.27734542, -0.34833264,  0.31854904, -0.11189602,
        0.2884107 , -0.18016295, -0.45710707, -0.07138496,  0.1685546 ,
       -0.0904722 , -0.10058635,  0.4426659 , -0.09493393, -0.32582644,
       -0.25687853,  0.5575899 , -0.14363497, -0.28450364,  0.1929922 ,
        0.03577699,  0.28881165, -0.253583  , -0.03058347,  0.07171685,
       -0.2021446 , -0.32135987, -0.75394034, -0.07441638,  0.03891585,
       -0.3128604 , -0.21936046, -0.06340459, -0.13789219, -0.1169392 ,
       -0.33756343,  0.150887  , -0.22844532, -0.19806841,  0.17280646,
        0.07406019,  0.11091194, -0.42252144, -0.2656763 ], dtype=float32)

In [None]:
len(model.wv['dog'])

64

In [None]:
model.wv.most_similar('dog')  # (word, cosine similarity) pairs, most similar first -- a similarity score, not a distance

[('puppy', 0.8340737223625183),
 ('cage', 0.7928931713104248),
 ('chimney', 0.7839993238449097),
 ('thief', 0.7753556966781616),
 ('sweeper', 0.7708736658096313),
 ('broth', 0.7643749713897705),
 ('whip', 0.7533056735992432),
 ('gallon', 0.7523377537727356),
 ('kick', 0.7522217035293579),
 ('boy', 0.7465114593505859)]

In [None]:
model.wv.most_similar('think')

[('contradict', 0.8554378747940063),
 ('manage', 0.843704104423523),
 ('suppose', 0.8308237791061401),
 ('Mamma', 0.8161925673484802),
 ('believe', 0.8160182237625122),
 ('imagine', 0.8138350248336792),
 ('NOW', 0.8135796189308167),
 ('know', 0.8104155659675598),
 ('interfere', 0.8067306280136108),
 ('guess', 0.8060098886489868)]

In [None]:
model.wv.most_similar('day')

[('morning', 0.7808342576026917),
 ('night', 0.753442645072937),
 ('month', 0.7276401519775391),
 ('time', 0.7273766994476318),
 ('week', 0.6760929822921753),
 ('Saturday', 0.6664142608642578),
 ('year', 0.6654803156852722),
 ('evening', 0.6626105308532715),
 ('afternoon', 0.6599017381668091),
 ('Adar', 0.6503151655197144)]

In [None]:
model.wv.most_similar('mother')

[('sister', 0.8727375268936157),
 ('father', 0.8622727990150452),
 ('brother', 0.8418745994567871),
 ('wife', 0.809418797492981),
 ('husband', 0.8053178787231445),
 ('daughter', 0.7809246778488159),
 ('child', 0.7518359422683716),
 ('Mary', 0.7497849464416504),
 ('nurse', 0.749450147151947),
 ('Amnon', 0.7461297512054443)]

In [None]:
# Ask the model which of the four words fits least well with the others.
model.wv.doesnt_match("mother sister daughter dog".split())

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'dog'

In [None]:
model.wv.similarity('father', 'dog')

0.45413953

In [None]:
# Analogy arithmetic: father - man + woman. In the output recorded below the
# top hit is 'mother', followed by other family terms.
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

[('mother', 0.7804487943649292),
 ('wife', 0.776812732219696),
 ('husband', 0.7620337009429932),
 ('sister', 0.7574506998062134),
 ('daughter', 0.7545719742774963),
 ('brother', 0.7241369485855103),
 ('Rachel', 0.6826387643814087),
 ('Tamar', 0.6804218292236328),
 ('child', 0.6785721778869629),
 ('Sarai', 0.6742565035820007)]

In [None]:
model.wv.most_similar(positive=['son', 'woman'], negative=['man'])

[('wife', 0.7315358519554138),
 ('daughter', 0.7298749089241028),
 ('Leah', 0.7172065377235413),
 ('Sarai', 0.7135816812515259),
 ('Bethuel', 0.7105062007904053),
 ('Sarah', 0.7087455987930298),
 ('Hagar', 0.7039390802383423),
 ('Rachel', 0.6974040865898132),
 ('conceived', 0.6948695778846741),
 ('Abram', 0.6919147372245789)]

In [None]:
model.wv.most_similar(positive=['husband', 'woman'], negative= ['man'])

[('wife', 0.7411867380142212),
 ('sister', 0.6907098889350891),
 ('mother', 0.6808767914772034),
 ('conceived', 0.6805608868598938),
 ('daughter', 0.6778602600097656),
 ('child', 0.6749604940414429),
 ('widow', 0.6532807350158691),
 ('maid', 0.64496910572052),
 ('nurse', 0.633793830871582),
 ('adultery', 0.6286718249320984)]

In [None]:
model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=30)

[('Rachel', 0.6771907806396484),
 ('Sarah', 0.6688170433044434),
 ('Babylon', 0.6573801636695862),
 ('Leah', 0.6569687128067017),
 ('daughter', 0.6541973948478699),
 ('Solomon', 0.6514551639556885),
 ('Judah', 0.6482703685760498),
 ('David', 0.6478943824768066),
 ('Sarai', 0.6450138092041016),
 ('Abram', 0.6445779800415039),
 ('Padanaram', 0.6422224640846252),
 ('Laban', 0.640700101852417),
 ('queen', 0.6355904936790466),
 ('Bethuel', 0.6308280229568481),
 ('Vashti', 0.6280956268310547),
 ('Hanun', 0.6271393299102783),
 ('household', 0.6219233274459839),
 ('Rahab', 0.6214480400085449),
 ('Ephron', 0.6184583306312561),
 ('servants', 0.6169059872627258),
 ('Hagar', 0.6169013977050781),
 ('Esther', 0.6162646412849426),
 ('Onan', 0.6161724925041199),
 ('conceived', 0.612637460231781),
 ('Cain', 0.61234050989151),
 ('Lot', 0.6121366620063782),
 ('tribute', 0.6118837594985962),
 ('Bilhah', 0.6099321842193604),
 ('Hamor', 0.6080735325813293),
 ('Rebekah', 0.6077944040298462)]

In [None]:
# impressive for such a small data set, without any cleaning, e.g., to lower case (covered next)

**Reduce word vector dimensionality with t-SNE**

In [None]:
len(model.wv.vocab)

17011

In [None]:
X = model.wv[model.wv.vocab]

In [None]:
# Configure t-SNE to project the word vectors onto 2 dimensions.
# n_iter=1000 is the scikit-learn default (its documented minimum is 250).
# random_state pins the otherwise random layout so that re-running the
# notebook reproduces the same map.
tsne = TSNE(n_components=2, n_iter=1000, random_state=42)

In [None]:
X_2nd = tsne.fit_transform(X)  # project the 64-dimensional word vectors down to 2 dimensions (X_2nd = "X in 2-D")

In [None]:
X_2nd[0:5]

array([[-44.524197, -35.81803 ],
       [ 11.805072, -61.097305],
       [ 33.176964, -12.027263],
       [ 14.915606, -61.707603],
       [-44.477802, -35.744102]], dtype=float32)

In [None]:
# Pair every 2-D point with its vocabulary word, ready for saving and plotting.
# Row order matches X_2nd because X was built by iterating the same vocab dict.
coords_df = pd.DataFrame(X_2nd, columns=['x', 'y']).assign(
    token=list(model.wv.vocab.keys())
)

In [None]:
coords_df.head()

Unnamed: 0,x,y,token
0,-44.524197,-35.818031,[
1,11.805072,-61.097305,Emma
2,33.176964,-12.027263,by
3,14.915606,-61.707603,Jane
4,-44.477802,-35.744102,]


In [None]:
coords_df.to_csv('raw_gutenberg_tsne.csv', index=False)

**Visualize 2D representation of word vectors**

In [None]:
# Reload the saved t-SNE coordinates so the plot can be rebuilt without
# re-running t-SNE. Tokens are ordinary words, so they are read as plain
# strings with default NA parsing switched off; otherwise words such as
# "None", "null" or "NA" would silently become NaN and break the text labels.
coords_df = pd.read_csv(
    'raw_gutenberg_tsne.csv',
    dtype={'token': str},
    keep_default_na=False,
)

In [None]:
output_notebook() #output bokeh plots inline in notebook

In [None]:
# Plot only a sample: 5000 labels (or every row if there are fewer) keeps the
# chart readable; random_state makes the same words appear on every run.
subset_df = coords_df.sample(n=min(5000, len(coords_df)), random_state=42)

In [None]:
# 800x800 canvas with each sampled token drawn as a text label at its t-SNE position.
p = figure(plot_width=800, plot_height=800)
_ = p.text(x=subset_df['x'], y=subset_df['y'], text=subset_df['token'])

In [None]:
show(p)