# Creating Word Vectors using word2vec

We are going to train our own word vectors on the NLTK Gutenberg corpus using gensim's word2vec implementation.

#### Importing dependencies

In [8]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
# t-SNE projects the 64-dimensional word vectors down to 2-D so they can be plotted
from sklearn.manifold import TSNE
import pandas as pd
#For interactive visualizations of our word vectors
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
%matplotlib inline

In [2]:
# Download the NLTK 'punkt' tokenizer data (a no-op when it is already up to date)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/mohammedsunasra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Loading Data

In [3]:
# Download the NLTK Gutenberg corpus that the rest of the notebook trains on
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/mohammedsunasra/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [14]:
from nltk.corpus import gutenberg

In [5]:
# List the identifiers of the texts bundled in the corpus
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [6]:
# Number of texts in the corpus
print(len(gutenberg.fileids()))

18


In [9]:
# Split the raw text of the corpus into sentence strings (shown for illustration)
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [10]:
# Peek at the first five sentences
gberg_sent_tokens[:5]

['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
 "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.",
 'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.',
 "Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.",
 'Between _them_ it was more the intimacy\nof sisters.']

In [11]:
# Break the second sentence into word and punctuation tokens
word_tokenize(gberg_sent_tokens[1])

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [15]:
# Sentences as provided by the corpus reader itself; this is the input Word2Vec is trained on below
gberg_tokens = gutenberg.sents()

In [21]:
# Flat sequence of every token in the corpus (used only to inspect corpus size)
gberg_words = gutenberg.words()

In [24]:
# Display the beginning of the token sequence
gberg_words

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [25]:
# Total number of tokens in the corpus
len(gberg_words)

2621613

#### Run Word2Vec

In [26]:
# Train word2vec embeddings on the Gutenberg sentences.
# NOTE(review): gensim documents that `seed` alone does not give bit-identical
# runs when several worker threads are used — confirm if exact reproducibility matters.
model = Word2Vec(
    sentences=gberg_tokens,
    size=64,      # dimensionality of each word vector
    sg=1,         # 1 selects the skip-gram training algorithm
    window=10,    # size of the context window around each word
    min_count=5,  # ignore words that appear fewer than 5 times
    seed=42,      # seed for the random number generator
)

#### Saving model

In [28]:
# Persist the trained model to disk so it can be reloaded without retraining
# NOTE(review): presumably requires the ../model_files directory to exist already — confirm
model.save("../model_files/raw_gutenberg_word2vec.w2v")

#### Loading model

In [29]:
# Reload the saved model from disk; the cells below query this loaded copy
model_loaded = gensim.models.Word2Vec.load("../model_files/raw_gutenberg_word2vec.w2v")

In [30]:
# Look up the learned vector for "dog" through the model's KeyedVectors (`.wv`);
# indexing the Word2Vec model directly is deprecated and emits a warning.
model_loaded.wv["dog"]

  """Entry point for launching an IPython kernel.


array([-2.26538479e-01,  5.51780462e-01,  2.12162733e-01,  2.24508177e-02,
        1.50332138e-01, -4.10672463e-02, -4.64036176e-03,  9.68765542e-02,
        4.51341838e-01, -9.14576054e-02,  8.65351351e-04,  9.46146715e-03,
       -4.11433399e-01, -5.89423925e-02,  3.98921371e-01, -4.89158742e-02,
        4.69561905e-01,  1.57292094e-02, -4.93609905e-02,  3.80450457e-01,
        1.91041425e-01,  1.12455741e-01,  1.88222572e-01,  1.14541225e-01,
        2.59090424e-01, -2.30431274e-01, -4.69348859e-03, -2.08607793e-01,
       -4.14753795e-01,  2.51397840e-04,  7.12788850e-02, -1.01157762e-02,
       -2.99347192e-01, -5.25355577e-01,  7.16360733e-02, -1.65304154e-01,
       -3.71582240e-01, -8.93100798e-02,  2.22487934e-02, -4.06755894e-01,
        3.85372549e-01, -2.53169835e-01,  1.23078100e-01, -3.83523218e-02,
        6.01540387e-01,  1.62243783e-01,  4.02884960e-01, -1.57584101e-01,
        7.75456149e-03, -5.62906682e-01,  8.19009960e-01,  2.12703869e-01,
       -1.04828268e-01,  

In [31]:
# Confirm the vector has the 64 dimensions requested at training time.
# Accessed via `.wv` because indexing the Word2Vec model directly is deprecated.
model_loaded.wv["dog"].shape

  """Entry point for launching an IPython kernel.


(64,)

#### Similar words

In [32]:
# Nearest neighbours of "dog" by cosine similarity; queried on `.wv` because the
# model-level `most_similar` is deprecated in favour of the KeyedVectors method.
model_loaded.wv.most_similar('dog')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('puppy', 0.8311045169830322),
 ('cage', 0.7914822101593018),
 ('sweeper', 0.7845744490623474),
 ('thief', 0.7631908655166626),
 ('cow', 0.7547645568847656),
 ('shell', 0.7454959154129028),
 ('chimney', 0.7435685396194458),
 ('Alligator', 0.7414529323577881),
 ('broth', 0.7404745221138),
 ('pet', 0.7378256916999817)]

In [33]:
# Nearest neighbours of "think"; queried on `.wv` because the model-level
# `most_similar` is deprecated in favour of the KeyedVectors method.
model_loaded.wv.most_similar('think')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('suppose', 0.8590080142021179),
 ('contradict', 0.8471552729606628),
 ('manage', 0.8414980173110962),
 ('downright', 0.8357493877410889),
 ('NOW', 0.82381272315979),
 ('know', 0.8202742338180542),
 ('_you_', 0.8164973258972168),
 ('pretend', 0.8131539821624756),
 ('awfully', 0.8087693452835083),
 ('anyhow', 0.808165431022644)]

In [34]:
# Nearest neighbours of "day"; queried on `.wv` because the model-level
# `most_similar` is deprecated in favour of the KeyedVectors method.
model_loaded.wv.most_similar('day')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('morning', 0.8025069832801819),
 ('night', 0.7603828310966492),
 ('time', 0.7224133014678955),
 ('evening', 0.7140817046165466),
 ('morrow', 0.6953521370887756),
 ('month', 0.6893912553787231),
 ('sabbath', 0.6633560657501221),
 ('feasting', 0.6615920662879944),
 ('week', 0.6582901477813721),
 ('Saturday', 0.6573561429977417)]

In [35]:
# Nearest neighbours of "father"; queried on `.wv` because the model-level
# `most_similar` is deprecated in favour of the KeyedVectors method.
model_loaded.wv.most_similar('father')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('mother', 0.8595028519630432),
 ('brother', 0.8427099585533142),
 ('sister', 0.8065947890281677),
 ('Amnon', 0.7854680418968201),
 ('wife', 0.7724324464797974),
 ('daughter', 0.7621254920959473),
 ('bondwoman', 0.7503658533096313),
 ('uncle', 0.7269459962844849),
 ('concubine', 0.7240821123123169),
 ('Nabal', 0.712792694568634)]

#### Odd man out

In [36]:
# Ask which word fits least well with the others; called on `.wv` because the
# model-level `doesnt_match` is deprecated in favour of the KeyedVectors method.
model_loaded.wv.doesnt_match("mother father dad dog".split())

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


'dog'

#### Arithmetic

In [38]:
# Vector arithmetic: father - man + woman.
# Uses the loaded model (like the other query cells) so this works without
# retraining, and goes through `.wv` because the model-level method is deprecated.
model_loaded.wv.most_similar(positive=['father', 'woman'], negative=['man'])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('sister', 0.7918761968612671),
 ('mother', 0.7630699872970581),
 ('wife', 0.7518402934074402),
 ('husband', 0.7391830682754517),
 ('Sarah', 0.7261159420013428),
 ('daughter', 0.7252950668334961),
 ('Sarai', 0.7215025424957275),
 ('brother', 0.7040714025497437),
 ('Rachel', 0.7015578746795654),
 ('daughters', 0.695123553276062)]

In [39]:
# Vector arithmetic: husband - man + woman.
# Uses the loaded model (like the other query cells) so this works without
# retraining, and goes through `.wv` because the model-level method is deprecated.
model_loaded.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('sister', 0.7296842336654663),
 ('wife', 0.7292662262916565),
 ('conceived', 0.6931815147399902),
 ('mother', 0.6727426052093506),
 ('daughter', 0.6505609750747681),
 ('child', 0.648764967918396),
 ('maid', 0.6431931853294373),
 ('adultery', 0.6396881937980652),
 ('nurse', 0.6287553310394287),
 ('Sarai', 0.6240081787109375)]

In [40]:
# Vector arithmetic: king - man + woman, asking for the 30 closest words (topn=30).
# Uses the loaded model (like the other query cells) so this works without
# retraining, and goes through `.wv` because the model-level method is deprecated.
model_loaded.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=30)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('Sarah', 0.7566111087799072),
 ('Rachel', 0.7172688245773315),
 ('Solomon', 0.7000220417976379),
 ('Sarai', 0.6978135108947754),
 ('Laban', 0.6877530813217163),
 ('Bethuel', 0.6798403263092041),
 ('Hagar', 0.6778625845909119),
 ('Abram', 0.677825927734375),
 ('Leah', 0.6733506321907043),
 ('Pharaoh', 0.6732127666473389),
 ('Rebekah', 0.6620738506317139),
 ('Bilhah', 0.6535000205039978),
 ('birthright', 0.653274655342102),
 ('Padanaram', 0.652320921421051),
 ('Ephron', 0.6518051028251648),
 ('Esau', 0.6483858823776245),
 ('Heth', 0.6478888988494873),
 ('Hittite', 0.6455763578414917),
 ('David', 0.6443098783493042),
 ('Zilpah', 0.641307532787323),
 ('Judah', 0.6412990093231201),
 ('Babylon', 0.6388645172119141),
 ('queen', 0.6377655267715454),
 ('Hamor', 0.6355526447296143),
 ('Uriah', 0.6321998834609985),
 ('Hezekiah', 0.6292102336883545),
 ('Jerubbaal', 0.6283986568450928),
 ('Hanun', 0.6282304525375366),
 ('princes', 0.6282073259353638),
 ('Mephibosheth', 0.6276932954788208)]

#### Reducing and visualizing the word embeddings

In [41]:
# Number of distinct words the model learned a vector for
len(model_loaded.wv.vocab)

17011

In [42]:
# Stack the vectors of every vocabulary word into one matrix (one row per word).
# Indexed through `.wv` because indexing the Word2Vec model directly is deprecated.
X = model_loaded.wv[model_loaded.wv.vocab]

  """Entry point for launching an IPython kernel.


In [44]:
# Sanity check: (vocabulary size, vector dimension)
X.shape

(17011, 64)

In [45]:
# Configure t-SNE to reduce the word vectors to 2 dimensions for plotting.
# random_state is fixed so the stochastic embedding is the same on every run.
tsne = TSNE(n_components=2, n_iter=1000, random_state=42)

In [46]:
# Fit t-SNE on the word-vector matrix and get the 2-D coordinates for each word
X_2d = tsne.fit_transform(X)

In [49]:
# One row per vocabulary word: its 2-D t-SNE coordinates plus the word itself.
df = pd.DataFrame(X_2d, columns=['x', 'y'])
# Materialise the dict view as a list so the column assignment does not depend
# on pandas coercing a `dict_keys` object.
df['token'] = list(model_loaded.wv.vocab.keys())

In [50]:
# Preview the first rows of the coordinate table
df.head()

Unnamed: 0,x,y,token
0,56.267014,8.370474,[
1,15.403315,60.909252,Emma
2,-17.392303,31.840427,by
3,13.557517,61.084011,Jane
4,56.2052,8.382701,]


In [51]:
# Save the 2-D coordinates and tokens as CSV
df.to_csv("../model_files/dataframe_words.csv")

In [52]:
# Tell bokeh to render plots inline in the notebook
output_notebook()

In [53]:
# Plot only a random 5000-word subset of the vocabulary.
# random_state is fixed so the same subset is drawn on every run.
subset_df = df.sample(n=5000, random_state=42)

In [54]:
# Square 800x800 bokeh canvas for the word map
p = figure(plot_width=800, plot_height=800)
# Draw every sampled token as a text glyph at its t-SNE coordinates;
# the result is bound to `_` so the cell displays nothing.
_ = p.text(
    x=subset_df['x'],
    y=subset_df['y'],
    text=subset_df['token'],
)

In [55]:
# Render the interactive word map
show(p)