In [3]:
import re
import numpy as np

from gensim.models import Word2Vec
from nltk.corpus import gutenberg
from multiprocessing import Pool
from scipy import spatial

In [5]:
import nltk
nltk.download("gutenberg")

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/soonam/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [7]:
sentences = list(gutenberg.sents('/Users/soonam/workspace/python/hamlet.txt'))   # import the corpus and convert into a list

In [8]:
sentences

[['THE', 'TRAGEDY', 'OF', 'HAMLET', ',', 'PRINCE', 'OF', 'DENMARK'],
 ['by', 'William', 'Shakespeare'],
 ['Dramatis', 'Personae'],
 ['Claudius', ',', 'King', 'of', 'Denmark', '.'],
 ['Marcellus', ',', 'Officer', '.'],
 ['Hamlet',
  ',',
  'son',
  'to',
  'the',
  'former',
  ',',
  'and',
  'nephew',
  'to',
  'the',
  'present',
  'king',
  '.'],
 ['Polonius', ',', 'Lord', 'Chamberlain', '.'],
 ['Horatio', ',', 'friend', 'to', 'Hamlet', '.'],
 ['Laertes', ',', 'son', 'to', 'Polonius', '.'],
 ['Voltemand', ',', 'courtier', '.'],
 ['Cornelius', ',', 'courtier', '.'],
 ['Rosencrantz', ',', 'courtier', '.'],
 ['Guildenstern', ',', 'courtier', '.'],
 ['Osric', ',', 'courtier', '.'],
 ['A', 'Gentleman', ',', 'courtier', '.'],
 ['A', 'Priest', '.'],
 ['Marcellus', ',', 'officer', '.'],
 ['Bernardo', ',', 'officer', '.'],
 ['Francisco',
  ',',
  'a',
  'soldier',
  'Reynaldo',
  ',',
  'servant',
  'to',
  'Polonius',
  '.'],
 ['Players', '.'],
 ['Two', 'Clowns', ',', 'gravediggers', '.'],
 

In [9]:
print('Type of corpus: ', type(sentences))
print('Length of corpus: ', len(sentences))

Type of corpus:  <class 'list'>
Length of corpus:  3921


In [10]:
print(sentences[0])    # title, author, and year
print(sentences[1])
print(sentences[10])

['THE', 'TRAGEDY', 'OF', 'HAMLET', ',', 'PRINCE', 'OF', 'DENMARK']
['by', 'William', 'Shakespeare']
['Cornelius', ',', 'courtier', '.']


In [11]:
for i in range(len(sentences)):
    sentences[i] = [word.lower() for word in sentences[i] if re.match('^[a-zA-Z]+', word)]

In [12]:
print(sentences[0])    # title, author, and year
print(sentences[1])
print(sentences[10])

['the', 'tragedy', 'of', 'hamlet', 'prince', 'of', 'denmark']
['by', 'william', 'shakespeare']
['cornelius', 'courtier']


In [13]:
model = Word2Vec(sentences = sentences, size = 100, sg = 1, window = 3, min_count = 1, iter = 10, workers = Pool()._processes)

In [14]:
model.init_sims(replace = True)

In [15]:
model.save('word2vec_model')


In [16]:
model = Word2Vec.load('word2vec_model')

In [17]:
model.most_similar('hamlet')

  """Entry point for launching an IPython kernel.


[('dear', 0.9936438202857971),
 ('polonius', 0.992931604385376),
 ('king', 0.9925959706306458),
 ('death', 0.9921386241912842),
 ('s', 0.9921372532844543),
 ('mother', 0.9921145439147949),
 ('closet', 0.9917299151420593),
 ('very', 0.9916934967041016),
 ('exeunt', 0.9916423559188843),
 ('life', 0.9914907217025757)]

In [18]:
v1 = model['king']
v2 = model['queen']

  """Entry point for launching an IPython kernel.
  


In [19]:

# define a function that computes cosine similarity between two words
def cosine_similarity(v1, v2):
    return 1 - spatial.distance.cosine(v1, v2)

In [20]:
cosine_similarity(v1, v2)


0.9987213015556335

In [21]:
v1

array([-0.07270265,  0.01292004,  0.13056308, -0.0529054 , -0.01754526,
       -0.02449138,  0.04295189, -0.05175127, -0.05827956, -0.10829289,
        0.13782005,  0.08460987, -0.11408605,  0.04938787,  0.11990755,
       -0.08148331,  0.05096746,  0.19126841, -0.15573435,  0.16344829,
       -0.05264206,  0.01674959, -0.13287538, -0.18777697,  0.00718729,
        0.17791672, -0.02080614, -0.11409375,  0.03905941, -0.03927641,
       -0.10084318,  0.11294754, -0.03580342,  0.18669286, -0.04037194,
        0.01784157, -0.01239766,  0.05327073, -0.11526928, -0.01645968,
        0.21663253, -0.02012269,  0.03073677,  0.00239004, -0.02700321,
       -0.08301137, -0.08522729,  0.2200604 , -0.10128394,  0.22703208,
        0.12655275, -0.10081679,  0.18779956,  0.04182823,  0.00318894,
        0.05095762,  0.0899329 ,  0.15745594,  0.04588254, -0.02374525,
       -0.01799428, -0.06096854, -0.04106136, -0.15774798,  0.04030783,
        0.03300713,  0.18424493, -0.00132191,  0.09562178,  0.02