# word2vec implementation with Python to calculate similarities

In [1]:
import re
from multiprocessing import Pool, cpu_count

import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import gutenberg
from scipy import spatial



### Import training dataset

In [15]:
 # import the corpus and convert into a list
# Collect every tokenized sentence of 'train.txt' into a plain list so that
# individual sentences can be reassigned by index later on.
# NOTE(review): 'train.txt' is not a stock Gutenberg file -- presumably it was
# copied into the NLTK gutenberg corpus directory; confirm.
sentences = [tokens for tokens in gutenberg.sents('train.txt')]

### Preprocess data

In [17]:
# Normalize each sentence in place: keep only tokens that begin with an ASCII
# letter (drops punctuation and numeric tokens) and lowercase the survivors.
for idx, tokens in enumerate(sentences):
    sentences[idx] = [
        token.lower()
        for token in tokens
        if re.match('^[a-zA-Z]+', token)
    ]

### Create and train model

    - Create a word2vec model and train it on the preprocessed `sentences` corpus
    - Key parameter:
        * sentences: training data (has to be a list with tokenized sentences)
        * size: dimension of embedding space
        * sg: CBOW if 0, skip-gram if 1
        * window: number of words considered on each side of the target word (if the window size is 3, 3 words in the left neighborhood and 3 words in the right neighborhood are considered)
        * min_count: minimum count of words to be included in the vocabulary
        * iter: number of training iterations
        * workers: number of worker threads to train

In [18]:
# Train a skip-gram (sg=1) Word2Vec model on the preprocessed `sentences`.
# Use one worker thread per CPU core. cpu_count() is the public way to get that
# number; instantiating Pool() only to read its private `_processes` attribute
# spawns worker processes that are never closed.
# NOTE(review): `size` and `iter` are the gensim < 4.0 keyword names.
our_model = Word2Vec(
    sentences=sentences,
    size=100,
    sg=1,
    window=3,
    min_count=1,
    iter=10,
    workers=cpu_count(),
)

In [19]:
# Precompute L2-normalized word vectors. Per the gensim docs, replace=True
# overwrites the raw vectors to save memory.
# NOTE(review): confirm no further training of this model is needed afterwards.
our_model.init_sims(replace=True)

### Save and load model

In [20]:
# Round-trip the model through disk: persist it, then rebind `our_model` to the
# copy loaded back from the same path.
model_path = 'word2vec_model'
our_model.save(model_path)
our_model = Word2Vec.load(model_path)

### Similarity calculation between words

In [22]:
# Nearest neighbours of 'photo' by cosine similarity. Query through the
# KeyedVectors (`.wv`): calling most_similar on the model object itself is
# deprecated and emits a DeprecationWarning.
our_model.wv.most_similar('photo')

  """Entry point for launching an IPython kernel.


[('sound', 0.9857641458511353),
 ('image', 0.9848567247390747),
 ('impressive', 0.9837137460708618),
 ('rear', 0.9827864170074463),
 ('selfie', 0.9817068576812744),
 ('bright', 0.9798246622085571),
 ('loud', 0.9790486097335815),
 ('icon', 0.9778032302856445),
 ('outdoor', 0.9760921001434326),
 ('slim', 0.9758537411689758)]

In [23]:
# Nearest neighbours of 'pic' by cosine similarity. Query through the
# KeyedVectors (`.wv`): calling most_similar on the model object itself is
# deprecated and emits a DeprecationWarning.
our_model.wv.most_similar('pic')

  """Entry point for launching an IPython kernel.


[('opens', 0.992725133895874),
 ('sleek', 0.9921761751174927),
 ('image', 0.9906978607177734),
 ('flawless', 0.9904495477676392),
 ('decent', 0.9903085231781006),
 ('loud', 0.989993691444397),
 ('superior', 0.9896432161331177),
 ('brilliant', 0.9895043969154358),
 ('slightly', 0.9893741607666016),
 ('thus', 0.9893368482589722)]

In [24]:
# Helper: cosine similarity between two embedding vectors
def cosine_similarity(v1, v2):
    """Return the cosine similarity of the vectors ``v1`` and ``v2``.

    SciPy provides cosine *distance*, so the result is computed as
    ``1 - distance``.
    """
    cosine_dist = spatial.distance.cosine(v1, v2)
    return 1 - cosine_dist


In [27]:
# Look up the embedding vectors through the KeyedVectors (`.wv`); indexing the
# model object directly is deprecated and emits a DeprecationWarning.
v1 = our_model.wv['pic']
v2 = our_model.wv['picture']
cosine_similarity(v1, v2)

  """Entry point for launching an IPython kernel.
  


0.9456096291542053