In [1]:
# word2vec google news 300 (R^300)
!pip3 install gensim
!pip3 install scipy



#### Patch gensim to avoid downgrading SciPy: gensim fails to import because it tries to import `triu` from `scipy.linalg`, and that function has been removed from `scipy.linalg` in the latest SciPy version; `numpy.triu` is meant to be used instead. This monkey-patch isn't recommended in the long term, but it is suitable for a demo.

In [None]:
# Patch gensim to avoid downgrading SciPy.
# gensim cannot be imported because it tries to import triu from
# scipy.linalg, and the latest SciPy version no longer provides it there;
# numpy.triu is the intended replacement.
# This monkey-patch isn't recommended in the long term, but it is
# suitable for a demo.
import numpy as np
import scipy.linalg

# Only fill the gap when SciPy really lacks triu, so an installation that
# still ships its own implementation is left untouched.
if not hasattr(scipy.linalg, "triu"):
    scipy.linalg.triu = np.triu

In [3]:
# Load the pre-trained "word2vec-google-news-300" model through gensim's
# downloader; it is used here for learning purposes.
from gensim import downloader as api

model = api.load("word2vec-google-news-300")



#### Example of a word as a vector

In [5]:
# Alias the loaded model under the name used by the following cells.
word_vectors = model

# Any word maps to a vector in R^300: print one embedding in full, then
# the shape of another.
computer_vector = word_vectors['computer']
print(computer_vector)

dog_vector = word_vectors['dog']
print(dog_vector.shape)

[ 1.07421875e-01 -2.01171875e-01  1.23046875e-01  2.11914062e-01
 -9.13085938e-02  2.16796875e-01 -1.31835938e-01  8.30078125e-02
  2.02148438e-01  4.78515625e-02  3.66210938e-02 -2.45361328e-02
  2.39257812e-02 -1.60156250e-01 -2.61230469e-02  9.71679688e-02
 -6.34765625e-02  1.84570312e-01  1.70898438e-01 -1.63085938e-01
 -1.09375000e-01  1.49414062e-01 -4.65393066e-04  9.61914062e-02
  1.68945312e-01  2.60925293e-03  8.93554688e-02  6.49414062e-02
  3.56445312e-02 -6.93359375e-02 -1.46484375e-01 -1.21093750e-01
 -2.27539062e-01  2.45361328e-02 -1.24511719e-01 -3.18359375e-01
 -2.20703125e-01  1.30859375e-01  3.66210938e-02 -3.63769531e-02
 -1.13281250e-01  1.95312500e-01  9.76562500e-02  1.26953125e-01
  6.59179688e-02  6.93359375e-02  1.02539062e-02  1.75781250e-01
 -1.68945312e-01  1.21307373e-03 -2.98828125e-01 -1.15234375e-01
  5.66406250e-02 -1.77734375e-01 -2.08984375e-01  1.76757812e-01
  2.38037109e-02 -2.57812500e-01 -4.46777344e-02  1.88476562e-01
  5.51757812e-02  5.02929

#### Similar Words

#### What would King + Woman - Man = ?

In [8]:
# Example of most_similar: king + woman - man.
# The "woman" counterpart of a king, with "man" taken away, should be a queen.
royal_analogy = word_vectors.most_similar(
    positive=['king', 'woman'],
    negative=['man'],
    topn=10,
)
print(royal_analogy)

[('queen', 0.7118192911148071), ('monarch', 0.6189674735069275), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321243286133), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.5181134343147278), ('sultan', 0.5098593235015869), ('monarchy', 0.5087411403656006)]


#### Another example relating to sports:

In [None]:
# A sport with a hoop where players shoot and do not use their feet:
# the expected answer is basketball.
sport_analogy = word_vectors.most_similar(
    positive=['sport', 'hoop', 'shoot'],
    negative=['feet'],
    topn=10,
)
print(sport_analogy)

[('basketball', 0.49093201756477356), ('hoops', 0.47231557965278625), ('shooters', 0.471876859664917), ('dribble', 0.4546862244606018), ('amuse_oneself', 0.4472128450870514), ('Slamball', 0.4436055123806), ('sports', 0.44302672147750854), ('baskeball', 0.4382047951221466), ('dunks_alley_oops', 0.4283624291419983), ('freethrow', 0.419028639793396)]


#### Check similarity between a few other pairs of words

In [None]:
# Example: similarity score for several word pairs, printed one per line
# in the order listed.
word_pairs = [
    ('woman', 'man'),
    ('boy', 'girl'),
    ('uncle', 'aunt'),
    ('plane', 'car'),
    ('hand', 'foot'),
    ('apple', 'banana'),
    ('wood', 'tennis'),
]
for first_word, second_word in word_pairs:
    print(word_vectors.similarity(first_word, second_word))

# Notice that 'wood' and 'tennis' get a very low similarity score,
# whereas every other pair scores noticeably higher.

0.76640123
0.8543272
0.7643474
0.3779698
0.22718486
0.5318407
0.086871386


#### Most similar words:

In [None]:
# The five words most similar to "tower".
tower_neighbours = word_vectors.most_similar("tower", topn=5)
print(tower_neighbours)

[('towers', 0.8531749844551086), ('skyscraper', 0.6417425870895386), ('Tower', 0.639177143573761), ('spire', 0.5946877598762512), ('responded_Understood_Atlasjet', 0.5931612849235535)]


#### Now let's look at vector similarity

In [22]:
import numpy as np

# Word pairs to compare.
word1 = 'man'
word2 = 'woman'

word3 = 'plane'
word4 = 'earthworm'

word5 = 'aunt'
word6 = 'uncle'

# Difference between the two embeddings of each pair.
vec_diff_1 = model[word1] - model[word2]
vec_diff_2 = model[word3] - model[word4]
vec_diff_3 = model[word5] - model[word6]

# Magnitude (np.linalg.norm with default arguments) of each difference;
# a smaller value means the two embeddings lie closer together.
magn_of_diff1 = np.linalg.norm(vec_diff_1)
magn_of_diff2 = np.linalg.norm(vec_diff_2)
magn_of_diff3 = np.linalg.norm(vec_diff_3)

# Report every pair with a single message template so the wording stays
# consistent (the original text misspelled "between").
for first_word, second_word, magnitude in (
    (word1, word2, magn_of_diff1),
    (word3, word4, magn_of_diff2),
    (word5, word6, magn_of_diff3),
):
    print(f'The magnitude of the difference between {first_word} and {second_word} is {magnitude}')




The magnitude of the difference beteween man and woman is 1.7279510498046875
The magnitude of the difference beteween plane and earthworm is 5.286352157592773
The magnitude of the difference beteween aunt and uncle is 1.9634674787521362
