In [1]:
import numpy as np
import os

In [2]:
glove_dir = "glove.6B"

# Maps each vocabulary word to its embedding vector (a float32 NumPy array).
embeddings_index = {}

# Each line is split on whitespace: the first token is the word, the remaining
# tokens are its coefficients. The `with` block guarantees the file handle is
# released even if a malformed line raises while parsing.
with open(os.path.join(glove_dir, "glove.6B.100d.txt"), encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

def similarity(u, v):
    """Return the cosine similarity between vectors ``u`` and ``v``.

    Both inputs are reshaped into single-row matrices before being handed to
    ``cosine_similarity``; the resulting 1x1 matrix is squeezed so the caller
    gets a scalar-like (0-d) value back.
    """
    row_u = u.reshape(1, -1)
    row_v = v.reshape(1, -1)
    pairwise = cosine_similarity(row_u, row_v)
    return np.squeeze(pairwise)

In [4]:
# Fetch the vectors used by the sanity checks (same lookup order as before).
father, mother = embeddings_index["father"], embeddings_index["mother"]
ball, crocodile = embeddings_index["ball"], embeddings_index["crocodile"]
france, tehran = embeddings_index["france"], embeddings_index["tehran"]
paris, iran = embeddings_index["paris"], embeddings_index["iran"]

# Each entry is (label, left vector, right vector). The last pair compares
# "country - capital" against "capital - country", i.e. offsets taken in
# opposite orders.
checks = (
    ("cosine_similarity(father, mother) = ", father, mother),
    ("cosine_similarity(ball, crocodile) = ", ball, crocodile),
    ("cosine_similarity(france - paris, tehran - iran) = ", france - paris, tehran - iran),
)
for label, lhs, rhs in checks:
    print(label, similarity(lhs, rhs))

cosine_similarity(father, mother) =  0.86566615
cosine_similarity(ball, crocodile) =  0.15206575
cosine_similarity(france - paris, tehran - iran) =  -0.6854124


In [5]:
# Display the raw embedding vector stored for the word "father".
embeddings_index["father"]

array([ 0.64706 , -0.068067,  0.15468 , -0.17408 , -0.29134 ,  0.76999 ,
       -0.3192  , -0.25663 , -0.25082 , -0.036737, -0.25509 ,  0.29636 ,
        0.5776  ,  0.49641 ,  0.19167 , -0.83888 ,  0.58482 , -0.38717 ,
       -0.71591 ,  0.9519  , -0.37966 , -0.1131  ,  0.47154 ,  0.20921 ,
        0.38197 ,  0.067582, -0.92879 , -1.1237  ,  0.84831 ,  0.68744 ,
       -0.15472 ,  0.92714 ,  0.53371 , -0.037392, -0.856   ,  0.19056 ,
       -0.014594,  0.15186 ,  0.53514 , -0.20306 , -0.35164 ,  0.33152 ,
        1.1306  , -0.72787 , -0.19724 ,  0.031659, -0.24041 , -0.057617,
        0.60473 , -0.49233 , -0.24405 , -0.3184  ,  0.96156 ,  1.0895  ,
        0.21534 , -2.0542  , -1.0615  ,  0.052439,  0.57958 ,  0.2748  ,
        0.91587 ,  0.85195 ,  0.36113 , -0.31901 ,  0.7784  , -0.36865 ,
        0.64387 ,  0.33104 , -0.27181 ,  0.58524 , -0.15143 ,  0.11121 ,
        0.2126  , -0.60345 ,  0.16148 ,  0.32952 , -0.1354  , -0.30629 ,
       -0.89143 ,  0.091912,  0.49753 ,  0.55932 , 

In [6]:
def complete_analogy(word_a, word_b, word_c, embeddings_index):
    """Solve the analogy "word_a is to word_b as word_c is to ?".

    The answer is the vocabulary word ``w`` whose offset ``e_w - e_c`` has the
    highest cosine similarity with ``e_b - e_a``. The three input words are
    never returned as the answer.

    Parameters
    ----------
    word_a, word_b, word_c : str
        The analogy words; they are lower-cased before lookup.
    embeddings_index : dict
        Mapping from word to its embedding vector.

    Returns
    -------
    str or None
        The best-matching word, or ``None`` when the vocabulary contains no
        word other than the three inputs.

    Raises
    ------
    KeyError
        If any of the three (lower-cased) words is missing from
        ``embeddings_index``.
    """
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()

    e_a, e_b, e_c = embeddings_index[word_a], embeddings_index[word_b], embeddings_index[word_c]

    # Build the exclusion set once instead of re-creating a list per word.
    excluded = {word_a, word_b, word_c}
    candidates = [w for w in embeddings_index if w not in excluded]
    if not candidates:
        return None

    # The reference direction is loop-invariant, so compute it a single time.
    target = np.asarray(e_b) - np.asarray(e_a)

    # One row per candidate: the offset from word_c to that candidate.
    offsets = np.stack([embeddings_index[w] for w in candidates]) - e_c

    target_norm = np.linalg.norm(target)
    offset_norms = np.linalg.norm(offsets, axis=1)

    # A zero-length vector has no direction; using a norm of 1 scores it as 0
    # instead of dividing by zero.
    if target_norm == 0:
        target_norm = 1.0
    offset_norms[offset_norms == 0] = 1.0

    cosine_sims = (offsets @ target) / (offset_norms * target_norm)

    # argmax returns the first maximum, so ties resolve to the earliest word
    # in the vocabulary's iteration order.
    return candidates[int(np.argmax(cosine_sims))]

In [7]:
complete_analogy("china", "chinese", "iran", embeddings_index)

'iranian'

In [8]:
complete_analogy("india", "delhi", "iran", embeddings_index)

'tehran'

In [9]:
complete_analogy("man", "woman", "boy", embeddings_index)

'girl'

In [10]:
complete_analogy("small", "smaller", "big", embeddings_index)

'bigger'

In [11]:
complete_analogy("iran", "farsi", "canada", embeddings_index)

'inuktitut'

In [12]:
complete_analogy("king", "Queen", "Prince", embeddings_index)

'princess'