In [1]:
import numpy as np

In [2]:
# Load the pre-trained GloVe word embeddings from 'glove.6B.50d.txt'.
# Each non-empty line holds a token followed by its vector components,
# separated by whitespace.
word_to_vec_map={}

# The context manager guarantees the file is closed even if parsing fails
# part-way through the file.
with open("glove.6B.50d.txt", "r", encoding='utf8') as glove_file:
    for line in glove_file:
        var_list=line.strip().split()
        if not var_list:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word_to_vec_map[var_list[0]]=np.array( var_list[1:] , dtype=float)

In [3]:
# Quick sanity check of what was loaded: the shape of a single embedding
# vector and the number of words in the vocabulary.
sample_embedding = word_to_vec_map['mother']
vocab_size = len(word_to_vec_map)
print("Dimensions of word Embeddings :" , sample_embedding.shape)
print("Vocab Size : ", vocab_size)

Dimensions of word Embeddings : (50,)
Vocab Size :  400000


In [4]:
def cosine_similarity(u, v):
    """
    Cosine of the angle between two vectors.

    Arguments:
        u: Vector of shape (n,)
        v: Vector of shape (n,)

    Returns:
        dot(u, v) divided by the product of the Euclidean norms of u and v.
    """
    numerator = np.dot(u, v)
    # Each norm is the square root of the vector's dot product with itself.
    norm_product = np.sqrt(np.dot(u, u)) * np.sqrt(np.dot(v, v))
    return numerator / norm_product

In [5]:
def euclid_distance_similarity(u,v):
    """
    Similarity score based on Euclidean distance.

    Returns the negative of the *squared* Euclidean distance between u and v,
    so closer vectors get a larger (less negative) score and identical
    vectors score 0.
    """
    delta = u - v
    return -np.dot(delta, delta)

In [None]:
def jaccard_similarity(u, v):
    """
    Weighted (min/max) Jaccard similarity of two real-valued vectors.

    The classic weighted Jaccard, sum(min(u, v)) / sum(max(u, v)), is only
    defined for non-negative vectors. Because word embeddings contain
    negative components, each vector is first split into its positive part
    and the magnitude of its negative part (doubling the length); the
    weighted Jaccard is then taken over those non-negative vectors.

    Arguments:
        u: Vector of shape (n,)
        v: Vector of shape (n,)

    Returns:
        A score in [0, 1]; 1.0 for identical vectors. Two all-zero vectors
        are treated as identical and also score 1.0.
    """
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)

    # Non-negative representation: [positive part, magnitude of negative part].
    u_parts = np.concatenate([np.maximum(u, 0.0), np.maximum(-u, 0.0)])
    v_parts = np.concatenate([np.maximum(v, 0.0), np.maximum(-v, 0.0)])

    union = np.maximum(u_parts, v_parts).sum()
    if union == 0:
        # Both vectors are all zeros; avoid 0/0.
        return 1.0

    return np.minimum(u_parts, v_parts).sum() / union

In [6]:
# Spot-check cosine similarity on a few word pairs and on difference
# vectors (country - country vs. capital - capital, and
# country - capital for two different countries).
father, mother = word_to_vec_map['father'], word_to_vec_map['mother']
delhi, india = word_to_vec_map['delhi'], word_to_vec_map['india']
canada, ottawa = word_to_vec_map["canada"], word_to_vec_map["ottawa"]

vector_pairs = [
    (father, mother),
    (india, canada),
    (canada - india, ottawa - delhi),
    (canada - ottawa, india - delhi),
]
for left, right in vector_pairs:
    print(cosine_similarity(left, right))

0.8909038442893615
0.5654517453660549
0.8626543013758097
0.6608601608976962


In [8]:
def analogy_reasoning(w1,w2,w3, word_to_vec_map, similarity=None):
    """
    Given 3 words, this computes the fourth word with analogy-reasoning,
    i.e. completes "w1 is to w2 as w3 is to ?".

    The answer is the vocabulary word (other than the three inputs) whose
    embedding is most similar to e3 + e2 - e1.

    Arguments:
        w1, w2, w3: String of 1st, 2nd, 3rd word respectively (case-insensitive).
        word_to_vec_map: Dictionary containing words and their corresponding Embeddings.
        similarity: Optional callable (target_vector, candidate_vector) -> score,
            where a larger score means more similar. Defaults to
            euclid_distance_similarity; cosine_similarity can be passed instead.

    Return:
        The best-scoring word, or None if no candidate word is available
        (e.g. the vocabulary only contains the three given words).

    Raises:
        KeyError: if any of the three words is missing from word_to_vec_map.
    """
    if similarity is None:
        similarity = euclid_distance_similarity

    w1,w2,w3 =w1.lower(), w2.lower(), w3.lower()

    e1, e2, e3=word_to_vec_map[w1], word_to_vec_map[w2], word_to_vec_map[w3]
    # The target is the same for every candidate, so compute it once.
    target=e3+e2-e1
    given_words={w1,w2,w3}

    best_score=float( '-inf' )
    best_word=None
    for word, e_word in word_to_vec_map.items():
        # The input words themselves are never valid answers.
        if word in given_words:
            continue
        score=similarity(target, e_word)
        # Strict comparison: on ties the earliest word in the map wins.
        if score>best_score:
            best_score=score
            best_word=word
    return best_word

In [9]:
# Try a handful of analogies of the form "a -> b :: c -> ?".
triads_to_try = [
    ('italy', 'italian', 'spain'),
    ('india', 'delhi', 'japan'),
    ('man', 'woman', 'boy'),
    ('small', 'smaller', 'large'),
    ("canada", "india", "ottawa"),
    ("canada", "ottawa", "india"),
]
for first, second, third in triads_to_try:
    guess = analogy_reasoning(first, second, third, word_to_vec_map)
    print ('{} -> {} :: {} -> {}'.format(first, second, third, guess))

italy -> italian :: spain -> spanish
india -> delhi :: japan -> tokyo
man -> woman :: boy -> girl
small -> smaller :: large -> larger
canada -> india :: ottawa -> delhi
canada -> ottawa :: india -> delhi


In [21]:
# A second batch of analogies, this time pairing gender, nationality and
# colour words with occupation/activity words to see what associations
# the embeddings produce.
triads_to_try = [
    ( "man", "doctor", "woman" ),
    ( "woman", 'doctor', "man" ),
    ( "india", "action", "pakistan" ),
    ( "black", "athlete",  "white" ),
]
for first, second, third in triads_to_try:
    guess = analogy_reasoning(first, second, third, word_to_vec_map)
    print ('{} -> {} :: {} -> {}'.format(first, second, third, guess))

man -> doctor :: woman -> nurse
woman -> doctor :: man -> colleague
india -> action :: pakistan -> threats
black -> athlete :: white -> participant


[Reference for the Embeddings](https://nlp.stanford.edu/projects/glove/)