# Embeddings

This is a notebook to explore properties of embeddings, mostly the possibility to find semantically similar words or analogies for natural language with mathematical properties. The embeddings must be already downloadad in _data/glove.6B/glove.6B.100d.txt_, an example of how to do that can be found at _sample.ipynb_.

In [1]:
import numpy as np
import heapq as hp

In [2]:
def load_embeddings(path):
    ans = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            ans[word] = vector
    return ans 

In [3]:
path = 'data/glove.6B/glove.6B.100d.txt'
embeddings = load_embeddings(path)
print(embeddings['hello'])

[ 0.26688    0.39632    0.6169    -0.77451   -0.1039     0.26697
  0.2788     0.30992    0.0054685 -0.085256   0.73602   -0.098432
  0.5479    -0.030305   0.33479    0.14094   -0.0070003  0.32569
  0.22902    0.46557   -0.19531    0.37491   -0.7139    -0.51775
  0.77039    1.0881    -0.66011   -0.16234    0.9119     0.21046
  0.047494   1.0019     1.1133     0.70094   -0.08696    0.47571
  0.1636    -0.44469    0.4469    -0.93817    0.013101   0.085964
 -0.67456    0.49662   -0.037827  -0.11038   -0.28612    0.074606
 -0.31527   -0.093774  -0.57069    0.66865    0.45307   -0.34154
 -0.7166    -0.75273    0.075212   0.57903   -0.1191    -0.11379
 -0.10026    0.71341   -1.1574    -0.74026    0.40452    0.18023
  0.21449    0.37638    0.11239   -0.53639   -0.025092   0.31886
 -0.25013   -0.63283   -0.011843   1.377      0.86013    0.20476
 -0.36815   -0.68874    0.53512   -0.46556    0.27389    0.4118
 -0.854     -0.046288   0.11304   -0.27326    0.15636   -0.20334
  0.53586    0.59784   

In [4]:
def cosine_similarity(u,v):
    ans = 0.0
    prod = np.dot(u,v)
    ans = prod / (np.linalg.norm(u) * np.linalg.norm(v))
    return ans

In [5]:
def analogy(word_1, sim_1, word_2, dic):
    word_1 = word_1.lower()
    sim_1 = sim_1.lower()
    word_2 = word_2.lower()
    emb_word_1, emb_sim_1, emb_word_2 = dic[word_1], dic[sim_1], dic[word_2]
    # target = emb_sim_1 - emb_word_1 + emb_word_2
    curr = -999
    # min_diff = 100000
    ans = 'hello'
    for word, emb in dic.items():
        if word in [word_1, sim_1, word_2]:
            continue
        # diff = np.linalg.norm(target - emb)
        similarity = cosine_similarity(emb_sim_1 - emb_word_1, emb - emb_word_2)
        if (similarity > curr):
            curr = similarity
            ans = word
        # if (diff < min_diff):
        #     ans = word
        #     min_diff = diff
    return ans

In [14]:
analogy('man','king','woman',embeddings)

'queen'

In [19]:
analogy('austria','red','argentina',embeddings)

'blue'

In [34]:
analogy('austria','vienna','france',embeddings)

'paris'

In [8]:
def find_n_similar(word_sim, dic, n=10):
    word_emb = dic[word_sim]
    # store (-distance, word) because it is a min heap
    heap = []
    for word, emb in dic.items():
        if(word == word_sim):
            continue
        dist = np.linalg.norm(word_emb - emb)
        if (len(heap) < n):
            hp.heappush(heap,(-dist,word))
        elif ((-heap[0][0])>dist):
            #remove largest element and add current
            hp.heappop(heap)
            hp.heappush(heap,(-dist,word))
    return [tup[1] for tup in heap] 

In [9]:
find_n_similar('austria',embeddings,n=20)

['slovenia',
 'greece',
 'poland',
 'finland',
 'lithuania',
 'norway',
 'bulgaria',
 'belgium',
 'sweden',
 'latvia',
 'slovakia',
 'netherlands',
 'romania',
 'italy',
 'hungary',
 'denmark',
 'germany',
 'switzerland',
 'austrian',
 'luxembourg']