In [1]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

## Load GloVe
The data is a simple text file in the format:  
```
word1 val1 val2 val3 ... valn
word2 val1 val2 val3 ... valn
...
...
wordk ...
```
Where:  
`wordi` -> the i-th word in the vocabulary  
`val1, val2, ..., valn` -> the components of the n-dimensional vector for `wordi`

In [3]:
# Machine-specific location of the 50-dimensional GloVe vectors file;
# point this at your local copy before running the notebook.
GLOVE_PATH = "/home/paradox/data/glove/glove.6B.50d.txt"

In [5]:
def load_embeddings(path):
    """Load GloVe-style word vectors from a whitespace-separated text file.

    Each non-blank line is expected to look like ``word val1 val2 ... valn``.

    Args:
        path: Path to the embeddings text file.

    Returns:
        dict mapping each word (str) to a 1-D ``numpy`` float array holding
        its vector components.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    glove = {}
    # Decode explicitly as UTF-8 so non-ASCII words load the same way
    # regardless of the platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            tokens = line.split()
            # Blank or whitespace-only lines (e.g. a trailing newline at the
            # end of the file) carry no word and would fail on tokens[0].
            if not tokens:
                continue
            word, values = tokens[0], tokens[1:]
            glove[word] = np.array([float(value) for value in values])
    return glove

In [6]:
# Parse the whole embeddings file into an in-memory dict of word -> vector.
glove = load_embeddings(GLOVE_PATH)

## Inspect Vectors

In [13]:
# Check the dimensionality of a single word vector.
glove['human'].shape

(50,)

In [7]:
# Raw vector for 'king' (one of the operands of the analogy further down).
glove['king']

array([ 0.50451 ,  0.68607 , -0.59517 , -0.022801,  0.60046 , -0.13498 ,
       -0.08813 ,  0.47377 , -0.61798 , -0.31012 , -0.076666,  1.493   ,
       -0.034189, -0.98173 ,  0.68229 ,  0.81722 , -0.51874 , -0.31503 ,
       -0.55809 ,  0.66421 ,  0.1961  , -0.13495 , -0.11476 , -0.30344 ,
        0.41177 , -2.223   , -1.0756  , -1.0783  , -0.34354 ,  0.33505 ,
        1.9927  , -0.04234 , -0.64319 ,  0.71125 ,  0.49159 ,  0.16754 ,
        0.34344 , -0.25663 , -0.8523  ,  0.1661  ,  0.40102 ,  1.1685  ,
       -1.0137  , -0.21585 , -0.15155 ,  0.78321 , -0.91241 , -1.6106  ,
       -0.64426 , -0.51042 ])

In [8]:
# Raw vector for 'queen' (the expected target of the analogy further down).
glove['queen']

array([ 0.37854  ,  1.8233   , -1.2648   , -0.1043   ,  0.35829  ,
        0.60029  , -0.17538  ,  0.83767  , -0.056798 , -0.75795  ,
        0.22681  ,  0.98587  ,  0.60587  , -0.31419  ,  0.28877  ,
        0.56013  , -0.77456  ,  0.071421 , -0.5741   ,  0.21342  ,
        0.57674  ,  0.3868   , -0.12574  ,  0.28012  ,  0.28135  ,
       -1.8053   , -1.0421   , -0.19255  , -0.55375  , -0.054526 ,
        1.5574   ,  0.39296  , -0.2475   ,  0.34251  ,  0.45365  ,
        0.16237  ,  0.52464  , -0.070272 , -0.83744  , -1.0326   ,
        0.45946  ,  0.25302  , -0.17837  , -0.73398  , -0.20025  ,
        0.2347   , -0.56095  , -2.2839   ,  0.0092753, -0.60284  ])

In [9]:
# Raw vector for 'woman' (added in the analogy further down).
glove['woman']

array([-1.8153e-01,  6.4827e-01, -5.8210e-01, -4.9451e-01,  1.5415e+00,
        1.3450e+00, -4.3305e-01,  5.8059e-01,  3.5556e-01, -2.5184e-01,
        2.0254e-01, -7.1643e-01,  3.0610e-01,  5.6127e-01,  8.3928e-01,
       -3.8085e-01, -9.0875e-01,  4.3326e-01, -1.4436e-02,  2.3725e-01,
       -5.3799e-01,  1.7773e+00, -6.6433e-02,  6.9795e-01,  6.9291e-01,
       -2.6739e+00, -7.6805e-01,  3.3929e-01,  1.9695e-01, -3.5245e-01,
        2.2920e+00, -2.7411e-01, -3.0169e-01,  8.5286e-04,  1.6923e-01,
        9.1433e-02, -2.3610e-02,  3.6236e-02,  3.4488e-01, -8.3947e-01,
       -2.5174e-01,  4.2123e-01,  4.8616e-01,  2.2325e-02,  5.5760e-01,
       -8.5223e-01, -2.3073e-01, -1.3138e+00,  4.8764e-01, -1.0467e-01])

In [10]:
# Raw vector for 'man' (subtracted in the analogy further down).
glove['man']

array([-0.094386,  0.43007 , -0.17224 , -0.45529 ,  1.6447  ,  0.40335 ,
       -0.37263 ,  0.25071 , -0.10588 ,  0.10778 , -0.10848 ,  0.15181 ,
       -0.65396 ,  0.55054 ,  0.59591 , -0.46278 ,  0.11847 ,  0.64448 ,
       -0.70948 ,  0.23947 , -0.82905 ,  1.272   ,  0.033021,  0.2935  ,
        0.3911  , -2.8094  , -0.70745 ,  0.4106  ,  0.3894  , -0.2913  ,
        2.6124  , -0.34576 , -0.16832 ,  0.25154 ,  0.31216 ,  0.31639 ,
        0.12539 , -0.012646,  0.22297 , -0.56585 , -0.086264,  0.62549 ,
       -0.0576  ,  0.29375 ,  0.66005 , -0.53115 , -0.48233 , -0.97925 ,
        0.53135 , -0.11725 ])

## Vector Arithmetic
With these vectors, we can do arithmetic on word meanings. The result is only approximately equal to the target word's vector:  
```
king - man + woman ≈ queen
```

In [11]:
# Element-wise analogy arithmetic: start at 'king', remove 'man', add 'woman'.
res = glove['king'] - glove['man'] + glove['woman']

In [23]:
# Show the analogy vector and confirm it has the same shape as the word vectors.
res, res.shape

(array([ 0.417366  ,  0.90427   , -1.00503   , -0.062021  ,  0.49726   ,
         0.80667   , -0.14855   ,  0.80365   , -0.15654   , -0.66974   ,
         0.234354  ,  0.62476   ,  0.925871  , -0.971     ,  0.92566   ,
         0.89915   , -1.54596   , -0.52625   ,  0.136954  ,  0.66199   ,
         0.48716   ,  0.37035   , -0.214214  ,  0.10101   ,  0.71358   ,
        -2.0875    , -1.1362    , -1.14961   , -0.53599   ,  0.2739    ,
         1.6723    ,  0.02931   , -0.77656   ,  0.46056286,  0.34866   ,
        -0.057417  ,  0.19444   , -0.207748  , -0.73039   , -0.10752   ,
         0.235544  ,  0.96424   , -0.46994   , -0.487275  , -0.254     ,
         0.46213   , -0.66081   , -1.94515   , -0.68797   , -0.49784   ]),
 (50,))

#### Similarity Test

In [24]:
def compute_cosine_similarity(v1, v2):
    """Return the cosine of the angle between two vectors.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``, or ``0.0`` when either vector
        has zero magnitude.
    """
    mag1 = np.linalg.norm(v1)
    mag2 = np.linalg.norm(v2)
    denominator = mag1 * mag2
    # A zero-length vector has no direction; dividing by its norm would give
    # NaN plus a RuntimeWarning, so report "no similarity" instead.
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator

In [27]:
def similarity_test(s1, s2):
    """Return the cosine similarity between the vectors of two words.

    Both words are looked up in the notebook-level ``glove`` table.

    Args:
        s1: First word.
        s2: Second word.

    Returns:
        The cosine similarity of the two word vectors, or ``0.0`` when
        either word is not in the vocabulary.
    """
    v1 = glove.get(s1)
    v2 = glove.get(s2)
    # A zero-vector stand-in for a missing word would have to match the
    # embedding's dimension and would divide by a zero norm (NaN), so
    # out-of-vocabulary words short-circuit to "no similarity".
    if v1 is None or v2 is None:
        return 0.0
    return compute_cosine_similarity(v1, v2)

In [33]:
# Spot-check the similarity score for one pair of words.
similarity_test('life', 'mind')

0.8514841863412058

#### Get Similar Words

In [54]:
def find_similar(word, embedding, top_n=5):
    """Rank every word in ``embedding`` by cosine similarity to ``word``.

    Args:
        word: Query word.
        embedding: Mapping of word -> vector.
        top_n: Number of best matches to return.

    Returns:
        A list of ``(score, word)`` tuples sorted by descending score and
        truncated to ``top_n`` entries. The query word itself is scored too,
        so it is included in the ranking. Returns ``[]`` when ``word`` is
        not in ``embedding``.
    """
    if word not in embedding:
        return []
    wvec = embedding[word]
    # Score the query against the full vocabulary in one pass.
    scored = [
        (compute_cosine_similarity(wvec, vec), candidate)
        for candidate, vec in embedding.items()
    ]
    return sorted(scored, key=lambda pair: pair[0], reverse=True)[:top_n]

In [55]:
# Ten nearest neighbours of 'queen'; the query word itself is part of the ranking.
find_similar('queen', glove, top_n=10)

[(1.0000000000000002, 'queen'),
 (0.851516638750669, 'princess'),
 (0.8050609250765663, 'lady'),
 (0.7873042176943493, 'elizabeth'),
 (0.7839043010964117, 'king'),
 (0.7821860976090151, 'prince'),
 (0.7692777928548158, 'coronation'),
 (0.7626097498967261, 'consort'),
 (0.744286480167597, 'royal'),
 (0.7382649680186563, 'crown')]

## Encode Full Text/Document
Since word2vec and GloVe give vectors for single words, we can average the word vectors of the words in a text 
to convert the text into a single vector.  
In effect, we are generating a document vector from word vectors.

In [56]:
def doc2vec(text, ignore_case=True):
    """Encode a text as the average of the vectors of its known words.

    The text is split on whitespace and each token is looked up in the
    notebook-level ``glove`` table; tokens missing from it are skipped.

    Args:
        text: Input text.
        ignore_case: When true, tokens are lower-cased before lookup.

    Returns:
        A 1-D ``numpy`` array: the mean of the matched word vectors, or a
        zero vector when no token is found in the vocabulary.
    """
    tokens = text.split()
    if ignore_case:
        tokens = [token.lower() for token in tokens]
    vectors = [glove[token] for token in tokens if token in glove]
    if not vectors:
        # Nothing to average: size the zero vector from the loaded table,
        # falling back to 50 (the dimension this notebook was written for).
        dim = len(next(iter(glove.values()))) if glove else 50
        return np.zeros(dim)
    # Divide by the number of matched words, not by the embedding dimension,
    # so the result is the actual mean word vector.
    return np.sum(vectors, axis=0) / len(vectors)

In [57]:
# Compare a single word against a two-word phrase.
v1 = doc2vec("located")
v2 = doc2vec("location of")
compute_cosine_similarity(v1, v2)

0.7395517877525256

In [58]:
# Compare a single word against a two-word phrase with no words in common.
v1 = doc2vec("sorry")
v2 = doc2vec("forgive me")
compute_cosine_similarity(v1, v2)

0.8547478507418468

In [68]:
# Compare two single words; this pair scores lower than the two pairs above.
v1 = doc2vec("queen")
v2 = doc2vec("man")
compute_cosine_similarity(v1, v2)

0.5366700279346573