In [196]:
import numpy as np
import fasttext
import io
import math

Load in vectors from fasttext:
https://fasttext.cc/docs/en/english-vectors.html

In [197]:
def load_vectors(fname, n_vecs):
    """Load the first ``n_vecs`` word vectors from a fastText ``.vec`` text file.

    The first line of the file is a header of two integers; every following
    line is a token followed by its space-separated vector components.
    Lines are read in file order (presumably most frequent words first --
    confirm against the fastText documentation).

    Parameters
    ----------
    fname : str
        Path to the ``.vec`` file.
    n_vecs : int
        Maximum number of vector lines to read. If the file holds fewer
        lines, every vector in the file is returned.

    Returns
    -------
    dict
        Maps each token to a 1-D ``numpy`` array of floats. A token that
        appears on several lines keeps the last vector read.

    Raises
    ------
    ValueError
        If the header line is not exactly two integers, or a vector
        component cannot be parsed as a float.
    """
    data = {}
    # The context manager closes the file on every exit path.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Parsing the header validates the file format; the values themselves
        # are not needed afterwards.
        n, d = map(int, fin.readline().split())
        if n_vecs <= 0:
            return data
        lines_read = 0
        for line in fin:
            stripped = line.rstrip()
            if not stripped:
                # A blank line would otherwise create a '' key with an empty vector.
                continue
            tokens = stripped.split(' ')
            data[tokens[0]] = np.array([float(t) for t in tokens[1:]])
            lines_read += 1
            if lines_read >= n_vecs:
                break
    # Reached both when n_vecs lines were read and when the file ran out first.
    return data

In [198]:
# Number of vector lines to read from the pretrained file.
num_vecs = 100_000
# word -> embedding array
vecs = load_vectors("wiki-news-300d-1M.vec", num_vecs)
# Words as a list (dict insertion order) so they can be indexed by position.
keys = list(vecs)

In [199]:
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    Computed as the dot product divided by the product of the two norms.
    ``v1`` must provide a ``.dot`` method (e.g. a numpy array).
    """
    inner_product = v1.dot(v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return inner_product / norm_product
    
    
# Estimate the average cosine similarity between randomly chosen word pairs.
trials = 100
cumsum = 0
# Sample positions from len(keys), not num_vecs: the dict may hold fewer
# entries than were requested, and indexing past the end of `keys` would
# raise IndexError.
n_words = len(keys)
for i in range(trials):
    word1 = keys[np.random.randint(n_words)]
    word2 = keys[np.random.randint(n_words)]
    cumsum += cosine_similarity(vecs[word1], vecs[word2])
# Turn the running sum into the mean over all trials.
cumsum = cumsum / trials

print(cumsum)

0.19469498133283902


If the directions of the vectors were uniformly distributed in the space, we would expect the above experiment to show an average cosine similarity of approximately zero. The magnitude of the average cosine similarity may therefore be considered a rough measure of "hubness".

In [200]:
def get_knn(query_vec, k = 1):
    """Return the ``k`` stored words most similar to ``query_vec``.

    Every word in the module-level ``keys`` list is scored with
    ``cosine_similarity`` against ``query_vec``.

    Parameters
    ----------
    query_vec : array
        Vector to compare against each entry of ``vecs``.
    k : int, optional
        Maximum number of neighbors to return (default 1).

    Returns
    -------
    list of (float, str)
        ``(cosine similarity, word)`` tuples sorted from most to least
        similar. Holds fewer than ``k`` entries when fewer words are
        available, and is empty when ``k`` is less than 1.
    """
    if k < 1:
        return []
    closest_matches = []
    for key in keys:
        cos_theta = cosine_similarity(vecs[key], query_vec)
        if np.isnan(cos_theta):
            # Undefined similarity (e.g. a zero-norm vector) cannot be ranked.
            continue
        # Accept while the list is not yet full; afterwards only accept a
        # candidate that strictly beats the current k-th best.
        if len(closest_matches) < k or cos_theta > closest_matches[-1][0]:
            closest_matches.append((cos_theta, key))
            closest_matches.sort(reverse = True)
            del closest_matches[k:]
    return closest_matches

def show_knn(query_vec, k = 1):
    """Print the ``k`` nearest neighbors of ``query_vec``, one per line.

    Each line has the form ``word: similarity``, in the order returned by
    ``get_knn``.
    """
    for similarity, word in get_knn(query_vec, k = k):
        print(f"{word}: {similarity}")

Now that we have a nice-to-handle word vector data structure, and a function for retrieving nearest neighbors, we can play around with the vector space!

In [203]:
# Inspect the raw embedding array stored for the word "dog".
vecs["dog"]

array([-8.860e-02,  3.510e-02,  3.920e-02,  1.570e-01, -2.760e-02,
        2.400e-03, -7.510e-02, -9.310e-02,  1.010e-01, -2.100e-03,
        5.930e-02,  1.100e-02,  1.730e-01,  5.630e-02,  2.550e-02,
        2.320e-02,  1.158e-01,  4.220e-02,  6.080e-02, -1.078e-01,
       -1.808e-01,  4.340e-02, -1.358e-01,  7.340e-02, -1.350e-01,
        2.790e-02, -1.970e-02,  1.795e-01, -1.206e-01,  4.450e-02,
        7.580e-02, -1.076e-01,  1.460e-02,  8.420e-02, -2.139e-01,
       -6.450e-02,  4.800e-02,  3.930e-02,  5.630e-02,  2.820e-02,
       -6.820e-02,  2.524e-01, -4.160e-02, -9.900e-03, -7.130e-02,
       -4.260e-02,  6.570e-02, -1.421e-01,  9.200e-03, -1.495e-01,
        1.142e-01, -1.868e-01, -6.275e-01, -9.090e-02, -7.850e-02,
       -1.130e-02, -1.500e-03,  1.011e-01,  3.430e-02, -8.040e-02,
        9.480e-02, -8.250e-02, -1.163e-01, -1.611e-01, -3.260e-02,
       -6.840e-02,  7.960e-02, -1.900e-01, -7.530e-02,  1.507e-01,
       -1.954e-01, -2.490e-02,  7.890e-02,  8.710e-02,  2.300e

In [201]:
# Print the five stored words with the highest cosine similarity to "love".
show_knn(vecs["love"], k = 5)

love: 0.9999999999999999
hate: 0.7127140373945532
loving: 0.7031040644656243
loved: 0.677062808092265
loves: 0.6755636195802975


In [202]:
# Build a vector that is zero everywhere except for a 1 at index 52
# (the 53rd coordinate), then list its three nearest neighbors.
empty_half_plane = np.zeros(300)
empty_half_plane[52] = 1
show_knn(empty_half_plane, k = 3)

Dunhill: -0.049358651590376765
compels: -0.05567376493485118
cross-sections: -0.06605214092327753


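The cosine similarity between the above vector (the 53rd standard basis vector) and a word vector $v$ is $v_{53} / \lVert v \rVert$, so its sign is the sign of the 53rd coordinate of $v$. Since even the nearest neighbor of the above vector has a negative cosine similarity, none of the loaded word embeddings has a 53rd coordinate > 0, i.e. there is an entirely empty half-space.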