In [148]:
import heapq
import io
import math

import fasttext
import numpy as np

Load pre-trained English word vectors from fastText (download page below):
https://fasttext.cc/docs/en/english-vectors.html

In [149]:
def load_vectors(fname, n_vecs):
    """Read the first ``n_vecs`` word vectors from a fastText-style ``.vec`` text file.

    The first line of the file must hold two integers (parsed here as the
    vector count and the dimension). Every following line is a token followed
    by its space-separated float components. Only the first ``n_vecs`` of
    those lines are read; the original comment describes them as the most
    frequent words, which presumably reflects the file's ordering.

    Args:
        fname: Path of the ``.vec`` file to read.
        n_vecs: Maximum number of vector lines to read. Zero or a negative
            value yields an empty dict.

    Returns:
        dict mapping each token to a 1-D ``np.ndarray`` of floats. It holds
        fewer than ``n_vecs`` entries when the file is shorter than that, or
        when a token appears on more than one line (the later line wins).

    Raises:
        ValueError: If the header is not two integers or a vector component
            cannot be parsed as a float.
    """
    vectors = {}
    # The context manager closes the handle on every exit path, including errors.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header is parsed only to validate it; the sizes are not needed below.
        _n_total, _dim = map(int, fin.readline().split())
        for line_no, line in enumerate(fin):
            if line_no >= n_vecs:
                break
            tokens = line.rstrip().split(' ')
            vectors[tokens[0]] = np.array([float(tok) for tok in tokens[1:]])
    # Reached both when the limit is hit and when the file runs out first.
    return vectors

In [150]:
# Cap on how many vector lines are read from the fastText file.
num_vecs = 100_000
vecs = load_vectors("wiki-news-300d-1M.vec", n_vecs=num_vecs)
# Ordered list of the loaded words, so they can be picked by integer index.
keys = [*vecs]

In [151]:
# Estimate the mean dot product between two randomly chosen word vectors.
trials = 100
# Sample over the words actually loaded: `keys` can be shorter than `num_vecs`
# (short file, repeated tokens), and indexing past its end raises IndexError.
n_keys = len(keys)
cumsum = 0
for trial in range(trials):
    idx1 = np.random.randint(n_keys)
    idx2 = np.random.randint(n_keys)
    cumsum += vecs[keys[idx1]].dot(vecs[keys[idx2]])
# From here on `cumsum` holds the average over all trials, not the running sum.
cumsum = cumsum / trials

print(cumsum)

1.203819212


In [152]:

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    Computed as the dot product divided by the product of the two norms.
    No guard is applied for zero-length vectors: the division is carried
    out as-is.
    """
    inner = v1.dot(v2)
    scale = np.linalg.norm(v1) * np.linalg.norm(v2)
    return inner / scale

def get_knn(query_vec, k = 1, vectors = None):
    """Find the ``k`` words whose vectors are most cosine-similar to ``query_vec``.

    Args:
        query_vec: The vector to compare every candidate against.
        k: Number of neighbours wanted. Zero or a negative value yields an
            empty list.
        vectors: Optional mapping of word -> vector to search. When omitted,
            the module-level ``keys`` / ``vecs`` are searched.

    Returns:
        A list of ``(cosine_similarity, word)`` tuples sorted from most to
        least similar, with ties broken by tuple comparison (i.e. by word).
        The list has fewer than ``k`` entries when fewer candidates have a
        defined similarity; no placeholder entry is ever added.
    """
    if k <= 0:
        return []
    if vectors is None:
        # Default path: search the vectors loaded at notebook level.
        candidates = ((key, vecs[key]) for key in keys)
    else:
        candidates = vectors.items()
    scored = (
        (cosine_similarity(vec, query_vec), word) for word, vec in candidates
    )
    # A NaN similarity (e.g. from a zero-length vector) cannot be ranked, so
    # such candidates are left out of the result.
    rankable = (pair for pair in scored if not np.isnan(pair[0]))
    # Keeps only k items at a time instead of re-sorting after every hit.
    return heapq.nlargest(k, rankable)

def show_knn(query_vec, k = 1):
    """Print the ``k`` nearest words to ``query_vec``, one ``word: similarity`` per line."""
    for similarity, word in get_knn(query_vec, k = k):
        print(f"{word}: {similarity}")

In [157]:
show_knn(vecs["man"] + vecs['royal'], 5)

royal: 0.8649942294282293
man: 0.7981027103621264
king: 0.7392085765872102
prince: 0.7163905971310617
woman: 0.6905222115891921
