In [1]:
# Import libraries
import pandas as pd
import numpy as np
import scipy
from scipy import stats
from scipy import spatial
from scipy.spatial import distance
from numpy.linalg import norm
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load ViSim
data = pd.read_csv("../datasets/ViSim-400/Visim-400.txt", sep = "\t")

In [3]:
# Load word2vec data
word2vec = {}
coordinates = []
with open("../word2vec/W2V_150.txt", 'r', encoding='utf-8') as f:
    N = int(f.readline())
    dim = int(f.readline())
    for i in range(N):
        line = f.readline()
        elements = line.split()
        word2vec[elements[0]] = np.asanyarray([float(i) for i in elements[1:]], dtype=np.float32)
        coordinates.append(word2vec[elements[0]])
coordinates = np.array(coordinates)
list_word = np.array(list(word2vec.keys()))

Ex1:

a, Using cosine distance

In [5]:
# Cosine metric
def cosine_dist(x, y):
    return distance.cosine(x, y)

b, Using other metrics

In [6]:
# Dot product distance
def dot_product_distance(x, y):
    pass

In [7]:
# Euclidean dstance
def Euclidean_dist(x, y):
    return distance.euclidean(x, y)

In [8]:
# Dice distance
def Dice_dist(x, y):
    return distance.dice(x, y)

In [9]:
# Jaccard distance
def Jaccard_dist(x, y):
    return distance.jaccard(x, y)

In [10]:
word1 = "trai"
word2 = "nam"
print(word1, word2)
print("Cosine distance is ", cosine_dist(word2vec[word1], word2vec[word2]))
print("Dot product distance is ", dot_product_distance(word2vec[word1], word2vec[word2]))
print("Euclidean dstance is ", Euclidean_dist(word2vec[word1], word2vec[word2]))
print("Dice distance is ", Dice_dist(word2vec[word1], word2vec[word2]))
print("Jaccard distance is ", Jaccard_dist(word2vec[word1], word2vec[word2]))

trai nam
Cosine distance is  0.8609540909528732
Dot product distance is  None
Euclidean dstance is  15.099640846252441
Dice distance is  7.688263969182271
Jaccard distance is  1.0


Ex2:

In [11]:
# Model with metric of library
model = NearestNeighbors(n_neighbors=K, algorithm='brute', metric='cosine', n_jobs=-1)
model.fit(coordinates)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=10)

In [12]:
# Model with mannual metric
nbrs = NearestNeighbors(n_neighbors=K, algorithm='brute', metric=cosine_dist, n_jobs=-1)
nbrs.fit(coordinates)

NearestNeighbors(algorithm='brute',
                 metric=<function cosine_dist at 0x0000028FB34DF4C0>, n_jobs=-1,
                 n_neighbors=10)

In [13]:
# Find KNN with library metric model
model.kneighbors([word2vec[w]], K, return_distance=True)

(array([[5.9604645e-08, 2.8176874e-01, 3.7507415e-01, 3.8698512e-01,
         4.0976185e-01, 4.4179761e-01, 4.5175934e-01, 4.5987982e-01,
         4.6927774e-01, 4.6952456e-01]], dtype=float32),
 array([[   99, 29127, 12678, 48484, 74193, 69571, 10901, 10371,  6556,
         47989]], dtype=int64))

In [14]:
# Find KNN with mannual metric model
nbrs.kneighbors([word2vec[w]], K, return_distance=True)

(array([[0.        , 0.2817688 , 0.3750741 , 0.386985  , 0.40976185,
         0.44179755, 0.4517594 , 0.45987988, 0.46927774, 0.46952456]],
       dtype=float32),
 array([[   99, 29127, 12678, 48484, 74193, 69571, 10901, 10371,  6556,
         47989]], dtype=int64))

In [15]:
# Manual calculate the cosine distance
res = np.dot(word2vec[w], coordinates.T) / norm(coordinates[np.where(list_word == w)]) / norm(coordinates, axis = 1)
res = 1 - res
# Find K smallest distance (not increasing order just K smallest)
index = np.argpartition(res, K)[:K]
print(res[index])
print(index)

[3.7507421e-01 4.0976179e-01 5.9604645e-08 2.8176874e-01 3.8698506e-01
 4.4179761e-01 4.6927768e-01 4.6952450e-01 4.5987982e-01 4.5175934e-01]
[12678 74193    99 29127 48484 69571  6556 47989 10371 10901]


In [16]:
# Print K nearest neighbor words
print("Queried word is: ", w)
print(list_word[index])

Queried word is:  nhà
['nhà_chung' 'nhà_ở' 'nhà' 'nhà_tầng' 'nhà_di_động' 'nhà_nguyên' 'khu_nhà'
 'nhà_trệt' 'nhà_trọ' 'nhà_cao_tầng']


In [17]:
# Euclidean distance
model = NearestNeighbors(n_neighbors=K, algorithm='brute', metric='euclidean', n_jobs=-1)
model.fit(coordinates)
dist, index = model.kneighbors([word2vec[w]], K, return_distance=True)
print("Queried word is: ", w)
print(list_word[index])

Queried word is:  nhà
[['nhà' 'nhà_nguyên' 'nhà_trệt' 'nhà_di_động' 'trên_nhà' '18,000' 'nàh'
  'ttxvn/vietnam' 'tay_không_bắt_giặc' 'nhà_phụ']]


In [18]:
# Dice distance
model = NearestNeighbors(n_neighbors=K, algorithm='brute', metric='dice', n_jobs=-1)
model.fit(coordinates)
dist, index = model.kneighbors([word2vec[w]], K, return_distance=True)
print("Queried word is: ", w)
print(list_word[index])

Queried word is:  nhà
[['băng_sơn' 'vũ_kỳ' 'tàng_thư' '2013-2014' 'lên_nước' 'cao_bá_quát'
  'ấm_dần_lên' 'tảng_băng_trôi' 'nonthaburi' 'khuyển']]




In [19]:
# Jaccard distance
model = NearestNeighbors(n_neighbors=K, algorithm='brute', metric='jaccard', n_jobs=-1)
model.fit(coordinates)
dist, index = model.kneighbors([word2vec[w]], K, return_distance=True)
print("Queried word is: ", w)
print(list_word[index])

Queried word is:  nhà
[['băng_sơn' 'vũ_kỳ' 'tàng_thư' '2013-2014' 'lên_nước' 'cao_bá_quát'
  'ấm_dần_lên' 'tảng_băng_trôi' 'nonthaburi' 'khuyển']]


