In [1]:
# Import libraries
import pandas as pd
import numpy as np
import scipy
from scipy import stats
from scipy import spatial
from scipy.spatial import distance
from numpy.linalg import norm
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load word2vec data
word2vec = {}
coordinates = []
with open("../word2vec/W2V_150.txt", 'r', encoding='utf-8') as f:
    N = int(f.readline())
    dim = int(f.readline())
    for i in range(N):
        line = f.readline()
        elements = line.split()
        word2vec[elements[0]] = np.asanyarray([float(i) for i in elements[1:]], dtype=np.float32)
        coordinates.append(word2vec[elements[0]])
coordinates = np.array(coordinates)
list_word = np.array(list(word2vec.keys()))

# Ex1:

a, Using cosine distance

In [3]:
# Cosine metric
def cosine_dist(x, y):
    return 1 - distance.cosine(x, y)

b, Using other metrics

In [4]:
# Dot product distance
def dot_product_distance(x, y):
    pass

In [5]:
# Euclidean dstance
def Euclidean_dist(x, y):
    return distance.euclidean(x, y)

In [6]:
# Dice distance
def Dice_dist(x, y):
    return distance.dice(x, y)

In [7]:
# Jaccard distance
def Jaccard_dist(x, y):
    return distance.jaccard(x, y)

In [8]:
word1 = "trai"
word2 = "nam"
print(word1, word2)
print("Cosine distance is ", cosine_dist(word2vec[word1], word2vec[word2]))
print("Dot product distance is ", dot_product_distance(word2vec[word1], word2vec[word2]))
print("Euclidean dstance is ", Euclidean_dist(word2vec[word1], word2vec[word2]))
print("Dice distance is ", Dice_dist(word2vec[word1], word2vec[word2]))
print("Jaccard distance is ", Jaccard_dist(word2vec[word1], word2vec[word2]))

trai nam
Cosine distance is  0.13904590904712677
Dot product distance is  None
Euclidean dstance is  15.099640846252441
Dice distance is  7.688263969182271
Jaccard distance is  1.0


In [9]:
# Load ViSim to test
data = pd.read_csv("../datasets/ViSim-400/Visim-400.txt", sep = "\t")
data

Unnamed: 0,Word1,Word2,POS,Sim1,Sim2,STD
0,biến,ngập,V,3.13,5.22,0.72
1,nhà_thi_đấu,nhà,N,3.07,5.12,1.18
2,động,tĩnh,V,0.60,1.00,0.95
3,khuyết,ưu,N,0.20,0.33,0.40
4,cõi_tục,cõi_âm,N,0.60,1.00,0.95
...,...,...,...,...,...,...
395,lình_xình,nặng_tình,A,1.33,2.22,1.14
396,người_làm,người_bị_hại,N,2.20,3.67,0.83
397,cõi_tục,trần_gian,N,5.40,9.00,0.71
398,chần_chừ,lảo_đảo,V,3.20,5.33,0.98


In [11]:
def cosine_word(x, y):
    if x not in word2vec or y not in word2vec:
        return 0
    return 1 - distance.cosine(word2vec[x], word2vec[y])

In [12]:
data["cos"] = data.apply(lambda x : cosine_word(x["Word1"], x["Word2"]), axis = 1)

In [13]:
data["True_sim"] = data["Sim2"] / 10

In [27]:
for index, row in data.iterrows():
    if row["Word1"] not in word2vec or row["Word2"] not in word2vec:
        data = data.drop(index)
data

Unnamed: 0,Word1,Word2,POS,Sim1,Sim2,STD,cos,True_sim
0,biến,ngập,V,3.13,5.22,0.72,-0.004912,0.522
1,nhà_thi_đấu,nhà,N,3.07,5.12,1.18,0.082523,0.512
2,động,tĩnh,V,0.60,1.00,0.95,0.277086,0.100
3,khuyết,ưu,N,0.20,0.33,0.40,0.176799,0.033
5,thủ_pháp,biện_pháp,N,4.13,6.88,1.26,0.402366,0.688
...,...,...,...,...,...,...,...,...
393,triều_đại,cổ_đại,N,3.67,6.12,1.14,0.274376,0.612
395,lình_xình,nặng_tình,A,1.33,2.22,1.14,0.170494,0.222
396,người_làm,người_bị_hại,N,2.20,3.67,0.83,0.135008,0.367
398,chần_chừ,lảo_đảo,V,3.20,5.33,0.98,0.112939,0.533


In [28]:
print(" Pearson correlation coefficient: ", stats.pearsonr(data["cos"], data["True_sim"]))
print(" Spearman's rank correlation coefficient: ", stats.spearmanr(data["cos"], data["True_sim"]))

 Pearson correlation coefficient:  (0.44681974395860896, 2.7581796904741187e-18)
 Spearman's rank correlation coefficient:  SpearmanrResult(correlation=0.4077568887734169, pvalue=3.26456245952008e-15)


# Ex2:

In [16]:
# Init parameters
# Number of nearest neighbors
K = 10
# Word queried
w = "nhà"

In [17]:
# Model with metric of library
model = NearestNeighbors(n_neighbors=K, algorithm='brute', metric='cosine', n_jobs=-1)
model.fit(coordinates)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=10)

In [18]:
# Model with mannual metric
nbrs = NearestNeighbors(n_neighbors=K, algorithm='brute', metric=cosine_dist, n_jobs=-1)
nbrs.fit(coordinates)

NearestNeighbors(algorithm='brute',
                 metric=<function cosine_dist at 0x000001AE271F0C10>, n_jobs=-1,
                 n_neighbors=10)

In [19]:
# Find KNN with library metric model
model.kneighbors([word2vec[w]], K, return_distance=True)

(array([[5.9604645e-08, 2.8176874e-01, 3.7507415e-01, 3.8698512e-01,
         4.0976185e-01, 4.4179761e-01, 4.5175934e-01, 4.5987982e-01,
         4.6927774e-01, 4.6952456e-01]], dtype=float32),
 array([[   99, 29127, 12678, 48484, 74193, 69571, 10901, 10371,  6556,
         47989]], dtype=int64))

In [20]:
# Find KNN with mannual metric model
nbrs.kneighbors([word2vec[w]], K, return_distance=True)

(array([[-0.37769967, -0.36435413, -0.33553994, -0.33494672, -0.29812038,
         -0.29788288, -0.2915615 , -0.29100323, -0.28186232, -0.27850887]],
       dtype=float32),
 array([[11571, 29938, 11832, 19404, 13874, 11206, 43939,   759, 15388,
         21047]], dtype=int64))

In [21]:
# Manual calculate the cosine distance
res = np.dot(word2vec[w], coordinates.T) / norm(coordinates[np.where(list_word == w)]) / norm(coordinates, axis = 1)
res = 1 - res
# Find K smallest distance (not increasing order just K smallest)
index = np.argpartition(res, K)[:K]
print(res[index])
print(index)

[3.7507421e-01 4.0976179e-01 5.9604645e-08 2.8176874e-01 3.8698506e-01
 4.4179761e-01 4.6927768e-01 4.6952450e-01 4.5987982e-01 4.5175934e-01]
[12678 74193    99 29127 48484 69571  6556 47989 10371 10901]


In [22]:
# Print K nearest neighbor words
print("Queried word is: ", w, " with cosine distance.")
print(list_word[index])

Queried word is:  nhà  with cosine distance.
['nhà_chung' 'nhà_ở' 'nhà' 'nhà_tầng' 'nhà_di_động' 'nhà_nguyên' 'khu_nhà'
 'nhà_trệt' 'nhà_trọ' 'nhà_cao_tầng']


In [23]:
# Euclidean distance
model = NearestNeighbors(n_neighbors=K, algorithm='brute', metric='euclidean', n_jobs=-1)
model.fit(coordinates)
dist, index = model.kneighbors([word2vec[w]], K, return_distance=True)
print("Queried word is: ", w, " with Euclidean distance.")
print(list_word[index])

Queried word is:  nhà  with Euclidean distance.
[['nhà' 'nhà_nguyên' 'nhà_trệt' 'nhà_di_động' 'trên_nhà' '18,000' 'nàh'
  'ttxvn/vietnam' 'tay_không_bắt_giặc' 'nhà_phụ']]


In [24]:
# Dice distance
model = NearestNeighbors(n_neighbors=K, algorithm='brute', metric='dice', n_jobs=-1)
model.fit(coordinates)
dist, index = model.kneighbors([word2vec[w]], K, return_distance=True)
print("Queried word is: ", w, " with Dice distance.")
print(list_word[index])

Queried word is:  nhà  with Dice distance.
[['băng_sơn' 'vũ_kỳ' 'tàng_thư' '2013-2014' 'lên_nước' 'cao_bá_quát'
  'ấm_dần_lên' 'tảng_băng_trôi' 'nonthaburi' 'khuyển']]




In [25]:
# Jaccard distance
model = NearestNeighbors(n_neighbors=K, algorithm='brute', metric='jaccard', n_jobs=-1)
model.fit(coordinates)
dist, index = model.kneighbors([word2vec[w]], K, return_distance=True)
print("Queried word is: ", w, " with Jaccard distance.")
print(list_word[index])



Queried word is:  nhà  with Jaccard distance.
[['băng_sơn' 'vũ_kỳ' 'tàng_thư' '2013-2014' 'lên_nước' 'cao_bá_quát'
  'ấm_dần_lên' 'tảng_băng_trôi' 'nonthaburi' 'khuyển']]
