In [1]:
import numpy as np

import faiss

In [2]:
# Read the stored embedding matrix from disk.
embedding = np.load("./data/embedding_10.npy")

# Print the dimensions first; the bare name on the last line makes the
# notebook echo the array itself as the cell output.
print(np.shape(embedding))
embedding

(70000, 10)


array([[ 1.0809438 ,  4.5842113 ,  8.188814  , ...,  4.3815236 ,
         4.5348864 ,  3.8039389 ],
       [10.242349  ,  1.8976094 ,  5.18953   , ...,  5.2431035 ,
         3.8557544 ,  3.9528236 ],
       [10.297947  ,  3.2953832 ,  5.1501646 , ...,  5.2841597 ,
         3.912732  ,  4.7057314 ],
       ...,
       [ 9.052287  ,  2.36858   ,  5.025304  , ...,  5.19613   ,
         3.960764  ,  4.592086  ],
       [10.50785   , 10.595664  ,  4.872868  , ...,  5.2744155 ,
         3.4071124 ,  3.2184975 ],
       [ 0.59059334,  4.7471175 ,  3.3372943 , ...,  5.6807604 ,
         4.609006  ,  3.4885309 ]], dtype=float32)

In [3]:
# Number of neighbors wanted per point. The search below asks for k + 1 hits
# because the queries are the same points that were added to the index, so each
# point is also returned as a hit for itself.
k = 500

# FAISS expects a C-contiguous float32 matrix. Centering and converting in a
# single expression avoids the two full copies that
# `.copy(order="C").astype(np.float32)` made. Subtracting the per-column mean
# is a pure translation, so exact L2 distances are unaffected.
# NOTE(review): presumably the centering is there to keep float32 round-off
# small in the distance computation -- confirm.
embedding = np.ascontiguousarray(
    embedding - embedding.mean(axis=0), dtype=np.float32
)

# Asking for more hits than there are indexed points makes FAISS pad the
# result with -1 ids; fail loudly instead of letting those flow downstream.
if k + 1 > embedding.shape[0]:
    raise ValueError(
        f"k={k} needs at least {k + 1} points, got {embedding.shape[0]}"
    )

# Exact (brute-force) L2 index over all points.
index = faiss.IndexFlatL2(embedding.shape[1])
index.add(embedding)

# Both results have one row per query point and k + 1 columns:
# squared L2 distances and the ids of the matching indexed points.
sq_distances, nearest_neighbors = index.search(embedding, k + 1)

In [4]:
# Echo the neighbor id matrix returned by `index.search`.
nearest_neighbors

array([[    0, 10263, 19633, ..., 33110,  6610, 56310],
       [    1, 31493, 37439, ..., 25501, 15193,  1843],
       [    2, 40206, 53194, ..., 68008, 68626, 62329],
       ...,
       [69997, 27238, 66440, ..., 67091, 62004, 41057],
       [69998, 63769, 32988, ..., 52097, 48434, 48109],
       [69999, 21067, 21505, ..., 50867, 65201, 40145]])

In [9]:
# Map squared L2 distances to similarity scores with exp(-d^2).
# An inverse form, 1 / (1 + d^2), is the alternative that was tried here.
similarities = np.exp(np.negative(sq_distances))

# Report the smallest similarity among all retrieved hits, then echo the
# first 100 columns as the cell output.
print(np.min(similarities))
similarities[:, :100]

0.016716577


array([[1.        , 0.9930813 , 0.9928313 , ..., 0.93655777, 0.936072  ,
        0.9349158 ],
       [0.9999923 , 0.99528855, 0.99513674, ..., 0.9493778 , 0.9493778 ,
        0.94934875],
       [1.        , 0.99900866, 0.99822575, ..., 0.91397464, 0.9139729 ,
        0.9139328 ],
       ...,
       [0.9999962 , 0.99970627, 0.9939795 , ..., 0.934053  , 0.9324439 ,
        0.93127084],
       [1.        , 0.99964917, 0.9992983 , ..., 0.9602683 , 0.96000457,
        0.9599899 ],
       [0.9999923 , 0.9950608 , 0.99497724, ..., 0.930298  , 0.92980117,
        0.929702  ]], dtype=float32)

In [10]:
# Load the label array from disk.
# NOTE(review): presumably one label per embedding row -- confirm.
labels = np.load("./data/labels.npy")
# Bare name so the notebook echoes the loaded array.
labels

array([9, 0, 0, ..., 8, 1, 5])

In [11]:
from scipy.io import savemat

# The search queried the index with its own points, so every row of the result
# contains the query point itself and that entry must be removed before export.
# It is normally the first hit, but exact duplicates or round-off ties can rank
# another point first, so the self-match is located by id, not by position.
n_points, n_hits = nearest_neighbors.shape
is_self = nearest_neighbors == np.arange(n_points)[:, None]

# If a row does not contain its own id at all (pushed out by ties), drop its
# farthest hit instead so every row still loses exactly one column.
is_self[~is_self.any(axis=1), -1] = True
keep = ~is_self

# Boolean indexing flattens row by row, so reshaping restores one row per
# point with the original neighbor order preserved.
neighbor_ids = nearest_neighbors[keep].reshape(n_points, n_hits - 1)
neighbor_sims = similarities[keep].reshape(n_points, n_hits - 1)

# The file name carries the neighbor count; it is derived from the data so it
# cannot go stale when k changes (500 neighbors -> "..._500.mat").
# NOTE(review): the "+ 1" offsets presumably convert labels and ids to
# MATLAB's 1-based convention -- confirm with the consumer of this file.
savemat(
    f"fashion_nearest_neighbors_{n_hits - 1}.mat",
    {
        "labels": labels + 1,
        "nearest_neighbors": neighbor_ids + 1,
        "similarities": neighbor_sims,
    },
)

In [12]:
# Count how many times each distinct label value occurs.
uniq_vals, counts = np.unique(labels, return_counts=True)
# Echo both arrays as the cell output.
uniq_vals, counts

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([7000, 7000, 7000, 7000, 7000, 7000, 7000, 7000, 7000, 7000]))