In [1]:
import numpy as np
from sklearn.neighbors import KDTree
import matplotlib.pyplot as plt
import time
import os
from annoy import AnnoyIndex

In [2]:
def random_unit_vectors(num_vectors, n):
    """
    Generate random vectors with unit Euclidean norm.

    Components are drawn from a standard normal distribution using NumPy's
    global random state (seed it with ``np.random.seed`` beforehand for
    reproducible output), then every row is scaled to length 1.

    Parameters
    ----------
    num_vectors : int
        Number of vectors (rows) to generate.
    n : int
        Dimension of each vector (columns).

    Returns
    -------
    np.ndarray
        Array of shape ``(num_vectors, n)`` and dtype ``float32`` whose rows
        have L2 norm 1 up to float32 rounding.
    """
    vectors = np.random.normal(0, 1, (num_vectors, n))

    # Normalize all rows at once in float64; dividing a row whose norm is
    # already exactly 1.0 is a no-op, so no per-row filtering is needed.
    vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)

    # The cast to float32 rounds every component, which nudges the norms away
    # from 1, so normalize once more in the target precision.
    vectors = vectors.astype(np.float32)
    vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)

    return vectors

def L2(query, features):
    """Return the Euclidean (L2) distance from `query` to each row of `features`."""
    # Element-wise difference, broadcast across the rows of `features`
    diff = query - features
    # Sum of squared differences along each row, then the square root
    squared_sum = np.sum(np.square(diff), axis=1)
    return np.sqrt(squared_sum)

def knn(query, k, features, distFunc = None, tie = 0):
    """
    Brute-force k-NN (k-Nearest Neighbors) search with a pluggable distance function.

    Parameters
    ----------
    query : np.ndarray
        The query vector whose nearest neighbors are searched for.
    k : int
        Number of nearest neighbors to return. Values larger than the number
        of features are clamped to it; values <= 0 give an empty result.
    features : np.ndarray
        2D array in which each row is a feature vector.
    distFunc : callable, optional
        Called as ``distFunc(query, features)``; must return one distance per
        feature. If None, the Euclidean distance ``L2`` is used (default is None).
    tie : int, optional
        If truthy, every feature whose distance equals that of the k-th
        nearest neighbor is also included in the result (default is 0).

    Returns
    -------
    list of tuple
        ``(index, distance)`` tuples, where ``index`` is the position of the
        neighbor in ``features``. The list is sorted by increasing distance;
        equal distances are ordered by increasing index.

    Notes
    -----
    The array returned by ``distFunc`` is never modified.
    Distances are assumed to contain no NaN values.
    """
    if distFunc is None:
        distFunc = L2

    # One distance per feature
    distances = np.asarray(distFunc(query, features))
    total = distances.shape[0]

    # Never ask for more neighbors than there are features
    k = min(k, total)
    if k <= 0:
        return []

    # Distance of the k-th nearest neighbor, found by a partial sort
    kth_distance = np.partition(distances, k - 1)[k - 1]

    # Everything at least as close as the k-th neighbor, including features tied with it
    candidates = np.flatnonzero(distances <= kth_distance)

    # `candidates` is in increasing index order, so a stable sort by distance
    # breaks ties by lowest index first
    order = candidates[np.argsort(distances[candidates], kind="stable")]

    if not tie:
        # Drop the extra features tied at the k-th distance
        order = order[:k]

    return [(idx, distances[idx]) for idx in order]

In [3]:
# Generate the database vectors: num_vectors unit vectors of dimension n.
# The global NumPy seed is fixed first so the same vectors come out on every run.
num_vectors = 100000
n = 128
np.random.seed(42)
random_vectors = random_unit_vectors(num_vectors, n)

# Generate the query vectors (80 of them, same dimension), using a different seed
np.random.seed(235)
list_of_queries = random_unit_vectors(80, n)
random_vectors.shape, list_of_queries.shape

((100000, 128), (80, 128))

In [4]:
# Run the brute-force k-NN search for each query and time every call
results = []
k = 32
times = []

for query in list_of_queries:

    # perf_counter is the monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can be coarse and follows system clock changes.
    t0 = time.perf_counter()
    k_nearest = knn(query, k, random_vectors, tie=0)
    t1 = time.perf_counter()

    # Bookkeeping is kept outside the timed region
    results.append(k_nearest)

    dt = (t1-t0)*1000
    print(f"Query {len(results)}: {dt:.3f} ms")
    times.append(dt)

# Median and total search time over all queries, in milliseconds
median_time = np.median(times)
total_time = np.sum(times)
print(f"Total time: {total_time:.2f} ms, Median time: {median_time:.3f} ms")

Query 1: 73.485 ms
Query 2: 86.354 ms
Query 3: 66.092 ms
Query 4: 73.479 ms
Query 5: 69.304 ms
Query 6: 59.918 ms
Query 7: 46.254 ms
Query 8: 68.606 ms
Query 9: 114.649 ms
Query 10: 100.094 ms
Query 11: 62.259 ms
Query 12: 58.308 ms
Query 13: 43.409 ms
Query 14: 70.886 ms
Query 15: 86.135 ms
Query 16: 69.674 ms
Query 17: 78.671 ms
Query 18: 60.405 ms
Query 19: 49.064 ms
Query 20: 41.941 ms
Query 21: 42.276 ms
Query 22: 67.138 ms
Query 23: 32.119 ms
Query 24: 66.491 ms
Query 25: 49.773 ms
Query 26: 54.505 ms
Query 27: 62.199 ms
Query 28: 66.629 ms
Query 29: 58.931 ms
Query 30: 63.101 ms
Query 31: 61.552 ms
Query 32: 75.444 ms
Query 33: 57.482 ms
Query 34: 67.628 ms
Query 35: 48.078 ms
Query 36: 33.332 ms
Query 37: 49.995 ms
Query 38: 61.458 ms
Query 39: 59.184 ms
Query 40: 46.170 ms
Query 41: 49.981 ms
Query 42: 84.069 ms
Query 43: 99.302 ms
Query 44: 66.633 ms
Query 45: 50.040 ms
Query 46: 66.458 ms
Query 47: 71.222 ms
Query 48: 89.655 ms
Query 49: 72.513 ms
Query 50: 50.021 ms
Query 5

In [5]:
# Brute-force neighbours of the first query; a separate copy is kept in
# `lknn` so it can be compared against other results.
k_nearest = knn(list_of_queries[0], k, random_vectors, tie=0)
lknn = k_nearest.copy()
print(lknn)

[(91844, 1.136661), (17564, 1.1378299), (33805, 1.1448735), (18666, 1.145155), (41991, 1.1482445), (63406, 1.1485198), (63799, 1.1503543), (70384, 1.1506016), (77743, 1.1545849), (29698, 1.155954), (3758, 1.1588564), (52268, 1.1609489), (68080, 1.1610289), (23512, 1.1649672), (81694, 1.1650403), (20900, 1.1675389), (86881, 1.1686496), (56749, 1.1710454), (71121, 1.1724294), (41006, 1.174596), (68221, 1.1752834), (35921, 1.1774664), (52723, 1.178008), (53850, 1.1781019), (5091, 1.1788859), (67976, 1.1798925), (37578, 1.1799468), (9015, 1.1804007), (4153, 1.1805314), (78378, 1.1839035), (19633, 1.1843182), (42649, 1.1849439)]


In [8]:
# Hard-coded (index, distance) list to compare against the brute-force result.
# NOTE(review): the source of these values is not shown in this notebook —
# presumably another k-NN implementation; confirm before drawing conclusions.
a = [(91844, 1.1366609334945679), (17564, 1.1378300189971924), (33805, 1.1448736190795898), (18666, 1.1451550722122192), (41991, 1.1482445001602173), (63406, 1.14851975440979), (63799, 1.1503543853759766), (70384, 1.1506015062332153), (77743, 1.1545848846435547), (29698, 1.1559538841247559), (3758, 1.1588562726974487), (52268, 1.1609488725662231), (81694, 1.1650402545928955), (86881, 1.1686495542526245), (71121, 1.1724293231964111), (41006, 1.174595832824707), (68221, 1.1752831935882568), (53850, 1.1781021356582642), (67976, 1.1798925399780273), (37578, 1.1799466609954834), (4153, 1.1805312633514404), (78378, 1.1839035749435425), (19633, 1.1843183040618896), (42649, 1.1849439144134521), (10183, 1.1860792636871338), (58748, 1.1866508722305298), (41600, 1.187021017074585), (38036, 1.1881496906280518), (9415, 1.1882237195968628), (78048, 1.1886341571807861), (32609, 1.1891897916793823), (25344, 1.189258337020874)]

# Report every rank at which the two lists name a different neighbour.
# Pairing with zip (instead of a fixed range(32)) keeps the cell working
# when k, and therefore the length of lknn, changes.
for i, (expected, found) in enumerate(zip(a, lknn)):
    if expected[0] != found[0]:
        print(i, expected[0], "!=", found[0])

12 81694 != 68080
13 86881 != 23512
14 71121 != 81694
15 41006 != 20900
16 68221 != 86881
17 53850 != 56749
18 67976 != 71121
19 37578 != 41006
20 4153 != 68221
21 78378 != 35921
22 19633 != 52723
23 42649 != 53850
24 10183 != 5091
25 58748 != 67976
26 41600 != 37578
27 38036 != 9015
28 9415 != 4153
29 78048 != 78378
30 32609 != 19633
31 25344 != 42649


In [6]:
# Build the KDTree index over the database vectors
tree = KDTree(random_vectors)

In [7]:
# Time a single batched KDTree query covering all query vectors.
# perf_counter is the monotonic, high-resolution clock meant for benchmarking.
t0 = time.perf_counter()
distances, indices = tree.query(list_of_queries, k=k)
t1 = time.perf_counter()

# Elapsed time in milliseconds for the whole batch (not per query)
dt = (t1-t0)*1000

# Show the (index, distance) pairs found for the first query only
print(list(zip(indices[0], distances[0])))
print("Query KDTree: ", dt, "ms")

[(39012, 0.8540882087764496), (72022, 0.8588310375973289), (97966, 0.8620615211822189), (77034, 0.8641481891478793), (5199, 0.8701768721669323), (43414, 0.8740318057844166), (6378, 0.8786597144619676), (71688, 0.8786978419617187), (82336, 0.885273151852604), (60077, 0.8862355273851796), (62201, 0.8914727288571067), (70115, 0.8932470236849629), (91614, 0.8941340346093617), (26890, 0.8951905566626349), (49897, 0.9017866117124517), (49274, 0.9018788419543528), (83391, 0.9044178763832441), (2429, 0.916806952163638), (87362, 0.9168840365332922), (14629, 0.9173161402807113), (29331, 0.9194732746553261), (64151, 0.9217168724170083), (86495, 0.9232730752544106), (31970, 0.9243479558018235), (55268, 0.9264679696568147), (8526, 0.9284950315321479), (70457, 0.9291735715270502), (93785, 0.9310438750451115), (87951, 0.9335948919178855), (21445, 0.936514821525829), (87922, 0.93701967991894), (77227, 0.9387216371642045)]
Query KDTree:  2065.000057220459 ms


In [8]:
# Create an Annoy index for n-dimensional vectors using the 'euclidean' metric
annoy = AnnoyIndex(n, 'euclidean')

# Register every database vector under its row number
for i in range(len(random_vectors)):
    annoy.add_item(i, random_vectors[i])

# Build the index with a single tree
annoy.build(n_trees=1)

# Ask the index for 20 neighbours of the first query
indices = annoy.get_nns_by_vector(list_of_queries[0], 20)
print('end')

: 