In [1]:
import sys
sys.path.append('../modules')
from nsw import Node, NSWGraph
import data_gen as dg

In [None]:
import time
import math
import numpy as np

grid = {
    "size": [10000, 100000],
    "dim": [2, 5, 10, 50, 100],
    "regularity_ratio": [0.5, 1, 5],
    "multisearch": [1, 5, 10, 20],
    "top": [10, 100, 1000]
}

test_size = 100

times_build = {}
times_search = {}
accuracy = {}

for dim in grid["dim"]:
    for size in grid["size"]:
        data = np.random.rand(size, dim)
        test = np.random.rand(test_size, dim)
        print(f"Dataset for {size} samples with {dim} dimensions generated")
        dist = data @ test.T
        
        # for each test sample
        closest = []
        for i in range(test.shape[0]):
            cl = np.argpartition(dist[:, i], max(grid["top"])).tolist()
            closest.append(cl)
        print("Ground truth is generated")
        
        for multisearch in grid["multisearch"]:
            for regularity_ratio in grid["regularity_ratio"]:
                regularity = math.ceil(math.log(size) * regularity_ratio)
                # TODO: i
                data_with_classes = list((row, 0) for row in data)
                tpl = (dim, size, regularity, multisearch) 
                start = time.time()
                G = NSWGraph()
                G.build_navigable_graph(data_with_classes,  K=regularity, attempts=multisearch)
                fin = time.time()
                t = fin - start
                print(f"Graph [K={regularity}, Repeat={multisearch}] is generated in {t:.2f} sec.")
                times_build[tpl] = t
                
                for top in grid["top"]:
                    tpl = (dim, size, regularity, multisearch, top)                 
                    match, total = 0, 0
                    for i, row in enumerate(test):
                        start += time.time()
                        result = G.multi_search(row, attempts=multisearch, top=top)
                        fin += time.time()
                        result = set(result)
                        match += len(result.intersection(closest[i][:top]))
                        total += top
                    accuracy[tpl] = (match, total)
                    print(f'top {top} ~ {100 * accuracy[tpl][0] / accuracy[tpl][1]:.2f}%')
                    times_search[tpl] = (fin - start) / test.shape[0] 

Dataset for 10000 samples with 2 dimensions generated
Ground truth is generated
Graph [K=5, Repeat=1] is generated in 17.50 sec.
top 10 ~ 0.00%
top 100 ~ 0.21%
top 1000 ~ 0.20%
Graph [K=10, Repeat=1] is generated in 43.75 sec.
top 10 ~ 0.00%
top 100 ~ 0.36%
top 1000 ~ 0.58%
Graph [K=47, Repeat=1] is generated in 179.04 sec.
top 10 ~ 0.20%
top 100 ~ 0.31%
top 1000 ~ 3.38%
Graph [K=5, Repeat=5] is generated in 213.43 sec.
top 10 ~ 0.20%
top 100 ~ 0.21%
top 1000 ~ 2.21%
