In [1]:
import sys
import os.path

sys.path.append('../modules')
from nsw import Node, NSWGraph
from nsw_visualization import show_state
import data_gen as dg

In [7]:
import time
import math
import numpy as np

grid = {
    "size": [2000, 10000, 100000],
    "dim": [2, 5, 10, 50, 100],
    "multisearch": [2, 5, 10, 20, 50],
    "top": [10, 100, 1000]
}

test_size = 100
approx_constant = 10

times_build = {}
times_search = {}
accuracy = {}

In [None]:
SampleG = NSWGraph()

times_file = open("../dumps/times.txt", "w+")
accuracy_file = open("../dumps/accuracy.txt", "w+")

for dim in grid["dim"]:
    for size in grid["size"]:
        data = np.random.rand(size, dim)
        test = np.random.rand(test_size, dim)
        
        print(f"Dataset for {size} samples with {dim} dimensions generated")
        data_with_classes = list((row, 0) for row in data)
        print("Data with labels is created")
        
        # for each test sample
        closest = []
        for t in test:
            dist = []
            for d in data:
                dist.append(SampleG.dist(t, d))
            cl = np.argpartition(dist, max(grid["top"])).tolist()
            closest.append(cl)
        print("Ground truth is generated")
        
        for multisearch in grid["multisearch"]:

            tpl = (dim, size, multisearch) 

            filename = f"../dumps/{dim}D_{size}items_{multisearch}repeat.graph"
            if os.path.exists(filename):
                G = NSWGraph.load(filename)
                print(f"Graph [Size={size}, Dim={dim}, Repeat={multisearch}] is loaded from file.")
            else:
                start = time.time()
                G = NSWGraph()
                G.build_navigable_graph(data_with_classes, attempts=multisearch)
                fin = time.time()
                t = fin - start
                G.save(filename)
                print(f"Graph [Size={size}, Dim={dim}, Repeat={multisearch}] is generated in {t:.2f} sec.")
                times_build[tpl] = t
                times_file.write(f"{dim},{size},{multisearch},{t}\n")
                times_file.flush()

            for top in grid["top"]:
                tpl = (dim, size, multisearch, top)                 
                match, match_scaled, total, total_scaled = 0, 0, 0, 0
                for i, row in enumerate(test):
                    start += time.time()
                    result = G.multi_search(row, attempts=multisearch, top=top)
                    fin += time.time()

                    result = set(result)
                    intersect = len(result.intersection(closest[i][:top]))
                    intersect_scaled = len(result.intersection(closest[i][:top // approx_constant]))
                    match += intersect
                    match_scaled += intersect_scaled
                    total += top
                    total_scaled += top // approx_constant
                accuracy[tpl] = (match, total)
                accuracy_file.write(f"{dim},{size},{multisearch},{top},{match / total},{match_scaled / total_scaled}\n")
                accuracy_file.flush()
                
                print(f'top {top} ~ {100 * match / total:.2f}% ; scaled[{approx_constant}] ~ {100 * match_scaled / total_scaled:.2f}%')
                times_search[tpl] = (fin - start) / test.shape[0] 

Dataset for 2000 samples with 2 dimensions generated
Data with labels is created
Ground truth is generated
Graph [Size=2000, Dim=2, Repeat=2] is loaded from file.
top 10 ~ 0.80% ; scaled[10] ~ 2.00%
top 100 ~ 5.74% ; scaled[10] ~ 5.40%
top 1000 ~ 17.31% ; scaled[10] ~ 24.52%
Graph [Size=2000, Dim=2, Repeat=5] is loaded from file.
top 10 ~ 0.80% ; scaled[10] ~ 2.00%
top 100 ~ 5.72% ; scaled[10] ~ 5.40%
top 1000 ~ 33.06% ; scaled[10] ~ 40.48%
Graph [Size=2000, Dim=2, Repeat=10] is loaded from file.
top 10 ~ 0.80% ; scaled[10] ~ 2.00%
top 100 ~ 5.72% ; scaled[10] ~ 5.40%
top 1000 ~ 50.17% ; scaled[10] ~ 52.41%
Graph [Size=2000, Dim=2, Repeat=20] is loaded from file.
top 10 ~ 0.80% ; scaled[10] ~ 2.00%
top 100 ~ 5.66% ; scaled[10] ~ 5.40%
top 1000 ~ 49.94% ; scaled[10] ~ 52.14%
Data dimensionality detected is 2. regularity = 6
Graph [Size=2000, Dim=2, Repeat=50] is generated in 15.79 sec.
top 10 ~ 1.90% ; scaled[10] ~ 0.00%
top 100 ~ 17.71% ; scaled[10] ~ 20.30%
top 1000 ~ 97.55% ; scaled[