In [3]:
import numpy as np

def load_ground_truth(file_path):
    """Load neighbour labels and distances from a binary ground-truth file.

    The layout read here is: two int32 header values (``num_rows`` then
    ``num_cols``), followed by ``num_rows * num_cols`` int32 labels, followed
    by ``num_rows * num_cols`` float32 distances.

    Args:
        file_path: Path of the binary ground-truth file.

    Returns:
        A ``(labels, distances)`` tuple of read-only arrays, each of shape
        ``(num_rows, num_cols)``; ``labels`` is int32, ``distances`` float32.

    Raises:
        ValueError: If the header is incomplete or negative, or if the file
            holds fewer payload bytes than the header announces.
    """
    with open(file_path, 'rb') as f:
        header = f.read(8)
        if len(header) != 8:
            raise ValueError(f"{file_path}: incomplete 8-byte header")

        # Convert to Python ints right away: arithmetic on the raw np.int32
        # scalars can wrap around for large files, and a negative byte count
        # would make f.read() consume the rest of the file.
        num_rows, num_cols = (int(v) for v in np.frombuffer(header, dtype=np.int32))
        if num_rows < 0 or num_cols < 0:
            raise ValueError(
                f"{file_path}: invalid header (num_rows={num_rows}, num_cols={num_cols})"
            )

        num_bytes = num_rows * num_cols * 4
        label_bytes = f.read(num_bytes)
        distance_bytes = f.read(num_bytes)

    # A short read means the file is truncated; fail loudly instead of letting
    # a reshape with an inferred dimension return wrongly shaped arrays.
    if len(label_bytes) != num_bytes or len(distance_bytes) != num_bytes:
        raise ValueError(
            f"{file_path}: expected 2 x {num_bytes} payload bytes, "
            f"got {len(label_bytes)} + {len(distance_bytes)}"
        )

    labels = np.frombuffer(label_bytes, dtype=np.int32).reshape((num_rows, num_cols))
    distances = np.frombuffer(distance_bytes, dtype=np.single).reshape((num_rows, num_cols))
    return labels, distances

def load_data(file_path: str, max_elements: int):
    """Load up to ``max_elements`` rows of vectors from a binary file.

    The layout read here is: two int32 header values (row count, then column
    count) followed by the row-major payload. The element type is chosen from
    the file name: names ending in ``fbin`` are read as float32, names ending
    in ``u8bin`` as uint8.

    Args:
        file_path: Path of the ``*fbin`` or ``*u8bin`` file.
        max_elements: Upper bound on the number of rows to read; the smaller
            of this value and the row count in the header is used.

    Returns:
        A read-only array of shape ``(num_rows, num_cols)``.

    Raises:
        ValueError: If the extension is not supported, the header is
            incomplete or negative, or the payload is shorter than expected.
    """
    # Resolve the element type first so an unsupported file is rejected
    # explicitly instead of silently yielding None.
    if file_path.endswith("fbin"):
        dtype = np.dtype(np.single)
    elif file_path.endswith("u8bin"):
        dtype = np.dtype(np.uint8)
    else:
        raise ValueError(
            f"unsupported file type for {file_path!r}: expected a name ending in 'fbin' or 'u8bin'"
        )

    with open(file_path, 'rb') as f:
        header = f.read(8)
        if len(header) != 8:
            raise ValueError(f"{file_path}: incomplete 8-byte header")

        # Python ints keep the byte-count arithmetic below exact; np.int32
        # scalars can wrap around once rows * cols exceeds the int32 range.
        total_rows, num_cols = (int(v) for v in np.frombuffer(header, dtype=np.int32))
        num_rows = int(min(max_elements, total_rows))
        if num_rows < 0 or num_cols < 0:
            raise ValueError(
                f"{file_path}: invalid dimensions (num_rows={num_rows}, num_cols={num_cols})"
            )
        print(f"loading {num_rows=} from {file_path}")

        num_bytes = num_rows * num_cols * dtype.itemsize
        payload = f.read(num_bytes)

    if len(payload) != num_bytes:
        raise ValueError(
            f"{file_path}: expected {num_bytes} payload bytes, got {len(payload)}"
        )
    return np.frombuffer(payload, dtype=dtype).reshape((num_rows, num_cols))
            
# Dataset locations and the row cap shared by every load_data call below.
data_dir = "/data/juelin/project/melee/data/datasets"
# Kept as an int to match load_data's `max_elements: int` parameter.
data_size = 100_000_000
# NOTE(review): the ground-truth path names a 10M set while data_size caps the
# base vectors at 100M rows — confirm that these two actually correspond.
gt_path = f"{data_dir}/gt/GT_10M/bigann-10M"
feat_path = f"{data_dir}/bigann/base.1B.u8bin"
sample_path = f"{data_dir}/bigann/learn.100M.u8bin"
query_path = f"{data_dir}/bigann/query.10k.u8bin"
# index_path = f"{data_dir}/../graphs/deep_10M_M16_ef500.index"

# Each array is capped at data_size rows; the ground truth is read in full.
query = load_data(query_path, data_size)
feat = load_data(feat_path, data_size)
sample = load_data(sample_path, data_size)
label, distance = load_ground_truth(gt_path)

loading num_rows=10000 from /data/juelin/project/melee/data/datasets/bigann/query.10k.u8bin
loading num_rows=100000000 from /data/juelin/project/melee/data/datasets/bigann/base.1B.u8bin
loading num_rows=100000000 from /data/juelin/project/melee/data/datasets/bigann/learn.100M.u8bin


In [4]:
# Preview the base vectors returned by load_data(feat_path, data_size).
feat

array([[ 0,  0,  0, ..., 14, 10,  6],
       [65, 35,  8, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  1,  0,  0],
       ...,
       [20, 96, 68, ..., 14, 15,  3],
       [35,  2,  0, ...,  2,  8,  5],
       [78, 59,  7, ..., 39,  0,  0]], dtype=uint8)

In [6]:
# Preview the sample vectors returned by load_data(sample_path, data_size).
sample

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 2,  0,  0, ..., 14, 11, 19],
       [34,  2,  0, ..., 20, 41,  3],
       ...,
       [29, 17, 45, ...,  5,  2,  2],
       [ 8, 34, 44, ...,  3,  0,  0],
       [ 6, 23, 42, ..., 20, 14,  1]], dtype=uint8)

In [None]:
import hnswlib

# Build an L2 HNSW index over every loaded base vector, labelled 0..N-1.
num_elements = feat.shape[0]
ids = np.arange(num_elements)

# The index dimensionality is taken from the queries, so the base vectors must
# agree with it; fail early with a clear message if they do not.
assert feat.shape[1] == query.shape[1], "feat and query dimensionality differ"

p = hnswlib.Index(space='l2', dim=query.shape[1])
p.init_index(max_elements=num_elements, ef_construction=200, M=16)

# add_items converts its input to float32, so passing the whole uint8 matrix
# in one call materialises a temporary copy four times its size. Inserting in
# bounded slices keeps that temporary small.
batch_size = 1_000_000
for start in range(0, num_elements, batch_size):
    stop = min(start + batch_size, num_elements)
    p.add_items(feat[start:stop], ids[start:stop])

In [None]:
# Query the index for the 100 nearest neighbours of every query vector.
pred_labels, pred_distances = p.knn_query(query, k=100)

In [None]:
# Show the predicted neighbour ids (the stray trailing comma wrapped them in a 1-tuple).
pred_labels