In [1]:
QUERY_SEED_NUMBER = 10

In [2]:
import numpy as np
DB_SEED_NUMBER = 42
ELEMENT_SIZE = np.dtype(np.float32).itemsize
DIMENSION = 64

In [21]:
import json
import numpy as np
import os
import shutil
import tqdm
import heapq
from sklearn.cluster import KMeans, MiniBatchKMeans
from typing import Annotated
import time

DB_SEED_NUMBER = 42
ELEMENT_SIZE = np.dtype(np.float32).itemsize
DIMENSION = 64

class VecDB:
    def __init__(self, database_file_path = "saved_db.dat", index_file_path = "index.dat", new_db = True, db_size = None) -> None:
        self.db_path = database_file_path
        self.index_path = index_file_path
        if new_db:
            if db_size is None:
                raise ValueError("You need to provide the size of the database")
            # delete the old DB file if exists
            if os.path.exists(self.db_path):
                os.remove(self.db_path)
            self.generate_database(db_size)
    
    def generate_database(self, size: int) -> None:
        # rng = np.random.default_rng(DB_SEED_NUMBER)
        vectors = np.memmap("new_embeddings.dat", dtype=np.float32, mode='r', shape=(size, DIMENSION))
        self._write_vectors_to_file(vectors)
        self._build_index()

    def _write_vectors_to_file(self, vectors: np.ndarray) -> None:
        mmap_vectors = np.memmap(self.db_path, dtype=np.float32, mode='w+', shape=vectors.shape)
        mmap_vectors[:] = vectors[:]
        mmap_vectors.flush()

    def _get_num_records(self) -> int:
        return os.path.getsize(self.db_path) // (DIMENSION * ELEMENT_SIZE)

    def insert_records(self, rows: Annotated[np.ndarray, (int, 64)]):
        num_old_records = self._get_num_records()
        num_new_records = len(rows)
        full_shape = (num_old_records + num_new_records, DIMENSION)
        mmap_vectors = np.memmap(self.db_path, dtype=np.float32, mode='r+', shape=full_shape)
        mmap_vectors[num_old_records:] = rows
        mmap_vectors.flush()
        #TODO: might change to call insert in the index, if you need
        self._build_index()

    def get_one_row(self, row_num: int) -> np.ndarray:
        # This function is only load one row in memory
        try:
            offset = row_num * DIMENSION * ELEMENT_SIZE
            mmap_vector = np.memmap(self.db_path, dtype=np.float32, mode='r', shape=(1, DIMENSION), offset=offset)
            return np.array(mmap_vector[0])
        except Exception as e:
            return f"An error occurred: {e}"

    def get_all_rows(self) -> np.ndarray:
        # Take care this load all the data in memory
        num_records = self._get_num_records()
        vectors = np.memmap(self.db_path, dtype=np.float32, mode='r', shape=(num_records, DIMENSION))
        return np.array(vectors)
    
    def get_all_ids_rows_optimized(self, ids):
        ids = np.array(ids)
        num_records = self._get_num_records()

        sorted_idx = np.argsort(ids)
        sorted_ids = ids[sorted_idx]

        base = sorted_ids[0]
        row_size_bytes = DIMENSION * np.dtype(np.float32).itemsize
        offset = base * row_size_bytes

        # memmap starting from the base
        vectors = np.memmap(
            self.db_path, dtype=np.float32, mode='r',
            offset=offset,
            shape=(num_records - base, DIMENSION)
        )

        local_ids = sorted_ids - base
        
        result = np.empty((len(ids), DIMENSION), dtype=np.float32)
        result[sorted_idx] = vectors[local_ids]

        del vectors
        return result
    
    
    
    def _cal_score(self, vec1, vec2):
        dot_product = np.dot(vec1, vec2)
        norm_vec1 = np.linalg.norm(vec1)
        norm_vec2 = np.linalg.norm(vec2)
        cosine_similarity = dot_product / (norm_vec1 * norm_vec2)
        return cosine_similarity

####################################################################################
####################################################################################
####################################################################################


    


    def retrieve(self, query, top_k=5, n_probe_level2=5, n_probe_level1=6, chunk_size=50):
        self.no_centroids = 7000
        self.no_level2_centroids = 80
        self.index_path = f"index_10M_{self.no_level2_centroids}_{self.no_centroids}_centroids"

        query = np.asarray(query, dtype=np.float32).squeeze()
        query_norm = np.linalg.norm(query)
        if query_norm == 0:
            query_norm = 1.0
        normalized_query = query / query_norm

        centroids_level2_path = os.path.join(self.index_path, "centroids_level2.npy")
        centroids_level1_path = os.path.join(self.index_path, "centroids.npy")
        if not os.path.exists(centroids_level2_path) or not os.path.exists(centroids_level1_path):
            return []

        # Load headers
        level2_header_arr = np.fromfile(
            os.path.join(self.index_path, "level2_header.bin"), dtype=np.uint32
        ).reshape(-1, 2)
        index_header_arr = np.fromfile(
            os.path.join(self.index_path, "index_header.bin"), dtype=np.uint32
        ).reshape(-1, 2)
        flat_index_path = os.path.join(self.index_path, "all_indices.bin")

        # Load level-2 centroids
        centroids_level2 = np.load(centroids_level2_path, mmap_mode="r")
        sims_level2 = centroids_level2.dot(normalized_query)
        # pick top n_probe_level2 level2 centroids
        nearest_level2 = np.argpartition(sims_level2, -n_probe_level2)[-n_probe_level2:]
        del sims_level2, centroids_level2

        centroids_level1 = np.load(centroids_level1_path, mmap_mode="r")
        top_heap = []

        # Iterate over selected top-level clusters
        for lvl2_idx in nearest_level2:
            offset_lvl2, length_lvl2 = level2_header_arr[lvl2_idx]
            if length_lvl2 == 0:
                continue

            # slice first-level centroids for this level2 cluster
            level1_start = offset_lvl2
            level1_end = offset_lvl2 + length_lvl2
            sims_level1 = centroids_level1[level1_start:level1_end].dot(normalized_query)

            # pick top n_probe_level1 first-level centroids
            nearest_first_level = np.argpartition(sims_level1, -n_probe_level1)[-n_probe_level1:]
            del sims_level1

            for idx in nearest_first_level:
                # map to global first-level index
                c = level1_start + idx
                offset, length = index_header_arr[c]
                if length == 0:
                    continue

                # process vectors in chunks
                for start in range(0, length, chunk_size):
                    cur_len = min(chunk_size, length - start)
                    ids_mm = np.memmap(
                        flat_index_path,
                        dtype=np.uint32,
                        mode="r",
                        offset=offset + start * np.dtype(np.uint32).itemsize,
                        shape=(cur_len,)
                    )
                    chunk_ids = ids_mm[:]
                    del ids_mm

                    vecs = self.get_all_ids_rows_optimized(chunk_ids)
                    scores = vecs.dot(normalized_query)

                    for score, id in zip(scores, chunk_ids):
                        if len(top_heap) < top_k:
                            heapq.heappush(top_heap, (score, id))
                        else:
                            heapq.heappushpop(top_heap, (score, id))

                    del scores, chunk_ids, vecs

        # extract top-k sorted
        results = [idx for score, idx in heapq.nlargest(top_k, top_heap)]
        del top_heap
        return results



    

    def _build_index(self):
        self.no_centroids = 7000
        self.no_level2_centroids = 80
        self.index_path = f"index_10M_{self.no_level2_centroids}_{self.no_centroids}_centroids"

        data = self.get_all_rows()

        # 1-level clustering
        kmeans = MiniBatchKMeans(
            n_clusters=self.no_centroids,
            init="k-means++",
            batch_size=10_000,
            random_state=42
        )
        kmeans.fit(data)
        labels = kmeans.labels_
        centers = kmeans.cluster_centers_.astype(np.float32)
        del data, kmeans

        if os.path.exists(self.index_path):
            shutil.rmtree(self.index_path)
        os.makedirs(self.index_path, exist_ok=True)

        cluster_infos = [(cid, np.where(labels == cid)[0].astype(np.uint32))
                        for cid in range(self.no_centroids)]

        # 2-level clustering
        kmeans2 = MiniBatchKMeans(
            n_clusters=self.no_level2_centroids,
            init="k-means++",
            batch_size=1000,
            random_state=42,
        )
        kmeans2.fit(centers)
        centers2 = kmeans2.cluster_centers_.astype(np.float32)
        labels2 = kmeans2.labels_
        cluster_level2_infos = [(cid, np.where(labels2 == cid)[0].astype(np.uint32))
                            for cid in range(self.no_level2_centroids)]

        reordered_centers = []
        reordered_cluster_infos = []
        for _, inds in cluster_level2_infos:
            for ind in inds:
                reordered_centers.append(centers[ind])
                reordered_cluster_infos.append(cluster_infos[ind])
        centers = np.array(reordered_centers, dtype=np.float32)
        cluster_infos = reordered_cluster_infos
        del labels, labels2
        del reordered_centers, reordered_cluster_infos

        header = []
        flat_path = os.path.join(self.index_path, "all_indices.bin")
        with open(flat_path, "wb") as f:
            offset = 0
            for _, inds in cluster_infos:
                length = inds.size
                f.write(inds.tobytes())
                header.append([offset, length])
                offset += length * inds.dtype.itemsize
        header_matrix = np.array(header, dtype=np.uint32)
        header_matrix.tofile(os.path.join(self.index_path, "index_header.bin"))

        # save level2 header (offset, length) for easy slicing later
        level2_header = []
        offset = 0
        for _, inds in cluster_level2_infos:
            length = len(inds)  
            level2_header.append([offset, length])
            offset += length
        np.array(level2_header, dtype=np.uint32).tofile(os.path.join(self.index_path, "level2_header.bin"))

        # normalize centers
        centers /= (np.linalg.norm(centers, axis=1, keepdims=True) + 1e-12)
        np.save(os.path.join(self.index_path, "centroids.npy"), centers)

        centers2 /= (np.linalg.norm(centers2, axis=1, keepdims=True) + 1e-12)
        np.save(os.path.join(self.index_path, "centroids_level2.npy"), centers2)


In [22]:
import numpy as np
import os
# from vec_db import VecDB
import time
from dataclasses import dataclass
from typing import List
from memory_profiler import memory_usage
import gc

@dataclass
class Result:
    run_time: float
    top_k: int
    db_ids: List[int]
    actual_ids: List[int]

def run_queries(db, queries, top_k, actual_ids, num_runs):
    """
    Run queries on the database and record results for each query.

    Parameters:
    - db: Database instance to run queries on.
    - queries: List of query vectors.
    - top_k: Number of top results to retrieve.
    - actual_ids: List of actual results to evaluate accuracy.
    - num_runs: Number of query executions to perform for testing.

    Returns:
    - List of Result
    """
    global results
    results = []
    for i in range(num_runs):
        tic = time.time()
        db_ids = db.retrieve(queries[i], top_k)
        toc = time.time()
        run_time = toc - tic
        results.append(Result(run_time, top_k, db_ids, actual_ids[i]))
    return results

def memory_usage_run_queries(args):
    """
    Run queries and measure memory usage during the execution.

    Parameters:
    - args: Arguments to be passed to the run_queries function.

    Returns:
    - results: The results of the run_queries.
    - memory_diff: The difference in memory usage before and after running the queries.
    """
    global results
    mem_before = max(memory_usage())
    mem = memory_usage(proc=(run_queries, args, {}), interval = 1e-3)
    return results, max(mem) - mem_before

def evaluate_result(results: List[Result]):
    """
    Evaluate the results based on accuracy and runtime.
    Scores are negative. So getting 0 is the best score.

    Parameters:
    - results: A list of Result objects

    Returns:
    - avg_score: The average score across all queries.
    - avg_runtime: The average runtime for all queries.
    """
    scores = []
    run_time = []
    for res in results:
        run_time.append(res.run_time)
        # case for retireving number not equal to top_k, socre will be the lowest
        if len(set(res.db_ids)) != res.top_k or len(res.db_ids) != res.top_k:
            scores.append( -1 * len(res.actual_ids) * res.top_k)
            continue
        score = 0
        for id in res.db_ids:
            try:
                ind = res.actual_ids.index(id)
                if ind > res.top_k * 3:
                    score -= ind
            except:
                score -= len(res.actual_ids)
        scores.append(score)

    return sum(scores) / len(scores), sum(run_time) / len(run_time)

def get_actual_ids_first_k(actual_sorted_ids, k):
    """
    Retrieve the IDs from the sorted list of actual IDs.
    actual IDs has the top_k for the 20 M database but for other databases we have to remove the numbers higher than the max size of the DB.

    Parameters:
    - actual_sorted_ids: A list of lists containing the sorted actual IDs for each query.
    - k: The DB size.

    Returns:
    - List of lists containing the actual IDs for each query for this DB.
    """
    return [[id for id in actual_sorted_ids_one_q if id < k] for actual_sorted_ids_one_q in actual_sorted_ids]

This code to generate all the files for databases.

In [23]:
def _write_vectors_to_file(vectors: np.ndarray, db_path) -> None:
    mmap_vectors = np.memmap(db_path, dtype=np.float32, mode='w+', shape=vectors.shape)
    mmap_vectors[:] = vectors[:]
    mmap_vectors.flush()

# def generate_database(size: int) -> None:
#     rng = np.random.default_rng(DB_SEED_NUMBER)
#     vectors = rng.random((size, DIMENSION), dtype=np.float32)
#     return vectors
def generated_database():
    # Load memmap
    mmap_vectors = np.memmap("new_embeddings.dat", dtype=np.float32, mode='r', shape=(20_000_000, 64))
    return mmap_vectors


vectors = generated_database()

db_filename_size_20M = 'saved_db_20M.dat'
if not os.path.exists(db_filename_size_20M): _write_vectors_to_file(vectors, db_filename_size_20M)
db_filename_size_15M = 'saved_db_15M.dat'
if not os.path.exists(db_filename_size_15M): _write_vectors_to_file(vectors[:15*10**6], db_filename_size_15M)
db_filename_size_10M = 'saved_db_10M.dat'
if not os.path.exists(db_filename_size_10M): _write_vectors_to_file(vectors[:10*10**6], db_filename_size_10M)
db_filename_size_1M = 'saved_db_1M.dat'
if not os.path.exists(db_filename_size_1M): _write_vectors_to_file(vectors[:1*10**6], db_filename_size_1M)


In [24]:
needed_top_k = 10000
rng = np.random.default_rng(QUERY_SEED_NUMBER)
query1 = rng.random((1, 64), dtype=np.float32)
query2 = rng.random((1, 64), dtype=np.float32)
query3 = rng.random((1, 64), dtype=np.float32)
query_dummy = rng.random((1, 64), dtype=np.float32)

actual_sorted_ids_20m_q1 = np.argsort(vectors.dot(query1.T).T / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query1)), axis= 1).squeeze().tolist()[::-1][:needed_top_k]
gc.collect()
actual_sorted_ids_20m_q2 = np.argsort(vectors.dot(query2.T).T / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query2)), axis= 1).squeeze().tolist()[::-1][:needed_top_k]
gc.collect()
actual_sorted_ids_20m_q3 = np.argsort(vectors.dot(query3.T).T / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query3)), axis= 1).squeeze().tolist()[::-1][:needed_top_k]
gc.collect()

queries = [query1, query2, query3]
actual_sorted_ids_20m = [actual_sorted_ids_20m_q1, actual_sorted_ids_20m_q2, actual_sorted_ids_20m_q3]

  actual_sorted_ids_20m_q1 = np.argsort(vectors.dot(query1.T).T / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query1)), axis= 1).squeeze().tolist()[::-1][:needed_top_k]
  actual_sorted_ids_20m_q2 = np.argsort(vectors.dot(query2.T).T / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query2)), axis= 1).squeeze().tolist()[::-1][:needed_top_k]
  actual_sorted_ids_20m_q3 = np.argsort(vectors.dot(query3.T).T / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query3)), axis= 1).squeeze().tolist()[::-1][:needed_top_k]


In [25]:
# No more need to the actual vectors so delete it
del vectors
gc.collect()

0

In [26]:
results = []
to_print_arr = []

In [27]:
print("Team Number", 3)
database_info = {
    # "1M": {
    #     "database_file_path": db_filename_size_1M,
    #     "index_file_path": "index.dat",
    #     "size": 10**6
    # },
    "10M": {
        "database_file_path": db_filename_size_10M,
        "index_file_path": "index_10M_80_7000_centroids",
        "size": 10 * 10**6
    },
    # "15M": {
    #     "database_file_path": db_filename_size_15M,
    #     "index_file_path": PATH_DB_15M,
    #     "size": 15 * 10**6
    # },
    # "20M": {
    #     "database_file_path": db_filename_size_20M,
    #     "index_file_path": "index.dat",
    #     "size": 20 * 10**6
    # }
}


Team Number 3


In [28]:

# for db_name, info in database_info.items():
#     db = VecDB(database_file_path = info["database_file_path"], index_file_path = info["index_file_path"], new_db = False)
#     db._build_index()

In [29]:

for db_name, info in database_info.items():
    db = VecDB(database_file_path = info["database_file_path"], index_file_path = info["index_file_path"], new_db = False)
    actual_ids = get_actual_ids_first_k(actual_sorted_ids_20m, info["size"])
    # Make a dummy run query to make everything fresh and loaded (wrap up)
    res = run_queries(db, query_dummy, 5, actual_ids, 1)
    # actual runs to evaluate
    res, mem = memory_usage_run_queries((db, queries, 5, actual_ids, 3))
    eval = evaluate_result(res)
    to_print = f"{db_name}\tscore\t{eval[0]}\ttime\t{eval[1]:.2f}\tRAM\t{mem:.2f} MB"
    print(to_print)
    to_print_arr.append(to_print)
    del db
    del actual_ids
    del res
    del mem
    del eval
    gc.collect()

10M	score	0.0	time	0.28	RAM	0.31 MB


In [1]:
import numpy as np

In [26]:

def get_one_row(row_num: int) -> np.ndarray:
    # This function is only load one row in memory
    try:
        offset = row_num * 64 * np.dtype(np.float32).itemsize
        mmap_vector = np.memmap("saved_db_1M.dat", dtype=np.float32, mode='r', shape=(1, 64), offset=offset)
        return np.array(mmap_vector[0])
    except Exception as e:
        return f"An error occurred: {e}"

In [27]:
def get_all_ids_rows( ids) -> np.ndarray:
    """
    Load only the requested rows from the memmap, without loading all data in RAM.
    Updated: Instead of loading all vectors into memory, we load only the requested batch.
    """
    num_records = 1_000_000
    vectors = np.memmap("saved_db_1M.dat", dtype=np.float32, mode='r', shape=(num_records, 64))
    
    # Sort IDs to access memmap sequentially (faster disk read)
    sorted_idx = np.argsort(ids)
    sorted_ids = np.array(ids)[sorted_idx]
    
    # Load only selected rows
    result = np.empty((len(ids), 64), dtype=np.float32)
    result[sorted_idx] = vectors[sorted_ids]
    
    del vectors
    return result

In [28]:
def get_all_ids_rows_optimized(ids):
    ids = np.array(ids)
    num_records = 1_000_000

    sorted_idx = np.argsort(ids)
    sorted_ids = ids[sorted_idx]

    base = sorted_ids[0]
    row_size_bytes = 64 * np.dtype(np.float32).itemsize
    offset = base * row_size_bytes

    # memmap starting from the base
    vectors = np.memmap(
        "saved_db_1M.dat", dtype=np.float32, mode='r',
        offset=offset,
        shape=(num_records - base, 64)
    )

    local_ids = sorted_ids - base
    
    result = np.empty((len(ids), 64), dtype=np.float32)
    result[sorted_idx] = vectors[local_ids]

    del vectors
    return result

In [30]:
import time

In [83]:
np.random.seed(42)

high_base = 600_000
spread = 50000  # random range width
ids = high_base + np.random.choice(spread, size=50, replace=False)
ids

array([633553, 609427, 600199, 612447, 639489, 642724, 610822, 649498,
       604144, 636958, 643106, 638695, 606188, 601414, 618471, 629282,
       615177, 634304, 612609, 612144, 606113, 615908, 600821, 615118,
       613466, 626497, 642111, 630188, 637237, 633109, 636480, 624148,
       605503, 603918, 638478, 621123, 649717, 642294, 647609, 611076,
       641514, 622062, 609413, 638340, 630263, 641252, 614644, 626335,
       628102, 617523], dtype=int32)

In [84]:
time1 = time.time()
vectors1 = get_all_ids_rows(ids)
time2 = time.time()
# print(vectors)
print("Time taken:", time2 - time1)

Time taken: 0.01510763168334961


In [85]:
time1 = time.time()
vectors2 = get_all_ids_rows_optimized(ids)
time2 = time.time()
# print(vectors)
print("Time taken:", time2 - time1)

Time taken: 0.0012903213500976562


In [82]:
if np.array_equal(np.array(vectors1[1]), np.array(vectors2[1])):
    print("nice")

nice


In [23]:
print(vectors[0])

[-6.28111288e-02 -1.47320539e-01  1.90533586e-02  4.43077274e-02
  1.05074465e-01  2.83172131e-01 -1.63409859e-01  1.95712700e-01
  1.07049912e-01 -1.23330027e-01 -3.00835390e-02  2.45135650e-01
 -5.26915416e-02 -5.52256368e-02  9.29732770e-02 -6.97657317e-02
 -6.27087284e-05 -1.32255822e-01 -2.69026589e-02  7.89163932e-02
 -1.89308122e-01  6.02391511e-02  3.74443419e-02 -1.43766150e-01
 -2.76008368e-01 -6.13625012e-02  5.22740744e-02  1.01051793e-01
 -5.17447963e-02  3.67304794e-02 -2.33252682e-02  1.31471306e-01
 -1.30366221e-01  6.15089536e-02 -1.63679924e-02 -1.46132097e-01
 -8.24083313e-02  1.16798908e-01 -1.00988574e-01  1.16019160e-01
 -6.51810691e-02  4.59860981e-04  1.06093474e-01  1.89122364e-01
 -2.01225087e-01  8.32308456e-02 -4.10196595e-02 -1.48688197e-01
  2.44087949e-02 -2.67753322e-02  9.42895487e-02 -1.49306238e-01
 -1.71684042e-01 -3.24866235e-01  3.02920695e-02 -1.64438151e-02
 -1.86713353e-01 -1.15885682e-01  1.82194505e-02  9.29413810e-02
  9.96821746e-02  1.78462

In [24]:
vector_tmp=get_one_row(ids[0])
print(vector_tmp)

[-6.28111288e-02 -1.47320539e-01  1.90533586e-02  4.43077274e-02
  1.05074465e-01  2.83172131e-01 -1.63409859e-01  1.95712700e-01
  1.07049912e-01 -1.23330027e-01 -3.00835390e-02  2.45135650e-01
 -5.26915416e-02 -5.52256368e-02  9.29732770e-02 -6.97657317e-02
 -6.27087284e-05 -1.32255822e-01 -2.69026589e-02  7.89163932e-02
 -1.89308122e-01  6.02391511e-02  3.74443419e-02 -1.43766150e-01
 -2.76008368e-01 -6.13625012e-02  5.22740744e-02  1.01051793e-01
 -5.17447963e-02  3.67304794e-02 -2.33252682e-02  1.31471306e-01
 -1.30366221e-01  6.15089536e-02 -1.63679924e-02 -1.46132097e-01
 -8.24083313e-02  1.16798908e-01 -1.00988574e-01  1.16019160e-01
 -6.51810691e-02  4.59860981e-04  1.06093474e-01  1.89122364e-01
 -2.01225087e-01  8.32308456e-02 -4.10196595e-02 -1.48688197e-01
  2.44087949e-02 -2.67753322e-02  9.42895487e-02 -1.49306238e-01
 -1.71684042e-01 -3.24866235e-01  3.02920695e-02 -1.64438151e-02
 -1.86713353e-01 -1.15885682e-01  1.82194505e-02  9.29413810e-02
  9.96821746e-02  1.78462

In [2]:
import numpy as np

In [12]:
header_arr = np.fromfile("./index_10M_80_7000_centroids/level2_header.bin", dtype=np.uint32)


In [14]:
len(header_arr)

160

In [None]:
ids_mm = np.memmap("./index_10M_80_7000_centroids/all_indices.bin", dtype=np.uint32, mode="r",
                offset=0,
                shape=(10_000_000,))

OSError: [WinError 8] Not enough memory resources are available to process this command

In [10]:
len(ids_mm)


10000000