# Imports

In [1]:
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
import json

In [2]:
import hnsw
from iris.io.dataclasses import IrisTemplate
from iris_integration import (
    irisint_make_query as make_query,
    irisint_query_to_vector as query_to_vector,
    irisint_distance as distance,
    _np_to_bigint
)

In [3]:
n_jobs = 4 # Fit to CPU
DIM = (2, 32, 200)
X, Y = DIM [1:]

In [8]:
M = 16
efConstruction = 128
db_size = 10

# Functions

In [9]:
def numpy_array_to_iris_df(numpy_array):
    def create_iris_template(matrix, mask):
        return IrisTemplate(
            iris_codes=matrix,
            mask_codes=mask, 
            # iris_code_version="v3.0" # Doesn't work on open-iris==1.0.0
        )
    mask = [np.ones(DIM[1:], dtype=np.bool_) for _ in range(DIM[0])]
    iris_templates = Parallel(n_jobs=n_jobs)(delayed(create_iris_template)(list(matrix), mask) for matrix in numpy_array)
    return pd.DataFrame({'Template': iris_templates}).assign(Inserted = False)

In [10]:
def import_voter_model_rust_implementation(path_low, path_high, total_num_samples, num_samples=None):
    num_samples = num_samples if num_samples else total_num_samples
    assert num_samples <= total_num_samples
    low_high_data_lst = [
        np.unpackbits(np.fromfile(path, dtype=np.uint8), bitorder="little")
        .reshape(total_num_samples, X, Y)
        [np.random.choice(total_num_samples, size=num_samples, replace=False)] for path in [path_low, path_high]
    ]
    data = np.stack(low_high_data_lst, axis=1).astype(bool)
    return numpy_array_to_iris_df(data)

In [11]:
def update_db(db, iris_df, db_size):
    db_current_size = db.get_stats()['db_size']
    assert (db_size - db_current_size) > 0
    
    new_irises = iris_df.loc[range(db_current_size, db_size), 'Template']
    for iris in new_irises:
        db.insert(make_query(iris))
    iris_df.loc[range(db_current_size, db_size), 'Inserted'] = True

# Try-outs

In [12]:
empty_db = hnsw.HNSW(
    M=M, 
    efConstruction=efConstruction, 
    m_L=1/np.log(M), 
    distance_func=distance, 
    query_to_vector_func=query_to_vector
)

In [21]:
updated_db = hnsw.HNSW(
    M=M, 
    efConstruction=efConstruction, 
    m_L=1/np.log(M), 
    distance_func=distance, 
    query_to_vector_func=query_to_vector
)
iris_df = import_voter_model_rust_implementation(
    path_low='2M_voter_arrays_80k_b45.dat', 
    path_high='2M_voter_arrays_7k_b13.dat', 
    total_num_samples=1000000, 
    num_samples=db_size
)
update_db(updated_db, iris_df, db_size)

In [14]:
for attribute in dir(empty_db):
    if not attribute.startswith('__'):
        new_value = getattr(empty_db, attribute)
        if callable(new_value):
            continue
        processed_value = getattr(updated_db, attribute)
        if new_value != processed_value:
            print(f"Attribute '{attribute}' has changed.")

Attribute 'entry_point' has changed.
Attribute 'layers' has changed.
Attribute 'lock' has changed.
Attribute 'n_comparisons' has changed.
Attribute 'n_distances' has changed.
Attribute 'n_improve' has changed.
Attribute 'n_insertions' has changed.
Attribute 'stat_time' has changed.
Attribute 'vectors' has changed.


# Code Base

In [15]:
def parse_string_to_dict(input_string):
    try:
        parsed_dict = json.loads(input_string)
        return parsed_dict
    except json.JSONDecodeError as e:
        print(f"Error parsing string: {e}")
        return None

def update_entry_point(db, entries):
    db.entry_point[:] = entries['id'].values.tolist()

def update_n_insertions(db, vectors):
    db.n_insertions = len(vectors)

def update_layers(db, links):
    def process_links(df):
        def process_row(row):
            return [(item[1], item[0]) for item in row['queue']]
            
        df['processed_queue'] = df['links'].apply(lambda x: process_row(x))
        return pd.Series(df['processed_queue'].values, index=df['source_ref']).to_dict()
        
    db.layers = links.groupby('layer').apply(process_links).sort_index(ascending=True).tolist()

def update_vectors(db, vectors):
    def process_vectors(data_dict):
        data = np.array(data_dict['data']['data'])
        bi = np.where(data == -1, 1, 0)
        mi = np.where(data != 0, 1, 0)
        return (_np_to_bigint(bi.astype(np.bool_)), _np_to_bigint(mi.astype(np.bool_)))

    vectors_sorted = vectors.sort_values(by='id')
    processed_points = vectors_sorted['point'].apply(process_vectors)
    db.vectors = processed_points.tolist()

def copy_in(db, vectors, links, entries):
    update_vectors(db, vectors)
    update_layers(db, links)
    update_n_insertions(db, vectors)
    update_entry_point(db, entries)

In [16]:
vectors = pd.read_csv('hnsw_db_100_3896635365_vectors.csv')
links = pd.read_csv('hnsw_db_100_141959194_hawk_graph_links.csv')
entries = pd.read_csv('hnsw_db_100_141959194_hawk_graph_entry.csv')

In [17]:
links['links'] = links['links'].apply(lambda x: parse_string_to_dict(x))
vectors['point'] = vectors['point'].apply(lambda x: parse_string_to_dict(x))

In [18]:
copy_in(empty_db, vectors, links, entries)

  db.layers = links.groupby('layer').apply(process_links).sort_index(ascending=True).tolist()
