# Imports and Functions

## Imports and Constants

In [1]:
# Standard Library Imports
import os
import pickle
import sys
import time

In [2]:
# Third-Party Library Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed, parallel_backend
from functools import reduce
from operator import mul

In [3]:
# Custom Library Imports
from iris.io.dataclasses import IrisTemplate
from iris_integration import (
    iris_with_noise,
    irisint_make_query as make_query,
    irisint_query_to_vector as query_to_vector,
    irisint_distance as distance,
)
import hnsw

In [4]:
# Constants shared by the helper functions and experiment cells below
n_jobs = 6  # Worker count passed to joblib.Parallel; adjust to the CPU this runs on
DIM = (2, 32, 200)  # Iris template shape; the loaders treat DIM[1] x DIM[2] as one 2D plane and DIM[0] as the number of stacked planes

## Utility Functions

In [5]:
last_update_time = time.time()  # Timestamp of the most recently emitted progress message
def print_progress(msg, delay=1, force_print=False):
    """Write an in-place, single-line progress message to stdout.

    The message is emitted only when more than ``delay`` seconds have elapsed
    since the last emitted message, or unconditionally when ``force_print``
    is true. Emitting a message resets the module-level timer.
    """
    global last_update_time
    too_soon = not (time.time() - last_update_time > delay)
    if too_soon and not force_print:
        return

    out = sys.stdout
    out.write('\r' + ' ' * 200)  # Overwrite the previous message with blanks
    out.write(f"\r{msg}")  # Return to the line start and write the new message
    out.flush()
    last_update_time = time.time()

In [6]:
def save_pickle(obj, filename):
    """Serialize ``obj`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten. A confirmation line is printed after the write completes.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(obj)
    print(f"Object successfully saved to {filename}")

In [7]:
def load_pickle(filename):
    """Read ``filename`` and return the Python object pickled inside it.

    A confirmation line is printed once the object has been restored.
    Unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(filename, 'rb') as source:
        restored = pickle.Unpickler(source).load()
    print(f"Object successfully loaded from {filename}")
    return restored

In [8]:
def plot_boolean_iris(matrix, title=''):
    """Show a 2D boolean matrix as a grayscale image with an optional title."""
    # Draw on the current pyplot figure using the grayscale colormap
    plt.imshow(
        X=matrix,
        cmap='gray',
    )
    plt.title(title)
    plt.show()

In [9]:
def int_to_scaled_string(n):
    """Convert an integer into a short human-readable string (e.g. 1.2K, 3M).

    The value is divided by the largest power of 1000 that fits (capped at
    trillions, suffix ``T``) and shown with one decimal place, or with no
    decimals when the scaled value is a whole number.

    Args:
        n: Integer to format. Negative values keep their sign.

    Returns:
        The formatted string, e.g. ``"999"``, ``"1K"``, ``"4.2M"``.
    """
    suffixes = ['', 'K', 'M', 'B', 'T']
    last_idx = len(suffixes) - 1
    # Each suffix step covers three more decimal digits of the magnitude
    idx = min(last_idx, (len(str(abs(n))) - 1) // 3)
    scaled = n / (1000 ** idx)

    # A value such as 999_999 would round to "1000.0K"; promote it to the
    # next suffix so it is rendered as "1.0M" instead.
    if idx < last_idx and scaled % 1 and abs(float(f"{scaled:.1f}")) >= 1000:
        idx += 1
        scaled = n / (1000 ** idx)

    if scaled % 1:
        return f"{scaled:.1f}{suffixes[idx]}"
    return f"{int(scaled)}{suffixes[idx]}"

In [10]:
def update_avg_time(current_avg, denominator, new_time):
    """Fold one more sample into a running mean.

    ``current_avg`` is the mean of the first ``denominator`` samples and
    ``new_time`` is the next sample; the mean over ``denominator + 1``
    samples is returned.
    """
    previous_total = current_avg * denominator  # Sum of the samples seen so far
    sample_count = denominator + 1
    return (previous_total + new_time) / sample_count

## Data Loading Functions

In [11]:
def read_partial_file(filename, num_bits):
    """Read the leading bytes of a binary file that cover ``num_bits`` bits.

    The bit count is rounded up to whole bytes. The bytes actually read
    (fewer if the file is shorter) are returned as a 1D ``uint8`` array.
    """
    whole_bytes, leftover_bits = divmod(num_bits, 8)
    byte_count = whole_bytes + (1 if leftover_bits else 0)  # Round up to a full byte
    with open(filename, 'rb') as source:
        return np.frombuffer(source.read(byte_count), dtype=np.uint8)

In [12]:
def load_and_reshape_masks(filename, num_masks, DIM=DIM):
    """Load bit-packed masks from a file and expand them to the full template shape.

    The file is read as ``num_masks`` consecutive masks of
    ``(DIM[1] // 2) x DIM[2]`` bits each. Every mask is tiled twice along its
    row axis and then repeated ``DIM[0]`` times along a new second axis.

    Args:
        filename: Path of the bit-packed mask file.
        num_masks: Number of masks to read from the start of the file.
        DIM: Template shape; ``DIM[0]`` is the repeat count, ``DIM[1]`` the
            full height and ``DIM[2]`` the width.

    Returns:
        A ``uint8`` array of 0/1 values with shape
        ``(num_masks, DIM[0], 2 * (DIM[1] // 2), DIM[2])``.

    Raises:
        ValueError: If the file holds fewer bits than requested.
    """
    half_height, width = DIM[1] // 2, DIM[2]
    total_bits = half_height * width * num_masks

    bits = np.unpackbits(read_partial_file(filename, total_bits))
    if bits.size < total_bits:
        raise ValueError(
            f"{filename} holds {bits.size} bits, but {total_bits} are needed for {num_masks} masks"
        )
    # unpackbits always yields whole bytes; drop the padding bits of the last
    # byte so the reshape also works when the bit count is not a multiple of 8
    bits = bits[:total_bits]

    half_masks = bits.reshape((num_masks, half_height, width))
    full_height_masks = np.tile(half_masks, (1, 2, 1))  # Stack each half-mask on top of itself
    # Insert a new axis after the mask index and repeat the mask DIM[0] times along it
    return np.repeat(full_height_masks[:, np.newaxis, :, :], DIM[0], axis=1)

In [13]:
def load_and_reshape_irises(path_low, path_high, num_samples, DIM=DIM):
    """Load bit-packed iris planes from two files and join them into a boolean array.

    Each file is read as ``num_samples`` consecutive ``DIM[1] x DIM[2]`` bit
    planes, unpacked least-significant bit first. The planes from
    ``path_low`` and ``path_high`` are concatenated along the row axis.

    Args:
        path_low: Path of the first bit-packed file (rows ``0 .. DIM[1]-1``).
        path_high: Path of the second bit-packed file (rows ``DIM[1] .. 2*DIM[1]-1``).
        num_samples: Number of samples to read from the start of each file.
        DIM: Template shape; only ``DIM[1:]`` (plane height and width) is used.

    Returns:
        A boolean array of shape ``(num_samples, 2 * DIM[1], DIM[2])``.

    Raises:
        ValueError: If either file holds fewer bits than requested.
    """
    plane_shape = tuple(DIM[1:])
    total_bits = reduce(mul, plane_shape) * num_samples

    planes = []
    for path in (path_low, path_high):
        bits = np.unpackbits(read_partial_file(path, total_bits), bitorder="little")
        if bits.size < total_bits:
            raise ValueError(
                f"{path} holds {bits.size} bits, but {total_bits} are needed for {num_samples} samples"
            )
        # unpackbits always yields whole bytes; drop the padding bits of the
        # last byte so the reshape works for non-byte-aligned bit counts
        planes.append(bits[:total_bits].reshape(num_samples, *plane_shape))

    return np.concatenate(planes, axis=1).astype(bool)

## Test Functions and Database Build-Up

In [14]:
def update_db(db, iris_df, db_size, force_layer=None):
    """Grow ``db`` to ``db_size`` entries using templates from ``iris_df``.

    Rows labelled from the current database size up to ``db_size - 1`` are
    inserted in order, and their 'Inserted' flag is set to True afterwards.
    Nothing happens when the database already holds ``db_size`` entries or
    more. ``force_layer`` is forwarded to ``db.insert`` as ``insert_layer``.
    """
    already_stored = db.get_stats()['db_size']
    remaining = db_size - already_stored
    if remaining <= 0:
        return  # Database is already at (or beyond) the target size

    pending_rows = range(already_stored, db_size)
    templates = iris_df.loc[pending_rows, 'Template']
    total = len(templates)
    size_label = int_to_scaled_string(db_size)

    for done, template in enumerate(templates, start=1):
        print_progress(
            f'Currently building {size_label} DB, with M={db.M}, efConstruction={db.efConstruction}. '
            f'Insertion Progress: {done/total:.1%}'
        )
        db.insert(make_query(template), insert_layer=force_layer)

    # Flag every row that was just inserted
    iris_df.loc[pending_rows, 'Inserted'] = True

In [15]:
def numpy_array_to_iris_df(iris_array, mask_array):
    """Build a DataFrame of iris templates from paired iris and mask arrays.

    ``iris_array`` and ``mask_array`` are iterated in lockstep; each pair is
    split along its first axis into a list and wrapped in an ``IrisTemplate``.
    Template construction is distributed over ``n_jobs`` joblib workers.

    Args:
        iris_array: Iterable of per-sample iris code arrays.
        mask_array: Iterable of per-sample mask code arrays, aligned with
            ``iris_array``.

    Returns:
        A DataFrame with a 'Template' column holding the templates and an
        'Inserted' column initialised to False.
    """
    def create_iris_template(matrix, mask):
        return IrisTemplate(
            iris_codes=matrix,
            mask_codes=mask,
            iris_code_version="v3.0",  # compatibility issues in version open-iris==1.0.0
        )

    # Generate iris templates in parallel
    iris_templates = Parallel(n_jobs=n_jobs)(
        delayed(create_iris_template)(list(iris), list(mask))
        for iris, mask in zip(iris_array, mask_array)
    )
    # Every template starts out as not yet inserted into a database
    return pd.DataFrame({'Template': iris_templates}).assign(Inserted=False)

In [16]:
def run_single_experiment(db, idx, iris, noise, efSearch, K):
    """Search ``db`` for a noisy copy of ``iris`` and report whether ``idx`` is found.

    Noise of level ``noise`` is applied to ``iris``, the result is turned into
    a query, and ``K`` results are requested with ``ef=efSearch``. Returns
    True when the second element of any returned tuple equals ``idx``.
    """
    perturbed = iris_with_noise(iris, noise_level=noise)
    hits = db.search(make_query(perturbed), K, ef=efSearch)
    for hit in hits:
        if idx == hit[1]:
            return True
    return False

# Stats Calculation

## Decision Boundary Stability Over DB Size

In [20]:
# HNSW parameter grid swept by the stability experiment
efConstruction = 128  # HNSW efConstruction build parameter (earlier runs used 256)
db_size_range = np.arange(50_000, 250_001, 5_000)  # Database sizes: 50K to 250K in steps of 5K
M_range = np.arange(16, 193, 8)  # `M` values: 16 to 192 in steps of 8
efSearch_range = np.arange(16, 193, 8)  # `efSearch` values: 16 to 192 in steps of 8
K = 1  # Nearest neighbours requested per search

# Per-configuration experiment settings
num_experiments = 1_000  # Noisy queries evaluated for each configuration
noise_level = 0.3  # Noise level applied to each query

# Output location for the results table
results_path = f'analysis_data/efConstruction{efConstruction}_db_size_stability_results.parquet'

In [None]:
# Synthetic data location -- replace the placeholder paths with real ones before running
synthetic_data_size = 2**22  # Sample count used to name the mask file
path_masks = f'path/to/synthetic_data/{int_to_scaled_string(synthetic_data_size)}_mask_arrays.dat'  # Modify this path
path_iris_low = 'path/to/synthetic_data/2_23_voter_arrays_90k_b090.dat'  # Modify this path
path_iris_high = 'path/to/synthetic_data/2_23_voter_arrays_14k_b010.dat'  # Modify this path

# Load only as many masks and irises as the largest database size requires
loaded_masks = load_and_reshape_masks(path_masks, db_size_range.max())
loaded_masks = loaded_masks.astype(bool)  # Unpacked 0/1 bits -> boolean masks
loaded_irises = load_and_reshape_irises(
    path_iris_low,
    path_iris_high,
    db_size_range.max(),
)

# Wrap each (iris, mask) pair in a template; irises are reshaped to the template shape DIM first
iris_df = numpy_array_to_iris_df(
    iris_array=loaded_irises.reshape(db_size_range.max(), *DIM),
    mask_array=loaded_masks,
)

In [None]:
# Accumulated result rows (one per M / DB size / efSearch combination) and
# the running mean duration of one full M iteration
results_lst, avg_iteration_time = [], 0

# Create the output directory up front: the first checkpoint is only written
# after a full DB build, so a missing directory would otherwise waste that work
results_dir = os.path.dirname(results_path)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

# Sweep over M; each value gets a freshly built database
for j, M in enumerate(M_range):
    start_time = time.time()  # Start of this M iteration

    # A new database starts empty, so no template counts as inserted yet
    iris_df['Inserted'] = False

    # Initialize the HNSW database with the current M value
    db = hnsw.HNSW(
        M=M,
        efConstruction=efConstruction,
        m_L=1 / np.log(M),  # Layer multiplier
        distance_func=distance,
        query_to_vector_func=query_to_vector
    )

    # Grow the same database step by step through the target sizes
    for db_size in db_size_range:
        update_db(db, iris_df, db_size)

        for efSearch in efSearch_range:
            # Draw a fresh random sample of already-inserted templates;
            # `indices` holds their row labels, `irises` the templates
            indices, irises = iris_df.loc[iris_df['Inserted'], 'Template'].sample(
                num_experiments
            ).reset_index().T.values

            # Run the searches on a thread pool so all workers share `db`
            with parallel_backend('threading'):
                results = Parallel(n_jobs=n_jobs)(
                    delayed(run_single_experiment)(
                        db, idx, iris, noise_level, efSearch, K
                    ) for idx, iris in zip(indices, irises)
                )

            # Recall is the fraction of queries that found their own template
            results_lst.append((
                M, efConstruction, db_size, efSearch,
                np.mean(results), len(db.layers[1])
            ))

        # Checkpoint all results gathered so far after every database size
        results_df = pd.DataFrame(
            results_lst,
            columns=['M', 'efConstruction', 'DB_Size', 'efSearch', 'Recall', 'Layer 1 Size']
        )
        results_df.to_parquet(results_path)

    # `end_time` holds the elapsed seconds of this M iteration; fold it into
    # the running mean over the j iterations completed before it
    end_time = time.time() - start_time
    avg_iteration_time = update_avg_time(avg_iteration_time, j, end_time)

    # Report progress and the estimated time to completion (EOC)
    print_progress(
        f'\r{(j+1)/len(M_range):.1%} Completed, '
        f'Average Iteration Time: {avg_iteration_time/3600:.1f} hours, '
        f'EOC: {(avg_iteration_time/3600)*(len(M_range)-(j+1)):.1f} hours\n', force_print=True
    )