In [1]:
# Setup
# ! sudo apt install -y libgl1-mesa-glx libglib2.0-0 libsm6 libxrender1 libxext6
# ! pip install open-iris==1.0.0 faiss-cpu seaborn

# Imports and Functions

## Imports and Constants

In [2]:
import os
import pickle
import shutil
import sys
import threading
import time
from datetime import datetime
from functools import reduce
from io import BytesIO
from itertools import combinations, product
from operator import mul

In [3]:
import boto3
import iris
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import psutil
import scipy
import seaborn as sns
from joblib import Parallel, delayed, parallel_backend
from scipy.spatial.distance import hamming
from scipy.stats import ks_2samp, ttest_ind
from threading import Lock

In [4]:
import hnsw
from iris.io.dataclasses import IrisTemplate
from iris_integration import (
    iris_with_noise,
    irisint_make_query as make_query,
    irisint_query_to_vector as query_to_vector,
    irisint_distance as distance,
    int_distance
)
from iris_pairwise_min_dist_calculation import get_pairwise_min_dist_across_rotations

In [5]:
# Notebook-wide settings shared by the helper functions and cells below.
n_jobs = 6 # Worker count passed to joblib.Parallel when building templates; fit to CPU
# Template layout: the loaders reshape raw bits into DIM[1] x DIM[2] planes and
# every template is made of DIM[0] such planes.
DIM = (2, 32, 200)
# Directory prefix (note the trailing slash) under which the pickled DBs are written.
base_path = 'db/'

## General Functions

In [6]:
# Timestamp of the most recent progress line; shared across calls to throttle output.
last_update_time = time.time()
def print_progress(msg, delay=1, force_print=False):
    """Overwrite the current console line with `msg`, at most once per `delay` seconds.

    Args:
        msg: Text to display on the (single) progress line.
        delay: Minimum number of seconds that must have passed since the last
            printed update for this call to print.
        force_print: When true, print regardless of how recently the last
            update happened.
    """
    global last_update_time
    throttled = (time.time() - last_update_time) <= delay
    if throttled and not force_print:
        return

    # Blank out the previous message (up to 200 columns) before writing the new one.
    blank_line = '\r' + ' ' * 200
    sys.stdout.write(blank_line)
    sys.stdout.write(f"\r{msg}")
    sys.stdout.flush()

    last_update_time = time.time()

def save_pickle(obj, filename):
    """Serialize `obj` with pickle into `filename` and report success on stdout.

    Args:
        obj: Any picklable object.
        filename: Destination path; the file is created or overwritten.
    """
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle)
    print(f"Object successfully saved to {filename}")

def load_pickle(filename):
    """Read a pickled object back from `filename` and report success on stdout.

    Args:
        filename: Path of a file previously written with pickle.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    with open(filename, 'rb') as handle:
        loaded = pickle.load(handle)
    print(f"Object successfully loaded from {filename}")
    return loaded

In [7]:
def plot_boolean_iris(matrix, title=''):
    """Display a 2-D matrix as a grayscale image with an optional title.

    Args:
        matrix: Array-like accepted by `plt.imshow`.
        title: Text shown above the image (empty by default).
    """
    # Draws on pyplot's current figure/axes and shows it immediately.
    plt.imshow(matrix, cmap='gray')
    plt.title(title)
    plt.show()

In [8]:
def int_to_scaled_string(n):
    """Format an integer compactly with a thousands suffix (K, M, B, T).

    The magnitude is taken from the number of characters in `str(abs(n))`, so
    the function is meant for integers. Whole scaled values are printed without
    decimals ("100K"), everything else with one decimal ("4.2M").

    Note: values just below a power of 1000 round up without switching suffix
    (e.g. 999999 -> "1000.0K").

    Args:
        n: Integer to format.

    Returns:
        The scaled number followed by its suffix, as a string.
    """
    suffixes = ['', 'K', 'M', 'B', 'T']
    digit_count = len(str(abs(n)))
    thousands_group = (digit_count - 1) // 3
    # Clamp so anything beyond trillions still uses the last suffix.
    idx = min(len(suffixes) - 1, max(0, thousands_group))
    suffix = suffixes[idx]
    scaled = n / (1000 ** idx)
    if scaled % 1:
        return f"{scaled:.1f}{suffix}"
    return f"{int(scaled)}{suffix}"

## Loading Functions

In [9]:
def read_partial_file(filename, num_bits):
    """Read just enough leading bytes of a binary file to cover `num_bits` bits.

    Args:
        filename: Path of the binary file.
        num_bits: Number of bits needed; rounded up to a whole number of bytes.

    Returns:
        A 1-D `np.uint8` array over the bytes that were read. If the file is
        shorter than requested, the array simply holds fewer bytes.
    """
    whole_bytes, leftover_bits = divmod(num_bits, 8)
    # A partially used trailing byte still has to be read in full.
    byte_count = whole_bytes + (1 if leftover_bits else 0)
    with open(filename, 'rb') as stream:
        raw = stream.read(byte_count)
    return np.frombuffer(raw, dtype=np.uint8)

In [10]:
def load_and_reshape_masks(filename, num_masks, DIM=DIM):
    """Load bit-packed half-height masks and expand them to full template shape.

    The file is read as `num_masks` consecutive masks of (DIM[1] // 2) x DIM[2]
    bits, unpacked with numpy's default bit order. Each mask is then stacked on
    top of itself along the row axis and replicated DIM[0] times along a new
    axis 1.

    Args:
        filename: Path of the bit-packed mask file.
        num_masks: Number of masks to read from the start of the file.
        DIM: Template dimensions; defaults to the notebook-wide `DIM`.

    Returns:
        A uint8 array of 0/1 values with shape
        (num_masks, DIM[0], 2 * (DIM[1] // 2), DIM[2]).
    """
    half_rows, cols = DIM[1] // 2, DIM[2]
    packed = read_partial_file(filename, (half_rows * cols) * num_masks)
    half_masks = np.unpackbits(packed).reshape((num_masks, half_rows, cols))
    # Repeat every half mask twice along the row axis to reach full height.
    full_masks = np.tile(half_masks, (1, 2, 1))
    # Insert a new axis and copy the mask once per template plane.
    return np.repeat(full_masks[:, np.newaxis, :, :], DIM[0], axis=1)

In [11]:
def load_and_reshape_irises(path_low, path_high, num_samples, DIM=DIM):
    """Load two bit-packed iris-code files and join them per sample.

    Each file is read as `num_samples` consecutive planes of DIM[1] x DIM[2]
    bits (unpacked little-endian within each byte). For every sample the plane
    from `path_low` is placed above the plane from `path_high`.

    Args:
        path_low: Path of the first bit-packed file.
        path_high: Path of the second bit-packed file.
        num_samples: Number of samples to read from the start of each file.
        DIM: Template dimensions; defaults to the notebook-wide `DIM`.

    Returns:
        A boolean array of shape (num_samples, 2 * DIM[1], DIM[2]).
    """
    plane_shape = DIM[1:]
    total_bits = reduce(mul, plane_shape) * num_samples
    planes = []
    for path in (path_low, path_high):
        bits = np.unpackbits(read_partial_file(path, total_bits), bitorder="little")
        planes.append(bits.reshape(num_samples, *plane_shape))
    return np.concatenate(planes, axis=1).astype(bool)

## Test Functions and DB Buildup 

In [12]:
def update_db(db, iris_df, db_size, force_layer=None):
    """Grow `db` in place until it holds `db_size` entries.

    Templates are taken from rows [current size, db_size) of `iris_df`, turned
    into queries and inserted one by one while a progress line is printed.
    Once all of them are inserted, those rows are flagged in the 'Inserted'
    column. Nothing happens when the DB already holds at least `db_size`
    entries.

    Args:
        db: Index exposing `get_stats()`, `insert()`, `M` and `efConstruction`.
        iris_df: DataFrame with 'Template' and 'Inserted' columns; row labels
            are used as insertion order.
        db_size: Target number of entries in the DB.
        force_layer: Forwarded to `db.insert` as `insert_layer`.
    """
    db_current_size = db.get_stats()['db_size']
    if db_size <= db_current_size:
        return

    pending_rows = range(db_current_size, db_size)
    pending_templates = iris_df.loc[pending_rows, 'Template']
    total = len(pending_templates)
    for done, template in enumerate(pending_templates, start=1):
        print_progress(f'Currently building {int_to_scaled_string(db_size)} DB, M={db.M}, with efConstruction={db.efConstruction}. Insertion Progress: {done/total:.1%}')
        db.insert(make_query(template), insert_layer=force_layer)
    iris_df.loc[pending_rows, 'Inserted'] = True

In [13]:
def numpy_array_to_iris_df(iris_array, mask_array):
    """Wrap paired iris-code / mask arrays into a DataFrame of IrisTemplate objects.

    Templates are built in parallel with joblib using the notebook-wide
    `n_jobs`.

    Args:
        iris_array: Iterable of per-sample iris-code arrays; each is split
            along its first axis into a list before being handed to
            IrisTemplate.
        mask_array: Iterable of per-sample mask arrays, paired with
            `iris_array` by position and split the same way.

    Returns:
        A DataFrame with a 'Template' column holding the templates and an
        'Inserted' column initialised to False.
    """
    def build_template(codes, masks):
        # iris_code_version="v3.0" is intentionally not passed:
        # it doesn't work on open-iris==1.0.0.
        return IrisTemplate(iris_codes=codes, mask_codes=masks)

    jobs = (
        delayed(build_template)(list(code_stack), list(mask_stack))
        for code_stack, mask_stack in zip(iris_array, mask_array)
    )
    templates = Parallel(n_jobs=n_jobs)(jobs)
    frame = pd.DataFrame({'Template': templates})
    return frame.assign(Inserted=False)

# Data Loading

## Configurations

In [15]:
# Size label of the synthetic dataset; in the visible cells it is only used to
# build the mask file name.
synthetic_data_size = 2**22
# Number of masks/irises actually loaded from disk; upper bound for any DB size built below.
max_tested_db_size = 100000

In [16]:
# Input files, relative to the notebook's working directory.
# The mask file name embeds the scaled dataset size (e.g. "4.2M" for 2**22).
path_masks = f'synthetic_data/{int_to_scaled_string(synthetic_data_size)}_mask_arrays.dat'
# NOTE(review): the iris file names are hard-coded ("2_23_...") while the mask
# name is derived from synthetic_data_size - confirm they belong to the same dataset.
path_iris_low = 'synthetic_data/2_23_voter_arrays_90k_b090.dat'
path_iris_high = 'synthetic_data/2_23_voter_arrays_14k_b010.dat'

## Loading

In [17]:
# Read only the first max_tested_db_size entries of each file.
# Masks come back as 0/1 uint8 values, so they are cast to bool to match the irises.
loaded_masks = load_and_reshape_masks(path_masks, max_tested_db_size).astype(bool)
# Irises are returned as bool with the low/high planes stacked along axis 1.
loaded_irises = load_and_reshape_irises(path_iris_low, path_iris_high, max_tested_db_size)

## Sanity Checks

In [18]:
# Both loaders must have produced exactly one entry per requested sample.
assert len(loaded_masks) == max_tested_db_size
assert len(loaded_irises) == max_tested_db_size
print(f'Successfuly loaded {int_to_scaled_string(max_tested_db_size)} masks and irises')

Successfuly loaded 100 masks and irises


# DB Creation and Saving

## Configurations

In [19]:
# HNSW construction parameters, passed to hnsw.HNSW below.
M = 64
efConstruction = 64
# DB sizes to build and save, in steps of 100000. The sweep is capped at
# max_tested_db_size because only that many templates are loaded; the previous
# hard-coded upper bound of 500000 exceeded the 100000 loaded samples and made
# the size check in the next cell fail on every run.
db_sizes = np.arange(100000, max_tested_db_size + 1, 100000)

In [19]:
# Guard: the largest DB to build must not need more templates than were loaded.
assert db_sizes.max() <= max_tested_db_size

## Build up and Saving

In [24]:
# Split the stacked low/high planes back into a leading DIM[0] axis so every
# sample has shape DIM, then wrap each code/mask pair in an IrisTemplate.
iris_df = numpy_array_to_iris_df(loaded_irises.reshape(max_tested_db_size, *DIM), loaded_masks)
# Empty HNSW index; it is filled incrementally by update_db in the next cell.
# m_L is set to 1/ln(M).
db = hnsw.HNSW(
    M=M, 
    efConstruction=efConstruction, 
    m_L=1/np.log(M), 
    distance_func=distance, 
    query_to_vector_func=query_to_vector
)

In [25]:
# Make sure the output directory exists so the first save does not fail on a
# fresh checkout.
if base_path:
    os.makedirs(base_path, exist_ok=True)

for db_size in db_sizes:
    # Grow the same index up to `db_size` entries (no-op if already that large),
    # then snapshot it to disk before continuing to the next size.
    update_db(db, iris_df, db_size)
    # The lock attribute is dropped before pickling - presumably because the
    # lock object cannot be pickled.
    del db.lock
    try:
        save_pickle(db, f'{base_path}db{int_to_scaled_string(db_size)}_M{M}_efConstruction{efConstruction}.pkl')
    finally:
        # Always give the index a lock back, even when saving fails, so the
        # in-memory DB stays usable after an error.
        db.lock = Lock()

Currently building 1K DB, M=64, with efConstruction=64. Insertion Progress: 97.9%                                                                                                                       Object successfully saved to db/1Kdb_M64_efConstruction64.pkl
Currently building 2K DB, M=64, with efConstruction=64. Insertion Progress: 98.9%                                                                                                                       Object successfully saved to db/2Kdb_M64_efConstruction64.pkl
