In [1]:
# # Setup
# ! sudo apt install -y libgl1-mesa-glx libglib2.0-0 libsm6 libxrender1 libxext6
# ! pip install open-iris==1.0.0 faiss-cpu seaborn

# Imports and Functions

## Imports and Constants

In [2]:
import boto3
from io import BytesIO
import igraph as ig
import pickle
import iris
import scipy
import psutil
import time
from datetime import datetime
import sys
import threading
from itertools import combinations, product
from functools import reduce
from operator import mul
from joblib import Parallel, delayed, parallel_backend
import networkx as nx

In [3]:
import sys
import shutil
import os
# Make the repository root (the parent of the notebooks folder) importable so the
# project-local modules imported in the next cell can be resolved.
# The original author's absolute path is kept as the first choice; on any other
# machine it does not exist, so fall back to the current working directory.
# NOTE(review): the fallback assumes the kernel's working directory is the
# notebooks folder (Jupyter's default) — confirm for other launch setups.
_notebook_dir = '/Users/odedgoffer/Documents/GitHub/hnsw-demo/analysis_notebooks/'
if not os.path.isdir(_notebook_dir):
    _notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(_notebook_dir, '..'))
# Re-running this cell must not keep appending the same entry.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [4]:
import hnsw
from iris.io.dataclasses import IrisTemplate
from iris_integration import (
    iris_with_noise,
    irisint_make_query as make_query,
    irisint_query_to_vector as query_to_vector,
    irisint_distance as distance,
)
from iris_pairwise_min_dist_calculation import get_pairwise_min_dist_across_rotations

In [5]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp, ttest_ind
from scipy.spatial.distance import hamming

In [6]:
# Number of joblib workers used when building the iris templates in parallel.
n_jobs = 6 # Fit to CPU
# Template layout shared by the loading helpers: the mask loader returns arrays of
# shape (num_masks, DIM[0], DIM[1], DIM[2]) and the iris loader reads DIM[1] x DIM[2]
# bit planes from each input file.
# NOTE(review): presumably (planes, rows, columns) of an iris code — confirm.
DIM = (2, 32, 200)

## General Functions

In [7]:
last_update_time = time.time()
def print_progress(msg, delay=1, force_print=False):
    """Rewrite the current stdout line with ``msg``, at most once per ``delay``.

    The line is first blanked with spaces (terminal width minus one) and then
    overwritten, both via a carriage return, so consecutive calls update a
    single line instead of scrolling.

    Args:
        msg: Text to show.
        delay: Minimum number of seconds since the last printed update before
            another one is written.
        force_print: When truthy, write regardless of the elapsed time.
    """
    global last_update_time
    elapsed = time.time() - last_update_time
    if not (elapsed > delay or force_print):
        return

    blank_line = ' ' * (shutil.get_terminal_size().columns - 1)
    sys.stdout.write('\r' + blank_line)
    sys.stdout.write(f"\r{msg}")
    sys.stdout.flush()

    # Remember when we last wrote so the next call can be throttled.
    last_update_time = time.time()

def save_pickle(obj, filename):
    """Serialize ``obj`` with pickle into ``filename``, overwriting any existing file.

    Args:
        obj: Any picklable object.
        filename: Destination path; opened in binary write mode.
    """
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle)

def load_pickle(filename):
    """Read ``filename`` in binary mode and return the object pickled inside it.

    Unpickling can execute arbitrary code, so only use this on files that this
    notebook (or another trusted source) produced.

    Args:
        filename: Path of a pickle file.

    Returns:
        The deserialized object.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

In [8]:
def plot_boolean_iris(matrix, title=''):
    """Show ``matrix`` as a grayscale image with an optional title.

    Args:
        matrix: 2-D array-like accepted by ``plt.imshow``.
        title: Text placed above the image; empty by default.
    """
    # Draws on pyplot's current figure/axes and displays it immediately.
    plt.imshow(matrix, cmap='gray')
    plt.title(title)
    plt.show()

In [9]:
def int_to_scaled_string(n):
    """Format an integer compactly with a thousands suffix (K, M, B, T).

    The suffix is picked from the number of characters in ``str(abs(n))``
    (one step per three digits, capped at 'T'). The scaled value is printed
    with one decimal when it has a fractional part and as a plain integer
    otherwise, e.g. 1500 -> '1.5K', 100000 -> '100K', 1001 -> '1.0K'.

    Args:
        n: Integer to format.

    Returns:
        The formatted string.
    """
    suffixes = ('', 'K', 'M', 'B', 'T')
    digit_count = len(str(abs(n)))
    # One suffix step per full group of three digits, never past the last suffix.
    idx = min((digit_count - 1) // 3, len(suffixes) - 1)
    scaled = n / 1000 ** idx
    suffix = suffixes[idx]
    if scaled % 1:
        return f"{scaled:.1f}{suffix}"
    return f"{int(scaled)}{suffix}"

## Loading Functions

In [10]:
def read_partial_file(filename, num_bits):
    """Read the leading bytes of a file that hold at least ``num_bits`` bits.

    Args:
        filename: Path of the binary file to read.
        num_bits: Number of bits needed; rounded up to whole bytes.

    Returns:
        A ``np.uint8`` array over the bytes read. It is shorter than requested
        when the file itself is shorter.
    """
    whole_bytes, leftover_bits = divmod(num_bits, 8)
    # A partially used trailing byte still has to be read in full.
    byte_count = whole_bytes + (1 if leftover_bits else 0)
    with open(filename, 'rb') as stream:
        raw = stream.read(byte_count)
    return np.frombuffer(raw, dtype=np.uint8)

In [11]:
def load_and_reshape_masks(filename, num_masks, DIM=DIM):
    """Load bit-packed half-height masks and expand them to full template shape.

    Each stored mask holds ``(DIM[1] // 2) * DIM[2]`` bits. After unpacking,
    every mask is stacked on top of itself along the row axis and then repeated
    ``DIM[0]`` times along a new axis 1.

    NOTE(review): bits are unpacked with NumPy's default bit order, whereas
    ``load_and_reshape_irises`` passes ``bitorder="little"`` — confirm the mask
    file really uses a different packing.

    Args:
        filename: Path of the bit-packed mask file.
        num_masks: Number of masks to read from the start of the file.
        DIM: Template dimensions; only indices 0, 1 and 2 are used.

    Returns:
        ``np.uint8`` array of shape
        ``(num_masks, DIM[0], 2 * (DIM[1] // 2), DIM[2])``.

    Raises:
        ValueError: From ``reshape`` when the file holds fewer bits than requested.
    """
    half_rows, cols = DIM[1] // 2, DIM[2]
    total_bits = half_rows * cols * num_masks
    # unpackbits always yields whole bytes; drop the padding bits of the last
    # byte so the reshape also works when total_bits is not a multiple of 8.
    flattened_data = np.unpackbits(read_partial_file(filename, total_bits))[:total_bits]
    boolean_arrays = flattened_data.reshape((num_masks, half_rows, cols))
    # Duplicate each half-height mask along the row axis.
    vertically_stacked = np.tile(boolean_arrays, (1, 2, 1))
    # Repeat the full-height mask DIM[0] times along a new axis 1.
    duplicated_arrays = np.repeat(vertically_stacked[:, np.newaxis, :, :], DIM[0], axis=1)
    return duplicated_arrays

In [12]:
def load_and_reshape_irises(path_low, path_high, num_samples, DIM=DIM):
    """Load two bit-packed iris planes per sample and stack them row-wise.

    From each of the two files the first ``prod(DIM[1:]) * num_samples`` bits
    are unpacked (little bit order) and reshaped to
    ``(num_samples, *DIM[1:])``. The two results are concatenated along axis 1,
    the ``path_low`` rows first.

    Args:
        path_low: Path of the first bit-packed file.
        path_high: Path of the second bit-packed file.
        num_samples: Number of samples to read from the start of each file.
        DIM: Template dimensions; only ``DIM[1:]`` is used.

    Returns:
        Boolean array of shape ``(num_samples, 2 * DIM[1], *DIM[2:])``.

    Raises:
        ValueError: From ``reshape`` when a file holds fewer bits than requested.
    """
    total_bits = reduce(mul, DIM[1:]) * num_samples
    low_high_lst = []
    for path in [path_low, path_high]:
        # unpackbits always yields whole bytes; drop the padding bits of the last
        # byte so the reshape also works when total_bits is not a multiple of 8.
        bits = np.unpackbits(read_partial_file(path, total_bits), bitorder="little")[:total_bits]
        low_high_lst.append(bits.reshape(num_samples, *DIM[1:]))
    return np.concatenate(low_high_lst, axis=1).astype(bool)

## Test Functions and DB Buildup 

In [13]:
def update_db(db, iris_df, db_size):
    """Insert further templates into ``db`` until it holds ``db_size`` entries.

    Rows ``[current size, db_size)`` of ``iris_df`` are inserted in order, and
    their 'Inserted' flag is set to True afterwards (``iris_df`` is modified
    in place).

    Args:
        db: Index exposing ``get_stats()['db_size']`` and ``insert(query)``.
        iris_df: DataFrame with 'Template' and 'Inserted' columns, addressed
            by integer row labels.
        db_size: Target number of entries; must exceed the current size.

    Raises:
        AssertionError: If ``db_size`` is not larger than the current size.
    """
    db_current_size = db.get_stats()['db_size']
    assert (db_size - db_current_size) > 0

    pending_rows = range(db_current_size, db_size)
    pending_templates = iris_df.loc[pending_rows, 'Template']
    for position, template in enumerate(pending_templates, start=1):
        print_progress(f'Currently building {int_to_scaled_string(db_size)} Data-base. Insertion progress: {position/len(pending_templates):.2%}')
        db.insert(make_query(template))
    iris_df.loc[pending_rows, 'Inserted'] = True

In [14]:
def numpy_array_to_iris_df(iris_array, mask_array):
    """Build a DataFrame of ``IrisTemplate`` objects from paired code/mask arrays.

    Templates are built in parallel with joblib using ``n_jobs`` workers. Each
    sample and its mask are turned into lists along their first axis before
    being handed to ``IrisTemplate``.

    Args:
        iris_array: Iterable of per-sample iris code arrays.
        mask_array: Iterable of per-sample mask arrays, aligned with ``iris_array``.

    Returns:
        DataFrame with a 'Template' column and an 'Inserted' column set to False.
    """
    def build_template(codes, masks):
        # iris_code_version="v3.0" is left out: it doesn't work on open-iris==1.0.0
        return IrisTemplate(iris_codes=codes, mask_codes=masks)

    jobs = (
        delayed(build_template)(list(codes), list(masks))
        for codes, masks in zip(iris_array, mask_array)
    )
    iris_templates = Parallel(n_jobs=n_jobs)(jobs)
    template_df = pd.DataFrame({'Template': iris_templates})
    return template_df.assign(Inserted=False)

In [15]:
def run_single_experiment(db, idx, iris, noise, efSearch, K):
    """Search for a noisy copy of ``iris`` and report whether ``idx`` is returned.

    Args:
        db: Index exposing ``search(query, K, ef=...)``.
        idx: Identifier expected in the results.
        iris: Template to perturb and use as the query.
        noise: Noise level passed to ``iris_with_noise``.
        efSearch: Value forwarded as the ``ef`` argument of the search.
        K: Number of results requested.

    Returns:
        True if any result has ``idx`` as its second element, else False.
    """
    perturbed = iris_with_noise(iris, noise_level=noise)
    hits = db.search(make_query(perturbed), K, ef=efSearch)
    for hit in hits:
        if idx == hit[1]:
            return True
    return False

In [16]:
def get_expected_diameter(db_size, M):
    """Return the logarithm of ``db_size`` in base ``2*M - 1``.

    Computed as ``ln(db_size) / ln(2*M - 1)``; per the function name this
    serves as the expected graph diameter for a database of that size.

    Args:
        db_size: Number of entries in the database.
        M: HNSW connectivity parameter.
    """
    log_base = np.log((2 * M) - 1)
    return np.log(db_size) / log_base

# Data Loading

## Configurations

In [17]:
# Size label of the synthetic data set; in this notebook it only selects the mask file name.
synthetic_data_size = 2**22
# Number of masks/irises loaded from disk, and the upper bound for the tested database sizes.
max_tested_db_size = 100000

In [18]:
# Input files, relative to the working directory.
# The mask file name embeds the scaled data-set size ("4.2M" for 2**22).
path_masks = f'synthetic_data/{int_to_scaled_string(synthetic_data_size)}_mask_arrays.dat'
# Two bit-packed iris files; the loader stacks the "low" rows above the "high" rows.
path_iris_low = 'synthetic_data/2_23_voter_arrays_90k_b090.dat'
path_iris_high = 'synthetic_data/2_23_voter_arrays_14k_b010.dat'

## Loading

In [19]:
# Read only the first max_tested_db_size samples from each file; masks are converted
# to bool here, irises are already returned as bool by their loader.
loaded_masks = load_and_reshape_masks(path_masks, max_tested_db_size).astype(bool)
loaded_irises = load_and_reshape_irises(path_iris_low, path_iris_high, max_tested_db_size)

## Sanity checks

In [20]:
# Both arrays must contain exactly one entry per requested sample.
assert len(loaded_masks) == max_tested_db_size
assert len(loaded_irises) == max_tested_db_size
print(f'Successfuly loaded {int_to_scaled_string(max_tested_db_size)} masks and irises')

Successfuly loaded 100K masks and irises


# Graph analysis

## Configurations

In [21]:
# HNSW connectivity parameter; also reused as efConstruction when the index is created.
M = 64
# Database sizes at which the layer-0 graph is analysed: 10K to 100K in steps of 10K.
db_size_range = np.arange(10000, 100001, 10000)

In [22]:
# The largest tested database cannot exceed the number of samples loaded from disk.
assert db_size_range.max() <= max_tested_db_size

## DB and graph build-up and saving

In [28]:
# Wrap every loaded sample in an IrisTemplate; the irises are reshaped to
# (max_tested_db_size, *DIM) first so each sample has the same layout as its mask.
iris_df = numpy_array_to_iris_df(loaded_irises.reshape(max_tested_db_size, *DIM), loaded_masks)
# Empty HNSW index; entries are inserted incrementally by update_db.
# m_L evaluates to 1 / (6 * ln(M)).
# NOTE(review): the meaning of m_L is defined by the project's hnsw module — confirm
# that the factor of 6 is intentional.
db = hnsw.HNSW(
    M=M, 
    efConstruction=M, 
    m_L=1/np.log(M**6), 
    distance_func=distance, 
    query_to_vector_func=query_to_vector
)

In [29]:
# For every tested database size: grow the index, rebuild its layer-0 graph,
# compute normalised betweenness centrality and checkpoint the results to disk.
betweenness_centrality_dict = dict()
for db_size in db_size_range:
    # update_db asserts that the index grows, so this loop cannot be re-run
    # against an index that already holds db_size entries.
    update_db(db, iris_df, db_size)
    print_progress(f'Calculating Betweenness Centrality of Data-base size {int_to_scaled_string(db_size)}', force_print=True)

    # Collapse the layer-0 adjacency into unique undirected edges (smaller id first).
    undirected_edges = set()
    for source, target_lst in db.layers[0].items():
        for _, target in target_lst:
            undirected_edges.add((min(source, target), max(source, target)))

    layer_0_graph = ig.Graph()
    layer_0_graph.add_vertices(list(range(db_size)))
    layer_0_graph.add_edges(list(undirected_edges))

    # Normalise by the number of vertex pairs that exclude the vertex itself.
    max_betweenness = (db_size - 1) * (db_size - 2) / 2
    raw_betweenness = pd.Series(layer_0_graph.betweenness(directed=False))
    # NOTE(review): db.entry_point is stored by reference; if the index mutates
    # it in place on later insertions, every snapshot would show the final
    # state — confirm, and copy it here if so.
    betweenness_centrality_dict[db_size] = {
        'betweenness_centrality': raw_betweenness / max_betweenness,
        'entry_point': db.entry_point
    }
    # Checkpoint after every size so an interrupted run keeps the finished sizes.
    save_pickle(betweenness_centrality_dict, 'betweenness_centrality_dict.pkl')

Calculating Betweenness Centrality of Data-base size 10K                       Object successfully saved to betweenness_centrality_dict.pkl
Calculating Betweenness Centrality of Data-base size 20K                       Object successfully saved to betweenness_centrality_dict.pkl
Calculating Betweenness Centrality of Data-base size 30K                       Object successfully saved to betweenness_centrality_dict.pkl
Calculating Betweenness Centrality of Data-base size 40K                       Object successfully saved to betweenness_centrality_dict.pkl
Calculating Betweenness Centrality of Data-base size 50K                       Object successfully saved to betweenness_centrality_dict.pkl
Calculating Betweenness Centrality of Data-base size 60K                       Object successfully saved to betweenness_centrality_dict.pkl
Calculating Betweenness Centrality of Data-base size 70K                       Object successfully saved to betweenness_centrality_dict.pkl
Calculating Betweenn

In [None]:
# Reload the results checkpointed by the build-up cell, so the plots below can be
# produced without recomputing them.
betweenness_centrality_dict = load_pickle('betweenness_centrality_dict.pkl')

In [None]:
# One histogram of normalised betweenness centrality per database size, with a
# vertical line for every entry point, labelled with the share of vertices whose
# centrality is strictly higher than the entry point's.
for db_size, curr_betweenness_centrality_dict in betweenness_centrality_dict.items():
    betweenness_values = pd.Series(curr_betweenness_centrality_dict['betweenness_centrality'])
    entry_points = curr_betweenness_centrality_dict['entry_point']
    # Sort once per database size; the sorted values are reused for every entry point.
    sorted_betweenness = np.sort(betweenness_values)
    plt.figure(figsize=(10,5))
    sns.histplot(betweenness_values.rename('Betweenness Centrality'), stat='probability', color='#34675C')
    for entry_point in entry_points:
        entry_value = betweenness_values[entry_point]
        # Fraction of vertices with centrality <= the entry point's value.
        quantile = np.searchsorted(sorted_betweenness, entry_value, side='right') / len(betweenness_values)
        plt.axvline(x=entry_value, color='#F26800')
        # Place the label just above the top of the axes.
        plt.text(entry_value, plt.gca().get_ylim()[1] * 1.05, f'{1-quantile:.2%}', 
                 color='#F26800', ha='center', va='top', fontsize=10, fontweight='bold')
    plt.title(f'Betweenness Centrality Distribution\nData-base size {int_to_scaled_string(db_size)}', fontsize=15, y=1.05)
    plt.show()