### Notes:
This notebook uses scikit-learn's `pairwise_distances_chunked` and `BallTree` to compute pairwise distances in batches for later use in the HDBSCAN model.

In [None]:
import pandas as pd
import numpy as np
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances_chunked
from sklearn.neighbors import BallTree
from scipy.sparse import csr_matrix, save_npz

In [None]:
# Load the spectra and put the rows in chronological order.
df = pd.read_csv('spectra.csv').sort_values(by='TIME')

# Re-base TIME so the earliest row sits at zero, downcast it to the smallest
# integer dtype where possible, then use it as the index.
df['TIME'] = pd.to_numeric(df['TIME'] - df['TIME'].iloc[0], downcast='integer')
df = df.set_index('TIME')

# Convert the remaining column labels to numeric values.
df.columns = pd.to_numeric(df.columns, downcast='integer')

In [None]:
# Hold out 20% of the rows for testing and keep the other 80% for training.
# The fixed random_state makes the split reproducible between runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Compute the pairwise Euclidean distances between training observations in
# row chunks and write each chunk to disk as a sparse matrix.
chunk_size = 1000
n_samples = train_df.shape[0]
n_chunks = int(np.ceil(n_samples / chunk_size))
save_interval = 10000  # print a progress line every `save_interval` rows

# Chunks whose starting row offset is <= this value are skipped, so a rerun
# can continue after an earlier job. Set to -1 to compute every chunk.
# NOTE: chunk files written without column alignment (rows holding distances
# in sorted order) are not compatible with the files written below and
# should be regenerated.
resume_after = 38_000

# BallTree defaults to the Minkowski metric with p=2, i.e. Euclidean distance.
tree = BallTree(train_df, leaf_size=40)

for i in range(n_chunks):
    start_idx = i * chunk_size
    if start_idx <= resume_after:
        continue
    end_idx = min(start_idx + chunk_size, n_samples)

    # query() returns each row's neighbours ordered by distance, so the
    # neighbour indices are required to know which observation every distance
    # refers to. k=n_samples keeps all pairwise distances (the query point
    # itself counts as one of the k neighbours).
    distances, indices = tree.query(
        train_df.iloc[start_idx:end_idx],
        k=n_samples,
        return_distance=True,
    )

    # Scatter the sorted distances back into observation order so that
    # column j of the chunk is the distance to training observation j.
    aligned = np.zeros((end_idx - start_idx, n_samples), dtype=distances.dtype)
    np.put_along_axis(aligned, indices, distances, axis=1)
    distance_chunk = csr_matrix(aligned)

    # Save the distance matrix chunk to disk
    save_npz(f'distance_matrix_chunk_{start_idx + 1}-{end_idx}.npz', distance_chunk)

    # Report progress every `save_interval` rows and after the final chunk
    if (start_idx + chunk_size) % save_interval == 0 or end_idx == n_samples:
        print(f"Saved distance matrix chunk for observations {start_idx + 1}-{end_idx}")