## HDBSCAN

In [None]:
!conda install -c conda-forge hdbscan

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import hdbscan
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances_chunked
from sklearn.neighbors import BallTree

In [None]:
# Read the spectra table and order its rows chronologically.
df = pd.read_csv('/home/idies/workspace/Temporary/jaclar15/scratch/nicer/spectra2.csv').sort_values('TIME')
# Re-express TIME as an offset from the earliest row, stored in the smallest
# integer dtype that pandas can safely downcast to.
df['TIME'] = pd.to_numeric(df['TIME'] - df['TIME'].iloc[0], downcast='integer')
# Use the relative time as the row index.
df = df.set_index('TIME')
# Column labels are read from the CSV header as text; convert them to numbers
# (presumably channel identifiers -- confirm against the file).
df.columns = pd.to_numeric(df.columns, downcast='integer')

In [None]:
# Hold out 20% of the rows for testing and keep the remaining 80% for training.
# The fixed random_state makes the split reproducible between runs.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Copy one block of distance rows into the full distance matrix
def accumulate_distance_matrix(chunk, start, accumulated_matrix):
    """Write ``chunk`` into consecutive rows of ``accumulated_matrix`` in place.

    Parameters
    ----------
    chunk : 2-D block of rows exposing ``.shape``
        Values to store; ``chunk.shape[0]`` rows are written.
    start : int
        Index of the first destination row.
    accumulated_matrix : 2-D container supporting slice assignment
        Destination matrix; every column of the selected rows is overwritten.

    Returns
    -------
    None
    """
    row_block = slice(start, start + chunk.shape[0])
    accumulated_matrix[row_block, :] = chunk

# Build the full pairwise Euclidean distance matrix of train_df, one block of
# rows at a time, and save it to disk in sparse (CSR) format.
from scipy.sparse import lil_matrix, save_npz

chunk_size = 1000
n_samples = train_df.shape[0]
n_chunks = int(np.ceil(n_samples / chunk_size))

# Create BallTree for faster distance calculation
tree = BallTree(train_df, leaf_size=40)

# Initialize an empty sparse distance matrix
distance_matrix = lil_matrix((n_samples, n_samples))

# Compute the pairwise Euclidean distances in chunks
for i in range(n_chunks):
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, n_samples)
    # Ask for all n_samples neighbours (the query point itself included) so
    # each returned row has exactly one distance per column of the matrix.
    distances, neighbor_idx = tree.query(train_df.iloc[start_idx:end_idx], k=n_samples, return_distance=True)
    # tree.query sorts every row nearest-first, so position j is the j-th
    # nearest neighbour, not sample j. Scatter the distances back to the
    # column of the sample they refer to before storing them.
    ordered = np.empty_like(distances)
    np.put_along_axis(ordered, neighbor_idx, distances, axis=1)
    accumulate_distance_matrix(ordered, start_idx, distance_matrix)

# NOTE(review): exact zeros (e.g. the diagonal) are not stored by the sparse
# format -- confirm how the downstream consumer treats unstored entries.
# Save the distance matrix to disk
save_npz('distance_matrix_sparse.npz', distance_matrix.tocsr())

In [None]:
# Load the distance matrix from disk
# from scipy.sparse import load_npz
# loaded_distance_matrix = load_npz('distance_matrix_sparse.npz')

# # Make clusters
# clusterer = hdbscan.HDBSCAN(metric='precomputed', min_samples=5, core_dist_n_jobs=-2)
# # Dump to a pickle file
# dump(clusterer, open('untrained-hdbscan_cluster-job.joblib', 'wb'))
# clusterer.fit(loaded_distance_matrix)
# dump(clusterer, open('trained-hdbscan_cluster-job.joblib', 'wb'))
# y_pred = clusterer.labels_
# y_prob = clusterer.probabilities_

# # Plot clusters
# # plt = df.groupby(y_pred).agg('mean').T.plot()
# # Add the cluster labels to the DataFrame (labels follow the rows of train_df, which the distance matrix was built from)
# df_with_labels = train_df.assign(cluster_labels=y_pred)

# # Plot clusters
# plt = df_with_labels.groupby('cluster_labels').agg('mean').T.plot()

In [None]:
# # Make clusters
# clusterer = hdbscan.HDBSCAN(metric='euclidean', min_samples=5, core_dist_n_jobs=-2)
# # dump to a pickle file
# dump(clusterer, open('untrained-hdbscan_cluster-job.joblib', 'wb'))
# clusterer.fit(train_df)
# dump(clusterer, open('trained-hdbscan_cluster-job.joblib', 'wb'))
# y_pred = clusterer.labels_
# y_prob = clusterer.probabilities_ # The hdbscan library implements soft clustering, where each data point is assigned a cluster 
#                                   # membership score ranging from 0.0 to 1.0. A score of 0.0 represents a sample that is not in the cluster at all
# # Plot clusters
# plt = df.groupby(y_pred).agg('mean').T.plot()

### Testing

In [None]:
# test_df = df.sample(n=1000)

# clusterer = hdbscan.HDBSCAN(metric='euclidean', min_samples=5)
# dump(clusterer, open('untrained-hdbscan_cluster-job.joblib', 'wb'))
# clusterer.fit(test_df)
# dump(clusterer, open('trained-hdbscan_cluster-job.joblib', 'wb'))
# y_pred = clusterer.labels_
# # y_prob = clusterer.probabilities_ # Probability 
# # Plot clusters
# cluster_std = test_df.groupby(y_pred).agg(np.std)
# plt = test_df.groupby(y_pred).agg('mean').T.plot(yerr=cluster_std.T)

### Testing BallTree

In [None]:
# Work on a random subset of at most 10,000 training rows for this experiment.
# min() keeps the cell from raising when fewer rows are available, and the
# seed matches the train/test split so the subset is reproducible.
train_df = train_df.sample(n=min(10_000, len(train_df)), random_state=42)
# Copy one block of distance rows into the full distance matrix
def accumulate_distance_matrix(chunk, start, accumulated_matrix):
    """Write ``chunk`` into consecutive rows of ``accumulated_matrix`` in place.

    Parameters
    ----------
    chunk : 2-D block of rows exposing ``.shape``
        Values to store; ``chunk.shape[0]`` rows are written.
    start : int
        Index of the first destination row.
    accumulated_matrix : 2-D container supporting slice assignment
        Destination matrix; every column of the selected rows is overwritten.

    Returns
    -------
    None
    """
    row_block = slice(start, start + chunk.shape[0])
    accumulated_matrix[row_block, :] = chunk

# Build the full pairwise Euclidean distance matrix of the sampled train_df,
# one block of rows at a time, and save it to disk in sparse (CSR) format.
from scipy.sparse import lil_matrix, save_npz

chunk_size = 1000
n_samples = train_df.shape[0]
n_chunks = int(np.ceil(n_samples / chunk_size))

# Create BallTree for faster distance calculation
tree = BallTree(train_df, leaf_size=40)

# Initialize an empty sparse distance matrix
distance_matrix = lil_matrix((n_samples, n_samples))

# Compute the pairwise Euclidean distances in chunks
for i in range(n_chunks):
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, n_samples)
    # Ask for all n_samples neighbours (the query point itself included) so
    # each returned row has exactly one distance per column of the matrix.
    distances, neighbor_idx = tree.query(train_df.iloc[start_idx:end_idx], k=n_samples, return_distance=True)
    # tree.query sorts every row nearest-first, so position j is the j-th
    # nearest neighbour, not sample j. Scatter the distances back to the
    # column of the sample they refer to before storing them.
    ordered = np.empty_like(distances)
    np.put_along_axis(ordered, neighbor_idx, distances, axis=1)
    accumulate_distance_matrix(ordered, start_idx, distance_matrix)

# NOTE(review): exact zeros (e.g. the diagonal) are not stored by the sparse
# format -- confirm how the downstream consumer treats unstored entries.
# Save the distance matrix to disk
save_npz('distance_matrix_sparse.npz', distance_matrix.tocsr())

In [None]:
# Load the distance matrix from disk
from scipy.sparse import load_npz
loaded_distance_matrix = load_npz('distance_matrix_sparse.npz')

# Make clusters; metric='precomputed' tells HDBSCAN the input already holds
# pairwise distances rather than feature vectors.
clusterer = hdbscan.HDBSCAN(metric='precomputed', min_samples=5, core_dist_n_jobs=-2)
# Snapshot the unfitted estimator. Passing the path lets joblib open and close
# the file itself, so no file handle is left open.
dump(clusterer, 'untrained-hdbscan_cluster-job.joblib')
clusterer.fit(loaded_distance_matrix)
# Snapshot the fitted estimator.
dump(clusterer, 'trained-hdbscan_cluster-job.joblib')
# Per-sample cluster labels and cluster-membership scores from the fit.
y_pred = clusterer.labels_
y_prob = clusterer.probabilities_

# Plot clusters
# plt = df.groupby(y_pred).agg('mean').T.plot()
# Add the cluster labels to the DataFrame (labels follow the rows of train_df, which the distance matrix was built from)
# df_with_labels = train_df.assign(cluster_labels=y_pred)

# # Plot clusters
# plt = df_with_labels.groupby('cluster_labels').agg('mean').T.plot()