In [None]:
import numpy as np
import cupy as cp 
import gc 
import plotly.graph_objects as go
from sklearn.cluster import DBSCAN
from scipy.spatial import KDTree
import pickle



In [None]:
# Load the distribution vectors from disk.
# The file is treated as a stream of double-quoted Python list literals:
# splitting on '"' leaves one literal per non-blank fragment.
import ast

with open('distribution_vectors.csv', 'r') as csv_file:
    data = csv_file.read()

# Drop whitespace-only fragments (the separators between quoted literals)
# and parse every remaining fragment into a Python object.
fragments = data.strip().split('"')
parsed_rows = [ast.literal_eval(fragment) for fragment in fragments if fragment.strip()]

# Build the array at numpy's default precision first, then downcast to half
# precision (same two-step conversion order as before).
vectors = np.array(parsed_rows)
vectors = vectors.astype(np.float16)

# Keep only rows whose column 1 is at most 3
# (column 1 is the one plotted as "mean" elsewhere in this notebook).
mask = np.less_equal(vectors[:, 1], 3)
vectors = vectors[mask]

# Fold column 2 onto its absolute value so the sign is ignored downstream.
vectors[:, 2] = np.absolute(vectors[:, 2])

In [None]:
# Preview the loaded vectors as a 3D scatter plot, drawing only every
# ds_f-th row to keep the figure light.
ds_f = 4

# Thin the rows once, then pull out the three plotted columns.
sampled = vectors[::ds_f]
x = sampled[:, 1]
y = sampled[:, 2]
z = sampled[:, 3]
c = sampled[:, 2]  # marker colour follows the same column as the y axis

scatter = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    marker={'size': 1.3, 'color': c, 'opacity': 1},
)
fig = go.Figure(data=[scatter])

# Orthographic camera: no perspective foreshortening between near and far points.
scene_layout = {
    'xaxis_title': "mean",
    'yaxis_title': "skew",
    'zaxis_title': "azimuthal angle",
    'camera': {'projection': {'type': "orthographic"}},
}

# Kept as the cell's final expression: there is no fig.show() in this cell,
# so the notebook displays whatever this call returns.
fig.update_layout(
    title="3D Scatter of Vectors",
    scene=scene_layout,
    width=1000,
    height=1000,
)

In [None]:
# Density-based clustering of the loaded vectors.
# NOTE: despite the variable name `hdbscan_refined`, the estimator built here
# is scikit-learn's plain DBSCAN, not HDBSCAN.
dbscan_params = {
    "min_samples": 150,     # neighbours required for a core point
    "metric": "chebyshev",  # distance = largest per-coordinate difference
    "eps": 0.05,            # neighbourhood radius under that metric
    "n_jobs": 3,            # parallel workers for the neighbour search
}
hdbscan_refined = DBSCAN(**dbscan_params)

# One integer label per row of `vectors`; later cells treat -1 as "unclustered".
labels = hdbscan_refined.fit_predict(vectors)


In [None]:
# Plot the clustered points (rows labelled -1 are excluded), coloured by
# cluster label.

# Run a garbage-collection pass before building the next large figure.
# (The original line was the bare attribute `gc.collect`, which never ran the
# collector because it was missing the call parentheses.)
gc.collect()

# Downsample the data: only every ds_f-th clustered row is drawn.
ds_f = 4

# NOTE: `filter` shadows the Python builtin of the same name. The name is kept
# so any other cell that reads this mask keeps working.
filter = labels != -1

# Apply the boolean mask to `vectors` once and reuse the result, instead of
# re-indexing (and re-copying) the full array separately for each axis.
clustered_vectors = vectors[filter][::ds_f]
x = clustered_vectors[:, 1]
y = clustered_vectors[:, 2]
z = clustered_vectors[:, 3]
c = labels[filter][::ds_f]

fig = go.Figure(data=[go.Scatter3d(
    x=x, y=y, z=z,
    mode='markers',
    marker=dict(
        size=1.3,
        color=c,
        colorscale='tealrose',  # You can adjust the colorscale here
        opacity=1,
        colorbar=dict(title="Label")  # Add color legend
    )
)])

fig.update_layout(
    title="3D Scatter of Vectors",
    scene=dict(
        xaxis_title="mean",
        yaxis_title="skew",
        zaxis_title="azimuthal angle",
        camera=dict(projection=dict(type="orthographic"))
    ),
    width=1000,
    height=1000
)

fig.show()


In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so
# evaluating it raises NameError. Presumably this is a deliberate barrier that
# halts "Run All" before the pickle-export cell that follows — confirm before
# removing or defining a variable with this name.
stop

In [None]:
# Export the clustered points: build a KD-tree over them and pickle it
# together with their labels.

# Keep only rows that received a cluster label (anything other than -1).
mask = np.not_equal(labels, -1)
return_vectors = vectors[mask]

# Shift the kept labels up by one, so the smallest possible stored label is 1.
defined_labels = np.add(labels[mask], 1)
print(np.unique(defined_labels))

# Spatial index over the kept vectors; row i of the tree's data corresponds
# to defined_labels[i].
tree = KDTree(return_vectors)

# Store the tree and its labels as a single (tree, labels) tuple.
with open('cluster_kd_tree.pkl', 'wb') as pkl_file:
    pickle.dump((tree, defined_labels), pkl_file)

