In [1]:
import pandas as pd
from sklearn.cluster import DBSCAN
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  
import plotly.express as px

In [2]:
# Read the dataset from the CSV file located relative to this notebook.
df = pd.read_csv(filepath_or_buffer="../../preprocessing/data_preprocessed.csv")

In [3]:
# Peek at the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,a (AU),e,i (deg),w (deg),Node (deg),M (deg),q (AU),Q (AU),H (mag),MOID (AU),class
0,1.078066,0.826854,22.825495,31.382966,88.010681,215.528772,0.1867,1.97,16.9,0.034507,APO*
1,1.245304,0.335342,13.337482,276.893024,337.207958,104.155607,0.8277,1.66,15.6,0.030669,APO*
2,1.470264,0.559922,6.352995,285.852564,35.736768,174.626213,0.647,2.29,16.25,0.025795,APO*
3,1.776025,0.650141,39.832538,267.791993,356.903343,173.188556,0.6214,2.93,15.2,0.003551,APO*
4,1.874123,0.764602,1.326399,43.388048,349.694944,235.158622,0.4412,3.31,18.8,0.011645,APO*


In [4]:
# Separate the 'class' column (kept as y) from all remaining columns (X).
y = df['class']
X = df.drop(columns='class')

In [5]:
# Standardizer with its default behaviour spelled out: centre every column
# on its mean and scale it to unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)

In [6]:
# Learn the per-column scaling statistics from X, then apply them to X.
X_scaled = scaler.fit(X).transform(X)

In [7]:
# Re-wrap the scaled values in a DataFrame so the columns keep the names from X.
X_scaled = pd.DataFrame(data=X_scaled, columns=X.columns)

In [8]:
# Show the standardized feature table (pandas abbreviates long frames when rendering).
X_scaled

Unnamed: 0,a (AU),e,i (deg),w (deg),Node (deg),M (deg),q (AU),Q (AU),H (mag),MOID (AU)
0,-1.147774,1.670912,0.819440,-1.496342,-0.820986,0.323183,-2.577495,-0.674815,-2.038149,0.779797
1,-0.856272,-1.069622,0.002458,0.968756,1.605575,-0.715209,0.322343,-0.954966,-2.901665,0.511570
2,-0.464159,0.182573,-0.598953,1.058716,-1.330004,-0.058174,-0.495131,-0.385627,-2.469907,0.170940
3,0.068791,0.685609,2.283861,0.877375,1.797359,-0.071578,-0.610944,0.192749,-3.167362,-1.383629
4,0.239779,1.323810,-1.031777,-1.375803,1.727168,0.506203,-1.426156,0.536160,-0.776088,-0.817963
...,...,...,...,...,...,...,...,...,...,...
1742,-0.128517,-0.806755,-1.070678,1.250775,0.475525,1.223676,1.223057,-0.376590,0.738385,0.909647
1743,1.690865,1.323015,-0.161467,1.030024,0.926805,1.523349,-0.538108,1.864618,0.147209,-0.469576
1744,0.525194,0.782538,-0.379066,0.997160,-1.048068,-1.508572,-0.357151,0.617494,1.256495,-1.136718
1745,0.625111,0.710817,-0.800313,-0.351231,-1.432951,1.490838,-0.149051,0.680754,1.289707,1.473147


In [9]:
# Configure a PCA that keeps three principal components.
pca = PCA(n_components=3)

In [10]:
# Fit the principal components on the standardized features.
pca.fit(X=X_scaled)

In [11]:
# Project the standardized features onto the fitted principal components.
X_pca = pca.transform(X=X_scaled)

In [12]:
# Build one column name per fitted principal component. The count is read
# from the fitted PCA instead of repeating the literal 3, so the names stay
# in sync with whatever n_components the PCA was configured with.
pca_names = [f'pca_{i}' for i in range(pca.n_components_)]
pca_names

['pca_0', 'pca_1', 'pca_2']

In [13]:
# Wrap the projected coordinates in a DataFrame labelled with the PCA column names.
X_pca = pd.DataFrame(data=X_pca, columns=pca_names)

In [14]:
# Show the PCA coordinates table (pandas abbreviates long frames when rendering).
X_pca

Unnamed: 0,pca_0,pca_1,pca_2
0,0.058915,1.679911,2.961399
1,-1.305982,-1.313795,1.902921
2,-0.001329,0.250895,1.478462
3,0.614681,-0.602997,3.644217
4,1.488232,1.396926,0.372212
...,...,...,...
1742,-0.700643,-0.488094,-1.035565
1743,2.838650,0.668823,0.351246
1744,0.967316,1.099285,-1.128839
1745,1.114837,0.586035,-0.893929


In [15]:
# DBSCAN configuration: eps is the neighbourhood radius, min_samples the number
# of points (the point itself included) required for a core point.
# NOTE(review): with these values the labels listed further down show 65
# clusters plus noise; confirm that eps=0.1 / min_samples=2 are intended.
model = DBSCAN(
    eps=0.1,
    min_samples=2,
)

In [16]:
# Run the clustering on the three PCA coordinates.
model.fit(X=X_pca)

In [17]:
# Per-sample cluster labels assigned by DBSCAN; scikit-learn uses -1 for noise points.
model.labels_

array([-1, -1, -1, ..., 61, -1, -1])

In [18]:
# List the distinct labels that were assigned (sorted, so -1 for noise comes first).
np.unique(model.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64])

In [24]:
# Attach each point's DBSCAN label to its PCA coordinates.
# The labels are cluster identifiers (-1 meaning noise), i.e. categories, not
# quantities. They are cast to strings so Plotly assigns one discrete colour
# per cluster instead of interpolating along a continuous colour scale, which
# would suggest an ordering between clusters that does not exist.
# `assign` attaches the labels positionally, so no index alignment is involved.
data = X_pca.assign(**{'Cluster color': model.labels_.astype(str)})

# Keep the legend in numeric label order (np.unique returns sorted labels);
# otherwise the string labels would appear in order of first occurrence.
label_order = [str(label) for label in np.unique(model.labels_)]

fig = px.scatter_3d(
    data,
    x='pca_0',
    y='pca_1',
    z='pca_2',
    color='Cluster color',
    category_orders={'Cluster color': label_order},
    title='DBSCAN Clustering'
)

fig.update_layout(scene=dict(
    xaxis_title='PCA_0',
    yaxis_title='PCA_1',
    zaxis_title='PCA_2'
))
# Small, slightly transparent markers keep overlapping points distinguishable.
fig.update_traces(marker=dict(size=5, opacity=0.7))

fig.show()
