In [22]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

In [23]:
# Load the clustering input from the CSV next to this notebook and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer='./clustering_df.csv')
df.head(n=5)

Unnamed: 0,pca_0,pca_1,pca_2
0,0.256332,0.15015,0.345492
1,0.234291,0.138619,0.324007
2,0.252762,0.146195,0.337321
3,0.223684,0.197239,0.339346
4,0.240676,0.117453,0.320155


In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of df.
df.describe()

Unnamed: 0,pca_0,pca_1,pca_2
count,7195.0,7195.0,7195.0
mean,0.483138,0.459948,0.460233
std,0.216344,0.147716,0.119252
min,0.0,0.0,0.0
25%,0.295635,0.38479,0.387239
50%,0.493345,0.473212,0.471214
75%,0.686922,0.539564,0.523623
max,1.0,1.0,1.0


In [25]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import plotly.express as px

In [26]:
# Running state for the hyperparameter search: no model selected yet, and a
# score lower than any real value so the first scored candidate always wins.
best_model, best_score = None, float('-inf')

# NOTE(review): idx is not read by any cell visible here — confirm whether it
# is still needed.
idx = 1

In [27]:
# Hyperparameter grid for the DBSCAN search.
#
# eps candidates: 0.03, 0.04, ..., 0.09 (7 values).
# np.arange with a float step decides its length from (stop - start) / step,
# which is subject to rounding error, so the nominally excluded stop value
# (0.1) can slip into the grid and individual entries can carry float noise.
# np.linspace with an explicit point count fixes the grid size, and rounding
# to two decimals makes every candidate an exact, readable eps value.
eps_values = np.linspace(0.03, 0.09, num=7).round(2)

# min_samples candidates: 10, 20, 30, 40.
min_samples = range(10, 50, 10)

In [28]:
# Grid-search DBSCAN over every (eps, min_samples) pair and keep the fitted
# model with the highest silhouette score in best_model / best_score.

# Reset the running best here so this cell gives the same result when it is
# re-run on its own, independent of what an earlier cell left behind.
best_model = None
best_score = float('-inf')

for eps_value in eps_values:
  for sample in min_samples:
    dbscan_model = DBSCAN(eps=eps_value, min_samples=sample)
    dbscan_model.fit(df)

    # silhouette_score raises ValueError unless the labeling has between 2
    # and n_samples - 1 distinct labels. A tight eps / large min_samples can
    # mark every point as noise (a single label), so skip such candidates
    # instead of aborting the whole search.
    n_labels = len(set(dbscan_model.labels_))
    if not 1 < n_labels < len(df):
      continue

    # NOTE(review): points DBSCAN labels as noise (-1) are scored here as if
    # they formed one more cluster — confirm this is the intended metric.
    score = silhouette_score(df, dbscan_model.labels_)

    if score > best_score:
      best_score = score
      best_model = dbscan_model

# Fail here with a clear message rather than later with an AttributeError on
# None when no candidate produced a scorable clustering.
if best_model is None:
  raise RuntimeError('No (eps, min_samples) combination produced at least two distinct labels')

In [29]:
# Highest silhouette score recorded by the grid-search cell.
best_score

0.4754082509182181

In [30]:
# Parameters of the estimator kept as best_model by the grid search.
best_model.get_params()

{'algorithm': 'auto',
 'eps': 0.07,
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'min_samples': 40,
 'n_jobs': None,
 'p': None}

In [31]:
# Distinct label values assigned by the selected model.
np.unique(best_model.labels_)

array([-1,  0,  1,  2])

In [32]:
# Lookup table of ten single-letter colour codes, indexed by an integer position.
colors = np.array(list('bgrcmykbgr'))

In [33]:
# 3-D scatter of the three pca_* columns, one colour code per point.
# Labels are shifted by +1 before indexing into colors, so label -1 selects
# colors[0], label 0 selects colors[1], and so on.
# NOTE(review): colors has ten entries, so this indexing only works while the
# largest label is at most 8 — confirm that holds for other parameter grids.
px.scatter_3d(x=df['pca_0'], y=df['pca_1'], z=df['pca_2'], color=colors[best_model.labels_ + 1])

In [34]:
import os
import pickle

In [35]:
# Persist the selected DBSCAN model as a pickle under output_directory.
output_directory = '../models/clustering/'

name = 'DBSCAN_model.pkl'
try:
  # Create the target directory (and any missing parents) first: open() does
  # not create directories, so on a fresh checkout the write would otherwise
  # fail and the model would not be saved.
  os.makedirs(output_directory, exist_ok=True)
  with open(os.path.join(output_directory, name), 'wb') as file:
    pickle.dump(best_model, file)
except OSError as err:
  # Best-effort export: report the failure and let the notebook continue.
  # OSError is the class IOError aliases in Python 3, and it also covers
  # errors raised while creating the directory.
  print('Error while exporting model \'{}\''.format(name.split('.')[0]))
  print(err)