This step is the data-mining phase. The first steps allow one to extract a range (EPS) value by creating a four-distance plot. See the original publication, Ester et al. (1996), for an in-depth explanation of the four-distance plot. Alternatively, the method section of my thesis has an example which explains the steps on an artificial set of data that has already been clustered. I've also included the code for how that works in this folder as DBSCAN_Example.

In [1]:
#Load in relevant libraries
import math

import numpy as np

from astropy import units as u
from astropy import coordinates as coord
from astropy.coordinates import SkyCoord, ICRS, Galactocentric, Distance, LSR
from astropy.coordinates import FK5
from astropy.coordinates import CartesianRepresentation, CartesianDifferential
from astropy.table import Table, vstack, hstack
from astropy.table import Column
from astropy.io import ascii

import gala.potential as gp
import gala.dynamics as gd
from gala.units import galactic

from scipy import stats

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler, normalize


import time

import matplotlib.pyplot as plt

import pandas as pd

import seaborn as sns

In [2]:
#Read the subset that contains the actions; this is the table the
# datamining below operates on.
t = Table.read('Step_2_Output.fits')

#Report how many stars were loaded
print(f"Total Number of Stars in Dataset: {len(t)}")

Total Number of Stars in Dataset: 6789


In [3]:
#This cell converts the three columns named below into an (n_stars, 3)
# array, one row per star, which is the input format DBSCAN expects.
x_val='Lz'
y_val='L_perp'
z_val='Energy'

#The three columns are placed side by side in a single call. Growing the
# array one row at a time with np.vstack re-copies every earlier row on
#  each iteration, which becomes very slow for thousands of stars.
X = np.column_stack([np.asarray(t[name]) for name in (x_val, y_val, z_val)])

#Each feature is standardised (zero mean, unit variance by StandardScaler's
# defaults) so that no single axis dominates the euclidean distances.
X = StandardScaler().fit_transform(X)

In [4]:
#This body of code calculates the 4-distance of each datapoint as specified
# in the literature, i.e. the euclidean distance from each point to its
#  fourth nearest neighbour. The values are then sorted in ascending order
#   and paired with a rank label, ready for the 4-distance plot.

#When one point's distances are put in ascending order, index 0 is the
# point's zero distance to itself, so index 4 is the fourth nearest neighbour.
k_neighbours = 4
n_points = len(X)
if n_points <= k_neighbours:
    raise ValueError("Need more than 4 points to compute a 4-distance, got " + str(n_points))

four_dist = np.empty(n_points)
for j in range(n_points):
    #Distances from point j to every point are computed in one vectorised
    # pass instead of a second Python loop that stacks one value at a time.
    kdist = np.sqrt(((X - X[j]) ** 2).sum(axis=1))

    #np.partition puts the k-th smallest value at index k without needing
    # a full sort of all the distances.
    four_dist[j] = np.partition(kdist, k_neighbours)[k_neighbours]

    print("Loops Left: " + str(n_points-j), end="\r")

#This section prepares the 4-dist graph which one can extract the
# EPS distance from. This is decided by noting where the homogeneous valley
#  starts and choosing the required point value.

#The four distances are sorted in ascending order
four_dist=np.sort(four_dist)

#This creates a rank label for each distance to be plotted in a scatter plot
points = np.arange(n_points)

#We flip the order of the points so that the plot shows larger distances first like in the literature
points = np.flip(points,axis=0)

Loops Left: 6544

KeyboardInterrupt: 

In [5]:
#y_space sets the spacing of the y-axis ticks (and therefore of the
# horizontal grid lines). Adjust it so that a grid line sits on the point
#  where the curve shows the DBSCAN "kick" that the EPS value is read from.
y_space = 0.15

#Occasionally the drop is difficult to see, this is usually due to the
# large distance values which muck with the resolution. If this happens
#  deleting the extremum points should lead to the kick showing up.

#Value Deleter
#Number of the largest four distances to be deleted before plotting.
x=0
if x > 0:
    #four_dist is in ascending order, so its tail holds the largest values.
    # Both arrays are trimmed from the same end by the same amount so that
    #  every remaining distance keeps its own rank label.
    four_dist = four_dist[:-x]
    points = points[:-x]

#The four distances and their labels are plotted on a scatterplot so that
# the distance can be extracted visually
plt.figure(figsize=(15,15))

#The data is drawn once; the returned scatter artist keeps the name 'fig'
# that this cell has always assigned.
fig = plt.scatter(points,four_dist,s=1)

#The grid switch is passed positionally because its keyword name differs
# between Matplotlib versions ('b' in old releases, 'visible' in new ones).
plt.grid(True,linewidth=1)
plt.title("DBSCAN 4-Distance Plot: Inside Solar Neighbourhood Halo", fontsize=28)
plt.ylabel("Normalised Distance Value",fontsize=32)
plt.xlabel("Distance Rank",fontsize=32)
plt.yticks(np.arange(min(four_dist), max(four_dist)+1, y_space))
plt.xticks(fontsize=24)
plt.yticks(fontsize=24)

#If you need to save the figure de-comment the command below
# (savefig replaces an existing file of the same name by itself)
#plt.savefig('Fig2_4dist.png', dpi=150)

NameError: name 'points' is not defined

<Figure size 1080x1080 with 0 Axes>

In [6]:
#Using the graph you can see the line becomes smooth at around 0.328
# so that is what is used as the DBSCAN EPS value.
EPS = 0.328

#This block of code runs the DBSCAN and informs the user how many clusters
# were in fact detected.

#The main block of DBSCAN code, this is adapted from the example provided
# by the sci-kit learn package. min_samples=4 matches the 4-distance used
#  to choose EPS.
db = DBSCAN(eps=EPS, min_samples=4).fit(X)

#One label per row of X; DBSCAN marks noise points with the label -1
labels = db.labels_

#Boolean mask that is True for the core samples found by DBSCAN
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

#The number of estimated clusters found using the EPS and min samples of 4
# is displayed for the user
print(f'Estimated number of clusters: {n_clusters_}')

#The labels are attached to the table as the 'cluster_label' column. Item
# assignment adds the column on the first run and replaces it on later
#  runs, so this cell can be re-executed (e.g. with a different EPS)
#   without a duplicate-column error.
t['cluster_label'] = labels

print("Total Number of Stars in Dataset: " + str(len(t)))

Estimated number of clusters: 10
Total Number of Stars in Dataset: 6789


In [7]:
#The table, which now carries the cluster label added in the previous cell,
# is saved as a new FITS file. overwrite=True replaces any file of the same
#  name left over from an earlier run.
t.write('Step_3_Output.fits',overwrite=True)