In [2]:


# from scluster import SCluster
import pandas as pd

from pathlib import Path

from sklearn.cluster import KMeans, MeanShift
from sklearn.metrics import silhouette_score
from hdbscan import HDBSCAN



In [4]:
# define variables
data= pd.read_csv(Path('../data/raw/data_3.csv'))

In [5]:
data

Unnamed: 0,num,x,y
0,1,32.205609,31.794388
1,2,31.231352,33.378132
2,3,30.438987,33.125897
3,4,30.062514,32.160704
4,5,29.671359,33.337847
...,...,...,...
7315,7316,31.664378,33.534468
7316,7317,31.662992,33.529399
7317,7318,31.666121,33.534270
7318,7319,31.662232,33.530855


In [9]:

# dictionary of clustering functions:
CLUSTERING = {'kmeans' : lambda df, k: KMeans(n_clusters = k).fit(df).labels_,
             'hdbscan' : lambda df, s: HDBSCAN(min_cluster_size = s).fit(df).labels_,
            'meanshift': lambda df, q: MeanShift(bandwidth=0.025*min(q,50)).fit(df).labels_}

# the clustering by silhouette object
class SCluster:
    """
    The object that calculate the clustering by silhouette.
    for each step, the object calculate the clustering labels,
    and then, calculate for each result the silhouette score.
    next, this code choose the labels with the best score.
    """
    def __init__(self,typ='kmeans', org=2, lim=20, stp=1, dup=0.95):
        """
        initialize the object
        :param typ: clustering type
        :param org: first value in the loop
        :param lim: last value in the loop
        :param stp: # values between each step
        :param dup: value for fix dataframe row length for silhouette
        """
        # initial parameters
        self.type= typ
        self.org = org
        self.lim = lim+1
        self.stp = stp
        self.dup = dup

        # clustering function
        self.function = CLUSTERING[self.type.lower()]

        # values for calculation
        self.max = -1
        self.scores = {}
        self.labels_= []

    def adapt_silhouette(self,labels):
        """
        calculate the silhouette value for the given dataframe
        :param labels: cluster labels
        :return: the dataframe silhouette score for the given labels
        """
        data, labels= self.df[labels > -1], labels[labels > -1]
        if data.shape[0] == 0: return -1
        while True:
            try:
                return silhouette_score(data, labels, sample_size=self.size)*(labels.shape[0]/self.n)
            except:
                self.size = int(self.size*self.duf)

    def fit(self,data):
        """
        fit the optimal cluster labels to the data
        :param data: input dataframe
        """
        self.n = data.shape[0]
        self.size = self.n
        self.df = data
        for i in range(self.org, self.lim , self.stp):
            label = self.function(self.df, i)
            silho = self.adapt_silhouette(label)
            self.scores[silho] = label
            self.max = silho if self.max<silho else self.max
            print(f'cluster kind: {self.type}, input value = {i}, silhouette = {round(silho,3)}')
        self.labels_ = self.scores[self.max]
        return self

In [12]:
typ = 'kmeans'
org = 4 
lim = 10 
stp = 1

# application
data['labels'] = SCluster(typ=typ, org=org ,lim=lim, stp=stp).fit(data).labels_

cluster kind: kmeans, input value = 4, silhouette = 0.572
cluster kind: kmeans, input value = 5, silhouette = 0.561
cluster kind: kmeans, input value = 6, silhouette = 0.553
cluster kind: kmeans, input value = 7, silhouette = 0.548
cluster kind: kmeans, input value = 8, silhouette = 0.544
cluster kind: kmeans, input value = 9, silhouette = 0.541
cluster kind: kmeans, input value = 10, silhouette = 0.538


In [14]:
data['labels'].unique()

array([0, 2, 3, 1], dtype=int32)