In [65]:
import sys, getopt
import pickle
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
import warnings
import time
from TSKFS.fuzzy_cluster import ESSC
from subspaceClustering.cluster.selfrepresentation import ElasticNetSubspaceClustering
from sklearn.pipeline import Pipeline
import hdbscan
import umap
from tqdm import tqdm
import itertools
import time
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


class BaseAlg:
    """Common wrapper around a clustering estimator used by the grid search.

    Attributes:
        parameters: dict mapping a parameter name to the values to try.
        algorithm: the wrapped estimator; subclasses assign it.
        classes_: labels captured by the most recent ``fit`` (``None`` before).
    """

    def __init__(self):
        self.classes_ = None
        self.algorithm = None
        self.parameters = {}

    # Can be changed
    def set_classes(self, X):
        """Copy the wrapped estimator's ``labels_`` into ``classes_``.

        ``X`` is unused here; it is part of the hook signature so that
        overriding implementations may use the data.
        """
        estimator = self.algorithm
        self.classes_ = estimator.labels_

    def fit(self, X):
        """Fit the wrapped estimator on ``X`` and record the labels.

        Returns:
            Wall-clock seconds spent inside the estimator's ``fit`` call
            (label extraction is not included in the timing).
        """
        began = time.time()
        self.algorithm.fit(X)
        elapsed = time.time() - began
        self.set_classes(X)
        return elapsed

    def get_classes(self):
        """Return the stored labels, or warn and return ``None`` if unset."""
        if self.classes_ is None:
            warnings.warn('Classes have not been set')
            return None
        return self.classes_

    def check_params(self, parameter):
        """Warn for every name in ``parameter`` the estimator does not hold.

        Membership is tested against the estimator's instance ``__dict__``.
        """
        unknown = [key for key in parameter
                   if key not in self.algorithm.__dict__]
        for key in unknown:
            warnings.warn(f'{key} not in algorithm')

    # Can be changed
    def helper_set(self, parameter):
        """Forward ``parameter`` to the estimator's own ``set_params``."""
        estimator = self.algorithm
        estimator.set_params(**parameter)

    # parameter should be a dictionary
    def set_params(self, parameter):
        """Validate the names in ``parameter`` (a dict), then apply them."""
        self.check_params(parameter)
        self.helper_set(parameter)

    def get_params(self):
        """Return the parameter grid stored on this wrapper."""
        return self.parameters


class Kmeans(BaseAlg):
    """Mini-batch k-means wrapper together with its search grid."""

    def __init__(self):
        # Start from the shared defaults (classes_ = None), then override
        # the estimator and the grid.
        super().__init__()
        self.algorithm = MiniBatchKMeans(random_state=0)
        self.parameters = {
            'n_clusters': np.arange(2, 30),
            'batch_size': [128, 256, 512, 1024],
        }


class ESSCGrid(BaseAlg):
    """ESSC fuzzy-clustering wrapper together with its search grid."""

    def __init__(self):
        self.classes_ = None
        self.algorithm = ESSC(None)
        self.parameters = {
            'n_cluster': np.arange(2, 15, 1),
            'eta': [0, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9],
            'gamma': [1, 2, 5, 10, 50, 100, 1000],
        }

    def set_classes(self, X):
        """Store one hard label per item returned by ``predict(X)``.

        Each item is reduced with ``np.argmax``; presumably ``predict``
        yields per-cluster membership scores for every sample (TODO confirm
        against the ESSC implementation).
        """
        predicted = self.algorithm.predict(X)
        self.classes_ = [np.argmax(row) for row in predicted]

    # parameter should be a dictionary
    def helper_set(self, parameter):
        """Assign every entry of ``parameter`` as an attribute of the estimator."""
        for key, value in parameter.items():
            setattr(self.algorithm, key, value)


class ENSC(BaseAlg):
    """Elastic-net subspace clustering wrapper together with its search grid."""

    def __init__(self):
        self.classes_ = None
        self.algorithm = ElasticNetSubspaceClustering(algorithm='spams')
        # NOTE: an 'affinity ' axis (['symmetrize', 'nearest_neighbors'])
        # was left disabled in the original grid.
        self.parameters = {
            'n_clusters': np.arange(2, 15, 1),
            'tau': np.linspace(0.1, 1, 4),
            'gamma': [5, 50, 100, 500],
        }

    def helper_set(self, parameter):
        """Assign every entry of ``parameter`` as an attribute of the estimator."""
        for key, value in parameter.items():
            setattr(self.algorithm, key, value)


class UHDBSCAN(BaseAlg):
    """UMAP followed by HDBSCAN, wrapped as a two-step sklearn ``Pipeline``.

    Grid keys use the ``<step>__<param>`` form so they can be passed
    straight to ``Pipeline.set_params``; the step names are
    ``'DimReduction'`` and ``'Clustering'``.
    """

    def __init__(self):
        grid = {
            'DimReduction__n_neighbors': np.arange(30, 100, 20),
            'DimReduction__min_dist': np.linspace(0, 1, 3),
            'DimReduction__n_components': np.arange(1, 100, 25)[::-1],
            'Clustering__min_cluster_size': [2, 25, 50, 75, 100],
            'Clustering__min_samples': [2, 25, 50, 75, 100],
            'Clustering__cluster_selection_epsilon': [0.1, 0.5, 1],
            'Clustering__cluster_selection_method': ['eom', 'leaf'],
        }
        # Convert numpy grids to plain Python lists. The previous check used
        # ``type(np.array)``, which is the type of a builtin function and
        # therefore never matched an array, so the conversion never ran.
        self.parameters = {
            name: values.tolist() if isinstance(values, np.ndarray) else values
            for name, values in grid.items()
        }
        self.algorithm = Pipeline([
            ('DimReduction', umap.UMAP()),
            ('Clustering', hdbscan.HDBSCAN()),
        ])
        self.classes_ = None

    def set_classes(self, X):
        """Store the labels held by the pipeline's ``'Clustering'`` step."""
        self.classes_ = self.algorithm['Clustering'].labels_

    def check_params(self, parameter):
        """Warn for every name in ``parameter`` unknown to the pipeline.

        Names are checked against ``Pipeline.get_params()`` rather than the
        instance ``__dict__`` because the grid keys address nested steps.
        """
        known = self.algorithm.get_params()
        for name in parameter:
            if name not in known:
                warnings.warn(f'{name} not in algorithm')

    def helper_set(self, parameter):
        """Forward ``parameter`` to the pipeline's ``set_params``."""
        self.algorithm.set_params(**parameter)


class Gridsearch():
    """Exhaustive grid search over the parameter grid of one clustering wrapper.

    Every parameter combination is fitted on ``X`` and scored with three
    internal cluster-validity metrics; the collected rows are pickled as a
    pandas DataFrame.

    Args:
        name: which wrapper to use: 'kmeans', 'essc', 'ensc' or 'dbscan'.
        X: the data passed to every fit and to every metric.
        results_dir: directory the result pickle is written to. The default
            is the location that was previously hard-coded.

    Raises:
        ValueError: if ``name`` is not one of the supported algorithms.
    """

    def __init__(self, name, X,
                 results_dir='/home/g0017139/UMCG_Thesis/Working_Code/Results'):
        self.name = name
        self.algorithm = self.get_alg(name)
        self.scores = []
        self.parameters = self.algorithm.get_params()
        self.X = X
        self.results_dir = results_dir

    def start(self):
        """Fit and score every combination, then pickle the score table.

        The table is also written when the loop is cut short (for example by
        Ctrl-C) after at least one combination finished, so a long run does
        not lose its completed work. The interrupting exception still
        propagates.
        """
        # Materialised so tqdm knows the total number of iterations.
        combinations = list(self.product_dict(**self.parameters))
        finished = False
        try:
            for parameter in tqdm(combinations):
                # Fit alg
                self.algorithm.set_params(parameter)
                fit_time = self.algorithm.fit(self.X)
                current_scores = self.get_score(self.algorithm.get_classes())
                current_scores['Fit_Time'] = fit_time
                current_scores.update(parameter)
                self.scores.append(current_scores)
            finished = True
        finally:
            # Skip the write only when the run failed before producing a row.
            if finished or self.scores:
                self._save_scores()

    def _save_scores(self):
        """Pickle ``self.scores`` as a DataFrame under ``results_dir``."""
        # The timestamp keeps successive runs from overwriting each other.
        path = f'{self.results_dir}/{self.name}{time.time()}.pkl'
        with open(path, 'wb') as f:
            pickle.dump(pd.DataFrame(self.scores), f, pickle.HIGHEST_PROTOCOL)

    def get_alg(self, name):
        """Return a fresh wrapper instance for ``name``.

        Raises:
            ValueError: if ``name`` is not a supported algorithm.
        """
        if name == 'kmeans':
            class_alg = Kmeans()
        elif name == 'essc':
            class_alg = ESSCGrid()
        elif name == 'ensc':
            class_alg = ENSC()
        elif name == 'dbscan':
            class_alg = UHDBSCAN()
        else:
            raise ValueError(f'Algorithm {name} is not implemented chose "kmeans" "essc" "ensc" "dbscan"')
        return class_alg

    def get_score(self, labels):
        """Score ``labels`` against ``self.X`` with three validity metrics.

        Returns:
            dict with keys 'silhouette_score', 'calinski_harabasz_score' and
            'davies_bouldin_score'. A metric that raises is recorded as NaN
            so one degenerate labelling does not abort the search.
        """
        metrics = {
            'silhouette_score':
                lambda: silhouette_score(self.X, labels, metric='euclidean'),
            'calinski_harabasz_score':
                lambda: calinski_harabasz_score(self.X, labels),
            'davies_bouldin_score':
                lambda: davies_bouldin_score(self.X, labels),
        }
        result = {}
        for key, compute in metrics.items():
            try:
                result[key] = compute()
            # Exception, not a bare except: KeyboardInterrupt and SystemExit
            # must still be able to stop a long-running search.
            except Exception:
                result[key] = np.nan
        return result

    def product_dict(self, **kwargs):
        """Yield one dict per element of the Cartesian product of the values.

        Each keyword maps a parameter name to an iterable of candidates; every
        yielded dict maps each name to one chosen candidate.
        """
        keys = list(kwargs)
        for instance in itertools.product(*kwargs.values()):
            yield dict(zip(keys, instance))


# sep=None with the python engine lets pandas sniff the delimiter;
# header=None treats the first row as data.
df = pd.read_csv(
    '/data/g0017139/MAFIA/gene_expression_PCA.dat',
    sep=None,
    engine='python',
    header=None,
)
# Run the full UMAP + HDBSCAN grid on the loaded frame.
search = Gridsearch('dbscan', df)
search.start()


  0%|          | 2/7200 [00:43<43:48:43, 21.91s/it]


KeyboardInterrupt: 

In [None]:
# Reload the score table written by an earlier grid-search run.
results_path = "/home/g0017139/UMCG_Thesis/Working_Code/Results/dbscan1632749303.2476459.pkl"
with open(results_path, 'rb') as f:
    tests = pickle.load(f)

In [58]:
# Show the 20 rows with the highest silhouette score.
tests.sort_values(by='silhouette_score', ascending=False).head(20)

Unnamed: 0,silhouette_score,calinski_harabasz_score,davies_bouldin_score,Fit_Time,DimReduction__n_neighbors,DimReduction__min_dist,DimReduction__n_components,Clustering__min_cluster_size,Clustering__min_samples,Clustering__cluster_selection_epsilon,Clustering__cluster_selection_method
51,0.10435,66.932434,3.049101,6.36767,30,1.0,1,2,2,1.0,leaf
59,0.10435,66.932434,3.049101,6.400951,30,1.0,1,100,2,1.0,leaf
58,0.10435,66.932434,3.049101,6.319919,30,1.0,1,100,2,1.0,eom
95,0.10435,66.932434,3.049101,7.764699,80,0.0,1,100,100,1.0,leaf
82,0.10435,66.932434,3.049101,7.563775,80,0.0,1,2,2,1.0,eom
84,0.10435,66.932434,3.049101,7.521032,80,0.0,1,2,100,0.1,eom
123,0.10435,66.932434,3.049101,7.134686,80,1.0,1,100,2,1.0,leaf
56,0.10435,66.932434,3.049101,6.102727,30,1.0,1,100,2,0.1,eom
90,0.104283,66.785719,3.045097,7.14593,80,0.0,1,100,2,1.0,eom
120,0.104283,66.785719,3.045097,7.490159,80,1.0,1,100,2,0.1,eom
