In [25]:
import getopt
import itertools
import os
import pickle
import subprocess
import sys
import time
import warnings

import hdbscan
import numpy as np
import pandas as pd
import umap
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.pipeline import Pipeline
from tqdm import tqdm

from TSKFS.fuzzy_cluster import ESSC
from subspaceClustering.cluster.selfrepresentation import ElasticNetSubspaceClustering


class BaseAlg():
    """Common wrapper interface for the clustering algorithms used in the grid search.

    Subclasses are expected to fill in ``parameters`` (the grid of candidate
    values per hyper-parameter) and ``algorithm`` (the wrapped estimator).
    After ``fit`` the per-sample labels are available through ``get_classes``.
    """

    def __init__(self):
        self.parameters = {}    # hyper-parameter name -> candidate values
        self.algorithm = None   # wrapped estimator, supplied by subclasses
        self.classes_ = None    # labels of the most recent fit

    # Can be changed
    def set_classes(self, X):
        """Store the labels produced by the wrapped estimator's last fit.

        ``X`` is not used here; it is part of the signature so that
        subclasses can derive labels from the data.
        """
        self.classes_ = self.algorithm.labels_

    def fit(self, X):
        """Fit the wrapped estimator on ``X`` and return the elapsed seconds.

        Only the estimator's own ``fit`` call is timed; collecting the labels
        afterwards is not included.
        """
        began = time.time()
        self.algorithm.fit(X)
        elapsed = time.time() - began
        self.set_classes(X)
        return elapsed

    def get_classes(self):
        """Return the stored labels, or warn and return ``None`` if there are none."""
        if self.classes_ is None:
            warnings.warn('Classes have not been set')
            return None
        return self.classes_

    def check_params(self, parameter):
        """Warn for every name in ``parameter`` that is not an attribute of the estimator."""
        for name in parameter:
            if name in self.algorithm.__dict__:
                continue
            warnings.warn(f'{name} not in algorithm')

    # Can be changed
    def helper_set(self, parameter):
        """Forward the ``parameter`` dictionary to the estimator's ``set_params``."""
        self.algorithm.set_params(**parameter)

    # parameter should be a dictionary
    def set_params(self, parameter):
        """Check the names in ``parameter`` and then apply them to the estimator."""
        self.check_params(parameter)
        self.helper_set(parameter)

    def get_params(self):
        """Return the hyper-parameter grid of this wrapper."""
        return self.parameters


class MAFIA(BaseAlg):
    """Grid-search wrapper around the external ``cppmafia`` executable.

    MAFIA is not a Python estimator: ``fit`` writes the data to ``X.dat`` in
    the working directory, runs the binary on it, and ``set_classes`` then
    reads the ``*.idx`` files found in that directory to build one label per
    sample.

    Parameters
    ----------
    directory : str
        Working directory where ``X.dat`` is written and the ``*.idx`` result
        files are read from (and deleted after reading).
    binary : str
        Path to the ``cppmafia`` executable.
    """

    def __init__(self, directory='/data/g0017139/MAFIA',
                 binary='/home/g0017139/UMCG_Thesis/Working_Code/bin/cppmafia'):
        super().__init__()
        # Candidate values per cppmafia command-line flag (-a, -b, -n, -u, -M).
        self.parameters = {'a': np.linspace(0.5, 5, 5),
                           'b': np.linspace(0, 1, 4),
                           'n': np.arange(100, 2001, 250),
                           'u': np.arange(1, 11, 5),
                           'M': np.arange(15, 31, 5),
                          }
        self.current_params = None
        self.directory = directory
        self.binary = binary

    # Can be changed
    def set_classes(self, X):
        """Build ``classes_`` from the ``*.idx`` files in the working directory.

        Each ``.idx`` file lists row indices; the cluster id is the integer
        between the first ``-`` of the file name and the ``.idx`` suffix.
        Files are removed once read so the next run starts clean. Rows that
        appear in no file get label ``-1``; a row listed in several files
        keeps the label of the last file read.
        """
        rows = []
        clusters = []
        for filename in tqdm(os.listdir(self.directory)):
            if not filename.endswith(".idx"):
                continue
            path = os.path.join(self.directory, filename)
            loadedrow = pd.read_table(path, sep="  ", header=None, engine='python').values.tolist()
            os.remove(path)
            cluster_id = int(filename.split('-')[1].replace('.idx', ''))
            clusters.extend([cluster_id] * len(loadedrow))
            rows.extend(loadedrow)
        rows = np.array(rows).ravel()
        clusters = np.array(clusters, dtype=int)
        # Rows not assigned to any cluster are outliers (-1). setdiff1d finds
        # them in one vectorised pass instead of a per-row membership scan.
        outliers = np.setdiff1d(np.arange(len(X)), rows)
        rows = np.concatenate([rows, outliers])
        clusters = np.concatenate([clusters, np.full(len(outliers), -1, dtype=int)])
        rows_to_cluster = pd.DataFrame(clusters, index=rows)
        rows_to_cluster = rows_to_cluster[~rows_to_cluster.index.duplicated(keep='last')]
        rows_to_cluster = rows_to_cluster.sort_index()
        self.classes_ = rows_to_cluster.values.ravel()

    def fit(self, X):
        """Run cppmafia on ``X`` with the current parameters; return the elapsed seconds.

        Only the external process is timed. A non-zero exit status produces a
        warning rather than an exception so that a grid search can continue;
        labels are then built from whatever result files exist.

        Raises
        ------
        ValueError
            If ``set_params`` has not been called yet.
        OSError
            If the executable cannot be started (e.g. the path does not exist).
        """
        if self.current_params is None:
            raise ValueError('set_params must be called before fit')
        data_path = os.path.join(self.directory, "X.dat")
        pd.DataFrame(X).to_csv(data_path, sep=" ", header=False, index=False)
        # Argument list instead of a shell string: no quoting issues and no
        # shell interpretation of the parameter values.
        command = [self.binary, data_path]
        for flag in ('a', 'b', 'n', 'u', 'M'):
            command.extend([f'-{flag}', str(self.current_params[flag])])
        start_time = time.time()
        completed = subprocess.run(command)
        fit_time = time.time() - start_time
        if completed.returncode != 0:
            # A negative return code means the process was killed by that signal.
            warnings.warn(f'cppmafia exited with return code {completed.returncode}')
        self.set_classes(X)
        return fit_time

    # Can be changed
    def helper_set(self, parameter):
        """Remember ``parameter`` as the flag values for the next ``fit``."""
        self.current_params = parameter

    # parameter should be a dictionary
    def set_params(self, parameter):
        """Store ``parameter`` without the base-class attribute check.

        There is no wrapped estimator object to check the names against.
        """
        self.helper_set(parameter)





class Gridsearch():
    """Exhaustive grid search over one clustering algorithm wrapper.

    Every combination of the wrapper's parameter grid is fitted on ``X`` and
    scored with three internal cluster-validity indices; the collected rows
    are pickled as a DataFrame.

    Parameters
    ----------
    name : str
        One of "kmeans", "essc", "ensc", "dbscan", "mafia".
    X : array-like
        Data passed to every fit and to the scoring functions.
    results_dir : str
        Directory the result pickle is written to.
    """

    def __init__(self, name, X, results_dir='/home/g0017139/UMCG_Thesis/Working_Code/Results'):
        self.name = name
        self.algorithm = self.get_alg(name)
        self.scores = []
        self.parameters = self.algorithm.get_params()
        self.X = X
        self.results_dir = results_dir

    def start(self):
        """Fit and score every parameter combination, then pickle the results.

        The results are written in a ``finally`` clause so that the runs
        completed so far are not lost when a long search is interrupted or a
        fit raises.
        """
        combinations = list(self.product_dict(**self.parameters))
        try:
            for parameter in tqdm(combinations):
                self.algorithm.set_params(parameter)
                fit_time = self.algorithm.fit(self.X)
                current_scores = self.get_score(self.algorithm.get_classes())
                current_scores['Fit_Time'] = fit_time
                current_scores.update(parameter)
                self.scores.append(current_scores)
        finally:
            self._save_scores()

    def _save_scores(self):
        """Pickle the collected scores as a DataFrame into ``results_dir``."""
        target = f'{self.results_dir}/{self.name}{time.time()}.pkl'
        with open(target, 'wb') as f:
            pickle.dump(pd.DataFrame(self.scores), f, pickle.HIGHEST_PROTOCOL)

    def get_alg(self, name):
        """Return a new wrapper instance for ``name``.

        Raises
        ------
        ValueError
            If ``name`` is not one of the supported algorithm names.
        """
        # Kept as an if-chain so a wrapper class is only looked up when its
        # name is actually requested.
        if name == 'kmeans':
            return Kmeans()
        if name == 'essc':
            return ESSCGrid()
        if name == 'ensc':
            return ENSC()
        if name == 'dbscan':
            return UHDBSCAN()
        if name == 'mafia':
            return MAFIA()
        raise ValueError(f'Algorithm {name} is not implemented chose "kmeans" "essc" "ensc" "dbscan" "mafia"')

    def get_score(self, labels):
        """Score ``labels`` on ``self.X`` with three cluster-validity indices.

        Returns a dict with the keys ``silhouette_score``,
        ``calinski_harabasz_score`` and ``davies_bouldin_score``. A metric
        that cannot be computed is reported as NaN together with a warning
        stating the reason. ``KeyboardInterrupt`` and ``SystemExit`` are not
        intercepted.
        """
        # Lambdas defer both the name lookup and the call into the try block.
        scorers = {
            'silhouette_score': lambda: silhouette_score(self.X, labels, metric='euclidean'),
            'calinski_harabasz_score': lambda: calinski_harabasz_score(self.X, labels),
            'davies_bouldin_score': lambda: davies_bouldin_score(self.X, labels),
        }
        result = {}
        for metric_name, compute in scorers.items():
            try:
                result[metric_name] = compute()
            except Exception as err:
                warnings.warn(f'{metric_name} could not be computed: {err}')
                result[metric_name] = np.nan
        return result

    def product_dict(self, **kwargs):
        """Yield one dict per element of the Cartesian product of the keyword values."""
        keys = kwargs.keys()
        vals = kwargs.values()
        for instance in itertools.product(*vals):
            yield dict(zip(keys, instance))


# Load the input matrix (gene expression data, judging by the file name).
# sep=None makes the python engine detect the delimiter itself; header=None
# treats the first line as data rather than column names.
df = pd.read_csv("/data/g0017139/gene_expression_norm.dat", sep=None, engine='python', header=None)
# Run the whole MAFIA parameter grid on it; start() pickles the collected scores.
search = Gridsearch('mafia', df)
search.start()

  0%|          | 0/2 [00:00<?, ?it/s]

/home/g0017139/UMCG_Thesis/Working_Code/bin/cppmafia /data/g0017139/MAFIA/X.dat -a 0.2 -b 0.5 -n 100 -u 10 -M 20


  0%|          | 0/2 [02:38<?, ?it/s]


KeyboardInterrupt: 

In [27]:
# Manual run of a single cppmafia command line outside the grid search, to
# inspect its exit status. On POSIX a negative returncode -N in the resulting
# CompletedProcess means the child was terminated by signal N.
subprocess.run('/home/g0017139/UMCG_Thesis/Working_Code/bin/cppmafia /data/g0017139/MAFIA/X.dat -a 0.2 -b 0.5 -n 100 -u 10 -M 20',
        shell=True)

CompletedProcess(args='/home/g0017139/UMCG_Thesis/Working_Code/bin/cppmafia /data/g0017139/MAFIA/X.dat -a 0.2 -b 0.5 -n 100 -u 10 -M 20', returncode=-9)

In [20]:
# Load a results DataFrame pickled by an earlier grid-search run.
# NOTE: unpickling executes code from the file — only load pickles you trust.
with open("/home/g0017139/UMCG_Thesis/Working_Code/Results/mafia1632780212.037191.pkl", 'rb') as f:
        test = pickle.load(f)

In [21]:
# Display the loaded results table (scores, fit time and parameters per run).
test

Unnamed: 0,silhouette_score,calinski_harabasz_score,davies_bouldin_score,Fit_Time,a,b,n,u,M
0,,,,417.706118,0.2,0.5,100,10,20
1,,,,410.769105,0.5,0.5,100,10,20
