# Different base-algorithms model

In [1]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openensembles as oe
from kmodes.kmodes import KModes
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import SpectralClustering, KMeans, AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, silhouette_score
from algorithms import KModesEnsemble, SpectralEnsemble, HierarchyEnsemble
from evaluation import clustering_agreement, connectivity, jaccard
from collections import namedtuple
from time import perf_counter


# Pin the working directory to the thesis repository so that the relative
# "datasets/..." and "diversity/..." paths used in this notebook resolve.
path = r"C:\Users\Manik\repos\bachelor_thesis"
if not os.getcwd() == path:
    os.chdir(path)

In [3]:
# Record describing one benchmark: feature matrix `x`, reference labels `y`
# and `k`, the number of clusters requested from the consensus step.
Dataset = namedtuple("Dataset", "x y k")

# iris (bundled with scikit-learn)
iris = load_iris()
iris_ = Dataset(iris.data, iris.target, 3)

# cassini: features in columns "x" and "y", labels in column "classes"
cassini = pd.read_csv("datasets/cassini.csv", sep=" ")
# Re-index the rows 0..n-1. The length is taken from the frame itself rather
# than hard-coded, so a file with a different row count does not raise.
cassini.index = np.arange(len(cassini))
cassini_x = np.array(cassini[["x", "y"]])
cassini_y = np.array(cassini.classes)
cassini_ = Dataset(cassini_x, cassini_y, 3)

# yeast: fields are separated by two spaces and the file has no header row
yeast = pd.read_csv("datasets/yeast.txt", sep="  ", header=None, engine="python")
# The last column holds the class label; it is encoded to integers below.
yeast_y = yeast.iloc[:, -1]
# Columns 0 and 9 are excluded from the features
# (presumably an identifier and the label column - verify against the file).
yeast_x = yeast.drop([0, 9], axis=1)
yeast_y = LabelEncoder().fit_transform(yeast_y)
yeast_x = np.array(yeast_x)

yeast_ = Dataset(yeast_x, yeast_y, 10)

# my_own: first two columns are features, third column is the label
my_own = np.loadtxt("datasets/my_own.txt")
my_own_x = my_own[:, :2]
my_own_y = my_own[:, 2]

my_own_ = Dataset(my_own_x, my_own_y, 5)

# digits: every column but the last is a feature, the last one is the label
digits = np.loadtxt("datasets/digits.txt")
digits_x = digits[:, :-1]
digits_y = digits[:, -1]

digits_ = Dataset(digits_x, digits_y, 10)

In [4]:
# setup for diverse clusterings

# Linkage and distance options swept for the base algorithms that accept them.
linkages = ['average', 'complete', 'ward', "single"]
distances = ['euclidean', "manhattan"]

# A cluster object built on iris, used here only to ask openensembles which
# algorithms exist and which parameters each of them takes.
data = oe.data(pd.DataFrame(iris_.x), range(iris_.x.shape[1]))
c = oe.cluster(data)
a = c.algorithms_available()
paramsC = c.clustering_algorithm_parameters()

# Algorithms excluded from the ensemble.
algorithmsToRemove = ['DBSCAN', 'Birch', "HDBSCAN", "MeanShift", "AffinityPropagation", "GaussianMixture"]
for algToRemove in algorithmsToRemove:
    del a[algToRemove]

# Collections of algorithm names, keyed by the parameter they accept.
takesLinkages, takesDistances, takesK = (
    paramsC[key] for key in ('linkage', 'distance', 'K')
)

In [5]:
def diverse_clustering(algo, X, K, Ks):
    """Cluster X with an ensemble of different base algorithms.

    Every algorithm left in the module-level mapping `a` that takes a K
    parameter is run once per value in `Ks`, and additionally once per
    applicable distance / linkage setting (module-level `distances` and
    `linkages`; 'ward' linkage is run without a distance argument). The
    resulting partitions are then merged by the consensus method `algo`.

    Parameters
    ----------
    algo : str
        "KModesEnsemble" runs KModes on the matrix of base labels,
        "SpectralEnsemble" runs spectral clustering on the co-occurrence
        matrix; any other value runs average-linkage agglomerative
        clustering on `1 - co_occurrence`.
    X : array with a `shape` attribute, one row per observation
    K : number of clusters requested from the consensus step
    Ks : iterable of cluster counts used for the base clusterings

    Returns
    -------
    The labels produced by the chosen consensus estimator.
    """
    frame = pd.DataFrame(X)
    ensemble = oe.cluster(oe.data(frame, frame.columns))

    for algorithm in list(a.keys()):
        if algorithm not in takesK:
            continue

        # Work out which (name tags, extra keyword arguments) variants this
        # algorithm supports; the variants do not depend on k.
        if algorithm not in takesDistances:
            variants = [((), {})]
        elif algorithm not in takesLinkages:
            variants = [((dist,), {"distance": dist}) for dist in distances]
        else:
            variants = []
            for linkage in linkages:
                if linkage == 'ward':
                    variants.append(((linkage,), {"linkage": linkage}))
                    continue
                variants.extend(
                    [((dist, linkage), {"linkage": linkage, "distance": dist})
                     for dist in distances]
                )

        for k in Ks:
            for tags, options in variants:
                out_name = '_'.join(["parent", algorithm, *tags, str(k)])
                ensemble.cluster("parent", algorithm, out_name, K=k,
                                 Require_Unique=True, **options)

    if algo == "KModesEnsemble":
        # One column per base clustering, one row per observation.
        partitions = list(ensemble.labels.values())
        Y = np.zeros((X.shape[0], len(partitions)))
        for column, partition in enumerate(partitions):
            Y[:, column] = partition
        return KModes(n_clusters=K).fit_predict(Y)

    co_matrix = ensemble.co_occurrence_matrix().co_matrix
    if algo == "SpectralEnsemble":
        spectral = SpectralClustering(n_clusters=K, affinity="precomputed")
        return spectral.fit(co_matrix).labels_

    # The co-occurrence matrix is turned into a dissimilarity with 1 - matrix.
    # NOTE(review): newer scikit-learn releases rename this `affinity` keyword
    # to `metric` - confirm against the installed version.
    hierarchical = AgglomerativeClustering(n_clusters=K, affinity="precomputed", linkage="average")
    return hierarchical.fit(1 - co_matrix).labels_

In [6]:
def cluster_stability(X, est, Ks, n_iter=20, random_seed=50, **params):
    """Bootstrap stability scores of the ensemble clustering.

    The full data set is clustered once as a reference. Then, `n_iter`
    times, a bootstrap sample (rows drawn with replacement) is clustered
    and compared with the reference using `jaccard`, restricted to the rows
    that were actually drawn.

    Parameters
    ----------
    X : Dataset
        Only the `x` and `k` fields are read.
    est : str
        Consensus method name, forwarded to `diverse_clustering`.
    Ks : iterable of cluster counts for the base clusterings
    n_iter : number of bootstrap rounds
    random_seed : seed passed to `np.random.seed` before any clustering
    **params : accepted but not used

    Returns
    -------
    list with one `jaccard` score per bootstrap round.
    """
    points, n_clusters = X.x, X.k
    np.random.seed(random_seed)
    reference = diverse_clustering(est, points, n_clusters, Ks)
    n_points = points.shape[0]

    def bootstrap_score():
        drawn = np.random.randint(0, n_points, n_points)
        resampled_labels = diverse_clustering(est, points[drawn], n_clusters, Ks)
        # Map the bootstrap labels back onto the original row positions;
        # rows that were never drawn keep the placeholder -1.
        projected = np.full(n_points, -1.0)
        projected[drawn] = resampled_labels
        # Sorted distinct row positions that appear in the bootstrap sample.
        seen = np.unique(drawn)
        return jaccard(reference[seen], projected[seen])

    return [bootstrap_score() for _ in range(n_iter)]

In [7]:
def evaluate_diverse(dataset, algorithm, Ks, reps=20, seed=50):
    """Run the ensemble clustering `reps` times and collect quality measures.

    Parameters
    ----------
    dataset : Dataset
        Provides the features (`x`), reference labels (`y`) and the number
        of clusters (`k`).
    algorithm : str
        Consensus method name, forwarded to `diverse_clustering`.
    Ks : iterable of cluster counts for the base clusterings
    reps : number of repetitions
    seed : seed passed to `np.random.seed` before the first repetition

    Returns
    -------
    dict with the keys "labels", "agreement", "rand", "silhouette",
    "connectivity" and "time" (one entry per repetition each) and
    "stability" (the result of `cluster_stability`, computed once after
    the repetitions).
    """
    data_x, k, true_labels = dataset.x, dataset.k, dataset.y
    np.random.seed(seed)

    results = {
        key: []
        for key in ("labels", "agreement", "rand", "silhouette", "connectivity")
    }
    times = []
    for _ in range(reps):
        # Only the clustering itself is timed, not the scoring.
        start = perf_counter()
        labels = diverse_clustering(algorithm, data_x, k, Ks)
        times.append(perf_counter() - start)

        # Plain ints so the labels can be written out with json.
        results["labels"].append(list(map(int, labels)))
        results["agreement"].append(clustering_agreement(true_labels, labels))
        results["rand"].append(adjusted_rand_score(true_labels, labels))
        results["silhouette"].append(silhouette_score(data_x, labels))
        results["connectivity"].append(connectivity(data_x, labels))

    results["stability"] = cluster_stability(dataset, algorithm, Ks)
    results["time"] = times
    return results

In [8]:
# Consensus methods evaluated for every dataset, in the order in which the
# per-algorithm entries of `Ks` are matched to them in `evaluate_final`.
algos = ["SpectralEnsemble", "HierarchicalEnsemble", "KModesEnsemble"]


def evaluate_final(dataset, name, Ks):
    """Evaluate every consensus method on `dataset` and save the results.

    For each name in `algos`, `evaluate_diverse` is run with the matching
    entry of `Ks`, and its result is written as JSON to
    "diversity/{name}_{algo_name}.json".

    Parameters
    ----------
    dataset : Dataset
        Forwarded to `evaluate_diverse`.
    name : str
        Dataset name used as the file-name prefix.
    Ks : sequence of iterables
        One collection of base-clustering cluster counts per entry of
        `algos`; the two are paired positionally with `zip`, so extra or
        missing entries are silently ignored.
    """
    # Create the output directory before the evaluation starts, so a missing
    # folder cannot make the first write fail after the long run has finished.
    os.makedirs("diversity", exist_ok=True)
    for algo_name, ks in zip(algos, Ks):
        res = evaluate_diverse(dataset, algo_name, ks)
        with open(f"diversity/{name}_{algo_name}.json", "w") as write_file:
            json.dump(res, write_file)

In [195]:
# Full evaluation of each benchmark. Every entry carries the dataset, the
# file-name prefix and one list of base-clustering cluster counts per
# consensus algorithm.
evaluation_runs = (
    (iris_, "iris", [list(range(3, 10)) for _ in range(3)]),
    (my_own_, "my_own", [list(range(5, 7)), list(range(5, 7)), list(range(5, 10))]),
    (cassini_, "cassini", [list(range(3, 6)) for _ in range(3)]),
    (yeast_, "yeast", [list(range(10, 15)) for _ in range(3)]),
)
for run_args in evaluation_runs:
    evaluate_final(*run_args)