# Results

In [1]:
import os
import json
import warnings
import numpy as np
import matplotlib.pyplot as plt
import openensembles as oe
import pandas as pd
from copy import deepcopy
from time import perf_counter
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.metrics import adjusted_rand_score, silhouette_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
# Silence every warning category so library notices do not flood cell outputs.
warnings.filterwarnings("ignore")


# Project root used as the working directory. The relative "datasets/..." paths
# read further down (and presumably the local `algorithms` / `evaluation`
# imports) are resolved against it. The original checkout location stays the
# default; set BACHELOR_THESIS_ROOT to point at a different checkout.
path = os.environ.get("BACHELOR_THESIS_ROOT", r"C:\Users\Manik\repos\bachelor_thesis")
# Only switch directories when the root exists on this machine; otherwise keep
# the current working directory instead of failing with FileNotFoundError.
if os.path.isdir(path) and os.getcwd() != path:
    os.chdir(path)

In [2]:
from algorithms import BaggedEnsemble, SpectralEnsemble, HierarchyEnsemble, KModesEnsemble, BaggedMajority
from evaluation import connectivity, cluster_stability, clustering_agreement

In [3]:
# Each pair couples the label used in result file names with the estimator
# class it belongs to; the two parallel lists are derived from the pairs so
# that position i of `names` always describes position i of `algorithms`.
names, algorithms = map(list, zip(
    ("KMeans", KMeans),
    ("SpectralEnsemble", SpectralEnsemble),
    ("HierarchicalEnsemble", HierarchyEnsemble),
    ("KModesEnsemble", KModesEnsemble),
    ("BaggedEnsemble", BaggedEnsemble),
    ("BaggedMajority", BaggedMajority),
))

In [4]:
def evaluate_clustering(algo, data_x, iterations, true_labels, seed=42, **params):
    """Fit a clustering algorithm repeatedly and collect metrics for every run.

    Parameters
    ----------
    algo : callable
        Estimator class or factory. ``algo(**params)`` must return an object
        with a ``fit(data_x)`` method that exposes ``labels_`` afterwards.
    data_x : array-like
        Samples to cluster; passed unchanged to ``fit`` and to the metrics.
    iterations : int
        Number of fits; a fresh estimator is built for each one.
    true_labels : array-like
        Reference labels compared against the predicted labels.
    seed : int, default 42
        Seed for NumPy's global random state, set once before the first fit.
    **params
        Keyword arguments forwarded to ``algo`` and to ``cluster_stability``.

    Returns
    -------
    dict
        ``"labels"``, ``"agreement"``, ``"rand"``, ``"silhouette"``,
        ``"connectivity"`` and ``"time"`` each hold one entry per iteration
        (``"time"`` is the duration of ``fit`` in seconds, ``"labels"`` the
        predicted labels as a list of plain ints). ``"stability"`` holds
        whatever ``cluster_stability(data_x, algo, **params)`` returns.
    """
    np.random.seed(seed)
    # Key order matches the layout of the serialized result files.
    results = {
        "labels": [],
        "agreement": [],
        "rand": [],
        "silhouette": [],
        "connectivity": [],
        "stability": None,
        "time": [],
    }
    for _ in range(iterations):
        # A new estimator per iteration keeps the runs independent of each other.
        estimator = algo(**params)
        # Time only the fit itself, not the metric computation below.
        start = perf_counter()
        estimator.fit(data_x)
        results["time"].append(perf_counter() - start)
        labels = estimator.labels_
        # Plain ints so the labels stay JSON-serializable.
        results["labels"].append([int(label) for label in labels])
        results["agreement"].append(clustering_agreement(true_labels, labels))
        results["rand"].append(adjusted_rand_score(true_labels, labels))
        results["silhouette"].append(silhouette_score(data_x, labels))
        results["connectivity"].append(connectivity(data_x, labels))
    # Computed once for the whole configuration, not once per iteration.
    results["stability"] = cluster_stability(data_x, algo, **params)
    return results

In [5]:
def evaluate(dataset, algorithms, params, folder, iterations=20):
    """Evaluate several algorithms on one dataset and save each result as JSON.

    Parameters
    ----------
    dataset : tuple
        ``(data_x, true_labels, dataset_name)``.
    algorithms : sequence
        Estimator classes handed to ``evaluate_clustering`` one at a time.
    params : sequence of dict
        Keyword arguments for each algorithm, paired with ``algorithms`` by
        position.
    folder : str
        Output directory; it is created if it does not exist yet.
    iterations : int, default 20
        Number of fits per algorithm.

    Notes
    -----
    One file ``{folder}/{dataset_name}_{name}.json`` is written per algorithm.
    ``name`` is looked up by position in the module-level ``names`` list, so
    ``algorithms`` must follow the same order as that list (a leading slice of
    the global algorithm list works, an arbitrary subset would be mislabeled).
    """
    dataset_x, dataset_y, dataset_name = dataset
    # Create the output directory before any fitting starts, so a missing
    # folder cannot discard a finished evaluation when the file is opened.
    if folder:
        os.makedirs(folder, exist_ok=True)
    for index, (algo, parameters) in enumerate(zip(algorithms, params)):
        result = evaluate_clustering(algo, dataset_x, iterations, dataset_y, **parameters)
        with open(f"{folder}/{dataset_name}_{names[index]}.json", "w") as out_file:
            json.dump(result, out_file)

# iris

In [6]:
# Iris: feature matrix and target labels taken from the loaded dataset object.
iris = load_iris()
iris_x, iris_y = iris.data, iris.target

In [7]:
# One keyword-argument set per entry of `algorithms`, in the same order.
iris_params = [
    dict(n_clusters=3, n_init=1, init="random"),
    dict(clusters=3, base_estimator_k=5),
    dict(clusters=3, base_estimator_k=5),
    dict(clusters=3, base_estimator_k=3),
    dict(clusters=3, base_centers=10, num_of_partitions=300),
    dict(
        clusters=3,
        num_of_partitions=30,
        params=dict(n_clusters=3, n_init=1, init="random"),
    ),
]

In [83]:
# Run every algorithm on iris; one JSON file per algorithm goes to "iris".
evaluate(
    dataset=(iris_x, iris_y, "iris"),
    algorithms=algorithms,
    params=iris_params,
    folder="iris",
)

# cassini

In [10]:
# Cassini: read from a space-separated file with "x", "y" and "classes" columns.
cassini = pd.read_csv("datasets/cassini.csv", sep=" ")
# Replace the index with 0..n-1. The length comes from the frame itself, so a
# file that does not have exactly 1000 rows no longer raises a length mismatch.
cassini.index = np.arange(len(cassini))
cassini_x = np.array(cassini[["x", "y"]])
cassini_y = np.array(cassini["classes"])

In [11]:
# One keyword-argument set per entry of `algorithms`, in the same order.
cassini_params = [
    dict(n_clusters=3, n_init=1, init="random"),
    dict(clusters=3, base_estimator_k=6),
    dict(clusters=3, base_estimator_k=6),
    dict(clusters=3, base_estimator_k=6),
    dict(clusters=3, base_centers=40, num_of_partitions=100),
    dict(
        clusters=3,
        num_of_partitions=30,
        params=dict(n_init=1, init="random", n_clusters=3),
    ),
]

In [None]:
# Run every algorithm on cassini; one JSON file per algorithm goes to "cassini".
evaluate(
    dataset=(cassini_x, cassini_y, "cassini"),
    algorithms=algorithms,
    params=cassini_params,
    folder="cassini",
)

# yeast

In [12]:
# Yeast: headerless file whose fields are separated by two spaces.
yeast = pd.read_csv("datasets/yeast.txt", sep="  ", header=None, engine="python")
# The last column is used as the label and encoded to integer codes.
yeast_y = LabelEncoder().fit_transform(yeast.iloc[:, -1])
# Columns 0 and 9 are excluded from the feature matrix.
yeast_x = np.array(yeast.drop([0, 9], axis=1))

In [13]:
# One keyword-argument set per entry of `algorithms`, in the same order.
yeast_params = [
    dict(n_clusters=10, n_init=1, init="random"),
    dict(clusters=10, base_estimator_k=30),
    dict(clusters=10, base_estimator_k=40),
    dict(clusters=10, base_estimator_k=10),
    dict(clusters=10, base_centers=40, num_of_partitions=100),
    dict(
        clusters=10,
        num_of_partitions=30,
        params=dict(n_clusters=10, n_init=1, init="random"),
    ),
]

In [None]:
# Run every algorithm on yeast; one JSON file per algorithm goes to "yeast".
evaluate(
    dataset=(yeast_x, yeast_y, "yeast"),
    algorithms=algorithms,
    params=yeast_params,
    folder="yeast",
)

# my_own

In [14]:
# my_own: the first two columns are the features, the third column the labels.
my_own = np.loadtxt("datasets/my_own.txt")
my_own_x, my_own_y = my_own[:, :2], my_own[:, 2]

In [15]:
# One keyword-argument set per entry of `algorithms`, in the same order.
my_own_params = [
    dict(n_clusters=5, n_init=1, init="random"),
    dict(clusters=5, base_estimator_k=20),
    dict(clusters=5, base_estimator_k=15),
    dict(clusters=5, base_estimator_k=5),
    dict(clusters=5, base_centers=20),
    dict(
        clusters=5,
        num_of_partitions=30,
        params=dict(n_clusters=5, n_init=1, init="random"),
    ),
]

In [153]:
# NOTE(review): only the first three algorithms (and their parameter sets) are
# evaluated here, unlike the other datasets - confirm whether this is intended
# or a leftover from a partial run.
evaluate(
    dataset=(my_own_x, my_own_y, "my_own"),
    algorithms=algorithms[:3],
    params=my_own_params[:3],
    folder="my_own",
)

# digits

In [16]:
# digits: every column except the last is a feature; the last column is the label.
digits = np.loadtxt("datasets/digits.txt")
digits_x, digits_y = digits[:, :-1], digits[:, -1]

In [17]:
# One keyword-argument set per entry of `algorithms`, in the same order.
digits_params = [
    dict(n_clusters=10, init="random", n_init=1),
    dict(clusters=10, base_estimator_k=30),
    dict(clusters=10, base_estimator_k=30),
    dict(clusters=10, base_estimator_k=20),
    dict(clusters=10, base_centers=50),
    dict(
        clusters=10,
        num_of_partitions=30,
        params=dict(n_clusters=10, n_init=1, init="random"),
    ),
]

In [274]:
# Run every algorithm on digits; one JSON file per algorithm goes to "digits".
evaluate(
    dataset=(digits_x, digits_y, "digits"),
    algorithms=algorithms,
    params=digits_params,
    folder="digits",
)