# Examples from manuscript

Code to run the examples reported in https://arxiv.org/abs/2009.01077

In [None]:
from reval.best_nclust_cv import FindBestClustCV
from reval.internal_baselines import select_best, evaluate_best
from reval.visualization import plot_metrics
from reval.utils import kuhn_munkres_algorithm, compute_metrics
from reval.param_selection import ParamSelection, SCParamSelection
from datasets.manuscript_builddatasets import build_ucidatasets

from sklearn.datasets import make_blobs, load_digits
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering
from sklearn.metrics import zero_one_loss, adjusted_mutual_info_score, silhouette_score, davies_bouldin_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
import hdbscan
from umap import UMAP
import numpy as np

import pandas as pd
import pickle as pkl
import logging
import matplotlib.pyplot as plt

# Modify this variable for parallelization
N_JOBS = 7

# Every result in this notebook is reported through logging.info(). The root
# logger defaults to the WARNING level, which would silently drop all of them,
# so INFO is enabled here. basicConfig() only installs a handler when none
# exists, hence the explicit setLevel() for environments that already
# configured logging.
logging.basicConfig(level=logging.INFO, format='%(message)s')
logging.getLogger().setLevel(logging.INFO)

Three example functions that can also be run from shell (see manuscript_examples.py file). 

> Example 1: blobs dataset;

> Example 2: real-world dataset MNIST;

> Example 3: best classifier-clustering combinations for 18 datasets from UCI Machine Learning Repository.

## Example 1: blobs dataset

In [None]:
# EXAMPLE 1: Isotropic Gaussian blobs
# Generate dataset: data[0] holds the samples, data[1] the true labels
data = make_blobs(1000, 2, centers=5,
                  center_box=(-20, 20),
                  random_state=42)

# Visualize dataset with one scatter call per true label, so that each class
# takes the next color of the default cycle. `cmap` is not passed: without
# `c` matplotlib ignores it and emits a warning.
plt.figure(figsize=(6, 4))
for i in range(5):
    class_points = data[0][data[1] == i]
    plt.scatter(class_points[:, 0],
                class_points[:, 1],
                label=str(i))
# The per-class labels only show up once a legend is drawn
plt.legend(loc=3)
plt.title("Blobs dataset")
# plt.savefig('./blobs.png', format='png')
plt.show()

# Hold out 30% of the samples as test set, stratifying on the true labels
X_tr, X_ts, y_tr, y_ts = train_test_split(
    data[0],
    data[1],
    test_size=0.30,
    random_state=42,
    stratify=data[1],
)

# Classifier/clustering pair used by the relative validation
classifier = KNeighborsClassifier(n_neighbors=15)
clustering = KMeans()

# Run relative validation (repeated CV and testing) for 2 to 6 clusters
findbestclust = FindBestClustCV(
    nfold=2,
    nclust_range=list(range(2, 7)),
    s=classifier,
    c=clustering,
    nrand=10,
    n_jobs=N_JOBS,
)
metrics, nbest = findbestclust.best_nclust(X_tr, iter_cv=10, strat_vect=y_tr)
out = findbestclust.evaluate(X_tr, X_ts, nclust=nbest)

# Plot CV metrics and report the validation stability of the selected solution
plot_metrics(metrics, prob_lines=False)
best_val_stability = metrics['val'][nbest]
logging.info(f"Validation stability: {best_val_stability}")
# Match the test-set clustering labels to the true labels (Kuhn-Munkres)
perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

logging.info(f"Best number of clusters: {nbest}")
test_ami = adjusted_mutual_info_score(y_ts, out.test_cllab)
logging.info(f'AMI (true labels vs predicted labels) for test set = {test_ami}')
logging.info('\n\n')

# Compute metrics; only accuracy (ACC) and MCC are reported
logging.info("Metrics from true label comparisons on test set:")
class_scores = compute_metrics(y_ts, perm_lab, perm=False)
reported_scores = ('ACC', 'MCC')
for score_name in (key for key in class_scores if key in reported_scores):
    logging.info(f"{score_name}, {class_scores[score_name]}")
logging.info("\n\n")

# Internal measures: the number of clusters is selected by optimizing an
# internal score, independently on the training and on the test set
baseline_range = list(range(2, 7))

# SILHOUETTE (selected with select='max')
logging.info("Silhouette score based selection")
sil_score_tr, sil_best_tr, sil_label_tr = select_best(
    X_tr, clustering, silhouette_score,
    select='max', nclust_range=baseline_range)
sil_score_ts, sil_best_ts, sil_label_ts = select_best(
    X_ts, clustering, silhouette_score,
    select='max', nclust_range=baseline_range)

# Score obtained on the test set with the number of clusters picked on training
sil_eval = evaluate_best(X_ts, clustering, silhouette_score, sil_best_tr)

logging.info(f"Best number of clusters (and scores) for tr/ts independent runs: "
             f"{sil_best_tr}({sil_score_tr})/{sil_best_ts}({sil_score_ts})")
logging.info(f"Test set evaluation {sil_eval}")
sil_ami_tr = adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, sil_label_tr))
logging.info(f'AMI (true labels vs clustering labels) training = {sil_ami_tr}')
sil_ami_ts = adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, sil_label_ts))
logging.info(f'AMI (true labels vs clustering labels) test = {sil_ami_ts}')
logging.info('\n\n')

# DAVIES-BOULDIN (selected with select='min')
logging.info("Davies-Bouldin score based selection")
db_score_tr, db_best_tr, db_label_tr = select_best(
    X_tr, clustering, davies_bouldin_score,
    select='min', nclust_range=baseline_range)
db_score_ts, db_best_ts, db_label_ts = select_best(
    X_ts, clustering, davies_bouldin_score,
    select='min', nclust_range=baseline_range)

db_eval = evaluate_best(X_ts, clustering, davies_bouldin_score, db_best_tr)

logging.info(f"Best number of clusters (and scores) for tr/ts independent runs: "
             f"{db_best_tr}({db_score_tr})/{db_best_ts}({db_score_ts})")
logging.info(f"Test set evaluation {db_eval}")
db_ami_tr = adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, db_label_tr))
logging.info(f'AMI (true labels vs clustering labels) training = {db_ami_tr}')
db_ami_ts = adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, db_label_ts))
logging.info(f'AMI (true labels vs clustering labels) test = {db_ami_ts}')
logging.info('\n\n')

# Plot true vs predicted labels for the test set. Both figures share the same
# layout, so they are drawn by one loop. `cmap` is not passed to scatter():
# without `c` matplotlib ignores it and emits a warning, and each class
# already takes the next color of the default cycle.
for point_labels, title in ((y_ts, "Test set true labels"),
                            (perm_lab, "Test set clustering labels")):
    plt.figure(figsize=(6, 4))
    for i in range(5):
        members = X_ts[point_labels == i]
        plt.scatter(members[:, 0],
                    members[:, 1],
                    label=str(i))
    plt.legend(loc=3)
    plt.title(title)
    # To save the figures, call plt.savefig() here with './blobs_true.png'
    # (true labels) or './blobs_clustering.png' (clustering labels)
    plt.show()

## Example 2: MNIST real-world dataset

In [None]:
# Example 2: MNIST dataset
# Load the dataset and cast the targets to integers
mnist = fetch_openml('mnist_784', version=1)
mnist.target = mnist.target.astype(int)

# Create training and test sets: the first 60000 samples are used for
# training, the remaining ones for testing
n_train = 60000
X_tr, X_ts = mnist['data'][:n_train], mnist['data'][n_train:]
y_tr, y_ts = mnist.target[:n_train], mnist.target[n_train:]

# Two-dimensional UMAP embedding, fitted on the training set only and then
# applied to the test set
transform = UMAP(n_components=2, random_state=42, n_neighbors=30, min_dist=0.0)
X_tr = transform.fit_transform(X_tr)
X_ts = transform.transform(X_ts)

# Initialize classifier/clustering algorithms: four candidate classifiers
# combined with a single HDBSCAN configuration
combo = {'s': [KNeighborsClassifier(n_neighbors=30), SVC(), LogisticRegression(), RandomForestClassifier()],
         'c': [hdbscan.HDBSCAN(min_samples=10, min_cluster_size=200)]}

# NOTE(review): the positional arguments are presumably (sc_params, cv, nrand,
# n_jobs, iter_cv, clust_range, strat) -- confirm against the reval API.
# The worker count follows the notebook-wide N_JOBS setting instead of a
# hard-coded 7, so changing N_JOBS affects this search as well.
scsel = SCParamSelection(combo, 2, 10, N_JOBS, 10, list(range(2, 13)), y_tr)
scsel.fit(X_tr, 10)

# Classifier/clustering pair used for the MNIST run
# s = KNeighborsClassifier(n_neighbors=30)
s = RandomForestClassifier()
c = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=200)

reval = FindBestClustCV(s=s, c=c, nfold=2, nrand=10, n_jobs=N_JOBS)

# No cluster range is given here; best_nclust also returns the training labels
metrics, nclustbest, tr_lab = reval.best_nclust(X_tr, iter_cv=10, strat_vect=y_tr)

plot_metrics(metrics, save_fig='mnist_performance.png')
best_val_stability = metrics['val'][nclustbest]
logging.info(f"Validation stability: {best_val_stability}")

out = reval.evaluate(X_tr, X_ts, nclust=nclustbest, tr_lab=tr_lab)
perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

logging.info(f"Best number of clusters during CV: {nclustbest}")
# Negative labels are left out of the cluster count
n_test_clusters = sum(1 for lab in np.unique(out.test_cllab) if lab >= 0)
logging.info(f"Best number of clusters on test set: {n_test_clusters}")
train_ami = adjusted_mutual_info_score(y_tr, out.train_cllab)
logging.info(f'AMI train (true labels vs predicted labels) = {train_ami}')
logging.info('\n\n')
test_ami = adjusted_mutual_info_score(y_ts, out.test_cllab)
logging.info(f'AMI (true labels vs predicted labels) = {test_ami}')
logging.info('\n\n')

logging.info("Metrics from true label comparisons on test set:")
class_scores = compute_metrics(y_ts, perm_lab)
for score_name in class_scores:
    logging.info(f'{score_name}, {class_scores[score_name]}')
logging.info('\n\n')

# Visualization: the same scatter layout is drawn four times, colored by
# true labels and by relative-validation labels, for training and test sets
mnist_views = (
    (X_tr, y_tr),                                    # './mnist_train.png'
    (X_tr, kuhn_munkres_algorithm(y_tr, tr_lab)),    # './mnist_trainreval.png'
    (X_ts, y_ts),                                    # './mnist_test.png'
    (X_ts, perm_lab),                                # './mnist_testreval.png'
)
for points, point_labels in mnist_views:
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(points[:, 0],
                         points[:, 1],
                         c=point_labels,
                         cmap='tab20',
                         s=0.1)
    # Legend built from the color-coded elements of the scatter itself
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("")
    # To save a figure, call plt.savefig() here with the file name listed
    # next to the corresponding view above
    plt.show()

# Internal measures: baselines run independently on training and test sets
# SILHOUETTE (selected with select='max')
logging.info("Silhouette score based selection")
sil_score_tr, sil_best_tr, sil_label_tr = select_best(
    X_tr, c, silhouette_score, select='max')
sil_score_ts, sil_best_ts, sil_label_ts = select_best(
    X_ts, c, silhouette_score, select='max')
logging.info(f"Best number of clusters (and scores) for tr/ts independent runs: "
             f"{sil_best_tr}({sil_score_tr})/{sil_best_ts}({sil_score_ts})")
sil_ami_tr = adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, sil_label_tr))
logging.info(f'AMI (true labels vs clustering labels) training = {sil_ami_tr}')
sil_ami_ts = adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, sil_label_ts))
logging.info(f'AMI (true labels vs clustering labels) test = {sil_ami_ts}')
logging.info('\n\n')

# DAVIES-BOULDIN (selected with select='min')
logging.info("Davies-Bouldin score based selection")
db_score_tr, db_best_tr, db_label_tr = select_best(
    X_tr, c, davies_bouldin_score, select='min')
db_score_ts, db_best_ts, db_label_ts = select_best(
    X_ts, c, davies_bouldin_score, select='min')

logging.info(f"Best number of clusters (and scores) for tr/ts independent runs: "
             f"{db_best_tr}({db_score_tr})/{db_best_ts}({db_score_ts})")
db_ami_tr = adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, db_label_tr))
logging.info(f'AMI (true labels vs clustering labels) training = {db_ami_tr}')
db_ami_ts = adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, db_label_ts))
logging.info(f'AMI (true labels vs clustering labels) test = {db_ami_ts}')
logging.info('\n\n')

# Visualization of the labels selected by the internal measures. The four
# figures share one layout, so they are drawn by a single loop; the legend is
# created and attached exactly once per figure.
baseline_views = (
    (X_tr, sil_label_tr),    # './mnist_trainsilh.png'
    (X_ts, sil_label_ts),    # './mnist_testsilh.png'
    (X_tr, db_label_tr),     # './mnist_traindb.png'
    (X_ts, db_label_ts),     # './mnist_testdb.png'
)
for points, baseline_labels in baseline_views:
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(points[:, 0],
                         points[:, 1],
                         c=baseline_labels,
                         cmap='tab20',
                         s=0.1)
    # Legend built from the color-coded elements of the scatter itself
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("")
    # To save a figure, call plt.savefig() here with the file name listed
    # next to the corresponding view above
    plt.show()

## Example 3: best classifier/clustering combination search

In [None]:
# Example 3: best classifier/clustering combination for the UCI datasets
# Import benchmark datasets. The result is later iterated together with its
# `_fields` attribute, so it is presumably a namedtuple with one entry per
# dataset -- confirm against build_ucidatasets.
uci_data = build_ucidatasets()

In [None]:
# Candidate classifiers
s = [
    LogisticRegression(solver='liblinear', random_state=42),
    RandomForestClassifier(n_estimators=100, random_state=42),
    KNeighborsClassifier(n_neighbors=1, metric='euclidean'),
    SVC(C=1, random_state=42),
]

# Candidate clustering algorithms
c = [
    AgglomerativeClustering(),
    KMeans(random_state=42),
    hdbscan.HDBSCAN(),
]

scparam = {'s': s, 'c': c}

# Preprocessing objects; only `transform` is applied in the loop below
transform = UMAP(n_neighbors=30, min_dist=0.0, random_state=42)
scale = StandardScaler()

# Run parameter selection algorithm
best_results = {}
for name, data in zip(uci_data._fields, uci_data):
    # NOTE(review): only `hwdigits` is processed; this looks like a leftover
    # filter -- confirm whether all datasets should be run
    if name != 'hwdigits':
        continue
    # The last classifier is the SVC: its gamma is set to 1 / number of rows
    svc = scparam['s'][-1]
    svc.gamma = (1 / data['data'].shape[0])
    nclass = len(np.unique(data['target']))
    logging.info(f"Processing dataset {name}")
    logging.info(f"True number of classes: {nclass}\n")
    X_tr, X_ts, y_tr, y_ts = train_test_split(
        data['data'],
        data['target'],
        test_size=0.40,
        random_state=42,
        stratify=data['target'],
    )
    # Change here for different preprocessing
    X_tr = transform.fit_transform(X_tr)
    X_ts = transform.transform(X_ts)
    scparam_select = SCParamSelection(
        sc_params=scparam,
        cv=2,
        nrand=10,
        clust_range=list(range(2, nclass + 3)),
        n_jobs=N_JOBS,
        iter_cv=10,
        strat=y_tr,
    )
    scparam_select.fit(X_tr, nclass=nclass)
    best_results[name] = scparam_select.best_param_
    # Uncomment to save the results
    # pkl.dump(best_results, open('./best_resultUCI_scaledumap.pkl', 'wb'))
    logging.info('*' * 100)
    logging.info('\n\n')

In [None]:
def _log_internal_baseline(label, score_fn, select, clustering, X_tr, X_ts, y_tr, y_ts):
    """Log the results of selecting the clustering with an internal score.

    `select_best` is run independently on the training and on the test set;
    the labels it returns are then compared with the true labels.

    :param label: score name used in the log messages
    :param score_fn: internal score function passed to `select_best`
    :param select: value forwarded to the `select` argument of `select_best`
    :param clustering: clustering algorithm passed to `select_best`
    :param X_tr: training data
    :param X_ts: test data
    :param y_tr: true training labels
    :param y_ts: true test labels
    """
    logging.info(f"{label} score based selection")
    score_tr, best_tr, label_tr = select_best(X_tr, clustering, score_fn, select=select)
    score_ts, best_ts, label_ts = select_best(X_ts, clustering, score_fn, select=select)
    logging.info(f"Best number of clusters (and scores) for tr/ts independent "
                 f"runs: {best_tr}({score_tr})/{best_ts}({score_ts})")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, label_tr))}')
    # The permuted test labels feed both the AMI and the classification metrics
    perm_ts = kuhn_munkres_algorithm(y_ts, label_ts)
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, perm_ts)}')
    logging.info(f"{label} metrics: {compute_metrics(y_ts, perm_ts)}")
    logging.info('\n\n')


# Function that fits the best models to the UCI datasets and reports the results
def test_ucibest(X_tr, X_ts, y_tr, y_ts, best_sol,
                 n_jobs=1, preprocess=None, tr_lab=None):
    """Fit the best classifier/clustering solution and log its test performance.

    The relative-validation results are logged first, followed by the
    silhouette and Davies-Bouldin baselines computed on the same data.

    :param X_tr: training data
    :param X_ts: test data
    :param y_tr: true training labels
    :param y_ts: true test labels
    :param best_sol: dictionary with keys 's' (classifier), 'c' (clustering)
        and 'nclust' (number of clusters)
    :param n_jobs: number of jobs passed to `FindBestClustCV`
    :param preprocess: 'scaled', 'umap' or 'scaled+umap'; any other value
        (including None) leaves the data unchanged
    :param tr_lab: training labels forwarded to `evaluate`; recomputed with
        `best_nclust` when the clustering is an HDBSCAN instance
    """
    reval = FindBestClustCV(s=best_sol['s'],
                            c=best_sol['c'],
                            nfold=2,
                            nrand=10,
                            n_jobs=n_jobs)

    transform = UMAP(n_neighbors=30, min_dist=0.0, random_state=42)
    scale = StandardScaler()

    # Preprocessing is always fitted on the training set and applied to the test set
    if preprocess == 'scaled':
        X_tr = scale.fit_transform(X_tr)
        X_ts = scale.transform(X_ts)
    elif preprocess == 'umap':
        X_tr = transform.fit_transform(X_tr)
        X_ts = transform.transform(X_ts)
    elif preprocess == 'scaled+umap':
        X_tr = transform.fit_transform(scale.fit_transform(X_tr))
        X_ts = transform.transform(scale.transform(X_ts))

    if isinstance(best_sol['c'], hdbscan.HDBSCAN):
        _, _, tr_lab = reval.best_nclust(X_tr, iter_cv=1, strat_vect=y_tr)

    out = reval.evaluate(X_tr, X_ts, nclust=best_sol['nclust'], tr_lab=tr_lab)
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

    logging.info(f'Testing solution {best_sol}.')
    logging.info(f"Best number of clusters during CV: {best_sol['nclust']}")
    # Negative labels are left out of the cluster count
    logging.info(f"Best number of clusters on test set: {len([lab for lab in np.unique(out.test_cllab) if lab >= 0])}")
    logging.info(f"Test set accuracy: {out.test_acc}")
    logging.info(f'AMI (true labels vs predicted labels) = '
                 f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    logging.info('\n\n')

    logging.info("Metrics from true label comparisons on test set:")
    class_scores = compute_metrics(y_ts, perm_lab)
    for k, val in class_scores.items():
        logging.info(f"{k}, {val}")
    # The blank separator closes the metrics section, so it comes after the scores
    logging.info('\n\n')

    # Internal measures
    # SILHOUETTE
    _log_internal_baseline("Silhouette", silhouette_score, 'max',
                           best_sol['c'], X_tr, X_ts, y_tr, y_ts)

    # DAVIES-BOULDIN
    _log_internal_baseline("Davies-Bouldin", davies_bouldin_score, 'min',
                           best_sol['c'], X_tr, X_ts, y_tr, y_ts)

## Best solutions with raw data
Datasets whose best solutions do not require preprocessing are: biodeg, breastwi, ionosphere, seeds, and forest.

In [None]:
# Read pkl objects with best solutions; the context manager guarantees that
# the file handle is closed once the content has been loaded
with open('../best_resultUCI_raw.pkl', 'rb') as pkl_file:
    best_raw = pkl.load(pkl_file)

### `biodeg` dataset

In [None]:
# First stored solution for `biodeg`: classifier, clustering, number of clusters
biodeg_sol = best_raw['biodeg'][0]
biodeg_best = {'s': biodeg_sol[0],
               'c': biodeg_sol[1],
               'nclust': biodeg_sol[2]}
# Stratified split with 40% of the samples held out for testing
biodeg_tr, biodeg_ts, biodeg_y_tr, biodeg_y_ts = train_test_split(
    uci_data.biodeg['data'],
    uci_data.biodeg['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.biodeg['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(biodeg_tr, biodeg_ts, biodeg_y_tr, biodeg_y_ts, biodeg_best, n_jobs=N_JOBS)

### `breastwi` dataset

In [None]:
# First stored solution for `breastwi`: classifier, clustering, number of clusters
breast_sol = best_raw['breastwi'][0]
breast_best = {'s': breast_sol[0],
               'c': breast_sol[1],
               'nclust': breast_sol[2]}
# Stratified split with 40% of the samples held out for testing
breast_tr, breast_ts, breast_y_tr, breast_y_ts = train_test_split(
    uci_data.breastwi['data'],
    uci_data.breastwi['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.breastwi['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(breast_tr, breast_ts, breast_y_tr, breast_y_ts, breast_best, n_jobs=N_JOBS)

### `ionosphere` dataset

In [None]:
# First stored solution for `ionosphere`: classifier, clustering, number of clusters
ionosphere_sol = best_raw['ionosphere'][0]
ionosphere_best = {'s': ionosphere_sol[0],
                   'c': ionosphere_sol[1],
                   'nclust': ionosphere_sol[2]}
# Stratified split with 40% of the samples held out for testing
ionosphere_tr, ionosphere_ts, ionosphere_y_tr, ionosphere_y_ts = train_test_split(
    uci_data.ionosphere['data'],
    uci_data.ionosphere['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.ionosphere['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(ionosphere_tr, ionosphere_ts, ionosphere_y_tr, ionosphere_y_ts, ionosphere_best,
             n_jobs=N_JOBS)

### `seeds` dataset

In [None]:
# First stored solution for `seeds`: classifier, clustering, number of clusters
seeds_sol = best_raw['seeds'][0]
seeds_best = {'s': seeds_sol[0],
              'c': seeds_sol[1],
              'nclust': seeds_sol[2]}
# Stratified split with 40% of the samples held out for testing
seeds_tr, seeds_ts, seeds_y_tr, seeds_y_ts = train_test_split(
    uci_data.seeds['data'],
    uci_data.seeds['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.seeds['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(seeds_tr, seeds_ts, seeds_y_tr, seeds_y_ts, seeds_best, n_jobs=N_JOBS)

### `forest` dataset

In [None]:
# First stored solution for `forest`: classifier, clustering, number of clusters
forest_sol = best_raw['forest'][0]
forest_best = {'s': forest_sol[0],
               'c': forest_sol[1],
               'nclust': forest_sol[2]}
# Stratified split with 40% of the samples held out for testing
forest_tr, forest_ts, forest_y_tr, forest_y_ts = train_test_split(
    uci_data.forest['data'],
    uci_data.forest['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.forest['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(forest_tr, forest_ts, forest_y_tr, forest_y_ts, forest_best, n_jobs=N_JOBS)

## Best solutions with UMAP preprocessed data
Datasets that require uniform manifold approximation and projection (UMAP) preprocessing are: hwdigits, iris, liver, movement, wholesale, ecoli, transfusion.

In [None]:
# Read pkl objects with best solutions; the context manager guarantees that
# the file handle is closed once the content has been loaded
with open('../best_resultUCI_umap.pkl', 'rb') as pkl_file:
    best_umap = pkl.load(pkl_file)

### `hwdigits` dataset

In [None]:
# First stored solution for `hwdigits`: classifier, clustering, number of clusters
hwdigits_sol = best_umap['hwdigits'][0]
hwdigits_best = {'s': hwdigits_sol[0],
                 'c': hwdigits_sol[1],
                 'nclust': hwdigits_sol[2]}

# Stratified split with 40% of the samples held out for testing
hwdigits_tr, hwdigits_ts, hwdigits_y_tr, hwdigits_y_ts = train_test_split(
    uci_data.hwdigits['data'],
    uci_data.hwdigits['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.hwdigits['target'])

# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(hwdigits_tr, hwdigits_ts, hwdigits_y_tr, hwdigits_y_ts, hwdigits_best,
             preprocess='umap', n_jobs=N_JOBS)

### `iris` dataset

In [None]:
# First stored solution for `iris`: classifier, clustering, number of clusters
iris_sol = best_umap['iris'][0]
iris_best = {'s': iris_sol[0],
             'c': iris_sol[1],
             'nclust': iris_sol[2]}
# Stratified split with 40% of the samples held out for testing
iris_tr, iris_ts, iris_y_tr, iris_y_ts = train_test_split(
    uci_data.iris['data'],
    uci_data.iris['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.iris['target'])

# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(iris_tr, iris_ts, iris_y_tr, iris_y_ts, iris_best,
             preprocess='umap', n_jobs=N_JOBS)

### `liver` dataset

In [None]:
# First stored solution for `liver`: classifier, clustering, number of clusters
liver_sol = best_umap['liver'][0]
liver_best = {'s': liver_sol[0],
              'c': liver_sol[1],
              'nclust': liver_sol[2]}
# Stratified split with 40% of the samples held out for testing
liver_tr, liver_ts, liver_y_tr, liver_y_ts = train_test_split(
    uci_data.liver['data'],
    uci_data.liver['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.liver['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(liver_tr, liver_ts, liver_y_tr, liver_y_ts, liver_best,
             preprocess='umap', n_jobs=N_JOBS)

### `movement` dataset

In [None]:
# First stored solution for `movement`: classifier, clustering, number of clusters
movement_sol = best_umap['movement'][0]
movement_best = {'s': movement_sol[0],
                 'c': movement_sol[1],
                 'nclust': movement_sol[2]}
# Stratified split with 40% of the samples held out for testing
movement_tr, movement_ts, movement_y_tr, movement_y_ts = train_test_split(
    uci_data.movement['data'],
    uci_data.movement['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.movement['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(movement_tr, movement_ts, movement_y_tr, movement_y_ts, movement_best,
             preprocess='umap', n_jobs=N_JOBS)

### `wholesale` dataset

In [None]:
# First stored solution for `wholesale`: classifier, clustering, number of clusters
wholesale_sol = best_umap['wholesale'][0]
wholesale_best = {'s': wholesale_sol[0],
                  'c': wholesale_sol[1],
                  'nclust': wholesale_sol[2]}
# Stratified split with 40% of the samples held out for testing
wholesale_tr, wholesale_ts, wholesale_y_tr, wholesale_y_ts = train_test_split(
    uci_data.wholesale['data'],
    uci_data.wholesale['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.wholesale['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(wholesale_tr, wholesale_ts, wholesale_y_tr, wholesale_y_ts, wholesale_best,
             preprocess='umap', n_jobs=N_JOBS)

### `ecoli` dataset

In [None]:
# Second-to-last stored solution for `ecoli` (index -2): classifier,
# clustering, number of clusters
ecoli_sol = best_umap['ecoli'][-2]
ecoli_best = {'s': ecoli_sol[0],
              'c': ecoli_sol[1],
              'nclust': ecoli_sol[2]}
# Stratified split with 40% of the samples held out for testing
ecoli_tr, ecoli_ts, ecoli_y_tr, ecoli_y_ts = train_test_split(
    uci_data.ecoli['data'],
    uci_data.ecoli['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.ecoli['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(ecoli_tr, ecoli_ts, ecoli_y_tr, ecoli_y_ts, ecoli_best,
             preprocess='umap', n_jobs=N_JOBS)

### `transfusion` dataset

In [None]:
# Second-to-last stored solution for `transfusion` (index -2): classifier,
# clustering, number of clusters
transfusion_sol = best_umap['transfusion'][-2]
transfusion_best = {'s': transfusion_sol[0],
                    'c': transfusion_sol[1],
                    'nclust': transfusion_sol[2]}
# Stratified split with 40% of the samples held out for testing
transfusion_tr, transfusion_ts, transfusion_y_tr, transfusion_y_ts = train_test_split(
    uci_data.transfusion['data'],
    uci_data.transfusion['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.transfusion['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(transfusion_tr, transfusion_ts, transfusion_y_tr, transfusion_y_ts, transfusion_best,
             preprocess='umap', n_jobs=N_JOBS)

## Best solutions with scaled data
Datasets: glass, leaf.

In [None]:
# Read pkl objects with best solutions; the context manager guarantees that
# the file handle is closed once the content has been loaded
with open('../best_resultUCI_scaled.pkl', 'rb') as pkl_file:
    best_scaled = pkl.load(pkl_file)

### `glass` dataset

In [None]:
# First stored solution for `glass`: classifier, clustering, number of clusters
glass_sol = best_scaled['glass'][0]
glass_best = {'s': glass_sol[0],
              'c': glass_sol[1],
              'nclust': glass_sol[2]}
# Stratified split with 40% of the samples held out for testing
glass_tr, glass_ts, glass_y_tr, glass_y_ts = train_test_split(
    uci_data.glass['data'],
    uci_data.glass['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.glass['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(glass_tr, glass_ts, glass_y_tr, glass_y_ts, glass_best,
             preprocess='scaled', n_jobs=N_JOBS)

### `leaf` dataset

In [None]:
# First stored solution for `leaf`: classifier, clustering, number of clusters
leaf_sol = best_scaled['leaf'][0]
leaf_best = {'s': leaf_sol[0],
             'c': leaf_sol[1],
             'nclust': leaf_sol[2]}
# Stratified split with 40% of the samples held out for testing
leaf_tr, leaf_ts, leaf_y_tr, leaf_y_ts = train_test_split(
    uci_data.leaf['data'],
    uci_data.leaf['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.leaf['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(leaf_tr, leaf_ts, leaf_y_tr, leaf_y_ts, leaf_best,
             preprocess='scaled', n_jobs=N_JOBS)

## Best solutions with scaled+UMAP preprocessed data
Datasets: climate, banknote, parkinsons, yeast, urban.

In [None]:
# Read pkl objects with best solutions; the context manager guarantees that
# the file handle is closed once the content has been loaded
with open('../best_resultUCI_scaledumap.pkl', 'rb') as pkl_file:
    best_scaledumap = pkl.load(pkl_file)

### `climate` dataset

In [None]:
# First stored solution for `climate`: classifier, clustering, number of clusters
climate_sol = best_scaledumap['climate'][0]
climate_best = {'s': climate_sol[0],
                'c': climate_sol[1],
                'nclust': climate_sol[2]}
# Stratified split with 40% of the samples held out for testing
climate_tr, climate_ts, climate_y_tr, climate_y_ts = train_test_split(
    uci_data.climate['data'],
    uci_data.climate['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.climate['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(climate_tr, climate_ts, climate_y_tr, climate_y_ts, climate_best,
             preprocess='scaled+umap', n_jobs=N_JOBS)

### `banknote` dataset

In [None]:
# First stored solution for `banknote`: classifier, clustering, number of clusters
banknote_sol = best_scaledumap['banknote'][0]
banknote_best = {'s': banknote_sol[0],
                 'c': banknote_sol[1],
                 'nclust': banknote_sol[2]}
# Stratified split with 40% of the samples held out for testing
banknote_tr, banknote_ts, banknote_y_tr, banknote_y_ts = train_test_split(
    uci_data.banknote['data'],
    uci_data.banknote['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.banknote['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(banknote_tr, banknote_ts, banknote_y_tr, banknote_y_ts, banknote_best,
             preprocess='scaled+umap', n_jobs=N_JOBS)

### `parkinsons` dataset

In [None]:
# First stored solution for `parkinsons`: classifier, clustering, number of clusters
parkinsons_sol = best_scaledumap['parkinsons'][0]
parkinsons_best = {'s': parkinsons_sol[0],
                   'c': parkinsons_sol[1],
                   'nclust': parkinsons_sol[2]}
# Stratified split with 40% of the samples held out for testing
parkinsons_tr, parkinsons_ts, parkinsons_y_tr, parkinsons_y_ts = train_test_split(
    uci_data.parkinsons['data'],
    uci_data.parkinsons['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.parkinsons['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(parkinsons_tr, parkinsons_ts, parkinsons_y_tr, parkinsons_y_ts, parkinsons_best,
             preprocess='scaled+umap', n_jobs=N_JOBS)

### `yeast` dataset

In [None]:
# First stored solution for `yeast`: classifier, clustering, number of clusters
yeast_sol = best_scaledumap['yeast'][0]
yeast_best = {'s': yeast_sol[0],
              'c': yeast_sol[1],
              'nclust': yeast_sol[2]}
# Stratified split with 40% of the samples held out for testing
yeast_tr, yeast_ts, yeast_y_tr, yeast_y_ts = train_test_split(
    uci_data.yeast['data'],
    uci_data.yeast['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.yeast['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(yeast_tr, yeast_ts, yeast_y_tr, yeast_y_ts, yeast_best,
             preprocess='scaled+umap', n_jobs=N_JOBS)

### `urban` dataset

In [None]:
# First stored solution for `urban`: classifier, clustering, number of clusters
urban_sol = best_scaledumap['urban'][0]
urban_best = {'s': urban_sol[0],
              'c': urban_sol[1],
              'nclust': urban_sol[2]}
# Stratified split with 40% of the samples held out for testing
urban_tr, urban_ts, urban_y_tr, urban_y_ts = train_test_split(
    uci_data.urban['data'],
    uci_data.urban['target'],
    test_size=0.40,
    random_state=42,
    stratify=uci_data.urban['target'])
# The worker count follows the notebook-wide N_JOBS setting
test_ucibest(urban_tr, urban_ts, urban_y_tr, urban_y_ts, urban_best,
             preprocess='scaled+umap', n_jobs=N_JOBS)