# Bootstrapping clustering procedure

We use the bootstrap technique to train the clustering 1000 times, with different samples. This way, we should be able to obtain a better picture of the resulting space. The exact procedure is, for each iteration:
* Obtain a random subsampling of the data.
* Compute the clustering
* Calculate Jaccard coeficient between original clusters and new. Record highest Jaccard coeficient.
In the end, compute median of the jaccard coeficients. This procedure is similar to clusterboot() algorithm in R, to account for stability in the clustering and find if we are actually finding relevant clusters or not.

In [1]:
import sys
sys.path.append('..')
import numpy as np
import simlr_ad
import pandas as pd
from utils.data_utils import load_all_data, load_covariates
from utils.utils import compute_cimlr, feat_ranking, estimate_number_clusters
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

**Parameters**

In [2]:
# Parameters of the procedure
clusters = 4
stab_limit = 0.5 # if the stability of a said cluster is dissolved, it records.
rd_seed = 1714                                          # Random seed for experiment replication

# Paths
existing_cluster = True                               # Compute the clustering again or use an existing one
cluster_path = "/home/gerard/Documents/EXPERIMENTS/SIMLR-AD/cimlr4long/"# Path of the existing cluster, if applicable
covariate_path = "../long_data/covariates_long.csv"       # Path of the covariance data frame (.csv)

# Parameters of the cluster creation
config_file = "../configs/config_cimlr_long.ini"               # Configuration file for the clustering computation
output_directory_name = "bootstrap"

# Testing parameters


**Data loader**

In [3]:
covariate_data, cov_names = load_covariates(covariate_path, normalize=True)

In [4]:
if existing_cluster:
    # Load existent
    c_data = pd.read_csv(cluster_path + 'cluster_data.csv')
    c_data.reset_index(inplace=True)
else:
    # Compute base clustering
    y_b, S, F, ydata, alpha = compute_simlr(
        np.array(covariate_data_new[cov_names]), clusters)

print(len(c_data))
print(len(covariate_data))

435
435


In [5]:
## Test outlier detection
from sklearn import svm
clf = svm.OneClassSVM(kernel="rbf")
clf.fit(covariate_data[cov_names])
y_pred = clf.predict(covariate_data[cov_names])
n_error_outliers = y_pred[y_pred == -1].size
print(n_error_outliers)

219


### Main Loop

In [6]:
from sklearn.cluster import KMeans
# array where the number of times a cluster is dissolved (Jaccard coeficient < stab_limit)
n_diss = np.zeros(clusters)
niterations=100
# array of arrays where all the coefficients obtained will be stored.
j_coeff = np.zeros((clusters,niterations))
# Base labels
for i in range(niterations):
    # Subsample
    boot_data = covariate_data.sample(n=len(covariate_data), replace=True)
    # Compute it
    y_it, S, F, ydata, alpha = compute_cimlr(
       np.array(boot_data[cov_names]), clusters)
    # y_it = np.random.randint(1,clusters+1, size=len(boot_data))
    # km = KMeans(n_clusters=clusters, random_state = rd_seed).fit(boot_data[cov_names])
    # y_it = km.labels_ + 1
    # Assign clusters
    for c in range(1, clusters+1):
        # For each of the original clusters
        # And that PTID is included in PTID
        cond = (c_data.C.values == c)
        set_b = c_data[cond].PTID.values
        set_b = set_b[np.in1d(set_b, boot_data.PTID.values)]
        max_js = 0.0
        for k in range(1, clusters+1):
            # Create new set of clusters
            cond = (y_it == k)
            set_it = boot_data[cond].PTID.values
            # set_it = set_it[np.in1d(set_it, boot_data.PTID.values)]
            # compute jaccard score between base assignation and given cluster
            inter = set([x for x in set_b if x in set_it])
            union = set(list(set_b) + list(set_it))
            js = float(len(inter) / len(union))
            # If larger, get it
            if js > max_js:
                max_js = js
        # If it dissolves, we want to record it
        if max_js < stab_limit:
            n_diss[c-1] += 1
        # Save jaccard scores
        j_coeff[c-1,i] = max_js
    
print('Computation finished')
for c in range(1,clusters+1):
    print('Cluster ' + str(c) + ': ' + str(np.mean(j_coeff[c-1,:])) + " Jaccard score.")
    print("It got dissolved " + str(n_diss[c-1]) + ", " + str((n_diss[c-1]/niterations)* 100) + "% of the time.")


Computation finished
Cluster 1: 0.44719138438357703 Jaccard score.
It got dissolved 65.0, 65.0% of the time.
Cluster 2: 0.4287163551050427 Jaccard score.
It got dissolved 72.0, 72.0% of the time.
Cluster 3: 0.3164895154490641 Jaccard score.
It got dissolved 99.0, 99.0% of the time.
Cluster 4: 0.4868191216726101 Jaccard score.
It got dissolved 52.0, 52.0% of the time.


Now do the same procedure, but with synthetic data. This way, we can directly compare with data that is well separated.

In [7]:
## Create synthetic data of the same size
n_samples = []
for c in range(0, clusters):
    n_samples.append(len(c_data[c_data.C.values == c]))

max_samples = max(n_samples)

from sklearn.datasets import make_blobs
# Create a blob for each cluster with the corresponding number of samples
X, y = make_blobs(n_samples=max_samples*clusters, n_features = len(cov_names), centers = clusters, cluster_std=20.0)
# For each cluster, select only as many elements as members of the cluster
synth_X = []
synth_y = []
for c in range(0, clusters):
    curr_items = X[y==c]
    print(curr_items.shape)
    curr_items = curr_items[:n_samples[c],:]
    print(curr_items.shape)
    synth_X.append(curr_items)
    synth_y += ([c+1] * n_samples[c])

synth_X = np.concatenate((synth_X[0],synth_X[1],synth_X[2],synth_X[3]))
synth_data = pd.DataFrame(synth_X)
print(synth_data.shape)
synth_y = np.array(synth_y)

(130, 146)
(0, 146)
(130, 146)
(96, 146)
(130, 146)
(110, 146)
(130, 146)
(130, 146)
(336, 146)


In [8]:
from sklearn.cluster import KMeans
# array where the number of times a cluster is dissolved (Jaccard coeficient < stab_limit)
n_diss = np.zeros(clusters)
niterations=5
# array of arrays where all the coefficients obtained will be stored.
j_coeff = np.zeros((clusters,niterations))
# Base labels
for i in range(niterations):
    # Subsample
    boot_data = synth_data.sample(n=len(synth_data), replace=True)
    # Compute it
    # y_it, S, F, ydata, alpha = compute_cimlr(
    #   np.array(boot_data), clusters)
    # y_it = np.random.randint(1,clusters+1, size=len(boot_data))
    km = KMeans(n_clusters=clusters, random_state = rd_seed).fit(boot_data)
    y_it = km.labels_ + 1
    # Assign clusters
    for c in range(1, clusters+1):
        # For each of the original clusters
        # And that PTID is included in PTID
        cond = (synth_y == c)
        set_b = synth_data[cond].index.values
        set_b = set_b[np.in1d(set_b, boot_data.index.values)]
        max_js = 0.0
        for k in range(1, clusters+1):
            # Create new set of clusters
            cond = (y_it == k)
            set_it = boot_data[cond].index.values
            # set_it = set_it[np.in1d(set_it, boot_data.PTID.values)]
            # compute jaccard score between base assignation and given cluster
            inter = set([x for x in set_b if x in set_it])
            union = set(list(set_b) + list(set_it))
            js = float(len(inter) / len(union))
            # If larger, get it
            if js > max_js:
                max_js = js
        # If it dissolves, we want to record it
        if max_js < stab_limit:
            n_diss[c-1] += 1
        # Save jaccard scores
        j_coeff[c-1,i] = max_js
    
print('Computation finished')
for c in range(1,clusters+1):
    print('Cluster ' + str(c) + ': ' + str(np.mean(j_coeff[c-1,:])) + " Jaccard score.")
    print("It got dissolved " + str(n_diss[c-1]) + ", " + str((n_diss[c-1]/niterations)* 100) + "% of the time.")


Computation finished
Cluster 1: 0.0 Jaccard score.
It got dissolved 5.0, 100.0% of the time.
Cluster 2: 0.8191973424103122 Jaccard score.
It got dissolved 0.0, 0.0% of the time.
Cluster 3: 0.8711991889587798 Jaccard score.
It got dissolved 0.0, 0.0% of the time.
Cluster 4: 0.8481057921525675 Jaccard score.
It got dissolved 0.0, 0.0% of the time.
