# Bootstrapping clustering procedure

We use the bootstrap technique to train the clustering 1000 times, with different samples. This way, we should be able to obtain a better picture of the resulting space. The exact procedure is, for each iteration:
* Obtain a random subsampling of the data.
* Compute the clustering
* Calculate Jaccard coeficient between original clusters and new. Record highest Jaccard coeficient.
In the end, compute median of the jaccard coeficients. This procedure is similar to clusterboot() algorithm in R, to account for stability in the clustering and find if we are actually finding relevant clusters or not.

In [1]:
import numpy as np
import simlr_ad
import pandas as pd
from utils.data_utils import load_all_data
from utils.utils import compute_cimlr, feat_ranking, estimate_number_clusters
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


**Parameters**

In [2]:
# Parameters of the procedure
clusters = 4
stab_limit = 0.5 # if the stability of a said cluster is dissolved, it records.
rd_seed = 1714                                          # Random seed for experiment replication

# Paths
existing_cluster = True                               # Compute the clustering again or use an existing one
cluster_path = "results/cimlr4/cluster_data.csv"   # Path of the existing cluster, if applicable
covariate_path = "data/useddata_homo_abeta_plasma_meta.csv"                 # Path of the covariance data frame (.csv)
feature_path = "data/UCSDVOL.csv"                     # Path of the feature path (.csv)

# Parameters of the cluster creation
config_file = "configs/config_cimlr.ini"               # Configuration file for the clustering computation
output_directory_name = "bootstrap"

# Testing parameters


**Data loader**

In [3]:
covariate_data, cov_names, feature_data, feature_names = load_all_data(covariate_path, feature_path)

In [4]:
if existing_cluster:
    # Load existent
    c_data = pd.read_csv(cluster_path)
else:
    # Compute base clustering
    y_b, S, F, ydata, alpha = compute_simlr(
        np.array(covariate_data_new[cov_names]), clusters)


In [5]:
## Test outlier detection
from sklearn import svm
clf = svm.OneClassSVM(kernel="rbf")
clf.fit(covariate_data[cov_names])
y_pred = clf.predict(covariate_data[cov_names])
n_error_outliers = y_pred[y_pred == -1].size
print(n_error_outliers)

150


### Main Loop

In [29]:
from sklearn.cluster import KMeans
# array where the number of times a cluster is dissolved (Jaccard coeficient < stab_limit)
n_diss = np.zeros(clusters)
niterations=100
# array of arrays where all the coefficients obtained will be stored.
j_coeff = np.zeros((clusters,niterations))
# Base labels
for i in range(niterations):
    # Subsample
    boot_data = covariate_data.sample(n=len(covariate_data), replace=True)
    # Compute it
    y_it, S, F, ydata, alpha = compute_cimlr(
       np.array(boot_data[cov_names]), clusters)
    # y_it = np.random.randint(1,clusters+1, size=len(boot_data))
    # km = KMeans(n_clusters=clusters, random_state = rd_seed).fit(boot_data[cov_names])
    # y_it = km.labels_ + 1
    # Assign clusters
    for c in range(1, clusters+1):
        # For each of the original clusters
        # And that PTID is included in PTID
        cond = (c_data.C.values == c)
        set_b = c_data[cond].PTID.values
        set_b = set_b[np.in1d(set_b, boot_data.PTID.values)]
        max_js = 0.0
        for k in range(1, clusters+1):
            # Create new set of clusters
            cond = (y_it == k)
            set_it = boot_data[cond].PTID.values
            # set_it = set_it[np.in1d(set_it, boot_data.PTID.values)]
            # compute jaccard score between base assignation and given cluster
            inter = set([x for x in set_b if x in set_it])
            union = set(list(set_b) + list(set_it))
            js = float(len(inter) / len(union))
            # If larger, get it
            if js > max_js:
                max_js = js
        # If it dissolves, we want to record it
        if max_js < stab_limit:
            n_diss[c-1] += 1
        # Save jaccard scores
        j_coeff[c-1,i] = max_js
    
print('Computation finished')
for c in range(1,clusters+1):
    print('Cluster ' + str(c) + ': ' + str(np.mean(j_coeff[c-1,:])) + " Jaccard score.")
    print("It got dissolved " + str(n_diss[c-1]) + ", " + str((n_diss[c-1]/niterations)* 100) + "% of the time.")


KeyError: "['Alpha-1-Microglobulin (A1Micro) (ug/ml)'\n 'Alpha-2-Macroglobulin (A2Macro) (mg/mL)'\n 'Alpha-1-Antichymotrypsin (AACT) (ug/ml)'\n 'Alpha-1-Antitrypsin (AAT) (mg/mL)'\n 'Angiotensin-Converting Enzyme (ACE) (ng/ml)' 'Adiponectin (ug/mL)'\n 'Alpha-Fetoprotein (AFP) (ng/mL)' 'Agouti-Related Protein (AGRP) (pg/mL)'\n 'Angiopoietin-2 (ANG-2) (ng/mL)' 'Angiotensinogen (ng/mL)'\n 'Apolipoprotein A-I (Apo A-I) (mg/mL)'\n 'Apolipoprotein A-II (Apo A-II) (ng/ml)'\n 'Apolipoprotein A-IV (Apo A-IV) (ug/ml)'\n 'Apolipoprotein B (Apo B) (ug/ml)' 'Apolipoprotein C-I (Apo C-I) (ng/ml)'\n 'Apolipoprotein C-III (Apo C-III) (ug/mL)'\n 'Apolipoprotein D (Apo D) (ug/ml)' 'Apolipoprotein E (Apo E) (ug/ml)'\n 'Apolipoprotein H (Apo H) (ug/mL)'\n 'AXL Receptor Tyrosine Kinase (AXL) (ng/mL)'\n 'Beta-2-Microglobulin (B2M) (ug/mL)'\n 'Brain-Derived Neurotrophic Factor (BDNF) (ng/mL)'\n 'B Lymphocyte Chemoattractant (BLC) (pg/ml)'\n 'Bone Morphogenetic Protein 6 (BMP-6) (ng/mL)'\n 'Brain Natriuretic Peptide  (BNP) (pg/ml)' 'Betacellulin (BTC) (pg/mL)'\n 'Complement C3 (C3) (mg/mL)' 'Cancer Antigen 19-9 (CA-19-9) (U/mL)'\n 'Calcitonin (pg/mL)' 'CD 40 antigen (CD40) (ng/mL)'\n 'CD40 Ligand (CD40-L) (ng/mL)' 'CD5 (CD5L) (ng/ml)'\n 'Carcinoembryonic Antigen (CEA) (ng/mL)' 'Chromogranin-A (CgA) (ng/mL)'\n 'Creatine Kinase-MB (CK-MB) (ng/mL)' 'Clusterin (CLU) (ug/ml)'\n 'Ciliary Neurotrophic Factor (CNTF) (pg/mL)'\n 'Complement Factor H (ug/ml)' 'Cortisol (Cortisol) (ng/ml)'\n 'C-peptide (ng/ml)' 'C-Reactive Protein (CRP) (ug/mL)'\n 'Cystatin-C (ng/ml)' 'Epidermal Growth Factor (EGF) (pg/mL)'\n 'Epidermal Growth Factor Receptor (EGFR) (ng/mL)'\n 'Epithelial-Derived Neutrophil-Activating (ng/mL)' 'Eotaxin-1 (pg/mL)'\n 'Eotaxin-3 (pg/mL)' 'E-Selectin (ng/mL)'\n 'Fatty Acid-Binding Protein- heart  (FABP (ng/mL)' 'Factor VII (ng/mL)'\n 'FASLG Receptor (FAS) (ng/mL)' 'Fas Ligand (FasL) (pg/mL)'\n 'Fetuin-A (ug/ml)' 'Fibroblast Growth Factor 4 (FGF-4) (pg/mL)'\n 'Fibrinogen (mg/mL)' 'Ferritin (FRTN) (ng/mL)'\n 'Follicle-Stimulating Hormone (FSH) (mIU/mL)'\n 'Growth Hormone (GH) (ng/mL)'\n 'Growth-Regulated alpha protein (GRO-alph (pg/mL)'\n 'Glutathione S-Transferase alpha (GST-alp (ng/ml)' 'Haptoglobin (mg/mL)'\n 'Heparin-Binding EGF-Like Growth Factor ( (pg/mL)'\n 'Chemokine CC-4 (HCC-4) (ng/mL)' 'Hepatocyte Growth Factor (HGF) (ng/mL)'\n 'T Lymphocyte-Secreted Protein I-309 (I-3 (pg/mL)'\n 'Intercellular Adhesion Molecule 1 (ICAM- (ng/mL)'\n 'Immunoglobulin A (IgA) (mg/mL)' 'Immunoglobulin E (IgE) (ng/mL)'\n 'Insulin-like Growth Factor-Binding Prote (ng/mL)'\n 'Immunoglobulin M (IGM) (mg/mL)' 'Interleukin-13 (IL-13) (pg/mL)'\n 'Interleukin-16 (IL-16) (pg/mL)' 'Interleukin-18 (IL-18) (pg/mL)'\n 'Interleukin-3 (IL-3) (ng/mL)' 'Interleukin-6 receptor (IL-6r) (ng/mL)'\n 'Interleukin-8 (IL-8) (pg/mL)' 'Insulin (uIU/mL)'\n 'Interferon gamma Induced Protein 10 (IP- (pg/ml)'\n 'Kidney Injury Molecule-1  (KIM-1) (ng/ml)' 'Leptin (ng/mL)'\n 'Luteinizing Hormone  (LH) (mIU/mL)' 'Apolipoprotein(a) (Lp(a)) (ug/mL)'\n 'Monocyte Chemotactic Protein 1 (MCP-1) (pg/mL)'\n 'Monocyte Chemotactic Protein 2 (MCP-2) (pg/ml)'\n 'Monocyte Chemotactic Protein 3 (MCP-3) (pg/mL)'\n 'Monocyte Chemotactic Protein 4 (MCP-4) (pg/ml)'\n 'Macrophage Colony-Stimulating Factor 1 ( (ng/mL)'\n 'Macrophage-Derived Chemokine (MDC) (pg/mL)'\n 'Macrophage Migration Inhibitory Factor ( (ng/mL)'\n 'Monokine Induced by Gamma Interferon (MI (pg/ml)'\n 'Macrophage Inflammatory Protein-1 alpha (pg/mL)'\n 'Macrophage Inflammatory Protein-1 beta ( (pg/mL)'\n 'Macrophage Inflammatory Protein-3 alpha (pg/ml)'\n 'Matrix Metalloproteinase-1 (MMP-1) (ng/ml)'\n 'Matrix Metalloproteinase-10 (MMP-10) (ng/ml)'\n 'Matrix Metalloproteinase-2 (MMP-2) (ng/mL)'\n 'Matrix Metalloproteinase-7 (MMP-7) (ng/ml)'\n 'Matrix Metalloproteinase-9 (MMP-9) (ng/mL)'\n 'Matrix Metalloproteinase-9- total (MMP-9 (ng/ml)'\n 'Myeloid Progenitor Inhibitory Factor 1 ( (ng/mL)'\n 'Myeloperoxidase (MPO) (ng/mL)' 'Myoglobin (ng/mL)'\n 'Neutrophil Gelatinase-Associated Lipocal (ng/ml)'\n 'Neuronal Cell Adhesion Molecule (Nr-CAM) (ng/mL)' 'Osteopontin (ng/ml)'\n 'Plasminogen Activator Inhibitor 1 (PAI-1 (ng/mL)'\n 'Prostatic Acid Phosphatase (PAP) (ng/mL)'\n 'Pregnancy-Associated Plasma Protein A (P (mIU/mL)'\n 'Pulmonary and Activation-Regulated Chemo (ng/mL)'\n 'Platelet-Derived Growth Factor BB (PDGF- (pg/ml)'\n 'Placenta Growth Factor (PLGF) (pg/ml)'\n 'Pancreatic Polypeptide (PPP) (pg/ml)' 'Prolactin (PRL) (ng/ml)'\n 'Proinsulin- Intact (pM)' 'Proinsulin- Total (pM)'\n 'Peptide YY (PYY) (pg/mL)'\n 'Receptor for advanced glycosylation end (ng/mL)'\n 'T-Cell-Specific Protein RANTES (RANTES) (ng/mL)' 'Resistin (ng/ml)'\n 'Serum Amyloid P-Component (SAP) (ug/mL)'\n 'Stem Cell Factor (SCF) (pg/mL)'\n 'Serum Glutamic Oxaloacetic Transaminase (ug/mL)'\n 'Sex Hormone-Binding Globulin (SHBG) (nmol/L)'\n 'Superoxide Dismutase 1- Soluble (SOD-1) (ng/mL)' 'Sortilin (ng/mL)'\n 'Thyroxine-Binding Globulin (TBG) (ug/mL)'\n 'Thymus-Expressed Chemokine (TECK) (ng/mL)' 'Testosterone- Total (ng/ml)'\n 'Trefoil Factor 3 (TFF3) (ug/ml)'\n 'Tamm-Horsfall Urinary Glycoprotein (THP) (ug/ml)'\n 'Thrombospondin-1 (ng/mL)'\n 'Tissue Inhibitor of Metalloproteinases 1 (ng/mL)'\n 'Thrombomodulin (TM) (ng/ml)' 'Tenascin-C (TN-C) (ng/mL)'\n 'Tumor Necrosis Factor alpha (TNF-alpha) (pg/mL)'\n 'Tumor Necrosis Factor Receptor-Like 2 (T (ng/mL)'\n 'Thrombopoietin (ng/mL)'\n 'TNF-Related Apoptosis-Inducing Ligand Re (ng/mL)'\n 'Serotransferrin (Transferrin) (mg/dl)'\n 'Thyroid-Stimulating Hormone  (TSH) (uIU/mL)'\n 'Transthyretin (TTR) (mg/dl)'\n 'Vascular Cell Adhesion Molecule-1 (VCAM- (ng/mL)'\n 'Vascular Endothelial Growth Factor (VEGF (pg/mL)' 'Vitronectin (ug/ml)'\n 'Vitamin K-Dependent Protein S (VKDPS) (ug/ml)'\n 'von Willebrand Factor (vWF) (ug/mL)' 'HCAMPLAS' 'AB40' 'AB42'\n 'PLASMA_NFL' 'Alanine' 'Arginine' 'Asparagine' 'Aspartic Acid'\n 'Citrulline' 'Glutamine' 'Glutamic Acid' 'Glycine' 'Isoleucine' 'Lysine'\n 'Methionine' 'Ornithine' 'Phenylalanine' 'Proline' 'Serine' 'Threonine'\n 'Tryptophan' 'Tyrosine' 'Valine' 'Kynurenine' 'Sarcosine'\n 'trans isomer of 4-hydroxyproline '] not in index"

Now do the same procedure, but with synthetic data. This way, we can directly compare with data that is well separated.

In [52]:
## Create synthetic data of the same size
n_samples = []
for c in range(0, clusters):
    n_samples.append(len(c_data[c_data.C.values == c]))

max_samples = max(n_samples)

from sklearn.datasets import make_blobs
# Create a blob for each cluster with the corresponding number of samples
X, y = make_blobs(n_samples=max_samples*clusters, n_features = len(cov_names), centers = clusters, cluster_std=20.0)
# For each cluster, select only as many elements as members of the cluster
synth_X = []
synth_y = []
for c in range(0, clusters):
    curr_items = X[y==c]
    print(curr_items.shape)
    curr_items = curr_items[:n_samples[c],:]
    print(curr_items.shape)
    synth_X.append(curr_items)
    synth_y += ([c+1] * n_samples[c])

synth_X = np.concatenate((synth_X[0],synth_X[1],synth_X[2],synth_X[3]))
synth_data = pd.DataFrame(synth_X)
print(synth_data.shape)
synth_y = np.array(synth_y)

(82, 172)
(82, 172)
(82, 172)
(77, 172)
(82, 172)
(61, 172)
(82, 172)
(78, 172)
(298, 172)


In [56]:
from sklearn.cluster import KMeans
# array where the number of times a cluster is dissolved (Jaccard coeficient < stab_limit)
n_diss = np.zeros(clusters)
niterations=5
# array of arrays where all the coefficients obtained will be stored.
j_coeff = np.zeros((clusters,niterations))
# Base labels
for i in range(niterations):
    # Subsample
    boot_data = synth_data.sample(n=len(synth_data), replace=True)
    # Compute it
    # y_it, S, F, ydata, alpha = compute_cimlr(
    #   np.array(boot_data), clusters)
    # y_it = np.random.randint(1,clusters+1, size=len(boot_data))
    km = KMeans(n_clusters=clusters, random_state = rd_seed).fit(boot_data)
    y_it = km.labels_ + 1
    # Assign clusters
    for c in range(1, clusters+1):
        # For each of the original clusters
        # And that PTID is included in PTID
        cond = (synth_y == c)
        set_b = synth_data[cond].index.values
        set_b = set_b[np.in1d(set_b, boot_data.index.values)]
        max_js = 0.0
        for k in range(1, clusters+1):
            # Create new set of clusters
            cond = (y_it == k)
            set_it = boot_data[cond].index.values
            # set_it = set_it[np.in1d(set_it, boot_data.PTID.values)]
            # compute jaccard score between base assignation and given cluster
            inter = set([x for x in set_b if x in set_it])
            union = set(list(set_b) + list(set_it))
            js = float(len(inter) / len(union))
            # If larger, get it
            if js > max_js:
                max_js = js
        # If it dissolves, we want to record it
        if max_js < stab_limit:
            n_diss[c-1] += 1
        # Save jaccard scores
        j_coeff[c-1,i] = max_js
    
print('Computation finished')
for c in range(1,clusters+1):
    print('Cluster ' + str(c) + ': ' + str(np.mean(j_coeff[c-1,:])) + " Jaccard score.")
    print("It got dissolved " + str(n_diss[c-1]) + ", " + str((n_diss[c-1]/niterations)* 100) + "% of the time.")


Computation finished
Cluster 1: 0.8872770970202504 Jaccard score.
It got dissolved 0.0, 0.0% of the time.
Cluster 2: 0.7662575160688367 Jaccard score.
It got dissolved 1.0, 20.0% of the time.
Cluster 3: 0.8624996280315429 Jaccard score.
It got dissolved 0.0, 0.0% of the time.
Cluster 4: 0.7616256355242588 Jaccard score.
It got dissolved 0.0, 0.0% of the time.


In [51]:
Computation finished
Cluster 1: 0.23410747106399282 Jaccard score.
It got dissolved 5.0, 100.0% of the time.
Cluster 2: 0.21395057141481763 Jaccard score.
It got dissolved 5.0, 100.0% of the time.
Cluster 3: 0.23569497274249546 Jaccard score.
It got dissolved 5.0, 100.0% of the time.
Cluster 4: 0.253058868116535 Jaccard score.
It got dissolved 5.0, 100.0% of the time.


SyntaxError: invalid syntax (<ipython-input-51-8dd583973a0d>, line 1)