Run a stack of different clustering algorithms from the scikit-learn (sklearn) library

Calculate the cophenetic correlation coefficient for the agglomerative (hierarchical) algorithms

print silhouette score of different models

Plot the results on the PCA-reduced data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.decomposition import PCA
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist
from sklearn import metrics



import os, glob, inspect, sys, time, warnings


# Put the parent of the current directory at the front of sys.path so that the
# project helper module `epri_mc_lib` can be imported from there.
currentdir = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe()))
)
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

from importlib import reload
import epri_mc_lib as mc

# Re-import the helper module (presumably so that edits to it are picked up
# without restarting the kernel).
reload(mc)

# Import data

In [None]:
# Load the merged, simulated tube/pipe table; the first CSV column becomes the index.
data_path = "../../Data/Merged_data"
df = pd.read_csv(
    os.path.join(data_path, 'ALL_TUBE_PIPE_simulated.csv'),
    index_col=0,
)


## Calculating new values

In [None]:
# Derive the 'AUC_avg' column from the A, B and p columns with the project
# helper, then remove those three input columns from the table in place.
df['AUC_avg'] = mc.findAUC(
    df,
    A=df['A'],
    B=df['B'],
    p=df['p'],
    name='AUC_avg',
)
df.drop(["A", "B", "p"], axis="columns", inplace=True)

In [None]:
# Disabled alternative, kept for reference: derive CF_perm from the unscaled
# columns and drop raw input columns. A 'CF_perm' column is instead added to
# the scaled table in a later cell.
#df['CF_perm'] = df['mean_CF']/df['mean_perm'].astype('float64')
#df.drop(columns=["mean_MBN","mean_CF"],inplace=True)

In [None]:
# Remove the two absorption-average columns from the table in place.
df.drop(
    ["Absorption_avg_500", "Absorption_avg_200"],
    axis="columns",
    inplace=True,
)

## Scaling values

In [None]:
# Scale the feature table through the project helper, handing it a MinMaxScaler;
# only element [0] of what the helper returns is kept.
# NOTE(review): presumably that element is the scaled DataFrame (later cells
# index it by column name) — confirm against mc.scale_general.
scaled_df = mc.scale_general(df, MinMaxScaler())[0]


## Selecting sub samples

In [None]:
# Split the scaled table into the four sub-samples returned by the project helper.
(
    tube,
    pipe,
    tube_wo_blind,
    tube_blind,
) = mc.get_subsample_df(scaled_df)

In [None]:
# Add the CF / permeability ratio as a new column of the scaled table.
# NOTE(review): the ratio is taken on already-scaled columns; if 'mean_perm'
# was min-max scaled its minimum is 0 and this division gives inf/NaN for that
# row — confirm this is intended.
scaled_df['CF_perm'] = (
    scaled_df['mean_CF'] / scaled_df['mean_perm'].astype('float64')
)

# Keep only the columns named in mc.correlation_list, then split that reduced
# table into the same four sub-samples as before.
corr_scaled_df = scaled_df.copy().loc[:, mc.correlation_list]
(
    tube_scaled_corr,
    pipe_scaled_corr,
    tube_wo_blind_scaled_corr,
    tube_blind_scaled_corr,
) = mc.get_subsample_df(corr_scaled_df)

In [None]:
# Keep only the columns named in mc.minimal_informative_features, then split
# that reduced table into the four sub-samples.
mini_scaled_df = scaled_df.copy().loc[:, mc.minimal_informative_features]
(
    tube_scaled_mini,
    pipe_scaled_mini,
    tube_wo_blind_scaled_mini,
    tube_blind_scaled_mini,
) = mc.get_subsample_df(mini_scaled_df)

# Cophenetic Correlation Coefficient 

In [None]:
# The closer the value is to 1, the better the clustering preserves the original distances.
def get_c(df):
    """Rank linkage/metric combinations by cophenetic correlation coefficient.

    For every pairing of a linkage method with a distance metric, a
    hierarchical clustering of ``df`` is built and compared with the pairwise
    distances of ``df`` under the same metric. The closer the coefficient is
    to 1, the better the clustering preserves the original distances.

    Combinations that SciPy rejects with a ``ValueError`` are skipped and do
    not appear in the result (for example 'centroid', 'median' and 'ward'
    linkage with a metric other than 'euclidean').

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Observation matrix (a DataFrame or a 2-D array).

    Returns
    -------
    pandas.DataFrame
        Columns 'linkage', 'distance metric' and 'C', one row per valid
        combination, sorted by 'C' in descending order.
    """
    linkage_names = ['single', 'average', 'weighted', 'centroid', 'median', 'ward']
    metric_names = ['chebyshev', 'cityblock', 'cosine', 'euclidean', 'minkowski', 'sqeuclidean']

    results = []
    for linkage_name in linkage_names:
        for metric_name in metric_names:
            try:
                Z = hierarchy.linkage(df, method=linkage_name, metric=metric_name)
            except ValueError:
                # Unsupported combination: skip it. Falling through here would
                # score the linkage left over from the previous iteration.
                continue
            c, _ = hierarchy.cophenet(Z, pdist(df, metric_name))
            results.append([linkage_name, metric_name, c])

    data = pd.DataFrame(results, columns=['linkage', 'distance metric', 'C'])
    return data.sort_values('C', ascending=False)

In [None]:
# Rank the linkage/metric combinations for the tube_wo_blind_scaled_mini
# sub-sample; this is the last expression of the cell, so the returned table
# is the cell's output.
get_c(tube_wo_blind_scaled_mini)

# Clustering

- **MeanShift**: [`MeanShift`](https://sklearn.org/modules/generated/sklearn.cluster.MeanShift.html#sklearn.cluster.MeanShift)

- **K-Means**:
[`KMeans`](https://sklearn.org/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans)

- **Agglomerative Hierarchical Clustering**:
[`AgglomerativeClustering`](https://sklearn.org/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering)

- **Ward**:
[`AgglomerativeClustering`](https://sklearn.org/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering)

- **Spectral Clustering**:
[`SpectralClustering`](https://sklearn.org/modules/generated/sklearn.cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering)

- **DBSCAN**:
[`DBSCAN`](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html)
    
- **OPTICS**:
[`OPTICS`](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html)

- **BIRCH**:
[`Birch`](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html)

- **Gaussian Mixture**:
[`GaussianMixture`](https://scikit-learn.org/stable/modules/mixture.html#gmm)

- **Bayesian Gaussian Mixture**:
[`BayesianGaussianMixture`](https://scikit-learn.org/stable/modules/mixture.html#gmm)


In [None]:
# Seed NumPy's global random generator and keep the same seed value around.
random_state = 42
np.random.seed(random_state)

# Default settings shared by all datasets; each entry in `datasets` below can
# override individual keys.
default_base = dict(
    # MeanShift: quantile handed to the bandwidth estimation
    quantile=.1,
    # DBSCAN: maximum distance between two samples
    eps=0.1,
    # DBSCAN: distance metric
    metric='chebyshev',
    # Affinity Propagation: extent to which the current value is maintained
    # relative to incoming values
    damping=.1,
    # Affinity Propagation: points with larger preference values are more
    # likely to be chosen as exemplars
    preference=-200,
    # number of neighbours in the k-neighbors graph used as connectivity matrix
    n_neighbors=3,
    # number of clusters / mixture components for the algorithms that take one;
    # for k-means the value was determined with the elbow method,
    # cf. model_kmeans_simulated.ipynb
    n_clusters=6,
    # OPTICS: minimum steepness on the reachability plot that constitutes a
    # cluster boundary
    xi=0.02,
    # OPTICS: minimum number of samples in a cluster, expressed as an absolute
    # number or a fraction of the number of samples
    min_cluster_size=0.007,
    # BIRCH: threshold parameter
    threshold=0.05,
    # seed value
    random_state=random_state,
)

# (data, parameter overrides) pairs to cluster; alternatives are kept commented out.
# NOTE(review): this rebinds `datasets`, hiding the module imported by
# `from sklearn import cluster, datasets, mixture`.
datasets = [
    #(tube, {})
    #(tube_scaled_mini, {}),
    #(tube_scaled_corr, {}),
    (tube_wo_blind_scaled_mini, {}),
]

In [None]:
# Fit every clustering algorithm on each dataset, collect the predicted labels
# in `labels` (one 'label_<name>' column per algorithm) and print the
# silhouette score of each model. `X`, `labels`, `results` and
# `clustering_algorithms` keep the values of the LAST dataset after the loop.
for i_dataset, (dataset, algo_params) in enumerate(datasets):
    # update parameters with dataset-specific values
    params = default_base.copy()
    params.update(algo_params)

    X = dataset

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

    # connectivity matrix for structured Ward (if need see jelly roll example)
    connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'], include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Create cluster objects
    # ============
    # The seed from `params` is passed to every estimator with random
    # initialisation so that re-running this cell gives the same result
    # regardless of the state of NumPy's global generator.
    ms = cluster.MeanShift(bandwidth=bandwidth)
    two_means = cluster.KMeans(n_clusters=params['n_clusters'],
                               random_state=params['random_state'])  # use minibatch if too heavy
    # NOTE(review): despite its name this model uses Ward linkage; it differs
    # from `ward` below only by the connectivity constraint — confirm intent.
    average_linkage = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward',
                                                      connectivity=connectivity)
    # The distance keyword is omitted on purpose: euclidean is the default and
    # the keyword's name differs between scikit-learn versions.
    ward = cluster.AgglomerativeClustering(linkage="ward", n_clusters=params['n_clusters'])
    spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack',
                                          affinity="nearest_neighbors",  # can also be rbf
                                          random_state=params['random_state'])
    dbscan = cluster.DBSCAN(eps=params['eps'], metric=params['metric'])
    #affinity_propagation = cluster.AffinityPropagation(damping=params['damping'], preference=params['preference'])
    optics = cluster.OPTICS(xi=params['xi'], min_cluster_size=params['min_cluster_size'])
    birch = cluster.Birch(n_clusters=params['n_clusters'], threshold=params['threshold'])
    gmm = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full',
                                  random_state=params['random_state'])
    bgmm = mixture.BayesianGaussianMixture(n_components=params['n_clusters'], covariance_type='full',
                                           random_state=params['random_state'])

    clustering_algorithms = (
        ('Kmeans', two_means),
        #('AffinityPropagation', affinity_propagation), EXCLUDED because of high complexity, most appropriate for small to medium sized datasets
        ('Ward', ward),
        ('AgglomerativeClustering', average_linkage),
        ('MeanShift', ms),
        ('SpectralClustering', spectral),
        ('DBSCAN', dbscan),
        ('OPTICS', optics),
        ('Birch', birch),
        ('GaussianMixture', gmm),
        ('BayesianGaussianMixture', bgmm)
    )

    labels = pd.DataFrame(index=X.index)
    for name, algorithm in clustering_algorithms:
        # catch warnings related to kneighbors_graph
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="the number of connected components of the " + "connectivity matrix is [0-9]{1,2}" +
                " > 1. Completing it to avoid stopping the tree early.",category=UserWarning)
            warnings.filterwarnings("ignore", message="Graph is not fully connected, spectral embedding" +
                " may not work as expected.", category=UserWarning)

            algorithm.fit(X)

        # Clusterers expose labels_ after fitting; the mixture models need predict().
        if hasattr(algorithm, 'labels_'):
            # builtin int: the np.int alias no longer exists in current NumPy
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)

        labels['label_' + name] = y_pred

        # silhouette_score raises unless 2 <= n_labels <= n_samples - 1, which
        # can happen e.g. when a density-based model puts everything in one
        # group; report that instead of aborting the remaining algorithms.
        n_labels = len(np.unique(y_pred))
        if 2 <= n_labels <= len(X) - 1:
            silhouette = metrics.silhouette_score(X, labels['label_' + name], metric='euclidean')
            print(name, ': silhouette score = ',silhouette)
        else:
            print(name, ': silhouette score undefined for', n_labels, 'distinct label(s)')
    results = pd.concat([X, labels], axis=1)

# Perform PCA to project all dimensions onto 2 components for plotting (n_components = 2)

In [None]:
# Project X onto its first two principal components and store each component
# as a column of `results` so the cluster labels can be plotted in 2-D.
pca = PCA(n_components=2, svd_solver='full')
transformed = pca.fit_transform(X)
results['PC 1'], results['PC 2'] = transformed[:, 0], transformed[:, 1]


In [None]:
# One scatter plot per clustering algorithm: the PCA projection in `results`,
# coloured by that algorithm's label column.
# Round the column count up so that an odd number of algorithms still gets a
# panel for every label column.
n_cols = int(np.ceil(len(clustering_algorithms) / 2))
fig, axes = plt.subplots(int(len(datasets) * 2), n_cols, figsize=(20, 7))
all_axes = axes.flatten()
for ax, col in zip(all_axes, labels.columns):
    sns.scatterplot(data=results, ax=ax,
                    x='PC 1', y='PC 2', s=10, hue=col)
    # Rebuild the legend from the plotted handles (drops the legend title).
    handles, lab = ax.get_legend_handles_labels()
    ax.legend(handles=handles, labels=lab)

    # Strip only the leading 'label_' part so that algorithm names containing
    # an underscore are not cut short.
    ax.set_title(col.split('_', 1)[1], size=16)

# Hide the panels that did not receive a plot.
for ax in all_axes[len(labels.columns):]:
    ax.set_visible(False)

# Adjust the layout once everything is drawn so titles and legends are taken
# into account.
plt.tight_layout(h_pad=5, w_pad=5)