[Chapter 4] Optimal Clustering

In [None]:
# a clustering problem consists of a set of objects and a set of features associated with those objects (unsupervised learning)
# the goal is to separate the objects into groups (called clusters) using the features,
#           such that intragroup similarities are maximized and intergroup similarities are minimized

# 2 main classes of clustering: partitional (one-level/un-nested) and hierarchical (nested sequence of partitions)
# types of clustering algorithms: connectivity (e.g. hierarchical clustering), centroids (e.g. k-means), distribution,
#                      density (e.g. DBSCAN, OPTICS), subspace (clusters on two dimensions, features and observations, e.g. bi/co-clustering)

import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples

base clustering

In [7]:
def clusterKMeansBase(corr0, maxNumClusters=10, n_init=10):
    """
    Cluster the variables of a correlation matrix with k-means and keep the
    partition with the best silhouette-based quality score.

    The correlation matrix is converted to the distance matrix
    x = sqrt((1 - corr) / 2), and each row of x is used as the observation
    vector of one variable. For every random restart and every candidate
    number of clusters k, k-means is fitted and scored with
    q = mean(silhouette) / std(silhouette); the fit with the highest q wins.

    Parameters
    ----------
    corr0 : pd.DataFrame
        Square correlation matrix. NaN entries are treated as 0 correlation.
    maxNumClusters : int, default 10
        Largest number of clusters to try. Values above
        (number of observations - 1) are clamped to that limit, because the
        silhouette coefficient is undefined beyond it.
    n_init : int, default 10
        Number of random restarts of the whole search over k.

    Returns
    -------
    corr1 : pd.DataFrame
        corr0 with rows and columns reordered so that members of the same
        cluster are adjacent.
    clstrs : dict
        Maps each cluster label to the list of corr0 column names in it.
    silh : pd.Series
        Silhouette coefficient of every observation for the selected
        clustering, indexed like corr0.

    Raises
    ------
    ValueError
        If n_init < 1, or if fewer than 2 clusters can be evaluated
        (maxNumClusters < 2 or corr0 has fewer than 3 rows).
    """
    if n_init < 1:
        raise ValueError("n_init must be at least 1, got %r" % (n_init,))

    # silhouette scoring needs 2 <= number of clusters <= number of observations - 1
    maxK = min(maxNumClusters, corr0.shape[0] - 1)
    if maxK < 2:
        raise ValueError(
            "need maxNumClusters >= 2 and at least 3 observations, "
            "got maxNumClusters=%r and %d observations" % (maxNumClusters, corr0.shape[0])
        )

    # derive observation matrix x as distance matrix
    # clip at 0 so correlations marginally above 1 (rounding) do not produce NaN under the square root
    x = ((1 - corr0.fillna(0)) / 2.).clip(lower=0) ** .5    # corr=1, x=0; corr=-1, x=1

    bestQ, silh, kmeans = np.nan, None, None    # NaN means "no clustering selected yet"

    for _ in range(n_init):  # k-means uses multiple random initializations to avoid local optima
        for k in range(2, maxK + 1):  # loop over candidate numbers of clusters

            # k-means clustering; a single initialization here, restarts are handled by the outer loop
            # (the n_jobs argument was removed from KMeans in scikit-learn 1.0)
            kmeans_ = KMeans(n_clusters=k, n_init=1).fit(x)

            #  silhouette coef compares intracluster distance and intercluster distance
            #       Si = (b_i-a_i)/max{a_i, b_i}
            #       - a_i is the avg distance between i and all other elements in the same cluster
            #       - b_i is the avg distance between i and all the elements in the nearest cluster which i is not a member
            #       - Si = 1 means i was clustered well and -1 means i was clustered poorly
            silh_ = silhouette_samples(x, kmeans_.labels_)

            # clustering quality q = mean(silh)/std(silh); keep the clustering with the highest q
            q_ = silh_.mean() / silh_.std()
            if np.isnan(bestQ) or q_ > bestQ:
                bestQ, silh, kmeans = q_, silh_, kmeans_

    # reorder correlation matrix based on kmeans clustering
    newIdx = np.argsort(kmeans.labels_)
    corr1 = corr0.iloc[newIdx]  # reorder rows
    corr1 = corr1.iloc[:, newIdx]  # reorder columns

    # extract/output clustering info
    clstrs = {i: corr0.columns[np.where(kmeans.labels_ == i)[0]].tolist()
              for i in np.unique(kmeans.labels_)}   # cluster members

    silh = pd.Series(silh, index=x.index)

    return corr1, clstrs, silh

higher-level clustering