### Implementing this paper: https://web.stanford.edu/~hastie/Papers/gap.pdf

In [147]:
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
import itertools

# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

In [200]:
# helper functions
def plot_pca(X):
    """Scatter-plot the samples of ``X`` on their first two principal components.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data matrix with one sample per row, which is the layout of the
        arrays this file passes in (``make_blobs`` output and
        ``np.random.normal(size=(1000, 5))``).

    Returns
    -------
    None
        The figure is displayed with ``show``; nothing is returned.
    """
    pca = PCA(n_components=2, random_state=0)
    # Fit on X itself (rows = samples) and plot the projected samples.
    # Fitting on the transpose treats every feature as a sample, centres the
    # data along the wrong axis, and reports an explained-variance ratio that
    # does not describe the original data set.
    scores = pca.fit_transform(X)
    x1, x2 = scores[:, 0], scores[:, 1]
    variance_explained = sum(pca.explained_variance_ratio_) * 100
    title = "first two components explained {0: .1f} % of variance".format(variance_explained)
    p = figure(title=title)
    p.scatter(x1, x2)
    show(p)
    
def calculate_wk(X, y):
    """Sum ``mean_ss`` over the groups of rows of ``X`` defined by the labels ``y``.

    For every distinct value in ``y`` the rows of ``X`` whose label equals
    that value are selected with a boolean mask and passed to ``mean_ss``;
    the per-group results are added together.

    Assumes ``X`` and ``y`` are NumPy arrays of matching length, since the
    selection relies on ``X[y == label]`` -- confirm against callers.
    """
    total = 0
    for label in set(y):
        members = X[y == label]
        total += mean_ss(members)
    return total

def mean_ss(points):
    """Return the summed pairwise distance of ``points`` divided by ``2 * len(points)``.

    Every unordered pair of points contributes its plain (non-squared)
    Euclidean distance once; the total is then divided by twice the number
    of points.

    Parameters
    ----------
    points : array-like of shape (n_points, n_features)
        One point per row.

    Returns
    -------
    float
        The scaled sum of pairwise distances. ``0.0`` when fewer than two
        points are given, because there are no pairs to measure (this also
        avoids a division by zero for an empty group).
    """
    # Local import so this function does not depend on a file-level import change.
    from scipy.spatial.distance import pdist

    n_points = len(points)
    if n_points < 2:
        return 0.0
    # pdist computes all n*(n-1)/2 unordered pair distances in compiled code,
    # replacing a Python-level loop that called euclidean() once per pair.
    pairwise = pdist(np.asarray(points, dtype=float), metric="euclidean")
    return float(pairwise.sum()) / (2 * n_points)

In [201]:
# Simulate 1000 five-dimensional points scattered around 4 cluster centers.
X_cluster, y_cluster = make_blobs(
    centers=4,
    n_features=5,
    n_samples=1000,
)
plot_pca(X_cluster)

In [202]:
# Simulate the reference ("null") data set: every feature is drawn uniformly
# over the range observed for that feature in X_cluster, which is the
# reference distribution the gap-statistic paper prescribes. A standard normal
# sample ignores the location and scale of the observed data, so the
# dispersion comparison would depend on how the blobs happen to be scaled.
X_uniform = np.random.uniform(
    low=X_cluster.min(axis=0),
    high=X_cluster.max(axis=0),
    size=X_cluster.shape,
)
plot_pca(X_uniform)

In [203]:
# For each candidate number of clusters, run KMeans on both data sets and
# record the dispersion that calculate_wk reports for the resulting labels.
RANGE = range(1, 11)
Wk, E_Wk = [], []
for n_cluster in RANGE:
    # Dispersion of X_cluster when partitioned into n_cluster groups.
    y_obs = KMeans(n_clusters=n_cluster).fit_predict(X_cluster)
    observed_wk = calculate_wk(X_cluster, y_obs)
    Wk.append(observed_wk)

    # Same computation on the X_uniform data set.
    y_exp = KMeans(n_clusters=n_cluster).fit_predict(X_uniform)
    expected_wk = calculate_wk(X_uniform, y_exp)
    E_Wk.append(expected_wk)

In [204]:
# Gap per cluster count: log of the X_uniform dispersion minus log of the
# X_cluster dispersion, drawn as a line with square markers on each point.
log_expected = np.log(np.array(E_Wk))
log_observed = np.log(np.array(Wk))
gap = log_expected - log_observed

p = figure()
p.line(RANGE, gap)
p.square(RANGE, gap)
show(p)