## Load functions

In [1]:
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import fcluster
# test data
from sklearn.datasets import load_iris
from sklearn.metrics import silhouette_score
from utils import vis_tsne, plot_elbow, vis_umap

In [2]:
def fancy_dendrogram(*args, **kwargs):
    """Draw a SciPy dendrogram and annotate the height of its merges.

    Thin wrapper around ``scipy.cluster.hierarchy.dendrogram``: every
    positional and keyword argument is forwarded unchanged, except for the
    two extra keywords documented below, which are consumed here.

    Parameters
    ----------
    max_d : float, optional
        Cut-off distance. When given (including ``0``), a horizontal black
        line is drawn at this height and, unless the caller passed
        ``color_threshold`` explicitly, it is also used as the colour
        threshold of the dendrogram.
    annotate_above : float, optional
        Only merges whose height is strictly greater than this value get a
        marker and a numeric label. Defaults to ``0``.

    Returns
    -------
    dict
        The dictionary returned by ``dendrogram``.
    """
    max_d = kwargs.pop('max_d', None)
    # Compare against None rather than relying on truthiness, so that a
    # cut-off of 0 is honoured instead of being silently dropped.
    has_cutoff = max_d is not None
    if has_cutoff and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)

    # With no_plot=True dendrogram only computes the layout; skip decoration.
    if kwargs.get('no_plot', False):
        return ddata

    # truncate_mode is the third positional parameter of dendrogram, so it
    # may arrive either positionally or by keyword.
    truncate_mode = kwargs.get('truncate_mode',
                               args[2] if len(args) > 2 else None)
    title = 'Hierarchical Clustering Dendrogram'
    # Only claim truncation when it was requested ('none' is SciPy's legacy
    # alias for None).
    if truncate_mode not in (None, 'none'):
        title += ' (truncated)'
    plt.title(title)
    plt.xlabel('sample index or (cluster size)')
    plt.ylabel('distance')

    links = zip(ddata['icoord'], ddata['dcoord'], ddata['color_list'])
    for xs, ys, colour in links:
        # Each link is described by four points; points 1 and 2 are the ends
        # of its horizontal bar, so their midpoint marks the merge itself.
        x = 0.5 * sum(xs[1:3])
        y = ys[1]
        if y > annotate_above:
            plt.plot(x, y, 'o', c=colour)
            plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                         textcoords='offset points',
                         va='top', ha='center')
    if has_cutoff:
        plt.axhline(y=max_d, c='k')
    return ddata

def plot_dendro(X, link_method, annotate_threshold, hline):
    """Build a hierarchical clustering of ``X`` and show its dendrogram.

    X: observations to cluster; its first dimension is the number of samples.
    link_method: linkage method handed to SciPy's ``linkage``,
        e.g. 'single' or 'ward'.
    annotate_threshold: merges above this height are annotated
        (forwarded as ``annotate_above``).
    hline: cut-off distance forwarded as ``max_d``.
    """
    n_samples = np.shape(X)[0]
    linkage_matrix = linkage(X, link_method)

    # Collect the drawing options first so the call itself stays short.
    dendro_options = {
        'orientation': 'top',
        'labels': range(n_samples),
        'distance_sort': 'descending',
        'show_leaf_counts': True,
        'max_d': hline,
        'annotate_above': annotate_threshold,
    }
    fancy_dendrogram(linkage_matrix, **dendro_options)
    plt.show()

def dendro_cluster(X, threshold, link_method, view=False):
    """Cut a hierarchical clustering of ``X`` into flat clusters.

    X: observations to cluster.
    threshold: distance at which the tree is cut (``criterion='distance'``).
    link_method: linkage method handed to SciPy's ``linkage``.
    view: when true, also pass the data and labels to ``vis_tsne`` and
        ``vis_umap``.

    Returns the array of flat cluster labels produced by ``fcluster``.
    """
    tree = linkage(X, method=link_method)
    labels = fcluster(tree, threshold, criterion='distance')
    if not view:
        return labels

    vis_tsne(X, labels)
    vis_umap(X, labels)
    return labels

## Plot dendrogram

In [3]:
# Alternative input (the Iris data set), currently disabled:
#X = load_iris().data
# Load the data from a comma-delimited text file; X and link_method are
# reused by the cells below.
X = np.loadtxt("data/abalone.test.nld",delimiter=',')
link_method = 'ward'
# Only merges higher than this distance are annotated on the dendrogram.
annotate_threshold = 5
# Distance at which the horizontal cut line is drawn.
hline = 5
plot_dendro(X, link_method, annotate_threshold, hline)

OSError: data/abalone.test.nld not found.

## Calculate the Silhouette score

In [None]:
# Candidate distance cutoffs, tried from the coarsest to the finest.
cut_off_list = [25, 10, 5, 2, 1]
silhouette_score_list = []
for i_cut in cut_off_list:
    # Cluster at this cutoff without producing the t-SNE/UMAP views.
    membership = dendro_cluster(X, i_cut, link_method, view=False)
    # NOTE(review): silhouette_score raises ValueError when the labels form
    # a single cluster or one cluster per sample -- confirm that every
    # cutoff above yields a valid number of clusters for this data.
    tmp_score = silhouette_score(X, membership)
    silhouette_score_list.append(tmp_score)
    print("For cutoff = {}, silhouette score is {}".format(i_cut, tmp_score))

# Presumably the first two arguments are the x-range (1 .. number of
# cutoffs) rather than the cutoff values themselves -- verify against
# utils.plot_elbow.
plot_elbow(1, len(cut_off_list)+1, silhouette_score_list,'Silhouette', 'Silhouette score plot')

## Choose the optimal cutoff and visualize clusters

In [None]:
# Cutoff selected by hand; change it after inspecting the scores above.
cut_off = 2
# view=True additionally passes the data and labels to vis_tsne and vis_umap.
membership = dendro_cluster(X, cut_off, link_method, view=True)
print(membership)

## Save cluster memberships

In [None]:
# Write the cluster labels to a .npy file; the path is relative, so the file
# lands in the current working directory.
np.save("hierarchical_membership.npy",membership)