# Importing libraries...
  * #### Cell #1 imports the essential matplotlib modules for displaying figures outside the Jupyter cell
  * #### Cell #2 imports the essential pandas, numpy and scipy modules for our computations (including the agglomerative clustering)

In [1]:
import PyQt5
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib import style;  style.use('ggplot')
# Switch matplotlib to the Qt backend so figures open in their own windows
# instead of inline. `run_line_magic` is the supported replacement for the
# deprecated `InteractiveShell.magic` call.
get_ipython().run_line_magic('matplotlib', 'qt')

In [10]:
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster

# Loading Processed Data Matrix...

In [4]:
# Read the pre-processed user-feature matrix written by the preprocessing step.
# NOTE(review): the file name suggests the features are standard-scaled -- confirm.
X = np.load(file='comp-data/1-preprocessing-comp-data/user-feature-set-stdscl.npy')

# Generating the Hierarchical Clustering Dendrogram...
  * #### Using Complete Linkage Method

In [5]:
# Build the agglomerative linkage matrix for X using the complete-linkage
# (farthest-point) merge rule; scipy's default Euclidean metric is used.
ZC = linkage(X, method='complete')

In [7]:
# Draw the full (untruncated) complete-linkage dendrogram on figure 1.
plt.figure(num=1, figsize=(25, 10))
plt.ylabel('distance')
plt.xlabel('X[i]')
plt.title('Hierarchical Clustering Dendrogram -- Complete-Linkage')
# Rotate the leaf labels by 90 degrees and render them at font size 8.
dendrogram(ZC, leaf_rotation=90.0, leaf_font_size=8.0)
plt.show()

In [6]:
plt.figure(2, figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram -- Complete-Linkage (truncated)')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    ZC,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=20,  # show only the last p merged clusters
    show_leaf_counts=False,  # otherwise numbers in brackets are counts
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,  # to get a distribution impression in truncated branches
)
plt.show()

# Getting the -optimal- number of Clusters... (k-means elbow method)
  * #### As seen in the above dendrogram, "trimming" the tree at different distance thresholds leaves us with anywhere from 2 to 9 clusters.
  * #### Running k-means for k = 2, 3, 4, 5, 6, 7, 8, 9 (and optionally 10) and applying the elbow method should tell us the optimal number of clusters.

  * ### Importing the essential sklearn and scipy modules for k-means and the elbow method

In [9]:
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist

In [10]:
# Determine the optimal k for k-means with the elbow method: for every
# candidate k, fit k-means and record the mean Euclidean distance from each
# sample to its nearest cluster centre (the "distortion").
distortions = []
K = list(range(1, 11))
for k in tqdm(K):
    # `n_jobs` and `precompute_distances` are intentionally not passed: both
    # were deprecated in scikit-learn 0.23 and later removed from KMeans, so
    # passing them fails on current releases.
    kmeanTest = KMeans(n_clusters=k, n_init=20, random_state=0, verbose=2)
    # Fit exactly once; with a fixed random_state a second fit only repeats
    # the same work and doubles the runtime.
    kmeanTest.fit(X)
    nearest_centre_dist = cdist(X, kmeanTest.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_centre_dist.sum() / X.shape[0])

# Plot the elbow on its own figure. Figure number 2 already belongs to the
# truncated dendrogram, and re-using it would draw the curve onto that figure.
plt.figure(3, figsize=(25, 10))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:10<00:00,  7.05s/it]


 * #### From the elbow method we saw that the optimal number of clusters is 4 (the same number that BSAS suggested)
 * #### So, we trim the dendrogram at max_d = 6.10

In [8]:
# Cut the complete-linkage tree at distance max_d to obtain flat cluster labels
# (one integer label per row of the linkage input).
max_d = 6.1
clusters_ = fcluster(ZC, t=max_d, criterion='distance')
clusters_

array([1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 4,
       2, 2, 3, 2, 2, 2, 1, 2, 3, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2, 4, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 1, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 1, 2, 4, 2, 1, 4, 2, 4, 2,
       4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 1, 2, 4, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 4, 2, 2,
       2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 4,
       1, 2, 2, 2, 2, 4, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4,
       2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 3, 2, 2, 2, 2,
       4, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 2, 2, 4, 4, 2, 2,
       2, 2,

In [18]:
# Compute one centroid per cluster: attach each sample's cluster label as an
# extra column, then average every feature within each label.
tmp = pd.DataFrame(X)
# Use the first unused integer column label for the cluster ids. This is 19
# for the 19-feature matrix, but a hard-coded 19 would silently overwrite a
# real feature column if X had more columns.
label_col = X.shape[1]
tmp[label_col] = clusters_

centroids_ = tmp.groupby([label_col]).mean()
centroids_

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
19,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,-0.7749,0.347888,-0.047486,-0.375541,-0.028664,2.411857,-0.427876,-0.742729,1.860261,-0.680843,-0.688428,-0.540042,-0.255504,-0.468997,1.273786,-0.191637,0.258992,-0.243843,-0.686293
2,-0.866079,0.888199,0.018012,-0.638964,-0.453794,1.13058,-0.115776,-0.798859,2.826348,-0.797645,-0.689839,-0.535611,-0.548579,-0.339864,0.724513,0.047952,0.911283,-0.012656,-0.74922
3,-0.977285,2.019809,1.043577,-0.472172,-0.056826,1.520509,-0.567352,-0.947458,1.112684,-0.808942,-0.91202,-0.672844,-0.530786,-0.735786,0.584859,0.776066,0.5803,-0.100289,-0.856044
4,-0.845902,1.91072,0.145498,-0.723954,-0.530085,0.319899,-0.149105,-0.827516,1.245113,-0.800132,-0.702762,-0.049504,-0.671629,-0.014798,0.246837,0.245658,2.091967,-0.105131,-0.785176


# Saving the Computed Results...

In [14]:
# Persist the flat cluster label of every sample for the later chapters.
np.save(file='comp-data/3b-hierarchical-clustering-comp-data/clusters_.npy', arr=clusters_)

In [19]:
# Persist the per-cluster centroids. centroids_ is a pandas DataFrame, so
# np.save stores only its values as a plain array; the cluster-label index is
# not written to the file.
np.save(file='comp-data/3b-hierarchical-clustering-comp-data/centroids_.npy', arr=centroids_)

# ~ END OF CHAPTER 3 - (AGGLOMERATIVE) HIERARCHICAL CLUSTERING ~