# Importing libraries...
  * #### Cell #1 imports the essential matplotlib modules for displaying figures outside the Jupyter cell
  * #### Cell #2 imports the essential numpy and scipy modules for our computations (including the agglomerative clustering)

In [1]:
import PyQt5
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib import style;  style.use('ggplot')
get_ipython().magic('matplotlib qt')

In [2]:
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster

# Loading Processed Data Matrix...

In [3]:
# Read the user-feature matrix from disk; it is the input to both the linkage
# computation and the k-means elbow analysis further down.
# NOTE(review): the path suggests this is the output of the preprocessing chapter -- confirm.
X = np.load(file='comp-data/1-preprocessing-comp-data/user-feature-set-gauss.npy')

# Generating The Hierarchical Clustering Dendrogram...
  * #### Using Complete Linkage Method

In [4]:
# Build the agglomerative merge tree for X using complete linkage
# (scipy's default Euclidean metric is used, since no metric is passed).
ZC = linkage(X, method='complete')

In [7]:
# Draw the untruncated dendrogram of the complete-linkage tree on figure 1.
plt.figure(num=1, figsize=(25, 10))
plt.xlabel('X[i]')
plt.ylabel('distance')
plt.title('Hierarchical Clustering Dendrogram -- Complete-Linkage')
# Leaf labels on the x axis are rotated by 90 degrees and drawn in an 8pt font.
dendrogram(ZC, leaf_rotation=90., leaf_font_size=8.)
plt.show()

In [6]:
# Draw a condensed dendrogram of the same tree on figure 2.
plt.figure(num=2, figsize=(25, 10))
plt.xlabel('sample index')
plt.ylabel('distance')
plt.title('Hierarchical Clustering Dendrogram -- Complete-Linkage (truncated)')
dendrogram(
    ZC,
    p=20,                    # number of clusters kept by the 'lastp' truncation
    truncate_mode='lastp',   # show only the last p merged clusters
    show_contracted=True,    # mark the merges hidden inside truncated branches
    show_leaf_counts=False,  # do not print the bracketed member counts on the leaves
    leaf_font_size=12.,
    leaf_rotation=90.,
)
plt.show()

# Getting the number of Clusters... (trimming the dendrogram)

In [8]:
# Cut the complete-linkage tree at distance 4.78: with criterion='distance',
# observations share a flat cluster when their cophenetic distance is <= max_d.
max_d = 4.78
clusters_ = fcluster(ZC, t=max_d, criterion='distance')
clusters_  # echo the flat cluster label assigned to each observation

array([3, 5, 4, 4, 3, 4, 4, 4, 4, 5, 4, 4, 5, 4, 4, 4, 5, 8, 4, 7, 4, 4, 8,
       4, 5, 6, 4, 5, 4, 3, 5, 6, 4, 5, 4, 4, 4, 4, 6, 2, 4, 4, 7, 2, 5, 5,
       4, 4, 5, 4, 5, 4, 5, 8, 4, 5, 4, 1, 4, 4, 4, 5, 4, 8, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 5, 4, 4, 8, 4, 4, 4, 4, 5, 4, 5, 5, 5, 4, 4, 5, 4,
       4, 4, 4, 5, 5, 5, 6, 4, 4, 4, 4, 4, 4, 5, 2, 5, 8, 4, 3, 8, 5, 9, 4,
       8, 5, 4, 5, 9, 4, 4, 4, 4, 5, 4, 4, 4, 4, 5, 4, 8, 4, 4, 5, 4, 4, 4,
       4, 5, 5, 4, 4, 4, 4, 4, 5, 6, 4, 4, 4, 5, 3, 4, 8, 4, 5, 5, 4, 4, 4,
       4, 4, 5, 5, 5, 4, 8, 8, 4, 4, 4, 4, 4, 4, 4, 9, 8, 8, 5, 4, 9, 4, 5,
       5, 4, 4, 7, 4, 4, 4, 4, 5, 8, 9, 4, 4, 4, 8, 7, 4, 4, 5, 8, 4, 4, 8,
       2, 5, 4, 5, 5, 9, 4, 4, 4, 5, 2, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 8, 8,
       5, 5, 4, 4, 5, 4, 8, 4, 5, 5, 5, 3, 4, 3, 4, 4, 5, 4, 6, 4, 4, 4, 4,
       8, 4, 4, 5, 5, 5, 4, 7, 8, 4, 4, 4, 4, 4, 5, 5, 4, 5, 4, 4, 4, 4, 4,
       4, 4, 5, 5, 8, 5, 4, 4, 4, 5, 4, 5, 5, 5, 5, 6, 9, 5, 4, 8, 8, 5, 5,
       5, 5,

# Getting the -optimal- number of Clusters... (k-means elbow method)
  * #### As the dendrogram above shows, depending on the distance at which we "trim" the tree, we obtain anywhere from 2 to 9 clusters.
  * #### Running k-means for k = 2, 3, 4, 5, 6, 7, 8, 9 (and optionally 10) and applying the elbow method should tell us the optimal number of clusters.

  * ### Importing sklearn essential libraries for k-means and scipy

In [9]:
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist

In [10]:
# Elbow method: fit k-means for every candidate k and record the distortion,
# i.e. the mean Euclidean distance from each sample to its nearest centroid.
# NOTE(review): `n_jobs` and `precompute_distances` were deprecated in
# scikit-learn 0.23 and removed in 1.0; they are kept here so results on the
# original environment are unchanged -- drop them when upgrading scikit-learn.
distortions = []
K = list(range(1, 11))
for k in tqdm(K):
    kmeanTest = KMeans(n_clusters=k, n_init=20, n_jobs=-1, precompute_distances=True, random_state=0, verbose=2)
    # Fit exactly once: a second fit only overwrote the first one (same data,
    # same fixed random_state) and doubled the running time.
    kmeanTest.fit(X)
    # Distance of every sample to its closest centroid, averaged over samples.
    distortions.append(np.min(cdist(X, kmeanTest.cluster_centers_, 'euclidean'), axis=1).mean())

# Plot the elbow on its own figure number: figure 2 is already used by the
# truncated dendrogram, and re-activating it would draw the curve on top of it.
plt.figure(3, figsize=(25, 10))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:10<00:00,  7.05s/it]


 * #### From the elbow method we see that the optimal number of clusters is 4 (the same number that BSAS suggested)
 * #### So, we trim the dendrogram at max_d = 6.10

In [11]:
# Re-cut the complete-linkage tree at the higher threshold 6.10; the labels
# echoed below take the values 1 through 4.
max_d = 6.10
clusters_ = fcluster(ZC, t=max_d, criterion='distance')
clusters_  # echo the flat cluster label assigned to each observation

array([1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 4,
       2, 2, 3, 2, 2, 2, 1, 2, 3, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2, 4, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 1, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 1, 2, 4, 2, 1, 4, 2, 4, 2,
       4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 1, 2, 4, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 4, 2, 2,
       2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 4,
       1, 2, 2, 2, 2, 4, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4,
       2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 3, 2, 2, 2, 2,
       4, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 2, 2, 4, 4, 2, 2,
       2, 2,

# Saving the Computed Results...

In [14]:
# Persist the flat cluster labels computed above as a .npy file.
np.save(file='comp-data/3b-hierarchical-clustering-comp-data/clusters_.npy', arr=clusters_)

# ~ END OF CHAPTER 3 - (AGGLOMERATIVE) HIERARCHICAL CLUSTERING ~