**HIERARCHICAL CLUSTERING**

**IMPORT RELEVANT LIBRARIES**

In [1]:
# main libraries
import numpy as np
import pandas as pd
import os, time
import pickle
import gzip

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
%matplotlib inline
color = sns.color_palette()

# scikitlearn libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report,average_precision_score,roc_curve, auc,roc_auc_score

**LOAD THE DATASET**

In [2]:
# Record the current working directory.
# NOTE(review): no later cell visible in this notebook reads current_path.
current_path = os.getcwd()

In [3]:
# Relative location of the gzipped MNIST pickle.
# Built with os.path.join so the separators match the host OS; the previous
# hard-coded "\\" form only resolved on Windows (where this yields the same string).
# The directory name is spelled exactly as it exists on disk.
file = os.path.join("..", "Hierachical Clustering MNIST", "mnist_data", "mnist.pkl.gz")

In [4]:
file

'..\\Hierachical Clustering MNIST\\mnist_data\\mnist.pkl.gz'

In [5]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this on an archive obtained from a trusted source.
# The context manager closes the file on exit, so no explicit close() is needed.
with gzip.open(filename=file, mode='rb') as file_content:
    # encoding='latin1' is presumably required because the archive was pickled
    # under Python 2 -- TODO confirm against the dataset's provenance.
    train_set, validation_set, test_set = pickle.load(file=file_content, encoding='latin1')

In [6]:
train_set

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 array([5, 0, 4, ..., 8, 4, 8], dtype=int64))

In [7]:
validation_set

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 array([3, 8, 6, ..., 5, 6, 8], dtype=int64))

In [8]:
test_set

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 array([7, 2, 1, ..., 4, 5, 6], dtype=int64))

**SPLIT THE DATASET INTO FEATURES AND LABELS**

In [9]:
# train_set is a (features, labels) pair; unpack it directly.
X_train, y_train = train_set

In [10]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [11]:
y_train

array([5, 0, 4, ..., 8, 4, 8], dtype=int64)

In [12]:
# validation_set is a (features, labels) pair; unpack it directly.
X_validation, y_validation = validation_set

In [13]:
X_validation

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [14]:
y_validation

array([3, 8, 6, ..., 5, 6, 8], dtype=int64)

In [15]:
# test_set is a (features, labels) pair; unpack it directly.
X_test, y_test = test_set

In [16]:
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [17]:
y_test

array([7, 2, 1, ..., 4, 5, 6], dtype=int64)

**CREATE THE DATASET INDEX**

In [18]:
# Give each split its own contiguous, non-overlapping block of row labels:
# training rows first, then validation, then test. Each range starts where
# the previous one stops.
train_index = range(len(X_train))
validation_index = range(train_index.stop, train_index.stop + len(X_validation))
test_index = range(validation_index.stop, validation_index.stop + len(X_test))

In [19]:
train_index

range(0, 50000)

In [20]:
validation_index

range(50000, 60000)

In [21]:
test_index

range(60000, 70000)

**CREATE THE DATAFRAME FOR EACH DATASET**

In [22]:
# Wrap the training arrays in pandas objects labelled by train_index.
# Note that X_train / y_train are rebound from NumPy arrays to pandas objects here.
y_train = pd.Series(y_train, index=train_index)
X_train = pd.DataFrame(X_train, index=train_index)

In [23]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
y_train

0        5
1        0
2        4
3        1
4        9
        ..
49995    5
49996    0
49997    8
49998    4
49999    8
Length: 50000, dtype: int64

In [25]:
# Wrap the validation arrays in pandas objects labelled by validation_index.
# Note that X_validation / y_validation are rebound from NumPy arrays to pandas objects here.
y_validation = pd.Series(y_validation, index=validation_index)
X_validation = pd.DataFrame(X_validation, index=validation_index)

In [26]:
X_validation

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
50000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
y_validation

50000    3
50001    8
50002    6
50003    9
50004    6
        ..
59995    8
59996    3
59997    5
59998    6
59999    8
Length: 10000, dtype: int64

In [28]:
# Wrap the test arrays in pandas objects labelled by test_index.
# Note that X_test / y_test are rebound from NumPy arrays to pandas objects here.
y_test = pd.Series(y_test, index=test_index)
X_test = pd.DataFrame(X_test, index=test_index)

In [29]:
X_test 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
60000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
y_test 

60000    7
60001    2
60002    1
60003    0
60004    4
        ..
69995    2
69996    3
69997    4
69998    5
69999    6
Length: 10000, dtype: int64

**USE PRINCIPAL COMPONENT ANALYSIS TO REDUCE DIMENSIONALITY OF THE DATASET**

In [35]:
# Number of principal components to keep. 784 equals the number of feature
# columns (0-783) in the input frames, so no dimensions are dropped at this
# step; the clustering cell later selects a leading subset of the components.
N_COMPONENTS = 784
# Passed to PCA(whiten=...); False means no whitening is applied.
WHITEN = False
# Seed passed to PCA(random_state=...).
RANDOM_STATE = 101

In [36]:
from sklearn.decomposition import PCA

In [37]:
# Configure PCA from the constants defined above.
pca_model = PCA(
    n_components=N_COMPONENTS,
    whiten=WHITEN,
    random_state=RANDOM_STATE,
)

In [38]:
# Fit PCA on the training features only and keep the projection as a frame
# that carries the training row labels.
X_train_PCA = pd.DataFrame(
    data=pca_model.fit_transform(X_train),
    index=train_index,
)

In [39]:
X_train_PCA

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0.461489,-1.246858,0.046276,-2.151941,-0.247280,-0.925431,0.889318,0.507168,-1.541697,0.689373,...,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.767288e-05,-0.000000e+00,-0.000000e+00,0.000000e+00
1,3.921775,-1.252002,2.335261,-1.340842,-3.421508,-0.725720,-0.206340,-0.345271,0.134062,0.595485,...,-2.309765e-09,-8.985165e-10,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-6.628464e-07,4.199177e-06,7.117097e-07,4.421789e-07
2,-0.203706,1.547939,-0.980354,2.039094,-1.079874,0.112921,-3.312354,1.403168,-0.592878,-0.763857,...,-9.977864e-09,-3.198249e-09,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.815266e-07,-3.473385e-07,-1.459310e-06,-2.761606e-07
3,-3.148300,-2.296076,1.091139,0.484630,0.066844,2.778985,-1.834361,-0.174741,1.166452,0.052842,...,-4.199942e-10,9.597283e-10,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.884261e-07,-1.370707e-06,2.530371e-06,-1.434178e-07
4,-1.442724,2.872014,0.175649,-0.976921,0.302749,0.120639,-0.376714,-1.478136,1.003752,0.540803,...,5.567928e-09,1.313987e-09,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.591596e-07,-1.614703e-06,-1.821566e-06,-1.833750e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,1.147566,-1.972706,-3.739126,1.209677,-0.312003,-0.893662,1.491481,0.013930,-0.385557,-2.202910,...,4.157774e-09,3.717023e-09,7.827457e-09,-8.838475e-09,3.114241e-10,-9.085165e-10,1.812201e-08,3.360465e-09,3.808364e-10,-6.259797e-10
49996,1.416380,-0.252682,-0.809474,-0.523549,-3.905662,0.187959,-0.449565,-1.231872,-0.104621,0.510852,...,-3.458996e-08,3.132664e-08,-4.230493e-09,-6.338331e-09,-3.618732e-09,-1.449634e-09,-1.393230e-08,-9.026688e-10,-9.224052e-09,1.520501e-09
49997,-1.425322,-1.957553,-2.551114,0.094601,0.654065,-0.603922,0.312862,-0.047751,0.994544,0.222442,...,4.911834e-09,-2.828789e-09,4.039725e-09,5.902486e-09,-4.874691e-09,-2.672040e-09,6.574875e-09,4.509849e-09,1.525374e-09,2.055360e-11
49998,-1.507544,1.025465,0.107839,0.276074,2.285548,-0.590755,-0.676690,-0.062368,0.050035,1.578246,...,-5.910784e-09,-3.080603e-09,3.922255e-09,-5.065844e-09,-1.198241e-09,-2.244490e-10,5.053866e-09,-1.493890e-10,-1.375516e-08,9.862802e-10


In [40]:
# Project the validation features with the already-fitted PCA (no refit)
# and keep the validation row labels.
X_validation_PCA = pd.DataFrame(
    data=pca_model.transform(X_validation),
    index=validation_index,
)

In [41]:
X_validation_PCA

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
50000,-1.376968,-2.978099,-0.870271,-0.679374,0.432764,-0.655289,0.546649,1.079511,-0.927923,0.633329,...,-1.989270e-13,-5.822887e-13,0.0,0.0,0.0,0.0,1.204290e-08,-6.665326e-07,5.325985e-08,5.244903e-08
50001,-1.167671,-0.894288,-2.199374,-0.335717,1.166562,1.101883,-0.797325,-1.155509,-0.585077,-0.082714,...,3.276457e-13,5.579159e-14,0.0,0.0,0.0,0.0,2.039048e-08,7.843078e-07,-1.699236e-07,-1.939123e-07
50002,1.258624,1.053050,0.525828,2.661964,-0.434636,-2.527815,0.387211,0.162875,-2.051164,-0.985090,...,2.487244e-13,2.030788e-13,0.0,0.0,0.0,0.0,3.817493e-09,8.398824e-07,1.923138e-07,-2.027063e-07
50003,-0.961151,3.278554,-1.466551,0.972176,0.324461,0.182943,-0.765920,0.497211,0.201661,1.640570,...,-6.917835e-13,-2.955069e-13,0.0,0.0,0.0,0.0,6.064426e-09,1.010534e-06,-1.256646e-07,-3.804390e-07
50004,1.040634,-0.548196,-1.874843,4.091005,0.743932,-0.140688,-0.254495,-1.770556,-0.024961,-1.392018,...,-7.078897e-13,-8.610828e-13,0.0,0.0,0.0,0.0,2.257260e-08,8.847005e-07,5.257820e-07,5.349617e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,-0.402175,-1.140592,0.674568,-1.350839,1.053965,0.876546,-1.980529,-1.103266,-1.844545,1.301491,...,7.638561e-13,5.363815e-13,0.0,0.0,0.0,0.0,-2.464480e-08,2.918382e-07,-1.893276e-07,-2.333251e-07
59996,0.557106,-2.706336,-0.143372,-2.587929,0.624705,0.710978,-1.925585,1.716824,-2.794522,0.090570,...,2.390971e-13,-1.615097e-14,0.0,0.0,0.0,0.0,8.043386e-09,-1.637585e-06,-4.339241e-07,-5.359703e-07
59997,-0.692112,0.602690,-0.819041,-2.798507,-1.022520,-1.803614,-1.861802,-0.717424,-0.272729,0.961678,...,7.295357e-13,6.812743e-13,0.0,0.0,0.0,0.0,-4.682772e-08,-5.949426e-07,-8.370726e-07,-3.485256e-07
59998,0.492090,-0.041597,1.761742,1.672517,-2.027228,-2.090638,-0.437876,0.587194,-0.149893,-0.408577,...,-2.789047e-13,-3.842808e-13,0.0,0.0,0.0,0.0,8.793780e-09,1.325293e-07,-3.155611e-07,1.837101e-07


In [42]:
# Project the test features with the already-fitted PCA (no refit)
# and keep the test row labels.
X_test_PCA = pd.DataFrame(
    data=pca_model.transform(X_test),
    index=test_index,
)

In [43]:
X_test_PCA

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
60000,-1.253000,2.900604,-0.105458,-1.142523,-1.999419,0.939988,1.543680,1.673789,-1.716286,-0.746451,...,2.437218e-14,-1.395518e-14,0.0,0.0,0.0,0.0,-1.614258e-08,7.457054e-07,-2.175807e-07,-5.087120e-07
60001,0.011357,-3.755733,-0.755992,1.551665,-0.102826,-1.821269,1.061337,0.914371,0.523033,-0.380582,...,-4.641877e-13,-6.498616e-13,0.0,0.0,0.0,0.0,1.211869e-08,-1.566701e-07,-2.008141e-07,-2.590602e-07
60002,-3.736669,-1.616001,0.131312,1.138057,-1.966462,0.398666,-0.069477,-0.376921,0.410484,-0.559361,...,-6.582022e-13,-9.043686e-13,0.0,0.0,0.0,0.0,2.084202e-09,1.605355e-07,-4.802309e-08,2.719877e-07
60003,4.831466,0.331520,0.949453,-0.219984,-0.909405,-1.934462,2.673368,0.321865,0.013342,-0.193064,...,1.942448e-13,4.025754e-13,0.0,0.0,0.0,0.0,-1.252868e-08,-1.855262e-07,-5.406459e-07,-5.353815e-07
60004,0.760218,2.791334,-1.405336,2.352506,-0.427331,0.927282,-1.887704,0.798964,-1.432858,0.714422,...,-6.866940e-13,-5.208702e-13,0.0,0.0,0.0,0.0,6.069885e-09,1.262216e-06,-4.867098e-07,-1.279456e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,1.160580,-2.119934,-0.121350,0.777306,3.299665,3.640685,3.105781,0.621062,-2.365495,-0.132130,...,-3.081875e-13,-4.853166e-13,0.0,0.0,0.0,0.0,-2.497102e-08,-2.333520e-07,-1.622456e-07,6.905796e-07
69996,2.192501,-2.751729,-3.121061,-1.064043,0.513746,-0.729493,2.671909,2.973097,-0.650974,-0.214322,...,-1.708861e-13,-6.631241e-13,0.0,0.0,0.0,0.0,1.088414e-08,-1.563943e-06,8.258696e-08,-4.375413e-07
69997,-0.972941,2.371036,1.615877,-1.673816,2.478252,0.564567,-0.200502,-1.424664,-0.008649,-3.072330,...,2.348912e-13,5.217913e-13,0.0,0.0,0.0,0.0,9.525043e-09,3.649665e-07,3.912827e-07,-3.239196e-07
69998,-1.213136,-0.410504,2.599205,-0.713675,-0.708521,-1.136348,0.586810,0.733457,0.991110,-0.773542,...,3.083355e-14,2.186161e-13,0.0,0.0,0.0,0.0,-6.323317e-09,1.236412e-06,-2.901706e-07,5.966693e-07


**AGGLOMERATIVE HIERARCHICAL CLUSTERING ALGORITHM**

In [44]:
import fastcluster
from scipy.cluster.hierarchy import dendrogram, cophenet, fcluster
from scipy.spatial.distance import pdist

In [45]:
# Label of the last principal-component column handed to the clustering step.
cutoff = 100

In [46]:
# Ward-linkage agglomerative clustering on the leading principal components.
# NOTE: .loc slices by label and includes BOTH endpoints, so 0:cutoff selects
# columns 0..cutoff, i.e. cutoff + 1 components (101 when cutoff is 100).
# The distance threshold used further down was tuned against this selection.
Z = fastcluster.linkage_vector(X_train_PCA.loc[:,0:cutoff], method='ward', metric='euclidean')
# Name the four columns of the linkage matrix so the merge table is readable.
Z_dataFrame = pd.DataFrame(data=Z, columns=['clusterOne','clusterTwo','distance','newClusterSize'])

In [47]:
Z_dataFrame

Unnamed: 0,clusterOne,clusterTwo,distance,newClusterSize
0,42194.0,43025.0,0.567260,2.0
1,28350.0,37674.0,0.592542,2.0
2,26696.0,44705.0,0.623949,2.0
3,12634.0,32823.0,0.627941,2.0
4,24707.0,43151.0,0.639199,2.0
...,...,...,...,...
49994,99985.0,99990.0,292.846800,17166.0
49995,99992.0,99993.0,391.432259,23294.0
49996,99991.0,99995.0,432.664577,28775.0
49997,99987.0,99996.0,470.804025,32834.0


In [48]:
Z_dataFrame.iloc[0:20]

Unnamed: 0,clusterOne,clusterTwo,distance,newClusterSize
0,42194.0,43025.0,0.56726,2.0
1,28350.0,37674.0,0.592542,2.0
2,26696.0,44705.0,0.623949,2.0
3,12634.0,32823.0,0.627941,2.0
4,24707.0,43151.0,0.639199,2.0
5,20465.0,24483.0,0.663984,2.0
6,46542.0,49961.0,0.668588,2.0
7,466.0,42098.0,0.669458,2.0
8,2301.0,5732.0,0.674403,2.0
9,37564.0,47668.0,0.67511,2.0


In [49]:
Z_dataFrame.iloc[49980:]

Unnamed: 0,clusterOne,clusterTwo,distance,newClusterSize
49980,99952.0,99973.0,168.816626,5786.0
49981,99959.0,99977.0,169.486507,6750.0
49982,99954.0,99957.0,175.574735,3396.0
49983,99964.0,99972.0,178.80482,4583.0
49984,99936.0,99950.0,180.484483,3356.0
49985,99958.0,99978.0,181.380974,6098.0
49986,99971.0,99980.0,195.583986,7672.0
49987,99948.0,99975.0,203.634639,4059.0
49988,99976.0,99979.0,210.634067,6957.0
49989,99967.0,99974.0,221.900123,5004.0


**The Dendrogram**


*The table above shows the Z matrix generated by the clustering algorithm,
illustrating what the algorithm can accomplish. The first two columns in this table, clusterOne and clusterTwo, list which
two clusters—either single-point clusters (i.e., the original observations)
or multipoint clusters—are being merged given their distance relative to each
other. The third column, distance, displays this distance, which was
determined by the Ward method and euclidean metric that we passed into
the clustering algorithm.*


*As you can see, the distance is monotonically increasing. In other words, the
shortest-distance clusters are merged first, and the algorithm iteratively
merges the next shortest-distance clusters until all the points have been
joined into a single cluster at the top of the dendrogram.*


*Initially, the algorithm merges single-point clusters together, forming new
clusters with a size of two, as shown in the fourth column, newClusterSize.
However, as we get much further along, the algorithm joins large multipoint
clusters with other large multipoint clusters, as shown in the table. At the
very last iteration (49,998), two large clusters are joined together, forming a
single cluster—the top tree trunk—with all 50,000 original observations.*

*You may be a bit confused by the clusterOne and clusterTwo entries in
this table. For example, in the last row—49,998—cluster 99,996 is joined
with cluster 99,997. But as you know, there are only 50,000 observations in
the MNIST training set used here.
clusterOne and clusterTwo refer to the original observations for numbers
0 through 49,999. For numbers above 49,999, the cluster numbers refer to
previously clustered points. For example, 50,000 refers to the newly formed
cluster in row 0, 50,001 refers to the newly formed cluster in row 1, etc.
In row 49,998, clusterOne, 99,996, refers to the cluster formed in row
49,996, and clusterTwo, 99,997, refers to the cluster formed in row 49,997.
You can continue to work your way through this table using this rule
(cluster number 50,000 + i is the cluster formed in row i) to
see how the clusters are being joined.*

**EVALUATING THE CLUSTERING RESULTS**

*Now that we have the dendrogram in place, let’s determine where to cut off
the dendrogram to make the number of clusters we desire. To more easily
compare hierarchical clustering results with those of k-means, let’s cut the
dendrogram to have exactly 20 clusters. We will then use the clustering
accuracy metric—defined in the k-means section and implemented in the
analyze_cluster function below—to judge how
homogeneous the clusters produced by hierarchical clustering are.*

*To create the clusters we desire from the dendrogram, let’s use the
fcluster function from SciPy. We need to specify the distance threshold of the
dendrogram to determine how many distinct clusters we are left with. The
larger the distance threshold, the fewer clusters we will have. Data points
within the distance threshold we set will belong to the same cluster. A large
distance threshold is akin to cutting the upside-down tree at a very high
vertical point. Since more and more of the points are grouped together the
higher up the tree we go, the fewer clusters we will have.
To get exactly 20 clusters, we need to experiment with the distance threshold,
as done here. The fcluster function will take our dendrogram and cut it with
the distance threshold we specify. Each of the 50,000 observations in the
MNIST digits training set will get a cluster label, and we will store
these in a Pandas DataFrame:*

In [82]:
# Height at which the dendrogram is cut. Found by experimenting until the cut
# produced 20 clusters (see the count printed a few cells below).
distance_threshold = 166

In [83]:
# Cut the dendrogram at distance_threshold; the result holds one cluster id
# per training observation.
clusters = fcluster(
    Z,
    t=distance_threshold,
    criterion='distance',
)

In [84]:
# Store the cluster id of every training observation under the same row
# labels as the PCA-projected training frame.
X_train_hierClustered = pd.DataFrame(
    {'cluster': clusters},
    index=X_train_PCA.index,
)

In [85]:
# Report how many distinct cluster ids the cut produced
print("Number of distinct clusters: ", X_train_hierClustered['cluster'].nunique())

Number of distinct clusters:  20


**SHOW OVERALL ACCURACY**

In [86]:
# Evaluating the Clustering Results
def analyze_cluster(clusterDF, labelsDF):
    """Score how well cluster assignments agree with the true labels.

    Every cluster is credited with the number of observations that carry its
    most common true label. Dividing that credit by the cluster size gives the
    per-cluster accuracy; dividing the summed credit by the summed cluster
    sizes gives the overall accuracy.

    Parameters
    ----------
    clusterDF : pandas.DataFrame
        Frame with a 'cluster' column holding one cluster id per observation.
    labelsDF : pandas.Series
        True label per observation. It is joined to ``clusterDF`` column-wise
        with ``pd.concat(axis=1)``, so both inputs should share the same index.

    Returns
    -------
    tuple
        ``(countByCluster, countByLabel, countMostFreq, accuracyDF,
        overallAccuracy, accuracyByLabel)``:

        * countByCluster -- columns ['cluster', 'clusterCount'].
        * countByLabel -- observation count per true label.
        * countMostFreq -- columns ['cluster', 'countMostFrequent'].
        * accuracyDF -- the two cluster-level frames merged on 'cluster'.
        * overallAccuracy -- sum(countMostFrequent) / sum(clusterCount).
        * accuracyByLabel -- countMostFrequent / clusterCount, one value per
          row of accuracyDF (i.e. per cluster, despite the name).
    """
    # Size of every cluster as a two-column frame.
    cluster_sizes = clusterDF['cluster'].value_counts().reset_index()
    cluster_sizes.columns = ['cluster', 'clusterCount']

    # True labels and cluster ids side by side.
    labelled = pd.concat([labelsDF, clusterDF], axis=1)
    labelled.columns = ['trueLabel', 'cluster']

    # Number of observations for each true label.
    label_sizes = labelled.groupby('trueLabel').count()

    def _top_count(values):
        # value_counts() sorts descending, so the first entry is the majority count.
        return values.value_counts().iloc[0]

    # For each cluster, how many observations carry its most common label.
    majority = labelled.groupby('cluster').agg(_top_count).reset_index()
    majority.columns = ['cluster', 'countMostFrequent']

    scores = majority.merge(cluster_sizes, on="cluster")
    overall = scores['countMostFrequent'].sum() / scores['clusterCount'].sum()
    per_cluster = scores['countMostFrequent'] / scores['clusterCount']

    return (cluster_sizes, label_sizes, majority, scores, overall, per_cluster)

In [90]:
# Score the hierarchical clustering of the training set against the true
# training labels and keep every intermediate table for inspection.
(
    countByCluster_hierClust,
    countByLabel_hierClust,
    countMostFreq_hierClust,
    accuracyDF_hierClust,
    overallAccuracy_hierClust,
    accuracyByLabel_hierClust,
) = analyze_cluster(X_train_hierClustered, y_train)

In [91]:
# Fraction of training observations whose true label equals the most common
# true label of the cluster they were assigned to.
print("Overall accuracy from hierarchical clustering: ", overallAccuracy_hierClust)

Overall accuracy from hierarchical clustering:  0.76186


**SHOW ACCURACY BY CLUSTERS**

In [92]:
# One value per cluster. The Series is indexed by row position in
# accuracyDF_hierClust (0, 1, 2, ...), not by cluster id; the matching ids are
# in accuracyDF_hierClust['cluster'].
print("Accuracy by cluster for hierarchical clustering: ", accuracyByLabel_hierClust)

Accuracy by cluster for hierarchical clustering:  0     0.558750
1     0.464206
2     0.987776
3     0.940405
4     0.981972
5     0.425097
6     0.415640
7     0.991525
8     0.988538
9     0.996706
10    0.992337
11    0.996076
12    0.968486
13    0.968198
14    0.975526
15    0.954727
16    0.860745
17    0.449151
18    0.406570
19    0.954678
dtype: float64


**VIEW CLUSTERED LABEL**

In [93]:
X_train_hierClustered

Unnamed: 0,cluster
0,19
1,9
2,2
3,12
4,6
...,...
49995,18
49996,8
49997,20
49998,2


In [94]:
# Print a wall-clock timestamp marking the end of the run.
import datetime as dt
print("Completed: ", dt.datetime.now())

Completed:  2023-12-03 18:16:25.676275
