Model dendrogram on variation coefficients 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import os, glob, inspect, sys
from sklearn import metrics
from scipy.cluster.hierarchy import fcluster

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
import epri_mc_lib as mc
from importlib import reload
reload(mc)

In [None]:
# Folder holding the merged measurement tables, relative to this notebook.
data_path = "../../Data/Merged_data"
merged_csv = os.path.join(data_path, 'ALL_TUBE_PIPE_merge_1.csv')

# The first CSV column is used as the row index.
df = pd.read_csv(merged_csv, index_col=0)


## Selecting subsample


In [None]:
# Keep only the first 16 rows (the tube specimen data), then discard every
# column that contains at least one NaN: dropna(axis=1) uses how='any' by default,
# so a column does not need to be entirely NaN to be removed.
# A new frame is built instead of calling dropna(inplace=True) on an iloc slice,
# which can trigger pandas' SettingWithCopyWarning.
df = df.iloc[:16].dropna(axis=1)

In [None]:
# Mean / spread column pairs used to build the coefficient-of-variation features.
cv_source_columns = [
    'TEP_mean_uV_C', 'TEP_error_uV_C',
    'backscatter_avg', 'backscatter_std',
    'Absorption_avg_50', 'Absorption_std_50',
    'A', 'A std',
    'B', 'B std',
    'p', 'p std',
    'Absorption_avg_100', 'Absorption_std_100',
    'mean_CF', 'std_CF',
    'mean_perm', 'std_perm',
]

# Independent copy so later column edits do not touch df.
CV_list = df.loc[:, cv_source_columns].copy()


In [None]:
CV_list.columns.tolist()

In [None]:
CV_list.shape

In [None]:
def scale_general(df, scaler):
    ''' Fit a scaler on a dataframe and return the transformed data.
        The result is a new dataframe carrying the same index and
        column names as the input.

        Args:
        - df : pandas dataframe to scale
        - scaler : initialized scaler object exposing fit_transform

        return (scaled dataframe, the scaler object after fit_transform was called)
    '''
    scaled_values = scaler.fit_transform(df)
    df_scaled = pd.DataFrame(scaled_values, index=df.index, columns=df.columns)
    return df_scaled, scaler

In [None]:
def plot_corr(data):
    '''
    Draw the lower triangle of the correlation matrix as an annotated heatmap.
    Note: sets the seaborn font scale to 1.2 via sns.set.
    Args:
    - data: pd dataframe whose column-to-column correlations are plotted
    '''
    sns.set(font_scale=1.2)
    corr_matrix = data.corr()
    # True on and above the diagonal: those cells are hidden by the heatmap mask.
    hidden_cells = np.triu(np.ones_like(corr_matrix, dtype=bool))
    with sns.axes_style("white"):
        f, ax = plt.subplots(figsize=(12, 10))
        ax = sns.heatmap(
            corr_matrix,
            mask=hidden_cells,
            square=True,
            cmap='RdBu_r',
            center=0,
            annot=True,
        )

## Calculating the coefficient of variation (CV)

In [None]:
# (new CV column, spread column, mean column) for each measured quantity.
cv_definitions = [
    ('TEP_uV_C_CV', 'TEP_error_uV_C', 'TEP_mean_uV_C'),
    ('bkstr_CV', 'backscatter_std', 'backscatter_avg'),
    ('absp_50_CV', 'Absorption_std_50', 'Absorption_avg_50'),
    ('A_CV', 'A std', 'A'),
    ('B_CV', 'B std', 'B'),
    ('p_CV', 'p std', 'p'),
    ('absp_100_CV', 'Absorption_std_100', 'Absorption_avg_100'),
    ('CF_CV', 'std_CF', 'mean_CF'),
    ('perm_CV', 'std_perm', 'mean_perm'),
]

# Each CV column is the spread column divided by the matching mean column.
for cv_name, spread_col, mean_col in cv_definitions:
    CV_list[cv_name] = CV_list[spread_col] / CV_list[mean_col]

# we also drop the old feature columns
old_feature_columns = [
    col
    for _, spread_col, mean_col in cv_definitions
    for col in (spread_col, mean_col)
]
CV_list.drop(old_feature_columns, axis=1, inplace=True)

In [None]:
CV_list.columns.tolist()

## Check correlation between features

In [None]:
plot_corr(CV_list)

#### We observe high correlation between some features, so we replace each correlated pair with the ratio of one feature to the other

In [None]:
# (ratio column, numerator column, denominator column) for each correlated pair.
ratio_definitions = (
    ('CF_perm_CV', 'CF_CV', 'perm_CV'),
    ('B_p_CV', 'B_CV', 'p_CV'),
)
for ratio_name, numerator_col, denominator_col in ratio_definitions:
    CV_list[ratio_name] = CV_list[numerator_col] / CV_list[denominator_col]

# we also drop the existing features
CV_list.drop(columns=['CF_CV', 'B_CV', 'p_CV', 'perm_CV'], inplace=True)

In [None]:
CV_list.columns.tolist()

In [None]:
plot_corr(CV_list)

## Scaling the data using MinMaxScaler

In [None]:
CV_list_scaled = mc.scale_general(CV_list, MinMaxScaler())[0]


## Hierarchical Clustering 

#### Algorithm Description
1. Calculate distance between objects using pdist function
2. Use linkage function to link pairs of objects that are in close proximity, build hierarchical cluster tree
3. Determining the cut-off in the hierarchical tree 


In [None]:
X=CV_list_scaled.copy()

#### Names of models using different metric and method for each model

In [None]:
# Model 1: Euclidean distances, Ward linkage.
Y1 = pdist(X, metric='euclidean')
Z1 = linkage(Y1, method='ward')

# Model 2: Euclidean distances, single linkage.
Y2 = pdist(X, metric='euclidean')
Z2 = linkage(Y2, method='single')

# Model 3: Euclidean distances, average linkage.
Y3 = pdist(X, metric='euclidean')
Z3 = linkage(Y3, method='average')

# Model 4: cityblock distances, average linkage.
Y4 = pdist(X, metric='cityblock')
Z4 = linkage(Y4, method='average')

In [None]:
sns.set_theme(style="white")

# One dendrogram per linkage model. The distance metric / linkage method is put
# in the title so the four otherwise identical-looking figures can be told apart.
dendrogram_models = [
    (Z1, 'Euclidean / ward'),
    (Z2, 'Euclidean / single'),
    (Z3, 'Euclidean / average'),
    (Z4, 'Cityblock / average'),
]

for linkage_matrix, model_name in dendrogram_models:
    fig = plt.figure(figsize=(15, 6))
    dn = dendrogram(linkage_matrix, labels=X.index)
    sns.despine(left=True)
    plt.ylabel('Distance')
    plt.title('Tubes - ' + model_name)

plt.show()

### Verifying the cluster tree

#### Verifying dissimilarity using cophenetic correlation coefficient



In [None]:
# hierarchy.cophenet(Z, Y) returns the cophenetic correlation coefficient first
# and the condensed cophenetic distance matrix second, so c1..c4 hold the
# coefficients while the ccc_* names hold the distance arrays.
c1, ccc_eu_ward = hierarchy.cophenet(Z1, Y1)
c2, ccc_eu_single = hierarchy.cophenet(Z2, Y2)
c3, ccc_eu_average = hierarchy.cophenet(Z3, Y3)
c4, ccc_cb_average = hierarchy.cophenet(Z4, Y4)

for label, coefficient in (
    ("ccc_eu_ward :", c1),
    ("ccc_eu_single :", c2),
    ("ccc_eu_average :", c3),
    ("ccc_cb_average :", c4),
):
    print(label, coefficient)

#### The highest ccc is obtained with eu_avg (Euclidean distance, average linkage); its dendrogram is the most representative of our pdist data

### Verifying Consistency
One way to determine the natural cluster divisions in a dataset is to compare the height of each link with the heights of the neighbouring links below it in the tree.
This helps to indicate how distinct the divisions in the tree are. A link with high consistency has approximately the same height as the links below it in the tree, i.e. the objects it joins are about as far apart as the objects already joined beneath it. On the other hand, a link with high inconsistency has a height that differs noticeably from the heights of the links below it. Hence a high inconsistency indicates that the clusters joined by that link are much farther apart from each other than the objects joined within them.

The inconsistency coefficient is a quantified expression of the relative consistency of each link. The leaf nodes (bottom objects) have a zero inconsistency coefficient.

"This value compares
the height of a link in a cluster hierarchy with the average height of links
below it. Links that join distinct clusters have a high inconsistency coefficient;
links that join indistinct clusters have a low inconsistency coefficient." (http://cda.psych.uiuc.edu/multivariate_fall_2013/matlab_help/cluster_analysis.pdf)

Column Description of the inconsistency matrix

1 Mean of the heights of all the links included in the calculation

2 Standard deviation of all the links included in the calculation

3 Number of links included in the calculation

4 Inconsistency coefficient

In [None]:
#d is optional  depth of the comparison chooses how many levels below a link to compare
#incons = hierarchy.inconsistent(Z, d)

Z1_incons = hierarchy.inconsistent(Z1)
Z2_incons = hierarchy.inconsistent(Z2)
Z3_incons = hierarchy.inconsistent(Z3)
Z4_incons = hierarchy.inconsistent(Z4)

In [None]:
print("Z1_inconsistency :\n", Z1_incons)

print("\nZ2_inconsistency :\n", Z2_incons)

print("\nZ3_inconsistency :\n", Z3_incons)

print("\nZ4_inconsistency :\n", Z4_incons)

#### Finding natural divisions in the data using the inconsistency coefficient, to try to find the cut-off line.

In [None]:
# Scans inconsistency thresholds for a linkage matrix; prints and returns, for every
# depth and threshold t, the number of flat clusters obtained.
def inconsistency(linkage_matrix, CV_list_scaled, method):
    ''' Report how many flat clusters each inconsistency threshold produces.

        For every comparison depth from 0 to 5 the inconsistency matrix of
        linkage_matrix is computed, and each distinct per-node maximum
        inconsistency (rounded to 2 decimals) is used as threshold t for a flat
        clustering with the 'inconsistent' criterion. One line is printed per
        (depth, t) pair.

        The flat clusters are cut from linkage_matrix itself at the depth being
        scanned. Re-clustering the raw observations with fclusterdata would
        fall back to its defaults (metric='euclidean', depth=2) and therefore
        ignore both the scanned depth and the metric used to build the linkage.

        Args:
        - linkage_matrix : linkage matrix as returned by scipy's linkage
        - CV_list_scaled : scaled observations; accepted for backward
          compatibility, not used
        - method : linkage method name; accepted for backward compatibility,
          not used

        return list of (depth, t, number of clusters) tuples in printing order
    '''
    results = []
    for depth in range(6):
        incons = hierarchy.inconsistent(linkage_matrix, depth)
        max_inc = hierarchy.maxinconsts(linkage_matrix, incons)
        for t in np.unique(np.around(max_inc, 2)):
            cluster = hierarchy.fcluster(linkage_matrix, t=t,
                                         criterion='inconsistent', depth=depth)
            n_clusters = int(cluster.max())
            print('depth:', depth, ': ', 't=', t, ' cluster = ', n_clusters)
            results.append((depth, float(t), n_clusters))
    return results

In [None]:
# Keep the scan result under its own name: re-binding Z3_incons here would
# overwrite the inconsistency matrix computed with hierarchy.inconsistent(Z3).
Z3_threshold_scan = inconsistency(Z3, CV_list_scaled, 'average')
Z3_threshold_scan

In [None]:
# Keep the scan result under its own name: re-binding Z4_incons here would
# overwrite the inconsistency matrix computed with hierarchy.inconsistent(Z4).
Z4_threshold_scan = inconsistency(Z4, CV_list_scaled, 'average')
Z4_threshold_scan

#### Our highest inconsistency coefficient is 1.15104191, as observed in the Z3_inconsistency array above, which is why t=1.26 creates a single cluster. For t=0 we get 11 clusters, and for t=0.71 we get 8 clusters. That seems to be the best distinction the model can generate based on the inconsistency coefficient.

Below is a dataframe listing each specimen together with the cluster it is assigned to.

In [None]:
# Flat cluster label per specimen, cutting Z3 with the inconsistency criterion at t=0.71.
incons_labels = fcluster(Z3, t=0.71, criterion='inconsistent')
Z3_inconsistent_fcluster = pd.DataFrame({'Cluster_incons': incons_labels}, index=X.index)
Z3_inconsistent_fcluster.sort_values(by=['Cluster_incons'])

### Elbow method
Probably the most well known method, the elbow method, in which the sum of squares at each number of clusters is calculated and graphed, and the user looks for a change of slope from steep to shallow (an elbow) to determine the optimal number of clusters. This method is inexact, but still potentially helpful.

Note that this method is inexact. From the results below, it is not clear whether the elbow is at 6 or at 8 clusters. We use 6, based on judgement of the plot.

In [None]:
min_range = 2
max_range = 15

# Candidate cluster counts, both ends included.
k_list = range(min_range, max_range+1)

# KMeans inertia_ for every candidate k, with a fixed random_state.
inertia = []
for k in k_list:
    km = KMeans(n_clusters=k, random_state=0)
    km.fit(X)
    inertia.append(km.inertia_)

cluster_counts = np.arange(min_range, max_range+1)

plt.figure(1, figsize=(10, 6))
# Markers and a faint connecting line are drawn as two separate layers.
plt.plot(cluster_counts, inertia, 'o')
plt.plot(cluster_counts, inertia, '-', alpha=0.5)

plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

### Specifying Arbitrary Clusters
#### Cut off line at 6 clusters based on elbow method using maxclust method

In [None]:
# Flat cluster label per specimen, cutting Z3 into at most 6 clusters.
maxclust_labels = fcluster(Z3, t=6, criterion='maxclust')
Z3_maxclust_fcluster = pd.DataFrame({'Cluster_maxclust': maxclust_labels}, index=X.index)
Z3_maxclust_fcluster.sort_values(by=['Cluster_maxclust'])

#### Cut-off line by observation for 'Euclidean Average' model of the dendrogram using distance as the criterion for fcluster function at 0.9 distance height as seen below - 0.9 is the observed distance value by judgement

In [None]:
sns.set_theme(style="white")            
                

fig= plt.figure(figsize=(15, 6))
dn = dendrogram(Z3,labels=X.index)
sns.despine(left=True)
plt.ylabel('Distance')
plt.title('Tubes')

In [None]:


# plt.subplots returns (figure, axes) in that order.
fig, ax = plt.subplots(figsize=(7,9))

# Horizontal dendrogram of model 3; color_threshold=0.9 matches the distance
# cut-off chosen by observation for this model.
dendro = hierarchy.dendrogram(Z3, labels=X.index, leaf_rotation=0, orientation='left',
                              color_threshold=0.9, ax=ax)

# Mark every merge with a dot and its distance. With orientation='left' the
# distance (dcoord) runs along x and the leaf position (icoord) along y.
for icoord, dcoord, color in zip(dendro['icoord'], dendro['dcoord'], dendro['color_list']):
    merge_y = 0.5 * sum(icoord[1:3])
    merge_x = dcoord[1]
    ax.plot(merge_x, merge_y, 'o', c=color)
    ax.annotate('%.3g' % merge_x, (merge_x, merge_y), xytext=(5, -5),
                textcoords='offset points',
                va='bottom', ha='left',
                fontsize=10)

ax.set_xlabel('Distance')
ax.set_title('All Tubes \n Model 3')
sns.despine(left=True)

In [None]:
sns.clustermap(CV_list_scaled, method='average', metric='euclidean', cmap='PuBu', figsize=(6,8), dendrogram_ratio=(0.2, 0.2), cbar_pos=(0.05, 0.85, 0.025, 0.1))

#### For the 'Cityblock Average' model it is more difficult to guess the cut-off line by judging the dendrogram tree

In [None]:


# plt.subplots returns (figure, axes) in that order.
fig, ax = plt.subplots(figsize=(7,9))

# Horizontal dendrogram of model 4, coloured below a distance of 1.8.
dendro = hierarchy.dendrogram(Z4, labels=X.index, leaf_rotation=0, orientation='left',
                              color_threshold=1.8, ax=ax)

# Mark every merge with a dot and its distance. With orientation='left' the
# distance (dcoord) runs along x and the leaf position (icoord) along y.
for icoord, dcoord, color in zip(dendro['icoord'], dendro['dcoord'], dendro['color_list']):
    merge_y = 0.5 * sum(icoord[1:3])
    merge_x = dcoord[1]
    ax.plot(merge_x, merge_y, 'o', c=color)
    ax.annotate('%.3g' % merge_x, (merge_x, merge_y), xytext=(5, -5),
                textcoords='offset points',
                va='bottom', ha='left',
                fontsize=10)

ax.set_xlabel('Distance')
ax.set_title('All Tubes \n Model 4')
sns.despine(left=True)

In [None]:
sns.clustermap(CV_list_scaled, method='average', metric='cityblock', cmap='PuBu', figsize=(6,8), dendrogram_ratio=(0.2, 0.2), cbar_pos=(0.05, 0.85, 0.025, 0.1))

#### Clustering group for 'Euclidean Average'

In [None]:
# Flat cluster label per specimen, cutting Z3 at a distance of 0.9.
distance_labels = fcluster(Z3, t=0.9, criterion='distance')
Z3_distance_fcluster = pd.DataFrame({'Cluster_distance': distance_labels}, index=X.index)
Z3_distance_fcluster.sort_values(by=['Cluster_distance'])

In [None]:
#### Making one dataframe with all the clusters using different criterion
df_clusters=pd.DataFrame([Z3_inconsistent_fcluster['Cluster_incons'],Z3_maxclust_fcluster['Cluster_maxclust'],Z3_distance_fcluster['Cluster_distance']])
df_clusters

In [None]:
# Specimens as rows, one column per clustering criterion. The table is rebuilt
# from the per-criterion frames instead of transposing df_clusters in place, so
# running this cell twice cannot flip the table back and break the
# sort by 'Cluster_distance' that follows.
df_clusters = pd.concat(
    [Z3_inconsistent_fcluster[['Cluster_incons']],
     Z3_maxclust_fcluster[['Cluster_maxclust']],
     Z3_distance_fcluster[['Cluster_distance']]],
    axis=1,
)

# 

In [None]:
df_clusters.sort_values(by=['Cluster_distance'])

# 

## 

# 