In [None]:
import lib._util.visualplot as vp

# Pre-processing
from lib._class.DFDuplicateRemoval import DFDuplicateRemoval
from lib._class.DFVarianceThreshold import DFVarianceThreshold
from lib._class.DFVIFThreshold import DFVIFThreshold

# Feature encoding
from lib._class.DFOneHotEncoder import DFOneHotEncoder

# Feature scaling
from lib._class.DFStandardScaler import DFStandardScaler

# Clustering
from lib._class.DFKMeans import DFKMeans
from lib._class.DFKMedoids import DFKMedoids
from lib._class.DFGaussianMixture import DFGaussianMixture
from lib._class.DFAgglomerative import DFAgglomerative
from lib._class.DFDBSCAN import DFDBSCAN

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

# Scikit-learn
from sklearn.pipeline import Pipeline

# Plotly
import plotly.express as px

# Constant Variable

In [None]:
# Folder the input CSV is read from; the file name is appended directly, so keep the trailing slash.
SOURCE_PATH_DATA = 'resources/data/'
# Folder handed to the vp plotting helpers as out_path.
OUT_PATH_GRAPH   = 'resources/output/graph/customer/'

# Phase 1 - Data Loading
- Reference: https://www.kaggle.com/fazilbtopal/popular-unsupervised-clustering-algorithms
- CustomerID: Unique ID assigned to the customer
- Gender: Gender of the customer
- Age: Age of the customer
- Annual Income (k$): Annual Income of the customer
- Spending Score (1-100): Score assigned by the mall based on customer behavior and spending nature

In [None]:
# Read the mall-customer CSV from the configured data folder.
data_df = pd.read_csv(
    f'{SOURCE_PATH_DATA}Mall_Customers.csv',
    sep=',',
)

# (rows, columns) of the raw table
data_df.shape

In [None]:
# Preview the first rows of the freshly loaded table.
data_df.head()

In [None]:
# Remove the ID column and shorten the verbose column names.
# The frame is rebound instead of mutated with inplace=True, and errors='ignore'
# lets the cell be executed again without raising KeyError on the already
# dropped 'CustomerID' column (rename silently skips missing labels by default).
data_df = (
    data_df
    .drop(columns=['CustomerID'], errors='ignore')
    .rename(columns={
        'Annual Income (k$)': 'Income',
        'Spending Score (1-100)': 'Score',
    })
)

In [None]:
# Overview of data_df after the ID drop / rename (vp.faststat is a project helper; presumably summary statistics).
vp.faststat(data_df)

###### Histogram

In [None]:
# Histogram of every column in data_df, laid out two plots per row.
vp.histogram(
    data_df,
    title='Phase 1 - Histogram',
    bin_algo='count',
    max_col=2,
    out_path=OUT_PATH_GRAPH,
)

###### Box

In [None]:
# Box plot of every column in data_df, laid out two plots per row.
vp.box(
    data_df,
    title='Phase 1 - Box',
    max_col=2,
    out_path=OUT_PATH_GRAPH,
)

###### Pair

In [None]:
# Pair plot coloured by gender. The helper receives a throwaway copy,
# which is deleted right after plotting so it does not linger in the kernel.
pair_input_df = data_df.copy()

vp.pair(
    pair_input_df,
    title='Phase 1 - Pair',
    color='Gender',
    out_path=OUT_PATH_GRAPH,
)

del pair_input_df

# Phase 2 - Data Preparation
- Remove duplication
- Feature scaling
- Feature selection

In [None]:
# Work on an independent copy; data_df itself is reused by the later box / distribution plots.
X = data_df.copy(deep=True)

X.shape

In [None]:
# Remove duplicated data
# The fitted object is kept in its own variable because its duplicate_df
# attribute is inspected in the following cell; X is replaced by the result.
duplicate_removal = DFDuplicateRemoval()
X = duplicate_removal.fit_transform(X)

X.shape

In [None]:
# Inspect what the duplicate remover recorded (presumably the rows it flagged as duplicates — confirm in DFDuplicateRemoval).
duplicate_removal.duplicate_df

In [None]:
# Encode the object-typed columns, then filter features with DFVarianceThreshold(threshold=.01).
# Both steps keep their own variable so their fitted attributes stay reachable afterwards.
onehot_encoder = DFOneHotEncoder(
    columns=X.select_dtypes(include='object').columns,
    dtype='byte',
    drop='first',
)
variance_threshold = DFVarianceThreshold(threshold=.01)

steps = [('onehot_encoder', onehot_encoder), ('variance_threshold', variance_threshold)]
X = Pipeline(steps).fit_transform(X)

X.shape

In [None]:
# Statistics exposed by the fitted variance filter (project class; presumably the per-feature variances).
variance_threshold.stat_df

In [None]:
# May choose to drop Male feature, as it's not useful in creating segmentation based on pair-plot
# Every column of X except the one-hot 'Gender_Male' flag is handed to the scaler.
# The list is built from X (the encoded frame that is actually transformed) rather
# than data_df: data_df still carries the raw 'Gender' column and never contains
# 'Gender_Male', so filtering its columns had no effect and passed a column name
# that presumably no longer exists in X after one-hot encoding.
standard_scaler = DFStandardScaler(columns=[col for col in X.columns if col != 'Gender_Male'])
scale_df        = standard_scaler.fit_transform(X)

scale_df.describe()

In [None]:
# Feature selection
# NOTE(review): presumably drops features whose variance inflation factor is too high — confirm in DFVIFThreshold.
# scale_df is replaced by the filtered frame, which is what every clustering phase consumes.
scale_df = DFVIFThreshold(show_progress=True).fit_transform(scale_df)

scale_df.shape

In [None]:
# Overview of the scaled / selected feature frame that feeds the clustering phases.
vp.faststat(scale_df)

# Phase 3 - Clustering
- K-Means

In [None]:
# Evaluation run used to pick the number of clusters: every eval_* flag is on,
# and the resulting kmeans.eval_df feeds the line plot and score lookup below.
kmeans = DFKMeans(
    cluster_name='KMeans',
    n_clusters=15,
    random_state=0,
    n_jobs=-1,
    eval_inertia=True,
    eval_silhouette=True,
    eval_chi=True,
    eval_dbi=True,
)
kmeans.fit(scale_df)

###### Line

In [None]:
# One line chart per evaluation metric, each plotted against n_cluster.
vp.line(
    kmeans.eval_df,
    xy_tuples=[
        ('n_cluster', 'inertia'),
        ('n_cluster', 'silhouette'),
        ('n_cluster', 'calinski_harabasz'),
        ('n_cluster', 'davies_bouldin'),
    ],
    title='Phase 3 - N Cluster - K-Means',
    max_col=2,
    out_path=OUT_PATH_GRAPH,
)

In [None]:
# n_cluster at the row with the highest silhouette, the highest Calinski-Harabasz
# and the lowest Davies-Bouldin score, shown as a 3-tuple in that order.
tuple(
    kmeans.eval_df.loc[best_row]['n_cluster']
    for best_row in (
        kmeans.eval_df['silhouette'].idxmax(),
        kmeans.eval_df['calinski_harabasz'].idxmax(),
        kmeans.eval_df['davies_bouldin'].idxmin(),
    )
)

In [None]:
# Final K-Means model with 6 clusters: labels go to cluster_df, membership probabilities to proba_df.
kmeans = DFKMeans(
    cluster_name='KMeans',
    n_clusters=6,
    random_state=0,
    n_jobs=-1,
)
cluster_df = kmeans.fit_predict(scale_df)
proba_df = kmeans.predict_proba(scale_df)

vp.value_count(cluster_df, 'KMeans')

In [None]:
# centroid_df attribute of the fitted 6-cluster K-Means wrapper (presumably one row per cluster centre).
kmeans.centroid_df

In [None]:
# Sanity check: the probability column with the highest value per row, stripped
# of its 'KMeans ' prefix, should equal the predicted label in the 'KMeans' column.
# NOTE(review): the comparison is string-based — confirm the 'KMeans' labels are strings too.
proba_cols = [col for col in proba_df.columns if col.startswith('KMeans')]
top_proba = proba_df[proba_cols].idxmax(axis=1).to_frame('Probability')
tmp_df = pd.concat([cluster_df, top_proba], axis=1)

expected_label = tmp_df['Probability'].str.replace('KMeans ', '')
mismatch_df = tmp_df[tmp_df['KMeans'] != expected_label]
print(f'mismatch_df.shape: {mismatch_df.shape}')

# Drop every helper so the kernel namespace is left as before.
del tmp_df, mismatch_df, proba_cols, top_proba, expected_label

###### Pair

In [None]:
# Pair plot of the clustered frame, rows ordered by cluster label and coloured by it.
vp.pair(
    cluster_df.sort_values('KMeans'),
    title='Phase 3 - Pair - K-Means',
    color='KMeans',
    out_path=OUT_PATH_GRAPH,
)

###### Box

In [None]:
# Box plots of the original (unscaled) columns, split by K-Means cluster.
# NOTE(review): the column-wise concat relies on cluster_df and data_df sharing the same row index — confirm.
vp.box(
    pd.concat([cluster_df[['KMeans']], data_df], axis='columns'),
    title='Phase 3 - Box - K-Means',
    color='KMeans',
    max_col=2,
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=dict(showlegend=False),
)

###### Distribution Matrix

In [None]:
# Distribution matrix of the original columns against the K-Means label, colour range fixed to [0, 1].
# NOTE(review): the column-wise concat relies on cluster_df and data_df sharing the same row index — confirm.
vp.distmat(
    pd.concat([cluster_df[['KMeans']], data_df], axis='columns'),
    title='Phase 3 - Distribution Matrix - K-Means',
    target='KMeans',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs=dict(colorscale='Dense', zmin=0, zmax=1),
)

# Phase 4 - Clustering
- Gaussian mixtures

In [None]:
# Evaluation run used to pick the number of components: every eval_* flag is on,
# and the resulting gmm.eval_df feeds the line plot and score lookup below.
gmm = DFGaussianMixture(
    cluster_name='GMM',
    n_components=15,
    random_state=0,
    eval_aic=True,
    eval_bic=True,
    eval_silhouette=True,
    eval_chi=True,
    eval_dbi=True,
)
gmm.fit(scale_df)

###### Line

In [None]:
# One line chart per evaluation metric, each plotted against n_cluster.
vp.line(
    gmm.eval_df,
    xy_tuples=[
        ('n_cluster', 'akaike'),
        ('n_cluster', 'bayesian'),
        ('n_cluster', 'silhouette'),
        ('n_cluster', 'calinski_harabasz'),
        ('n_cluster', 'davies_bouldin'),
    ],
    title='Phase 4 - N Cluster - GMM',
    max_col=3,
    out_path=OUT_PATH_GRAPH,
)

In [None]:
# n_cluster at the row with the lowest AIC, lowest BIC, highest silhouette,
# highest Calinski-Harabasz and lowest Davies-Bouldin score, as a 5-tuple in that order.
tuple(
    gmm.eval_df.loc[best_row]['n_cluster']
    for best_row in (
        gmm.eval_df['akaike'].idxmin(),
        gmm.eval_df['bayesian'].idxmin(),
        gmm.eval_df['silhouette'].idxmax(),
        gmm.eval_df['calinski_harabasz'].idxmax(),
        gmm.eval_df['davies_bouldin'].idxmin(),
    )
)

In [None]:
# Final Gaussian mixture with 4 components: labels go to cluster_df, membership probabilities to proba_df.
gmm = DFGaussianMixture(
    cluster_name='GMM',
    n_components=4,
    random_state=0,
)
cluster_df = gmm.fit_predict(scale_df)
proba_df = gmm.predict_proba(scale_df)

vp.value_count(cluster_df, 'GMM')

In [None]:
# centroid_df attribute of the fitted 4-component mixture wrapper (presumably one row per component centre).
gmm.centroid_df

In [None]:
# Sanity check: the probability column with the highest value per row, stripped
# of its 'GMM ' prefix, should equal the predicted label in the 'GMM' column.
# NOTE(review): the comparison is string-based — confirm the 'GMM' labels are strings too.
proba_cols = [col for col in proba_df.columns if col.startswith('GMM')]
top_proba = proba_df[proba_cols].idxmax(axis=1).to_frame('Probability')
tmp_df = pd.concat([cluster_df, top_proba], axis=1)

expected_label = tmp_df['Probability'].str.replace('GMM ', '')
mismatch_df = tmp_df[tmp_df['GMM'] != expected_label]
print(f'mismatch_df.shape: {mismatch_df.shape}')

# Drop every helper so the kernel namespace is left as before.
del tmp_df, mismatch_df, proba_cols, top_proba, expected_label

###### Pair

In [None]:
# Pair plot of the clustered frame, rows ordered by cluster label and coloured by it.
vp.pair(
    cluster_df.sort_values('GMM'),
    title='Phase 4 - Pair - GMM',
    color='GMM',
    out_path=OUT_PATH_GRAPH,
)

###### Box

In [None]:
# Box plots of the original (unscaled) columns, split by GMM cluster.
# NOTE(review): the column-wise concat relies on cluster_df and data_df sharing the same row index — confirm.
vp.box(
    pd.concat([cluster_df[['GMM']], data_df], axis='columns'),
    title='Phase 4 - Box - GMM',
    color='GMM',
    max_col=2,
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=dict(showlegend=False),
)

###### Distribution Matrix

In [None]:
# Distribution matrix of the original columns against the GMM label, colour range fixed to [0, 1].
# NOTE(review): the column-wise concat relies on cluster_df and data_df sharing the same row index — confirm.
vp.distmat(
    pd.concat([cluster_df[['GMM']], data_df], axis='columns'),
    title='Phase 4 - Distribution Matrix - GMM',
    target='GMM',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs=dict(colorscale='Dense', zmin=0, zmax=1),
)

# Phase 5 - Clustering
- Agglomerative

In [None]:
# Reference: https://stackabuse.com/hierarchical-clustering-with-python-and-scikit-learn/
# Evaluation run used to pick the number of clusters: every eval_* flag is on,
# and the resulting agglo.eval_df feeds the line plot and score lookup below.
agglo = DFAgglomerative(
    cluster_name='Agglo_15',
    n_clusters=15,
    random_state=0,
    eval_silhouette=True,
    eval_chi=True,
    eval_dbi=True,
)
agglo.fit(scale_df)

###### Line

In [None]:
# One line chart per evaluation metric, each plotted against n_cluster.
vp.line(
    agglo.eval_df,
    xy_tuples=[
        ('n_cluster', 'silhouette'),
        ('n_cluster', 'calinski_harabasz'),
        ('n_cluster', 'davies_bouldin'),
    ],
    title='Phase 5 - N Cluster - Agglomerative',
    max_col=2,
    out_path=OUT_PATH_GRAPH,
)

###### Dendrogram

In [None]:
# Dendrogram of the scaled features, rendered at a fixed 1350 x 600 layout.
vp.dendrogram(
    scale_df,
    title='Phase 5 - Dendrogram - Agglomerative',
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=dict(width=1350, height=600),
)

In [None]:
# n_cluster at the row with the highest silhouette, the highest Calinski-Harabasz
# and the lowest Davies-Bouldin score, shown as a 3-tuple in that order.
tuple(
    agglo.eval_df.loc[best_row]['n_cluster']
    for best_row in (
        agglo.eval_df['silhouette'].idxmax(),
        agglo.eval_df['calinski_harabasz'].idxmax(),
        agglo.eval_df['davies_bouldin'].idxmin(),
    )
)

In [None]:
# Final agglomerative model with 6 clusters; the labelled frame goes to cluster_df.
agglo = DFAgglomerative(
    cluster_name='Agglo',
    n_clusters=6,
    random_state=0,
)
cluster_df = agglo.fit_predict(scale_df)

vp.value_count(cluster_df, 'Agglo')

In [None]:
# centroid_df attribute of the fitted 6-cluster agglomerative wrapper (presumably one row per cluster centre).
agglo.centroid_df

###### Pair

In [None]:
# Pair plot of the clustered frame, rows ordered by cluster label and coloured by it.
vp.pair(
    cluster_df.sort_values('Agglo'),
    title='Phase 5 - Pair - Agglomerative',
    color='Agglo',
    out_path=OUT_PATH_GRAPH,
)

###### Box

In [None]:
# Box plots of the original (unscaled) columns, split by agglomerative cluster.
# NOTE(review): the column-wise concat relies on cluster_df and data_df sharing the same row index — confirm.
vp.box(
    pd.concat([cluster_df[['Agglo']], data_df], axis='columns'),
    title='Phase 5 - Box - Agglomerative',
    color='Agglo',
    max_col=2,
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=dict(showlegend=False),
)

###### Distribution Matrix

In [None]:
# Distribution matrix of the original columns against the agglomerative label, colour range fixed to [0, 1].
# NOTE(review): the column-wise concat relies on cluster_df and data_df sharing the same row index — confirm.
vp.distmat(
    pd.concat([cluster_df[['Agglo']], data_df], axis='columns'),
    title='Phase 5 - Distribution Matrix - Agglomerative',
    target='Agglo',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs=dict(colorscale='Dense', zmin=0, zmax=1),
)

# Phase 6 - Clustering
- DBSCAN

In [None]:
# Evaluation run over (eps, min_samples) candidates: eps from 0.1 up to (but excluding) 2
# in steps of 0.1, rounded to 5 decimals to strip float noise, each paired with min_samples=5.
dbscan = DFDBSCAN(
    cluster_name='DBSCAN',
    random_state=0,
    n_jobs=-1,
    eps_samples_tuples=[(round(eps, 5), 5) for eps in np.arange(.1, 2, .1)],
    eval_cluster=True,
    eval_silhouette=True,
    eval_chi=True,
    eval_dbi=True,
)
dbscan.fit(scale_df)

###### Line

In [None]:
# One line chart per evaluation column, each plotted against eps.
vp.line(
    dbscan.eval_df,
    xy_tuples=[
        ('eps', 'n_cluster'),
        ('eps', 'n_noise'),
        ('eps', 'silhouette'),
        ('eps', 'calinski_harabasz'),
        ('eps', 'davies_bouldin'),
    ],
    title='Phase 6 - EPS Evaluation - DBSCAN',
    max_col=3,
    out_path=OUT_PATH_GRAPH,
)

In [None]:
# eps at the row with the highest silhouette, the highest Calinski-Harabasz
# and the lowest Davies-Bouldin score, shown as a 3-tuple in that order.
tuple(
    dbscan.eval_df.loc[best_row]['eps']
    for best_row in (
        dbscan.eval_df['silhouette'].idxmax(),
        dbscan.eval_df['calinski_harabasz'].idxmax(),
        dbscan.eval_df['davies_bouldin'].idxmin(),
    )
)

In [None]:
# Final DBSCAN model with eps=1.1 and min_samples=5; the labelled frame goes to cluster_df.
dbscan = DFDBSCAN(
    cluster_name='DBSCAN',
    random_state=0,
    n_jobs=-1,
    eps=1.1,
    min_samples=5,
)
cluster_df = dbscan.fit_predict(scale_df)

vp.value_count(cluster_df, 'DBSCAN')

In [None]:
# centroid_df attribute of the fitted DBSCAN wrapper (how centres are derived is defined in DFDBSCAN — not visible here).
dbscan.centroid_df

###### Pair

In [None]:
# Pair plot of the clustered frame, rows ordered by cluster label and coloured by it.
vp.pair(
    cluster_df.sort_values('DBSCAN'),
    title='Phase 6 - Pair - DBSCAN',
    color='DBSCAN',
    out_path=OUT_PATH_GRAPH,
)

###### Box

In [None]:
# Box plots of the original (unscaled) columns, split by DBSCAN cluster.
# NOTE(review): the column-wise concat relies on cluster_df and data_df sharing the same row index — confirm.
vp.box(
    pd.concat([cluster_df[['DBSCAN']], data_df], axis='columns'),
    title='Phase 6 - Box - DBSCAN',
    color='DBSCAN',
    max_col=2,
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=dict(showlegend=False),
)

###### Distribution Matrix

In [None]:
# Distribution matrix of the original columns against the DBSCAN label, colour range fixed to [0, 1].
# NOTE(review): the column-wise concat relies on cluster_df and data_df sharing the same row index — confirm.
vp.distmat(
    pd.concat([cluster_df[['DBSCAN']], data_df], axis='columns'),
    title='Phase 6 - Distribution Matrix - DBSCAN',
    target='DBSCAN',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs=dict(colorscale='Dense', zmin=0, zmax=1),
)

# Phase 7 - Clustering
- K-Medoids

In [None]:
# Evaluation run used to pick the number of clusters: every eval_* flag is on,
# and the resulting kmedoids.eval_df feeds the line plot and score lookup below.
kmedoids = DFKMedoids(
    cluster_name='KMedoids',
    n_clusters=15,
    random_state=0,
    eval_inertia=True,
    eval_silhouette=True,
    eval_chi=True,
    eval_dbi=True,
)
kmedoids.fit(scale_df)

###### Line

In [None]:
# One line chart per evaluation metric, each plotted against n_cluster.
vp.line(
    kmedoids.eval_df,
    xy_tuples=[
        ('n_cluster', 'inertia'),
        ('n_cluster', 'silhouette'),
        ('n_cluster', 'calinski_harabasz'),
        ('n_cluster', 'davies_bouldin'),
    ],
    title='Phase 7 - N Cluster - K-Medoids',
    max_col=2,
    out_path=OUT_PATH_GRAPH,
)

In [None]:
# n_cluster at the row with the highest silhouette, the highest Calinski-Harabasz
# and the lowest Davies-Bouldin score, shown as a 3-tuple in that order.
tuple(
    kmedoids.eval_df.loc[best_row]['n_cluster']
    for best_row in (
        kmedoids.eval_df['silhouette'].idxmax(),
        kmedoids.eval_df['calinski_harabasz'].idxmax(),
        kmedoids.eval_df['davies_bouldin'].idxmin(),
    )
)

In [None]:
# Final K-Medoids model with 4 clusters: labels go to cluster_df, membership probabilities to proba_df.
kmedoids = DFKMedoids(
    cluster_name='KMedoids',
    n_clusters=4,
    random_state=0,
)
cluster_df = kmedoids.fit_predict(scale_df)
proba_df = kmedoids.predict_proba(scale_df)

vp.value_count(cluster_df, 'KMedoids')

In [None]:
# centroid_df attribute of the fitted 4-cluster K-Medoids wrapper (presumably one row per medoid).
kmedoids.centroid_df

In [None]:
# Sanity check: the probability column with the highest value per row, stripped
# of its 'KMedoids ' prefix, should equal the predicted label in the 'KMedoids' column.
# NOTE(review): the comparison is string-based — confirm the 'KMedoids' labels are strings too.
proba_cols = [col for col in proba_df.columns if col.startswith('KMedoids')]
top_proba = proba_df[proba_cols].idxmax(axis=1).to_frame('Probability')
tmp_df = pd.concat([cluster_df, top_proba], axis=1)

expected_label = tmp_df['Probability'].str.replace('KMedoids ', '')
mismatch_df = tmp_df[tmp_df['KMedoids'] != expected_label]
print(f'mismatch_df.shape: {mismatch_df.shape}')

# Drop every helper so the kernel namespace is left as before.
del tmp_df, mismatch_df, proba_cols, top_proba, expected_label

###### Pair

In [None]:
# Pair plot of the clustered frame, rows ordered by cluster label and coloured by it.
vp.pair(
    cluster_df.sort_values('KMedoids'),
    title='Phase 7 - Pair - K-Medoids',
    color='KMedoids',
    out_path=OUT_PATH_GRAPH,
)

###### Box

In [None]:
# Box plots of the original (unscaled) columns, split by K-Medoids cluster.
# NOTE(review): the column-wise concat relies on cluster_df and data_df sharing the same row index — confirm.
vp.box(
    pd.concat([cluster_df[['KMedoids']], data_df], axis='columns'),
    title='Phase 7 - Box - K-Medoids',
    color='KMedoids',
    max_col=2,
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=dict(showlegend=False),
)

###### Distribution Matrix

In [None]:
# Distribution matrix of the original columns against the K-Medoids label, colour range fixed to [0, 1].
# NOTE(review): the column-wise concat relies on cluster_df and data_df sharing the same row index — confirm.
vp.distmat(
    pd.concat([cluster_df[['KMedoids']], data_df], axis='columns'),
    title='Phase 7 - Distribution Matrix - K-Medoids',
    target='KMedoids',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs=dict(colorscale='Dense', zmin=0, zmax=1),
)