In [None]:
import lib._util.visualplot as vp

# Pre-processing
from lib._class.DFDuplicateRemoval import DFDuplicateRemoval
from lib._class.DFVarianceThreshold import DFVarianceThreshold
from lib._class.DFVIFThreshold import DFVIFThreshold

# Feature encoding
from lib._class.DFOneHotEncoder import DFOneHotEncoder

# Feature scaling
from lib._class.DFStandardScaler import DFStandardScaler

# Clustering
from lib._class.DFKMeans import DFKMeans
from lib._class.DFGaussianMixture import DFGaussianMixture
from lib._class.DFAgglomerative import DFAgglomerative
from lib._class.DFDBSCAN import DFDBSCAN

In [None]:
import pandas as pd
# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

import numpy as np

# Scikit-learn
from sklearn.pipeline import Pipeline

# Plotly
import plotly.express as px

# Constant Variable

In [None]:
# Directory the input CSV is read from (relative path, trailing slash included
# because it is concatenated directly with the file name).
SOURCE_PATH_DATA = 'resources/data/'

# Directory passed as `out_path` to every plotting helper in this notebook.
OUT_PATH_GRAPH = 'resources/output/graph/customer/'

# Phase 1 - Data Loading
- Reference: https://www.kaggle.com/fazilbtopal/popular-unsupervised-clustering-algorithms
- CustomerID: Unique ID assigned to the customer
- Gender: Gender of the customer
- Age: Age of the customer
- Annual Income (k$): Annual Income of the customer
- Spending Score (1-100): Score assigned by the mall based on customer behavior and spending nature

In [None]:
# Load the raw customer data; the file is comma-separated, which is
# `read_csv`'s default delimiter.
data_df = pd.read_csv(SOURCE_PATH_DATA + 'Mall_Customers.csv')

# (rows, columns) of the raw data
data_df.shape

In [None]:
# Preview the first five rows.
data_df.head(n=5)

In [None]:
# Remove the ID column and shorten the verbose column names.
#
# `data_df` is rebound to a new frame instead of being mutated in place so the
# cell can be re-run safely: `errors='ignore'` tolerates a CustomerID column
# that has already been dropped, and `rename` skips labels that are no longer
# present. The previous `inplace=True` version raised KeyError on a second run.
data_df = (
    data_df
    .drop(columns=['CustomerID'], errors='ignore')
    .rename(columns={
        'Annual Income (k$)': 'Income',
        'Spending Score (1-100)': 'Score',
    })
)

In [None]:
# Summary of the cleaned raw data via the project helper `lib._util.visualplot`
# (what exactly it reports is defined in that module, not visible here).
vp.faststat(data_df)

###### Histogram

In [None]:
# Histogram of the raw data; the figure is written under OUT_PATH_GRAPH.
vp.histogram(
    data_df,
    bin_algo='count',
    max_col=2,
    title='Phase 1 - Histogram',
    out_path=OUT_PATH_GRAPH,
)

###### Box

In [None]:
# Box plots of the raw data; the figure is written under OUT_PATH_GRAPH.
vp.box(
    data_df,
    max_col=2,
    title='Phase 1 - Box',
    out_path=OUT_PATH_GRAPH,
)

###### Pair

In [None]:
# Pair plot of the raw features coloured by Gender. The helper receives a
# throwaway copy, so `data_df` itself is never handed to it.
pair_df = data_df.copy(deep=True)

vp.pair(
    pair_df,
    color='Gender',
    title='Phase 1 - Pair',
    out_path=OUT_PATH_GRAPH,
)

# Release the temporary copy.
del pair_df

# Phase 2 - Data Preparation
- Remove duplication
- Feature scaling
- Feature selection

In [None]:
# Work on an independent copy so `data_df` keeps the untransformed values
# that the later per-cluster plots are drawn from.
X = data_df.copy(deep=True)

# Shape before any preparation step
X.shape

In [None]:
# Remove duplicated data
# DFDuplicateRemoval is a project transformer from lib._class; the fitted
# instance is kept because its `duplicate_df` attribute is displayed in the
# following cell.
# NOTE(review): later cells join cluster labels back onto `data_df` by index
# (pd.concat, axis=1) — confirm this transform preserves the original index.
duplicate_removal = DFDuplicateRemoval()
X = duplicate_removal.fit_transform(X)

# Shape after the transform, to compare with the shape shown before it
X.shape

In [None]:
# Display the `duplicate_df` attribute of the fitted transformer
# (presumably the rows it identified as duplicates — confirm in DFDuplicateRemoval).
duplicate_removal.duplicate_df

In [None]:
# Remove low variance feature
# Object-dtype columns are one-hot encoded first so that the variance filter
# receives the encoded frame.
categorical_columns = X.select_dtypes(include='object').columns
onehot_encoder = DFOneHotEncoder(columns=categorical_columns, dtype='byte', drop='first')
variance_threshold = DFVarianceThreshold(threshold=0.01)

steps = [
    ('onehot_encoder', onehot_encoder),
    ('variance_threshold', variance_threshold),
]
preprocess_pipeline = Pipeline(steps=steps)
X = preprocess_pipeline.fit_transform(X)

# Shape after encoding and variance filtering
X.shape

In [None]:
# Display the `stat_df` attribute of the fitted variance filter
# (presumably per-feature variance statistics — confirm in DFVarianceThreshold).
variance_threshold.stat_df

In [None]:
# May choose to drop the encoded gender feature, as it's not useful in creating
# segmentation based on the pair-plot.
#
# Standardise only the original features that still exist in X. The previous
# list was built from `data_df.columns` filtered on 'Gender_Male': that name
# never occurs in `data_df`, so the filter did nothing and the raw 'Gender'
# column name was passed to the scaler. Intersecting with `X.columns` keeps
# the same ordering, leaves the one-hot dummy unscaled, and drops any column
# removed by the earlier preparation steps.
scale_columns   = [col for col in data_df.columns if col in X.columns]
standard_scaler = DFStandardScaler(columns=scale_columns)
scale_df        = standard_scaler.fit_transform(X)

# Sanity check of the scaled features
scale_df.describe()

In [None]:
# Feature selection
# Apply the project's VIF-based filter to the scaled features.
vif_threshold = DFVIFThreshold(show_progress=True)
scale_df = vif_threshold.fit_transform(scale_df)

# Shape after feature selection
scale_df.shape

In [None]:
# Summary of the final feature matrix that every clustering phase below is fitted on.
vp.faststat(scale_df)

# Phase 3 - Clustering
- K-Means

In [None]:
# Determine number of clusters
# The eval_* flags request the metrics that are read from `kmeans.eval_df`
# in the following cells.
kmeans = DFKMeans(
    cluster_name='KMeans_15',
    n_clusters=15,
    random_state=0,
    n_jobs=-1,
    eval_inertia=True,
    eval_silhouette=True,
    eval_chi=True,
    eval_dbi=True,
)
kmeans.fit(scale_df)

###### Line

In [None]:
# One line chart per metric, each plotted against the number of clusters.
vp.line(
    kmeans.eval_df,
    xy_tuples=[
        ('n_cluster', metric)
        for metric in ['inertia', 'silhouette', 'calinski_harabasz', 'davies_bouldin']
    ],
    max_col=2,
    title='Phase 3 - Cluster Evaluation - K-Means',
    out_path=OUT_PATH_GRAPH,
)

In [None]:
# Determine number of clusters by scores
# Each entry is the n_cluster of the row with the best value of one metric:
# silhouette and Calinski-Harabasz are maximised, Davies-Bouldin is minimised.
# `.loc[row, column]` reads the cell directly; the former `.loc[row][column]`
# built a whole-row Series first, which upcasts mixed dtypes.
eval_df = kmeans.eval_df
(
    eval_df.loc[eval_df['silhouette'].idxmax(), 'n_cluster'],
    eval_df.loc[eval_df['calinski_harabasz'].idxmax(), 'n_cluster'],
    eval_df.loc[eval_df['davies_bouldin'].idxmin(), 'n_cluster'],
)

In [None]:
# Clustering
# Final K-Means model with 6 clusters (presumably chosen from the evaluation
# above — confirm against the plotted metrics).
kmeans = DFKMeans(
    cluster_name='KMeans_6',
    n_clusters=6,
    random_state=0,
    n_jobs=-1,
)
cluster_df = kmeans.fit_predict(scale_df)

# Cluster sizes
vp.value_count(cluster_df, 'KMeans_6')

###### Pair

In [None]:
# Sort by cluster label, then turn the label into a string so it is treated
# as a category when used as the colour. `sort_values` already returns a new
# frame, so `cluster_df` is left untouched.
pair_df = cluster_df.sort_values(by='KMeans_6').astype({'KMeans_6': str})

vp.pair(
    pair_df,
    color='KMeans_6',
    title='Phase 3 - Pair - K-Means',
    out_path=OUT_PATH_GRAPH,
)

del pair_df

###### Box

In [None]:
# Attach the K-Means label to the unscaled features (aligned on the index) so
# the box plots are in original units; the label is cast to string so it can
# be used as a categorical colour.
tmp_df = pd.concat([cluster_df[['KMeans_6']], data_df], axis=1)
tmp_df['KMeans_6'] = tmp_df['KMeans_6'].astype(str)

# The frame is passed as-is, matching the box cells of the other clustering
# phases. The earlier `x != 'Male'` column filter was removed because neither
# `data_df` nor the label column contains a 'Male' column, so it selected
# every column anyway.
vp.box(tmp_df,
       color='KMeans_6',
       max_col=2,
       title='Phase 3 - Box - K-Means',
       out_path=OUT_PATH_GRAPH,
       layout_kwargs={'showlegend': False})

del tmp_df

###### Distribution Matrix

In [None]:
# K-Means label joined column-wise (index-aligned) to the unscaled features.
distmat_df = pd.concat([cluster_df[['KMeans_6']], data_df], axis='columns')

vp.distmat(
    distmat_df,
    target='KMeans_6',
    title='Phase 3 - Distribution Matrix - K-Means',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs=dict(colorscale='Dense', zmin=0, zmax=1),
)

del distmat_df

# Phase 4 - Clustering
- Gaussian mixtures

In [None]:
# Determine number of clusters
# The eval_* flags request the metrics that are read from `gmm.eval_df`
# in the following cells.
gmm = DFGaussianMixture(
    cluster_name='GMM_15',
    n_components=15,
    random_state=0,
    eval_aic=True,
    eval_bic=True,
    eval_silhouette=True,
    eval_chi=True,
    eval_dbi=True,
)
gmm.fit(scale_df)

###### Line

In [None]:
# One line chart per metric, each plotted against the number of clusters.
vp.line(
    gmm.eval_df,
    xy_tuples=[
        ('n_cluster', metric)
        for metric in ['akaike', 'bayesian', 'silhouette', 'calinski_harabasz', 'davies_bouldin']
    ],
    max_col=3,
    title='Phase 4 - Cluster Evaluation - GMM',
    out_path=OUT_PATH_GRAPH,
)

In [None]:
# Determine number of clusters by scores
# Each entry is the n_cluster of the row with the best value of one metric:
# AIC, BIC and Davies-Bouldin are minimised; silhouette and Calinski-Harabasz
# are maximised.
#
# `.iloc[a:b]` makes the positional row windows explicit (the former `df[a:b]`
# slices were positional too), and `.loc[row, column]` replaces the chained
# `.loc[row][column]`, which built an upcast whole-row Series first.
#
# NOTE(review): the windows (:7, 2:, :5) are hand-picked and limit where the
# optimum is searched — presumably to skip ranges where the metric keeps
# improving monotonically; confirm against the line chart and document why.
eval_df = gmm.eval_df
(
    eval_df.loc[eval_df['akaike'].iloc[:7].idxmin(), 'n_cluster'],
    eval_df.loc[eval_df['bayesian'].idxmin(), 'n_cluster'],
    eval_df.loc[eval_df['silhouette'].idxmax(), 'n_cluster'],
    eval_df.loc[eval_df['calinski_harabasz'].iloc[2:].idxmax(), 'n_cluster'],
    eval_df.loc[eval_df['davies_bouldin'].iloc[:5].idxmin(), 'n_cluster'],
)

In [None]:
# Clustering
# Final Gaussian-mixture model with 4 components (presumably chosen from the
# evaluation above — confirm against the plotted metrics).
gmm = DFGaussianMixture(
    cluster_name='GMM_4',
    n_components=4,
    random_state=0,
)
cluster_df = gmm.fit_predict(scale_df)

# Cluster sizes
vp.value_count(cluster_df, 'GMM_4')

###### Pair

In [None]:
# Sort by cluster label, then turn the label into a string so it is treated
# as a category when used as the colour. `sort_values` already returns a new
# frame, so `cluster_df` is left untouched.
pair_df = cluster_df.sort_values(by='GMM_4').astype({'GMM_4': str})

vp.pair(
    pair_df,
    color='GMM_4',
    title='Phase 4 - Pair - GMM',
    out_path=OUT_PATH_GRAPH,
)

del pair_df

###### Box

In [None]:
# GMM label joined (index-aligned) to the unscaled features; the label is cast
# to string after the join so it can be used as a categorical colour.
box_df = (
    pd.concat([cluster_df[['GMM_4']], data_df], axis=1)
    .astype({'GMM_4': str})
)

vp.box(
    box_df,
    color='GMM_4',
    max_col=2,
    title='Phase 4 - Box - GMM',
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=dict(showlegend=False),
)

del box_df

###### Distribution Matrix

In [None]:
# GMM label joined column-wise (index-aligned) to the unscaled features.
distmat_df = pd.concat([cluster_df[['GMM_4']], data_df], axis='columns')

vp.distmat(
    distmat_df,
    target='GMM_4',
    title='Phase 4 - Distribution Matrix - GMM',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs=dict(colorscale='Dense', zmin=0, zmax=1),
)

del distmat_df

# Phase 5 - Clustering
- Agglomerative

In [None]:
# Reference: https://stackabuse.com/hierarchical-clustering-with-python-and-scikit-learn/
# Determine number of clusters
# The eval_* flags request the metrics that are read from `agglo.eval_df`
# in the following cells.
agglo = DFAgglomerative(
    cluster_name='Agglo_15',
    n_clusters=15,
    random_state=0,
    eval_silhouette=True,
    eval_chi=True,
    eval_dbi=True,
)
agglo.fit(scale_df)

###### Line

In [None]:
# One line chart per metric, each plotted against the number of clusters.
vp.line(
    agglo.eval_df,
    xy_tuples=[
        ('n_cluster', metric)
        for metric in ['silhouette', 'calinski_harabasz', 'davies_bouldin']
    ],
    max_col=2,
    title='Phase 5 - Cluster Evaluation - Agglomerative',
    out_path=OUT_PATH_GRAPH,
)

###### Dendrogram

In [None]:
# Dendrogram of the scaled features, rendered at a fixed 1350x600 layout.
vp.dendrogram(
    scale_df,
    title='Phase 5 - Dendrogram - Agglomerative',
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=dict(width=1350, height=600),
)

In [None]:
# Determine number of clusters by scores
# Each entry is the n_cluster of the row with the best value of one metric:
# silhouette and Calinski-Harabasz are maximised, Davies-Bouldin is minimised.
# `.loc[row, column]` reads the cell directly; the former `.loc[row][column]`
# built a whole-row Series first, which upcasts mixed dtypes.
eval_df = agglo.eval_df
(
    eval_df.loc[eval_df['silhouette'].idxmax(), 'n_cluster'],
    eval_df.loc[eval_df['calinski_harabasz'].idxmax(), 'n_cluster'],
    eval_df.loc[eval_df['davies_bouldin'].idxmin(), 'n_cluster'],
)

In [None]:
# Clustering
# Final agglomerative model with 6 clusters (presumably chosen from the
# evaluation above — confirm against the plotted metrics and dendrogram).
agglo = DFAgglomerative(
    cluster_name='Agglo_6',
    n_clusters=6,
    random_state=0,
)
cluster_df = agglo.fit_predict(scale_df)

# Cluster sizes
vp.value_count(cluster_df, 'Agglo_6')

###### Pair

In [None]:
# Sort by cluster label, then turn the label into a string so it is treated
# as a category when used as the colour. `sort_values` already returns a new
# frame, so `cluster_df` is left untouched.
pair_df = cluster_df.sort_values(by='Agglo_6').astype({'Agglo_6': str})

vp.pair(
    pair_df,
    color='Agglo_6',
    title='Phase 5 - Pair - Agglomerative',
    out_path=OUT_PATH_GRAPH,
)

del pair_df

###### Box

In [None]:
# Agglomerative label joined (index-aligned) to the unscaled features; the
# label is cast to string after the join so it can be used as a categorical colour.
box_df = (
    pd.concat([cluster_df[['Agglo_6']], data_df], axis=1)
    .astype({'Agglo_6': str})
)

vp.box(
    box_df,
    color='Agglo_6',
    max_col=2,
    title='Phase 5 - Box - Agglomerative',
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=dict(showlegend=False),
)

del box_df

###### Distribution Matrix

In [None]:
# Agglomerative label joined column-wise (index-aligned) to the unscaled features.
distmat_df = pd.concat([cluster_df[['Agglo_6']], data_df], axis='columns')

vp.distmat(
    distmat_df,
    target='Agglo_6',
    title='Phase 5 - Distribution Matrix - Agglomerative',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs=dict(colorscale='Dense', zmin=0, zmax=1),
)

del distmat_df

# Phase 6 - Clustering
- DBSCAN

In [None]:
# Determine hyperparameters with highest score
# Candidate (eps, min_samples) pairs: eps from 0.1 up to (but excluding) 2.0 in
# steps of 0.1, min_samples fixed at 5. Rounding strips the floating-point
# noise that `np.arange` accumulates with a fractional step.
dbscan = DFDBSCAN(
    cluster_name='DBSCAN',
    random_state=0,
    n_jobs=-1,
    eps_samples_tuples=[(round(eps, 5), 5) for eps in np.arange(.1, 2, .1)],
    eval_cluster=True,
    eval_silhouette=True,
    eval_chi=True,
    eval_dbi=True,
)
dbscan.fit(scale_df)

###### Line

In [None]:
# One line chart per evaluation column, each plotted against eps.
vp.line(
    dbscan.eval_df,
    xy_tuples=[
        ('eps', metric)
        for metric in ['n_cluster', 'n_noise', 'silhouette', 'calinski_harabasz', 'davies_bouldin']
    ],
    max_col=3,
    title='Phase 6 - EPS Evaluation - DBSCAN',
    out_path=OUT_PATH_GRAPH,
)

In [None]:
# Determine EPS by scores
# Each entry is the eps of the row with the best value of one metric:
# silhouette and Calinski-Harabasz are maximised, Davies-Bouldin is minimised.
# `.loc[row, column]` reads the cell directly; the former `.loc[row][column]`
# built a whole-row Series first, which upcasts mixed dtypes.
eval_df = dbscan.eval_df
(
    eval_df.loc[eval_df['silhouette'].idxmax(), 'eps'],
    eval_df.loc[eval_df['calinski_harabasz'].idxmax(), 'eps'],
    eval_df.loc[eval_df['davies_bouldin'].idxmin(), 'eps'],
)

In [None]:
# Clustering
# Final DBSCAN model (eps=1.1 presumably chosen from the evaluation above —
# confirm against the plotted metrics).
dbscan = DFDBSCAN(
    cluster_name='DBSCAN',
    random_state=0,
    n_jobs=-1,
    eps=1.1,
    min_samples=5,
)
cluster_df = dbscan.fit_predict(scale_df)

# Cluster sizes
vp.value_count(cluster_df, 'DBSCAN')

###### Pair

In [None]:
# Sort by cluster label, then turn the label into a string so it is treated
# as a category when used as the colour. `sort_values` already returns a new
# frame, so `cluster_df` is left untouched.
pair_df = cluster_df.sort_values(by='DBSCAN').astype({'DBSCAN': str})

vp.pair(
    pair_df,
    color='DBSCAN',
    title='Phase 6 - Pair - DBSCAN',
    out_path=OUT_PATH_GRAPH,
)

del pair_df

###### Box

In [None]:
# DBSCAN label joined (index-aligned) to the unscaled features; the label is
# cast to string after the join so it can be used as a categorical colour.
box_df = (
    pd.concat([cluster_df[['DBSCAN']], data_df], axis=1)
    .astype({'DBSCAN': str})
)

vp.box(
    box_df,
    color='DBSCAN',
    max_col=2,
    title='Phase 6 - Box - DBSCAN',
    out_path=OUT_PATH_GRAPH,
    layout_kwargs=dict(showlegend=False),
)

del box_df

###### Distribution Matrix

In [None]:
# DBSCAN label joined column-wise (index-aligned) to the unscaled features.
distmat_df = pd.concat([cluster_df[['DBSCAN']], data_df], axis='columns')

vp.distmat(
    distmat_df,
    target='DBSCAN',
    title='Phase 6 - Distribution Matrix - DBSCAN',
    out_path=OUT_PATH_GRAPH,
    heatmap_kwargs=dict(colorscale='Dense', zmin=0, zmax=1),
)

del distmat_df