In [None]:
import numpy as np
import pandas as pd

# Plotly
from plotly.offline import init_notebook_mode, iplot, plot
from plotly.subplots import make_subplots
import plotly.graph_objects as go
init_notebook_mode(connected=True)

# Scikit-Learn
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, MeanShift, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

# Sample Data

In [None]:
def populate_sample(size):
    """Build a reproducible synthetic customer table with `size` rows.

    Reseeds NumPy's global random generator with 1 on every call, so the
    same `size` always yields the same frame.

    Args:
        size: Number of rows to generate.

    Returns:
        A DataFrame with the columns `gender`, `age`, `country` and `balance`.
    """
    np.random.seed(1)

    gender_choices = ['male', 'female']
    country_choices = ['MY', 'CY', 'US', 'SG', 'FR', 'EU']

    # The draws are made in column order (gender, age, country, balance);
    # reordering them would change the generated values.
    gender_idx = np.random.randint(low=0, high=len(gender_choices), size=size)
    ages = np.random.randint(low=1, high=99, size=size)  # `high` is exclusive
    country_idx = np.random.randint(low=0, high=len(country_choices), size=size)
    balances = np.random.uniform(low=.00, high=10_000, size=size)

    return pd.DataFrame({
        'gender': [gender_choices[i] for i in gender_idx],
        'age': ages,
        'country': [country_choices[i] for i in country_idx],
        'balance': balances,
    })

In [None]:
# Generate the 1,000-row synthetic sample that the rest of the notebook works on.
sample_df = populate_sample(1_000)
# Show (rows, columns) as a quick sanity check.
sample_df.shape

In [None]:
# Preview the first rows of the generated sample.
sample_df.head()

# Exploratory Data Analysis

In [None]:
def plot_graph(data, title, x_label=None, y_label=None):
    """Render one or more plotly traces in a single figure with the shared notebook styling.

    Args:
        data: A plotly trace or a list of traces, passed straight to `go.Figure`.
        title: Figure title.
        x_label: Optional x-axis title.
        y_label: Optional y-axis title.
    """
    # Both axes share the same grid colour.
    grid_colour = 'rgb(159, 197, 232)'

    figure = go.Figure(
        data=data,
        layout=go.Layout(
            title=title,
            xaxis={'title': x_label, 'gridcolor': grid_colour},
            yaxis={'title': y_label, 'gridcolor': grid_colour},
            showlegend=True,
            legend_orientation='h',
            plot_bgcolor='rgba(0, 0, 0, 0)',
        ),
    )
    iplot(figure)
    
def plot_subplots(data, max_col, subplot_titles=None):
    """Draw each trace in its own subplot, filling the grid row by row.

    The grid has `max_col` columns and as many rows as are needed to hold
    every trace. Trace `i` (0-based) goes to row `i // max_col + 1`,
    column `i % max_col + 1`.

    Args:
        data: Sequence of plotly traces, one per subplot.
        max_col: Number of subplot columns (must be >= 1).
        subplot_titles: Optional list of titles, forwarded to `make_subplots`.
    """
    max_row = int(np.ceil(len(data) / max_col))

    fig = make_subplots(rows=max_row, cols=max_col, subplot_titles=subplot_titles)
    for index, d in enumerate(data):
        # Derive the grid cell directly from the trace index. The previous
        # version advanced `row` only when the wrapped column equalled 1,
        # which never happened for max_col == 1 and so stacked every trace
        # in the first row.
        row_offset, col_offset = divmod(index, max_col)
        fig.add_trace(d, row=row_offset + 1, col=col_offset + 1)

    fig.update_layout(showlegend=False)
    iplot(fig)
    
def exploratory_analysis(df):
    """Plot a histogram of every column of `df` on a two-column subplot grid.

    Args:
        df: DataFrame whose columns are plotted; each subplot is titled
            '<Column> Distribution'.
    """
    histograms = [go.Histogram(x=df[name]) for name in df.columns]
    titles = [f'{name.title()} Distribution' for name in df.columns]
    plot_subplots(histograms, max_col=2, subplot_titles=titles)

In [None]:
# Plot the distribution of every column in the sample.
exploratory_analysis(sample_df)

# One-Hot Encoding

In [None]:
def onehot_encoding(df, fields):
    """Return a copy of `df` with each column in `fields` replaced by indicator columns.

    The indicator columns are named '<field>_<category>' and are appended
    after the remaining original columns, in the order given by `fields`.
    The input frame is not modified.

    Args:
        df: Source DataFrame.
        fields: List of column names to one-hot encode.

    Returns:
        A new DataFrame without the `fields` columns and with their
        indicator columns added.
    """
    indicator_frames = [
        pd.get_dummies(df[name].astype('category'), prefix=name)
        for name in fields
    ]
    remaining = df.drop(columns=fields)
    return pd.concat([remaining, *indicator_frames], axis=1)

In [None]:
# One-hot encode the two categorical columns; `age` and `balance` are kept as they are.
onehot_df = onehot_encoding(sample_df, ['gender', 'country'])
onehot_df.head()

# Feature Selection

### High Correlation Filter

In [None]:
# NOTE: as a rule of thumb, consider dropping one variable of a pair when
# their correlation is greater than roughly 0.5 - 0.6.
# Pairwise correlation between all encoded features.
corr_df = onehot_df.corr()
corr_df

In [None]:
# Render the correlation matrix as a heatmap. A single trace (not a list)
# is handed to plot_graph as `data`.
data = go.Heatmap(
    x = corr_df.columns,
    y = corr_df.index,
    z = corr_df.values,
    colorscale='Reds'
)
plot_graph(data, 'Feature Correlation Matrix')

# Data Normalization

In [None]:
# Rescale every encoded feature with MinMaxScaler and rebuild a DataFrame
# with the original column names.
scaler = MinMaxScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(onehot_df), columns=onehot_df.columns)
# Preview only the first rows instead of dumping the whole frame, matching
# the other preview cells in this notebook.
scaled_df.head()

# Dimensionality Reduction

In [None]:
def reduce_dim(df, n_components, method):
    """Project `df` onto `n_components` dimensions with PCA or t-SNE.

    Both reducers are created with a fixed `random_state` of 0.

    Args:
        df: DataFrame of numeric features; all columns are used.
        n_components: Number of output dimensions, forwarded to the reducer.
        method: Either 'pca' or 'tsne'.

    Returns:
        A `(reduced_df, reducer)` tuple. `reduced_df` holds the projected
        values and keeps the index of `df`. Its columns are named
        'x', 'y' (and 'z') for up to three dimensions, and 'dim_1',
        'dim_2', ... beyond that. `reducer` is the fitted estimator.

    Raises:
        AssertionError: If `method` is not one of the supported names.
    """
    methods = ['pca', 'tsne']
    assert method in methods, f'method is not in valid list: {methods}'

    random_state = 0
    reducer = None

    if method == 'pca':
        reducer = PCA(n_components=n_components, random_state=random_state)

    elif method == 'tsne':
        reducer = TSNE(n_components=n_components, random_state=random_state)

    reduce_values = reducer.fit_transform(df.values)

    # Name the columns after the actual output width instead of assuming
    # two components, which made any n_components != 2 fail at DataFrame
    # construction. Two components still yield ['x', 'y'].
    n_dims = reduce_values.shape[1]
    axis_names = ['x', 'y', 'z']
    if n_dims <= len(axis_names):
        columns = axis_names[:n_dims]
    else:
        columns = [f'dim_{i}' for i in range(1, n_dims + 1)]

    # Keep the input index so that index-based merges with the source rows
    # stay aligned even when `df` does not use a default RangeIndex.
    reduced_df = pd.DataFrame(reduce_values, columns=columns, index=df.index)
    return reduced_df, reducer

def plot_reduced_dim(df, title):
    """Scatter-plot the 2-D projection in `df` with per-point hover details.

    Args:
        df: DataFrame with the coordinate columns `x` and `y` plus the
            `gender`, `age`, `country` and `balance` columns that are shown
            in the hover text.
        title: Figure title.
    """
    hover_labels = [
        f'Index: {row.Index}<br />Gender: {row.gender}<br />Age: {row.age}<br />Country: {row.country}<br />Balance: {row.balance}'
        for row in df.itertuples()
    ]

    points = go.Scattergl(
        x=df['x'],
        y=df['y'],
        mode='markers',
        marker={'opacity': .5},
        hovertext=hover_labels,
        hoverinfo='text',
    )
    plot_graph([points], title)

### PCA (Principal Component Analysis)

In [None]:
# Project the scaled features onto two principal components.
reduced_df, reducer = reduce_dim(scaled_df, n_components=2, method='pca')
reduced_df.head()

In [None]:
# Inverse transform: map the 2-D PCA coordinates back into the scaled feature space.
# NOTE: `reduced_df` and `reducer` are rebound by the t-SNE cell further down,
# so this cell only works while they still hold the PCA results.
pd.DataFrame(reducer.inverse_transform(reduced_df), columns=scaled_df.columns).head()

In [None]:
# Map transformed values to samples: attach the PCA coordinates to the
# original rows by joining on the row index.
pca_map_df = sample_df.merge(reduced_df, how='left', left_index=True, right_index=True)
pca_map_df.head()

In [None]:
# Visualization: scatter plot of the PCA projection with per-sample hover text.
plot_reduced_dim(pca_map_df, 'PCA')

### t-SNE (t-Distributed Stochastic Neighbor Embedding)

In [None]:
# NOTE: t-SNE takes longer to train
# This rebinds `reduced_df` and `reducer`, replacing the PCA results held in those names.
reduced_df, reducer = reduce_dim(scaled_df, n_components=2, method='tsne')
reduced_df.head()

In [None]:
# Inverse transform is not supported by t-SNE

In [None]:
# Map transformed values to samples: attach the t-SNE coordinates to the
# original rows by joining on the row index.
tsne_map_df = sample_df.merge(reduced_df, how='left', left_index=True, right_index=True)
tsne_map_df.head()

In [None]:
# Visualization: scatter plot of the t-SNE projection with per-sample hover text.
plot_reduced_dim(tsne_map_df, 't-SNE')

# Clustering

In [None]:
def plot_cluster(df, cluster_column, title):
    """Scatter-plot the 2-D points in `df`, one trace per cluster label.

    Args:
        df: DataFrame with the coordinate columns `x` and `y`, the
            `gender`, `age`, `country` and `balance` columns used in the
            hover text, and the label column named by `cluster_column`.
        cluster_column: Name of the column holding the cluster labels.
        title: Figure title.
    """
    def _hover_labels(frame):
        # One hover string per row, built from the original sample attributes.
        return [
            f'Index: {row.Index}<br />Gender: {row.gender}<br />Age: {row.age}<br />Country: {row.country}<br />Balance: {row.balance}'
            for row in frame.itertuples()
        ]

    traces = []
    # Labels are visited in order of first appearance in `df`.
    for label in df[cluster_column].unique():
        members = df[df[cluster_column] == label]
        traces.append(go.Scattergl(
            x=members['x'],
            y=members['y'],
            mode='markers',
            name=f'Cluster {label}',
            marker={'opacity': .5},
            hovertext=_hover_labels(members),
            hoverinfo='text',
        ))

    plot_graph(traces, title)
    
def clustering(df, n_clusters, method, random_state=None):
    """Cluster the 2-D points in `df` and return a copy with the labels attached.

    Args:
        df: DataFrame with the coordinate columns `x` and `y`.
        n_clusters: Number of clusters (or mixture components for 'gmm').
            Not used by 'meanshift', which is created without arguments.
        method: One of 'kmeans', 'meanshift', 'hierarchy' or 'gmm'.
        random_state: Optional seed forwarded to KMeans and GaussianMixture
            so their results can be reproduced. The default `None` keeps
            the previous unseeded behaviour.

    Returns:
        A copy of `df` with an extra '<method>_cluster' column holding the
        label assigned to each row.

    Raises:
        AssertionError: If `method` is not one of the supported names.
    """
    methods = ['kmeans', 'meanshift', 'hierarchy', 'gmm']
    assert method in methods, f'method is not in valid list: {methods}'

    # Extract the coordinates once instead of rebuilding the array for
    # every fit/predict call.
    values = df[['x', 'y']].values

    if method == 'kmeans':
        cluster = KMeans(n_clusters=n_clusters, random_state=random_state)
        clusters = cluster.fit(values).predict(values)

    elif method == 'meanshift':
        cluster = MeanShift()
        clusters = cluster.fit(values).predict(values)

    elif method == 'hierarchy':
        # The labels are read from the fitted estimator's `labels_` attribute.
        cluster = AgglomerativeClustering(n_clusters=n_clusters)
        clusters = cluster.fit(values).labels_

    elif method == 'gmm':
        cluster = GaussianMixture(n_components=n_clusters, random_state=random_state)
        clusters = cluster.fit(values).predict(values)

    cluster_df = df.copy()
    cluster_df[f'{method}_cluster'] = clusters

    return cluster_df

### K-Means

In [None]:
# Reference: https://towardsdatascience.com/clustering-metrics-better-than-the-elbow-method-6926e1f723a6
def kmeans_evaluation(df, ranges, random_state=None):
    """Fit K-Means for every k in `ranges` and collect inertia and silhouette score.

    Args:
        df: DataFrame with the coordinate columns `x` and `y`.
        ranges: Iterable of cluster counts to evaluate.
        random_state: Optional seed forwarded to KMeans so the evaluation
            can be reproduced. The default `None` keeps the previous
            unseeded behaviour.

    Returns:
        A `(sum_squared_distances, silhouette_scores)` tuple of lists, each
        with one entry per k, in the order of `ranges`.
    """
    # The coordinates do not depend on k, so extract them once up front.
    values = df[['x', 'y']].values

    sum_squared_distances = []
    silhouette_scores = []

    for k in ranges:
        kmeans = KMeans(n_clusters=k, random_state=random_state).fit(values)
        sum_squared_distances.append(kmeans.inertia_)
        silhouette_scores.append(
            silhouette_score(values, kmeans.predict(values), metric='euclidean')
        )

    return sum_squared_distances, silhouette_scores

In [None]:
# Candidate cluster counts k = 2..10.
ranges = list(range(2, 11))
pca_inertias, pca_scores = kmeans_evaluation(pca_map_df, ranges)
tsne_inertias, tsne_scores = kmeans_evaluation(tsne_map_df, ranges)

# One line chart per metric/projection pair. The order here must match the
# order of `subplot_titles` built below (inertias first, then silhouette scores).
data = []
for eval_result in [pca_inertias, tsne_inertias, pca_scores, tsne_scores]:
    data.append(go.Scatter(
        x = ranges,
        y = eval_result,
        mode = 'lines+markers'
    ))

reduce_type = ['PCA', 't-SNE']
subplot_titles = [f'Elbow Method - {x}' for x in reduce_type]
subplot_titles += [f'Silhouette Scores - {x}' for x in reduce_type]
plot_subplots(data, max_col=2, subplot_titles=subplot_titles)

In [None]:
# K-Means on the PCA projection.
# NOTE(review): n_clusters=6 was presumably read off the elbow/silhouette plots above - confirm.
method = 'kmeans'
pca_cluster_df = clustering(pca_map_df, n_clusters=6, method=method)
# Rows are sorted by label before plotting, so the traces are created in label order.
plot_cluster(pca_cluster_df.sort_values(by=[f'{method}_cluster']), f'{method}_cluster', 'K-Means Cluster - PCA')

In [None]:
# K-Means on the t-SNE projection.
# NOTE(review): n_clusters=10 was presumably read off the elbow/silhouette plots above - confirm.
method = 'kmeans'
tsne_cluster_df = clustering(tsne_map_df, n_clusters=10, method=method)
# Rows are sorted by label before plotting, so the traces are created in label order.
plot_cluster(tsne_cluster_df.sort_values(by=[f'{method}_cluster']), f'{method}_cluster', 'K-Means Cluster - t-SNE')

### Mean-Shift

In [None]:
# Mean-Shift on the PCA projection. n_clusters is None because clustering()
# does not use it for this method.
# The result is added as a new column to the frame that already holds the K-Means labels.
method = 'meanshift'
pca_cluster_df = clustering(pca_cluster_df, n_clusters=None, method=method)
plot_cluster(pca_cluster_df.sort_values(by=[f'{method}_cluster']), f'{method}_cluster', 'Mean-Shift Cluster - PCA')

In [None]:
# Mean-Shift on the t-SNE projection. n_clusters is None because clustering()
# does not use it for this method.
# The result is added as a new column to the frame that already holds the K-Means labels.
method = 'meanshift'
tsne_cluster_df = clustering(tsne_cluster_df, n_clusters=None, method=method)
plot_cluster(tsne_cluster_df.sort_values(by=[f'{method}_cluster']), f'{method}_cluster', 'Mean-Shift Cluster - t-SNE')

### Agglomerative Hierarchical

In [None]:
# Agglomerative clustering on the PCA projection, with the same cluster
# count as the K-Means PCA cell.
method = 'hierarchy'
pca_cluster_df = clustering(pca_cluster_df, n_clusters=6, method=method)
plot_cluster(pca_cluster_df.sort_values(by=[f'{method}_cluster']), f'{method}_cluster', 'Hierarchical Cluster - PCA')

In [None]:
# Agglomerative clustering on the t-SNE projection, with the same cluster
# count as the K-Means t-SNE cell.
method = 'hierarchy'
tsne_cluster_df = clustering(tsne_cluster_df, n_clusters=10, method=method)
plot_cluster(tsne_cluster_df.sort_values(by=[f'{method}_cluster']), f'{method}_cluster', 'Hierarchical Cluster - t-SNE')

### Gaussian Mixture

In [None]:
# Gaussian mixture on the PCA projection; n_clusters is used as the number
# of mixture components.
method = 'gmm'
pca_cluster_df = clustering(pca_cluster_df, n_clusters=6, method=method)
plot_cluster(pca_cluster_df.sort_values(by=[f'{method}_cluster']), f'{method}_cluster', 'GMM Cluster - PCA')

In [None]:
# Gaussian mixture on the t-SNE projection; n_clusters is used as the number
# of mixture components.
method = 'gmm'
tsne_cluster_df = clustering(tsne_cluster_df, n_clusters=10, method=method)
plot_cluster(tsne_cluster_df.sort_values(by=[f'{method}_cluster']), f'{method}_cluster', 'GMM Cluster - t-SNE')