# Customer Clustering Visualization
 - 3D plots by Plotly
 - 2D plots of each attribute, per cluster
 - 2D attribute comparisons between clusters

In [1]:
import pandas as pd
pd.set_option('max_columns', 60)

import numpy as np
import scipy as sp

import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.offline import iplot
import plotly.graph_objs as go
from mpl_toolkits.mplot3d import Axes3D

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import os
import seaborn as sns; sns.set(style='whitegrid')
from kmodes import kprototypes


axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.



** init **

In [2]:
# Run configuration.
# NOTE(review): neither PROD nor N_clusters is referenced in the visible
# cells -- confirm whether they are used elsewhere before relying on them.
PROD = 'mjk' # 'all'
N_clusters = 7

In [3]:
# Location of the clustered data set; it is read below with '|' as separator.
path = './output/cluster_data_w_label.csv'

** read the data with cluster labels **

In [4]:
# Load the pipe-delimited file; it already carries a `cluster_label` column.
data = pd.read_csv(path, sep='|')
# Show (rows, columns) as a quick sanity check.
data.shape

(12123, 11)

In [5]:
# Preview three random rows of the loaded data.
data.sample(3)

Unnamed: 0,max_bal_3mth,cusedu_id,gender_id,marital_id,age_tier,intuser_fg,cc_type_cnt_tableflag,TOT_SAVING_bank_ph,all_acct_vint_l_bank_ph,EAST,cluster_label
339,1215097.0,C,F,A,5.0,N,0,1.0,5.0,0.0,6
11831,40065500.0,D,F,A,4.0,Y,0,2.0,3.0,0.0,1
3348,184006300.0,D,F,A,5.0,N,0,1.0,5.0,0.0,6


 - init the feature categories

In [6]:
# Columns handled as numeric measurements.
NUMERICAL_FEATURES = [
    'age_tier',
    'max_bal_3mth',
    'all_acct_vint_l_bank_ph',
    'TOT_SAVING_bank_ph',
]

# Columns handled as categories / flags.
CATEGORICAL_FEATURES = [
    'gender_id',
    'cusedu_id',
    'marital_id',
    'intuser_fg',
    'cc_type_cnt_tableflag',
    'EAST',
]

 - add a constant `cnt` column (value 1) used for counting rows

In [7]:
# Constant helper column: the plotting helpers below aggregate `cnt` per group
# to obtain row counts.
data['cnt'] = 1

## 3D Graph by Plotly

** a naive approach that simply plots the points **
 - points overlap along the categorical and discrete attribute dimensions

In [8]:
def plotly_3D(_df, xlab, ylab, zlab, cluster_init='cluster_label'):
    """Show an interactive 3D scatter plot with one trace per cluster.

    Parameters
    ----------
    _df : pandas.DataFrame
        Data to plot; must contain the ``xlab``, ``ylab``, ``zlab`` and
        ``cluster_init`` columns.
    xlab, ylab, zlab : str
        Names of the columns mapped to the x, y and z axes.
    cluster_init : str
        Name of the column holding each row's cluster id.

    Only cluster ids 0-6 have a colour assigned; rows carrying any other id
    are not drawn. The figure is displayed with ``iplot`` and nothing is
    returned.
    """
    cluster_color_map = {0: 'rgb(127,201,127)', 1: 'rgb(190,174,212)', 2: 'rgb(253,192,134)',
                         3: 'rgb(255,255,153)', 4: 'rgb(56,108,176)', 5: 'rgb(240,2,127)',
                         6: 'rgb(191,91,23)'}

    trace = []
    # `items()` exists on both Python 2 and 3 (`iteritems()` is Python 2 only);
    # sorting keeps the trace/legend order stable across interpreters.
    for clus, color in sorted(cluster_color_map.items()):
        # Select the cluster's rows once instead of once per axis.
        members = _df[_df[cluster_init] == clus]
        trace.append(go.Scatter3d(
            x=members[xlab],
            y=members[ylab],
            z=members[zlab],
            mode='markers',
            name='cluster_%s' % clus,
            marker=dict(
                color=color,
                size=5,
                symbol='circle',
                opacity=.7
            )
        ))

    layout = go.Layout(
        margin={'l': 0, 'r': 0, 'b': 0, 't': 0}
    )

    fig = go.Figure(data=trace, layout=layout)
    iplot(fig)

In [9]:
# Plot the raw (un-jittered) data on three numeric columns, coloured by cluster.
plotly_3D(data, 'max_bal_3mth', 'TOT_SAVING_bank_ph', 'all_acct_vint_l_bank_ph', 'cluster_label')

** introduce a bit of randomness (jitter) to separate the overlapping discrete points **

In [10]:
def encode_discrete(_df, _categorical_columns, _discrete_columns, jitter=0.07):
    """Integer-encode categorical columns and add Gaussian jitter to discrete ones.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame to transform. It is modified IN PLACE (and also returned), so
        pass a copy if the original must be preserved.
    _categorical_columns : iterable of str
        Columns whose distinct values are replaced by their rank in sorted
        order (0, 1, 2, ...).
    _discrete_columns : iterable of str
        Columns that receive additive noise ``N(0, 1) * jitter``. They must be
        numeric at that point; categorical columns are encoded first, so a
        column may appear in both lists.
    jitter : float, optional
        Scale of the added noise (default 0.07).

    Returns
    -------
    (pandas.DataFrame, dict)
        The transformed frame and ``{column: {original value: code}}``.
    """
    encode_map = {}
    # step 1: encode categorical columns
    for col in _categorical_columns:
        # enumerate() assigns each value its sorted position directly,
        # avoiding a list.index() scan per value.
        val_map = {val: code
                   for code, val in enumerate(sorted(_df[col].unique().tolist()))}
        _df[col] = _df[col].map(val_map)
        encode_map[col] = val_map

    # step 2: jitter the discrete columns
    for col in _discrete_columns:
        # One vectorised draw per column instead of one draw per row.
        _df[col] = _df[col] + np.random.randn(len(_df)) * jitter

    return _df, encode_map

In [11]:
# Encode and jitter a copy so that `data` itself keeps its original values.
data_encoded, encode_map = encode_discrete(
    data.copy(),
    # columns to integer-encode
    ['cusedu_id', 'gender_id', 'marital_id', 'intuser_fg'],
    # columns to jitter (the encoded ones plus the already-numeric discrete ones)
    ['cusedu_id', 'gender_id', 'marital_id', 'intuser_fg',
     'age_tier', 'cc_type_cnt_tableflag', 'TOT_SAVING_bank_ph', 'EAST'],
)

In [12]:
# Plot a random 3000-row sample of the jittered data; the cluster column
# falls back to the function default ('cluster_label').
plotly_3D(data_encoded.sample(3000), 'age_tier', 'gender_id', 'max_bal_3mth')

 - the data behind 

In [13]:
# Display the value -> integer code mapping returned by encode_discrete.
encode_map

{'cusedu_id': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'Z': 6},
 'gender_id': {'F': 0, 'M': 1},
 'intuser_fg': {'N': 0, 'Y': 1},
 'marital_id': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'Z': 4}}

In [14]:
# Preview three random rows of the encoded, jittered frame.
data_encoded.sample(3)

Unnamed: 0,max_bal_3mth,cusedu_id,gender_id,marital_id,age_tier,intuser_fg,cc_type_cnt_tableflag,TOT_SAVING_bank_ph,all_acct_vint_l_bank_ph,EAST,cluster_label,cnt
8644,2859556.41,2.934336,1.01057,-0.051977,3.997129,0.058433,-0.071991,0.924728,11.0,-0.019386,6,1
1342,2116175.79,4.929707,1.012694,-0.024066,8.98862,0.083022,-0.009742,0.882521,6.0,-0.020834,2,1
10437,2695455.37,2.026678,0.954482,0.04617,5.026186,-0.088643,0.052281,1.017891,3.0,-0.089266,2,1


## 2D Plots for Each Attribute by Cluster

** Histogram with binning: **
 - max_bal_3mth
 - all_acct_vint_l_bank_ph
 - TOT_SAVING_bank_ph

** bar chart: **
 - cusedu_id
 - gender_id
 - marital_id
 - intuser_fg
 - cc_type_cnt_tableflag
 - EAST

** pie chart: **
 - age_tier

 - the name map for cluster names

In [15]:
# Define cluster names accordingly
name_map = {
    0: 'A',
    1: 'B',
    2: 'C',
    3: 'D',
    4: 'E',
    5: 'F',
    6: 'G'}

data['cluster_name'] = data['cluster_label'].map(name_map)

 - the name map for columns 

In [16]:
# Human-readable titles for the data columns, used as default chart titles.
cols_name = {
    'max_bal_3mth': 'MAX balance in the last 3 month',
    'age_tier': 'Age Tier',
    'all_acct_vint_l_bank_ph': 'Vintage of the latest bank account',
    'gender_id': 'Gender',
    'cusedu_id': 'Education',
    'marital_id': 'Marital Status',
    'EAST': 'Location',
    'intuser_fg': 'Internet Banking User',
    'cc_type_cnt_tableflag': 'Credit Card',
    'TOT_SAVING_bank_ph': 'Number of Saving Products',
}

** Histogram function **

In [17]:
def histogram(_df, cluster_id, col, color='rgb(102,194,165)', title=''):
    """Show a probability-normalised histogram of ``col`` for a single cluster.

    Values above the cluster's own 95th percentile are replaced by NaN before
    plotting. ``cluster_id`` is translated through the module-level
    ``name_map`` and matched against ``_df.cluster_name``. When ``title`` is
    empty, ``cols_name[col]`` is used. The figure is displayed with ``iplot``;
    nothing is returned.
    """
    cluster = name_map[cluster_id]
    values = _df[_df.cluster_name == cluster][col]
    cutoff = np.percentile(values, 95)

    def blank_above_cutoff(value):
        # Anything that is not <= cutoff is turned into NaN.
        return value if value <= cutoff else np.nan

    bars = go.Histogram(
        x=values.apply(blank_above_cutoff),
        histnorm='probability',
        name=col,
        marker=dict(color=color),
        opacity=0.75,
    )

    heading = cols_name[col] if title == '' else title

    figure = go.Figure(
        data=[bars],
        layout=go.Layout(
            title='<b> Histogram of %s </b> <br> cluster: %s </br>' % (heading, cluster),
            xaxis=dict(title=col),
            yaxis=dict(title='probability'),
        ),
    )
    iplot(figure)

** pie chart function **

In [18]:
# Age-range label -> tier code; tiers are numbered from 1 in ascending age order.
age_map = {
    label: tier
    for tier, label in enumerate(
        ['< 20', '20 - 25', '26 - 30', '31 - 35', '36 - 40',
         '41 - 45', '46 - 50', '51 - 55', '56 - 60', '> 60'],
        1,
    )
}

# Reverse lookup: tier code -> age-range label.
map_age = {tier: label for label, tier in age_map.items()}

In [19]:
def pie(_df, cluster_id, col='age_tier', title = ''):
    """Show a donut chart of how ``col`` is distributed within one cluster.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain the ``cluster_name``, ``cnt`` and ``col`` columns.
    cluster_id : int
        Key into the module-level ``name_map``; the mapped name is matched
        against ``_df.cluster_name`` and printed in the centre of the donut.
    col : str
        Column to summarise. Slice colours are looked up as
        ``colors[value - 1]`` and slice texts through ``map_age``, so the
        values are expected to be tier codes 1-10.
    title : str
        Chart title; when empty, ``cols_name[col]`` is used.

    The figure is displayed with ``iplot``; nothing is returned.
    """
    colors = ['rgb(158,1,66)','rgb(213,62,79)','rgb(244,109,67)','rgb(253,174,97)','rgb(254,224,139)',
              'rgb(230,245,152)','rgb(171,221,164)','rgb(102,194,165)','rgb(50,136,189)','rgb(94,79,162)']

    cluster_name = name_map[cluster_id]
    group = _df[_df.cluster_name==cluster_name].groupby(col, as_index=False).agg({'cnt':'count'})

    # One colour per slice, chosen by tier code so a tier keeps the same
    # colour in every cluster's chart.
    COLORS = [colors[int(i)] for i in (group[col].values-1).tolist()]

    trace = go.Pie(
        labels = group[col],
        values = group.cnt,
        marker = {'colors': COLORS},
        text = group[col].map(map_age),
        # 'top' is not an accepted pie textposition (inside/outside/auto/none);
        # 'auto' is the documented default, stated here explicitly.
        textposition = 'auto',
        hoverinfo="label+percent+text",
        hole = .3,
        sort = False,
        textinfo = 'text+percent',
        opacity = .8
        )

    # A plain dict is accepted for layout annotations and avoids
    # `go.Annotation`, which newer plotly releases no longer provide.
    annotation = dict(
        font={'size': 18},
        showarrow=False,
        text='<b> Cluster <br> %s </b>'%cluster_name,
        x=.5,
        y=.5
        )

    if title == '':
        title = cols_name[col]

    layout = go.Layout(
        title = '<b> Proportion of %s </b>'%(title),
        annotations = [annotation],
        showlegend= False
        )

    fig = go.Figure(
        data = [trace],
        layout = layout
                   )

    iplot(fig)

** horizontal stacked bar chart **

In [20]:
def bar(_df, cluster_id, colors = ['rgb(141,211,199)', 'rgb(190,186,218)', 'rgb(251,128,114)', 'rgb(128,177,211)',
                              'rgb(253,180,98)', 'rgb(179,222,105)', 'rgb(252,205,229)', 'rgb(248, 248, 249)']):
    """Show one horizontal stacked bar per categorical attribute for a cluster.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain ``cluster_name``, ``cnt`` and every column listed in the
        local ``col_map``.
    cluster_id : int
        Key into the module-level ``name_map``; the mapped name is matched
        against ``_df.cluster_name``.
    colors : sequence of str
        Segment colours, assigned by the position of each value in sorted
        order. The sequence is cycled when a column has more distinct values
        than there are colours. It is only read, never modified.

    Each segment's width is the number of rows with that value. Segments
    covering more than 5% of the bar are labelled ``"<value>: <pct>%"``.
    ``EAST`` is shown as ``'E'`` when equal to 1.0 and ``'W'`` otherwise;
    ``cc_type_cnt_tableflag`` as ``'Y'`` when equal to 1.0 and ``'N'``
    otherwise. The figure is displayed with ``iplot``; nothing is returned.
    """
    cluster_data = _df[_df.cluster_name==name_map[cluster_id]].copy()
    cluster_data['EAST'] = cluster_data['EAST'].apply(lambda x: 'E' if x == 1.0 else 'W')
    cluster_data['cc_type_cnt_tableflag'] = cluster_data['cc_type_cnt_tableflag'].apply(lambda x: 'Y' if x == 1.0 else 'N')

    col_map = {'gender_id': 'Gender', 'cusedu_id': 'Education', 'marital_id': 'Marital Status',
               'EAST': 'Location', 'intuser_fg': 'Internet Banking User', 'cc_type_cnt_tableflag': 'Credit Card',
               'TOT_SAVING_bank_ph': 'Number of Saving Products'}

    def value_counts(col):
        # Distinct values of `col` (sorted by groupby) and their row counts.
        counts = cluster_data.groupby(col)['cnt'].count()
        return counts.index.tolist(), counts.tolist()

    def make_trace(col, values, counts):
        # One bar segment per distinct value, stacked on the column's row.
        return [go.Bar(
            y=[col_map[col]+'  '],
            x=[n],
            name=c,
            orientation = 'h',
            marker={'color': colors[i % len(colors)],
                    'line': {'color': 'rgb(248, 248, 249)',
                             'width': 1}}
            ) for i, (c, n) in enumerate(zip(values, counts))]

    def make_annotation(col, values, counts):
        # Percentage labels centred on each segment; plain dicts replace the
        # removed `go.Annotation`, and running sums replace `get_value`.
        total = sum(counts)
        made = []
        right_edge = 0
        for c, n in zip(values, counts):
            right_edge += n
            share = n * 1.0 / total
            made.append(dict(
                xref='x', yref='y',
                x=right_edge - n * 1.0 / 2,
                y=col_map[col]+'  ',
                xanchor='center',
                # Segments of 5% or less are too narrow to hold a label.
                text='%s: %.0f%%' % (c, share * 100.0) if share > 0.05 else '',
                font=dict(family='Arial', size=16,
                          color='rgb(255, 255, 255)'),
                showarrow=False))
        return made

    trace = []
    annotations = []
    for col in col_map:
        # Count once per column and reuse for both traces and labels.
        values, counts = value_counts(col)
        trace = trace + make_trace(col, values, counts)
        annotations = annotations + make_annotation(col, values, counts)

    layout = go.Layout(
        title = '<b> Bar plots for Categorical attributes </b> <br> Cluster %s </br>'%(name_map[cluster_id]),
        xaxis=dict(
            showgrid=False,
            showline=False,
            showticklabels=False,
            zeroline=False,
            domain=[0.15, 1]
        ),
        yaxis=dict(
            showgrid=False,
            showline=False,
            zeroline=False,
        ),
        barmode='stack',
        margin=dict(
            l=100,
            r=10,
            t=140,
            b=80
        ),
        showlegend=False,
        annotations=annotations
    )

    fig = go.Figure(
        data = trace,
        layout = layout)

    iplot(fig)

** plot for clusters **

In [21]:
# Render the single-cluster charts for the selected cluster ids
# (only id 3 here, which name_map labels 'D').
for clus in [3]:
    pie(data, clus, 'age_tier')
    bar(data, clus)
    histogram(data, clus, 'max_bal_3mth')
    # The fourth positional argument overrides the default bar colour.
    histogram(data, clus, 'all_acct_vint_l_bank_ph', 'rgb(253,174,97)')

## 2D Attribute Comparisons Between Clusters

** histogram (for numerical attributes) **

In [22]:
def histogram_compare(_df, cluster_ids, col, colors=['rgb(158,1,66)','rgb(230,245,152)','rgb(253,174,97)','rgb(102,194,165)',
 'rgb(171,221,164)','rgb(213,62,79)','rgb(244,109,67)','rgb(94,79,162)','rgb(50,136,189)','rgb(254,224,139)'], title = ''):
    """Show grouped, probability-normalised histograms of ``col`` for several clusters.

    The 95th percentile of ``col`` over all selected clusters is used as a
    cap: every value that is not ``<=`` the cap is replaced by the cap before
    plotting. Each cluster is coloured with ``colors[cluster_id]``. When
    ``title`` is empty, ``cols_name[col]`` is used. The figure is displayed
    with ``iplot``; nothing is returned.
    """
    selected_names = [name_map[ix] for ix in cluster_ids]
    pooled = _df[_df.cluster_name.isin(selected_names)][col]
    cap = np.percentile(pooled, 95)

    def clamp(value):
        return value if value <= cap else cap

    def cluster_trace(cluster_id):
        label = name_map[cluster_id]
        return go.Histogram(
            x=_df[_df.cluster_name == label][col].apply(clamp),
            histnorm='probability',
            name='Cluster %s' % (label),
            marker=dict(color=colors[cluster_id]),
            nbinsx=30,
            opacity=0.75,
        )

    traces = [cluster_trace(cluster_id) for cluster_id in cluster_ids]

    heading = cols_name[col] if title == '' else title

    figure = go.Figure(
        data=traces,
        layout=go.Layout(
            title=' Comparison of <b> %s ' % (heading),
            xaxis=dict(title=col),
            yaxis=dict(title='probability'),
            legend={'x': 0.88, 'y': 1},
            barmode='group',
        ),
    )
    iplot(figure)

In [23]:
# Compare cluster ids 0-5; range(6) leaves out id 6 (cluster 'G' in name_map).
histogram_compare(data, range(6), 'max_bal_3mth')

** box plot (for numerical attributes) **

In [24]:
def box_compare(_df, cluster_ids, col, colors=['rgb(141,211,199)', 'rgb(190,186,218)', 'rgb(251,128,114)', 'rgb(128,177,211)',
                              'rgb(253,180,98)', 'rgb(179,222,105)', 'rgb(252,205,229)', 'rgb(248, 248, 249)'], 
                cluster_label = 'cluster_label', title = ''):
    """Show side-by-side box plots of ``col`` for the given clusters on a log y-axis.

    Rows are selected by comparing ``_df[cluster_label]`` with each id in
    ``cluster_ids``; each box is named through ``name_map`` and coloured with
    ``colors[id]``. Only outlier points are drawn individually. When ``title``
    is empty, ``cols_name[col]`` is used. The figure is displayed with
    ``iplot``; nothing is returned.
    """
    boxes = []
    for clus in cluster_ids:
        in_cluster = _df[cluster_label] == clus
        boxes.append(go.Box(
            y=_df[in_cluster][col],
            name='Cluster %s' % (name_map[clus]),
            marker=dict(color=colors[clus]),
            boxpoints='outliers',
        ))

    heading = cols_name[col] if title == '' else title

    y_axis = dict(title=col, type='log')
    figure = go.Figure(
        data=boxes,
        layout=go.Layout(title=' Comparison of <b> %s ' % (heading), yaxis=y_axis),
    )

    iplot(figure)
    

In [25]:
# Compare cluster ids 0-5; range(6) leaves out id 6 (cluster 'G' in name_map).
box_compare(data, range(6), 'max_bal_3mth')

** bar chart (for categorical attributes) **

In [26]:
# Education code -> display label.
map_edu = dict(
    A='Master / Doctoral',
    B='Bachelor',
    C='Diploma',
    D='High School',
    E='Junior High School',
    F='Elementary',
    Z='Others',
)

In [27]:
# Marital-status code -> display label (shown in chart legends and used as
# pivot column names by bar_compare).
map_marital = {
    'A': 'Married',
    'B': 'Single',
    'C': 'Widower',
    'D': 'widow',
    'Z': 'Unknown'  # was misspelled 'Unknowd'
}

In [28]:
def bar_compare(_df, cluster_ids, col, cluster_label = 'cluster_label', colors = ['rgb(102,194,165)','rgb(252,141,98)','rgb(141,160,203)','rgb(231,138,195)',
                 'rgb(166,216,84)','rgb(255,217,47)','rgb(229,196,148)','rgb(179,179,179)',
                 'rgb(228,26,28)','rgb(55,126,184)','rgb(77,175,74)','rgb(152,78,163)',
                 'rgb(255,127,0)','rgb(255,255,51)','rgb(166,86,40)','rgb(247,129,191)'], title=''):
    """Show, per cluster, the proportion of each value of a categorical column.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain ``cluster_label``, ``col`` and ``cnt`` columns.
    cluster_ids : iterable of int
        Cluster ids to include; each must be a key of ``name_map``.
    col : str
        Column to compare. ``EAST``, ``cc_type_cnt_tableflag``, ``age_tier``,
        ``marital_id`` and ``cusedu_id`` are recoded to display labels first;
        any other column is used as is.
    cluster_label : str
        Name of the column holding the cluster id.
    colors : sequence of str
        One colour per distinct value of ``col``; cycled if there are more
        values than colours. Only read, never modified.
    title : str
        Chart title; when empty, ``cols_name[col]`` is used.

    Returns
    -------
    pandas.DataFrame
        Pivot indexed by cluster id with one summed-``cnt`` column per value
        of ``col`` plus a ``'<value>_pro'`` column holding that value's share
        of the cluster. Combinations absent from the data are NaN.

    The figure is displayed with ``iplot``.
    """
    # Only the requested column needs recoding: nothing else feeds the
    # aggregation, and frames lacking the other columns then work too.
    recoders = {
        'EAST': lambda s: s.apply(lambda x: 'EAST' if x == 1.0 else 'WEST'),
        'cc_type_cnt_tableflag': lambda s: s.apply(lambda x: 'Y' if x == 1.0 else 'N'),
        'age_tier': lambda s: s.map(map_age),
        'marital_id': lambda s: s.map(map_marital),
        'cusedu_id': lambda s: s.map(map_edu),
    }

    cluster_data = _df[_df[cluster_label].isin(cluster_ids)].copy()
    if col in recoders:
        cluster_data[col] = recoders[col](cluster_data[col])

    group = cluster_data.groupby([cluster_label, col], as_index=False).agg({'cnt': 'sum'})
    pivot = pd.pivot_table(group, values='cnt', columns=col, index=cluster_label)

    vals = pivot.columns.values.tolist()
    # Row totals are taken before the share columns are added.
    pivot['sum'] = pivot.sum(axis=1)
    for v in vals:
        pivot['%s_pro'%v] = pivot[v] / pivot['sum']
    pivot.drop('sum', axis=1, inplace=True)

    trace = [go.Bar(
        x=['cluster %s'%name_map[x] for x in pivot.index.values],
        y=pivot['%s_pro'%v],
        name=v,
        # Cycle through the palette instead of failing on long value lists.
        marker=({'color': colors[i % len(colors)]}),
        text=pivot['%s_pro'%v].apply(lambda x: str(round(x*100, 2))+'%'),
        legendgroup = v,
        showlegend = True,
        opacity = .8,
        hoverinfo='name+text'
        ) for i, v in enumerate(vals)]

    if  title == '':
        title = cols_name[col]

    layout = go.Layout(
        title = 'Comparison of <b> %s </b>'%title,
        xaxis={'categoryorder':'category ascending'},
        yaxis=dict(
            title='probability'
        ),
        bargroupgap = .1)

    fig = go.Figure(
        data = trace,
        layout = layout)

    iplot(fig)

    return pivot

In [29]:
# Keep the pivot returned by each comparison, keyed by column name.
compare_df = {}
# range(6) selects cluster ids 0-5 only.
compare_df['marital_id'] = bar_compare(data, range(6), 'marital_id')