In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import ipywidgets as widgets
from ipywidgets.widgets import interact, interact_manual
import re
import datetime as dt

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.decomposition import PCA


ModuleNotFoundError: No module named 'ipywidgets'

# Data

In [None]:
# Load the customer table (semicolon-delimited CSV) and preview the first rows.
df_customer = pd.read_csv('twm_customer.csv', sep=';')
df_customer.head(5)

In [None]:
# Sanity check: dimensions, null count per column, and number of distinct customers.
print(df_customer.shape)
print(df_customer.isnull().sum())
print('# of unique customer IDs: ', df_customer['cust_id'].nunique())

In [None]:
# Load the accounts table (semicolon-delimited CSV) and preview the first rows.
df_accounts = pd.read_csv('twm_accounts.csv', sep=';')
df_accounts.head(3)

In [None]:
# Sanity check: dimensions, null count per column, and distinct customer / account counts.
print(df_accounts.shape)
print(df_accounts.isnull().sum())
print('# of unique customer IDs: ', df_accounts['cust_id'].nunique())
print('# of unique ACCOUNT numbers: ', df_accounts['acct_nbr'].nunique())
# Customers can have multiple accounts, so the two counts above are not expected to match.

In [None]:
# Load the checking-account table (semicolon-delimited CSV) and preview the first rows.
df_checking_acct = pd.read_csv('twm_checking_acct.csv', sep=';')
df_checking_acct.head(3)

In [None]:
# Sanity check: dimensions, null count per column, and number of distinct customers.
print(df_checking_acct.shape)
print(df_checking_acct.isnull().sum())
print('# of unique customer IDs: ', df_checking_acct['cust_id'].nunique())

In [None]:
# Load the credit-account table (semicolon-delimited CSV) and preview the first rows.
df_credit_acct = pd.read_csv('twm_credit_acct.csv', sep=';')
df_credit_acct.head(3)

In [None]:
# Sanity check: dimensions, null count per column, and number of distinct customers.
print(df_credit_acct.shape)
print(df_credit_acct.isnull().sum())
print('# of unique customer IDs: ', df_credit_acct['cust_id'].nunique())

In [None]:
# Load the savings-account table (semicolon-delimited CSV) and preview the first rows.
df_savings_acct = pd.read_csv('twm_savings_acct.csv', sep=';')
df_savings_acct.head(3)

In [None]:
# Sanity check: dimensions, null count per column, and number of distinct customers.
print(df_savings_acct.shape)
print(df_savings_acct.isnull().sum())
print('# of unique customer IDs: ', df_savings_acct['cust_id'].nunique())

In [None]:
# Load the combined transactions table (semicolon-delimited CSV) and preview the first rows.
df_transactions = pd.read_csv('twm_transactions.csv', sep=';')
df_transactions.head(3)

In [None]:
# Sanity check: dimensions, null count per column, and number of distinct account numbers
# (this table is inspected by acct_nbr rather than cust_id).
print(df_transactions.shape)
print(df_transactions.isnull().sum())
print('# of ACCOUNT numbers: ', df_transactions['acct_nbr'].nunique())

In [None]:
# Load the checking-transaction table (semicolon-delimited CSV) and preview the first rows.
df_checking_tran = pd.read_csv('twm_checking_tran.csv', sep=';')
df_checking_tran.head(3)

In [None]:
# Sanity check: dimensions, null count per column, and number of distinct customers.
print(df_checking_tran.shape)
print(df_checking_tran.isnull().sum())
print('# of unique customer IDs: ', df_checking_tran['cust_id'].nunique())

In [None]:
# Load the credit-transaction table (semicolon-delimited CSV) and preview the first rows.
df_credit_tran = pd.read_csv('twm_credit_tran.csv', sep=';')
df_credit_tran.head(3)

In [None]:
# Sanity check: dimensions, null count per column, and number of distinct customers.
print(df_credit_tran.shape)
print(df_credit_tran.isnull().sum())
print('# of unique customer IDs: ', df_credit_tran['cust_id'].nunique())

In [None]:
# Load the savings-transaction table (semicolon-delimited CSV) and preview the first rows.
df_savings_tran = pd.read_csv('twm_savings_tran.csv', sep=';')
df_savings_tran.head(3)

In [None]:
# Sanity check: dimensions, null count per column, and number of distinct customers.
print(df_savings_tran.shape)
print(df_savings_tran.isnull().sum())
print('# of unique customer IDs: ', df_savings_tran['cust_id'].nunique())

# Analysis

Customer segmentation based on demographics:

In [None]:
# Work on a copy so the raw customer table stays untouched.
df_customer2 = df_customer.copy()
# Disabled: manual numeric encoding of gender (note the raw value is space-padded, 'M ').
#df_customer2['gender_num'] = df_customer2.apply(lambda row: 0 if row['gender']=='M ' else 1, axis=1)
df_customer2.head(3)

In [None]:
def plot_distortion(dataframe_name, max_clusters = 10):
    """Draw an elbow plot of KMeans inertia against the number of clusters.

    Every column of ``dataframe_name`` is cast to float and min-max scaled.
    KMeans (k-means++ init, 10 restarts, fixed seed 0) is then fitted for each
    k from 1 to ``max_clusters`` inclusive, and the inertia of each fit is
    plotted against k.

    Parameters
    ----------
    dataframe_name : pandas.DataFrame
        Feature table; every column must be castable to float.
    max_clusters : int, default 10
        Largest number of clusters to try.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    features = MinMaxScaler().fit_transform(dataframe_name.astype(float))
    cluster_counts = range(1, max_clusters + 1)

    # One fitted model per candidate k; only its inertia is kept.
    inertias = [
        KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=0)
        .fit(features)
        .inertia_
        for k in cluster_counts
    ]

    plt.plot(cluster_counts, inertias, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.title('Distortion Plot')
    plt.show()

In [None]:
def plot_clusters(dataframe_name, cols, num_clusters, plt_cluster_centers = False):
    """Scatter-plot KMeans clusters computed on standardized columns.

    Parameters
    ----------
    dataframe_name : pandas.DataFrame
        Source table.
    cols : list of int
        Positional indices of the columns to cluster on (expected: two). The
        first is drawn on the x axis and the second on the y axis.
    num_clusters : int
        Number of KMeans clusters (10 restarts, fixed seed 0).
    plt_cluster_centers : bool, default False
        When true, also mark each cluster's centre, computed as the mean of its
        member points in standardized space.

    Returns
    -------
    None
        Draws on the current matplotlib axes; does not call ``plt.show()``.
    """
    chosen = dataframe_name.iloc[:, cols].astype(float)
    points = StandardScaler().fit_transform(chosen)

    x_name = dataframe_name.columns[cols[0]]
    y_name = dataframe_name.columns[cols[1]]

    labels = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit_predict(points)

    centre_xs, centre_ys = [], []
    for label in set(labels):
        members = labels == label
        xs, ys = points[members, 0], points[members, 1]
        centre_xs.append(np.mean(xs))
        centre_ys.append(np.mean(ys))
        plt.scatter(xs, ys, s=50, marker='s', label = f'cluster {label}')

    if plt_cluster_centers:
        plt.scatter(centre_xs, centre_ys, marker = '*', c = 'red', s = 250, label = 'centroids')

    plt.legend()
    plt.grid()
    plt.title(f'KMeans Clustering of {x_name} and {y_name}')

## Customer segmentation based on their banking behavior:

In [None]:
# Join customers to their accounts on cust_id (pd.merge defaults to an inner join) and keep the
# demographic columns plus account number, type, end date and ending balance.
df_cust_accounts = pd.merge(df_customer,df_accounts,on='cust_id')[['cust_id','income','years_with_bank','age','gender','nbr_children','marital_status',
                                                                   'acct_nbr','acct_type','acct_end_date',
                                                                   'ending_balance']]
# Distinct-value count per column.
df_cust_accounts.nunique()

In [None]:
# acct_type codes: CC = credit, CK = checking, SV = savings.
# The raw codes are space-padded (e.g. 'CC  '), so compare on the stripped value instead of
# hard-coding the padding width in every comparison.
acct_type_clean = df_cust_accounts['acct_type'].str.strip()

# Printed explicitly: a bare expression that is not the cell's last line is never displayed.
print(acct_type_clean.value_counts())

# One balance column per account type: the row's ending balance where the account is of that
# type, NaN otherwise.
for balance_col, type_code in [('credit_balance', 'CC'),
                               ('checking_balance', 'CK'),
                               ('savings_balance', 'SV')]:
    df_cust_accounts[balance_col] = df_cust_accounts['ending_balance'].where(acct_type_clean == type_code)

df_cust_accounts.head(3)

In [None]:
# Collapse the account-level rows to one row per customer: drop the account-only columns, then
# sum the per-type balance columns within each customer/demographics group.
df_cust_info = (
    df_cust_accounts
    .drop(columns=['acct_nbr', 'ending_balance', 'acct_end_date', 'acct_type'])
    .groupby(['cust_id', 'income', 'years_with_bank', 'age', 'gender', 'nbr_children', 'marital_status'])
    .sum()
    .reset_index()
)
# Net position: assets (checking + savings) minus the credit balance.
df_cust_info['total_balance'] = (
    df_cust_info['checking_balance'] + df_cust_info['savings_balance'] - df_cust_info['credit_balance']
)
df_cust_info.head(3)

In [None]:
# Count accounts per (customer, account type), then list any customer holding more than one
# account of the same type.
df_n = df_cust_accounts[['cust_id','acct_type','acct_nbr']].groupby(['cust_id','acct_type']).count()
df_n[df_n['acct_nbr']>1]

### Transactions analysis

In [None]:
df_transactions2 = df_transactions.copy()

In [None]:
df_transactions2.head(3)

In [None]:
# Number of (non-null) transaction amounts recorded for each account.
df_count_transacations = (
    df_transactions2
    .groupby('acct_nbr')['tran_amt']
    .count()
    .reset_index(name='count_tran_amt')
)
df_count_transacations.head(3)

In [None]:
# Average transaction amount per account.
# Select 'tran_amt' BEFORE aggregating: calling .mean() on the whole grouped frame tries to
# average every column and raises TypeError on pandas >= 2.0 when non-numeric columns exist.
# The counts are joined on acct_nbr instead of being assigned by row position, so the result
# does not depend on both frames happening to share the same row order.
df_transacation_stats = (
    df_transactions2
    .groupby('acct_nbr')['tran_amt']
    .mean()
    .reset_index(name='avg_tran_amt')
    .merge(df_count_transacations, on='acct_nbr', how='left')
)
df_transacation_stats.head(3)

In [None]:
# No need to use
#df_account_transaction_summary = pd.merge(df_accounts,df_transacation_stats,on='acct_nbr')[['acct_nbr','cust_id','acct_type','avg_tran_amt','count_tran_amt','starting_balance',
#                                                                                               'ending_balance','account_active','acct_start_date',
#                                                                                               'acct_end_date']]
#df_account_transaction_summary.head(3)

In [None]:
# Attach the per-account transaction statistics to every customer/account row.
# The join key is spelled out: without `on`, pandas joins on every column name the two frames
# share, which silently changes the join if either frame gains a common column.
# validate='many_to_one' asserts there is at most one stats row per account number.
df_cust_accounts2 = pd.merge(
    df_cust_accounts,
    df_transacation_stats,
    on='acct_nbr',
    how='left',
    validate='many_to_one',
)
df_cust_accounts2.head(3)

In [None]:
# Split the per-account statistics into one column per account type (NaN where the row is a
# different type). The raw acct_type codes are space-padded ('SV  '), so compare on the
# stripped value rather than repeating the padded literal in every line.
acct_type_clean = df_cust_accounts2['acct_type'].str.strip()
type_labels = [('SV', 'savings'), ('CK', 'checking'), ('CC', 'credit')]

# Two passes keep the original column order: all averages first, then all counts.
for type_code, label in type_labels:
    df_cust_accounts2[f'avg_{label}_tran_amt'] = df_cust_accounts2['avg_tran_amt'].where(acct_type_clean == type_code)
for type_code, label in type_labels:
    df_cust_accounts2[f'cnt_{label}_tran'] = df_cust_accounts2['count_tran_amt'].where(acct_type_clean == type_code)

df_cust_accounts2.head(3)

In [None]:
# NOTE(review): df_cust_info2 is rebuilt from df_cust_accounts2 two cells below, so this copy
# only serves as a preview.
df_cust_info2 = df_cust_info.copy()
df_cust_info2.head(3)

In [None]:
df_cust_accounts2.columns

In [None]:
# One row per customer: drop the account-only columns, then sum the remaining numeric columns
# within each customer/demographics group.
df_cust_info2 = df_cust_accounts2.loc[:,~df_cust_accounts2.columns.isin(['acct_nbr','ending_balance','acct_end_date','acct_type'])]
# 'years_with_bank' must be a grouping key (as in df_cust_info): left out of the keys it is
# summed once per account, which inflates it for every customer with several accounts.
# NOTE(review): avg_tran_amt and the avg_*_tran_amt columns are also summed across a customer's
# accounts, so they are sums of per-account averages — confirm this is intended.
df_cust_info2 = df_cust_info2.groupby(['cust_id','income','years_with_bank','age','gender','nbr_children','marital_status']).sum().reset_index()
df_cust_info2['total_balance'] = df_cust_info2['checking_balance'] + df_cust_info2['savings_balance'] - df_cust_info2['credit_balance']
df_cust_info2.head(3)

### Demographic clustering

In [None]:
segmentation_cols = ['income', 'age','years_with_bank','nbr_children','marital_status','gender']

In [None]:
# Build the demographic feature matrix.
# .assign() returns a new frame, so the categorical cast never writes into a slice of
# df_customer (the original column assignment triggered SettingWithCopyWarning).
df_seg_full = df_customer[segmentation_cols].assign(
    marital_status=lambda frame: frame['marital_status'].astype('category')
)
# One-hot encode the categorical/object columns; drop_first avoids a redundant dummy per feature.
df_seg_full = pd.get_dummies(df_seg_full, drop_first=True)
df_seg_full.head(3)

In [None]:
# Demographics distortion plot
plot_distortion(df_seg_full)

# **Main function**

In [None]:
def Nik(dataframe, ScalingMethod, ClusteringMethod, NumberOfClusters, epsilon=0.5, min_samples=5, random_state=0):
    """Scale a feature table, cluster it, and return per-cluster feature profiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Feature table; every column must be castable to float.
    ScalingMethod : {'MinMax', 'Standard', 'Robust', 'MaxAbs'}
        Scaler applied to all columns before clustering.
    ClusteringMethod : {'KMeans', 'Agglomerative', 'DBSCAN'}
        Clustering algorithm run on the scaled features.
    NumberOfClusters : int
        Number of clusters for KMeans / Agglomerative; not used by DBSCAN.
    epsilon : float, default 0.5
        DBSCAN ``eps``; only used when ``ClusteringMethod == 'DBSCAN'``.
    min_samples : int, default 5
        DBSCAN ``min_samples``; only used when ``ClusteringMethod == 'DBSCAN'``.
    random_state : int or None, default 0
        Seed for KMeans so cluster assignments are reproducible between runs.

    Returns
    -------
    pandas.DataFrame
        One row per cluster label and one column per feature: the cluster's mean
        of the scaled feature as a percentage of that feature's overall scaled
        mean (100 == overall average).

    Raises
    ------
    ValueError
        If ``ScalingMethod`` or ``ClusteringMethod`` is not one of the names above.
    """
    # Validate both names up front so a typo fails with a clear message instead of a later
    # NameError (scaler never bound) or KeyError (no 'Cluster' column).
    valid_scalers = ('MinMax', 'Standard', 'Robust', 'MaxAbs')
    valid_clusterers = ('KMeans', 'Agglomerative', 'DBSCAN')
    if ScalingMethod not in valid_scalers:
        raise ValueError(f'Unknown ScalingMethod {ScalingMethod!r}; expected one of {valid_scalers}')
    if ClusteringMethod not in valid_clusterers:
        raise ValueError(f'Unknown ClusteringMethod {ClusteringMethod!r}; expected one of {valid_clusterers}')

    if ScalingMethod == 'MinMax':
        scaler = MinMaxScaler()
    elif ScalingMethod == 'Standard':
        scaler = StandardScaler()
    elif ScalingMethod == 'Robust':
        scaler = RobustScaler()
    else:
        scaler = MaxAbsScaler()

    df_scaled = pd.DataFrame(scaler.fit_transform(dataframe.astype(float)), columns = dataframe.columns)

    if ClusteringMethod == 'KMeans':
        model = KMeans(n_clusters=NumberOfClusters, n_init=10, random_state=random_state)
    elif ClusteringMethod == 'Agglomerative':
        # Ward linkage always uses Euclidean distance, so the `affinity` keyword (removed from
        # recent scikit-learn releases) is simply omitted.
        model = AgglomerativeClustering(n_clusters=NumberOfClusters, linkage='ward')
    else:
        # Honour the caller's epsilon / min_samples instead of hard-coded 0.5 / 5.
        model = DBSCAN(eps=epsilon, min_samples=min_samples, metric='euclidean')

    # The model is fitted on the feature columns only; the label column is added afterwards.
    df_scaled['Cluster'] = model.fit_predict(df_scaled)

    # NOTE(review): a feature whose overall scaled mean is 0 (or close to it) produces inf/NaN
    # or exploding percentages here — worth checking for scalers that centre the data.
    overall_mean = df_scaled.drop('Cluster', axis=1).mean()
    df_radar = df_scaled.groupby('Cluster').mean().divide(overall_mean)*100

    return df_radar

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Drop demographic features whose variance does not exceed 0.1.
vt = VarianceThreshold(0.1)
vt.fit(df_seg_full)

# get_support() returns a boolean mask over the input columns marking the ones that were kept.
selected_columns = df_seg_full.columns[vt.get_support()]

# transform() returns a bare array, so rebuild the frame with the surviving column labels.
df_seg_full_transformed = pd.DataFrame(vt.transform(df_seg_full), columns = selected_columns)

In [None]:
# step 1
df_corr = df_seg_full_transformed.corr().abs()
df_corr

***Feature engineering applied to the demographic information***

In [None]:
# Reduced demographic feature set: same table without tenure and the male-gender dummy
# (the dummy's name keeps the trailing space of the raw 'M ' value).
df_seg_full2 = df_seg_full.copy()
df_seg_full2 = df_seg_full2.drop(['years_with_bank', 'gender_M '], axis=1)
df_seg_full2

### Banking behavior clustering

In [None]:
df_cust_info3 = df_cust_info2.copy()

# Demographics plus every balance / transaction feature derived above.
segmentation_cols = ['income', 'age','years_with_bank','nbr_children','marital_status','gender',
                     'credit_balance', 'checking_balance','savings_balance', 'avg_tran_amt', 'count_tran_amt',
                     'avg_savings_tran_amt', 'avg_checking_tran_amt', 'avg_credit_tran_amt',
                     'cnt_savings_tran', 'cnt_checking_tran', 'cnt_credit_tran',
                     'total_balance']

# .assign() returns a new frame, so the categorical cast never writes into a slice of
# df_cust_info3 (the original column assignment triggered SettingWithCopyWarning).
df_cust_seg = df_cust_info3[segmentation_cols].assign(
    marital_status=lambda frame: frame['marital_status'].astype('category')
)
# One-hot encode the categorical/object columns; drop_first avoids a redundant dummy per feature.
df_cust_seg = pd.get_dummies(df_cust_seg, drop_first=True)
df_cust_seg.head(3)

In [None]:
plot_distortion(df_cust_seg)

In [None]:
# Min-max scale every behaviour feature, then label each customer with one of 4 KMeans clusters.
scaler = MinMaxScaler()
df_habits_scaled = pd.DataFrame(scaler.fit_transform(df_cust_seg.astype(float)),columns = df_cust_seg.columns)
# random_state pins the cluster ids: without a seed the labels (and every chart built on them)
# change on each run. The model is fitted before the 'Cluster' column is added.
df_habits_scaled['Cluster'] = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(df_habits_scaled)
df_habits_scaled.head(3)

In [None]:
df_habits_scaled['Cluster'].value_counts()

# Radar Chart

In [None]:
def plot_radar_chart(df, mean=False):
    """Show a plotly radar chart with one filled trace per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per cluster (the index value is used in the legend as
        ``cluster_<index>``) and one column per radar axis.
    mean : bool, default False
        When ``True``, add a reference trace named ``'mean'`` at a constant
        radius of 100 on every axis.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    traces = [
        go.Scatterpolar(r = row.values,
                        theta = row.index,
                        name = f'cluster_{label}',
                        fill = 'toself')
        for label, row in df.iterrows()
    ]

    # Optional reference line; the equality test is kept so only True (or 1) enables it.
    if mean == True:
        traces.append(go.Scatterpolar(r = [100] * df.shape[1],
                                      theta = df.columns,
                                      name = 'mean',
                                      fill = 'toself'))

    radar_layout = go.Layout(polar = {'radialaxis': {'visible': True}},
                             showlegend = True)
    go.Figure(data = traces, layout = radar_layout).show()

In [None]:
# Decent clusters:

# df_seg_full, 'MinMax', 'Agglomerative', 3

***Demographic Radar Chart***

In [None]:
plot_radar_chart(Nik(df_seg_full, 'MinMax', 'Agglomerative', 3, epsilon=0.2, min_samples=10))

In [None]:
plot_radar_chart(Nik(df_seg_full2, 'MinMax', 'Agglomerative', 3, epsilon=0.2, min_samples=10))

***Banking Behaviour Radar Chart***

In [None]:
plot_radar_chart(Nik(df_cust_seg, 'MinMax', 'KMeans', 4, epsilon=0.2, min_samples=10))

### A lot of information was used, especially numerical data; we need to reduce the number of features so that each one can be seen clearly

***Feature selection implementation for banking behaviour analysis***

Analyzing clients by their current balances

In [None]:
segmentation_cols = ['income', 'age','years_with_bank','nbr_children','marital_status_2','marital_status_3',
                     'marital_status_4','gender_M ','credit_balance', 'checking_balance','savings_balance']
df_cust_seg2 = df_cust_seg[segmentation_cols]


In [None]:
plot_distortion(df_cust_seg2)

In [None]:
plot_radar_chart(Nik(df_cust_seg2, 'MinMax', 'KMeans', 4, epsilon=0.2, min_samples=10))

***Changing the scaling method (MinMax → Robust) while keeping KMeans***

In [None]:
plot_radar_chart(Nik(df_cust_seg2, 'Robust', 'KMeans', 4, epsilon=0.2, min_samples=10))

***Analyzing only credit balances and their transactions***

In [None]:
# Demographics plus the credit-account features only (balance, average transaction amount,
# transaction count).
df_cust_seg3 = df_cust_seg[['income', 'age', 'years_with_bank', 'nbr_children','marital_status_2', 'marital_status_3',
                            'marital_status_4', 'gender_M ', 'credit_balance','avg_credit_tran_amt','cnt_credit_tran']]
                           
# epsilon / min_samples are passed through but the clustering method selected here is KMeans.
plot_radar_chart(Nik(df_cust_seg3, 'MinMax', 'KMeans', 4, epsilon=0.2, min_samples=10)) 

In [None]:
plot_radar_chart(Nik(df_cust_seg3, 'Robust', 'KMeans', 4, epsilon=0.2, min_samples=10))

***Analyzing savings balances with their transactions***

In [None]:
df_cust_seg.columns

In [None]:
# Select columns with a LIST of names (double brackets). A bare comma-separated sequence inside
# single brackets is a tuple, which pandas looks up as one column key and raises KeyError.
df_cust_seg4 = df_cust_seg[['income', 'age', 'years_with_bank', 'nbr_children', 'credit_balance',
                            'checking_balance', 'savings_balance', 'avg_tran_amt', 'count_tran_amt',
                            'avg_savings_tran_amt', 'avg_checking_tran_amt', 'avg_credit_tran_amt',
                            'cnt_savings_tran', 'cnt_checking_tran', 'cnt_credit_tran',
                            'total_balance', 'marital_status_2', 'marital_status_3',
                            'marital_status_4', 'gender_M ']]

# PCA --> Scatterplot Visualizations

In [None]:
from sklearn.decomposition import PCA

# Project the scaled behaviour features onto their first two principal components.
# The 'Cluster' label column is excluded from the fit: it is an output of the clustering, not
# a feature, and leaving it in lets the label itself shape the projection it is plotted on.
pca = PCA(n_components=2)
components = pca.fit_transform(df_habits_scaled.drop(columns='Cluster'))
df_pca = pd.DataFrame(components, columns = ['PC1', 'PC2'])
df_pca['Cluster'] = df_habits_scaled['Cluster']

# Cast the label to str so plotly colours the clusters as discrete categories.
fig = px.scatter(df_pca, x='PC1', y='PC2', color=df_pca['Cluster'].astype(str))
fig.show()

In [None]:
df_pca

# Banking behaviour

In [None]:
# Attach each customer's city and state to the per-customer summary (joined on cust_id).
df_bank_info = df_cust_info3.copy()
df_bank_info = df_bank_info.merge(df_customer[['cust_id','city_name','state_code']],on='cust_id')
df_bank_info.head(3)

## What do we know about the bank?

In [None]:
# Average income and average total balance per state.
states_income = df_bank_info[['income','total_balance','state_code']].groupby('state_code').mean().reset_index()
states_income.head(3)

In [None]:
# Average income and average total balance per city.
cities_income = df_bank_info[['income','total_balance','city_name']].groupby('city_name').mean().reset_index()
cities_income.head(3)

In [None]:
# Rank states by average income (ties broken by average total balance), highest first.
states_income = states_income.sort_values(by=['income', 'total_balance'], ascending=False)

# One bar series per metric, sharing the state axis.
trace1 = go.Bar(x=states_income['state_code'],
                y=states_income['income'],
                name='Income',
                marker={'color': 'yellow'})
trace2 = go.Bar(x=states_income['state_code'],
                y=states_income['total_balance'],
                name='Net Worth',
                marker={'color': 'green'})

layout = go.Layout(title='States average income and their net worth')
data = [trace1, trace2]

fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Rank cities by average income (ties broken by average total balance), highest first.
cities_income = cities_income.sort_values(['income','total_balance'],ascending=False)

# Only income is plotted for cities, so the title says exactly that (it previously also
# promised net worth, which this chart does not show).
trace1 = go.Bar(
    x = cities_income['city_name'],
    y = cities_income['income'],
    name = 'Income',
    marker=dict(color='red')
)

# create the layout
layout = go.Layout(
    title='Cities average income'
)
data = [trace1]

# create the figure
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Spread income into one column per age band: a customer's income appears in the column of
# the band their age falls in and is NaN in every other band column.
df_ages = df_bank_info[['cust_id','income','age']]
# .copy() so the new columns are added to an independent frame, not to a slice of
# df_bank_info (the original assignment triggered SettingWithCopyWarning).
ages = df_ages[['cust_id','age']].copy()

# Bands are half-open [lower, upper): e.g. '18-25' holds ages 18 through 24.
age_bands = [('<18', float('-inf'), 18),
             ('18-25', 18, 25),
             ('25-35', 25, 35),
             ('35-55', 35, 55),
             ('55-65', 55, 65),
             ('65+', 65, float('inf'))]
for band_label, lower, upper in age_bands:
    in_band = (df_ages['age'] >= lower) & (df_ages['age'] < upper)
    ages[band_label] = df_ages['income'].where(in_band)

ages.head(3)

In [None]:
import seaborn as sns

plt.subplot(1,2,1)

sns.boxplot(x='age',y='income', data = ages)
plt.ylabel('')