In [1]:
import pandas as pd 
import numpy as np
from sklearn.cluster import KMeans
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the semicolon-separated customer file, then show the fraction of missing values per column
df = pd.read_csv('twm_customer.csv', sep=';')
df.isna().mean()

FileNotFoundError: [Errno 2] File twm_customer.csv does not exist: 'twm_customer.csv'

In [None]:
# Keep only the four columns that are scaled and clustered below
df_dropped = df.loc[:, ['income', 'age', 'years_with_bank', 'nbr_children']]

In [None]:
# StandardScaler standardises each listed column (zero mean, unit variance);
# it does NOT map values into the [0, 1] range.
ct = make_column_transformer(
    (StandardScaler(), ['income', 'age', 'years_with_bank', 'nbr_children']),
)

df_scaled = ct.fit_transform(df_dropped)

In [None]:
# Wrap the scaled output in a DataFrame again and restore the column names
df_scaled = pd.DataFrame(
    data=df_scaled,
    columns=['income', 'age', 'years_with_bank', 'nbr_children'],
)

In [None]:
# Multiply the scaled income by 2 so it weighs more heavily in the k-means distance.
# NOTE: this cell is not idempotent - running it again doubles income once more.
df_scaled['income'] = 2 * df_scaled['income']

In [None]:
# Elbow plot used to pick the number of clusters for k-means
def plot_distortion(X, max_clusters=10):
    """Plot the k-means inertia for 1..max_clusters clusters.

    Parameters
    ----------
    X : data passed unchanged to ``KMeans.fit``.
    max_clusters : int, default 10
        Largest number of clusters to try (inclusive).

    One KMeans model is fitted per cluster count with ``init='k-means++'``,
    ``n_init=10`` and ``random_state=0``; its ``inertia_`` is plotted against
    the cluster count. Nothing is returned.
    """
    cluster_counts = range(1, max_clusters + 1)
    inertias = [
        KMeans(n_clusters=k,
               init='k-means++',
               n_init=10,
               random_state=0).fit(X).inertia_
        for k in cluster_counts
    ]

    plt.plot(cluster_counts, inertias, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.show()

In [None]:
# Draw the elbow curve for 1 to 10 clusters on the scaled features
plot_distortion(X=df_scaled, max_clusters=10)

In [None]:
# let's use 5 clusters from the graph given and apply kmeans to df_scaled

In [None]:
# Fit the final 5-cluster model.
# random_state is fixed so the cluster ids (0-4) that later cells refer to by
# number stay the same from run to run.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)

# Select the feature columns explicitly: a later cell adds a 'Cluster' column to
# df_scaled, and re-running this cell must not cluster on those old labels.
y = kmeans.fit_predict(df_scaled[['income', 'age', 'years_with_bank', 'nbr_children']])



In [None]:
# Attach each row's k-means cluster id to df_scaled as the 'Cluster' column
df_scaled = df_scaled.assign(Cluster=y)

In [None]:
# Rescale the four feature columns into the [0, 1] range with MinMaxScaler
ct = make_column_transformer(
    (MinMaxScaler(), ['income','age','years_with_bank','nbr_children']),
)

# Rebuild a labelled DataFrame from the rescaled output and re-attach the cluster ids
df_scaled_minmax = pd.DataFrame(
    ct.fit_transform(df_scaled),
    columns=['income','age','years_with_bank','nbr_children'],
).assign(Cluster=y)

# Mean of every rescaled feature within each cluster
df_scaled_minmax.groupby('Cluster').mean()

In [None]:
# Preview the min-max scaled features with their k-means cluster id
# (first rows only, rather than dumping the whole frame)
df_scaled_minmax.head()

In [None]:
# Select only the customers assigned to cluster 0
df_cluster_0 = df_scaled_minmax[df_scaled_minmax.Cluster == 0]

# Average of each feature within cluster 0, as a plain array
stats0 = df_cluster_0.drop('Cluster', axis=1).mean(axis=0).values

# Axis labels for the radar chart, in the same order as the feature columns
labels = np.array(['income','age','years_with_bank','nbr_children'])

# Plot the first cluster.
# The plotting frame has its own name so the raw customer table in `df` is not
# overwritten; pandas and plotly.express are already imported in the first cell.
radar_df = pd.DataFrame(dict(
    r=stats0,
    theta=labels))
fig = px.line_polar(radar_df, r='r', theta='theta', line_close=True,
                    title='Cluster 0: mean of min-max scaled features')
fig.show()

In [None]:
# Select only the customers assigned to cluster 1
df_cluster_1 = df_scaled_minmax[df_scaled_minmax.Cluster == 1]

# Average of each feature within cluster 1, as a plain array
stats1 = df_cluster_1.drop('Cluster', axis=1).mean(axis=0).values

# Axis labels for the radar chart, in the same order as the feature columns
labels = np.array(['income','age','years_with_bank','nbr_children'])

# Plot the second cluster.
# The plotting frame has its own name so the raw customer table in `df` is not
# overwritten; pandas and plotly.express are already imported in the first cell.
radar_df = pd.DataFrame(dict(
    r=stats1,
    theta=labels))
fig = px.line_polar(radar_df, r='r', theta='theta', line_close=True,
                    title='Cluster 1: mean of min-max scaled features')
fig.show()

In [None]:
# Per-cluster feature means for clusters 2, 3 and 4.
# `.values` turns each result into a plain array, matching stats0 and stats1.

# Select only the customers assigned to cluster 2
df_cluster_2 = df_scaled_minmax[df_scaled_minmax.Cluster == 2]

# Average of each feature within cluster 2
stats2 = df_cluster_2.drop('Cluster', axis=1).mean(axis=0).values

# Select only the customers assigned to cluster 3
df_cluster_3 = df_scaled_minmax[df_scaled_minmax.Cluster == 3]

# Average of each feature within cluster 3
stats3 = df_cluster_3.drop('Cluster', axis=1).mean(axis=0).values

# Select only the customers assigned to cluster 4
df_cluster_4 = df_scaled_minmax[df_scaled_minmax.Cluster == 4]

# Average of each feature within cluster 4
stats4 = df_cluster_4.drop('Cluster', axis=1).mean(axis=0).values


In [None]:
# Plot the third cluster (cluster id 2).
# The plotting frame has its own name so the raw customer table in `df` is not
# overwritten; pandas and plotly.express are already imported in the first cell.
radar_df = pd.DataFrame(dict(
    r=stats2,
    theta=labels))
fig = px.line_polar(radar_df, r='r', theta='theta', line_close=True,
                    title='Cluster 2: mean of min-max scaled features')
fig.show()

In [None]:
# Plot the fourth cluster (cluster id 3).
# The plotting frame has its own name so the raw customer table in `df` is not
# overwritten; pandas and plotly.express are already imported in the first cell.
radar_df = pd.DataFrame(dict(
    r=stats3,
    theta=labels))
fig = px.line_polar(radar_df, r='r', theta='theta', line_close=True,
                    title='Cluster 3: mean of min-max scaled features')
fig.show()

In [None]:
# Plot the fifth cluster (cluster id 4).
# The plotting frame has its own name so the raw customer table in `df` is not
# overwritten; pandas and plotly.express are already imported in the first cell.
radar_df = pd.DataFrame(dict(
    r=stats4,
    theta=labels))
fig = px.line_polar(radar_df, r='r', theta='theta', line_close=True,
                    title='Cluster 4: mean of min-max scaled features')
fig.show()

In [None]:
import plotly.graph_objects as go

# Overlay the mean feature profile of all five clusters on one radar chart
categories = ['income','age','years_with_bank','nbr_children']
fig = go.Figure()

# One filled trace per cluster; cluster id k is shown as "Customer Group k+1"
cluster_profiles = [stats0, stats1, stats2, stats3, stats4]
for group_number, profile in enumerate(cluster_profiles, start=1):
    fig.add_trace(go.Scatterpolar(
          r=profile,
          theta=categories,
          fill='toself',
          name=f'Customer Group {group_number}'
    ))

# The radial axis is fixed to [0, 1] because the profiles are min-max scaled.
# The legend is shown: without it the overlaid groups cannot be told apart.
fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 1]
    )),
  showlegend=True
)

fig.show()

In [None]:
# Preview of the min-max scaled features with cluster ids (first rows only)
df_scaled_minmax.head()

In [None]:
# Pairwise scatter plots of the scaled features, coloured by k-means cluster
features = ['income','age','years_with_bank','nbr_children']

fig = px.scatter_matrix(df_scaled, dimensions=features, color='Cluster')

# Hide the diagonal panels (each feature plotted against itself)
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
from sklearn.decomposition import PCA

# Project the four scaled features onto their first two principal components
features = ['income','age','years_with_bank','nbr_children']
pca = PCA(n_components=2)
components = pca.fit_transform(df_scaled[features])

# Axis titles carrying each component's share of explained variance.
# Stored as `pc_labels` so the radar-chart `labels` array defined earlier is
# not replaced by a dict (which would break those cells when re-run).
pc_labels = {
    str(i): f'PC {i+1} ( {var:.1f}%)'
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

# Scatter of the two components, coloured by k-means cluster
fig = px.scatter_matrix(
    components,
    labels=pc_labels,
    dimensions=range(2),
    color=df_scaled.Cluster,
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
# See how well the PCs explain the variance in the dataset
cum_explained_variance = np.cumsum(pca.explained_variance_ratio_)

# Number the components from 1 so the x axis matches its "PC number" label
pc_numbers = np.arange(1, len(cum_explained_variance) + 1)

# The ratios are fractions; multiply by 100 so the plotted values match the "%" axis label
plt.plot(pc_numbers, cum_explained_variance * 100, marker='o')
plt.xticks(pc_numbers)
plt.xlabel('PC number')
plt.ylabel('% Cumulative explained variance')
plt.show()