In [191]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.decomposition import PCA

In [192]:
# Read the mall-customer CSV from the notebook's working directory and show it.
DATA_PATH = './Mall_Customers.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
...,...,...,...,...,...
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18


In [193]:
# Remove the CustomerID column before analysis.
# Rebinding the result (instead of inplace=True) and ignoring an already
# missing column lets this cell be re-run without raising KeyError.
df = df.drop(columns='CustomerID', errors='ignore')

In [194]:
# Heatmap of pairwise correlations between the numeric columns.
# 'Gender' still holds the strings 'Male'/'Female' at this point, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the
# numeric columns are selected explicitly first.
px.imshow(df.select_dtypes(include='number').corr())

In [195]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0
mean,38.85,60.56,50.2
std,13.969007,26.264721,25.823522
min,18.0,15.0,1.0
25%,28.75,41.5,34.75
50%,36.0,61.5,50.0
75%,49.0,78.0,73.0
max,70.0,137.0,99.0


In [196]:
# Pairwise scatter plots of the columns, with points coloured by Gender.
px.scatter_matrix(df, color='Gender')

In [197]:
# Annual-income histogram split by Gender, with a box plot drawn in the margin.
px.histogram(df, x='Annual Income (k$)', color='Gender', marginal='box')

#### The histogram shows a male customer with an annual income of $137,000 who is an outlier, so we drop that record.

In [198]:
# Count missing values per column.
df.isna().sum()

Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

In [199]:
# Drop, in place, every row whose annual income equals the outlier value 137 (k$).
outlier_rows = df.index[df['Annual Income (k$)'] == 137]
df.drop(index=outlier_rows, inplace=True)

In [200]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [201]:
# Encode Gender numerically so it can be scaled with the other features.
gender_codes = {'Male': 1, 'Female': 0}
df['Gender'] = df['Gender'].replace(gender_codes)

In [202]:
# Standardize every column of df; the fitted scaler is kept for later cells.
scaler = StandardScaler()
df_transformed = scaler.fit_transform(df)

# pca = PCA(n_components=3)
# pca.fit_transform(df_transformed)




In [203]:
# Inspect the standardized feature matrix returned by the scaler (a NumPy array).
df_transformed

array([[ 1.1411948 , -1.42541358, -1.77917141, -0.43598941],
       [ 1.1411948 , -1.2823665 , -1.77917141,  1.19941337],
       [-0.87627458, -1.35389004, -1.73944701, -1.72094874],
       [-0.87627458, -1.13931942, -1.73944701,  1.04366072],
       [-0.87627458, -0.5671311 , -1.69972262, -0.39705125],
       [-0.87627458, -1.21084296, -1.69972262,  1.00472256],
       [-0.87627458, -0.28103694, -1.65999822, -1.72094874],
       [-0.87627458, -1.13931942, -1.65999822,  1.70560947],
       [ 1.1411948 ,  1.79314572, -1.62027383, -1.83776323],
       [-0.87627458, -0.63865464, -1.62027383,  0.84896992],
       [ 1.1411948 ,  2.00771635, -1.62027383, -1.40944345],
       [-0.87627458, -0.28103694, -1.62027383,  1.90030028],
       [-0.87627458,  1.36400448, -1.58054943, -1.37050529],
       [-0.87627458, -1.06779588, -1.58054943,  1.04366072],
       [ 1.1411948 , -0.13798986, -1.58054943, -1.44838161],
       [ 1.1411948 , -1.21084296, -1.58054943,  1.12153705],
       [-0.87627458, -0.

In [204]:
# Fit KMeans for each candidate cluster count and record two diagnostics
# per fit: the within-cluster inertia and the mean silhouette score.
inertia_ = []
silhouette_scores = []
k = 20  # exclusive upper bound: the sweep covers 2 .. k-1 clusters
for n_clusters in range(2, k):
    model = KMeans(n_clusters=n_clusters, random_state=42).fit(df_transformed)
    inertia_.append(model.inertia_)
    silhouette_scores.append(
        silhouette_score(df_transformed, model.labels_, metric='euclidean')
    )

In [205]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Plot inertia (left axis) and silhouette score (right axis) against the
# number of clusters tried in the sweep.
fig = make_subplots(specs=[[{"secondary_y": True}]])
cluster_counts = list(range(2, k))

for series_name, series_values, on_secondary in (
    ("inertia_", inertia_, False),
    ("silhouette_scores", silhouette_scores, True),
):
    fig.add_trace(
        go.Scatter(x=cluster_counts, y=series_values, name=series_name, mode="lines"),
        secondary_y=on_secondary,
    )
    fig.update_yaxes(title_text=series_name, secondary_y=on_secondary)

fig.update_xaxes(title_text="number of clusters")
fig



We choose K = 10 as the best number of clusters, based on the inertia and silhouette-score curves above.

In [206]:
# Fit the final KMeans model with the chosen number of clusters and show it.
model = KMeans(n_clusters=10, random_state=42).fit(df_transformed)
model

KMeans(n_clusters=10, random_state=42)

In [207]:
# Build the plotting frame: original-scale features plus each row's cluster
# label. Scaling followed by inverse scaling only reproduces `df`, so the
# features are taken from `df` directly. This also keeps the cell re-runnable:
# feeding the previous result (which already carries a string 'labels' column)
# back into scaler.inverse_transform would fail on a second execution.
# NOTE: the name `df_transformed` is kept because the next cell plots from it,
# even though its values are in the original units after this point.
df_transformed = (
    df.reset_index(drop=True)  # fresh positional index, same row order as model.labels_
      .assign(labels=model.labels_.astype(str))  # labels stored as strings
)


In [208]:
# 3-D scatter of income, spending score and age, coloured by cluster label.
axes_3d = {
    'x': 'Annual Income (k$)',
    'y': 'Spending Score (1-100)',
    'z': 'Age',
}
px.scatter_3d(df_transformed, color='labels', **axes_3d)