# Mall Customers Clustering Project

In [41]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from kneed import KneeLocator
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

## Load the data

In [42]:
data = pd.read_csv("Mall_Customers.csv")
data.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


## Data Preprocessing

In [43]:
# check the nulls
data.isnull().sum()

CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

In [44]:
# check duplicates
data.duplicated().sum()

np.int64(0)

## Visualize outliers

In [45]:
fig = make_subplots(
    rows=3, cols=1,
    subplot_titles=["Age", "Annual Income (k$)", "Spending Score (1-100)"]
)

fig.add_trace(go.Box(y=data["Age"], name="Age", marker_color="royalblue"), row=1, col=1)
fig.add_trace(go.Box(y=data["Annual Income (k$)"], name="Annual Income", marker_color="seagreen"), row=2, col=1)
fig.add_trace(go.Box(y=data["Spending Score (1-100)"], name="Spending Score", marker_color="darkorange"), row=3, col=1)

fig.update_layout(
    title="Box Plots of Customer Features",
    height=900,
    template="plotly_white"
)

fig.show()

## Handling outliers

In [46]:
Q1 = data[['Annual Income (k$)']].quantile(0.25)
Q3 = data[['Annual Income (k$)']].quantile(0.75)
IQR = Q3 - Q1
filter = (data[['Annual Income (k$)']] >= (Q1 - 1.5 * IQR)) & (data[['Annual Income (k$)']] <= (Q3 + 1.5 * IQR))
data = data[filter.all(axis=1)]
fig = px.box(data, y="Annual Income (k$)", title="Box Plot of Annual Income")
fig.show()

In [47]:
fig1 = px.scatter(
    data,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    color="Gender",  # color customers by gender
    title="Annual Income vs Spending Score"
)
fig1.show()

# 3D scatter plot
fig2 = px.scatter_3d(
    data,
    x="Age",
    y="Annual Income (k$)",
    z="Spending Score (1-100)",
    color="Gender",
    title="3D Visualization of Customers"
)
fig2.show()

In [48]:
x= data[['Annual Income (k$)','Spending Score (1-100)']]

## Scale the data

In [49]:
scaler= StandardScaler()
X_scaled= scaler.fit_transform(x)

## Calculate WCSS and Silhouette to select the best K


In [50]:
# Calculate WCSS and Silhouette
wcss = []
silhouette_scores = []
K = range(2, 11)

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# Create Plotly figure with dual y-axis
fig = go.Figure()

# Add WCSS trace (left y-axis)
fig.add_trace(go.Scatter(
    x=list(K), y=wcss,
    mode="lines+markers",
    name="WCSS (Elbow Method)",
    yaxis="y1"
))

# Add Silhouette Score trace (right y-axis)
fig.add_trace(go.Scatter(
    x=list(K), y=silhouette_scores,
    mode="lines+markers",
    name="Silhouette Score",
    yaxis="y2"
))

# Update layout for dual axes
fig.update_layout(
    title="WCSS vs Silhouette Score for Different k",
    xaxis=dict(title="Number of Clusters (k)"),
    yaxis=dict(title="WCSS", side="left"),
    yaxis2=dict(title="Silhouette Score", overlaying="y", side="right"),
    legend=dict(x=0.5, y=-0.2, orientation="h", xanchor="center")
)

fig.show()


## Apply K-means with k = 5

In [51]:
k_means= KMeans(n_clusters=5,random_state=42)
data["Cluster"] = k_means.fit_predict(X_scaled)

In [52]:
data.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100),Cluster
0,1,Male,19,15,39,0
1,2,Male,21,15,81,2
2,3,Female,20,16,6,0
3,4,Female,23,16,77,2
4,5,Female,31,17,40,0


In [53]:
import plotly.express as px

# Scatter plot with clusters
fig = px.scatter(
    data,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    color="Cluster",  # cluster labels
    title="Customer Segments",
    color_continuous_scale="Viridis",  # similar to cmap="viridis"
    hover_data=["Age"]  # optional, shows extra info when hovering
)

fig.update_traces(marker=dict(size=10, line=dict(width=1, color="DarkSlateGrey")))

fig.show()

## DBSCAN
Plot k-distance to select the best value for eps

In [54]:
k = 5  # same as n_neighbors
neighbors = NearestNeighbors(n_neighbors=k)
neighbors_fit = neighbors.fit(X_scaled)
distances, indices = neighbors_fit.kneighbors(X_scaled)

# take the distance to the k-th nearest neighbor
distances = np.sort(distances[:, k-1])

# Plot k-distance graph
fig = px.line(
    x=range(1, len(distances)+1),
    y=distances,
    title=f"{k}-Distance Graph (for eps selection)",
    labels={"x": "Points (sorted)", "y": f"{k}-NN Distance"},
    template="plotly_white"
)
fig.show()

In [55]:
kneedle = KneeLocator(range(1, len(distances)+1), distances, curve="convex", direction="increasing")
eps_value = distances[kneedle.knee]

print("Recommended eps value:", eps_value)

Recommended eps value: 0.5026050222664861


In [56]:
# Run DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
data["DBSCAN_Cluster"] = dbscan.fit_predict(X_scaled)

# Plot results
fig = px.scatter(
    data,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    color="DBSCAN_Cluster",
    hover_data=["Age"],
    title="DBSCAN Clustering",
    template="plotly_white"
)
fig.show()

print()





In [57]:
data["DBSCAN_Cluster"].value_counts()

DBSCAN_Cluster
 0    157
 1     34
-1      7
Name: count, dtype: int64

# Analyze avg spending per cluster

In [58]:
data["KMeans_Cluster"] = k_means.fit_predict(X_scaled)
data["DBSCAN_Cluster"] = dbscan.fit_predict(X_scaled)

In [59]:
print("\nAverage Spending per Cluster (KMeans):")
print(data.groupby("KMeans_Cluster")["Spending Score (1-100)"].mean())

print("\nAverage Spending per Cluster (DBSCAN):")
print(data.groupby("DBSCAN_Cluster")["Spending Score (1-100)"].mean())



Average Spending per Cluster (KMeans):
KMeans_Cluster
0    42.814433
1    84.615385
2    79.363636
3    18.648649
4    77.586207
Name: Spending Score (1-100), dtype: float64

Average Spending per Cluster (DBSCAN):
DBSCAN_Cluster
-1    48.857143
 0    43.101911
 1    83.235294
Name: Spending Score (1-100), dtype: float64
