In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the Mall Customers dataset through kagglehub and keep the value
# it returns so later cells can refer to the downloaded data.
dataset_handle = 'vjchoudhary7/customer-segmentation-tutorial-in-python'
vjchoudhary7_customer_segmentation_tutorial_in_python_path = kagglehub.dataset_download(dataset_handle)

print('Data source import complete.')


# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings('ignore')

# Load the dataset

In [None]:
# Read the CSV from the location kagglehub reported in the first cell instead
# of a hard-coded /kaggle/input path, so the notebook also finds the data when
# it is run outside Kaggle.
data = pd.read_csv(f"{vjchoudhary7_customer_segmentation_tutorial_in_python_path}/Mall_Customers.csv")
data

In [None]:
# Print each column's dtype and non-null count to spot missing values.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Pairwise scatter plots of the numeric columns, coloured by 'Gender', with
# KDE curves on the diagonal.
pair_grid = sns.pairplot(data, hue='Gender', kind='scatter', diag_kind='kde', palette='Set1')
# plt.title() would only caption the last subplot that was drawn; a
# figure-level suptitle labels the whole grid. y > 1 lifts it above the top row.
pair_grid.figure.suptitle("Pairplot of Data", y=1.02)
plt.show()

#  Select relevant columns

In [None]:
# Keep only the two features used for clustering. The explicit .copy() makes
# `data` an independent frame, so the label columns that later cells add are
# unambiguous writes to this frame rather than writes to a selection of the
# originally loaded one (which pandas flags as chained assignment).
data = data[['Annual Income (k$)', 'Spending Score (1-100)']].copy()

# Standardize the features (zero mean, unit variance) for better clustering performance

In [None]:
# Standardise the two clustering features to zero mean and unit variance.
# The columns are named explicitly so that re-running this cell after later
# cells have added label columns to `data` still scales only the features.
scaler = StandardScaler()
X = scaler.fit_transform(data[['Annual Income (k$)', 'Spending Score (1-100)']])

#  Plot the Dendrogram to determine the optimal number of clusters

In [None]:
# Build the Ward linkage on the scaled features first, then draw the
# resulting merge tree on an explicitly created, wide axes.
dendro_fig, dendro_ax = plt.subplots(figsize=(25, 7))
ward_linkage = sch.linkage(X, method='ward')
dendrogram = sch.dendrogram(ward_linkage, ax=dendro_ax)
dendro_ax.set_title('Dendrogram for Agglomerative Clustering')
dendro_ax.set_xlabel('Data Points')
dendro_ax.set_ylabel('Euclidean Distance')
plt.show()

#  Apply Agglomerative Clustering with 5 clusters

In [None]:
# Ward-linkage agglomerative clustering into five groups on the scaled
# features; the resulting labels are stored next to the raw features.
agglom = AgglomerativeClustering(
    n_clusters=5,
    metric='euclidean',
    linkage='ward',
)
agglom.fit(X)
y_agglo = agglom.labels_
data.loc[:, 'Labels'] = y_agglo

# Plot Agglomerative Clustering results using Plotly

In [None]:
# Cast the labels to strings so Plotly treats them as discrete categories
# (one colour per cluster) instead of a continuous colour scale.
cluster_as_text = data['Labels'].astype(str)
fig = px.scatter(
    data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    color=cluster_as_text,
    title='Agglomerative Clustering with 5 Clusters',
    labels={'Labels': 'Cluster'},
    color_discrete_sequence=px.colors.qualitative.Set1,
)
iplot(fig)

#  Apply K-Means Clustering and find the optimal number of clusters using the Elbow Method

In [None]:
# Elbow method: fit K-Means on the standardised features for k = 1..10 and
# record the within-cluster sum of squares (inertia) for each k.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Fit the final model once, outside the sweep. Previously the labels were
# reassigned on every iteration from a refit on the *unscaled* columns, so
# `y_kmeans` / 'KMeans_Labels' ended up holding the k=10 result from a
# different feature space than the one the elbow curve was computed on.
# 5 matches the cluster count used for the agglomerative model; adjust it if
# the elbow plot suggests a different value.
optimal_k = 5
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X)
data['KMeans_Labels'] = y_kmeans

# Plot the Elbow Method Graph

In [None]:
# WCSS against the number of clusters; the "elbow" of this curve is the
# usual heuristic for choosing k.
elbow_fig, elbow_ax = plt.subplots(figsize=(8, 5))
elbow_ax.plot(range(1, 11), wcss, marker='o', linestyle='--', color='purple')
elbow_ax.set_xlabel('Number of Clusters')
elbow_ax.set_ylabel('WCSS')
elbow_ax.set_title('Elbow Method for Optimal K')
plt.show()

# Apply DBSCAN Clustering

In [None]:
# DBSCAN is fitted on the raw (unscaled) income/score columns, so eps=15 is
# expressed in those columns' original units.
dbscan_features = data[['Annual Income (k$)', 'Spending Score (1-100)']]
db = DBSCAN(min_samples=6, eps=15)
db.fit(dbscan_features)
y = db.labels_
# Note: this replaces whatever values the 'Labels' column held before.
data['Labels'] = y

#  Plot DBSCAN results using Plotly

In [None]:
# Scatter of the two features coloured by the current 'Labels' column; the
# labels are cast to strings so each one gets its own discrete colour.
dbscan_title = 'DBSCAN Clustering (eps=15, min_samples=6)'
label_as_text = data['Labels'].astype(str)

fig = px.scatter(
    data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    color=label_as_text,
    title=dbscan_title,
    labels={'Labels': 'Cluster'},
    color_discrete_sequence=px.colors.qualitative.Set2,
)

# Re-state the axis titles and figure title explicitly on the layout.
layout_overrides = {
    'xaxis_title': "Annual Income (k$)",
    'yaxis_title': "Spending Score (1-100)",
    'title': dbscan_title,
}
fig.update_layout(**layout_overrides)

iplot(fig)

#  Compute Silhouette Scores

In [None]:
# Score every clustering in the same feature space (the raw income/score
# columns) so the three numbers are comparable.
features = data[['Annual Income (k$)', 'Spending Score (1-100)']]
kmeans_silhouette = silhouette_score(features, y_kmeans)
agglomerative_silhouette = silhouette_score(features, y_agglo)

# DBSCAN labels noise points as -1. They are not a cluster, so they are left
# out of the score instead of being treated as one. silhouette_score also
# needs at least two distinct labels; report NaN rather than raising when
# DBSCAN found fewer than two clusters.
clustered = y != -1
if np.unique(y[clustered]).size >= 2:
    dbscan_silhouette = silhouette_score(features.loc[clustered], y[clustered])
else:
    dbscan_silhouette = float('nan')

print("Silhouette Score:")
print("K-Means Silhouette Score:", kmeans_silhouette)
print("Agglomerative Clustering Silhouette Score:", agglomerative_silhouette)
print("DBSCAN Silhouette Score:", dbscan_silhouette)

#  Visualize Cluster Distribution using Pie Chart

In [None]:
# Share of customers per agglomerative cluster. The 'Labels' column has been
# overwritten with DBSCAN labels by this point, so the agglomerative labels
# are taken from `y_agglo` and attached to a temporary frame for plotting.
agglo_frame = data.assign(Cluster=y_agglo)
pie_fig = px.pie(agglo_frame, names='Cluster', title='Cluster Distribution - Agglomerative Clustering',
                 color_discrete_sequence=px.colors.sequential.Purples)
# Display the pie chart itself (previously the earlier scatter `fig` was shown again).
iplot(pie_fig)