In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import (KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering,
                             OPTICS, AffinityPropagation, MeanShift, Birch)
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.decomposition import PCA

In [2]:
# Restrict OpenMP to one thread; intended to avoid the KMeans memory-leak warning.
# NOTE(review): this runs after the scikit-learn/NumPy imports at the top of the
# notebook. OpenMP runtimes typically read this variable when they are first
# loaded, so it may need to be set before those imports to take effect - confirm.
os.environ.update(OMP_NUM_THREADS="1")

In [3]:
# Load the data
# The CSV location can be overridden with the MALL_CUSTOMERS_CSV environment
# variable so the notebook is not tied to a single machine's directory layout.
# When the variable is unset, the original absolute path is used unchanged.
data_url = os.environ.get(
    "MALL_CUSTOMERS_CSV",
    r'E:\Journal Papers\Comparative Analysis of Advanced Clustering Algorithms for Market Segmentation\A Case Study on Mall Customer Data\Mall_Customers.csv',
)
data = pd.read_csv(data_url)

In [4]:
# Columns used as clustering inputs.
features = [
    'Age',
    'Annual Income (k$)',
    'Spending Score (1-100)',
]
# Keep only rows where every selected column is present, then take the
# feature matrix from the cleaned frame.
data = data.dropna(subset=features, how='any')
X = data[features]

In [5]:
# Fit the scaler on the selected features, then transform those same rows.
# `scaler` stays bound to the fitted StandardScaler instance.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [6]:
def plot_clusters(X, labels, title):
    """Show a 2-D scatter plot of the samples, colored by cluster label.

    The data is projected onto two PCA components purely for display; a new
    PCA is fitted on every call.

    Parameters:
        X: feature matrix passed to ``PCA.fit_transform``.
        labels: per-sample values used as the scatter colors.
        title: text placed above the plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    projected = PCA(n_components=2).fit_transform(X)
    first_component, second_component = projected[:, 0], projected[:, 1]
    plt.scatter(first_component, second_component, c=labels, cmap='viridis')
    plt.title(title)
    plt.show()

def evaluate_clustering(X, labels, method_name):
    """Compute and print three internal clustering metrics for one labeling.

    All three metrics are computed before anything is printed, in the order
    silhouette, Davies-Bouldin, Calinski-Harabasz.

    Parameters:
        X: feature matrix the labels refer to.
        labels: cluster label for each row of ``X``.
        method_name: name shown in parentheses in each printed line.

    Returns:
        Tuple ``(silhouette, davies_bouldin, calinski_harabasz)``.
    """
    metric_functions = (silhouette_score, davies_bouldin_score, calinski_harabasz_score)
    silhouette, davies_bouldin, calinski_harabasz = (
        metric(X, labels) for metric in metric_functions
    )

    print(f"Silhouette Score ({method_name}): {silhouette:.3f}")
    print(f"Davies-Bouldin Score ({method_name}): {davies_bouldin:.3f}")
    print(f"Calinski-Harabasz Score ({method_name}): {calinski_harabasz:.3f}")

    return (silhouette, davies_bouldin, calinski_harabasz)

In [7]:
# K-Means Clustering
def kmeans_segmentation(X, k=3):
    """Cluster ``X`` with K-Means.

    Parameters:
        X: feature matrix to cluster.
        k: number of clusters requested (default 3).

    Returns:
        Tuple ``(labels, inertia)`` where ``inertia`` is the fitted model's
        ``inertia_`` attribute. The model uses a fixed ``random_state`` of 42.
    """
    model = KMeans(n_clusters=k, random_state=42)
    cluster_ids = model.fit_predict(X)
    return cluster_ids, model.inertia_

In [8]:
# Hierarchical Clustering
def hierarchical_segmentation(X, n_clusters=3):
    """Cluster ``X`` with SciPy's Ward linkage, cut into flat clusters.

    Parameters:
        X: feature matrix to cluster.
        n_clusters: upper limit passed to ``fcluster`` with the
            ``'maxclust'`` criterion (default 3).

    Returns:
        Tuple ``(labels, None)``; the second slot is None because this helper
        reports no inertia-like value.

    NOTE(review): SciPy's flat-cluster numbering presumably starts at 1, unlike
    the scikit-learn based helpers in this notebook - confirm if label values
    are ever compared across methods.
    """
    ward_tree = linkage(X, method='ward')
    flat_labels = fcluster(ward_tree, n_clusters, criterion='maxclust')
    return flat_labels, None

In [9]:
# DBSCAN Clustering
def dbscan_segmentation(X, eps=0.5, min_samples=5):
    """Cluster ``X`` with DBSCAN.

    Parameters:
        X: feature matrix to cluster.
        eps: forwarded to ``DBSCAN(eps=...)`` (default 0.5).
        min_samples: forwarded to ``DBSCAN(min_samples=...)`` (default 5).

    Returns:
        Tuple ``(labels, None)``; DBSCAN provides no inertia value here.
    """
    cluster_ids = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
    return cluster_ids, None

In [10]:
# Gaussian Mixture Model Clustering
def gmm_segmentation(X, n_components=3):
    """Cluster ``X`` with a Gaussian mixture model.

    Parameters:
        X: feature matrix to cluster.
        n_components: number of mixture components (default 3).

    Returns:
        Tuple ``(labels, bic)`` where ``bic`` is ``gmm.bic(X)`` evaluated on
        the same data after fitting; it stands in as the comparative metric.
        The model uses a fixed ``random_state`` of 42.
    """
    mixture = GaussianMixture(n_components=n_components, random_state=42)
    assignments = mixture.fit_predict(X)
    bic_value = mixture.bic(X)
    return assignments, bic_value

In [11]:
# Mean Shift Clustering
def meanshift_segmentation(X):
    """Cluster ``X`` with Mean Shift using the estimator's default settings.

    Parameters:
        X: feature matrix to cluster.

    Returns:
        Tuple ``(labels, None)``; Mean Shift provides no inertia value here.
    """
    cluster_ids = MeanShift().fit_predict(X)
    return cluster_ids, None

In [12]:
# Agglomerative Clustering
def agglomerative_segmentation(X, n_clusters=3):
    """Cluster ``X`` with scikit-learn's AgglomerativeClustering.

    Parameters:
        X: feature matrix to cluster.
        n_clusters: number of clusters requested (default 3).

    Returns:
        Tuple ``(labels, None)``; no inertia value is reported here.
    """
    model = AgglomerativeClustering(n_clusters=n_clusters)
    return model.fit_predict(X), None

In [13]:
# BIRCH Clustering
def birch_segmentation(X, n_clusters=3):
    """Cluster ``X`` with BIRCH.

    Parameters:
        X: feature matrix to cluster.
        n_clusters: forwarded to ``Birch(n_clusters=...)`` (default 3).

    Returns:
        Tuple ``(labels, None)``; BIRCH provides no inertia value here.
    """
    cluster_ids = Birch(n_clusters=n_clusters).fit_predict(X)
    return cluster_ids, None

In [14]:
# Spectral Clustering
def spectral_segmentation(X, n_clusters=3):
    """Cluster ``X`` with spectral clustering.

    Parameters:
        X: feature matrix to cluster.
        n_clusters: number of clusters requested (default 3).

    Returns:
        Tuple ``(labels, None)``; no inertia value is reported here.
        The model uses a fixed ``random_state`` of 42.
    """
    model = SpectralClustering(n_clusters=n_clusters, random_state=42)
    cluster_ids = model.fit_predict(X)
    return cluster_ids, None

In [15]:
# OPTICS Clustering
def optics_segmentation(X, min_samples=5):
    """Cluster ``X`` with OPTICS.

    Parameters:
        X: feature matrix to cluster.
        min_samples: forwarded to ``OPTICS(min_samples=...)`` (default 5).

    Returns:
        Tuple ``(labels, None)``; OPTICS provides no inertia value here.
    """
    cluster_ids = OPTICS(min_samples=min_samples).fit_predict(X)
    return cluster_ids, None

In [16]:
# Affinity Propagation Clustering
def affinity_propagation_segmentation(X):
    """Cluster ``X`` with Affinity Propagation.

    Parameters:
        X: feature matrix to cluster.

    Returns:
        Tuple ``(labels, None)``; no inertia value is reported here.
        The model uses a fixed ``random_state`` of 42.
    """
    model = AffinityPropagation(random_state=42)
    return model.fit_predict(X), None

In [17]:
# Registry of clustering methods: display name -> segmentation function.
# Insertion order is the order in which the methods are run and reported.
# NOTE(review): 'Hierarchical' and 'Agglomerative' report identical scores in
# the recorded output; presumably both resolve to Ward linkage - confirm
# whether both entries are needed in the comparison.
methods = dict([
    ('K-Means', kmeans_segmentation),
    ('Hierarchical', hierarchical_segmentation),
    ('DBSCAN', dbscan_segmentation),
    ('GMM', gmm_segmentation),
    ('Mean Shift', meanshift_segmentation),
    ('Agglomerative', agglomerative_segmentation),
    ('BIRCH', birch_segmentation),
    ('Spectral', spectral_segmentation),
    ('OPTICS', optics_segmentation),
    ('Affinity Propagation', affinity_propagation_segmentation),
])

In [18]:
# Collect results: per-method labels plus one dict per evaluation metric,
# all keyed by the method's display name.
results = {}
silhouette_scores = {}
davies_bouldin_scores = {}
calinski_harabasz_scores = {}

for method_name, method_func in methods.items():
    # The second return value (inertia / BIC / None) is not used in this comparison.
    labels, _ = method_func(X_scaled)

    # The metrics need at least 2 distinct labels and fewer distinct labels
    # than samples; silhouette_score raises ValueError outside that range,
    # which would abort the whole comparison. Skip such labelings instead.
    # NOTE(review): labels are passed through unfiltered, so a noise label
    # (-1 from DBSCAN/OPTICS) is counted and scored as one more cluster.
    unique_labels = np.unique(labels)
    if not 2 <= len(unique_labels) <= len(labels) - 1:
        print(f"Warning: {method_name} produced {len(unique_labels)} labels. Skipping evaluation.")
        continue

    results[method_name] = labels
    (silhouette_scores[method_name],
     davies_bouldin_scores[method_name],
     calinski_harabasz_scores[method_name]) = evaluate_clustering(X_scaled, labels, method_name)




Silhouette Score (K-Means): 0.358
Davies-Bouldin Score (K-Means): 1.033
Calinski-Harabasz Score (K-Means): 101.530
Silhouette Score (Hierarchical): 0.321
Davies-Bouldin Score (Hierarchical): 1.128
Calinski-Harabasz Score (Hierarchical): 88.102
Silhouette Score (DBSCAN): 0.185
Davies-Bouldin Score (DBSCAN): 1.757
Calinski-Harabasz Score (DBSCAN): 34.071
Silhouette Score (GMM): 0.335
Davies-Bouldin Score (GMM): 1.019
Calinski-Harabasz Score (GMM): 90.864
Silhouette Score (Agglomerative): 0.321
Davies-Bouldin Score (Agglomerative): 1.128
Calinski-Harabasz Score (Agglomerative): 88.102
Silhouette Score (BIRCH): 0.266
Davies-Bouldin Score (BIRCH): 1.061
Calinski-Harabasz Score (BIRCH): 63.583




Silhouette Score (Spectral): 0.353
Davies-Bouldin Score (Spectral): 0.993
Calinski-Harabasz Score (Spectral): 99.602
Silhouette Score (OPTICS): -0.063
Davies-Bouldin Score (OPTICS): 1.399
Calinski-Harabasz Score (OPTICS): 12.523
Silhouette Score (Affinity Propagation): 0.369
Davies-Bouldin Score (Affinity Propagation): 0.949
Calinski-Harabasz Score (Affinity Propagation): 128.602
