<a href="https://colab.research.google.com/github/Layaa-V/Customer-Segmentation-Analysis/blob/main/Clustering_Comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score

In [None]:
# Text file with one "<method> = <comma-separated labels>" line per clustering method
filetxt = 'all_outputs.txt'
# Maps each method name to its array of cluster labels (filled in by the parsing cell below)
results = {}

In [None]:
# Parse every "<method> = <label>,<label>,..." line of the results file into
# an integer label array stored under results[<method>].
with open(filetxt, 'r') as f:
    for line in f:
        # Split on the first '=' only, so a later '=' cannot break the unpacking.
        method, sep, raw_labels = line.partition('=')
        if not sep:
            # Lines without '=' (blank lines, headers) carry no labels.
            continue
        method = method.strip()

        # Drop empty tokens left behind by a trailing or doubled comma;
        # int() already tolerates surrounding whitespace and the newline.
        tokens = [tok for tok in raw_labels.split(',') if tok.strip()]
        if not tokens:
            print(f"Skipping {method}: no labels found.")
            continue

        results[method] = np.array([int(tok) for tok in tokens])

In [None]:
# Display the parsed label arrays, keyed by clustering method
results

{'DBSCAN_result': array([1, 1, 1, ..., 1, 1, 1]),
 'Hierarchical_result': array([2, 2, 2, ..., 2, 2, 2]),
 'KMeans_result': array([2, 0, 2, ..., 2, 1, 0])}

In [None]:
# Load the feature table that the cluster labels are scored against below
df = pd.read_csv('cleaned_customer_data.csv')

In [None]:
# Preview the first rows of the loaded table
df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24
0,4.440072,-0.441154,-1.212194,-2.921589,1.702014,-0.933004,1.204538,-0.908896,0.584208,1.214454,...,1.333772,-0.299663,-0.151714,-0.972105,-0.239929,0.422676,-0.889899,-0.19914,-1.303815,-0.180862
1,-2.963183,0.130282,-0.043969,1.508977,-0.399798,1.89556,0.422089,1.63355,1.253528,0.663276,...,0.497868,-0.7432,-0.008859,-0.445665,-0.074259,-0.454633,-0.287264,-0.101832,-0.248411,-0.013024
2,2.041758,-1.179137,-1.362415,0.143487,0.277441,-0.888433,-0.274979,0.218121,-0.220829,-0.712737,...,0.002531,0.258796,-0.252426,-0.231615,-0.46543,-0.557716,-0.746706,-0.298896,1.099553,-0.284858
3,-2.699342,-1.587484,0.451775,0.307774,-0.899329,0.957434,-1.705891,0.623604,0.450687,-0.382943,...,0.401572,-0.454179,-0.21332,-0.409084,-0.136673,-0.008345,0.09032,-0.239212,0.048614,-0.406167
4,-0.752083,0.122107,-0.662017,-0.228723,-0.716655,0.026204,-1.004335,-1.656556,-0.707655,0.603419,...,0.674702,0.486841,-0.276584,-0.213139,0.097726,0.258561,0.26693,0.43967,0.753134,0.462823


In [None]:
# Feature matrix as a NumPy array: one row per sample, one column per feature
X = df.to_numpy()

# Separation/Spread Ratio

In [None]:
def ss_ratio(X, labels):
    """Return the separation-to-spread ratio of a clustering.

    Spread is the mean, over clusters, of the average Euclidean distance
    from each point to its cluster centroid. Separation is the mean
    Euclidean distance over all pairs of cluster centroids. Points labelled
    -1 (noise) are ignored. Larger values mean the centroids are far apart
    relative to how widely the points scatter around them.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    labels : array-like of shape (n_samples,)
        Cluster label of every row of ``X``; -1 marks noise.

    Returns
    -------
    float
        ``separation / spread``. ``np.inf`` when the spread is exactly 0
        (every cluster collapses onto its centroid). ``np.nan`` when fewer
        than two non-noise clusters exist, because separation is undefined.
    """
    # Accept plain lists as well as arrays; element-wise comparison and
    # boolean indexing below both need ndarrays.
    X = np.asarray(X)
    labels = np.asarray(labels)

    # Filter out noise points.
    keep = labels != -1
    X_filtered = X[keep]
    labels_filtered = labels[keep]

    unique_labels = np.unique(labels_filtered)
    n_clusters = len(unique_labels)
    if n_clusters < 2:
        # No centroid pairs exist, so the ratio is undefined (this used to
        # end in a ZeroDivisionError).
        return np.nan

    centroids = []
    all_cluster_spreads = []
    for label in unique_labels:
        cluster_points = X_filtered[labels_filtered == label]

        # Centroid = mean point of the cluster.
        centroid = np.mean(cluster_points, axis=0)
        centroids.append(centroid)

        # Spread of this cluster = mean distance of its points to the centroid.
        distances = np.linalg.norm(cluster_points - centroid, axis=1)
        all_cluster_spreads.append(np.mean(distances))

    # Overall spread: unweighted mean of the per-cluster spreads.
    avg_spread = np.mean(all_cluster_spreads)

    # Overall separation: mean distance over every unordered centroid pair.
    pair_distances = [
        np.linalg.norm(centroids[i] - centroids[j])
        for i in range(n_clusters)
        for j in range(i + 1, n_clusters)
    ]
    avg_separation = sum(pair_distances) / len(pair_distances)

    if avg_spread == 0:
        # Avoid dividing by zero when every cluster is a single (repeated) point.
        return np.inf

    return avg_separation / avg_spread

In [None]:
# Report the separation/spread ratio of every clustering method in turn
for method in results:
    ratio = ss_ratio(X, results[method])
    print(f'Separation to Spread Ratio for {method} is {ratio:.4f}')

Separation to Spread Ratio for DBSCAN_result is 2.3936
Separation to Spread Ratio for Hierarchical_result is 2.4407
Separation to Spread Ratio for KMeans_result is 1.1649


# Calinski-Harabasz (CH) Index

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import calinski_harabasz_score

# Reload the feature table and keep only numeric columns, minus any ID-like column.
DATA_PATH = 'cleaned_customer_data.csv'
df = pd.read_csv(DATA_PATH)
numeric_df = df.select_dtypes(include=[np.number])
id_columns = [
    col for col in ("CustomerID", "customer_id", "ID", "Id", "id")
    if col in numeric_df.columns
]
numeric_df = numeric_df.drop(columns=id_columns)

# NOTE: X is rebound here to the standardized features. Every later cell that
# reads X (the Davies-Bouldin section does) sees this scaled matrix, whereas
# the separation/spread ratio above was computed on the unscaled values.
scaler = StandardScaler()
X = scaler.fit_transform(numeric_df.values)
print("Data loaded and scaled. Shape:", X.shape)

# Align the label arrays with the data length. Extra trailing labels are
# dropped (and reported, so the mismatch is visible); too few labels cannot be
# aligned to the rows and is treated as an error.
for key in list(results.keys()):
    n_labels = len(results[key])
    if n_labels > len(X):
        print(f"Warning: truncating {key} from {n_labels} to {len(X)} labels to match the data.")
        results[key] = results[key][:len(X)]
    elif n_labels < len(X):
        raise ValueError(
            f"{key} has {n_labels} labels but the data has {len(X)} rows."
        )

# Compute the Calinski-Harabasz index for each method. Noise points (label -1,
# as produced by DBSCAN) are excluded so they are not scored as a cluster of
# their own, matching how the other metrics in this notebook treat noise.
scores = {}
for method, labels in results.items():
    keep = labels != -1
    # The index is undefined with fewer than two clusters.
    if len(np.unique(labels[keep])) < 2:
        print(f"Skipping {method}: only one cluster detected.")
        continue
    ch_score = calinski_harabasz_score(X[keep], labels[keep])
    scores[method] = ch_score
    print(f"{method}: CH Index = {ch_score:.3f}")

# Show all results as a DataFrame, best (highest) score first.
scores_df = pd.DataFrame(
    list(scores.items()),
    columns=["Clustering Method", "Calinski–Harabasz Score"],
)
scores_df = scores_df.sort_values(
    by="Calinski–Harabasz Score", ascending=False
).reset_index(drop=True)

print("\nClustering Comparison (Calinski–Harabasz Index)")
display(scores_df)


Data loaded and scaled. Shape: (2240, 24)
DBSCAN_result: CH Index = 78.566
Hierarchical_result: CH Index = 97.278
KMeans_result: CH Index = 86.743

Clustering Comparison (Calinski–Harabasz Index)


Unnamed: 0,Clustering Method,Calinski–Harabasz Score
0,Hierarchical_result,97.278213
1,KMeans_result,86.743005
2,DBSCAN_result,78.566374


# Davies-Bouldin Index (DBI)

In [None]:
def davies_bouldin_index(X, labels):
    """Return the Davies-Bouldin index of a clustering (lower is better).

    For every cluster the scatter ``S`` is the mean Euclidean distance of its
    points to the cluster centroid. For each pair of clusters the similarity
    is ``(S_i + S_j) / distance(centroid_i, centroid_j)``; the index is the
    mean, over clusters, of each cluster's largest similarity to any other
    cluster. Points with a negative label (DBSCAN noise, -1) are ignored.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    labels : array-like of shape (n_samples,)
        Cluster label of every row of ``X``; negative labels mark noise.

    Returns
    -------
    float
        The index, or ``np.nan`` when fewer than two non-noise clusters
        exist (the index is undefined there; returning 0 would read as a
        perfect score).
    """
    # Accept plain lists as well as arrays; the comparisons below are
    # element-wise only for ndarrays.
    X = np.asarray(X)
    labels = np.asarray(labels)

    unique_labels = np.unique(labels[labels >= 0])  # ignore DBSCAN noise (-1)
    k = len(unique_labels)
    if k < 2:
        return np.nan

    # Every label comes from np.unique, so each cluster has at least one point.
    members = [X[labels == lbl] for lbl in unique_labels]
    centroids = np.array([pts.mean(axis=0) for pts in members])
    S = np.array([
        np.mean(np.linalg.norm(pts - centroid, axis=1))
        for pts, centroid in zip(members, centroids)
    ])

    # Pairwise centroid distances. The diagonal is set to inf so a cluster's
    # similarity to itself becomes 0 and never wins the max below.
    # Two distinct clusters with identical centroids still divide by zero
    # here and produce inf/nan, as in the original implementation.
    M = np.linalg.norm(centroids[:, None] - centroids, axis=2)
    np.fill_diagonal(M, np.inf)
    R = (S[:, None] + S) / M
    D = np.max(R, axis=1)
    return np.mean(D)


def _dbi_row(method, labels):
    """Build one result-table row holding the Davies-Bouldin index of `method`."""
    mask = labels >= 0  # leave DBSCAN noise points out of the computation
    dbi = davies_bouldin_index(X[mask], labels[mask])
    return {
        "Clustering Method": method.replace("_result", ""),
        "Davies–Bouldin Index": round(dbi, 4)
    }


# One row per clustering method, in the order the methods were parsed
records = [_dbi_row(method, labels) for method, labels in results.items()]

# Show the collected rows as a table
df_results = pd.DataFrame(records)
print("\nClustering Comparison (Davies–Bouldin Index)")
display(df_results)



Clustering Comparison (Davies–Bouldin Index)


Unnamed: 0,Clustering Method,Davies–Bouldin Index
0,DBSCAN,1.0093
1,Hierarchical,0.8194
2,KMeans,3.4313
