# DBSCAN Clustering with Feature Engineering, PCA, and Analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan

# Use seaborn's "whitegrid" style for the plots drawn in this notebook.
sns.set(style="whitegrid")


## Load and Prepare Dataset

In [None]:
# Customer-level table; every column that survives this cell is used as a clustering feature.
df = pd.read_csv('data/clean/order_klav_merge_customerLevel.csv')

# Most frequent locations: the 5 most common cities and the 3 most common countries.
top_cities = df['Recent City'].value_counts().head(5)
top_countries = df['Recent Country'].value_counts().head(3)

# Collapse every value outside the most frequent ones (including missing values)
# into a single 'Other' bucket so the one-hot encoding stays small.
for column, frequent in (('Recent City', top_cities), ('Recent Country', top_countries)):
    is_frequent = df[column].isin(frequent.index)
    df[column] = df[column].where(is_frequent, 'Other')

# One-hot encode both location columns; drop_first=True omits one level per column.
df = pd.get_dummies(df, columns=['Recent City', 'Recent Country'], drop_first=True)

# Columns considered irrelevant or redundant for clustering.
columns_to_drop = [
    'CLV',
    'Email',
    'List SKU',
    'Last Source New',
    'Initial Source New',
    'Days since Date Added',
    'Max Amount Orders',
    'Max item amount',
    'Days since Profile Created On',
    'PayMeth_Card',
    'PayMeth_Other',
    'Always Free Shipping',
    'Never Free Shipping',
]
df = df.drop(columns=columns_to_drop)


## Scale and Apply PCA

In [None]:
# Build the feature matrix from every column except the cluster labels.
# The DBSCAN cell writes 'dbscan_cluster' back into df, so re-running this cell
# afterwards would otherwise feed the previous labels in as a feature.
# errors='ignore' keeps the first run (no label column yet) working.
features = df.drop(columns=['dbscan_cluster'], errors='ignore')
X = features.values

# Standardise the features so no single column dominates the PCA projection.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project onto the first 10 principal components; clustering runs in this space.
pca = PCA(n_components=10)
X_reduced = pca.fit_transform(X_scaled)


## DBSCAN Clustering

In [None]:
# Fit DBSCAN on the PCA-reduced features; points labelled -1 are noise.
dbscan = DBSCAN(eps=0.4, min_samples=40)
dbscan.fit(X_reduced)
dbscan_labels = dbscan.labels_

# Attach the labels to the customer table for the per-cluster summaries.
df['dbscan_cluster'] = dbscan_labels


## Cluster Summary

In [None]:
# Number of customers per DBSCAN label, ordered by label.
cluster_sizes = df['dbscan_cluster'].value_counts().sort_index()

# Per-cluster mean of every numeric column (2 decimals), with the cluster size
# attached; both are indexed by label, so the sizes align automatically.
cluster_summary = (
    df.groupby('dbscan_cluster')
    .mean(numeric_only=True)
    .round(2)
    .assign(**{'Cluster Size': cluster_sizes})
    .reset_index()
)

# Persist the full summary (noise cluster included).
cluster_summary.to_csv("comparison_with_noise_cluster.csv", index=False)

# Key metrics to display, largest cluster first.
selected_columns = [
    'dbscan_cluster',
    'Cluster Size',
    'Nb Orders',
    'Amount Orders',
    'Avg Amount Orders',
    'DaysSinceRecentOrder',
    'Days since Last Active',
    'click',
    'open',
    'Accepts Marketing',
]
cluster_summary.sort_values(by="Cluster Size", ascending=False)[selected_columns]


## Evaluate Clustering

In [None]:
# Score only the clustered points; DBSCAN marks noise with the label -1.
valid = dbscan_labels != -1

# silhouette_score needs 2 <= n_labels <= n_samples - 1 and raises ValueError
# otherwise, so "at least one non-noise point" is not a sufficient check
# (e.g. a single cluster plus noise would crash).
n_dbscan_clusters = np.unique(dbscan_labels[valid]).size
if 2 <= n_dbscan_clusters < np.count_nonzero(valid):
    sil = silhouette_score(X_reduced[valid], dbscan_labels[valid])
    print(f"DBSCAN Silhouette Score (no noise): {sil:.4f}")
else:
    print("No valid clusters for silhouette score.")


## Visualize Clusters in PCA Space

In [None]:
# Scatter the first two principal components, coloured by DBSCAN label
# (noise, label -1, is coloured like any other label).
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=dbscan_labels, cmap='tab10', s=10)
ax.set(title="DBSCAN Clustering (PCA 2D)", xlabel="PCA 1", ylabel="PCA 2")
ax.grid(True)
fig.tight_layout()
plt.show()


## Optional: HDBSCAN Comparison

In [None]:
# Cluster the same PCA-reduced features with HDBSCAN for comparison.
clusterer = hdbscan.HDBSCAN(min_cluster_size=350, min_samples=5, prediction_data=True)
hdb_labels = clusterer.fit_predict(X_reduced)

# Evaluate on clustered points only; HDBSCAN marks noise with the label -1.
valid = hdb_labels != -1

# silhouette_score needs 2 <= n_labels <= n_samples - 1 and raises ValueError
# otherwise, so guard against the "single cluster plus noise" outcome.
n_hdb_clusters = np.unique(hdb_labels[valid]).size
if 2 <= n_hdb_clusters < np.count_nonzero(valid):
    sil_score = silhouette_score(X_reduced[valid], hdb_labels[valid])
    print(f"HDBSCAN Silhouette Score (excluding noise): {sil_score:.4f}")
else:
    print("No valid clusters to evaluate.")

# Cluster distribution (noise included). Cast to plain ints so the printed dict
# reads {-1: 120, 0: 900} rather than NumPy scalar reprs.
unique, counts = np.unique(hdb_labels, return_counts=True)
print(
    "HDBSCAN cluster distribution:",
    {int(label): int(count) for label, count in zip(unique, counts)},
)
