# CUSTO CLARITY - Customer Segmentation Analysis
## 02. Clustering Analysis & Customer Segmentation

**Author**: Neelanjan Chakraborty  
**Website**: [neelanjanchakraborty.in](https://neelanjanchakraborty.in/)  
**Project**: Customer Segmentation for Retail Strategy  

---

### 📋 Notebook Overview

This notebook implements comprehensive clustering analysis using multiple algorithms including K-Means, DBSCAN, and Hierarchical clustering. We'll also apply dimensionality reduction techniques (PCA, t-SNE) and evaluate cluster quality.

### 🎯 Objectives
- Preprocess data for clustering analysis
- Apply dimensionality reduction techniques
- Implement multiple clustering algorithms
- Evaluate and compare clustering results
- Generate business insights from customer segments

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import sys
import os

# Sklearn imports
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score

# Suppress warnings
# NOTE(review): 'ignore' with no category silences every warning for the rest
# of the session, not just noisy ones — narrow the filter if any warnings from
# the libraries used below should stay visible.
warnings.filterwarnings('ignore')

# Add src directory to path
# The location is built from the current working directory, so the project
# imports below only resolve when the notebook is started from a directory
# that sits next to `src`.
sys.path.append(os.path.join(os.getcwd(), '..', 'src'))

# Import custom modules
# These are project-local modules; they must be imported after the sys.path
# change above.
from data_loader import DataLoader
from preprocessor import CustomerDataPreprocessor
from clustering import CustomerClusteringAnalyzer
from visualizer import CustomerVisualizationSuite

# Set plot style
# Global matplotlib/seaborn defaults applied to the plots in this notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation banner so a successful setup is visible in the cell output.
print("✅ Libraries imported successfully!")
print("🚀 Ready for Clustering Analysis!")

In [None]:
# Load and Preprocess Data
# Reads the customer dataset through the project loader, runs the
# preprocessor's clustering preparation on three numeric columns, and prints
# a short report of what was produced.
print("📊 LOADING AND PREPROCESSING DATA")
print("=" * 40)

# Raw data as returned by the loader
loader = DataLoader()
df = loader.load_dataset()

print(f"Original dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Preprocessor instance; kept around because its summary and feature names
# are reported further down.
preprocessor = CustomerDataPreprocessor()

# Columns handed to the preparation step, with standard scaling requested
clustering_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
df_processed, X = preprocessor.prepare_for_clustering(
    df,
    target_columns=clustering_columns,
    scaling_method='standard',
)

print(f"\n✅ Preprocessing completed!")
print(f"Processed dataset shape: {df_processed.shape}")
print(f"Feature matrix shape: {X.shape}")
print(f"Features used: {preprocessor.feature_names}")

# List each step the preprocessor reports having performed
summary = preprocessor.get_preprocessing_summary()
print(f"\n📋 Preprocessing Summary:")
for performed_step in summary['steps_performed']:
    print(f"  • {performed_step}")

In [None]:
# Dimensionality Reduction Analysis
# Asks the project's clustering analyzer for PCA and t-SNE embeddings of the
# feature matrix X and prints a summary of each embedding that was returned.
print("📉 DIMENSIONALITY REDUCTION ANALYSIS")
print("=" * 45)

# Initialize clustering analyzer (the same instance is used by later cells)
analyzer = CustomerClusteringAnalyzer()

# Apply dimensionality reduction
reduction_results = analyzer.apply_dimensionality_reduction(X, methods=['pca', 'tsne'])

# PCA Analysis
if 'pca' in reduction_results:
    pca_data = reduction_results['pca']['data']
    pca_model = reduction_results['pca']['model']
    explained_variance = reduction_results['pca']['explained_variance_ratio']
    total_variance = reduction_results['pca']['total_variance_explained']

    print(f"🔍 PCA Results:")
    # Report every returned component instead of assuming exactly two, so a
    # different component count does not raise IndexError or drop output.
    for pc_number, ratio in enumerate(explained_variance, start=1):
        print(f"  • PC{pc_number} Explained Variance: {ratio:.3f} ({ratio*100:.1f}%)")
    print(f"  • Total Variance Explained: {total_variance:.3f} ({total_variance*100:.1f}%)")

    # Show PCA components
    # NOTE(review): the column labels assume the feature matrix holds exactly
    # these three features in this order — confirm against the preprocessor.
    feature_names = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
    # One row label per fitted component so the index always matches the
    # number of rows in components_.
    pc_labels = [f"PC{i}" for i in range(1, len(pca_model.components_) + 1)]
    components_df = pd.DataFrame(
        pca_model.components_,
        columns=feature_names,
        index=pc_labels,
    )
    print(f"\n📊 PCA Components:")
    print(components_df.round(3))

# t-SNE Analysis
if 'tsne' in reduction_results:
    tsne_data = reduction_results['tsne']['data']
    print(f"\n🔍 t-SNE Results:")
    print(f"  • t-SNE embedding shape: {tsne_data.shape}")
    print(f"  • Applied for non-linear dimensionality reduction")

print(f"\n✅ Dimensionality reduction completed!")

In [None]:
# Optimal Cluster Analysis - Elbow Method & Silhouette Analysis
# Runs the analyzer's K-Means sweep (max_clusters=10), prints the suggested
# values of k, plots the elbow/silhouette curves and tabulates per-k metrics.
print("🎯 FINDING OPTIMAL NUMBER OF CLUSTERS")
print("=" * 45)

# Sweep candidate cluster counts
optimal_results = analyzer.find_optimal_clusters_kmeans(X, max_clusters=10)

print(f"📊 Cluster Optimization Results:")
print(f"  • Elbow Method Optimal k: {optimal_results['elbow_optimal_k']}")
print(f"  • Silhouette Analysis Optimal k: {optimal_results['silhouette_optimal_k']}")
print(f"  • Recommended k: {optimal_results['recommended_k']}")

# Plotting helper shared with the cells that follow
viz_suite = CustomerVisualizationSuite()

# The same three series feed both the plot and the metrics table
k_values = optimal_results['k_range']
inertia_values = optimal_results['inertias']
silhouette_values = optimal_results['silhouette_scores']
viz_suite.plot_elbow_analysis(k_values, inertia_values, silhouette_values)

# Per-k metrics, one row per candidate cluster count
print(f"\n📈 Detailed Metrics by Number of Clusters:")
metric_columns = {
    'k': k_values,
    'Inertia': inertia_values,
    'Silhouette Score': silhouette_values,
    'Calinski-Harabasz': optimal_results['calinski_harabasz_scores'],
    'Davies-Bouldin': optimal_results['davies_bouldin_scores'],
}
metrics_df = pd.DataFrame(metric_columns)
print(metrics_df.round(3))

In [None]:
# K-Means Clustering Analysis
# Fits K-Means with the recommended k, prints the quality metrics and the
# size of each cluster, then draws 2D and 3D scatter plots of the result.
print("🔍 K-MEANS CLUSTERING ANALYSIS")
print("=" * 35)

# Use the k recommended by the optimisation step
optimal_k = optimal_results['recommended_k']
kmeans_results = analyzer.perform_kmeans_clustering(X, n_clusters=optimal_k)

print(f"📊 K-Means Results (k={optimal_k}):")
print(f"  • Silhouette Score: {kmeans_results['silhouette_score']:.3f}")
print(f"  • Calinski-Harabasz Score: {kmeans_results['calinski_harabasz_score']:.3f}")
print(f"  • Davies-Bouldin Score: {kmeans_results['davies_bouldin_score']:.3f}")
print(f"  • Inertia: {kmeans_results['inertia']:.3f}")

# Per-sample cluster assignments
kmeans_labels = kmeans_results['labels']
unique_labels = np.unique(kmeans_labels)

# Size and share of every cluster, ordered by cluster id
print(f"\n🏷️ Cluster Distribution:")
cluster_counts = pd.Series(kmeans_labels).value_counts().sort_index()
n_labelled = len(kmeans_labels)
for cluster_id, n_members in cluster_counts.items():
    share = n_members / n_labelled * 100
    print(f"  • Cluster {cluster_id}: {n_members} customers ({share:.1f}%)")

# Axis labels for the plots below
feature_names = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

# 2D view: columns 1 and 2 of X (labelled as income and spending score)
income_spending = X[:, [1, 2]]
viz_suite.plot_cluster_analysis_2d(
    income_spending,
    kmeans_labels,
    title="K-Means Customer Segmentation (Income vs Spending)",
    feature_names=['Annual Income (k$)', 'Spending Score (1-100)'],
)

# 3D view over the full feature matrix
viz_suite.plot_cluster_analysis_3d(
    X,
    kmeans_labels,
    title="K-Means Customer Segmentation 3D",
    feature_names=feature_names,
)

print(f"✅ K-Means clustering visualization completed!")

In [None]:
# DBSCAN Clustering Analysis
# Tunes eps/min_samples through the analyzer, runs DBSCAN with the best
# parameters (or the analyzer's defaults when none were found), and reports
# cluster/noise counts plus a 2D scatter plot when clusters exist.
print("🔍 DBSCAN CLUSTERING ANALYSIS")
print("=" * 32)

# Optimize DBSCAN parameters
print("⚙️ Optimizing DBSCAN Parameters...")
dbscan_optimization = analyzer.optimize_dbscan_parameters(X)

if dbscan_optimization['best_params']:
    best_eps = dbscan_optimization['best_params']['eps']
    best_min_samples = dbscan_optimization['best_params']['min_samples']
    print(f"  • Best eps: {best_eps}")
    print(f"  • Best min_samples: {best_min_samples}")
    print(f"  • Best silhouette score: {dbscan_optimization['best_score']:.3f}")

    # Perform DBSCAN with optimized parameters
    dbscan_results = analyzer.perform_dbscan_clustering(X, eps=best_eps, min_samples=best_min_samples)
else:
    # No usable parameter set was reported; fall back to the analyzer defaults
    print("  • Using default parameters")
    dbscan_results = analyzer.perform_dbscan_clustering(X)

print(f"\n📊 DBSCAN Results:")
print(f"  • Number of Clusters: {dbscan_results['n_clusters']}")
print(f"  • Number of Noise Points: {dbscan_results['n_noise']}")
print(f"  • Noise Percentage: {dbscan_results['n_noise']/len(X)*100:.1f}%")
# Compare against None explicitly: a silhouette score of exactly 0.0 is a
# valid result and must not be hidden by a truthiness test.
if dbscan_results['silhouette_score'] is not None:
    print(f"  • Silhouette Score: {dbscan_results['silhouette_score']:.3f}")

# Get DBSCAN labels
dbscan_labels = dbscan_results['labels']

if dbscan_results['n_clusters'] > 0:
    print(f"\n🏷️ DBSCAN Cluster Distribution:")
    dbscan_cluster_counts = pd.Series(dbscan_labels).value_counts().sort_index()
    for cluster, count in dbscan_cluster_counts.items():
        percentage = count / len(dbscan_labels) * 100
        # Label -1 is reported as noise rather than as a cluster
        cluster_name = "Noise" if cluster == -1 else f"Cluster {cluster}"
        print(f"  • {cluster_name}: {count} customers ({percentage:.1f}%)")

    # Visualize DBSCAN clusters if valid clusters found
    viz_suite.plot_cluster_analysis_2d(
        X[:, [1, 2]],  # Income and Spending Score
        dbscan_labels,
        title="DBSCAN Customer Segmentation (Income vs Spending)",
        feature_names=['Annual Income (k$)', 'Spending Score (1-100)']
    )
else:
    print("  ⚠️ DBSCAN did not find meaningful clusters with current parameters")

print(f"✅ DBSCAN clustering analysis completed!")

In [None]:
# Hierarchical Clustering Analysis
# Runs the analyzer's hierarchical clustering with the same k used for
# K-Means, prints its metrics and cluster sizes, and plots the result in 2D.
print("🔍 HIERARCHICAL CLUSTERING ANALYSIS")
print("=" * 40)

# Fit with the previously chosen number of clusters
hierarchical_results = analyzer.perform_hierarchical_clustering(X, n_clusters=optimal_k)

print(f"📊 Hierarchical Clustering Results (k={optimal_k}):")
print(f"  • Silhouette Score: {hierarchical_results['silhouette_score']:.3f}")
print(f"  • Calinski-Harabasz Score: {hierarchical_results['calinski_harabasz_score']:.3f}")
print(f"  • Davies-Bouldin Score: {hierarchical_results['davies_bouldin_score']:.3f}")
print(f"  • Linkage Method: {hierarchical_results['linkage']}")

# Per-sample cluster assignments
hierarchical_labels = hierarchical_results['labels']

# Size and share of every cluster, ordered by cluster id
print(f"\n🏷️ Hierarchical Cluster Distribution:")
hierarchical_cluster_counts = pd.Series(hierarchical_labels).value_counts().sort_index()
n_labelled = len(hierarchical_labels)
for cluster_id, n_members in hierarchical_cluster_counts.items():
    share = n_members / n_labelled * 100
    print(f"  • Cluster {cluster_id}: {n_members} customers ({share:.1f}%)")

# 2D view: columns 1 and 2 of X (labelled as income and spending score)
income_spending = X[:, [1, 2]]
viz_suite.plot_cluster_analysis_2d(
    income_spending,
    hierarchical_labels,
    title="Hierarchical Customer Segmentation (Income vs Spending)",
    feature_names=['Annual Income (k$)', 'Spending Score (1-100)'],
)

print(f"✅ Hierarchical clustering analysis completed!")

In [None]:
# Compare Clustering Methods
# Prints the analyzer's comparison table, picks the method with the highest
# silhouette score, and binds best_labels / best_results / best_method for
# the cells that follow. Falls back to K-Means when no score is available.
print("⚖️ CLUSTERING METHODS COMPARISON")
print("=" * 35)

# Comparison table produced by the analyzer
comparison_df = analyzer.compare_clustering_methods(X)

print("📊 Clustering Methods Performance Comparison:")
print(comparison_df.to_string(index=False))

# Only rows that actually carry a silhouette score can compete
valid_scores = comparison_df.dropna(subset=['Silhouette Score'])
if valid_scores.empty:
    # Nothing to rank: default to the K-Means run
    best_labels = kmeans_labels
    best_results = kmeans_results
    best_method = 'K-Means'
    print(f"🎯 Using K-Means results for business analysis (fallback)")
else:
    top_row = valid_scores['Silhouette Score'].idxmax()
    best_method = valid_scores.loc[top_row, 'Method']
    best_score = valid_scores['Silhouette Score'].max()
    print(f"\n🏆 Best Performing Method: {best_method} (Silhouette Score: {best_score:.3f})")

    # Map the winning method name to the labels/results computed earlier;
    # any name other than K-Means or DBSCAN resolves to the hierarchical run.
    runs_by_method = {
        'K-Means': (kmeans_labels, kmeans_results),
        'DBSCAN': (dbscan_labels, dbscan_results),
    }
    hierarchical_run = (hierarchical_labels, hierarchical_results)
    best_labels, best_results = runs_by_method.get(best_method, hierarchical_run)

    print(f"🎯 Using {best_method} results for business analysis")

print(f"✅ Clustering comparison completed!")

In [None]:
# Dimensionality Reduction Visualization
# Plots the reduced embeddings coloured by the labels of the best-performing
# clustering method.
print("📊 DIMENSIONALITY REDUCTION VISUALIZATION")
print("=" * 45)

# Collect only the embeddings that are present in reduction_results, matching
# the membership checks used where the results were first reported, so a
# missing method no longer raises KeyError here.
# NOTE(review): confirm plot_dimensionality_reduction accepts a subset of
# the two methods.
reduced_data = {
    method: reduction_results[method]['data']
    for method in ('pca', 'tsne')
    if method in reduction_results
}

if reduced_data:
    viz_suite.plot_dimensionality_reduction(
        X, reduced_data, best_labels
    )
else:
    print("  ⚠️ No dimensionality reduction results available to plot")

print(f"✅ Dimensionality reduction visualization completed!")

In [None]:
# Customer Cluster Profiling
# Builds per-cluster profiles from the original dataframe and the selected
# labels, prints the numeric profile table and cluster sizes, and plots both.
print("👥 CUSTOMER CLUSTER PROFILING")
print("=" * 32)

# Analyze cluster profiles
# NOTE(review): df is the unprocessed dataframe; this presumably relies on
# best_labels having one entry per row of df — verify that preprocessing
# does not drop rows.
feature_names = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
profiles = analyzer.analyze_cluster_profiles(df, best_labels, feature_names)

# Display numeric profiles
if 'numeric' in profiles:
    print("📊 Cluster Profiles (Numeric Features):")
    print(profiles['numeric'].round(2))

# Display cluster sizes
print(f"\n📈 Cluster Sizes:")
cluster_sizes = profiles['cluster_sizes']
cluster_percentages = profiles['cluster_percentages']
for cluster in cluster_sizes.index:
    # The selected labels may come from DBSCAN, where -1 marks noise; name it
    # the same way the DBSCAN distribution report does instead of "Cluster -1".
    cluster_name = "Noise" if cluster == -1 else f"Cluster {cluster}"
    print(f"  • {cluster_name}: {cluster_sizes[cluster]} customers ({cluster_percentages[cluster]:.1f}%)")

# Visualize cluster profiles
viz_suite.plot_cluster_profiles(profiles)
viz_suite.plot_cluster_sizes(cluster_sizes)

print(f"✅ Cluster profiling completed!")

In [None]:
# Generate Business Insights
# Asks the analyzer for a textual insight per cluster, prints each one with
# a separator, then renders the interactive dashboard.
print("💼 BUSINESS INSIGHTS GENERATION")
print("=" * 35)

# Insights keyed by cluster; only the insight text is printed
business_insights = analyzer.generate_business_insights(profiles, best_labels)

print(f"🎯 Customer Segment Analysis & Business Recommendations:")
print("=" * 60)

separator = "-" * 50
for insight_text in business_insights.values():
    print(f"\n{insight_text}")
    print(separator)

# Interactive dashboard over the original dataframe and selected labels
print(f"\n📈 Creating Interactive Business Dashboard...")
viz_suite.create_business_dashboard(df, best_labels, profiles)

print(f"✅ Business insights generation completed!")

In [None]:
# Save Results and Generate Summary Report
# Attaches the selected cluster labels to a copy of the original dataframe,
# writes it to CSV, and prints a closing summary of the analysis.
print("💾 SAVING RESULTS AND GENERATING SUMMARY")
print("=" * 45)

# Add cluster labels to original dataframe
# NOTE(review): this assignment presumably relies on best_labels having one
# entry per row of df — verify that preprocessing does not drop rows.
df_final = df.copy()
df_final['Cluster'] = best_labels
df_final['Cluster_Method'] = best_method

# Save processed data
# Single definition of the output location so the directory that is created,
# the file that is written and the message that is printed cannot drift apart.
output_path = '../outputs/customer_segments.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_final.to_csv(output_path, index=False)
print(f"✅ Customer segments saved to '{output_path}'")

# Count real segments only: when the DBSCAN labels are selected, -1 marks
# noise points and must not be reported as a customer segment.
segment_ids = np.unique(best_labels)
n_segments = int((segment_ids != -1).sum())

# Generate comprehensive summary
print(f"\n📋 CUSTO CLARITY - CLUSTERING ANALYSIS SUMMARY")
print("=" * 50)
print(f"📊 Dataset: {df.shape[0]} customers with {df.shape[1]} features")
print(f"🔍 Best Clustering Method: {best_method}")
print(f"🎯 Number of Segments: {n_segments}")
# The method selection binds best_results to the K-Means or hierarchical run
# for every method name other than 'DBSCAN', and both of those runs expose
# all three metrics, so mirror that rule here instead of matching exact names.
if best_method != 'DBSCAN':
    print(f"📈 Silhouette Score: {best_results['silhouette_score']:.3f}")
    print(f"📈 Calinski-Harabasz Score: {best_results['calinski_harabasz_score']:.3f}")
    print(f"📈 Davies-Bouldin Score: {best_results['davies_bouldin_score']:.3f}")

print(f"\n🎯 Key Business Recommendations:")
print(f"  • Implement targeted marketing strategies for each segment")
print(f"  • Develop personalized product offerings")
print(f"  • Focus on high-value customer retention")
print(f"  • Create segment-specific promotional campaigns")

print(f"\n🔮 Next Steps:")
print(f"  • Validate clusters with domain experts")
print(f"  • Implement A/B testing for segment strategies")
print(f"  • Monitor cluster stability over time")
print(f"  • Develop automated segmentation pipeline")

print(f"\n🚀 Clustering Analysis Completed Successfully!")
print(f"\n👨‍💻 Analysis by: Neelanjan Chakraborty")
print(f"🌐 Website: https://neelanjanchakraborty.in/")
print(f"📧 Contact: Available via website")