# 🎯 Customer Clustering - Model Training

This notebook trains multiple clustering models to group customers based on their features.

**Dataset Context:** Customer segmentation for marketing and business strategy

## Models to Train:
1. **K-Means Clustering** - Popular partitioning algorithm
2. **Mini-Batch K-Means** - Faster variant for large datasets
3. **Agglomerative Hierarchical Clustering** - Bottom-up approach
4. **DBSCAN** - Density-based clustering
5. **Gaussian Mixture Model (GMM)** - Probabilistic clustering
6. **Spectral Clustering** - Graph-based approach

## Evaluation Metrics:
- **Silhouette Score** - Cluster cohesion and separation (-1 to 1, higher is better)
- **Calinski-Harabasz Index** - Variance ratio (higher is better)
- **Davies-Bouldin Index** - Average similarity between clusters (lower is better)
- **Inertia** - Sum of squared distances to cluster centers (K-Means and Mini-Batch K-Means only)

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
from datetime import datetime
import time

# Clustering algorithms
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, DBSCAN, SpectralClustering
from sklearn.mixture import GaussianMixture

# Evaluation metrics
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Dimensionality reduction for visualization
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# NOTE: this silences *every* warning for the rest of the session, including
# any convergence warnings raised while fitting; narrow or remove it when
# debugging a model that behaves unexpectedly.
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'.
# Fall back to the pre-3.6 name when the new one is not available so the
# notebook does not abort on an older Matplotlib installation.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)

print("‚úì All libraries imported successfully!")

## Load Processed Data

In [None]:
# Read the pre-scaled feature table (the StandardScaler export is the
# recommended input) and report its dimensions.
print("Loading processed clustering data...\n")

df = pd.read_csv('clustering_scaled_standard.csv')

print(
    f"Dataset shape: {df.shape}",
    f"Features: {df.shape[1]}",
    f"Samples: {df.shape[0]:,}",
    "\nFirst few rows:",
    sep="\n",
)

# Last expression of the cell, so the notebook renders the preview.
df.head()

## 🔍 Determine Optimal Number of Clusters

We'll use multiple methods to find the optimal k:
1. **Elbow Method** - Find the "elbow" in inertia curve
2. **Silhouette Analysis** - Maximize silhouette score

In [None]:
# Expose every column of the loaded table as one NumPy matrix.
# NOTE(review): assumes the CSV holds only numeric feature columns (no id or
# index column) - confirm against the preprocessing step.
X = df.values

print(
    f"Data shape: {X.shape}",
    f"Features: {X.shape[1]}",
    f"Samples: {X.shape[0]:,}",
    sep="\n",
)

In [None]:
# Sweep k with K-Means and record, for every k, the inertia plus three
# internal validity scores. The four lists are index-aligned with k_range
# and are consumed by the plotting cell that follows.
print("=" * 80, "FINDING OPTIMAL NUMBER OF CLUSTERS", "=" * 80, sep="\n")

k_range = range(2, 11)
inertias, silhouette_scores = [], []
calinski_scores, davies_bouldin_scores = [], []

for k in k_range:
    print(f"\nTesting k={k}...")

    # Fixed seed so repeated runs of the sweep give the same partitions.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X)

    # Score this partition.
    inertia = kmeans.inertia_
    silhouette = silhouette_score(X, labels)
    calinski = calinski_harabasz_score(X, labels)
    davies_bouldin = davies_bouldin_score(X, labels)

    print(
        f"  Inertia: {inertia:.2f}",
        f"  Silhouette: {silhouette:.4f}",
        f"  Calinski-Harabasz: {calinski:.2f}",
        f"  Davies-Bouldin: {davies_bouldin:.4f}",
        sep="\n",
    )

    # Keep the history for the diagnostic plots.
    inertias.append(inertia)
    silhouette_scores.append(silhouette)
    calinski_scores.append(calinski)
    davies_bouldin_scores.append(davies_bouldin)

print("\n‚úì Cluster analysis complete!")

In [None]:
# Draw the four k-selection diagnostics on a 2x2 grid, one panel per metric.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# (grid position, series, line format, y-axis label, panel title)
panels = [
    ((0, 0), inertias, 'bo-',
     'Inertia (Within-cluster sum of squares)', 'Elbow Method'),
    ((0, 1), silhouette_scores, 'go-',
     'Silhouette Score', 'Silhouette Analysis (Higher is Better)'),
    ((1, 0), calinski_scores, 'mo-',
     'Calinski-Harabasz Index', 'Calinski-Harabasz Index (Higher is Better)'),
    ((1, 1), davies_bouldin_scores, 'ro-',
     'Davies-Bouldin Index', 'Davies-Bouldin Index (Lower is Better)'),
]

for position, series, line_format, y_label, title in panels:
    ax = axes[position]
    ax.plot(k_range, series, line_format, linewidth=2, markersize=8)
    ax.set_xlabel('Number of Clusters (k)', fontweight='bold')
    ax.set_ylabel(y_label, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.grid(alpha=0.3)
    ax.set_xticks(k_range)

# The recommended k is the one with the highest silhouette score; mark it on
# the silhouette panel.
best_k_silhouette = list(k_range)[np.argmax(silhouette_scores)]
silhouette_ax = axes[0, 1]
silhouette_ax.axvline(
    x=best_k_silhouette,
    color='red',
    linestyle='--',
    alpha=0.7,
    label=f'Best k={best_k_silhouette}',
)
silhouette_ax.legend()

plt.tight_layout()
plt.savefig('optimal_clusters_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\n‚úì Analysis saved as 'optimal_clusters_analysis.png'")
print(f"\nüìä Recommended k based on Silhouette Score: {best_k_silhouette}")

In [None]:
# Carry the silhouette-selected k forward to every algorithm that takes a
# cluster count.
optimal_k = best_k_silhouette

print(
    "=" * 80,
    f"OPTIMAL NUMBER OF CLUSTERS: k = {optimal_k}",
    "=" * 80,
    "This value will be used for clustering algorithms that require k parameter.",
    sep="\n",
)

## Define Evaluation Function

In [None]:
def evaluate_clustering(X, labels, model_name, training_time, model=None):
    """
    Evaluate a clustering result, print a report, and return the metrics.

    Points labelled -1 are treated as noise: they are counted in ``n_noise``
    and left out of every metric, so the scores of a model that discards
    points describe only the points it kept.

    Args:
        X: Feature matrix with one row per sample; must support boolean-mask
            row selection (``X[mask]``).
        labels: Cluster label per row of ``X``. Any integer array-like
            (NumPy array or plain list).
        model_name: Display name used in the report and in the result.
        training_time: Fit duration in seconds.
        model: Fitted estimator, optional. If it has an ``inertia_``
            attribute, that value is reported as well.

    Returns:
        dict: Dictionary containing evaluation metrics, with keys
        ``model_name``, ``model``, ``labels`` (returned as passed in),
        ``n_clusters``, ``n_noise``, ``training_time``, ``silhouette_score``,
        ``calinski_harabasz_score``, ``davies_bouldin_score`` and ``inertia``
        (``None`` when the model has no ``inertia_``). When the scores are
        undefined (fewer than two clusters, or every kept point in a cluster
        of its own) they are the placeholders -1, 0 and ``inf``.
    """
    print(f"\n{'='*80}")
    print(f"Evaluating: {model_name}")
    print(f"{'='*80}")

    # Accept plain lists too: the boolean masking below needs an ndarray.
    label_array = np.asarray(labels)

    # Split the points into kept (label >= 0) and noise (label == -1).
    kept = label_array >= 0
    unique, counts = np.unique(label_array[kept], return_counts=True)
    n_clusters = len(unique)
    n_noise = np.sum(label_array == -1)
    n_kept = int(np.sum(kept))

    # The three scores are only defined for 2 <= n_clusters <= n_kept - 1;
    # outside that range the scorers raise, so fall back to placeholders that
    # rank last (silhouette/Calinski) or worst (Davies-Bouldin).
    if 1 < n_clusters < n_kept:
        X_clean = X[kept]
        labels_clean = label_array[kept]
        silhouette = silhouette_score(X_clean, labels_clean)
        calinski = calinski_harabasz_score(X_clean, labels_clean)
        davies_bouldin = davies_bouldin_score(X_clean, labels_clean)
    else:
        silhouette = -1
        calinski = 0
        davies_bouldin = float('inf')

    # Only some estimators expose inertia (hasattr is False for model=None).
    inertia = model.inertia_ if hasattr(model, 'inertia_') else None

    # Print results
    print(f"\nüìä Clustering Results:")
    print(f"   Number of Clusters: {n_clusters}")
    if n_noise > 0:
        print(f"   Noise Points: {n_noise} ({n_noise/len(label_array)*100:.2f}%)")
    print(f"   Training Time: {training_time:.2f} seconds")

    print(f"\nüìà Evaluation Metrics:")
    print(f"   Silhouette Score: {silhouette:.4f}")
    print(f"   Calinski-Harabasz Index: {calinski:.2f}")
    print(f"   Davies-Bouldin Index: {davies_bouldin:.4f}")
    if inertia is not None:
        print(f"   Inertia: {inertia:.2f}")

    # Cluster size distribution; percentages are of all samples, noise included.
    print(f"\nüì¶ Cluster Sizes:")
    for cluster_id, count in zip(unique, counts):
        print(f"   Cluster {cluster_id}: {count:,} samples ({count/len(label_array)*100:.2f}%)")

    return {
        'model_name': model_name,
        'model': model,
        'labels': labels,
        'n_clusters': n_clusters,
        'n_noise': n_noise,
        'training_time': training_time,
        'silhouette_score': silhouette,
        'calinski_harabasz_score': calinski,
        'davies_bouldin_score': davies_bouldin,
        'inertia': inertia
    }

print("‚úì Evaluation function defined")

## Train Clustering Models

### 1. K-Means Clustering

In [None]:
# K-Means Clustering
# Fit K-Means with the selected k, score the partition, and persist the
# fitted estimator.
print("Training K-Means Clustering...")

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
kmeans_model = KMeans(
    n_clusters=optimal_k,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=42  # fixed seed for reproducible centroids
)
kmeans_labels = kmeans_model.fit_predict(X)
kmeans_time = time.perf_counter() - start_time

kmeans_results = evaluate_clustering(X, kmeans_labels, 'K-Means', kmeans_time, kmeans_model)

# Save model
with open('model_kmeans.pkl', 'wb') as f:
    pickle.dump(kmeans_model, f)
print("\n‚úì Model saved: model_kmeans.pkl")

### 2. Mini-Batch K-Means

In [None]:
# Mini-Batch K-Means (faster for large datasets)
# Same k as K-Means, but the centroids are updated from batches of 1000 rows.
print("Training Mini-Batch K-Means...")

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
minibatch_kmeans_model = MiniBatchKMeans(
    n_clusters=optimal_k,
    init='k-means++',
    n_init=10,
    max_iter=300,
    batch_size=1000,
    random_state=42  # fixed seed for reproducible centroids
)
minibatch_labels = minibatch_kmeans_model.fit_predict(X)
minibatch_time = time.perf_counter() - start_time

minibatch_results = evaluate_clustering(X, minibatch_labels, 'Mini-Batch K-Means', minibatch_time, minibatch_kmeans_model)

# Save model
with open('model_minibatch_kmeans.pkl', 'wb') as f:
    pickle.dump(minibatch_kmeans_model, f)
print("\n‚úì Model saved: model_minibatch_kmeans.pkl")

### 3. Agglomerative Hierarchical Clustering

In [None]:
# Agglomerative Clustering
# Bottom-up hierarchical clustering with Ward linkage, cut at the selected k.
print("Training Agglomerative Hierarchical Clustering...")

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
agglomerative_model = AgglomerativeClustering(
    n_clusters=optimal_k,
    linkage='ward'
)
agglomerative_labels = agglomerative_model.fit_predict(X)
agglomerative_time = time.perf_counter() - start_time

agglomerative_results = evaluate_clustering(X, agglomerative_labels, 'Agglomerative Clustering', agglomerative_time, agglomerative_model)

# Save model
# NOTE(review): this estimator has no predict(); the pickle preserves the
# fitted labels but cannot assign new customers to clusters.
with open('model_agglomerative.pkl', 'wb') as f:
    pickle.dump(agglomerative_model, f)
print("\n‚úì Model saved: model_agglomerative.pkl")

### 4. DBSCAN (Density-Based)

In [None]:
# DBSCAN - Density-based clustering
# Finds the number of clusters itself and labels low-density points -1
# (noise); it does not use the selected k.
print("Training DBSCAN...")

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
dbscan_model = DBSCAN(
    # NOTE(review): eps and min_samples are fixed values, not tuned to this
    # dataset - check the reported noise percentage and adjust if most
    # points end up as noise or in a single cluster.
    eps=0.5,
    min_samples=5,
    metric='euclidean',
    n_jobs=-1
)
dbscan_labels = dbscan_model.fit_predict(X)
dbscan_time = time.perf_counter() - start_time

dbscan_results = evaluate_clustering(X, dbscan_labels, 'DBSCAN', dbscan_time, dbscan_model)

# Save model
# NOTE(review): this estimator has no predict(); the pickle preserves the
# fitted labels but cannot assign new customers to clusters.
with open('model_dbscan.pkl', 'wb') as f:
    pickle.dump(dbscan_model, f)
print("\n‚úì Model saved: model_dbscan.pkl")

### 5. Gaussian Mixture Model (GMM)

In [None]:
# Gaussian Mixture Model
# Fit a mixture with one component per cluster, then take each row's
# predicted component as its hard cluster label.
print("Training Gaussian Mixture Model...")

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
gmm_model = GaussianMixture(
    n_components=optimal_k,
    covariance_type='full',
    max_iter=100,
    random_state=42  # fixed seed for reproducible initialisation
)
gmm_model.fit(X)
gmm_labels = gmm_model.predict(X)
# The timing deliberately covers both fit and predict.
gmm_time = time.perf_counter() - start_time

gmm_results = evaluate_clustering(X, gmm_labels, 'Gaussian Mixture Model', gmm_time, gmm_model)

# Save model
with open('model_gmm.pkl', 'wb') as f:
    pickle.dump(gmm_model, f)
print("\n‚úì Model saved: model_gmm.pkl")

### 6. Spectral Clustering

In [None]:
# Spectral Clustering
# Graph-based clustering on a 10-nearest-neighbour affinity graph, split into
# the selected k clusters.
print("Training Spectral Clustering...")

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
spectral_model = SpectralClustering(
    n_clusters=optimal_k,
    affinity='nearest_neighbors',
    n_neighbors=10,
    random_state=42,
    n_jobs=-1
)
spectral_labels = spectral_model.fit_predict(X)
spectral_time = time.perf_counter() - start_time

spectral_results = evaluate_clustering(X, spectral_labels, 'Spectral Clustering', spectral_time, spectral_model)

# Save model
# NOTE(review): this estimator has no predict(); the pickle preserves the
# fitted labels but cannot assign new customers to clusters.
with open('model_spectral.pkl', 'wb') as f:
    pickle.dump(spectral_model, f)
print("\n‚úì Model saved: model_spectral.pkl")

## 📊 Compare All Models

In [None]:
# Gather the per-model evaluation dicts, in training order.
all_results = [
    kmeans_results,
    minibatch_results,
    agglomerative_results,
    dbscan_results,
    gmm_results,
    spectral_results,
]


def _summary_row(result):
    """Flatten one evaluation dict into a row of the comparison table."""
    return {
        'Model': result['model_name'],
        'N_Clusters': result['n_clusters'],
        'Noise_Points': result['n_noise'],
        # Stored as display text, e.g. "1.23s".
        'Training_Time': f"{result['training_time']:.2f}s",
        'Silhouette': result['silhouette_score'],
        'Calinski-Harabasz': result['calinski_harabasz_score'],
        'Davies-Bouldin': result['davies_bouldin_score'],
    }


# Build the table and rank it: highest silhouette score first.
comparison_df = pd.DataFrame(list(map(_summary_row, all_results)))
comparison_df = comparison_df.sort_values('Silhouette', ascending=False)

print("\n" + "=" * 80, "MODEL COMPARISON SUMMARY", "=" * 80, sep="\n")
display(comparison_df)

# The winner is whatever row the ranking put on top.
best_row = comparison_df.iloc[0]
best_model_name = best_row['Model']
best_silhouette = best_row['Silhouette']
print(f"\nüèÜ Best Model: {best_model_name} (Silhouette Score = {best_silhouette:.4f})")

# Persist the ranked table.
comparison_df.to_csv('clustering_results.csv', index=False)
print("\n‚úì Results saved: clustering_results.csv")

## 📈 Visualize Model Comparison

In [None]:
# Create comparison visualization: one horizontal bar chart per metric, with
# models listed in the ranking order of comparison_df.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

models = comparison_df['Model']

# Davies-Bouldin is infinite for models whose metrics were undefined; those
# rows cannot be drawn, so leave them out of that panel only.
db_valid = comparison_df[comparison_df['Davies-Bouldin'] != float('inf')]

# Take the measured durations from the raw results instead of parsing the
# "1.23s" display strings back into numbers: the strings are rounded to two
# decimals, so fast models would all collapse to 0.00.
elapsed_by_model = {r['model_name']: r['training_time'] for r in all_results}
training_times = [elapsed_by_model[name] for name in models]

# (axes, bar labels, bar lengths, colour, x-axis label, title)
bar_panels = [
    (axes[0, 0], models, comparison_df['Silhouette'], 'skyblue',
     'Silhouette Score (Higher is Better)', 'Silhouette Score Comparison'),
    (axes[0, 1], models, comparison_df['Calinski-Harabasz'], 'lightgreen',
     'Calinski-Harabasz Index (Higher is Better)', 'Calinski-Harabasz Index Comparison'),
    (axes[1, 0], db_valid['Model'], db_valid['Davies-Bouldin'], 'coral',
     'Davies-Bouldin Index (Lower is Better)', 'Davies-Bouldin Index Comparison'),
    (axes[1, 1], models, training_times, 'gold',
     'Training Time (seconds)', 'Training Time Comparison'),
]

for ax, names, values, colour, x_label, title in bar_panels:
    ax.barh(names, values, color=colour, alpha=0.8)
    ax.set_xlabel(x_label, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.invert_yaxis()  # first (best-ranked) model at the top
    ax.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.savefig('clustering_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úì Comparison chart saved as 'clustering_comparison.png'")

## 💾 Save All Results

In [None]:
# Persist everything produced above: the full result dicts (including the
# fitted models and labels) and a table with one label column per model.
print("=" * 80, "SAVING ALL RESULTS", "=" * 80, sep="\n")

# Full result dicts, pickled as one list.
with open('all_clustering_results.pkl', 'wb') as f:
    pickle.dump(all_results, f)
print("‚úì All results saved: all_clustering_results.pkl")

# One column of cluster labels per model, rows aligned with the input data.
label_columns = [
    'K-Means',
    'Mini-Batch K-Means',
    'Agglomerative',
    'DBSCAN',
    'GMM',
    'Spectral',
]
label_arrays = [
    kmeans_labels,
    minibatch_labels,
    agglomerative_labels,
    dbscan_labels,
    gmm_labels,
    spectral_labels,
]
labels_df = pd.DataFrame(dict(zip(label_columns, label_arrays)))
labels_df.to_csv('clustering_labels.csv', index=False)
print("‚úì All labels saved: clustering_labels.csv")

# Closing summary of the run and the files it wrote.
print("\n".join([
    "\n" + "=" * 80,
    "‚úÖ CLUSTERING TRAINING COMPLETE!",
    "=" * 80,
    f"Total models trained: {len(all_results)}",
    f"Best performing model: {best_model_name}",
    "\nFiles created:",
    "  ‚Ä¢ clustering_results.csv - Performance comparison",
    "  ‚Ä¢ clustering_labels.csv - All cluster labels",
    "  ‚Ä¢ all_clustering_results.pkl - Complete results",
    "  ‚Ä¢ model_*.pkl - Individual model files (6 models)",
    "  ‚Ä¢ clustering_comparison.png - Visualization",
    "  ‚Ä¢ optimal_clusters_analysis.png - Optimal k analysis",
]))