# K-Means Clustering — Scikit-Learn

This is the first unsupervised model in the project. K-Means partitions the data into K clusters by iteratively refining centroids; no labels are used during training. We evaluate the clustering with inertia (within-cluster sum of squares, WCSS), the silhouette score, and the Adjusted Rand Index (ARI), which compares the cluster assignments against the ground-truth bean types.

**Dataset**: Dry Beans — 13,543 samples, 16 geometric features, 7 bean types.


In [1]:
import sys
import os
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score as sklearn_silhouette

# Add project root to path for utils
sys.path.insert(0, os.path.abspath('../..'))
from utils.data_loader import load_processed_data
from utils.performance import track_performance
from utils.visualization import (plot_elbow_curve, plot_silhouette_comparison,
                                  plot_silhouette_analysis, plot_convergence_curve)
from utils.results import save_results, add_result, print_comparison
from utils.metrics import adjusted_rand_index

# --- Experiment configuration ---------------------------------------------
RANDOM_STATE = 113           # seed handed to KMeans (random_state)
K_RANGE = range(2, 13)       # candidate cluster counts: K=2 ... K=12
MAX_ITER = 300               # iteration cap handed to KMeans (max_iter)
TOL = 1e-4                   # convergence tolerance handed to KMeans (tol)
N_INIT = 5                   # runs per fit (n_init); best run by inertia is kept
FRAMEWORK = 'Scikit-Learn'   # label shown in the printed header

# --- Data -----------------------------------------------------------------
# Preprocessed train/test splits plus a metadata mapping from the project loader.
X_train, X_test, y_train, y_test, meta = load_processed_data('kmeans')

# --- Run header and dataset summary ---------------------------------------
print("=" * 60, f"K-MEANS — {FRAMEWORK}", "=" * 60, sep="\n")
print(
    f"Training: {X_train.shape[0]:,} samples, {X_train.shape[1]} features",
    f"Test:     {X_test.shape[0]:,} samples",
    f"Classes:  {meta['n_classes']} ({meta['class_names']})",
    sep="\n",
)

K-MEANS — Scikit-Learn
Training: 10,834 samples, 16 features
Test:     2,709 samples
Classes:  7 (['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA'])


In [2]:
# Step 1: K-Tuning - finding optimal number of clusters
# Fit K-Means once per candidate k in K_RANGE and record two curves:
#   - inertia (WCSS), used for the elbow method
#   - mean silhouette score, used to compare cluster quality
k_list = list(K_RANGE)

# Fail fast with a clear message instead of erroring mid-loop: the silhouette
# score is only defined for at least 2 clusters.
assert k_list and min(k_list) >= 2, (
    "K_RANGE must contain at least one K value, and every K must be >= 2"
)

print("=" * 60)
# Bounds come from K_RANGE so the banner cannot drift from the configuration.
print(f"K-TUNING: Testing K={min(k_list)} through K={max(k_list)}")
print("=" * 60)

inertias = []     # inertia per k, same order as k_list
sil_scores = []   # silhouette score per k, same order as k_list

for k in k_list:
    model = KMeans(
        n_clusters=k,
        init='k-means++',
        n_init=N_INIT,
        max_iter=MAX_ITER,
        tol=TOL,
        random_state=RANDOM_STATE
    )
    model.fit(X_train)

    inertias.append(model.inertia_)

    # sklearn's silhouette_score for speed during tuning loop
    sil = sklearn_silhouette(X_train, model.labels_)
    sil_scores.append(sil)

    print(f"    K={k:2d} | Inertia: {model.inertia_:,.1f} | Silhouette: {sil:.4f}")

print("\nTuning complete.")

K-TUNING: Testing K=2 through K=12
    K= 2 | Inertia: 103,635.7 | Silhouette: 0.3965
    K= 3 | Inertia: 75,557.9 | Silhouette: 0.4034
    K= 4 | Inertia: 60,743.3 | Silhouette: 0.3428
    K= 5 | Inertia: 49,175.7 | Silhouette: 0.3583
    K= 6 | Inertia: 43,871.6 | Silhouette: 0.3610
    K= 7 | Inertia: 38,925.7 | Silhouette: 0.3083
    K= 8 | Inertia: 36,050.3 | Silhouette: 0.3026
    K= 9 | Inertia: 33,511.6 | Silhouette: 0.3029
    K=10 | Inertia: 31,718.2 | Silhouette: 0.2902
    K=11 | Inertia: 30,243.4 | Silhouette: 0.2865
    K=12 | Inertia: 29,278.6 | Silhouette: 0.2654

Tuning complete.
