# K-Means Clustering — Scikit-Learn

This is the first unsupervised model in the project. K-Means groups data into K clusters by iteratively refining centroids; no labels are used during training. We evaluate using inertia (within-cluster sum of squares, WCSS), silhouette score, and the Adjusted Rand Index (ARI), which compares the clusters to the ground-truth bean types.

**Dataset**: Dry Beans — 13,543 samples, 16 geometric features, 7 bean types.


In [1]:
import sys
import os
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score as sklearn_silhouette

# Add project root to path for utils
sys.path.insert(0, os.path.abspath('../..'))
from utils.data_loader import load_processed_data
from utils.performance import track_performance
from utils.visualization import (plot_elbow_curve, plot_silhouette_comparison,
                                  plot_silhouette_analysis, plot_convergence_curve)
from utils.results import save_results, add_result, print_comparison
from utils.metrics import adjusted_rand_index

# ---------------------------------------------------------------------------
# Experiment settings (module-level so that later cells can reference them)
# ---------------------------------------------------------------------------
FRAMEWORK = 'Scikit-Learn'   # label used in the banner printed below
RANDOM_STATE = 113
# Candidate cluster counts: K = 2, 3, ..., 12 (range end is exclusive)
K_RANGE = range(2, 13)
# Per-fit settings — presumably forwarded to the estimator in later cells
MAX_ITER = 300
TOL = 1e-4
# Initializations per fit; the original note says the best run by inertia
# is kept (presumably passed as n_init — confirm where the model is built)
N_INIT = 5

# Fetch the preprocessed train/test split and its metadata for 'kmeans'
X_train, X_test, y_train, y_test, meta = load_processed_data('kmeans')

# Summary banner: a single print with a newline separator emits the same
# six lines as six consecutive print calls would.
print(
    "=" * 60,
    f"K-MEANS — {FRAMEWORK}",
    "=" * 60,
    f"Training: {X_train.shape[0]:,} samples, {X_train.shape[1]} features",
    f"Test:     {X_test.shape[0]:,} samples",
    f"Classes:  {meta['n_classes']} ({meta['class_names']})",
    sep="\n",
)

K-MEANS — Scikit-Learn
Training: 10,834 samples, 16 features
Test:     2,709 samples
Classes:  7 (['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA'])
