# 🧠 Customer Segmentation using K-Means Clustering (Auto-save outputs)

This notebook performs K-Means clustering for customer segmentation and **automatically saves** the key outputs to an `outputs/` folder:
- `elbow_wcss.png` (Elbow plot)
- `clusters_pca.png` (PCA scatter plot with centroids)
- `clustered_data.csv` (numeric columns with missing values filled by the column median, plus a `Cluster` label)
- `cluster_profiles.json` (summary stats per cluster)
- `summary.json` (chosen k, feature names, cluster sizes)
- `kmeans_centers.npy` (cluster centers in scaled space)
- `cluster_means.csv` (per-cluster mean of each feature)

**How to use:** Place your dataset `online_retail.csv` in the notebook's working directory (or set `DATA_PATH` in the load cell to your file's name), then run all cells.


In [None]:
# Imports
import os, json, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


In [None]:
# Read the raw CSV into a DataFrame; point DATA_PATH at your own file if it is named differently.
DATA_PATH = 'online_retail.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
# Report the (rows, columns) shape, then preview the first five rows as the cell output.
print('Loaded', df.shape, 'rows x columns')
df.head(n=5)


In [None]:
# Preprocessing - keep numeric columns, drop unusable ones, fill missing, and scale
numeric_df = df.select_dtypes(include=[np.number]).copy()
# Infinite values are rejected by StandardScaler/KMeans input validation; treat them as
# missing so they are imputed with the column median below.
numeric_df = numeric_df.replace([np.inf, -np.inf], np.nan)
# A column with no observed values has a NaN median, so fillna() could never repair it
# and the NaNs would reach KMeans. Drop such columns up front.
numeric_df = numeric_df.dropna(axis=1, how='all')
# If numeric_df is empty, raise an informative error
if numeric_df.shape[1] == 0:
    raise ValueError('No numeric columns detected. Aggregate to customer-level numeric features before clustering.')
# Impute remaining gaps with each column's median.
numeric_df = numeric_df.fillna(numeric_df.median())

# Standardize every feature (zero mean, unit variance) so no single column dominates the
# Euclidean distances K-Means relies on.
scaler = StandardScaler()
X = scaler.fit_transform(numeric_df.values)
feature_names = numeric_df.columns.tolist()
print('Numeric features used for clustering:', feature_names)
print('Scaled shape:', X.shape)


In [None]:
# Elbow method: compute WCSS for k=1..max_k and save plot
def compute_wcss(X, max_k=10):
    """Return the K-Means inertia (WCSS) for every k from 1 to ``max_k`` inclusive.

    Each model is fitted on ``X`` with ``random_state=42`` and ``n_init=10``.
    Element ``i`` of the returned list is the inertia obtained with ``k = i + 1``.
    """
    return [
        KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X).inertia_
        for n_clusters in range(1, max_k + 1)
    ]

# Make sure the output folder exists before anything is written into it.
outputs_dir = 'outputs'
os.makedirs(outputs_dir, exist_ok=True)

# Inertia for every candidate k in 1..max_k.
max_k = 8
wcss = compute_wcss(X, max_k=max_k)

# Draw the elbow curve using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(range(1, max_k + 1), wcss, marker='o')
ax.set_xlabel('Number of clusters k')
ax.set_ylabel('WCSS (inertia)')
ax.set_title('Elbow Method for optimal k')
ax.grid(True)
fig.tight_layout()

# Write the figure to disk first, then render it inline.
elbow_path = os.path.join(outputs_dir, 'elbow_wcss.png')
fig.savefig(elbow_path, dpi=150)
plt.show()
print('Saved elbow plot to', elbow_path)


In [None]:
# Simple heuristic to choose k from WCSS: largest relative drop
import numpy as np
def choose_k_by_elbow(wcss):
    """Pick k as the cluster count reached by the largest relative WCSS drop.

    Parameters
    ----------
    wcss : sequence of float
        Inertia values where ``wcss[i]`` belongs to ``k = i + 1``.

    Returns
    -------
    int
        ``1`` when fewer than two values are given (no drop can be measured);
        otherwise the ``k >= 2`` whose step from ``k - 1`` removed the largest
        fraction of the previous WCSS. Ties go to the smallest such k.
    """
    values = np.asarray(wcss, dtype=float)
    if values.size < 2:
        return 1
    previous = values[:-1]
    absolute_drop = previous - values[1:]
    # A zero WCSS means the fit is already perfect. Dividing by it gives NaN/inf,
    # and np.argmax treats NaN as the maximum, which would select a k *after* the
    # perfect fit. Count those steps as "no drop" instead.
    relative_drop = np.divide(
        absolute_drop,
        previous,
        out=np.zeros_like(absolute_drop),
        where=previous != 0,
    )
    # Index i describes the step from k = i + 1 to k = i + 2.
    return int(np.argmax(relative_drop)) + 2

# Derive k from the WCSS curve produced by the elbow cell.
chosen_k = choose_k_by_elbow(wcss=wcss)
print('Heuristic chosen_k =', chosen_k)
# To pin k by hand instead, uncomment and edit the next line:
# chosen_k = 4


In [None]:
# Fit the final model with the selected number of clusters.
km = KMeans(n_clusters=chosen_k, random_state=42, n_init=20)
km.fit(X)
labels = km.labels_
centers = km.cluster_centers_

# Persist the centroids; X was standardized, so these coordinates are in scaled units.
centers_path = os.path.join(outputs_dir, 'kmeans_centers.npy')
np.save(centers_path, centers)

# Build a labelled copy of the numeric features and write it out without the index column.
clustered = numeric_df.assign(Cluster=labels)
clustered_csv_path = os.path.join(outputs_dir, 'clustered_data.csv')
clustered.to_csv(clustered_csv_path, index=False)
print('Saved clustered data to', clustered_csv_path)


In [None]:
# PCA scatter plot with cluster centroids, saved to outputs
# PCA cannot extract more components than there are features, so cap the request at 2;
# asking for 2 components on single-feature data would raise.
n_pcs = min(2, X.shape[1])
pca = PCA(n_components=n_pcs, random_state=42)
X2 = pca.fit_transform(X)
# Project the centroids with the same fitted PCA so they land in the points' coordinates.
centers2 = pca.transform(centers)
if n_pcs < 2:
    # Single-feature data: add a constant second axis so the 2-D scatter still renders.
    X2 = np.column_stack([X2, np.zeros(len(X2))])
    centers2 = np.column_stack([centers2, np.zeros(len(centers2))])

fig, ax = plt.subplots(figsize=(7, 5))
unique = np.unique(labels)
for lab in unique:
    sel = X2[labels == lab]
    ax.scatter(sel[:, 0], sel[:, 1], label=f'Cluster {lab}', alpha=0.6, s=30)
# Give the centroid markers a legend entry so they are not mistaken for another cluster.
ax.scatter(centers2[:, 0], centers2[:, 1], marker='X', s=120, edgecolor='k', linewidths=1.2,
           label='Centroids')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('K-Means clusters (PCA projection)')
ax.legend()
fig.tight_layout()
pca_path = os.path.join(outputs_dir, 'clusters_pca.png')
fig.savefig(pca_path, dpi=150)
plt.show()
print('Saved PCA cluster plot to', pca_path)


In [None]:
# Create cluster profile summary and save as JSON & a human-readable CSV
def _json_scalar(value):
    """Convert a NumPy/pandas scalar to a plain Python value json can write (NaN -> None)."""
    if pd.isna(value):
        return None
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    return value


stat_names = ['count', 'mean', 'median', 'std']
cluster_stats = clustered.groupby('Cluster').agg(stat_names)
# agg() with a list of functions yields (feature, stat) tuple column labels, and json.dump
# refuses tuple keys (its `default=` hook only applies to values). Reshape into nested dicts
# keyed by plain strings instead: cluster -> feature -> stat.
profile = {
    str(cluster): {
        str(feature): {
            stat: _json_scalar(cluster_stats.loc[cluster, (feature, stat)])
            for stat in stat_names
        }
        for feature in cluster_stats.columns.get_level_values(0).unique()
    }
    for cluster in cluster_stats.index
}
# Force native ints so both JSON files serialize regardless of the label dtype.
sizes = {
    int(cluster): int(count)
    for cluster, count in clustered['Cluster'].value_counts().sort_index().items()
}

summary = {
    'chosen_k': int(chosen_k),
    'sizes': sizes,
    'features': feature_names
}

# Every value is already a native Python type, so no `default=` converter is needed
# (the previous `default=int` would have silently truncated any non-native float).
with open(os.path.join(outputs_dir, 'cluster_profiles.json'), 'w') as f:
    json.dump({'sizes': sizes, 'summary_stats': profile}, f, indent=2)

with open(os.path.join(outputs_dir, 'summary.json'), 'w') as f:
    json.dump(summary, f, indent=2)

# Save per-cluster mean CSV (easy to load into slides)
cluster_means = clustered.groupby('Cluster').mean(numeric_only=True)
cluster_means.to_csv(os.path.join(outputs_dir, 'cluster_means.csv'))

print('Saved cluster_profiles.json, summary.json and cluster_means.csv to', outputs_dir)
print('Cluster sizes:', sizes)
cluster_means


In [None]:
# Closing pointers for the reader: where the artifacts are and what to do with them.
print(
    'All outputs are saved in the `outputs/` folder.',
    'Next steps: review the elbow plot to confirm chosen_k, inspect cluster_means.csv, and paste visuals into your report or slide deck.',
    sep='\n',
)
