# 🧠 Customer Segmentation using K-Means Clustering

This notebook demonstrates how to perform **customer segmentation** for an online retail dataset using **K-Means clustering**.

Steps:
1. Load and explore the dataset  
2. Preprocess and standardize data  
3. Use the **Elbow Method** to find the optimal number of clusters  
4. Run **K-Means clustering**  
5. Visualize and analyze cluster results  
6. Summarize insights for business use


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns


In [None]:
# Load your dataset
# Point dataset_path at your own CSV file to cluster different data.
dataset_path = 'online_retail.csv'
data = pd.read_csv(dataset_path)

# Preview the first rows to confirm the file parsed as expected.
data.head()


In [None]:
# Preprocessing - keep only numeric columns, handle missing values, and standardize

# The clustering cell writes a 'Cluster' column back into `data`. Exclude it here
# so that re-running this cell never feeds old labels back in as a feature.
feature_source = data.drop(columns=['Cluster'], errors='ignore')
numeric_df = feature_source.select_dtypes(include=[np.number])

# A column with no observed values has no median to impute from; left in place
# it would carry NaN through scaling and make KMeans fail, so drop it.
numeric_df = numeric_df.dropna(axis=1, how='all')

# Impute with medians of the numeric columns only. Calling data.median() on the
# full frame raises TypeError in pandas >= 2.0 when text columns are present.
numeric_df = numeric_df.fillna(numeric_df.median())

# Standardize so every feature contributes on the same scale to the
# Euclidean distances K-Means relies on.
scaler = StandardScaler()
X = scaler.fit_transform(numeric_df)
print("Shape after scaling:", X.shape)


In [None]:
# Elbow Method to determine optimal K
# Fit one model per candidate k and record its inertia (plotted below as WCSS).
K = range(1, 11)
wcss = []
for k in K:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
    wcss.append(kmeans.inertia_)

# Draw WCSS against k; the bend in the curve suggests a reasonable k.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(K, wcss, marker='o')
ax.set_title('Elbow Method for Optimal k')
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('WCSS')
ax.grid(True)
plt.show()


In [None]:
# Choose K (based on elbow) and run K-Means
optimal_k = 4  # change after reviewing elbow plot

# Fit the final model and read the per-row cluster assignment off it.
kmeans = KMeans(n_clusters=optimal_k, n_init=20, random_state=42).fit(X)
labels = kmeans.labels_

# Attach each row's cluster label to the original data for later analysis.
data['Cluster'] = labels
data.head()


In [None]:
# PCA Visualization of clusters
# Project the scaled features onto two principal components so the
# cluster assignments can be viewed in a 2D scatter plot.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]

# One point per row of X, coloured by its K-Means label.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(x=pc1, y=pc2, hue=labels, palette='Set2', s=50, ax=ax)
ax.set(title='Customer Segments (PCA 2D projection)', xlabel='PC1', ylabel='PC2')
ax.legend(title='Cluster')
plt.show()


In [None]:
# Cluster Analysis - per-cluster mean of every numeric column
by_cluster = data.groupby('Cluster')
cluster_summary = by_cluster.mean(numeric_only=True)
cluster_summary


In [None]:
# Example business interpretation section
# Placeholder text only: these descriptions are illustrative and are not
# derived from the fitted clusters.
interpretation_lines = (
    "Example Interpretation:",
    "- Cluster 0: High-value, infrequent buyers — target with loyalty offers",
    "- Cluster 1: Frequent bargain shoppers — increase average order value",
    "- Cluster 2: One-time buyers — run reactivation campaigns",
    "- Cluster 3: New customers — focus on onboarding experience",
)
for line in interpretation_lines:
    print(line)
