# 🛍️ Customer Segmentation with K-Means

**Project**: Unsupervised Learning - Customer Clustering  
**Level**: Intermediate  
**Dataset**: Synthetic Mall Customer Dataset (generated in Section 2)  

## 📋 Project Overview

This project performs customer segmentation using K-Means clustering to identify distinct customer groups based on purchasing behavior. We'll learn:

- Unsupervised learning fundamentals
- K-Means clustering algorithm
- Optimal cluster determination (Elbow Method and Silhouette Analysis)
- Business insights from clustering
- Customer persona development

Let's discover hidden customer segments! 🎯

## 1. Import Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Machine learning
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples

# Utilities
import warnings
warnings.filterwarnings('ignore')

# Set style
# Use the first seaborn-like matplotlib style sheet this installation offers:
# 'seaborn-v0_8' is the current name, plain 'seaborn' the one used by older
# matplotlib releases. If neither is present, keep matplotlib's default style
# and say so, rather than aborting the notebook on its first cell.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    print("⚠️ No seaborn matplotlib style available; using the default style.")

# Colour palette applied by seaborn to subsequent plots.
sns.set_palette("husl")

print("✅ All libraries imported successfully!")
print("🛍️ Ready for customer segmentation analysis!")

## 2. Data Generation and Exploration

In [None]:
# Build a reproducible synthetic mall-customer table.
np.random.seed(42)  # fixed seed: every run produces the same customers
n_customers = 200

# Draw the columns one after another. The order matters because every draw
# consumes numpy's shared global random stream.
customer_ids = range(1, n_customers + 1)
genders = np.random.choice(['Male', 'Female'], n_customers, p=[0.45, 0.55])
ages = np.random.normal(38, 12, n_customers).clip(18, 70).astype(int)
incomes = np.random.normal(60, 20, n_customers).clip(15, 140).astype(int)
scores = np.random.normal(50, 25, n_customers).clip(1, 100).astype(int)

data = {
    'CustomerID': customer_ids,
    'Gender': genders,
    'Age': ages,
    'Annual_Income': incomes,
    'Spending_Score': scores,
}

# Tie spending to income at the extremes: customers with income above 80 get
# a fresh score drawn around 70, those below 40 a fresh score drawn around 30;
# everyone in between keeps the score drawn above.
for position, income in enumerate(incomes):
    if income > 80:
        candidate = np.random.normal(70, 15, 1)[0]
    elif income < 40:
        candidate = np.random.normal(30, 15, 1)[0]
    else:
        candidate = scores[position]

    # int() drops the fractional part (the score column holds integers),
    # then the value is clamped to the valid 1..100 range.
    scores[position] = min(100, max(1, int(candidate)))

# Assemble the table (`scores` was updated in place, so `data` already
# carries the adjusted spending scores).
df = pd.DataFrame(data)

print("🛍️ Mall Customer dataset created!")
print(f"Dataset shape: {df.shape}")
print(f"Features: {list(df.columns)}")

# Preview the first rows
df.head()

In [None]:
# Headline facts about the table: size, column count, total missing cells
row_count, column_count = df.shape
missing_cells = df.isnull().sum().sum()  # sum per column, then across columns

print("📊 Dataset Information:")
print(f"Total customers: {row_count}")
print(f"Features: {column_count}")
print(f"Missing values: {missing_cells}")

# How many customers fall into each gender category
gender_breakdown = df['Gender'].value_counts()
print("\n👥 Gender Distribution:")
print(gender_breakdown)

# Descriptive statistics; left as the cell's last expression so the
# notebook renders the resulting table.
print("\n📈 Statistical Summary:")
df.describe()

## 3. Exploratory Data Analysis

In [None]:
# Customer demographics overview: three histograms plus a gender pie chart
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('👥 Customer Demographics Overview', fontsize=16, fontweight='bold')

# One row per histogram panel:
# (axis, column, bar colour, panel title, x-axis label, legend template for the mean)
histogram_panels = [
    (axes[0, 0], 'Age', 'skyblue',
     '📊 Age Distribution', 'Age', 'Mean: {:.1f}'),
    (axes[0, 1], 'Annual_Income', 'lightgreen',
     '💰 Annual Income Distribution', 'Annual Income (k$)', 'Mean: ${:.1f}k'),
    (axes[1, 0], 'Spending_Score', 'lightcoral',
     '🛒 Spending Score Distribution', 'Spending Score (1-100)', 'Mean: {:.1f}'),
]

# Column means are computed once and reused for the marker lines and the
# printed summary below.
column_means = {}
for panel, column, bar_color, title, x_label, mean_template in histogram_panels:
    column_means[column] = df[column].mean()

    panel.hist(df[column], bins=20, alpha=0.7, color=bar_color, edgecolor='black')
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    # Dashed red line marks the column mean
    panel.axvline(column_means[column], color='red', linestyle='--',
                  label=mean_template.format(column_means[column]))
    panel.legend()

# Gender distribution (bottom-right panel)
gender_counts = df['Gender'].value_counts()
gender_panel = axes[1, 1]
gender_panel.pie(gender_counts.values, labels=gender_counts.index, autopct='%1.1f%%',
                 colors=['lightblue', 'pink'])
gender_panel.set_title('👫 Gender Distribution')

plt.tight_layout()
plt.show()

# Text summary of the same figures
female_count = gender_counts['Female']
male_count = gender_counts['Male']
print("📊 Customer Demographics Summary:")
print(f"• Average Age: {column_means['Age']:.1f} years")
print(f"• Average Income: ${column_means['Annual_Income']:.1f}k")
print(f"• Average Spending Score: {column_means['Spending_Score']:.1f}/100")
print(f"• Gender Split: {female_count}/{male_count} (F/M)")

In [None]:
# Pairwise scatter plots of income, age and spending score
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('🔍 Customer Behavior Relationships', fontsize=16, fontweight='bold')

income_spend_ax, age_spend_ax, age_income_ax = axes
marker_style = {'alpha': 0.6, 's': 50}  # shared by all three scatters

# Panel 1: income vs spending score, points coloured by age
income_spend_points = income_spend_ax.scatter(
    df['Annual_Income'], df['Spending_Score'],
    c=df['Age'], cmap='viridis', **marker_style)
income_spend_ax.set_xlabel('Annual Income (k$)')
income_spend_ax.set_ylabel('Spending Score')
income_spend_ax.set_title('💰 Income vs Spending Score\n(Color = Age)')
plt.colorbar(income_spend_points, ax=income_spend_ax, label='Age')

# Panel 2: age vs spending score; 'Male' rows are blue, every other row red
colors = np.where(df['Gender'] == 'Male', 'blue', 'red').tolist()
age_spend_ax.scatter(df['Age'], df['Spending_Score'], c=colors, **marker_style)
age_spend_ax.set_xlabel('Age')
age_spend_ax.set_ylabel('Spending Score')
age_spend_ax.set_title('👥 Age vs Spending Score\n(Blue=Male, Red=Female)')

# Panel 3: age vs income, points coloured by spending score
age_income_points = age_income_ax.scatter(
    df['Age'], df['Annual_Income'],
    c=df['Spending_Score'], cmap='plasma', **marker_style)
age_income_ax.set_xlabel('Age')
age_income_ax.set_ylabel('Annual Income (k$)')
age_income_ax.set_title('📈 Age vs Income\n(Color = Spending Score)')
plt.colorbar(age_income_points, ax=age_income_ax, label='Spending Score')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric customer features
numerical_features = ['Age', 'Annual_Income', 'Spending_Score']
correlation_matrix = df[numerical_features].corr()

# Heatmap of the correlation matrix, colour scale centred on zero
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.3f',
    cmap='RdYlBu_r',
    center=0,
    square=True,
    linewidths=0.5,
)
plt.title('🔥 Customer Feature Correlations', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# List each unordered feature pair once (upper triangle of the matrix)
print("🔍 Key Correlations:")
for position, first_feature in enumerate(numerical_features):
    for second_feature in numerical_features[position + 1:]:
        corr = correlation_matrix.loc[first_feature, second_feature]
        print(f"• {first_feature} vs {second_feature}: {corr:.3f}")
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. K-Means Clustering Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare data for clustering\n",
    "print(\"🔧 Preparing data for clustering...\")\n",
    "\n",
    "# Select features for clustering (Income and Spending Score)\n",
    "clustering_features = ['Annual_Income', 'Spending_Score']\n",
    "X_cluster = df[clustering_features]\n",
    "\n",
    "print(f\"Clustering features: {clustering_features}\")\n",
    "print(f\"Data shape: {X_cluster.shape}\")\n",
    "\n",
    "# Scale the features\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X_cluster)\n",
    "\n",
    "print(\"✅ Features scaled for clustering\")\n",
    "print(f\"Original means: {X_cluster.mean().round(2).tolist()}\")\n",
    "print(f\"Scaled means: {X_scaled.mean(axis=0).round(2).tolist()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Determine optimal number of clusters using Elbow Method\n",
    "print(\"📈 Finding optimal number of clusters...\")\n",
    "\n",
    "# Calculate WCSS for different number of clusters\n",
    "wcss = []\n",
    "silhouette_scores = []\n",
    "k_range = range(2, 11)\n",
    "\n",
    "for k in k_range:\n",
    "    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)\n",
    "    kmeans.fit(X_scaled)\n",
    "    wcss.append(kmeans.inertia_)\n",
    "    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))\n",
    "\n",
    "# Plot Elbow Method and Silhouette Analysis\n",
    "fig, axes = plt.subplots(1, 2, figsize=(15, 6))\n",
    "\n",
    "# Elbow Method\n",
    "axes[0].plot(k_range, wcss, 'bo-', linewidth=2, markersize=8)\n",
    "axes[0].set_title('📈 Elbow Method for Optimal k', fontweight='bold', fontsize=14)\n",
    "axes[0].set_xlabel('Number of Clusters (k)')\n",
    "axes[0].set_ylabel('Within-Cluster Sum of Squares (WCSS)')\n",
    "axes[0].grid(True, alpha=0.3)\n",
    "\n",
    "# Silhouette Analysis\n",
    "axes[1].plot(k_range, silhouette_scores, 'ro-', linewidth=2, markersize=8)\n",
    "axes[1].set_title('📊 Silhouette Analysis', fontweight='bold', fontsize=14)\n",
    "axes[1].set_xlabel('Number of Clusters (k)')\n",
    "axes[1].set_ylabel('Silhouette Score')\n",
    "axes[1].grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Find optimal k\n",
    "optimal_k_silhouette = k_range[np.argmax(silhouette_scores)]\n",
    "print(f\"\\n🎯 Optimal number of clusters:\")\n",
    "print(f\"• Based on Silhouette Score: {optimal_k_silhouette} (score: {max(silhouette_scores):.3f})\")\n",
    "print(f\"• Recommended k: 5 (good balance of interpretability and performance)\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}