***Project 3: "Customer Segmentation Using K-Means"***
**Problem Statement**:
 ***Businesses need to target different customers differently based on behavior.***
 **Objective**:
 ***Cluster customers into groups using unsupervised learning.***
 **Requirements**:
**1.** ***Use K-Means clustering on data (age, income, frequency, spending)***
**2.** ***Preprocess data using scaling***
**3.** ***Visualize clusters using 2D or 3D scatter plots***
**Expected Outcome**:
***Identify 3–5 meaningful customer types for marketing or offers.***

***Installing necessary libraries***

In [None]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install scikit-learn
# %pip install seaborn

***Importing required libraries and Data***

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.neighbors import KNeighborsClassifier

import os

# Load the customer dataset, generating a reproducible synthetic one on the
# first run when no CSV is available.
#
# One path is used for the existence check, the write and the read. Checking a
# different (machine-specific absolute) location than the file actually read
# could overwrite a real dataset with synthetic rows, or fail to find the file.
DATA_PATH = 'customer_data.csv'

if not os.path.exists(DATA_PATH):
    # Fallback for when the dataset is not available.
    np.random.seed(42)  # fixed seed so the synthetic data is the same on every run
    n_samples = 200
    synthetic = pd.DataFrame({
        'age': np.random.randint(18, 70, size=n_samples),
        'income': np.random.randint(20000, 120000, size=n_samples),
        'frequency': np.random.randint(1, 30, size=n_samples),
        'spending': np.random.randint(100, 10000, size=n_samples),
    })
    synthetic.to_csv(DATA_PATH, index=False)

# Read the file once. `df` is the working frame that later cells modify;
# `data` is an independent copy of the raw table kept for inspection.
df = pd.read_csv(DATA_PATH)
data = df.copy()
print(data)

***Preparing Data for our model in required format***

In [None]:
# Normalise column names: trimmed, lowercase, spaces replaced by underscores.
# str() guards against non-string headers (e.g. numeric column labels).
df.columns = [str(col).strip().lower().replace(" ", "_") for col in df.columns]

# Features the clustering model is built on.
feature_cols = ['age', 'income', 'frequency', 'spending']

# Fail early with a readable message if the dataset lacks a required column.
missing_cols = [col for col in feature_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Dataset is missing required columns: {missing_cols}")

# Keep only the model features. The explicit copy makes `df` an independent
# frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning when the CSV contained extra columns.
df = df[feature_cols].copy()

# Fill missing values with each column's median, only when there are any.
if df.isnull().values.any():
    df = df.fillna(df.median(numeric_only=True))

print(f"Prepared DataSet\n{df}")

# Separate object from `df`, so later edits to `df` do not alter the model input.
features = df[feature_cols]

# Standardise each feature (zero mean, unit variance) so that no single
# feature dominates the distance computations purely because of its scale.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

***Finding the optimal number of clusters (Elbow Method + Silhouette)***

In [None]:
# Sweep the candidate cluster counts. For every k, record the within-cluster
# sum of squares (KMeans inertia, used by the elbow plot) and the silhouette
# score of the resulting labelling.
K_range = range(2, 8)
wcss, silhouette_scores = [], []
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_features)
    wcss.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(scaled_features, kmeans.labels_))

# Side-by-side diagnostics: elbow curve on the left, silhouette curve on the right.
k_values = list(K_range)
diag_fig, (elbow_ax, silhouette_ax) = plt.subplots(1, 2, figsize=(12, 5))

elbow_ax.plot(k_values, wcss, marker='o')
elbow_ax.set_xlabel('Number of Clusters')
elbow_ax.set_ylabel('WCSS')
elbow_ax.set_title('Elbow Method for Optimal K (2-7)')

silhouette_ax.plot(k_values, silhouette_scores, marker='x', color='orange')
silhouette_ax.set_xlabel('Number of Clusters')
silhouette_ax.set_ylabel('Silhouette Score')
silhouette_ax.set_title('Silhouette Score vs K (2-7)')

diag_fig.tight_layout()
plt.show()

***Selecting 3-5 Meaningful Clusters based on silhouette score***

In [None]:
# Choose the cluster count with the best silhouette score, restricted to the
# 3-5 range this project targets. An unrestricted argmax over the whole sweep
# could return 2, 6 or 7 clusters, contradicting the stated goal of 3-5
# customer types.
MIN_CLUSTERS, MAX_CLUSTERS = 3, 5
scored_k = list(zip(K_range, silhouette_scores))
in_target = [(k, s) for k, s in scored_k if MIN_CLUSTERS <= k <= MAX_CLUSTERS]

# Fall back to every evaluated k if the sweep never covered the target range.
# max() returns the first maximum, so ties resolve to the smallest k.
optimal_k, best_silhouette = max(in_target or scored_k, key=lambda ks: ks[1])
print(f"Optimal number of clusters (by silhouette score): {optimal_k}")

***KMeans with optimal_k and assigning cluster labels***

In [None]:
# Fit the final model with the chosen cluster count and store each customer's
# cluster id in a new 'cluster' column.
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10).fit(scaled_features)
df['cluster'] = kmeans.labels_

***2D Scatter Plot: Age vs Income***

In [None]:
# Clusters projected onto the age/income plane, coloured by cluster id.
age_income_fig, age_income_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='age',
    y='income',
    hue='cluster',
    palette='viridis',
    ax=age_income_ax,
)
age_income_ax.set(
    xlabel='Age',
    ylabel='Income',
    title='Customer Clusters (Age vs Income)',
)
age_income_ax.legend(title='Cluster')
plt.show()

***2D Scatter Plot: Frequency vs Spending***

In [None]:
# Clusters projected onto the frequency/spending plane, coloured by cluster id.
freq_spend_fig, freq_spend_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='frequency',
    y='spending',
    hue='cluster',
    palette='viridis',
    ax=freq_spend_ax,
)
freq_spend_ax.set(
    xlabel='Frequency',
    ylabel='Spending',
    title='Customer Clusters (Frequency vs Spending)',
)
freq_spend_ax.legend(title='Cluster')
plt.show()

***3D Scatter Plot: Age, Income, Spending***

In [None]:
# Three-dimensional view of the segments over age, income and spending;
# point colour encodes the cluster id.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(projection='3d')
ax.scatter(df['age'], df['income'], df['spending'], c=df['cluster'], cmap='viridis')
ax.set(
    xlabel='Age',
    ylabel='Income',
    zlabel='Spending',
    title='3D Customer Segmentation',
)
plt.show()

***Cluster Summary for Marketing Insights***

In [None]:
# Per-cluster mean of every model feature, rounded to two decimals; this
# table is what the segment descriptions are derived from.
cluster_means = df.groupby('cluster')[feature_cols].mean()
summary = cluster_means.round(2)
print("\nCluster Summary (mean values):")
print(summary)

***Cluster sizes***

In [None]:
# Number of customers assigned to each cluster, ordered by cluster id.
cluster_sizes = df['cluster'].value_counts().sort_index()
print("\nCluster Sizes:")
print(cluster_sizes)

***5 Meaningful Customer types for Marketing***

In [None]:
def _describe_segment(row, baseline):
    """Return the marketing label for one cluster.

    Parameters:
        row: the cluster's mean feature values (one row of `summary`).
        baseline: per-feature reference values to compare against, indexed by
            the same feature names.

    Rules are checked in priority order and the first match wins, so each
    cluster receives exactly one label.
    """
    if row['income'] > baseline['income'] and row['spending'] > baseline['spending']:
        return "High-income, high-spending (Premium Customers)"
    if row['age'] < baseline['age'] and row['frequency'] > baseline['frequency']:
        return "Young, frequent shoppers (Enthusiasts)"
    if row['spending'] < baseline['spending'] and row['frequency'] < baseline['frequency']:
        return "Low-spending, infrequent (Budget/Occasional Shoppers)"
    if row['age'] > baseline['age']:
        return "Older customers (Mature Segment)"
    return "Average/Regular Customers"


print("\nCustomer Segment Profiles:")

# Reference point for the rules: the mean of the per-cluster means (an
# unweighted average across clusters). It is the same for every cluster, so
# it is computed once instead of on every loop iteration.
baseline = summary.mean()

for cluster_id, row in summary.iterrows():
    print(f"Cluster {cluster_id}: {_describe_segment(row, baseline)}")
    print(row)
    print("-" * 40)
# Train a 3-nearest-neighbour classifier on the scaled features, using the
# K-Means cluster ids as targets.
# NOTE(review): the fitted model is not used in the visible code. It presumably
# exists to assign new customers to a segment; confirm. Any such input would
# need to go through the same `scaler` first, since the model is fitted on
# scaled features.
cluster_targets = df['cluster']
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(scaled_features, cluster_targets)