# Customer Segmentation Analysis

## Project Overview
This project implements customer segmentation using machine learning clustering algorithms to identify distinct customer groups based on their behavior and demographics.

## 1. Data Understanding

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
import warnings
warnings.filterwarnings('ignore')

# Read the raw customer table from the working directory
df = pd.read_csv('customer_segmentation_data.csv')

# Quick orientation: table size and column names, then a heading for the preview
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    "\nFirst 5 rows:",
    sep="\n",
)
# Kept as the last expression so an interactive cell displays the preview
df.head()

In [None]:
# Data info and missing values
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

# Count of missing entries per column
print(f"\nMissing values:\n{df.isnull().sum()}")

# Summary statistics of the numeric columns; left as the last expression so an
# interactive cell displays the table
print("\nDescriptive statistics:")
df.describe()

## 2. Data Cleaning

In [None]:
# Work on a copy so the raw `df` stays untouched
df_clean = df.copy()
print(f"Original shape: {df_clean.shape}")

# Numeric columns used for outlier filtering and, later, as clustering features
numerical_cols = ['age', 'income', 'spending_score', 'membership_years', 'purchase_frequency', 'last_purchase_amount']

# Handle missing values: drop rows that lack any numeric feature.
# Such rows were previously discarded implicitly by the outlier filter (a
# comparison against NaN is False), which made them look like outliers in the
# reported row counts. Dropping them here keeps the two effects separate.
# NOTE(review): missing values in 'gender' / 'preferred_category' are not
# handled here -- confirm whether such rows should be dropped or imputed.
df_clean = df_clean.dropna(subset=numerical_cols)
print(f"After dropping missing values: {df_clean.shape}")

# Remove duplicates
df_clean = df_clean.drop_duplicates()
print(f"After removing duplicates: {df_clean.shape}")

# Handle outliers using IQR method (helper defined below)

def remove_outliers(df, columns):
    """Drop rows whose value falls outside the 1.5 * IQR fences of any listed column.

    Columns are processed one after another: the fences for each column are
    computed from the rows that survived the filtering of the previous
    columns, so the result can depend on the order of `columns`.

    Rows holding NaN in a listed column are dropped as well, because both
    fence comparisons evaluate to False for NaN.

    Args:
        df: DataFrame to filter. It is not modified.
        columns: Iterable of column names to check.

    Returns:
        The filtered DataFrame (the input object itself when `columns` is empty).
    """
    kept = df
    for name in columns:
        values = kept[name]
        first_quartile, third_quartile = values.quantile([0.25, 0.75])
        fence_width = 1.5 * (third_quartile - first_quartile)
        low = first_quartile - fence_width
        high = third_quartile + fence_width
        inside_fences = (values >= low) & (values <= high)
        kept = kept.loc[inside_fences]
    return kept

# Run the IQR filter over every numeric column, then report the remaining size
df_clean = remove_outliers(df_clean, columns=numerical_cols)
shape_after_outliers = df_clean.shape
print(f"After removing outliers: {shape_after_outliers}")

## 3. Feature Scaling

In [None]:
# Prepare features for clustering
df_features = df_clean.copy()

# Turn the two text columns into integer codes, one encoder per column.
# NOTE(review): integer codes impose an arbitrary order on
# 'preferred_category' that distance-based clustering will treat as
# meaningful -- consider one-hot encoding; confirm with the analysis owner.
le_gender, le_category = LabelEncoder(), LabelEncoder()
df_features['gender_encoded'] = le_gender.fit_transform(df_features['gender'])
df_features['category_encoded'] = le_category.fit_transform(df_features['preferred_category'])

# Columns handed to the clustering models: six numeric ones plus the two codes
features = [
    'age', 'income', 'spending_score', 'membership_years',
    'purchase_frequency', 'last_purchase_amount',
    'gender_encoded', 'category_encoded',
]
X = df_features[features]

# Standardise the features so no single column dominates the distances
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"Features shape: {X_scaled.shape}", f"Features: {features}", sep="\n")

## 4. Dimensionality Reduction (PCA)

In [None]:
# Project the scaled features onto two principal components for plotting
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Share of the total variance captured by each of the two components
variance_share = pca.explained_variance_ratio_
print(f"Explained variance ratio: {variance_share}")
print(f"Total explained variance: {sum(variance_share):.3f}")

# Scatter of the 2-D projection; each axis label shows its component's share
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
plt.figure(figsize=(10, 6))
plt.scatter(pc1, pc2, alpha=0.6)
plt.xlabel(f'PC1 ({variance_share[0]:.3f})')
plt.ylabel(f'PC2 ({variance_share[1]:.3f})')
plt.title('Customer Data - PCA Visualization')
plt.grid(True, alpha=0.3)
plt.show()

## 5. Model Building - Finding Optimal Clusters

In [None]:
# Sweep k = 2..10 and record two diagnostics per fit: inertia (for the elbow
# plot) and the silhouette score
k_range = range(2, 11)
inertias, silhouette_scores = [], []

for k in k_range:
    # Fixed seed so repeated runs give the same curves
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# Side-by-side plots: elbow curve on the left, silhouette curve on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
diagnostic_panels = (
    (ax1, inertias, 'bo-', 'Inertia', 'Elbow Method for Optimal k'),
    (ax2, silhouette_scores, 'ro-', 'Silhouette Score', 'Silhouette Score vs Number of Clusters'),
)
for axis, values, line_style, y_label, heading in diagnostic_panels:
    axis.plot(k_range, values, line_style)
    axis.set_xlabel('Number of Clusters (k)')
    axis.set_ylabel(y_label)
    axis.set_title(heading)
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# The chosen k is the one with the highest silhouette score; the elbow plot
# is only a visual aid and does not enter the selection
best_position = np.argmax(silhouette_scores)
optimal_k = k_range[best_position]
print(f"Optimal number of clusters: {optimal_k}")
print(f"Best silhouette score: {max(silhouette_scores):.3f}")

## 6. Apply Clustering Algorithms

In [None]:
# Fit three algorithms on the same scaled feature matrix.
# K-Means and the hierarchical model both use the k selected above.
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_scaled)

hierarchical = AgglomerativeClustering(n_clusters=optimal_k)
hierarchical_labels = hierarchical.fit_predict(X_scaled)

# DBSCAN chooses its own number of clusters from eps / min_samples
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)

# Silhouette score of each labelling
kmeans_silhouette = silhouette_score(X_scaled, kmeans_labels)
hierarchical_silhouette = silhouette_score(X_scaled, hierarchical_labels)

# The score needs at least two distinct labels; fall back to -1 when DBSCAN
# produced a single label. The noise label counts as a label in both this
# check and the score itself.
dbscan_label_count = len(set(dbscan_labels))
dbscan_silhouette = (
    silhouette_score(X_scaled, dbscan_labels) if dbscan_label_count > 1 else -1
)

score_report = (
    ("K-Means", kmeans_silhouette),
    ("Hierarchical", hierarchical_silhouette),
    ("DBSCAN", dbscan_silhouette),
)
for algorithm_name, score in score_report:
    print(f"{algorithm_name} Silhouette Score: {score:.3f}")
print(f"DBSCAN found {dbscan_label_count} clusters (including noise)")

## 7. Cluster Visualization

In [None]:
# 2x2 grid in PCA space: the unlabelled data plus one panel per algorithm
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]

# Top-left: raw projection without cluster colours
reference_ax = axes[0, 0]
reference_ax.scatter(pc1, pc2, alpha=0.6)
reference_ax.set_title('Original Data')
reference_ax.set_xlabel('PC1')
reference_ax.set_ylabel('PC2')

# Remaining panels: same points coloured by each algorithm's labels
labelled_panels = (
    (axes[0, 1], kmeans_labels, f'K-Means Clustering (k={optimal_k})'),
    (axes[1, 0], hierarchical_labels, 'Hierarchical Clustering'),
    (axes[1, 1], dbscan_labels, 'DBSCAN Clustering'),
)
for panel_ax, panel_labels, panel_title in labelled_panels:
    scatter = panel_ax.scatter(pc1, pc2, c=panel_labels, cmap='viridis', alpha=0.6)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('PC1')
    panel_ax.set_ylabel('PC2')
    plt.colorbar(scatter, ax=panel_ax)

plt.tight_layout()
plt.show()

## 8. Cluster Analysis

In [None]:
# Attach the K-Means labels to the cleaned, unscaled customer rows.
# NOTE(review): K-Means is assumed to be the best performer here; nothing in
# the code checks this against the other algorithms' silhouette scores.
df_analysis = df_clean.copy()
df_analysis['cluster'] = kmeans_labels


def _percent_by_cluster(column):
    """Row-normalised crosstab of cluster vs. `column`, expressed in percent."""
    return pd.crosstab(df_analysis['cluster'], df_analysis[column], normalize='index') * 100


# Mean and standard deviation of every numeric attribute, per cluster
summary_columns = [
    'age', 'income', 'spending_score',
    'membership_years', 'purchase_frequency', 'last_purchase_amount',
]
cluster_stats = (
    df_analysis.groupby('cluster')
    .agg({column: ['mean', 'std'] for column in summary_columns})
    .round(2)
)
print("Cluster Statistics:")
print(cluster_stats)

# Number of customers in each cluster, ordered by cluster id
cluster_sizes = df_analysis['cluster'].value_counts().sort_index()
print("\nCluster Sizes:")
print(cluster_sizes)

# Share of each gender within every cluster
gender_dist = _percent_by_cluster('gender')
print("\nGender Distribution by Cluster (%):")
print(gender_dist.round(1))

# Share of each preferred category within every cluster
category_dist = _percent_by_cluster('preferred_category')
print("\nCategory Preferences by Cluster (%):")
print(category_dist.round(1))

## 9. Cluster Profiling

In [None]:
# Create cluster profiles: one summary dict per cluster id, keyed 'Cluster_<id>'
cluster_profiles = {}

for cluster in range(optimal_k):
    cluster_data = df_analysis[df_analysis['cluster'] == cluster]

    profile = {
        'size': len(cluster_data),
        'avg_age': cluster_data['age'].mean(),
        'avg_income': cluster_data['income'].mean(),
        'avg_spending_score': cluster_data['spending_score'].mean(),
        'avg_membership_years': cluster_data['membership_years'].mean(),
        'avg_purchase_frequency': cluster_data['purchase_frequency'].mean(),
        'avg_last_purchase': cluster_data['last_purchase_amount'].mean(),
        # mode() returns every tied value; [0] takes the first of them
        'dominant_gender': cluster_data['gender'].mode()[0],
        'dominant_category': cluster_data['preferred_category'].mode()[0]
    }

    cluster_profiles[f'Cluster_{cluster}'] = profile

# Display profiles: one row per cluster, one column per profile field.
# The table is built from row records rather than pd.DataFrame(dict).T:
# transposing a frame whose columns mix numbers and strings yields all-object
# columns, and DataFrame.round() leaves object columns untouched, so the
# .round(2) below would silently do nothing.
profiles_df = pd.DataFrame(
    list(cluster_profiles.values()),
    index=list(cluster_profiles.keys()),
)
print("Cluster Profiles:")
print(profiles_df.round(2))

## 10. Advanced Visualizations

In [None]:
# 2x3 dashboard: pairwise scatter plots on top, per-cluster bar charts below
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Top row -- each entry: (axes, x column, y column, x label, y label, title)
scatter_panels = (
    (axes[0, 0], 'age', 'income',
     'Age', 'Income', 'Age vs Income by Cluster'),
    (axes[0, 1], 'spending_score', 'purchase_frequency',
     'Spending Score', 'Purchase Frequency', 'Spending Score vs Purchase Frequency'),
    (axes[0, 2], 'income', 'last_purchase_amount',
     'Income', 'Last Purchase Amount', 'Income vs Last Purchase Amount'),
)
for panel_ax, x_column, y_column, x_label, y_label, panel_title in scatter_panels:
    # One scatter call per cluster so each gets its own colour and legend entry
    for cluster in range(optimal_k):
        cluster_data = df_analysis[df_analysis['cluster'] == cluster]
        panel_ax.scatter(cluster_data[x_column], cluster_data[y_column],
                         label=f'Cluster {cluster}', alpha=0.6)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.legend()

# Bottom row -- per-cluster aggregates drawn as bars
avg_spending = df_analysis.groupby('cluster')['spending_score'].mean()
avg_income = df_analysis.groupby('cluster')['income'].mean()
bar_panels = (
    (axes[1, 0], cluster_sizes, 'Cluster Size Distribution', 'Number of Customers'),
    (axes[1, 1], avg_spending, 'Average Spending Score by Cluster', 'Average Spending Score'),
    (axes[1, 2], avg_income, 'Average Income by Cluster', 'Average Income'),
)
for panel_ax, series, panel_title, y_label in bar_panels:
    series.plot(kind='bar', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Cluster')
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

## 11. Save Results

In [None]:
# Write the three result tables to the current working directory

# Per-customer rows with their assigned cluster
df_analysis.to_csv('customer_segments.csv', index=False)

# One summary row per cluster; the index (cluster name) is kept in the file
profiles_df.to_csv('cluster_profiles.csv')

# K-Means centres, one row per cluster. The model was fitted on the scaled
# feature matrix, so these values are in standardised units, not raw ones.
centers_df = pd.DataFrame(kmeans.cluster_centers_, columns=features)
centers_df.to_csv('cluster_centers.csv', index=False)

print(
    "Results saved:",
    "- customer_segments.csv",
    "- cluster_profiles.csv",
    "- cluster_centers.csv",
    sep="\n",
)

## Summary

### Key Findings:
1. **Optimal Clusters**: Selected the number of clusters by maximizing the silhouette score over k = 2–10, with the elbow (inertia) plot as a visual cross-check
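2. **Algorithm Performance**: K-Means, hierarchical clustering, and DBSCAN were compared by silhouette score; K-Means was used for the final segmentation (check the scores printed in Section 6 to confirm it has the highest score on your data)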
3. **Customer Segments**: Identified distinct customer groups with different characteristics
4. **Business Insights**: Each cluster represents different customer behavior patterns

### Recommendations:
- Target marketing campaigns based on cluster characteristics
- Develop personalized product recommendations
- Optimize pricing strategies for different segments
- Focus retention efforts on high-value clusters