# CUSTO CLARITY - Customer Segmentation Analysis
## 01. Exploratory Data Analysis

**Author**: Neelanjan Chakraborty  
**Website**: [neelanjanchakraborty.in](https://neelanjanchakraborty.in/)  
**Project**: Customer Segmentation for Retail Strategy  

---

### 📋 Project Overview

This notebook performs comprehensive Exploratory Data Analysis (EDA) on the Mall Customer Segmentation dataset to understand customer demographics and spending patterns and to identify potential segments for targeted marketing strategies.

### 🎯 Objectives
- Understand the structure and quality of customer data
- Analyze customer demographics and spending behavior
- Identify patterns and correlations in the data
- Prepare insights for clustering analysis

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import sys
import os

# Suppress warnings for cleaner output
# NOTE: 'ignore' with no category silences every warning for the rest of the
# session, not only cosmetic ones.
warnings.filterwarnings('ignore')

# Add src directory to path for importing custom modules
# The entry is built from the current working directory, so a 'src' folder is
# expected to be a sibling of whatever directory the kernel was started in.
sys.path.append(os.path.join(os.getcwd(), '..', 'src'))

# Import custom modules
# These two imports depend on the sys.path entry appended just above and must
# therefore stay after it.
from data_loader import DataLoader
from visualizer import CustomerVisualizationSuite

# Set plot style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Configure display options
# max_columns=None removes the column limit; at most 100 rows are shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Report the library versions in use so a run can be reproduced later.
print("✅ Libraries imported successfully!")
print(f"📊 Pandas version: {pd.__version__}")
print(f"🔢 NumPy version: {np.__version__}")
print(f"📈 Matplotlib version: {plt.matplotlib.__version__}")
print(f"🎨 Seaborn version: {sns.__version__}")
print("\n🚀 Ready for Customer Segmentation Analysis!")

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read the customer data through the project's DataLoader
loader = DataLoader()
df = loader.load_dataset()

# Report shape and column names in one banner (one line per argument)
print(
    "📊 DATASET LOADED SUCCESSFULLY",
    "=" * 50,
    f"Dataset Shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    "\n📋 First 5 rows of the dataset:",
    sep="\n",
)
# Kept as the last expression so the notebook renders the preview table
df.head()

In [None]:
# Dataset Information Analysis
# Prints a structural and quality overview of `df`: dimensions, memory use,
# dtypes, summary statistics, missing values, duplicates and per-column
# cardinality.
print("🔍 COMPREHENSIVE DATASET ANALYSIS")
print("=" * 50)

# Basic information
n_rows, n_cols = df.shape
print(f"📊 Dataset Dimensions: {n_rows} rows × {n_cols} columns")
print(f"💾 Memory Usage: {df.memory_usage(deep=True).sum() / 1024:.2f} KB")

print("\n📈 Column Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()

print("\n📊 Statistical Summary:")
print(df.describe(include='all'))

print("\n🔍 Data Types:")
for col, dtype in df.dtypes.items():
    print(f"  {col}: {dtype}")

print("\n❓ Missing Values:")
missing_values = df.isnull().sum()
if missing_values.sum() == 0:
    print("  ✅ No missing values found!")
else:
    # Only columns that actually contain gaps are listed.
    for col, missing in missing_values[missing_values > 0].items():
        print(f"  {col}: {missing} ({missing/len(df)*100:.2f}%)")

print("\n🔄 Duplicate Rows:")
duplicates = df.duplicated().sum()
if duplicates == 0:
    print("  ✅ No duplicate rows found!")
else:
    print(f"  ⚠️ Found {duplicates} duplicate rows")

print("\n🎯 Unique Values per Column:")
for col in df.columns:
    column = df[col]
    unique_count = column.nunique()
    print(f"  {col}: {unique_count} unique values")
    if pd.api.types.is_numeric_dtype(column):
        # Covers every numeric width (int32, float32, nullable ints, ...),
        # not only the 'int64'/'float64' names the check used to hard-code.
        print(f"    Range: {column.min()} - {column.max()}")
    elif unique_count <= 10:
        # Low-cardinality non-numeric column: listing the values is readable.
        print(f"    Values: {list(column.unique())}")
    print()

In [None]:
# Build the plotting helper; later cells reuse `viz_suite`
viz_suite = CustomerVisualizationSuite(figsize=(15, 10))

print("🎨 CREATING COMPREHENSIVE DATA VISUALIZATIONS", "=" * 50, sep="\n")

# Render the overview figure for the full dataframe
viz_suite.plot_data_overview(df)

print("✅ Data overview visualization completed!")

In [None]:
# Correlation Analysis
# Plots the correlation matrix, prints it, and ranks every pair of numeric
# features by the absolute value of their correlation coefficient.
from itertools import combinations

print("🔗 CORRELATION ANALYSIS")
print("=" * 30)

# Create correlation matrix
viz_suite.plot_correlation_matrix(df)

# Calculate and display correlation values.
# CustomerID is an identifier, not a feature, so it is excluded when present.
numeric_df = df.select_dtypes(include=[np.number])
numeric_df = numeric_df.drop(columns='CustomerID', errors='ignore')

correlation_matrix = numeric_df.corr()
print("📊 Correlation Matrix:")
print(correlation_matrix.round(3))


def _describe_correlation(corr):
    """Return a human-readable '<strength> <direction>' label for `corr`.

    A coefficient of exactly zero has no direction, so it gets its own label
    instead of being reported as "negative".
    """
    if corr == 0:
        return "no linear correlation"
    magnitude = abs(corr)
    if magnitude > 0.8:
        strength = "very strong"
    elif magnitude > 0.6:
        strength = "strong"
    elif magnitude > 0.4:
        strength = "moderate"
    else:
        strength = "weak"
    direction = "positive" if corr > 0 else "negative"
    return f"{strength} {direction}"


# Find strongest correlations: one entry per unordered column pair, stored as
# (col1, col2, |r|, r).
correlations = []
for (i, col1), (j, col2) in combinations(enumerate(correlation_matrix.columns), 2):
    corr_value = correlation_matrix.iloc[i, j]
    if pd.isna(corr_value):
        # A constant column yields an undefined (NaN) coefficient; NaN cannot
        # be ordered reliably, so such pairs are left out of the ranking.
        continue
    correlations.append((col1, col2, abs(corr_value), corr_value))

correlations.sort(key=lambda item: item[2], reverse=True)

print("\n🔥 Strongest Correlations:")
for col1, col2, _abs_corr, corr in correlations[:5]:
    print(f"  {col1} ↔ {col2}: {corr:.3f} ({_describe_correlation(corr)})")

In [None]:
# Outlier Analysis
# Draws the outlier figure, then reports per-column outliers found by the
# preprocessor's IQR-based detector.
print("🚨 OUTLIER DETECTION ANALYSIS", "=" * 35, sep="\n")

# Create outlier visualization
viz_suite.plot_outlier_analysis(df)

# Detect outliers using IQR method
from preprocessor import CustomerDataPreprocessor
preprocessor = CustomerDataPreprocessor()

numeric_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
outliers = preprocessor.detect_outliers(df, numeric_cols, method='iqr', threshold=1.5)

print("📊 Outlier Detection Results (IQR Method):")
for col, outlier_indices in outliers.items():
    n_outliers = len(outlier_indices)

    # Guard clause: nothing to report for a column without outliers
    if n_outliers == 0:
        print(f"\n✅ {col}: No outliers detected")
        continue

    print(f"\n📍 {col}:")
    print(f"  • Number of outliers: {n_outliers}")
    print(f"  • Percentage of data: {n_outliers/len(df)*100:.2f}%")

    outlier_values = df.loc[outlier_indices, col].values
    lowest, highest = outlier_values.min(), outlier_values.max()
    print(f"  • Outlier range: {lowest:.1f} - {highest:.1f}")

    first_quartile = df[col].quantile(0.25)
    third_quartile = df[col].quantile(0.75)
    print(f"  • Normal range (Q1-Q3): {first_quartile:.1f} - {third_quartile:.1f}")

In [None]:
# Customer Demographics Analysis
# Summarises age, gender, income and spending score, and bins the three
# numeric features into labelled groups. `age_groups`, `income_groups` and
# `spending_groups` stay at module level because a later cell reads them.


def _print_distribution(counts, total):
    """Print one '• label: n customers (p%)' line per entry of `counts`."""
    for label, count in counts.items():
        percentage = count / total * 100
        print(f"  • {label}: {count} customers ({percentage:.1f}%)")


print("👥 CUSTOMER DEMOGRAPHICS ANALYSIS")
print("=" * 40)

total_customers = len(df)

# Age Analysis
print("🎂 Age Demographics:")
print(f"  • Average Age: {df['Age'].mean():.1f} years")
print(f"  • Age Range: {df['Age'].min()} - {df['Age'].max()} years")
print(f"  • Most Common Age: {df['Age'].mode().iloc[0]} years")

# pd.cut intervals are right-inclusive by default, e.g. age 25 lands in the
# first group and age 50 in 'Middle-aged (36-50)'.
age_groups = pd.cut(df['Age'], bins=[0, 25, 35, 50, 100], labels=['Young (18-25)', 'Adult (26-35)', 'Middle-aged (36-50)', 'Senior (50+)'])
# sort=False keeps the groups in bin order (youngest to oldest) rather than
# reordering them by frequency, so the listing reads as a distribution.
age_distribution = age_groups.value_counts(sort=False)
print("\n📊 Age Group Distribution:")
_print_distribution(age_distribution, total_customers)

# Gender Analysis
# Gender is nominal, so the default most-frequent-first order is kept here.
print("\n⚧ Gender Demographics:")
gender_distribution = df['Gender'].value_counts()
_print_distribution(gender_distribution, total_customers)

# Income Analysis
print("\n💰 Income Demographics:")
print(f"  • Average Income: ${df['Annual Income (k$)'].mean():.1f}k")
print(f"  • Income Range: ${df['Annual Income (k$)'].min()}k - ${df['Annual Income (k$)'].max()}k")
print(f"  • Median Income: ${df['Annual Income (k$)'].median():.1f}k")

income_groups = pd.cut(df['Annual Income (k$)'], bins=[0, 40, 70, 200], labels=['Low Income (<$40k)', 'Medium Income ($40-70k)', 'High Income (>$70k)'])
income_distribution = income_groups.value_counts(sort=False)
print("\n📊 Income Group Distribution:")
_print_distribution(income_distribution, total_customers)

# Spending Analysis
print("\n💳 Spending Score Demographics:")
print(f"  • Average Spending Score: {df['Spending Score (1-100)'].mean():.1f}/100")
print(f"  • Spending Range: {df['Spending Score (1-100)'].min()} - {df['Spending Score (1-100)'].max()}")
print(f"  • Median Spending Score: {df['Spending Score (1-100)'].median():.1f}/100")

spending_groups = pd.cut(df['Spending Score (1-100)'], bins=[0, 35, 65, 100], labels=['Low Spender (1-35)', 'Medium Spender (36-65)', 'High Spender (66-100)'])
spending_distribution = spending_groups.value_counts(sort=False)
print("\n📊 Spending Group Distribution:")
_print_distribution(spending_distribution, total_customers)

In [None]:
# Advanced Customer Behavior Analysis
# Compares the numeric features across genders, cross-tabulates the binned
# groups, and opens an interactive 3D scatter of the three features.
print("🎯 ADVANCED CUSTOMER BEHAVIOR ANALYSIS", "=" * 45, sep="\n")

# Gender-based Analysis
print("⚧ Gender-based Spending Patterns:")
behavior_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
gender_stats = (
    df.groupby('Gender')[behavior_columns]
    .agg(['mean', 'std', 'median'])
    .round(2)
)
print(gender_stats)

# One boxplot per feature, laid out side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('Customer Behavior by Gender\nCUSTO CLARITY - by Neelanjan Chakraborty', fontsize=16, fontweight='bold')

panel_titles = [
    'Age Distribution by Gender',
    'Income Distribution by Gender',
    'Spending Score by Gender',
]
for panel, feature, panel_title in zip(axes, behavior_columns, panel_titles):
    sns.boxplot(data=df, x='Gender', y=feature, ax=panel)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

# Age vs Income vs Spending Analysis
print("\n🔍 Age-Income-Spending Relationships:")

# Attach the group labels computed in the demographics cell to a copy of df
df_analysis = df.assign(
    Age_Group=age_groups,
    Income_Group=income_groups,
    Spending_Group=spending_groups,
)

# Cross-tabulation analysis (margins=True adds the row/column totals)
print("\n📊 Age Group vs Income Group:")
age_income_crosstab = pd.crosstab(
    df_analysis['Age_Group'], df_analysis['Income_Group'], margins=True
)
print(age_income_crosstab)

print("\n📊 Income Group vs Spending Group:")
income_spending_crosstab = pd.crosstab(
    df_analysis['Income_Group'], df_analysis['Spending_Group'], margins=True
)
print(income_spending_crosstab)

# Interactive 3D Scatter Plot
print("\n🎨 Creating Interactive 3D Customer Visualization...")
axis_labels = {
    'Annual Income (k$)': 'Annual Income ($k)',
    'Spending Score (1-100)': 'Spending Score',
}
fig_3d = px.scatter_3d(
    df,
    x='Age',
    y='Annual Income (k$)',
    z='Spending Score (1-100)',
    color='Gender',
    size_max=18,
    size='Annual Income (k$)',
    title='3D Customer Segmentation View<br>CUSTO CLARITY - by Neelanjan Chakraborty',
    labels=axis_labels,
)

scene_titles = {
    'xaxis_title': 'Age (years)',
    'yaxis_title': 'Annual Income ($k)',
    'zaxis_title': 'Spending Score (1-100)',
}
fig_3d.update_layout(scene=scene_titles)

fig_3d.show()

print("✅ Advanced customer behavior analysis completed!")

In [None]:
# Data Quality Assessment
# Reports completeness and duplicates, describes the shape of each numeric
# distribution, and counts customers in a few threshold-based segments.
print("✅ DATA QUALITY ASSESSMENT", "=" * 30, sep="\n")

# Calculate data quality metrics
total_records = len(df)
complete_records = df.dropna().shape[0]
data_completeness = complete_records / total_records * 100

print("📊 Data Quality Metrics:")
print(f"  • Total Records: {total_records:,}")
print(f"  • Complete Records: {complete_records:,}")
print(f"  • Data Completeness: {data_completeness:.1f}%")
print(f"  • Duplicate Records: {df.duplicated().sum()}")

# Feature distribution assessment
print("\n📈 Feature Distribution Assessment:")
for col in ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']:
    skewness = df[col].skew()
    kurtosis = df[col].kurtosis()

    # Bucket the absolute skewness into three descriptive labels
    skew_magnitude = abs(skewness)
    if skew_magnitude < 0.5:
        skew_interpretation = "normal"
    elif skew_magnitude < 1:
        skew_interpretation = "moderately skewed"
    else:
        skew_interpretation = "highly skewed"

    print(f"  • {col}:")
    print(f"    - Skewness: {skewness:.3f} ({skew_interpretation})")
    print(f"    - Kurtosis: {kurtosis:.3f}")

print("\n🎯 KEY INSIGHTS FROM EDA:")
print("="*30)

# Boolean masks shared by the headline counts and the segments below
is_high_spend = df['Spending Score (1-100)'] > 70
is_high_income = df['Annual Income (k$)'] > 70
is_low_spend = df['Spending Score (1-100)'] < 40
is_low_income = df['Annual Income (k$)'] < 40

# Calculate some key insights
high_spenders = len(df[is_high_spend])
high_income = len(df[is_high_income])
young_customers = len(df[df['Age'] < 30])
female_customers = len(df[df['Gender'] == 'Female'])

headline_counts = [
    (high_spenders, "are high spenders (>70 score)"),
    (high_income, "have high income (>$70k)"),
    (young_customers, "are young (<30 years)"),
    (female_customers, "are female"),
]
for n_customers, description in headline_counts:
    print(f"• {n_customers} customers ({n_customers/total_records*100:.1f}%) {description}")

# Identify potential customer segments (preliminary)
print("\n🔍 Preliminary Segment Identification:")

high_income_high_spend = df[is_high_income & is_high_spend]
high_income_low_spend = df[is_high_income & is_low_spend]
low_income_high_spend = df[is_low_income & is_high_spend]
low_income_low_spend = df[is_low_income & is_low_spend]

preliminary_segments = [
    ("Premium Customers (High Income + High Spending)", high_income_high_spend),
    ("Conservative Affluent (High Income + Low Spending)", high_income_low_spend),
    ("Aspirational Spenders (Low Income + High Spending)", low_income_high_spend),
    ("Budget Conscious (Low Income + Low Spending)", low_income_low_spend),
]
for segment_name, members in preliminary_segments:
    print(f"• {segment_name}: {len(members)} ({len(members)/total_records*100:.1f}%)")

print("\n🚀 Ready for Clustering Analysis!")
print("Next steps: Data preprocessing and dimensionality reduction")

# CUSTO CLARITY 🛍️📊
## Customer Segmentation Analysis - Data Exploration

**Author:** Neelanjan Chakraborty  
**Website:** [neelanjanchakraborty.in](https://neelanjanchakraborty.in/)  
**Project:** Advanced Customer Segmentation using Machine Learning  
**Date:** July 2025

---

### 📋 Project Overview

**CUSTO CLARITY** is a comprehensive data science project that leverages machine learning clustering algorithms to identify distinct customer segments from retail data. This analysis will help businesses understand their customer base and develop targeted marketing strategies.

#### 🎯 Key Objectives:
- **Customer Segmentation**: Identify distinct customer groups using KMeans and DBSCAN
- **Pattern Discovery**: Uncover hidden patterns in customer behavior through EDA
- **Marketing Insights**: Provide actionable insights for targeted campaigns
- **Strategic Planning**: Guide product strategy and customer retention efforts

#### 🔬 Methodology:
1. **Exploratory Data Analysis (EDA)** - Understanding data patterns and distributions
2. **Data Preprocessing** - Cleaning and preparing data for analysis
3. **Dimensionality Reduction** - PCA and t-SNE for visualization
4. **Clustering Analysis** - KMeans and DBSCAN implementation
5. **Business Insights** - Translating results into actionable strategies

---

### 📊 Dataset Information

We'll be analyzing the **Mall Customer Segmentation Dataset**, which contains:
- Customer demographics (Age, Gender)
- Annual Income information  
- Spending Score metrics
- 200 customer records for comprehensive analysis

Let's begin our exploration! 🚀

## 1. Project Setup and Library Imports 📚

Let's start by importing all the necessary libraries for our customer segmentation analysis.

In [None]:
# Core data manipulation and analysis libraries
import pandas as pd
import numpy as np
import warnings

# Machine learning libraries
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# System and file handling
import os
import sys

# Configure display settings
# NOTE: 'ignore' with no category silences every warning for the session.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Add project source to path
# Relative entry: it is resolved against the process's current working
# directory, so the kernel is expected to run one level below the project root.
sys.path.append('../src')

# Import custom modules
# The project modules are optional here: an ImportError is reported and the
# notebook continues. Any name whose import did not run stays undefined, so
# later cells that use it must cope with that.
try:
    from data_loader import DataLoader
    from preprocessor import CustomerDataPreprocessor
    from clustering import CustomerClusteringAnalyzer
    from visualizer import CustomerVisualizationSuite
    print("✅ Custom modules imported successfully!")
except ImportError as e:
    print(f"⚠️ Could not import custom modules: {e}")
    print("Running without custom modules...")

# Environment banner; printed whether or not the custom imports succeeded.
print("🎯 CUSTO CLARITY - Customer Segmentation Analysis")
print("📊 All libraries imported successfully!")
print(f"🐍 Python version: {sys.version}")
print(f"📈 Pandas version: {pd.__version__}")
print(f"🔬 NumPy version: {np.__version__}")

## 2. Data Loading and Initial Overview 📊

Now let's load our customer dataset and get an initial understanding of the data structure.

In [None]:
# Load the customer dataset
# Tries the project's DataLoader first and falls back to a reproducible
# synthetic dataset with the same column names when that fails.
try:
    # Using custom data loader
    data_loader = DataLoader(data_dir="../data")
    df = data_loader.load_dataset()
    print("✅ Dataset loaded using custom DataLoader!")
except Exception as load_error:
    # `Exception` (not a bare `except:`) keeps KeyboardInterrupt/SystemExit
    # working. It still covers NameError, which is what is raised here when the
    # optional DataLoader import did not succeed.
    # The reason is printed so synthetic data is never used silently.
    print(f"⚠️ DataLoader unavailable ({type(load_error).__name__}: {load_error})")

    # Fallback: Create sample data
    print("📂 Creating sample Mall Customer Segmentation dataset...")

    # Generate sample data similar to the original dataset.
    # A fixed seed makes the fallback identical on every run.
    np.random.seed(42)
    n_customers = 200

    data = {
        'CustomerID': range(1, n_customers + 1),
        'Gender': np.random.choice(['Male', 'Female'], n_customers, p=[0.44, 0.56]),
        'Age': np.random.normal(38, 12, n_customers).astype(int),
        'Annual Income (k$)': np.random.lognormal(3.7, 0.4, n_customers).astype(int),
        'Spending Score (1-100)': np.random.randint(1, 101, n_customers)
    }

    # Ensure realistic ranges: the unbounded normal/lognormal draws are
    # clamped to 18-70 years and 15-137 k$.
    data['Age'] = np.clip(data['Age'], 18, 70)
    data['Annual Income (k$)'] = np.clip(data['Annual Income (k$)'], 15, 137)

    df = pd.DataFrame(data)
    print("✅ Sample dataset created successfully!")

# Display basic dataset information
print("\n" + "="*60)
print("📊 CUSTO CLARITY - DATASET OVERVIEW")
print("="*60)

print(f"\n🔍 Dataset Shape: {df.shape}")
print(f"👥 Number of customers: {df.shape[0]}")
print(f"📈 Number of features: {df.shape[1]}")

print("\n📋 Column Names:")
for i, col in enumerate(df.columns, 1):
    print(f"   {i}. {col}")

print("\n📊 Data Types:")
print(df.dtypes)

print("\n🔍 First 5 rows:")
# Kept as the last expression so the notebook renders the preview table
df.head()

### 2.1 Data Quality Assessment 🔍

Let's examine the data quality, check for missing values, and understand the overall health of our dataset.

In [None]:
# Comprehensive data quality assessment
# Reports missing values, duplicate rows, dtypes, summary statistics,
# categorical value counts and memory use, then prints a verdict that
# reflects what the checks actually found.
print("🔍 DATA QUALITY ASSESSMENT")
print("="*50)

# Check for missing values
print("\n📊 Missing Values Analysis:")
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing Count': missing_values.values,
    'Missing Percentage': missing_percentage.values
})

print(missing_df)

total_missing = missing_values.sum()
if total_missing == 0:
    print("✅ No missing values found - excellent data quality!")
else:
    print(f"⚠️ Total missing values: {total_missing}")

# Check for duplicates
duplicates = df.duplicated().sum()
print(f"\n🔄 Duplicate rows: {duplicates}")
if duplicates == 0:
    print("✅ No duplicate rows found!")

# Check data types and potential issues
print("\n📋 Data Types Overview:")
print(df.dtypes)

# Basic statistical summary
print("\n📈 Statistical Summary:")
print(df.describe())

# Check for unique values in categorical columns
print("\n🏷️ Categorical Variables Analysis:")
categorical_cols = df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    unique_count = df[col].nunique()
    print(f"\n{col}:")
    print(f"  - Unique values: {unique_count}")
    print("  - Value counts:")
    print(f"    {df[col].value_counts().to_dict()}")

# Memory usage
print("\n💾 Memory Usage:")
memory_usage = df.memory_usage(deep=True)
total_memory = memory_usage.sum()
print(f"Total memory usage: {total_memory / 1024:.2f} KB")

# The verdict is derived from the checks above. It used to claim a clean
# dataset unconditionally, even when missing values or duplicates were found.
is_clean = total_missing == 0 and duplicates == 0

print("\n✨ Data Quality Summary:")
if is_clean:
    print("   ✅ Dataset is clean and ready for analysis!")
else:
    print("   ⚠️ Dataset needs cleaning before analysis!")
print(f"   📊 {len(df)} customers with {len(df.columns)} features")
if is_clean:
    print("   🎯 No missing values or duplicates detected")
else:
    print(f"   🎯 {total_missing} missing values and {duplicates} duplicate rows detected")

## 3. Exploratory Data Analysis (EDA) 🔍📊

Now let's dive deep into understanding our customer data through comprehensive exploratory analysis. We'll examine distributions, relationships, and patterns that will guide our segmentation strategy.

In [None]:
# Create comprehensive distribution analysis
# Builds one 2x3 figure: the top row holds a histogram (with KDE) for each
# numeric feature, the bottom row a gender pie chart and two scatter plots.
# Every panel is guarded by a column-presence check, so a missing column
# leaves its axes empty instead of raising.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('CUSTO CLARITY - Customer Data Distribution Analysis\nby Neelanjan Chakraborty', 
             fontsize=16, fontweight='bold')

# Age distribution
if 'Age' in df.columns:
    sns.histplot(data=df, x='Age', bins=20, kde=True, ax=axes[0, 0], color='skyblue')
    axes[0, 0].set_title('Age Distribution', fontsize=12, fontweight='bold')
    axes[0, 0].set_xlabel('Age')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].grid(True, alpha=0.3)
    
    # Add statistics
    # Dashed vertical lines mark the mean (red) and median (orange).
    mean_age = df['Age'].mean()
    median_age = df['Age'].median()
    axes[0, 0].axvline(mean_age, color='red', linestyle='--', label=f'Mean: {mean_age:.1f}')
    axes[0, 0].axvline(median_age, color='orange', linestyle='--', label=f'Median: {median_age:.1f}')
    axes[0, 0].legend()

# Annual Income distribution
if 'Annual Income (k$)' in df.columns:
    sns.histplot(data=df, x='Annual Income (k$)', bins=20, kde=True, ax=axes[0, 1], color='lightgreen')
    axes[0, 1].set_title('Annual Income Distribution', fontsize=12, fontweight='bold')
    axes[0, 1].set_xlabel('Annual Income (k$)')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].grid(True, alpha=0.3)
    
    # Add statistics
    mean_income = df['Annual Income (k$)'].mean()
    median_income = df['Annual Income (k$)'].median()
    axes[0, 1].axvline(mean_income, color='red', linestyle='--', label=f'Mean: {mean_income:.1f}')
    axes[0, 1].axvline(median_income, color='orange', linestyle='--', label=f'Median: {median_income:.1f}')
    axes[0, 1].legend()

# Spending Score distribution
if 'Spending Score (1-100)' in df.columns:
    sns.histplot(data=df, x='Spending Score (1-100)', bins=20, kde=True, ax=axes[0, 2], color='lightcoral')
    axes[0, 2].set_title('Spending Score Distribution', fontsize=12, fontweight='bold')
    axes[0, 2].set_xlabel('Spending Score (1-100)')
    axes[0, 2].set_ylabel('Frequency')
    axes[0, 2].grid(True, alpha=0.3)
    
    # Add statistics
    mean_spending = df['Spending Score (1-100)'].mean()
    median_spending = df['Spending Score (1-100)'].median()
    axes[0, 2].axvline(mean_spending, color='red', linestyle='--', label=f'Mean: {mean_spending:.1f}')
    axes[0, 2].axvline(median_spending, color='orange', linestyle='--', label=f'Median: {median_spending:.1f}')
    axes[0, 2].legend()

# Gender distribution
if 'Gender' in df.columns:
    gender_counts = df['Gender'].value_counts()
    # NOTE(review): only two colors are listed; presumably Gender has exactly
    # two distinct values - confirm against the data.
    colors = ['lightblue', 'lightpink']
    wedges, texts, autotexts = axes[1, 0].pie(gender_counts.values, labels=gender_counts.index, 
                                            autopct='%1.1f%%', colors=colors, startangle=90)
    axes[1, 0].set_title('Gender Distribution', fontsize=12, fontweight='bold')
    
    # Enhance pie chart
    # Restyles the percentage labels drawn inside the wedges.
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')

# Age vs Income scatter plot
# Points are colored by Gender when that column exists, otherwise uncolored.
if 'Age' in df.columns and 'Annual Income (k$)' in df.columns:
    sns.scatterplot(data=df, x='Age', y='Annual Income (k$)', 
                   hue='Gender' if 'Gender' in df.columns else None, 
                   ax=axes[1, 1], alpha=0.7, s=60)
    axes[1, 1].set_title('Age vs Annual Income', fontsize=12, fontweight='bold')
    axes[1, 1].grid(True, alpha=0.3)

# Income vs Spending Score scatter plot
if 'Annual Income (k$)' in df.columns and 'Spending Score (1-100)' in df.columns:
    sns.scatterplot(data=df, x='Annual Income (k$)', y='Spending Score (1-100)',
                   hue='Gender' if 'Gender' in df.columns else None, 
                   ax=axes[1, 2], alpha=0.7, s=60)
    axes[1, 2].set_title('Income vs Spending Score', fontsize=12, fontweight='bold')
    axes[1, 2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print summary statistics
# Text companion to the distribution figure: range, mean, median and standard
# deviation per numeric feature, followed by the gender split.
print("\n📊 DISTRIBUTION ANALYSIS SUMMARY", "="*50, sep="\n")


def _print_numeric_summary(values, heading, range_template, stat_template):
    """Print the range/mean/median/std lines for one numeric column."""
    print(heading)
    print("   Range: " + range_template.format(lo=values.min(), hi=values.max()))
    print("   Mean: " + stat_template.format(value=values.mean()))
    print("   Median: " + stat_template.format(value=values.median()))
    print("   Standard Deviation: " + stat_template.format(value=values.std()))


# (column, heading, range template, single-statistic template)
numeric_summaries = [
    ('Age', "\n👥 Age Analysis:", "{lo} - {hi} years", "{value:.1f} years"),
    ('Annual Income (k$)', "\n💰 Income Analysis:", "${lo}k - ${hi}k", "${value:.1f}k"),
    ('Spending Score (1-100)', "\n🛍️ Spending Score Analysis:", "{lo} - {hi}", "{value:.1f}"),
]
for column_name, heading, range_template, stat_template in numeric_summaries:
    # Columns that are absent are skipped silently
    if column_name not in df.columns:
        continue
    _print_numeric_summary(df[column_name], heading, range_template, stat_template)

if 'Gender' in df.columns:
    print("\n⚧ Gender Distribution:")
    for gender, count in df['Gender'].value_counts().items():
        percentage = (count / len(df)) * 100
        print(f"   {gender}: {count} customers ({percentage:.1f}%)")