# Customer Segmentation - Exploratory Data Analysis

This notebook performs exploratory data analysis on the customer segmentation dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## 1. Load Data

In [None]:
# Read the raw train and test CSV files into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw/Train.csv', '../data/raw/Test.csv')
)

# Report (rows, columns) for each split
print(
    f"Training data shape: {train_df.shape}",
    f"Test data shape: {test_df.shape}",
    sep="\n",
)

In [None]:
# Preview the first ten training records
train_df.iloc[:10]

## 2. Basic Information

In [None]:
# Column dtypes and non-null counts for both splits
rule = "=" * 50

print("Training Data Info:", rule, sep="\n")
train_df.info()

print("\n" + rule, "Test Data Info:", rule, sep="\n")
test_df.info()

In [None]:
# Summary statistics across every column, numeric and non-numeric alike
train_df.describe(include="all")

## 3. Missing Values Analysis

In [None]:
# Per-column null tallies for the training set, as counts and as a share of rows
missing_train = train_df.isna().sum()
missing_train_pct = missing_train / len(train_df) * 100

# Keep only the columns that actually have gaps, largest gap first
missing_df = (
    pd.DataFrame({'Missing Count': missing_train, 'Percentage': missing_train_pct})
    .loc[lambda tbl: tbl['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

print("Missing Values in Training Data:")
print(missing_df)

# Bar chart of the missing share per column (skipped when nothing is missing)
if not missing_df.empty:
    fig, ax = plt.subplots(figsize=(10, 6))
    missing_df['Percentage'].plot(kind='bar', color='coral', ax=ax)
    ax.set_title('Missing Values Percentage by Feature', fontsize=14, fontweight='bold')
    ax.set_xlabel('Features')
    ax.set_ylabel('Percentage (%)')
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()

## 4. Target Variable Analysis

In [None]:
# Class counts and percentage shares of the target column, ordered by segment label
seg_counts = train_df['Segmentation'].value_counts().sort_index()
seg_share_pct = (train_df['Segmentation'].value_counts(normalize=True) * 100).sort_index()

print("Segmentation Distribution:")
print(seg_counts)
print("\n" + "="*50)
print("Percentage Distribution:")
print(seg_share_pct)

# Side-by-side view: absolute counts on the left, proportions on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes

# Left panel: bar chart of counts
bar_ax.bar(seg_counts.index, seg_counts.values, color='steelblue', edgecolor='black')
bar_ax.set_title('Customer Segmentation Distribution', fontsize=14, fontweight='bold')
bar_ax.set(xlabel='Segment', ylabel='Count')
bar_ax.grid(axis='y', alpha=0.3)

# Right panel: pie chart, one Set3 colour per segment
colors = plt.cm.Set3(range(len(seg_counts)))
pie_ax.pie(
    seg_counts.values,
    labels=seg_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
pie_ax.set_title('Segment Proportions', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Categorical Features Analysis

In [None]:
# Categorical columns to profile
categorical_features = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']

# Size the subplot grid from the feature list (three panels per row) so that
# editing the list cannot overrun or under-fill a hard-coded 2x3 layout.
n_cols = 3
n_rows = max(1, -(-len(categorical_features) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 5 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, categorical_features):
    # value_counts() omits missing entries, so the bars show observed categories only
    counts = train_df[col].value_counts()
    positions = range(len(counts))
    ax.bar(positions, counts.values, color='teal', alpha=0.7)
    ax.set_title(f'{col} Distribution', fontsize=12, fontweight='bold')
    ax.set_xticks(positions)
    ax.set_xticklabels(counts.index, rotation=45, ha='right')
    ax.set_ylabel('Count')
    ax.grid(axis='y', alpha=0.3)

# Hide grid cells left over when the feature count is not a multiple of n_cols
for unused_ax in axes[len(categorical_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# List the observed categories per column. NaN is dropped from the value list
# so it agrees with the reported count (nunique() also ignores NaN); missing
# entries are reported on their own line instead.
for col in categorical_features:
    observed = train_df[col].dropna().unique()
    n_missing = train_df[col].isna().sum()
    print(f"\n{col}: {len(observed)} unique values")
    print(f"Values: {observed}")
    if n_missing > 0:
        print(f"Missing: {n_missing}")

## 6. Numerical Features Analysis

In [None]:
# Columns treated as numeric in the rest of the analysis
numerical_features = ['Age', 'Work_Experience', 'Family_Size']

# describe() summary restricted to those columns
numeric_summary = train_df[numerical_features].describe()
print("Numerical Features Statistics:")
print(numeric_summary)

In [None]:
# One histogram per numeric column; missing values are dropped before binning
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for idx, col in enumerate(numerical_features):
    ax = axes[idx]
    observed = train_df[col].dropna()
    ax.hist(observed, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'{col} Distribution', fontsize=12, fontweight='bold')
    ax.set(xlabel=col, ylabel='Frequency')
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Box plots of each numeric column, to eyeball spread and potential outliers
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for idx, col in enumerate(numerical_features):
    # Vertical is boxplot's default orientation, so the redundant `vert=True`
    # (slated for deprecation in recent Matplotlib in favour of `orientation`)
    # is omitted; the rendered figure is unchanged.
    axes[idx].boxplot(train_df[col].dropna(), patch_artist=True,
                      boxprops=dict(facecolor='lightgreen', alpha=0.7))
    axes[idx].set_title(f'{col} Box Plot', fontsize=12, fontweight='bold')
    axes[idx].set_ylabel(col)
    axes[idx].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Feature Correlations

In [None]:
# Pairwise correlations among the numeric columns
correlation_matrix = train_df[numerical_features].corr()

# Annotated heatmap with the colour scale centred on zero
fig, ax = plt.subplots(figsize=(8, 6))
heatmap_style = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(correlation_matrix, ax=ax, **heatmap_style)
ax.set_title('Correlation Matrix - Numerical Features', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

## 8. Segmentation vs Features Analysis

In [None]:
# Overlaid age histograms, one per segment.
# All segments share one set of bin edges computed from the full Age column:
# with a bare `bins=20`, each hist() call derives its edges from its own
# subset's range, so bars from different segments would not line up and the
# overlaid distributions could not be compared directly.
age_bin_edges = np.histogram_bin_edges(train_df['Age'].dropna(), bins=20)

plt.figure(figsize=(12, 6))
# dropna() keeps sorted() from failing if the target column contains NaN
for segment in sorted(train_df['Segmentation'].dropna().unique()):
    data = train_df.loc[train_df['Segmentation'] == segment, 'Age'].dropna()
    plt.hist(data, alpha=0.5, label=f'Segment {segment}', bins=age_bin_edges)

plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Age Distribution by Segment', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Mean of each numeric column within every segment
segment_stats = train_df.groupby('Segmentation')[numerical_features].mean()
print("Average Feature Values by Segment:")
print(segment_stats)

# Grouped bar chart of those means, one cluster of bars per segment
ax = segment_stats.plot(kind='bar', figsize=(12, 6), rot=0)
ax.set_title('Average Feature Values by Segment', fontsize=14, fontweight='bold')
ax.set_xlabel('Segment')
ax.set_ylabel('Average Value')
ax.legend(title='Features')
ax.grid(axis='y', alpha=0.3)
ax.figure.tight_layout()
plt.show()

## 9. Spending Score Analysis

In [None]:
# Row-normalised cross-tab: share (%) of each spending score within a segment
spending_segment = pd.crosstab(
    index=train_df['Segmentation'],
    columns=train_df['Spending_Score'],
    normalize='index',
).mul(100)

# Stacked bars, one bar per segment
ax = spending_segment.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='viridis')
ax.set_title('Spending Score Distribution by Segment (%)', fontsize=14, fontweight='bold')
ax.set_xlabel('Segment')
ax.set_ylabel('Percentage')
ax.legend(title='Spending Score')
plt.setp(ax.get_xticklabels(), rotation=0)
ax.figure.tight_layout()
plt.show()

## 10. Key Insights Summary

In [None]:
# Text recap of the headline numbers gathered in the cells above.
# Relies on train_df, test_df, numerical_features and categorical_features
# having been defined by earlier cells.
banner = "=" * 60
print(banner)
print("KEY INSIGHTS FROM EDA")
print(banner)

print("\n1. Dataset Size:")
print(f"   - Training samples: {len(train_df)}")
print(f"   - Test samples: {len(test_df)}")
# shape[1] counts every column, the 'Segmentation' target included, so it is
# reported as a column count rather than mislabelled as a feature count.
print(f"   - Total columns: {train_df.shape[1]} (including target 'Segmentation')")

segment_counts = train_df['Segmentation'].value_counts()
print("\n2. Target Variable:")
print(f"   - Number of segments: {train_df['Segmentation'].nunique()}")
# dropna() keeps sorted() from failing on NaN and matches nunique(), which ignores NaN
print(f"   - Segments: {sorted(train_df['Segmentation'].dropna().unique())}")
# Coefficient of variation of the class sizes: std / mean, as a percentage
print(f"   - Class balance: {(segment_counts.std() / segment_counts.mean() * 100):.2f}% variation")

print("\n3. Missing Values:")
# Null counts are computed once and reused for the total and per-column lines
missing_by_col = train_df.isnull().sum()
total_missing = missing_by_col.sum()
if total_missing > 0:
    print(f"   - Total missing values: {total_missing}")
    for col, missing_count in missing_by_col[missing_by_col > 0].items():
        print(f"   - {col}: {missing_count} ({missing_count/len(train_df)*100:.2f}%)")
else:
    print("   - No missing values found")

print("\n4. Feature Types:")
print(f"   - Numerical: {len(numerical_features)} features")
print(f"   - Categorical: {len(categorical_features)} features")

print("\n5. Next Steps:")
print("   - Handle missing values")
print("   - Encode categorical features")
print("   - Scale numerical features")
print("   - Design Auto Encoder architecture")
print("   - Train model and perform clustering")
print(banner)