# Customer Risk Segmentation - Exploratory Data Analysis
## Understanding Customer Behavioral and Transactional Patterns

### 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category so cell output stays compact
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn 'whitegrid' theme and a (10, 6) default figure size
plt.rcParams['figure.figsize'] = (10, 6)
sns.set_style(style='whitegrid')

print("Libraries imported successfully")

### 2. Load Data

In [None]:
# Read the customer table from disk
DATA_PATH = "data/customers.csv"
df = pd.read_csv(DATA_PATH)

# Report the table dimensions (one row per customer, one column per feature)
n_customers, n_features = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Total customers: {n_customers:,}")
print(f"Total features: {n_features}")

### 3. Initial Data Inspection

In [None]:
# Preview the top of the table (the last expression is rendered by the notebook)
print("First 5 rows of the dataset:")
df.head(5)

In [None]:
# Preview the bottom of the table (the last expression is rendered by the notebook)
print("Last 5 rows of the dataset:")
df.tail(5)

### 4. Data Types and Structure

In [None]:
# Dataset information: column dtypes, non-null counts and memory footprint
print("Dataset Information:")
print("="*60)
df.info()

print("\n" + "="*60)
# deep=True makes pandas inspect the contents of object-dtype columns
# (e.g. strings); the shallow default does not, so it underreports the
# footprint of tables that contain text columns.
total_bytes = df.memory_usage(deep=True).sum()
print(f"Memory usage: {total_bytes / 1024**2:.2f} MB")

### 5. Statistical Summary

In [None]:
# Descriptive statistics from DataFrame.describe() with its default settings
print("Statistical Summary:")
summary_stats = df.describe()
summary_stats

In [None]:
# Per-column statistics for every numeric column, driven by a label/method table
print("\nAdditional Statistics:")
print("="*60)

# (printed label, name of the Series method that computes the statistic)
stat_table = (
    ("Mean", "mean"),
    ("Median", "median"),
    ("Std Dev", "std"),
    ("Min", "min"),
    ("Max", "max"),
)

numeric_columns = df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    series = df[col]
    print(f"{col}:")
    for label, method_name in stat_table:
        value = getattr(series, method_name)()
        print(f"  {label}: {value:.2f}")
    print("-" * 40)

### 6. Missing Values Analysis

In [None]:
# Check for missing values
# The null mask is built once and reused; `missing` and `missing_df` are
# read again by the visualization cell that follows.
null_mask = df.isnull()
missing = null_mask.sum()
missing_pct = (missing / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct
}).sort_values('Missing Count', ascending=False)

print("Missing Values Summary:")
print("="*60)

total_missing = missing.sum()
if total_missing == 0:
    # Nothing to tabulate: skip the per-column table instead of printing
    # pandas' "Empty DataFrame" representation.
    print("\n✅ No missing values found!")
else:
    # Show only the columns that actually contain missing values
    print(missing_df[missing_df['Missing Count'] > 0])
    print(f"\n⚠️ Total missing values: {total_missing}")

In [None]:
# Bar chart of the missing-value percentage; drawn only when something is missing
if missing.sum() > 0:
    # Keep just the columns that have at least one missing value
    has_missing = missing_df['Missing Count'] > 0
    pct_by_column = missing_df.loc[has_missing, 'Percentage']

    plt.figure(figsize=(10, 4))
    pct_by_column.plot(kind='bar')
    plt.title('Missing Values by Column (%)')
    plt.xlabel('Columns')
    plt.ylabel('Percentage Missing')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

### 7. Distribution Analysis
For each key numerical feature, plot a histogram with a KDE overlay next to a box plot, then report its skewness and kurtosis.

In [None]:
# Numerical features used by the distribution, correlation, pairplot and
# outlier cells below
num_cols = [
    'transaction_count',
    'avg_transaction_value',
    'total_spend',
    'account_age_days',
]

print(f"Analyzing {len(num_cols)} numerical features...")

In [None]:
# One figure per numerical feature: histogram with KDE (left) and box plot (right),
# followed by the feature's skewness and kurtosis
for col in num_cols:
    series = df[col]
    fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

    # Left panel: histogram with KDE overlay
    sns.histplot(series, bins=30, kde=True, color='steelblue', ax=hist_ax)
    hist_ax.set_title(f"Distribution of {col}")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel('Frequency')

    # Right panel: box plot of the same values
    sns.boxplot(y=series, color='lightcoral', ax=box_ax)
    box_ax.set_title(f"Box Plot of {col}")
    box_ax.set_ylabel(col)

    fig.tight_layout()
    plt.show()

    # Shape statistics for the feature
    skewness = series.skew()
    kurtosis = series.kurtosis()
    print(f"\n{col} Statistics:")
    print(f"  Skewness: {skewness:.3f}")
    print(f"  Kurtosis: {kurtosis:.3f}")
    print("-" * 60)

### 8. Correlation Analysis

In [None]:
# Pairwise correlations between the selected numerical features
# (DataFrame.corr() with its default method)
numeric_features = df[num_cols]
correlation_matrix = numeric_features.corr()

print("Correlation Matrix:")
print(correlation_matrix)
print("\n" + "="*60)

In [None]:
# Annotated heatmap of the correlation matrix, colour scale centred on zero
heatmap_options = {
    'annot': True,
    'fmt': '.3f',
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 1,
    'cbar_kws': {'shrink': 0.8},
}

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Heatmap', fontsize=14, fontweight='bold', pad=20)
fig.tight_layout()
plt.show()

In [None]:
# List the feature pairs whose absolute correlation exceeds 0.7
print("\nHighly Correlated Feature Pairs (|correlation| > 0.7):")
print("="*60)

# Walk only the cells above the diagonal so each unordered pair is tested
# once and a feature is never compared with itself.
feature_names = list(correlation_matrix.columns)
high_corr = [
    (name_a, name_b, correlation_matrix.iloc[row, column])
    for row, name_a in enumerate(feature_names)
    for column, name_b in enumerate(feature_names[row + 1:], start=row + 1)
    if abs(correlation_matrix.iloc[row, column]) > 0.7
]

if not high_corr:
    print("No highly correlated feature pairs found.")
else:
    for feat1, feat2, corr_val in high_corr:
        print(f"{feat1} <-> {feat2}: {corr_val:.3f}")

### 9. Pairplot for Feature Relationships

In [None]:
# Pairplot to visualize relationships
print("Generating pairplot (this may take a moment for large datasets)...")

# Sampling limits live in one place so the condition, the sample size and
# the printed message cannot drift apart.
PAIRPLOT_MAX_ROWS = 10_000     # tables larger than this are down-sampled
PAIRPLOT_SAMPLE_SIZE = 5_000   # rows drawn when down-sampling

if len(df) > PAIRPLOT_MAX_ROWS:
    # Fixed random_state keeps the sample, and therefore the plot, reproducible
    df_sample = df[num_cols].sample(n=PAIRPLOT_SAMPLE_SIZE, random_state=42)
    print(f"Using sample of {PAIRPLOT_SAMPLE_SIZE:,} rows for visualization")
else:
    df_sample = df[num_cols]

sns.pairplot(df_sample, diag_kind='kde', plot_kws={'alpha': 0.6})
# y > 1 places the title above the top row of panels
plt.suptitle('Pairplot of Customer Features', y=1.02, fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

### 10. Outlier Detection

In [None]:
# Detect outliers using IQR method
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return len(outliers), lower_bound, upper_bound

# Report the IQR outlier count, its share of all rows, and the fences per feature
print("Outlier Analysis (IQR Method):")
print("="*60)

total_rows = len(df)
for col in num_cols:
    count, low_fence, high_fence = detect_outliers_iqr(df, col)
    share = (count / total_rows) * 100
    print(f"\n{col}:")
    print(f"  Outliers: {count} ({share:.2f}%)")
    print(f"  Lower bound: {low_fence:.2f}")
    print(f"  Upper bound: {high_fence:.2f}")

### 11. Data Quality Summary

In [None]:
# Check for duplicates (rows identical to an earlier row in every column)
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

# Distinct non-null values per column, computed once and reused by both
# checks below instead of calling nunique() twice per column.
unique_counts = df.nunique()
n_rows = len(df)

# Check for constant columns
# "<= 1" rather than "== 1": a column made up entirely of missing values
# has a distinct count of 0 and carries no information either.
constant_cols = [col for col, n_unique in unique_counts.items() if n_unique <= 1]
print(f"Constant columns: {constant_cols if constant_cols else 'None'}")

# Unique values per column
print("\nUnique Values per Column:")
print("="*60)
for col, n_unique in unique_counts.items():
    n_unique = int(n_unique)  # plain int so the ',' format spec behaves uniformly
    # Guard the percentage against a table with zero rows
    unique_pct = (n_unique / n_rows) * 100 if n_rows else 0.0
    print(f"{col}: {n_unique:,} ({unique_pct:.2f}%)")

---
## Summary of EDA Findings

### Key Observations:

**1. Data Quality:**
- Dataset contains [X] customers with [Y] features
- Missing values: [summarize from analysis]
- Duplicates: [number found]

**2. Feature Distributions:**
- [Feature X] shows [normal/skewed] distribution
- Outliers detected in [which features]
- [Notable patterns observed]

**3. Feature Relationships:**
- Strong correlation between [Feature A] and [Feature B]
- [Other important correlations]

**4. Outliers:**
- [X]% outliers in transaction_count
- [Y]% outliers in total_spend

### Recommendations for Next Steps:

1. **Data Preprocessing:**
   - Handle missing values (if any)
   - Address outliers (remove/cap/transform)
   - Scale/normalize features

2. **Feature Engineering:**
   - Create risk indicators
   - Derive customer segments
   - Calculate ratios (e.g., total_spend per day of account age, transaction_count per day of account age)

3. **Modeling Approach:**
   - Use clustering (K-Means) for segmentation
   - Apply classification for risk scoring
   - Consider PCA if dimensionality is high

**Ready to move to feature engineering and modeling phase!**