# Exploratory Data Analysis - Bank Customer Churn

This notebook contains detailed exploratory data analysis of the bank customer churn dataset.

In [None]:
# Import libraries
import json
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress every warning category so warnings do not clutter the cell outputs
warnings.simplefilter('ignore')

# Plot appearance: matplotlib style sheet first, then seaborn's colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette(palette="husl")

## 1. Data Loading

In [None]:
# Read the train and test CSV files (paths are relative to the notebook)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

# Report the (rows, columns) shape of each split
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# List the training column names
print("\nTraining data columns:")
training_columns = train_df.columns.tolist()
print(training_columns)

## 2. Data Overview

In [None]:
# Per-split `info()` report
for heading, frame in (("=== Training Data Info ===", train_df),
                       ("\n=== Test Data Info ===", test_df)):
    print(heading)
    frame.info()

# Count of missing values per column, for each split
print("\n=== Missing Values ===")
for heading, frame in (("Training data:", train_df),
                       ("\nTest data:", test_df)):
    print(heading)
    print(frame.isna().sum())

# Descriptive statistics of the training split; left as the final expression
# so the notebook renders it as the cell output
print("\n=== Basic Statistics ===")
train_df.describe()

## 3. Target Variable Distribution

In [None]:
# Count plot of the target column, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train_df, x='Exited', ax=ax)

ax.set_title('Distribution of Customer Churn', fontsize=16, fontweight='bold')
ax.set_xlabel('Churn Status (0=No, 1=Yes)', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

# Annotate each bar with its share of all rows, placed slightly above the bar
n_rows = len(train_df)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_position = (bar.get_x() + bar.get_width() / 2,
                      bar_height + n_rows * 0.01)
    ax.annotate(f'{100 * bar_height / n_rows:.1f}%', label_position,
                ha='center', fontsize=11)

fig.tight_layout()
plt.show()

# Overall churn rate in percent (mean of the 0/1 target times 100)
churn_rate = train_df['Exited'].mean() * 100
print(f"Overall Churn Rate: {churn_rate:.2f}%")

## 4. Numerical Features Analysis

In [None]:
# Numerical features to inspect (this list is reused by the outlier section)
numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance',
                      'NumOfProducts', 'EstimatedSalary']

# One panel per feature on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for ax, feature in zip(axes, numerical_features):
    # Overlay one histogram + KDE per churn class on the same panel
    for churn_status in [0, 1]:
        subset = train_df[train_df['Exited'] == churn_status]
        # stat='density' makes the y-axis match its 'Density' label (the
        # default stat is a raw count). Each call only sees one class, so the
        # two classes are normalised independently and the smaller class is
        # not dwarfed by the larger one.
        sns.histplot(subset[feature], kde=True, stat='density', ax=ax,
                     label=f'Churn={churn_status}', alpha=0.6)

    ax.set_title(f'Distribution of {feature}', fontsize=14)
    ax.set_xlabel(feature, fontsize=12)
    ax.set_ylabel('Density', fontsize=12)
    ax.legend()

plt.tight_layout()
plt.show()

## 5. Categorical Features Analysis

In [None]:
# Categorical features whose per-level churn rate is plotted
categorical_features = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']

# 2x2 grid, flattened so panels can be paired with features directly
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

for ax, feature in zip(axes, categorical_features):
    # Mean of the 0/1 target per level, expressed in percent
    rate_by_level = train_df.groupby(feature)['Exited'].mean() * 100
    n_levels = len(rate_by_level)
    positions = range(n_levels)

    # One bar per level, coloured from the husl palette
    bars = ax.bar(positions, rate_by_level.values,
                  color=sns.color_palette("husl", n_levels))

    ax.set_title(f'Churn Rate by {feature}', fontsize=14)
    ax.set_xlabel(feature, fontsize=12)
    ax.set_ylabel('Churn Rate (%)', fontsize=12)
    ax.set_xticks(positions)
    ax.set_xticklabels(rate_by_level.index)

    # Write the rate just above the top of each bar
    for bar, rate in zip(bars, rate_by_level.values):
        label_x = bar.get_x() + bar.get_width()/2.
        label_y = bar.get_height() + 0.5
        ax.text(label_x, label_y, f'{rate:.1f}%',
                ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

## 6. Correlation Analysis

In [None]:
# Pairwise correlations over the numeric columns only
correlation_matrix = train_df.corr(numeric_only=True)

# Boolean mask hiding the upper triangle (diagonal included), so each pair
# of features is shown once
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Annotated heatmap on an explicitly created Axes, colour scale fixed to [-1, 1]
fig, ax = plt.subplots(figsize=(12, 8))
heatmap_options = dict(
    mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
    annot=True, fmt='.2f', square=True, linewidths=.5,
    cbar_kws={"shrink": .8},
)
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)

ax.set_title('Correlation Matrix of Features', fontsize=16, fontweight='bold')
fig.tight_layout()
plt.show()

# Correlation of every numeric column with the target, strongest positive first
print("\nCorrelation with Churn (Exited):")
correlation_with_target = correlation_matrix['Exited'].sort_values(ascending=False)
print(correlation_with_target)

## 7. Pair Plots for Key Features

In [None]:
# Features shown in the pair plot; 'Exited' is used as the hue
key_features = ['Age', 'Balance', 'CreditScore', 'NumOfProducts', 'Exited']

# Build the plotting frame with readable class labels.
# `.assign` returns a new DataFrame, so the relabelling never writes into a
# slice of `train_df` (the original column assignment on `train_df[key_features]`
# was a chained assignment).
pairplot_data = train_df[key_features].assign(
    Exited=lambda frame: frame['Exited'].map({0: 'No Churn', 1: 'Churn'})
)

g = sns.pairplot(pairplot_data, hue='Exited',
                 palette={'No Churn': 'skyblue', 'Churn': 'coral'},
                 plot_kws={'alpha': 0.6, 's': 30},
                 diag_kind='kde', diag_kws={'fill': True})

# `figure` is the supported accessor on seaborn grids (`fig` is deprecated)
g.figure.suptitle('Pair Plot of Key Features by Churn Status',
                  fontsize=16, fontweight='bold', y=1.02)
# Use the grid's own tight_layout: it keeps the margin reserved for the
# legend, whereas plt.tight_layout() lets the panels grow underneath it
g.tight_layout()
plt.show()

## 8. Outlier Detection

In [None]:
# Box plots of each numerical feature, split by churn status
# (uses `numerical_features` defined in the numerical-features section)
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

box_labels = ['No Churn', 'Churn']
box_colors = ['lightblue', 'lightcoral']

for ax, feature in zip(axes, numerical_features):
    # One series per churn class; NaNs are dropped because a NaN in the data
    # makes the box statistics NaN and the box is not drawn
    box_data = [train_df.loc[train_df['Exited'] == status, feature].dropna()
                for status in [0, 1]]

    bp = ax.boxplot(box_data, patch_artist=True, showfliers=True)
    # Label the boxes through the axis instead of boxplot's `labels=` keyword,
    # which is deprecated in recent matplotlib releases; this form works on
    # both old and new versions
    ax.set_xticklabels(box_labels)

    # Colour boxes by class
    for patch, color in zip(bp['boxes'], box_colors):
        patch.set_facecolor(color)

    ax.set_title(f'Box Plot of {feature}', fontsize=14)
    ax.set_ylabel(feature, fontsize=12)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Feature Relationships with Target

In [None]:
# Scatter plots of selected feature pairs, coloured by churn status.
# Each tuple is (x feature, y feature); one panel is drawn per pair.
scatter_pairs = [
    ('Age', 'Balance'),
    ('CreditScore', 'EstimatedSalary'),
    ('Age', 'CreditScore'),
    ('Tenure', 'NumOfProducts'),
]

# Constrained layout reserves room for the shared colorbar. tight_layout is
# not compatible with a colorbar attached to several axes and lets the panels
# overlap it.
fig, axes = plt.subplots(2, 2, figsize=(15, 10), layout='constrained')

for ax, (x_feature, y_feature) in zip(axes.flat, scatter_pairs):
    # vmin/vmax pin the colour scale to the 0/1 target so every panel shares
    # the same mapping and the single colorbar is valid for all of them
    points = ax.scatter(train_df[x_feature], train_df[y_feature],
                        c=train_df['Exited'], cmap='coolwarm',
                        vmin=0, vmax=1, alpha=0.6, s=20)
    ax.set_xlabel(x_feature)
    ax.set_ylabel(y_feature)
    ax.set_title(f'{x_feature} vs {y_feature} by Churn Status')
    ax.grid(True, alpha=0.3)

# One colorbar for the whole grid; ticks only at the two values the target takes
fig.colorbar(points, ax=axes.ravel().tolist(), ticks=[0, 1],
             label='Churn Status (0=No, 1=Yes)')

plt.show()

## 10. Interactive Visualizations (if plotly is available)

In [None]:
# Optional interactive plots. Only the imports sit inside the `try`, so an
# ImportError raised later while rendering is not misreported as
# "Plotly not installed".
try:
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
except ImportError:
    print("Plotly not installed. Install with: pip install plotly")
else:
    SCATTER_SAMPLE_SIZE = 1000   # upper bound on points drawn in the 3D scatter
    SCATTER_SAMPLE_SEED = 42     # fixed seed so the sampled points are reproducible

    # Plotly Express applies a continuous colour scale to numeric columns and
    # then ignores `color_discrete_map`; casting the target to strings makes
    # it a discrete colour in both figures.
    plotly_df = train_df.assign(Exited=train_df['Exited'].astype(str))
    churn_colors = {'0': 'blue', '1': 'red'}

    # Interactive histogram for Age
    fig = px.histogram(plotly_df, x='Age', color='Exited',
                       nbins=30, barmode='overlay',
                       title='Age Distribution by Churn Status',
                       labels={'Exited': 'Churn Status'},
                       color_discrete_map=churn_colors)
    fig.show()

    # Interactive 3D scatter plot on a reproducible subsample; the sample size
    # is capped at the number of rows so small datasets do not raise
    scatter_sample = plotly_df.sample(
        n=min(SCATTER_SAMPLE_SIZE, len(plotly_df)),
        random_state=SCATTER_SAMPLE_SEED,
    )
    fig = px.scatter_3d(scatter_sample,
                        x='Age', y='Balance', z='CreditScore',
                        color='Exited',
                        title='3D View: Age, Balance, and CreditScore',
                        labels={'Exited': 'Churn Status'},
                        color_discrete_map=churn_colors)
    fig.show()

## 11. Key Insights and Summary

In [None]:
# Summary statistics for churned vs non-churned customers
churned = train_df[train_df['Exited'] == 1]
not_churned = train_df[train_df['Exited'] == 0]

# Recomputed here (same formula as the target-distribution section) so this
# cell does not depend on state left behind by an earlier cell
churn_rate = train_df['Exited'].mean() * 100

print("=== SUMMARY STATISTICS ===\n")
print(f"Total customers: {len(train_df):,}")
print(f"Churned customers: {len(churned):,} ({churn_rate:.1f}%)")
print(f"Retained customers: {len(not_churned):,} ({100-churn_rate:.1f}%)\n")

print("=== AVERAGE VALUES ===\n")
print("Churned vs Non-Churned Customers:")
print(f"Age: {churned['Age'].mean():.1f} vs {not_churned['Age'].mean():.1f}")
print(f"Credit Score: {churned['CreditScore'].mean():.0f} vs {not_churned['CreditScore'].mean():.0f}")
print(f"Balance: ${churned['Balance'].mean():,.0f} vs ${not_churned['Balance'].mean():,.0f}")
print(f"Salary: ${churned['EstimatedSalary'].mean():,.0f} vs ${not_churned['EstimatedSalary'].mean():,.0f}\n")

# NOTE(review): the findings below are hard-coded text, not values computed in
# this cell; re-check them against the plots whenever the dataset changes.
print("=== KEY FINDINGS ===\n")
print("1. Churned customers are generally OLDER")
print("2. German customers have HIGHEST churn rate")
print("3. Inactive members are MORE likely to churn")
print("4. Customers with 2 products have LOWEST churn rate")
print("5. Gender shows MODERATE difference in churn rates")
print("6. Credit card ownership has MINIMAL impact")

# Headline numbers for the JSON report. Means are cast to built-in floats so
# serialisation does not depend on the numpy scalar type of the columns.
summary = {
    'total_customers': len(train_df),
    'churned_count': len(churned),
    'churn_rate': float(churn_rate),
    'avg_age_churned': float(churned['Age'].mean()),
    'avg_age_not_churned': float(not_churned['Age'].mean()),
    'avg_balance_churned': float(churned['Balance'].mean()),
    'avg_balance_not_churned': float(not_churned['Balance'].mean())
}

# Save summary to file, creating the reports directory first so the write
# does not fail on a fresh checkout
summary_path = Path('../reports/eda_summary.json')
summary_path.parent.mkdir(parents=True, exist_ok=True)
with summary_path.open('w', encoding='utf-8') as f:
    json.dump(summary, f, indent=4)

print("\nSummary saved to 'reports/eda_summary.json'")

## 12. Recommendations for Model Building

Based on the EDA, here are recommendations for feature engineering, model selection, and data preprocessing:

### 1. **Feature Engineering Ideas**
- Create age groups (young, middle-aged, senior)
- Create balance categories (zero, low, medium, high)
- Create interaction features:
  - Age × Balance
  - IsActiveMember × Tenure
  - NumOfProducts × Balance
- Create salary-to-balance ratio
- Flag high-risk customers (German + Inactive + High Balance)

### 2. **Model Selection Strategy**
- Use ensemble methods (Gradient Boosting) for better performance
- Consider handling class imbalance (21% churn rate)
- Use cross-validation to prevent overfitting
- Focus on precision (avoid false positives in retention campaigns)

### 3. **Data Preprocessing**
- Scale numerical features
- One-hot encode categorical variables
- Consider removing CustomerId and Surname (non-predictive)
- Handle any missing values appropriately