# Exploratory Data Analysis (EDA) - Customer Churn Prediction

## Overview
This notebook performs comprehensive exploratory data analysis on the customer churn prediction dataset to understand data quality, feature distributions, and relationships with the target variable.

## Objectives
1. **Data Quality Assessment**: Missing values, outliers, data types
2. **Feature Analysis**: Distributions, cardinality, correlations
3. **Target Analysis**: Churn rate, class balance
4. **Feature Engineering Decisions**: Preprocessing strategies for each feature
5. **Data Leakage Check**: Identify and exclude leaky features


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import sys
import yaml

# Add src to path for imports
sys.path.insert(0, str(Path.cwd() / 'src'))

from src.data.loader import DataLoader

# Notebook-wide plotting style, colour palette and warning policy.
plt.style.use('default')
sns.set_palette("husl")
warnings.filterwarnings('ignore')  # silences every warning for the rest of the session

# Set the three pandas display limits to None so wide frames are not cut off.
for display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Report the library versions this run is using.
print("Libraries imported successfully!")
for library_label, library in (("Pandas", pd), ("NumPy", np), ("Seaborn", sns)):
    print(f"{library_label} version: {library.__version__}")


## 1. Data Loading and Initial Inspection


In [None]:
# Read the sample dataset through the project's DataLoader.
# NOTE(review): what the loader does with the schema file is not visible here.
loader = DataLoader("configs/schema.yaml")
df = loader.load_data("data/sample.csv")

# Deep memory usage (object columns included), converted from bytes to MB.
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {memory_mb:.2f} MB")

# Column-level overview: dtypes and non-null counts.
banner = "=" * 50
print("\n" + banner)
print("DATASET OVERVIEW")
print(banner)
df.info()


In [None]:
# Show the first and last five rows of the dataset, each under its own heading.
for heading, preview in (
    ("First 5 rows:", df.head()),
    ("\nLast 5 rows:", df.tail()),
):
    print(heading)
    display(preview)


## 2. Data Quality Assessment


In [None]:
# Missing values analysis: per-column null counts and percentages,
# sorted so the most incomplete columns come first.
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage': missing_percent
}).sort_values('Missing Percentage', ascending=False)

# Only columns that actually contain nulls are listed and plotted.
missing_with_data = missing_df[missing_df['Missing Count'] > 0]

print("Missing Values Summary:")
print("="*40)
display(missing_with_data)

# Visualize missing values
if not missing_with_data.empty:
    # The figure is created inside this branch so that a dataset without
    # nulls does not leave an empty figure open.
    plt.figure(figsize=(12, 8))

    # Top panel: heatmap of the null mask (rows x columns).
    plt.subplot(2, 1, 1)
    sns.heatmap(df.isnull(), cbar=True, yticklabels=False, cmap='viridis')
    plt.title('Missing Values Heatmap')
    plt.xticks(rotation=45)

    # Bottom panel: missing percentage for each affected column.
    plt.subplot(2, 1, 2)
    sns.barplot(data=missing_with_data, x=missing_with_data.index, y='Missing Percentage')
    plt.title('Missing Values by Column')
    plt.xticks(rotation=45)
    plt.ylabel('Missing Percentage (%)')

    plt.tight_layout()
    plt.show()
else:
    print("No missing values found in the dataset!")


In [None]:
# Data types analysis: how many columns of each dtype, then a per-column listing.
print("Data Types Summary:")
print("="*30)
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

print("\nDetailed Data Types:")
print("-" * 30)
for col in df.columns:
    print(f"{col:30} {str(df[col].dtype):15} {df[col].nunique():5} unique values")

# Check for potential data type issues: object columns whose non-null values
# all parse as numbers were probably loaded with the wrong dtype.
print("\nPotential Issues:")
print("-" * 20)
for col in df.columns:
    if df[col].dtype != 'object':
        continue

    non_null_values = df[col].dropna()
    if non_null_values.empty:
        # An all-null column converts trivially; that says nothing about
        # whether it holds numeric data, so do not flag it.
        continue

    try:
        pd.to_numeric(non_null_values)
    except (ValueError, TypeError):
        # Not parseable as numbers: a genuine text/categorical column.
        continue

    print(f"⚠️  {col}: Numeric data stored as object")


## 3. Target Variable Analysis


In [None]:
# Target variable analysis: summary statistics, class split and two plots.
target_col = 'churn_probability'
target_values = df[target_col]
target_mean = target_values.mean()

print(f"Target Variable: {target_col}")
print("="*50)

# Basic statistics
print("Target Statistics:")
print(f"Mean: {target_mean:.4f}")
print(f"Median: {target_values.median():.4f}")
print(f"Std: {target_values.std():.4f}")
print(f"Min: {target_values.min():.4f}")
print(f"Max: {target_values.max():.4f}")

# Class distribution.
# NOTE(review): these two lines read the target as a 0/1 label (sum = number
# of churners) even though the column is named 'churn_probability' - confirm.
print(f"\nClass Distribution:")
print(f"Churn (1): {target_values.sum():.0f} ({target_mean*100:.1f}%)")
print(f"No Churn (0): {(1-target_values).sum():.0f} ({(1-target_mean)*100:.1f}%)")

# Visualize target distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Histogram with the mean marked
axes[0].hist(target_values, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
axes[0].set_title('Target Variable Distribution')
axes[0].set_xlabel('Churn Probability')
axes[0].set_ylabel('Frequency')
axes[0].axvline(target_mean, color='red', linestyle='--', label=f'Mean: {target_mean:.3f}')
axes[0].legend()

# Bar plot of class counts, one bar per distinct target value
class_counts = target_values.value_counts().sort_index()
axes[1].bar(class_counts.index, class_counts.values, color=['lightcoral', 'lightblue'])
axes[1].set_title('Class Distribution')
axes[1].set_xlabel('Churn (0=No, 1=Yes)')
axes[1].set_ylabel('Count')
axes[1].set_xticks([0, 1])

# Add count labels on bars. Each label is anchored at the bar's own x value
# (the class value), not at its ordinal position, so labels stay on their bar
# even when a class is absent or the values are not exactly 0..n-1.
for class_value, class_count in zip(class_counts.index, class_counts.values):
    axes[1].text(class_value, class_count + 5, str(class_count), ha='center', va='bottom')

plt.tight_layout()
plt.show()


## 4. Feature Analysis


In [None]:
# Ask the loader which columns are leaky and which are usable for training.
leaky_features = loader.get_leaky_features()
training_features = loader.get_training_features()

print("Feature Categories:")
print("="*30)
print(f"Total features: {len(df.columns)}")  # counts every column in df
print(f"Training features (non-leaky): {len(training_features)}")
print(f"Leaky features (exclude from training): {len(leaky_features)}")
print(f"Target feature: 1")

# List both groups by name, leaky ones first.
for heading, names in (
    (f"\nLeaky features to exclude:", leaky_features),
    (f"\nTraining features:", training_features),
):
    print(heading)
    for name in names:
        print(f"  - {name}")

# One summary record per non-target column.
row_count = len(df)
feature_analysis = []

for col in df.columns:
    if col == target_col:
        continue

    column = df[col]
    null_count = column.isnull().sum()
    feature_analysis.append({
        'feature': col,
        'dtype': str(column.dtype),
        'nunique': column.nunique(),
        'missing_count': null_count,
        'missing_pct': (null_count / row_count) * 100,
        'is_leaky': col in leaky_features,
        'is_training': col in training_features
    })

# Non-leaky features first; within each group, highest cardinality first.
feature_df = pd.DataFrame(feature_analysis).sort_values(
    ['is_leaky', 'nunique'], ascending=[True, False]
)

print("\nFeature Analysis Summary:")
print("="*50)
display(feature_df)


In [None]:
# Cardinality report for every object-typed feature, with an encoding hint.
print("Cardinality Analysis:")
print("="*30)

is_object_feature = feature_df['dtype'] == 'object'
categorical_features = feature_df.loc[is_object_feature, 'feature'].tolist()
high_cardinality_threshold = 50

for feature in categorical_features:
    unique_count = df[feature].nunique()

    # Choose the hint first, then print both lines together.
    if unique_count > high_cardinality_threshold:
        hint = f"  ⚠️  HIGH CARDINALITY - Consider embedding or target encoding"
    elif unique_count > 10:
        hint = f"  ⚠️  MEDIUM CARDINALITY - Consider target encoding"
    else:
        hint = f"  ✅ LOW CARDINALITY - Safe for one-hot encoding"

    print(f"{feature:30} {unique_count:5} unique values")
    print(hint)

# Range, centre, spread and IQR-based outlier share for numeric training features.
print(f"\nNumeric Features Analysis:")
print("="*30)

is_numeric_dtype = feature_df['dtype'].isin(['int64', 'float64'])
is_training_feature = feature_df['is_training'].eq(True)
numeric_features = feature_df.loc[is_numeric_dtype & is_training_feature, 'feature'].tolist()

for feature in numeric_features:
    column = df[feature]
    summary = column.describe()

    print(f"{feature:30}")
    print(f"  Range: {summary['min']:.2f} to {summary['max']:.2f}")
    print(f"  Mean: {summary['mean']:.2f}, Std: {summary['std']:.2f}")

    # Tukey fences: values more than 1.5 * IQR outside the quartiles count as outliers.
    lower_quartile = column.quantile(0.25)
    upper_quartile = column.quantile(0.75)
    quartile_range = upper_quartile - lower_quartile
    below_fence = column < lower_quartile - 1.5*quartile_range
    above_fence = column > upper_quartile + 1.5*quartile_range
    outlier_count = int((below_fence | above_fence).sum())

    print(f"  Outliers: {outlier_count} ({outlier_count/len(df)*100:.1f}%)")
    print()


## 5. Feature Distributions


In [None]:
# Plot distributions for key numeric features on a 2x3 grid.
key_numeric_features = ['age', 'monthly_revenue', 'login_frequency', 'session_duration_avg', 'support_tickets_count']

# Keep only the features that exist in this dataset, so a missing column does
# not leave a blank panel in the middle of the grid.
available_features = [feature for feature in key_numeric_features if feature in df.columns]

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

# zip() stops at the shorter sequence, so at most len(axes) features are drawn.
for ax, feature in zip(axes, available_features):
    values = df[feature].dropna()

    ax.hist(values, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    ax.set_title(f'{feature} Distribution')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

    # Mark mean and median; a visible gap between them indicates skew.
    mean_val = values.mean()
    median_val = values.median()
    ax.axvline(mean_val, color='red', linestyle='--', alpha=0.7, label=f'Mean: {mean_val:.1f}')
    ax.axvline(median_val, color='green', linestyle='--', alpha=0.7, label=f'Median: {median_val:.1f}')
    ax.legend()

# Remove every panel that did not receive a plot.
for unused_ax in axes[len(available_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


In [None]:
# Bar charts of value frequencies for three hand-picked categorical columns.
categorical_features = ['gender', 'location_country', 'subscription_type']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, feature in zip(axes, categorical_features):
    # A column absent from the dataset keeps its (empty) panel.
    if feature not in df.columns:
        continue

    value_counts = df[feature].value_counts()
    positions = range(len(value_counts))

    ax.bar(positions, value_counts.values, color='lightcoral')
    ax.set_title(f'{feature} Distribution')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    ax.set_xticks(positions)
    ax.set_xticklabels(value_counts.index, rotation=45)

    # Write each bar's count just above it.
    for position, count in zip(positions, value_counts.values):
        ax.text(position, count + 1, str(count), ha='center', va='bottom')

plt.tight_layout()
plt.show()


## 6. Target vs Feature Relationships


In [None]:
# Correlation analysis for numeric training features (int64 / float64 columns).
numeric_cols = [col for col in training_features if df[col].dtype in ['int64', 'float64']]

if numeric_cols:
    # Compute the pairwise correlation matrix once; it feeds both the
    # target ranking and the heatmap below.
    correlation_matrix = df[numeric_cols + [target_col]].corr()

    # Correlation of each feature with the target, strongest (by absolute
    # value) first. sort_values takes `ascending`, not `reverse`.
    correlations = (
        correlation_matrix[target_col]
        .drop(target_col)
        .sort_values(key=lambda values: values.abs(), ascending=False)
    )

    print("Feature Correlations with Target:")
    print("="*40)
    for feature, corr in correlations.items():
        print(f"{feature:30} {corr:6.3f}")

    # Visualize correlations
    plt.figure(figsize=(12, 8))
    correlations.plot(kind='barh', color='skyblue')
    plt.title('Feature Correlations with Target Variable')
    plt.xlabel('Correlation Coefficient')
    plt.ylabel('Features')
    plt.axvline(0, color='red', linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

    # Correlation heatmap over all numeric features plus the target
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, fmt='.2f')
    plt.title('Correlation Heatmap')
    plt.tight_layout()
    plt.show()
else:
    print("No numeric features found for correlation analysis.")


In [None]:
# Target vs categorical features: churn rate per category, plotted and tabulated.
categorical_cols = [col for col in training_features if df[col].dtype == 'object']

if categorical_cols:
    # Aggregate once per feature and reuse the table for both the plot and
    # the printed statistics.
    churn_tables = {}
    for feature in categorical_cols:
        churn_by_category = df.groupby(feature)[target_col].agg(['count', 'sum', 'mean']).reset_index()
        churn_by_category.columns = [feature, 'total_count', 'churn_count', 'churn_rate']
        churn_tables[feature] = churn_by_category.sort_values('churn_rate', ascending=False)

    # squeeze=False always returns a 2-D axes array, so a single feature needs
    # no special case; ravel() flattens it to one axes object per feature.
    fig, axes = plt.subplots(1, len(categorical_cols), figsize=(5*len(categorical_cols), 5),
                             squeeze=False)
    axes = axes.ravel()

    for ax, feature in zip(axes, categorical_cols):
        churn_by_category = churn_tables[feature]
        positions = range(len(churn_by_category))

        # Plot
        ax.bar(positions, churn_by_category['churn_rate'], color='lightcoral', alpha=0.7)
        ax.set_title(f'Churn Rate by {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Churn Rate')
        ax.set_xticks(positions)
        ax.set_xticklabels(churn_by_category[feature], rotation=45)

        # Add count labels: the number above each bar is the category's row
        # count, which shows how much data backs each rate.
        for position, (rate, count) in enumerate(zip(churn_by_category['churn_rate'], churn_by_category['total_count'])):
            ax.text(position, rate + 0.01, f'{count}', ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    plt.show()

    # Print detailed statistics
    print("Categorical Feature Analysis:")
    print("="*40)
    for feature in categorical_cols:
        print(f"\n{feature}:")
        display(churn_tables[feature])
else:
    print("No categorical features found for analysis.")


## 7. Feature Engineering Decisions


In [None]:
# Assign a preprocessing action (and the reason for it) to every non-target column.
feature_decisions = []

for feature in (name for name in df.columns if name != target_col):
    column = df[feature]
    dtype = str(column.dtype)
    cardinality = column.nunique()
    missing_pct = (column.isnull().sum() / len(df)) * 100
    is_leaky = feature in leaky_features

    # Decision rules, checked in priority order: leakage beats everything,
    # then object columns by cardinality band, then numeric columns.
    if is_leaky:
        action = "drop"
        reason = "Leaky feature - only available post-label"
    elif dtype == 'object' and cardinality <= 10:
        action = "onehot"
        reason = f"Low cardinality categorical ({cardinality} unique values)"
    elif dtype == 'object' and cardinality <= 50:
        action = "target_encode"
        reason = f"Medium cardinality categorical ({cardinality} unique values)"
    elif dtype == 'object':
        action = "embed"
        reason = f"High cardinality categorical ({cardinality} unique values)"
    elif dtype == 'int64' and cardinality <= 20:
        # Integer column with few distinct values: treated as categorical.
        action = "onehot"
        reason = f"Integer with low cardinality ({cardinality} unique values)"
    elif dtype in ('int64', 'float64'):
        action = "scale"
        reason = f"Numeric feature - requires scaling"
    else:
        action = "investigate"
        reason = f"Unknown data type: {dtype}"

    feature_decisions.append({
        'name': feature,
        'type': dtype,
        'cardinality': cardinality,
        'missing_pct': missing_pct,
        'is_leaky': is_leaky,
        'action': action,
        'reason': reason
    })

# Tabulate the decisions, grouped by action and ordered by cardinality within each.
feature_decisions_df = pd.DataFrame(feature_decisions).sort_values(['action', 'cardinality'])

print("Feature Engineering Decisions:")
print("="*60)
display(feature_decisions_df)

# Per-action feature count with up to three example feature names.
print("\nSummary by Preprocessing Action:")
print("="*40)
action_summary = feature_decisions_df['action'].value_counts()
for action, count in action_summary.items():
    print(f"{action:15} {count:3} features")

    has_action = feature_decisions_df['action'] == action
    examples = feature_decisions_df.loc[has_action, 'name'].head(3).tolist()
    if examples:
        print(f"  Examples: {', '.join(examples)}")
    print()


## 8. Export Results


In [None]:
# Export feature list to CSV
feature_list_path = "configs/feature_list.csv"
# Create the target directory first so the export works on a fresh checkout.
Path(feature_list_path).parent.mkdir(parents=True, exist_ok=True)
feature_decisions_df[['name', 'type', 'cardinality', 'action']].to_csv(feature_list_path, index=False)
print(f"Feature list exported to: {feature_list_path}")

# Aggregates used several times in the report are computed once.
missing_per_column = df.isnull().sum()
total_missing = missing_per_column.sum()
columns_with_missing = (missing_per_column > 0).sum()
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
churn_rate = df[target_col].mean()
# A mean target between 30% and 70% is reported as balanced.
class_balance = 'Balanced' if 0.3 <= churn_rate <= 0.7 else 'Imbalanced'
categorical_count = len([f for f in training_features if df[f].dtype == 'object'])
numeric_count = len([f for f in training_features if df[f].dtype in ['int64', 'float64']])
high_cardinality_count = int((feature_decisions_df['cardinality'] > 50).sum())

# Create EDA summary (markdown)
eda_summary = f"""
# EDA Summary - Customer Churn Prediction

## Dataset Overview
- **Total Records**: {len(df):,}
- **Total Features**: {len(df.columns)}
- **Training Features**: {len(training_features)}
- **Leaky Features**: {len(leaky_features)}
- **Target Variable**: {target_col}

## Data Quality
- **Missing Values**: {total_missing} total missing values
- **Features with Missing Data**: {columns_with_missing}
- **Memory Usage**: {memory_mb:.2f} MB

## Target Variable
- **Churn Rate**: {churn_rate:.1%}
- **Class Balance**: {class_balance}

## Feature Engineering Summary
"""

# One bullet per preprocessing action, in order of first appearance.
for action in feature_decisions_df['action'].unique():
    count = (feature_decisions_df['action'] == action).sum()
    eda_summary += f"- **{action.title()}**: {count} features\n"

eda_summary += f"""
## Key Findings
1. **Data Leakage**: {len(leaky_features)} features identified as leaky and excluded from training
2. **Missing Data**: {columns_with_missing} features have missing values requiring imputation
3. **Feature Types**: {categorical_count} categorical, {numeric_count} numeric
4. **High Cardinality**: {high_cardinality_count} features with >50 unique values

## Recommendations
1. **Preprocessing**: Implement feature-specific preprocessing based on feature_list.csv
2. **Missing Values**: Use appropriate imputation strategies for each feature type
3. **Feature Selection**: Consider feature importance analysis after preprocessing
4. **Model Validation**: Use stratified sampling due to class imbalance
"""

# Save EDA summary. The directory is created if needed and the encoding is
# fixed so the file does not depend on the platform default.
summary_path = Path("docs/eda_summary.md")
summary_path.parent.mkdir(parents=True, exist_ok=True)
with open(summary_path, "w", encoding="utf-8") as f:
    f.write(eda_summary)

print("EDA summary exported to: docs/eda_summary.md")
print("\nEDA Analysis Complete!")
