# Exploratory Data Analysis (EDA) - Customer Churn Prediction

This notebook performs comprehensive exploratory data analysis on the cleaned customer churn dataset.

## Objectives:
1. Understand the dataset structure and characteristics
2. Analyze the target variable (Churn)
3. Explore relationships between features and churn
4. Identify key patterns and insights
5. Prepare insights for feature engineering and modeling


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Set display options: show every column, cap printed rows at 100
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# NOTE(review): this silences *all* warnings for the whole kernel session,
# including deprecation notices from pandas/seaborn — confirm this is intended.
warnings.filterwarnings('ignore')

# Set plotting style.
# The seaborn style sheets were renamed across matplotlib releases, so try the
# newer name first, then the older one, and finally fall back to 'ggplot'.
# plt.style.use raises OSError for an unknown style name; catching only that
# keeps unrelated failures (and KeyboardInterrupt) visible instead of hiding
# them behind a bare `except:`.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    try:
        plt.style.use(style_name)
        break
    except OSError:
        continue
else:
    # Neither seaborn style sheet is available in this matplotlib version.
    plt.style.use('ggplot')

sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)

print("✓ Libraries imported successfully")


ModuleNotFoundError: No module named 'matplotlib'

## 1. Load Cleaned Dataset


In [None]:
# Load the cleaned dataset.
# A hard-coded absolute user path only works on the machine it was written on,
# so project-relative locations are tried first. The original absolute path is
# kept as the final fallback, so behaviour on the original machine is unchanged.
relative_csv = Path("data") / "processed" / "cleaned_customer_churn.csv"
candidate_paths = [
    Path.cwd() / relative_csv,         # kernel started at the project root
    Path.cwd().parent / relative_csv,  # kernel started one level below the root (e.g. a notebooks folder)
    Path(r"C:\Users\ADMIN\Desktop\DataAnalytics\customer-churn-prediction\data\processed\cleaned_customer_churn.csv"),
]
# First existing candidate wins; if none exists, the last entry is used so that
# pd.read_csv raises its usual FileNotFoundError naming the original path.
data_path = next((path for path in candidate_paths if path.exists()), candidate_paths[-1])

df = pd.read_csv(data_path)

print("Dataset loaded successfully!")
print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print("\nFirst few rows:")
df.head()


## 2. Basic Data Overview


In [None]:
# Dataset information: column dtypes, missing values, duplicate rows, memory use
banner = "=" * 60
print(banner)
print("DATASET INFORMATION")
print(banner)

print("\nData Types:")
print(df.dtypes)

# Per-column null counts; only columns that actually have nulls are listed.
print("\n\nMissing Values:")
missing = df.isnull().sum()
if missing.sum() != 0:
    print(missing[missing > 0])
else:
    print("✓ No missing values found!")

duplicate_rows = df.duplicated().sum()
print(f"\n\nDuplicate Rows: {duplicate_rows}")

# deep=True also counts the contents of object columns; bytes are converted to MB.
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"\n\nMemory Usage: {memory_mb:.2f} MB")


In [None]:
# Statistical summary for numeric columns
print("=" * 60, "NUMERIC COLUMNS SUMMARY", "=" * 60, sep="\n")

# Every numeric-dtype column except the identifier, which carries no statistics.
numeric_cols = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name != 'customerID'
]
print(df[numeric_cols].describe())


In [None]:
# Categorical columns summary
print("=" * 60, "CATEGORICAL COLUMNS SUMMARY", "=" * 60, sep="\n")

# Object-dtype columns are treated as categorical; the identifier is skipped.
categorical_cols = [
    name
    for name in df.select_dtypes(include=['object']).columns
    if name != 'customerID'
]

for col in categorical_cols:
    unique_count = df[col].nunique()
    top_values = df[col].value_counts().head(10)  # at most the 10 most frequent values
    print(f"\n{col}:")
    print(f"  Unique values: {unique_count}")
    print("  Value counts:")
    print(top_values)
    print("-" * 40)


## 3. Target Variable Analysis (Churn)


In [None]:
# Churn distribution: print class counts/percentages and draw a bar + pie chart
if 'Churn' in df.columns:
    churn_counts = df['Churn'].value_counts()
    churn_percent = df['Churn'].value_counts(normalize=True) * 100

    print("=" * 60)
    print("CHURN DISTRIBUTION")
    print("=" * 60)
    print("\nCounts:")
    print(churn_counts)
    print("\nPercentages:")
    print(churn_percent.round(2))

    # value_counts() orders classes by frequency, not by label, so assigning
    # colours by position would paint 'Yes' green whenever churners are the
    # majority. Map colours by label when the labels are the expected Yes/No;
    # otherwise keep the positional palette.
    positional_palette = ['#2ecc71', '#e74c3c']
    label_colors = {'No': '#2ecc71', 'Yes': '#e74c3c'}
    if set(churn_counts.index) <= set(label_colors):
        class_colors = [label_colors[label] for label in churn_counts.index]
    else:
        class_colors = positional_palette

    # Visualize churn distribution
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Count plot
    churn_counts.plot(kind='bar', ax=axes[0], color=class_colors)
    axes[0].set_title('Churn Distribution (Count)', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Churn', fontsize=12)
    axes[0].set_ylabel('Count', fontsize=12)
    axes[0].tick_params(axis='x', rotation=0)
    for i, v in enumerate(churn_counts.values):
        # Anchor the label to the top of the bar instead of a fixed +100 offset,
        # which only looks right for one particular dataset size.
        axes[0].text(i, v, str(v), ha='center', va='bottom', fontweight='bold')

    # Pie chart
    axes[1].pie(churn_counts.values, labels=churn_counts.index, autopct='%1.1f%%',
                colors=class_colors, startangle=90)
    axes[1].set_title('Churn Distribution (Percentage)', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Comparing two classes needs two classes; avoid an IndexError otherwise.
    if len(churn_percent) >= 2:
        print(f"\n⚠ Class Imbalance: {churn_percent.iloc[0]:.1f}% vs {churn_percent.iloc[1]:.1f}%")
    else:
        print("\n⚠ Only one Churn class present; class balance cannot be assessed")
else:
    print("⚠ 'Churn' column not found in dataset")


## 4. Univariate Analysis - Numeric Features


In [None]:
# Distribution of numeric features: one row per feature (histogram | box plot)
numeric_features = [
    col for col in ('tenure', 'MonthlyCharges', 'TotalCharges') if col in df.columns
]

if numeric_features:
    n_features = len(numeric_features)
    # squeeze=False always yields a 2-D axes array, even for a single feature.
    fig, axes = plt.subplots(n_features, 2, figsize=(14, 5 * n_features), squeeze=False)

    for (hist_ax, box_ax), col in zip(axes, numeric_features):
        values = df[col].dropna()
        col_mean = df[col].mean()
        col_median = df[col].median()

        # Histogram with mean/median reference lines
        hist_ax.hist(values, bins=30, edgecolor='black', alpha=0.7)
        hist_ax.set_title(f'{col} Distribution', fontsize=12, fontweight='bold')
        hist_ax.set_xlabel(col, fontsize=10)
        hist_ax.set_ylabel('Frequency', fontsize=10)
        hist_ax.axvline(col_mean, color='red', linestyle='--',
                        label=f'Mean: {col_mean:.2f}')
        hist_ax.axvline(col_median, color='green', linestyle='--',
                        label=f'Median: {col_median:.2f}')
        hist_ax.legend()

        # Box plot
        box_ax.boxplot(values, vert=True)
        box_ax.set_title(f'{col} Box Plot', fontsize=12, fontweight='bold')
        box_ax.set_ylabel(col, fontsize=10)
        box_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


## 5. Bivariate Analysis - Features vs Churn


In [None]:
# Numeric features vs Churn: grouped box plots followed by a mean/median comparison
if 'Churn' in df.columns:
    numeric_features = [
        col for col in ('tenure', 'MonthlyCharges', 'TotalCharges') if col in df.columns
    ]

    if numeric_features:
        # squeeze=False + ravel gives a flat sequence of axes for any feature count.
        fig, axes = plt.subplots(1, len(numeric_features), figsize=(16, 5), squeeze=False)
        axes = axes.ravel()

        for ax, col in zip(axes, numeric_features):
            # Box plot by Churn
            df.boxplot(column=col, by='Churn', ax=ax, grid=False)
            ax.set_title(f'{col} by Churn Status', fontsize=12, fontweight='bold')
            ax.set_xlabel('Churn', fontsize=10)
            ax.set_ylabel(col, fontsize=10)
            ax.set_xticklabels(ax.get_xticklabels(), rotation=0)

        # Blank out the figure-level title.
        plt.suptitle('')
        plt.tight_layout()
        plt.show()

        # Statistical comparison
        print("\n" + "=" * 60)
        print("NUMERIC FEATURES: CHURN vs NO CHURN")
        print("=" * 60)
        churn_mask = df['Churn'] == 'Yes'
        retained_mask = df['Churn'] == 'No'
        for col in numeric_features:
            churned = df.loc[churn_mask, col]
            not_churned = df.loc[retained_mask, col]
            mean_gap = abs(churned.mean() - not_churned.mean())
            print(f"\n{col}:")
            print(f"  Churned:     Mean={churned.mean():.2f}, Median={churned.median():.2f}")
            print(f"  Not Churned: Mean={not_churned.mean():.2f}, Median={not_churned.median():.2f}")
            print(f"  Difference:  {mean_gap:.2f}")


In [None]:
# Categorical features vs Churn: print row-normalised crosstabs for every
# categorical feature, then chart a short list of selected features.
if 'Churn' in df.columns:
    categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                            'PhoneService', 'MultipleLines', 'InternetService',
                            'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                            'TechSupport', 'StreamingTV', 'StreamingMovies',
                            'Contract', 'PaperlessBilling', 'PaymentMethod']

    categorical_features = [col for col in categorical_features if col in df.columns]

    # Calculate churn rates by category
    print("=" * 60)
    print("CHURN RATES BY CATEGORY")
    print("=" * 60)

    churn_rates = {}
    crosstabs_pct = {}  # cached so the plotting loop does not recompute them
    for col in categorical_features:
        # normalize='index' makes each row (category) sum to 100 %.
        crosstab = pd.crosstab(df[col], df['Churn'], normalize='index') * 100
        crosstabs_pct[col] = crosstab
        # An explicit dtype avoids the dtype-less empty-Series construction.
        churn_rates[col] = crosstab['Yes'] if 'Yes' in crosstab.columns else pd.Series(dtype=float)
        print(f"\n{col}:")
        print(crosstab.round(2))

    # Visualize top categorical features
    top_features = ['Contract', 'PaymentMethod', 'InternetService', 'OnlineSecurity', 'TechSupport']
    top_features = [col for col in top_features if col in categorical_features]

    if top_features:
        n_cols = 2
        n_rows = (len(top_features) + 1) // 2  # ceil(len / 2)
        # squeeze=False keeps axes 2-D even when there is a single row.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 5*n_rows), squeeze=False)
        flat_axes = axes.ravel()

        for ax, col in zip(flat_axes, top_features):
            # Reuse the percentage crosstab computed above.
            crosstabs_pct[col].plot(kind='bar', ax=ax,
                                    color=['#2ecc71', '#e74c3c'], width=0.8)
            ax.set_title(f'Churn Rate by {col}', fontsize=12, fontweight='bold')
            ax.set_xlabel(col, fontsize=10)
            ax.set_ylabel('Percentage (%)', fontsize=10)
            # NOTE(review): legend labels assume the crosstab columns are ordered
            # 'No', 'Yes' — confirm the Churn column contains exactly these labels.
            ax.legend(['No Churn', 'Churn'], loc='best')
            ax.tick_params(axis='x', rotation=45)
            ax.grid(axis='y', alpha=0.3)

        # Hide empty subplots (odd number of features leaves one unused panel)
        for ax in flat_axes[len(top_features):]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()


## 6. Correlation Analysis


In [None]:
# Correlation matrix for numeric features, plus each feature's correlation with churn
numeric_cols = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name != 'customerID'
]

# A correlation matrix needs at least two numeric columns.
if len(numeric_cols) > 1:
    corr_matrix = df[numeric_cols].corr()

    # Heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    plt.title('Correlation Matrix - Numeric Features', fontsize=14, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()

    # Correlation with Churn
    if 'Churn' in df.columns:
        # Work on a copy and add a 0/1 column (1 where Churn == 'Yes') so the
        # target can take part in the numeric correlation.
        df_encoded = df.copy()
        df_encoded['Churn_encoded'] = df_encoded['Churn'].eq('Yes').astype(int)

        churn_corr = (
            df_encoded[numeric_cols + ['Churn_encoded']]
            .corr()['Churn_encoded']
            .sort_values(ascending=False)
            .drop('Churn_encoded')  # drop the target's correlation with itself
        )

        print("\n" + "=" * 60)
        print("CORRELATION WITH CHURN")
        print("=" * 60)
        print(churn_corr.round(3))

        # Visualize
        plt.figure(figsize=(10, 6))
        churn_corr.plot(kind='barh', color='steelblue')
        plt.title('Feature Correlation with Churn', fontsize=14, fontweight='bold')
        plt.xlabel('Correlation Coefficient', fontsize=12)
        plt.ylabel('Features', fontsize=12)
        plt.axvline(x=0, color='red', linestyle='--', alpha=0.5)
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        plt.show()


## 7. Key Insights and Patterns


In [None]:
# Generate key insights: overall churn rate, riskiest categories, and churn
# rates split by tenure, monthly charges and contract type.
def _churn_rate_pct(churn_values):
    """Return the percentage of entries in a Churn Series that equal 'Yes'."""
    return (churn_values == 'Yes').mean() * 100


print("=" * 60)
print("KEY INSIGHTS")
print("=" * 60)

if 'Churn' in df.columns:
    # Overall churn rate
    overall_churn = _churn_rate_pct(df['Churn'])
    print(f"\n1. Overall Churn Rate: {overall_churn:.2f}%")

    # Top risk factors
    print("\n2. Top Risk Factors (Highest Churn Rates):")
    risk_factors = {}

    categorical_features = ['Contract', 'PaymentMethod', 'InternetService',
                            'OnlineSecurity', 'TechSupport', 'OnlineBackup']
    categorical_features = [col for col in categorical_features if col in df.columns]

    for col in categorical_features:
        # groupby(...).apply passes each group's Churn values as a Series.
        churn_by_category = df.groupby(col)['Churn'].apply(_churn_rate_pct)
        max_churn = churn_by_category.max()
        risk_factors[col] = (churn_by_category.idxmax(), max_churn)

    # Sort by churn rate (second element of each (category, rate) tuple)
    sorted_risks = sorted(risk_factors.items(), key=lambda x: x[1][1], reverse=True)
    for i, (feature, (category, rate)) in enumerate(sorted_risks[:5], 1):
        print(f"   {i}. {feature} = '{category}': {rate:.1f}% churn rate")

    # Tenure insights
    # Series.apply runs element-wise, so applying the rate lambda to the Churn
    # column handed it single strings and failed on `bool.mean()`. The rate is
    # now computed on the whole filtered Series, which yields one float.
    if 'tenure' in df.columns:
        print("\n3. Tenure Insights:")
        low_tenure_mask = df['tenure'] <= 12
        low_tenure_churn = _churn_rate_pct(df.loc[low_tenure_mask, 'Churn'])
        high_tenure_churn = _churn_rate_pct(df.loc[~low_tenure_mask & df['tenure'].notna(), 'Churn'])
        print(f"   - Low tenure (≤12 months): {low_tenure_churn:.1f}% churn rate")
        print(f"   - High tenure (>12 months): {high_tenure_churn:.1f}% churn rate")

    # Monthly charges insights
    if 'MonthlyCharges' in df.columns:
        print("\n4. Monthly Charges Insights:")
        median_charge = df['MonthlyCharges'].median()  # computed once for both splits
        high_charge_churn = _churn_rate_pct(df.loc[df['MonthlyCharges'] > median_charge, 'Churn'])
        low_charge_churn = _churn_rate_pct(df.loc[df['MonthlyCharges'] <= median_charge, 'Churn'])
        print(f"   - High charges (>median): {high_charge_churn:.1f}% churn rate")
        print(f"   - Low charges (≤median): {low_charge_churn:.1f}% churn rate")

    # Contract insights
    if 'Contract' in df.columns:
        print("\n5. Contract Type Impact:")
        contract_churn = df.groupby('Contract')['Churn'].apply(_churn_rate_pct)
        for contract, rate in contract_churn.items():
            print(f"   - {contract}: {rate:.1f}% churn rate")

print("\n" + "=" * 60)


## 8. Summary Statistics by Churn Status


In [None]:
# Compare statistics between churned and non-churned customers
if 'Churn' in df.columns:
    numeric_cols = [
        col for col in ('tenure', 'MonthlyCharges', 'TotalCharges') if col in df.columns
    ]

    if numeric_cols:
        print("=" * 60)
        print("COMPARATIVE STATISTICS: CHURNED vs NOT CHURNED")
        print("=" * 60)

        # One row per Churn value; five summary statistics per numeric column.
        summary_stats = ['mean', 'median', 'std', 'min', 'max']
        comparison = df.groupby('Churn')[numeric_cols].agg(summary_stats)
        print(comparison.round(2))

        # Create comparison visualization: overlaid histograms per feature
        fig, axes = plt.subplots(1, len(numeric_cols), figsize=(16, 5), squeeze=False)
        axes = axes.ravel()  # flat sequence of axes for any number of columns

        churn_mask = df['Churn'] == 'Yes'
        retained_mask = df['Churn'] == 'No'
        for ax, col in zip(axes, numeric_cols):
            churned = df.loc[churn_mask, col]
            not_churned = df.loc[retained_mask, col]

            ax.hist(not_churned, bins=30, alpha=0.6, label='No Churn',
                    color='#2ecc71', edgecolor='black')
            ax.hist(churned, bins=30, alpha=0.6, label='Churn',
                    color='#e74c3c', edgecolor='black')
            ax.set_title(f'{col} Distribution by Churn', fontsize=12, fontweight='bold')
            ax.set_xlabel(col, fontsize=10)
            ax.set_ylabel('Frequency', fontsize=10)
            ax.legend()
            ax.grid(alpha=0.3)

        plt.tight_layout()
        plt.show()


## 9. EDA Summary and Next Steps

### Key Findings (to be filled in from the results above):
1. **Class Distribution**: Check if the dataset is balanced or imbalanced
2. **High-Risk Groups**: Identify customer segments with highest churn rates
3. **Feature Importance**: Determine which features are most predictive
4. **Data Quality**: Verify data is clean and ready for modeling

### Recommendations for Feature Engineering:
- Consider creating tenure groups (e.g., 0-12, 13-24, 25-48, 49+ months)
- Create interaction features (e.g., MonthlyCharges × Contract type)
- Encode categorical variables appropriately
- Handle class imbalance if present
- Consider feature scaling/normalization

### Next Steps:
1. Feature Engineering
2. Model Selection
3. Model Training
4. Model Evaluation
