In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/matplotlib deprecation notices.
warnings.filterwarnings('ignore')

# Plot defaults: seaborn grid theme, default figure size and font size.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# DataFrame display limits used by every rich/plain repr below.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

## 1. Data Loading and Overview

In [None]:
# Read the raw churn table into `df`; every later cell works from this frame.
# Note: Update the path to your actual data file
df = pd.read_csv('../data/customer_churn.csv')

# Report the overall size of the frame in one call (one item per line).
print(
    f"Dataset Shape: {df.shape}",
    f"Number of Rows: {df.shape[0]:,}",
    f"Number of Columns: {df.shape[1]}",
    sep="\n",
)

In [None]:
# Preview the first few rows of `df` to sanity-check column names and value
# formats; as the last expression in the cell, the result is displayed.
df.head()

In [None]:
# Print the frame's structural summary (data types and related info) so that
# mis-typed columns can be spotted before any analysis.
df.info()

In [None]:
# Statistical summary of `df`; as the last expression in the cell, the
# resulting table is displayed.
df.describe()

## 2. Data Quality Assessment

In [None]:
# Build a per-column table of null counts and null percentages, keep only the
# columns that actually have gaps, and order them worst-first.
missing_data = (
    df.isnull().sum()
    .rename('Missing_Count')
    .to_frame()
    .assign(
        Column=lambda tbl: tbl.index,
        Missing_Percentage=lambda tbl: (tbl['Missing_Count'] / len(df) * 100).round(2),
    )
    .loc[:, ['Column', 'Missing_Count', 'Missing_Percentage']]
    .loc[lambda tbl: tbl['Missing_Count'] > 0]
    .sort_values('Missing_Count', ascending=False)
)

print("\nMissing Values Summary:")
print(missing_data)

In [None]:
# Horizontal bar chart of the missing-value percentages computed above;
# falls back to a short message when no column has gaps.
if missing_data.empty:
    print("No missing values found in the dataset!")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(missing_data['Column'], missing_data['Missing_Percentage'])
    ax.set_xlabel('Missing Percentage (%)')
    ax.set_title('Missing Values by Column')
    fig.tight_layout()
    plt.show()

In [None]:
# Count duplicated rows; `duplicates` is reused by the summary cell at the end.
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Only report the share when there is something to report.
if duplicates:
    print(f"Percentage of duplicates: {(duplicates / len(df) * 100):.2f}%")

## 3. Univariate Analysis

### Target Variable Analysis

In [None]:
# Target variable distribution (adjust column name as needed)
target_col = 'Churn'  # Update this to match your dataset

if target_col in df.columns:
    # Order by class label instead of by frequency so that position and colour
    # are tied to the class itself: the first label is always green and the
    # second always red, even when the second class is the majority.
    churn_counts = df[target_col].value_counts().sort_index()
    churn_pct = df[target_col].value_counts(normalize=True).sort_index() * 100

    print("Churn Distribution:")
    print(churn_counts)
    print("\nPercentage:")
    print(churn_pct.round(2))

    # Visualize: absolute counts on the left, shares on the right
    class_colors = ['green', 'red']
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Count plot
    axes[0].bar(churn_counts.index.astype(str), churn_counts.values, color=class_colors)
    axes[0].set_xlabel('Churn Status')
    axes[0].set_ylabel('Count')
    axes[0].set_title('Churn Distribution (Count)')
    for i, v in enumerate(churn_counts.values):
        # Anchor the label at the bar top; va='bottom' already places the text
        # just above it, so no dataset-size-dependent offset is needed.
        axes[0].text(i, v, str(v), ha='center', va='bottom')

    # Pie chart
    axes[1].pie(churn_counts.values, labels=churn_counts.index, autopct='%1.1f%%',
                colors=class_colors, startangle=90)
    axes[1].set_title('Churn Distribution (Percentage)')

    plt.tight_layout()
    plt.show()

    # Check for class imbalance (largest class share over smallest class share);
    # `imbalance_ratio` is read again by the summary cell.
    imbalance_ratio = churn_pct.max() / churn_pct.min()
    print(f"\nClass Imbalance Ratio: {imbalance_ratio:.2f}:1")
    if imbalance_ratio > 2:
        print("⚠️ Significant class imbalance detected. Consider using SMOTE or class weights.")
else:
    # Make the skipped analysis visible instead of silently producing no output.
    print(f"Target column '{target_col}' not found in the dataset - update `target_col`.")

### Numerical Features Analysis

In [None]:
# Identify numerical columns.
# 'number' selects every numeric dtype, so columns stored as int32/float32
# (or other integer/float widths) are no longer silently dropped the way they
# were with the explicit ['int64', 'float64'] list.
numerical_cols = df.select_dtypes(include='number').columns.tolist()

# The target is analysed separately, so keep it out of the feature list.
if target_col in numerical_cols:
    numerical_cols.remove(target_col)

print(f"Numerical Features ({len(numerical_cols)}):")
print(numerical_cols)

In [None]:
# Distribution of numerical features: one histogram per column on a 3-wide grid
if len(numerical_cols) > 0:
    n_cols = 3
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always returns a 2-D array of Axes, so flatten() gives a
    # flat, indexable sequence for every grid shape. The previous `[axes]`
    # fallback wrapped the whole 1x3 array in a list and crashed on
    # `axes[0].hist` whenever there were three or fewer numerical columns.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for idx, col in enumerate(numerical_cols):
        axes[idx].hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
        axes[idx].set_title(f'Distribution of {col}')
        axes[idx].set_xlabel(col)
        axes[idx].set_ylabel('Frequency')

    # Hide empty subplots left over in the last grid row
    for idx in range(len(numerical_cols), len(axes)):
        axes[idx].axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Box plots for outlier detection: one box plot per numerical column, 3-wide grid
if len(numerical_cols) > 0:
    n_cols = 3
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False guarantees a 2-D Axes array, so flatten() works for a single
    # row as well. The previous `[axes]` fallback produced a list holding the
    # whole 1x3 array and failed on `axes[0].boxplot` for three or fewer columns.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for idx, col in enumerate(numerical_cols):
        axes[idx].boxplot(df[col].dropna())
        axes[idx].set_title(f'Box Plot of {col}')
        axes[idx].set_ylabel(col)

    # Hide empty subplots left over in the last grid row
    for idx in range(len(numerical_cols), len(axes)):
        axes[idx].axis('off')

    plt.tight_layout()
    plt.show()

### Categorical Features Analysis

In [None]:
# Collect the object/category columns as categorical features, leaving the
# target column out because it is analysed on its own.
categorical_cols = [
    name
    for name in df.select_dtypes(include=['object', 'category']).columns
    if name != target_col
]

print(f"Categorical Features ({len(categorical_cols)}):")
print(categorical_cols)

In [None]:
# Print the frequency table and the number of distinct values for every
# categorical feature (the loop is simply skipped when there are none).
for col in categorical_cols:
    print(
        f"\n{col} - Value Counts:",
        df[col].value_counts(),
        f"Unique values: {df[col].nunique()}",
        sep="\n",
    )

In [None]:
# Visualize categorical features: one bar chart per column on a 2-wide grid
if len(categorical_cols) > 0:
    n_cols = 2
    n_rows = (len(categorical_cols) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always yields a 2-D Axes array, so flatten() is valid for
    # every grid shape. The previous conditional wrapped the 1x2 array in a list
    # when there was exactly one categorical column, which made `axes[0].bar`
    # fail and left the unused second panel visible.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for idx, col in enumerate(categorical_cols):
        value_counts = df[col].value_counts()
        # Plot against integer positions, then label the ticks with the categories
        axes[idx].bar(range(len(value_counts)), value_counts.values)
        axes[idx].set_xticks(range(len(value_counts)))
        axes[idx].set_xticklabels(value_counts.index, rotation=45, ha='right')
        axes[idx].set_title(f'Distribution of {col}')
        axes[idx].set_ylabel('Count')

    # Hide empty subplots left over in the last grid row
    for idx in range(len(categorical_cols), len(axes)):
        axes[idx].axis('off')

    plt.tight_layout()
    plt.show()

## 4. Bivariate Analysis

### Numerical Features vs Target

In [None]:
# Compare numerical features across churn groups: one grouped box plot per column
if target_col in df.columns and len(numerical_cols) > 0:
    n_cols = 2
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always yields a 2-D Axes array, so flatten() is valid for
    # every grid shape. The previous conditional wrapped the 1x2 array in a list
    # when there was exactly one numerical column, which broke `ax=axes[idx]`.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for idx, col in enumerate(numerical_cols):
        df.boxplot(column=col, by=target_col, ax=axes[idx])
        # Override the labels that DataFrame.boxplot sets on the axes
        axes[idx].set_title(f'{col} by Churn Status')
        axes[idx].set_xlabel('Churn')
        axes[idx].set_ylabel(col)

    # Hide empty subplots left over in the last grid row
    for idx in range(len(numerical_cols), len(axes)):
        axes[idx].axis('off')

    plt.suptitle('')  # Remove default title
    plt.tight_layout()
    plt.show()

### Categorical Features vs Target

In [None]:
# Churn rate by categorical features
if target_col in df.columns and len(categorical_cols) > 0:
    # Build one boolean "churned" flag up front. Comparing with 1 only works for
    # numeric/boolean targets; a text target such as 'Yes'/'No' would report a
    # 0% churn rate for every category.
    if pd.api.types.is_numeric_dtype(df[target_col]):
        churned = df[target_col] == 1
    else:
        # Assumed spellings of the positive class - extend to match your dataset.
        positive_labels = {'1', '1.0', 'yes', 'y', 'true', 't', 'churn', 'churned'}
        churned = df[target_col].astype(str).str.strip().str.lower().isin(positive_labels)

    for col in categorical_cols:
        # Share of churned rows within each category, in percent
        churn_rate = churned.groupby(df[col]).mean() * 100

        print(f"\nChurn Rate by {col}:")
        print(churn_rate.sort_values(ascending=False))

        # Visualize
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # Stacked bar chart
        pd.crosstab(df[col], df[target_col]).plot(kind='bar', stacked=True, ax=axes[0])
        axes[0].set_title(f'Churn Distribution by {col}')
        axes[0].set_xlabel(col)
        axes[0].set_ylabel('Count')
        # Keep the legend entries generated from the actual target values;
        # hard-coded ['No', 'Yes'] labels can mislabel the classes when the
        # target uses other values or a different ordering.
        axes[0].legend(title='Churn')
        axes[0].tick_params(axis='x', rotation=45)

        # Churn rate bar chart
        churn_rate.plot(kind='bar', ax=axes[1], color='coral')
        axes[1].set_title(f'Churn Rate by {col}')
        axes[1].set_xlabel(col)
        axes[1].set_ylabel('Churn Rate (%)')
        axes[1].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

## 5. Correlation Analysis

In [None]:
# Correlation matrix for numerical features
if len(numerical_cols) > 0:
    # Include the target if it's numerical. Use the pandas dtype helpers rather
    # than comparing against the strings 'int64'/'float64', so a target stored
    # as int32/float32 (or another numeric width) is still included; booleans
    # are left out.
    corr_cols = numerical_cols.copy()
    if (
        target_col in df.columns
        and target_col not in corr_cols
        and pd.api.types.is_numeric_dtype(df[target_col])
        and not pd.api.types.is_bool_dtype(df[target_col])
    ):
        corr_cols.append(target_col)

    # `correlation_matrix` is reused by the multicollinearity cell below
    correlation_matrix = df[corr_cols].corr()

    # Plot correlation heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, linewidths=1)
    plt.title('Correlation Matrix of Numerical Features')
    plt.tight_layout()
    plt.show()

    # Top correlations with target (signed, most positive first)
    if target_col in corr_cols:
        target_corr = correlation_matrix[target_col].drop(target_col).sort_values(ascending=False)
        print("\nTop Correlations with Target:")
        print(target_corr)

In [None]:
# Identify highly correlated features (multicollinearity)
# Defined before the guard so the summary cell can always read these names,
# even when the dataset has no numerical features.
high_corr_threshold = 0.8
high_corr_pairs = []

if len(numerical_cols) > 0:
    # Multicollinearity is about feature-feature relationships, so leave the
    # target out of the search: a feature that tracks the target closely is a
    # good predictor, not a candidate for removal.
    feature_corr = correlation_matrix.drop(index=target_col, columns=target_col, errors='ignore')
    feature_names = feature_corr.columns

    # Walk the upper triangle only, so each unordered pair is checked once and
    # the diagonal (self-correlation) is skipped.
    for i in range(len(feature_names)):
        for j in range(i + 1, len(feature_names)):
            pair_corr = feature_corr.iloc[i, j]
            if abs(pair_corr) > high_corr_threshold:
                high_corr_pairs.append((feature_names[i], feature_names[j], pair_corr))

    if high_corr_pairs:
        print(f"\nHighly Correlated Feature Pairs (|correlation| > {high_corr_threshold}):")
        for feat1, feat2, corr in high_corr_pairs:
            print(f"{feat1} <-> {feat2}: {corr:.3f}")
        print("\n⚠️ Consider removing one feature from each highly correlated pair.")
    else:
        print(f"\nNo highly correlated feature pairs found (threshold: {high_corr_threshold})")

## 6. Key Insights and Findings

### Summary Statistics

In [None]:
# Create summary insights: a plain-text recap of the findings from the cells above
print("="*80)
print("KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS")
print("="*80)

print("\n1. Dataset Overview:")
print(f"   - Total Records: {len(df):,}")
print(f"   - Total Features: {len(df.columns)}")
print(f"   - Numerical Features: {len(numerical_cols)}")
print(f"   - Categorical Features: {len(categorical_cols)}")

if target_col in df.columns:
    # Comparing with 1/0 only works for numeric targets; a text target such as
    # 'Yes'/'No' would report zero churned and zero retained customers.
    if pd.api.types.is_numeric_dtype(df[target_col]):
        churned_mask = df[target_col] == 1
        retained_mask = df[target_col] == 0
    else:
        # Assumed spellings of the positive class - extend to match your dataset.
        positive_labels = {'1', '1.0', 'yes', 'y', 'true', 't', 'churn', 'churned'}
        churned_mask = df[target_col].astype(str).str.strip().str.lower().isin(positive_labels)
        # Rows with a missing target count as neither churned nor retained
        retained_mask = df[target_col].notna() & ~churned_mask

    churn_rate = churned_mask.sum() / len(df) * 100
    print("\n2. Churn Statistics:")
    print(f"   - Overall Churn Rate: {churn_rate:.2f}%")
    print(f"   - Churned Customers: {churned_mask.sum():,}")
    print(f"   - Retained Customers: {retained_mask.sum():,}")

missing_total = df.isnull().sum().sum()
print("\n3. Data Quality:")
print(f"   - Total Missing Values: {missing_total:,}")
print(f"   - Duplicate Rows: {duplicates}")

print("\n4. Recommendations:")
if missing_total > 0:
    print("   - Handle missing values through imputation or removal")
if duplicates > 0:
    print("   - Remove duplicate records")
# `imbalance_ratio` and `high_corr_pairs` are only created when the earlier
# cells had a target column / numerical features to work with, so look them up
# defensively instead of raising NameError.
if globals().get('imbalance_ratio', 0) > 2:
    print("   - Address class imbalance using SMOTE or class weights")
if globals().get('high_corr_pairs'):
    print("   - Consider feature selection to handle multicollinearity")

print("\n" + "="*80)

## Next Steps

Based on this EDA, the following steps are recommended:

1. **Data Preprocessing**:
   - Handle missing values
   - Encode categorical variables
   - Scale numerical features
   - Engineer new features if needed

2. **Feature Selection**:
   - Remove highly correlated features
   - Select most important features for modeling

3. **Model Training**:
   - Train multiple classification models
   - Perform hyperparameter tuning
   - Use cross-validation

4. **Model Evaluation**:
   - Evaluate using appropriate metrics (ROC-AUC, F1-score, etc.)
   - Analyze feature importance
   - Generate interpretability insights