# üìä Day 2: EDA + Data Cleaning + Validation

**Customer Churn Analytics Project**

This notebook covers:
1. Data Quality Checks
2. Leakage Detection
3. EDA Visualizations
4. Data Cleaning & Preprocessing
5. Train/Test Split

In [16]:
# Imports
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

# Project modules
from config import OUTPUT_DIR, RAW_DATA_DIR
from eda_utils import (
    plot_churn_distribution, plot_correlation_heatmap,
    plot_boxplots_vs_churn, plot_histograms,
    plot_churn_rate_by_category
)
from data_quality_checks import (
    check_shape, check_duplicates, check_missing_values,
    check_unique_values, compute_numeric_stats,
    detect_outliers_iqr, detect_impossible_values,
    check_leakage, generate_eda_report
)
from preprocess import (
    load_raw_data, clean_dataset, save_clean_data,
    split_data, CATEGORICAL_COLS, NUMERIC_COLS
)

print("‚úì All modules loaded successfully!")

‚úì All modules loaded successfully!


## 1. Load Data

In [17]:
# Load the final churn dataset
df = load_raw_data()

print(f"Dataset shape: {df.shape}")
df.head()

Loaded dataset: 5,000 rows √ó 17 columns
Dataset shape: (5000, 17)


Unnamed: 0,customer_id,age,gender,location,device_type,acquisition_channel,plan_type,monthly_price,auto_renew,total_sessions_30d,avg_session_minutes_30d,total_crashes_30d,failed_payments_30d,total_amount_success_30d,support_tickets_30d,avg_resolution_time_30d,churn
0,C000001,50,Female,Nagpur,Web,Ads,Standard,499,1,207,30.09,3,0,488.79,0,0.0,0
1,C000002,34,Male,Patna,Android,Partner,Standard,499,1,233,27.37,5,0,477.52,0,0.0,0
2,C000003,45,Female,Bangalore,Android,Ads,Standard,499,1,206,25.24,3,0,501.11,0,0.0,0
3,C000004,18,Male,Nagpur,iOS,Ads,Basic,199,1,158,20.67,0,0,203.95,0,0.0,0
4,C000005,40,Male,Vadodara,Android,Organic,Premium,999,0,0,0.0,0,0,0.0,0,0.0,1


## 2. Data Quality Checks

In [18]:
# Shape
rows, cols = check_shape(df)
print(f"üìä Shape: {rows:,} rows √ó {cols} columns")

# Duplicates
dupes = check_duplicates(df)
print(f"\nüîç Duplicate customer_ids: {dupes}")

# Missing values
print("\n‚ùì Missing Values:")
missing = check_missing_values(df)
if len(missing) > 0:
    display(missing)
else:
    print("   No missing values!")

üìä Shape: 5,000 rows √ó 17 columns

üîç Duplicate customer_ids: 0

‚ùì Missing Values:
   No missing values!


In [19]:
# Unique values for categorical columns
print("üìã Categorical Columns:")
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
unique_info = check_unique_values(df, cat_cols)

for col, info in unique_info.items():
    print(f"\n  {col}: {info['n_unique']} unique values")
    print(f"     {info['values']}")

üìã Categorical Columns:

  customer_id: 5000 unique values
     ['C000001', 'C000002', 'C000003', 'C000004', 'C000005', 'C000006', 'C000007', 'C000008', 'C000009', 'C000010']

  gender: 3 unique values
     ['Female', 'Male', 'Other']

  location: 20 unique values
     ['Nagpur', 'Patna', 'Bangalore', 'Vadodara', 'Hyderabad', 'Kolkata', 'Pune', 'Bhopal', 'Chennai', 'Thane']

  device_type: 3 unique values
     ['Web', 'Android', 'iOS']

  acquisition_channel: 4 unique values
     ['Ads', 'Partner', 'Organic', 'Referral']

  plan_type: 3 unique values
     ['Standard', 'Basic', 'Premium']


In [20]:
# Numeric statistics
print("üìà Numeric Statistics:")
stats = compute_numeric_stats(df)
display(stats)

üìà Numeric Statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,5000.0,31.91,9.20235,18.0,25.0,32.0,38.0,60.0
monthly_price,5000.0,504.78,313.068992,199.0,199.0,499.0,499.0,999.0
auto_renew,5000.0,0.556,0.496904,0.0,0.0,1.0,1.0,1.0
total_sessions_30d,5000.0,155.7782,82.854615,0.0,153.0,189.0,210.0,287.0
avg_session_minutes_30d,5000.0,19.296924,10.269311,0.0,18.0675,22.97,26.05,39.14
total_crashes_30d,5000.0,2.351,2.432403,0.0,0.0,2.0,4.0,21.0
failed_payments_30d,5000.0,0.0312,0.173875,0.0,0.0,0.0,0.0,1.0
total_amount_success_30d,5000.0,408.808394,368.866199,0.0,191.9,207.99,514.155,2087.63
support_tickets_30d,5000.0,0.1864,0.504882,0.0,0.0,0.0,0.0,6.0
avg_resolution_time_30d,5000.0,3.878256,11.431128,0.0,0.0,0.0,0.0,77.5


In [21]:
# Outlier detection (IQR method)
print("‚ö†Ô∏è Outliers (IQR Method):")
outliers = detect_outliers_iqr(df)

if len(outliers) > 0:
    for col, info in outliers.items():
        print(f"\n  {col}: {info['count']} outliers ({info['pct']}%)")
        print(f"     Bounds: [{info['lower_bound']}, {info['upper_bound']}]")
        print(f"     Actual: [{info['min']}, {info['max']}]")
else:
    print("   No significant outliers detected!")

‚ö†Ô∏è Outliers (IQR Method):

  age: 25 outliers (0.5%)
     Bounds: [5.5, 57.5]
     Actual: [18, 60]

  monthly_price: 1247 outliers (24.94%)
     Bounds: [-251.0, 949.0]
     Actual: [199, 999]

  total_sessions_30d: 1095 outliers (21.9%)
     Bounds: [67.5, 295.5]
     Actual: [0, 287]

  avg_session_minutes_30d: 1037 outliers (20.74%)
     Bounds: [6.09, 38.02]
     Actual: [0.0, 39.14]

  total_crashes_30d: 42 outliers (0.84%)
     Bounds: [-6.0, 10.0]
     Actual: [0, 21]

  failed_payments_30d: 156 outliers (3.12%)
     Bounds: [0.0, 0.0]
     Actual: [0, 1]

  total_amount_success_30d: 522 outliers (10.44%)
     Bounds: [-291.48, 997.54]
     Actual: [0.0, 2087.63]

  support_tickets_30d: 738 outliers (14.76%)
     Bounds: [0.0, 0.0]
     Actual: [0, 6]

  avg_resolution_time_30d: 738 outliers (14.76%)
     Bounds: [0.0, 0.0]
     Actual: [0.0, 77.5]

  churn: 1097 outliers (21.94%)
     Bounds: [0.0, 0.0]
     Actual: [0, 1]


In [22]:
# Impossible value detection
print("üö´ Impossible Values:")
impossible = detect_impossible_values(df)

if len(impossible) > 0:
    for col, info in impossible.items():
        print(f"\n  {col}: {info['count']} invalid values")
        print(f"     Rule: {info['rule']}")
else:
    print("   No impossible values detected!")

üö´ Impossible Values:
   No impossible values detected!


## 3. Leakage Detection

In [23]:
# Check for data leakage
print("üîí LEAKAGE DETECTION")
print("=" * 50)

leakage_warnings = check_leakage(df)

if len(leakage_warnings) == 0:
    print("‚úì No data leakage detected!")
else:
    for warning in leakage_warnings:
        print(f"\n[{warning['type']}] {warning['feature']}")
        print(f"   Issue: {warning['issue']}")
        print(f"   Action: {warning['action']}")

üîí LEAKAGE DETECTION

   Issue: High correlation with target: -0.959
   Action: Investigate if this feature uses future information

   Issue: High correlation with target: -0.941
   Action: Investigate if this feature uses future information

[INFO] auto_renew
   Issue: Large mean difference: churned=0.00, not_churned=0.71
   Action: Verify this is a legitimate predictive signal

[INFO] total_sessions_30d
   Issue: Large mean difference: churned=5.99, not_churned=197.88
   Action: Verify this is a legitimate predictive signal

[INFO] avg_session_minutes_30d
   Issue: Large mean difference: churned=1.07, not_churned=24.42
   Action: Verify this is a legitimate predictive signal

[INFO] total_amount_success_30d
   Issue: Large mean difference: churned=74.99, not_churned=502.63
   Action: Verify this is a legitimate predictive signal


## 4. EDA Visualizations

In [24]:
# Create plots directory
plots_dir = OUTPUT_DIR / 'plots'
plots_dir.mkdir(parents=True, exist_ok=True)

print("üìä Generating EDA visualizations...")

üìä Generating EDA visualizations...


In [25]:
# Churn distribution
plot_churn_distribution(df, plots_dir=plots_dir)

  ‚úì Saved: churn_distribution.png


WindowsPath('C:/Users/Lenovo/Desktop/churn/churn_project/outputs/plots/churn_distribution.png')

In [26]:
# Correlation heatmap
plot_correlation_heatmap(df, plots_dir=plots_dir)

  ‚úì Saved: correlation_heatmap.png


WindowsPath('C:/Users/Lenovo/Desktop/churn/churn_project/outputs/plots/correlation_heatmap.png')

In [27]:
# Boxplots vs churn
numeric_cols = ['age', 'monthly_price', 'total_sessions_30d', 
                'avg_session_minutes_30d', 'total_crashes_30d', 'failed_payments_30d']
plot_boxplots_vs_churn(df, numeric_cols, plots_dir=plots_dir)

  ‚úì Saved: boxplots_vs_churn.png


WindowsPath('C:/Users/Lenovo/Desktop/churn/churn_project/outputs/plots/boxplots_vs_churn.png')

In [28]:
# Histograms
hist_cols = ['monthly_price', 'total_sessions_30d', 'failed_payments_30d']
plot_histograms(df, hist_cols, plots_dir=plots_dir)

  ‚úì Saved: feature_histograms.png


WindowsPath('C:/Users/Lenovo/Desktop/churn/churn_project/outputs/plots/feature_histograms.png')

In [29]:
# Churn rate by categorical features
cat_cols = ['plan_type', 'device_type', 'acquisition_channel']
plot_churn_rate_by_category(df, cat_cols, plots_dir=plots_dir)

  ‚úì Saved: churn_rate_by_category.png


WindowsPath('C:/Users/Lenovo/Desktop/churn/churn_project/outputs/plots/churn_rate_by_category.png')

## 5. Generate EDA Report

In [30]:
# Generate and save comprehensive EDA report
report = generate_eda_report(df)
print("\n" + report)

UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f4ca' in position 205: character maps to <undefined>

## 6. Data Cleaning

In [None]:
# Clean the dataset
df_clean = clean_dataset(df, verbose=True)

In [None]:
# Save cleaned dataset
save_clean_data(df_clean)

## 7. Train/Test Split

In [None]:
# Split data with stratification
X_train, X_test, y_train, y_test = split_data(df_clean, test_size=0.2, random_state=42)

## 8. Final Summary

In [None]:
print("=" * 60)
print("DAY 2 SUMMARY")
print("=" * 60)
print(f"\nüìä Cleaned dataset shape: {df_clean.shape}")
print(f"üéØ Churn rate: {df_clean['churn'].mean()*100:.2f}%")
print(f"‚ùì Missing values after cleaning: {df_clean.isnull().sum().sum()}")
print(f"\n‚úì Saved outputs:")
print(f"   - outputs/eda_report.txt")
print(f"   - outputs/cleaned_dataset.csv")
print(f"   - outputs/plots/*.png")