# Customer Churn Prediction - EDA Playground

This notebook explores the e-commerce customer churn dataset and helps identify patterns and insights for prediction.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import os
import sys

# Add the parent directory to the path
sys.path.append('..')
# © 2025 Meet Jain | Project created by Meet Jain. Unauthorized copying or reproduction is prohibited.

from src.helpers import SAMPLE_DATA_PATH, NUMERIC_COLS, CATEGORICAL_COLS, BINARY_COLS, TARGET_COL
from src.data_prep import load_csv, basic_clean, handle_missing
from src.eda_utils import create_eda_plots

# Plot styling: apply matplotlib's ggplot sheet, then seaborn's whitegrid theme.
# NOTE(review): sns.set() applies seaborn's own rcParams after the ggplot sheet,
# so parts of the ggplot style are presumably overridden — confirm both are wanted.
plt.style.use('ggplot')
sns.set(style="whitegrid")

# Pandas display: show every column and allow wide console output.
for option_name, option_value in (('display.max_columns', None),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

In [None]:
# Load the dataset
if os.path.exists(SAMPLE_DATA_PATH):
    print(f"Loading data from {SAMPLE_DATA_PATH}")
    df = pd.read_csv(SAMPLE_DATA_PATH)
else:
    # If sample data doesn't exist, run the synthetic data generator
    print("Sample data not found. Running synthetic data generator...")
    !python ../data/generate_synthetic_data.py
    df = pd.read_csv(SAMPLE_DATA_PATH)

# Display basic info
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Inspect the raw frame: dtype of every column, then per-column null counts.
print("Data types:", df.dtypes, sep="\n")
# © 2025 Meet Jain | Project created by Meet Jain. Unauthorized copying or reproduction is prohibited.

print("\nMissing values:", df.isna().sum(), sep="\n")

In [None]:
# Run the project's cleaning helpers back to back: basic_clean on the raw
# frame, then handle_missing on its result.
df_clean = handle_missing(basic_clean(df))

# Verify cleaning: report the shape and the remaining per-column null counts.
print("\nAfter cleaning:")
print(f"Dataset shape: {df_clean.shape}")
print("Missing values:", df_clean.isna().sum(), sep="\n")

In [None]:
# Summary statistics for the numeric feature columns; the describe() table is
# the cell's displayed output.
print("Numeric columns statistics:")
df_clean.loc[:, NUMERIC_COLS].describe()

In [None]:
# For every categorical feature, print the share of rows (in percent) that
# each of its levels accounts for.
for cat_col in CATEGORICAL_COLS:
    level_share = df_clean[cat_col].value_counts(normalize=True)
    print(f"\n{cat_col} distribution:")
    print(level_share.round(3) * 100)

In [None]:
# Churn distribution: class counts and percentage shares of the target column.
churn_counts = df_clean[TARGET_COL].value_counts()
churn_pct = df_clean[TARGET_COL].value_counts(normalize=True).round(3) * 100

# .get(label, 0) keeps the summary working even when one class is absent from
# the data (plain label indexing would raise KeyError in that case).
print("Churn distribution:")
print(f"Not churned: {churn_counts.get(0, 0)} ({churn_pct.get(0, 0.0):.1f}%)")
print(f"Churned: {churn_counts.get(1, 0)} ({churn_pct.get(1, 0.0):.1f}%)")

# Plot churn distribution. order=[0, 1] pins the bar positions so the tick
# labels and palette below always refer to the intended class.
plt.figure(figsize=(8, 5))
ax = sns.countplot(x=TARGET_COL, data=df_clean, order=[0, 1],
                   palette=['#4ECDC4', '#FF6B6B'])

# Add percentages on top of bars. The share is derived from each bar's own
# height rather than from the bar's position in ax.patches, and the vertical
# offset scales with the dataset size instead of being a fixed row count.
total = churn_counts.sum()
label_offset = 0.01 * total
for p in ax.patches:
    height = p.get_height()
    if not height > 0:
        # Empty (or NaN-height) bars carry no meaningful percentage.
        continue
    share = round(height / total, 3) * 100
    ax.text(p.get_x() + p.get_width()/2., height + label_offset, f'{share:.1f}%',
            ha="center", fontsize=12)

plt.title('Customer Churn Distribution')
plt.ylabel('Count')
plt.xlabel('Churn')
plt.xticks([0, 1], ['Not Churned (0)', 'Churned (1)'])
plt.show()

In [None]:
# Explore churn by categorical features: one bar chart per categorical column,
# stacked vertically, showing the churn rate (%) of each level.
plt.figure(figsize=(15, 12))

for i, col in enumerate(CATEGORICAL_COLS, 1):
    ax = plt.subplot(len(CATEGORICAL_COLS), 1, i)

    # Calculate churn rate by category, highest first
    churn_by_cat = df_clean.groupby(col)[TARGET_COL].mean().sort_values(ascending=False) * 100

    # Plot. Passing order= explicitly keeps the bars in the sorted order above
    # instead of leaving the ordering to seaborn's inference from the column dtype.
    sns.barplot(x=churn_by_cat.index, y=churn_by_cat.values,
                order=churn_by_cat.index.tolist(), palette='viridis', ax=ax)

    # Add percentages on top of bars (bars without a finite height are skipped)
    for p in ax.patches:
        height = p.get_height()
        if not np.isfinite(height):
            continue
        ax.text(p.get_x() + p.get_width()/2., height + 1, f'{height:.1f}%',
                ha="center", fontsize=10)

    plt.title(f'Churn Rate by {col}')
    plt.ylabel('Churn Rate (%)')
    plt.xlabel(col)
    # Tilt the tick labels only when there are enough levels to collide.
    plt.xticks(rotation=45 if len(churn_by_cat) > 3 else 0)
    # © 2025 Meet Jain | Project created by Meet Jain. Unauthorized copying or reproduction is prohibited.

# Lay the figure out once, after every subplot exists.
plt.tight_layout()
plt.show()

In [None]:
# Explore numeric features distribution by churn: for each numeric column draw
# a histogram (left) and a boxplot (right), both split by churn status.
plt.figure(figsize=(15, 20))

# Readable hue values. Letting seaborn build the legend from these names keeps
# every legend entry tied to its own colour; overriding the legend afterwards
# with plt.legend([...]) assigns labels by artist draw order, which is not
# guaranteed to match the hue level order.
churn_status = df_clean[TARGET_COL].map({0: 'Not Churned', 1: 'Churned'})
status_order = ['Not Churned', 'Churned']

for i, col in enumerate(NUMERIC_COLS, 1):
    plt.subplot(len(NUMERIC_COLS), 2, i*2-1)

    # Histogram by churn
    sns.histplot(data=df_clean, x=col, hue=churn_status, hue_order=status_order,
                 bins=20, alpha=0.6, palette=['#4ECDC4', '#FF6B6B'],
                 element="step", multiple="layer")
    plt.title(f'Distribution of {col} by Churn')
    plt.ylabel('Count')
    plt.xlabel(col)

    plt.subplot(len(NUMERIC_COLS), 2, i*2)

    # Boxplot by churn
    sns.boxplot(data=df_clean, y=col, x=TARGET_COL, palette=['#4ECDC4', '#FF6B6B'])
    plt.title(f'Boxplot of {col} by Churn')
    plt.ylabel(col)
    plt.xlabel('Churn')
    plt.xticks([0, 1], ['Not Churned', 'Churned'])

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations across the numeric features, the binary flags and the
# churn target, rendered as an annotated heatmap centred on zero.
corr_cols = [*NUMERIC_COLS, *BINARY_COLS, TARGET_COL]
corr_matrix = df_clean[corr_cols].corr()

plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
heatmap_ax.set_title('Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Correlation of every feature with the churn target, highest value first.
churn_corr = corr_matrix[TARGET_COL].drop(TARGET_COL).sort_values(ascending=False)

plt.figure(figsize=(10, 6))
ax = sns.barplot(x=churn_corr.values, y=churn_corr.index, palette='viridis')

# Write each value next to its bar: just right of positive bars, left of the others.
for row, corr_value in enumerate(churn_corr.values):
    if corr_value > 0:
        label_x = corr_value + 0.01
    else:
        label_x = corr_value - 0.06
    ax.text(label_x, row, f'{corr_value:.2f}', va='center', fontsize=10)

ax.set_title('Features Correlation with Churn')
ax.set_xlabel('Correlation')
ax.set_ylabel('Features')
# Dashed reference line separating negative from positive correlations.
ax.axvline(x=0, color='gray', linestyle='--')
plt.tight_layout()
plt.show()
# © 2025 Meet Jain | Project created by Meet Jain. Unauthorized copying or reproduction is prohibited.

In [None]:
# Create days since last purchase buckets.
# The labels describe right-inclusive ranges ("0-30", "31-60", ...), so the
# bins are closed on the right; include_lowest=True keeps a value of exactly 0
# in the first bucket. (Left-closed bins would file day 30 under "31-60 days".)
bins = [0, 30, 60, 90, np.inf]
labels = ['0-30 days', '31-60 days', '61-90 days', 'Over 90 days']
df_clean['days_bucket'] = pd.cut(df_clean['days_since_last_purchase'], bins=bins, labels=labels,
                                 right=True, include_lowest=True)

# Calculate churn rate by bucket. observed=False is passed explicitly so every
# bucket is present in the result (NaN when empty), independent of the pandas
# default for categorical group keys.
churn_by_days = df_clean.groupby('days_bucket', observed=False)[TARGET_COL].mean() * 100

# Bars are drawn in chronological bucket order, pinned via order=.
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=churn_by_days.index, y=churn_by_days.values, order=labels, palette='viridis')

# Add percentages on top of bars (empty buckets have no finite height to label)
for p in ax.patches:
    height = p.get_height()
    if not np.isfinite(height):
        continue
    ax.text(p.get_x() + p.get_width()/2., height + 1, f'{height:.1f}%',
            ha="center", fontsize=10)

plt.title('Churn Rate by Days Since Last Purchase')
plt.ylabel('Churn Rate (%)')
plt.xlabel('Days Since Last Purchase')
plt.tight_layout()
# © 2025 Meet Jain | Project created by Meet Jain. Unauthorized copying or reproduction is prohibited.
plt.show()

In [None]:
# Create tenure buckets.
# The labels describe right-inclusive ranges ("0-6 months", "7-12 months",
# ..., "Over 3 years"), so the bins are closed on the right; include_lowest=True
# keeps a tenure of exactly 0 in the first bucket. (Left-closed bins would file
# month 6 under "7-12 months" and month 36 under "Over 3 years".)
bins = [0, 6, 12, 24, 36, np.inf]
labels = ['0-6 months', '7-12 months', '1-2 years', '2-3 years', 'Over 3 years']
df_clean['tenure_bucket'] = pd.cut(df_clean['tenure_months'], bins=bins, labels=labels,
                                   right=True, include_lowest=True)

# Calculate churn rate by bucket. observed=False is passed explicitly so every
# bucket is present in the result (NaN when empty), independent of the pandas
# default for categorical group keys.
churn_by_tenure = df_clean.groupby('tenure_bucket', observed=False)[TARGET_COL].mean() * 100

# Bars are drawn in tenure order (shortest to longest), pinned via order=.
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=churn_by_tenure.index, y=churn_by_tenure.values, order=labels, palette='viridis')

# Add percentages on top of bars (empty buckets have no finite height to label)
for p in ax.patches:
    height = p.get_height()
    if not np.isfinite(height):
        continue
    ax.text(p.get_x() + p.get_width()/2., height + 1, f'{height:.1f}%',
            ha="center", fontsize=10)

plt.title('Churn Rate by Tenure')
plt.ylabel('Churn Rate (%)')
plt.xlabel('Tenure')
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot: Loyalty vs Complaints colored by churn.
# The hue is mapped to readable names up front so seaborn builds the legend
# itself; replacing the legend afterwards with plt.legend([...]) assigns the
# labels by artist order and can attach them to the wrong colours.
churn_status = df_clean[TARGET_COL].map({0: 'Not Churned', 1: 'Churned'})

plt.figure(figsize=(10, 6))
scatter = sns.scatterplot(data=df_clean, x='loyalty_score', y='complaints',
                          hue=churn_status, hue_order=['Not Churned', 'Churned'],
                          palette=['#4ECDC4', '#FF6B6B'], alpha=0.7)
plt.title('Loyalty Score vs Complaints by Churn Status')
plt.xlabel('Loyalty Score')
plt.ylabel('Complaints')

# Give the automatically generated legend a readable title, if one was drawn.
churn_legend = scatter.get_legend()
if churn_legend is not None:
    churn_legend.set_title('Churn')

plt.tight_layout()
plt.show()

In [None]:
# Interactive plotly charts. Best effort: any failure is reported with a
# message instead of stopping the notebook.
try:
    # --- Bar chart: churn rate (%) per region ---
    churn_by_region = (
        (df_clean.groupby('region')[TARGET_COL].mean() * 100)
        .rename('churn_rate')
        .reset_index()
    )
    bar_text = churn_by_region['churn_rate'].round(1).astype(str) + '%'

    fig = px.bar(
        churn_by_region,
        x='region',
        y='churn_rate',
        labels={'churn_rate': 'Churn Rate (%)', 'region': 'Region'},
        title='Churn Rate by Region',
        text=bar_text,
        color='churn_rate',
        color_continuous_scale=['#4ECDC4', '#FFD166', '#FF6B6B'],
    )
    fig.update_layout(width=700, height=500)
    fig.show()

    # --- Scatter: loyalty score vs days since last purchase, colored by churn ---
    churn_status = df_clean[TARGET_COL].map({0: 'Not Churned', 1: 'Churned'})
    axis_labels = {
        'loyalty_score': 'Loyalty Score',
        'days_since_last_purchase': 'Days Since Last Purchase',
    }

    fig = px.scatter(
        df_clean,
        x='loyalty_score',
        y='days_since_last_purchase',
        color=churn_status,
        color_discrete_map={'Not Churned': '#4ECDC4', 'Churned': '#FF6B6B'},
        title='Loyalty Score vs Days Since Last Purchase',
        labels=axis_labels,
        opacity=0.7,
    )
    fig.update_layout(width=800, height=600)
    fig.show()
except Exception as e:
    print(f"Error creating plotly visualizations: {e}")

In [None]:
# Key Insights: headline numbers derived from the cleaned frame and from the
# churn correlations computed earlier in the notebook (churn_corr).
print("Key Insights from EDA:")
print("---------------------")

# Churn rate
print(f"1. Overall churn rate: {df_clean[TARGET_COL].mean() * 100:.1f}%")

# Most correlated feature: ranked by absolute correlation so that a strong
# negative relationship is not overlooked; the signed value is what is printed.
top_corr = churn_corr.abs().idxmax()
top_corr_val = churn_corr.loc[top_corr]
print(f"2. Most correlated feature with churn: {top_corr} ({top_corr_val:.2f})")

# Region with highest churn
region_churn = df_clean.groupby('region')[TARGET_COL].mean()
highest_region = region_churn.idxmax()
highest_region_rate = region_churn.max() * 100
print(f"3. Region with highest churn: {highest_region} ({highest_region_rate:.1f}%)")

# Payment type with highest churn
payment_churn = df_clean.groupby('payment_type')[TARGET_COL].mean()
highest_payment = payment_churn.idxmax()
highest_payment_rate = payment_churn.max() * 100
print(f"4. Payment type with highest churn: {highest_payment} ({highest_payment_rate:.1f}%)")

# Days since purchase insight (only when the bucket column has been created).
# observed=False is explicit so the result does not depend on the pandas
# default for categorical group keys.
if 'days_bucket' in df_clean.columns:
    days_churn = df_clean.groupby('days_bucket', observed=False)[TARGET_COL].mean()
    highest_days = days_churn.idxmax()
    highest_days_rate = days_churn.max() * 100
    print(f"5. Days since purchase with highest churn: {highest_days} ({highest_days_rate:.1f}%)")

# Tenure insight (only when the bucket column has been created)
if 'tenure_bucket' in df_clean.columns:
    tenure_churn = df_clean.groupby('tenure_bucket', observed=False)[TARGET_COL].mean()
    highest_tenure = tenure_churn.idxmax()
    highest_tenure_rate = tenure_churn.max() * 100
    print(f"6. Tenure bracket with highest churn: {highest_tenure} ({highest_tenure_rate:.1f}%)")

# Promo users: compare only when both groups exist, and print the magnitude of
# the gap so the wording ("more"/"less") carries the direction on its own.
promo_churn = df_clean.groupby('is_promo_user')[TARGET_COL].mean()
if 0 in promo_churn.index and 1 in promo_churn.index:
    promo_diff = (promo_churn[1] - promo_churn[0]) * 100
    print(f"7. Promo users churn {abs(promo_diff):.1f}% {'more' if promo_diff > 0 else 'less'} than non-promo users")

# Complaints: customers with at least one complaint vs those with none. The
# direction word follows the sign of the gap instead of always saying "more".
complaints_churn = df_clean.groupby(df_clean['complaints'] > 0)[TARGET_COL].mean()
if True in complaints_churn.index:
    complaint_diff = (complaints_churn[True] - complaints_churn.get(False, 0)) * 100
    print(f"8. Customers with complaints churn {abs(complaint_diff):.1f}% {'more' if complaint_diff > 0 else 'less'} than those without")

In [None]:
# Plain-language takeaways for the business team.
# NOTE(review): apart from the payment type and region names, which are taken
# from the key-insights cell, these statements are hard-coded text and are not
# recomputed from the data — re-check them whenever the dataset changes.
print("Simple Interpretations for Business Team:")
print("---------------------------------------")

business_notes = (
    "1. looks like ppl with more days since last purchase are way more likely to leave, not good",
    "2. customers with low loyalty scores churn more, need to bump those up somehow",
    "3. new customers (under 6 months) churn way more than old ones, maybe our onboarding sucks?",
    f"4. {highest_payment} users leave more often, wonder why?",
    "5. people with complaints are leaving a lot more, fix those issues asap!",
    "6. promo-only users bail when deals dry up, might need better regular pricing",
    f"7. {highest_region} folks churn more, maybe we need region-specific stuff",
)
print(*business_notes, sep="\n")