# Customer Churn Prediction - Exploratory Data Analysis

This notebook performs comprehensive EDA on the Telco Customer Churn dataset.

## Objectives:
1. Understand the dataset structure and features
2. Analyze the target variable (Churn) distribution
3. Explore relationships between features and churn
4. Identify patterns and insights for model building
5. Visualize key findings

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn colour cycle.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# pandas display limits: never truncate columns, cap printed rows at 100.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

print("Libraries imported successfully!")

In [None]:
# Make the project root importable, then load the raw dataset into `df`.
import sys

# '..' is resolved to an absolute path so the entry stays valid if the working
# directory changes later, and the membership check keeps re-running this cell
# from stacking duplicate entries onto sys.path.
# NOTE(review): assumes the notebook is executed from a directory one level
# below the project root -- confirm against the repository layout.
project_root = str(Path('..').resolve())
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data_loader import load_raw_data, get_data_info
from config import FIGURES_DIR, RAW_DATA_FILE

# Every figure in this notebook is written under FIGURES_DIR; create it
# (including parents) if it does not exist yet.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# Load data
df = load_raw_data()
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

## 1. Dataset Overview

In [None]:
# Preview the top of the frame; the bare expression on the last line is what
# the notebook renders as the cell output.
print("First 5 rows of the dataset:")
df.head(n=5)

In [None]:
# Heading plus a 50-character rule, followed by pandas' own summary of the
# frame (df.info() prints directly rather than returning a value).
print("Dataset Information:", "=" * 50, sep="\n")
df.info()

In [None]:
# Summary statistics over every column (include='all' keeps the non-numeric
# ones too), transposed so each feature becomes one row of the output.
print("Statistical Summary:")
summary_stats = df.describe(include='all')
summary_stats.transpose()

In [None]:
# Missing-value audit: per-column null count and its share of all rows.
missing_values = df.isna().sum()
missing_percent = missing_values / len(df) * 100

missing_df = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_percent,
})

# Only columns that actually contain nulls are listed.
has_missing = missing_df['Missing Values'].gt(0)

print("Missing Values Analysis:")
print(missing_df.loc[has_missing])

# Counts are non-negative, so "no column has any" is the same as "total is 0".
if not has_missing.any():
    print("No missing values found!")

In [None]:
# Count rows that are exact repeats of an earlier row (the first occurrence
# of each distinct row is not counted).
duplicates = df.duplicated(keep='first').sum()
print(f"Number of duplicate rows: {duplicates}")

## 2. Target Variable Analysis

In [None]:
# Churn distribution: bar chart of counts next to a pie chart of shares.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

churn_counts = df['Churn'].value_counts()

# value_counts() orders labels by frequency, so colours and the exploded slice
# are looked up by label instead of by position. That keeps "retained = green,
# churned = red" true whatever the class balance is, and avoids a length
# mismatch in `explode` when only one class is present.
churn_palette = {'No': '#2ecc71', 'Yes': '#e74c3c'}
colors = [churn_palette.get(label, '#95a5a6') for label in churn_counts.index]
explode = [0.05 if label == 'Yes' else 0 for label in churn_counts.index]

# Count plot
bars = axes[0].bar(churn_counts.index, churn_counts.values, color=colors)
axes[0].set_title('Customer Churn Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Churn Status')
axes[0].set_ylabel('Count')

# Value labels sit a fixed padding (in points) above each bar, so placement
# does not depend on the magnitude of the counts.
axes[0].bar_label(
    bars,
    labels=[f'{val:,}' for val in churn_counts.values],
    padding=3,
    fontsize=12,
)

# Pie chart
axes[1].pie(churn_counts.values, labels=churn_counts.index, autopct='%1.1f%%',
           colors=colors, explode=explode, shadow=True, startangle=90)
axes[1].set_title('Churn Rate Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'churn_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Print statistics (a label missing from the data counts as 0)
total_customers = len(df)
churned = churn_counts.get('Yes', 0)
retained = churn_counts.get('No', 0)

print(f"\nChurn Statistics:")
print(f"- Total customers: {total_customers:,}")
print(f"- Churned: {churned:,} ({churned/total_customers*100:.1f}%)")
print(f"- Retained: {retained:,} ({retained/total_customers*100:.1f}%)")

## 3. Numerical Features Analysis

In [None]:
# Numerical columns
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Convert TotalCharges to numeric. errors='coerce' turns anything unparseable
# into NaN, and the missing-value check earlier in the notebook ran before this
# conversion, so report the resulting NaN count here instead of leaving it
# invisible.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
n_missing_total_charges = int(df['TotalCharges'].isna().sum())
print(f"TotalCharges: {n_missing_total_charges} missing/non-numeric value(s) after conversion")

# Distribution of numerical features: one histogram per column, with the mean
# (red) and median (green) drawn as dashed vertical lines.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, col in zip(axes, numerical_cols):
    values = df[col].dropna()
    # Series.mean()/median() skip NaN, so computing them on the NaN-free values
    # gives the same numbers as on the raw column -- computed once each.
    mean_val = values.mean()
    median_val = values.median()

    ax.hist(values, bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.axvline(mean_val, color='red', linestyle='--', label=f'Mean: {mean_val:.2f}')
    ax.axvline(median_val, color='green', linestyle='--', label=f'Median: {median_val:.2f}')
    ax.legend()

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'numerical_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Numerical features by Churn status: one box plot per numerical column.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Pin both the category order and the colour of each label. With a plain list
# palette and no `order`, seaborn assigns colours in order of first appearance
# in the data, so the green/red meaning could flip with row order.
# NOTE(review): seaborn >= 0.13 deprecates `palette` without `hue`; once the
# project's minimum seaborn version allows it, add hue='Churn', legend=False.
churn_order = ['No', 'Yes']
churn_palette = {'No': '#2ecc71', 'Yes': '#e74c3c'}

for ax, col in zip(axes, numerical_cols):
    sns.boxplot(x='Churn', y=col, data=df, ax=ax,
                order=churn_order, palette=churn_palette)
    ax.set_title(f'{col} by Churn Status', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'numerical_by_churn.png', dpi=300, bbox_inches='tight')
plt.show()

# Print summary statistics by churn
print("\nNumerical Features by Churn Status:")
print(df.groupby('Churn')[numerical_cols].agg(['mean', 'median', 'std']).round(2))

In [None]:
# Tenure analysis: overlaid tenure histograms per churn status (left) and
# churn rate per tenure bucket (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Both histograms share one set of bin edges computed on the full column.
# With bins=30 passed separately, each subset gets edges from its own min/max,
# and the overlaid bars are then not comparable bin-for-bin.
hist_edges = np.histogram_bin_edges(df['tenure'].dropna(), bins=30)

# Tenure distribution by churn
for churn_status in ['No', 'Yes']:
    subset = df[df['Churn'] == churn_status]
    axes[0].hist(subset['tenure'], bins=hist_edges, alpha=0.6,
                label=f'Churn: {churn_status}', edgecolor='black')

axes[0].set_title('Tenure Distribution by Churn Status', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Tenure (months)')
axes[0].set_ylabel('Frequency')
axes[0].legend()

# Tenure groups. Intervals are right-inclusive -- (12, 24], (24, 48], ... --
# and include_lowest=True additionally puts tenure == 0 into the first bucket.
# The last edge is open-ended so that no tenure value falls outside the '60+'
# bucket and silently becomes NaN.
tenure_bins = [0, 12, 24, 48, 60, np.inf]
tenure_labels = ['0-12', '12-24', '24-48', '48-60', '60+']
df['TenureGroup'] = pd.cut(df['tenure'], bins=tenure_bins, labels=tenure_labels, include_lowest=True)

# observed=False is stated explicitly so every tenure bucket appears in the
# table regardless of the pandas default; fill_value=0 turns a missing
# (bucket, churn) combination into a count of 0 instead of NaN.
tenure_churn = (
    df.groupby(['TenureGroup', 'Churn'], observed=False)
    .size()
    .unstack(fill_value=0)
)
tenure_churn_rate = tenure_churn['Yes'] / (tenure_churn['Yes'] + tenure_churn['No']) * 100

axes[1].bar(tenure_churn_rate.index.astype(str), tenure_churn_rate.values, color='#e74c3c')
axes[1].set_title('Churn Rate by Tenure Group', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Tenure Group (months)')
axes[1].set_ylabel('Churn Rate (%)')

# Percentage label one unit above each bar
for i, val in enumerate(tenure_churn_rate.values):
    axes[1].text(i, val + 1, f'{val:.1f}%', ha='center', fontsize=10)

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'tenure_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Categorical Features Analysis

In [None]:
# Categorical columns examined in this section
categorical_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]

# For each column, print its name and then the frequency of every level.
print("Categorical Features - Unique Values:")
for col in categorical_cols:
    level_counts = df[col].value_counts()
    print(f"\n{col}:", level_counts, sep="\n")

In [None]:
# Churn rate (%) per level of four selected categorical features, one panel each.
key_features = ['Contract', 'InternetService', 'PaymentMethod', 'TechSupport']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()

for ax, col in zip(axes, key_features):
    # Rows: levels of `col`; columns: churn labels; cells: customer counts
    # (combinations that never occur become 0).
    churn_by_cat = df.groupby([col, 'Churn']).size().unstack(fill_value=0)
    n_yes = churn_by_cat['Yes']
    n_no = churn_by_cat['No']
    churn_rate = n_yes / (n_yes + n_no) * 100

    positions = range(len(churn_rate))
    bars = ax.bar(positions, churn_rate.values, color='#3498db')
    ax.set_title(f'Churn Rate by {col}', fontsize=12, fontweight='bold')
    ax.set_ylabel('Churn Rate (%)')
    ax.set_xticks(positions)
    ax.set_xticklabels(churn_rate.index, rotation=45, ha='right')

    # Percentage label one unit above each bar
    for pos, val in zip(positions, churn_rate.values):
        ax.text(pos, val + 1, f'{val:.1f}%', ha='center', fontsize=9)

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'churn_by_categorical.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Demographics: share of churned vs. retained customers within each level of
# four demographic columns, drawn as grouped bars.
demo_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

for ax, col in zip(axes, demo_cols):
    # normalize='index' makes every row sum to 1; scale to percent.
    ct = pd.crosstab(df[col], df['Churn'], normalize='index').mul(100)
    ct.plot(kind='bar', ax=ax, color=['#2ecc71', '#e74c3c'], edgecolor='black')
    ax.set_title(f'Churn Rate by {col}', fontsize=12, fontweight='bold')
    ax.set_ylabel('Percentage (%)')
    # Re-apply the existing tick labels, only to make them horizontal.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
    ax.legend(title='Churn', loc='upper right')

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'demographics_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Correlation Analysis

In [None]:
# Encode the target as 0/1 (1 where Churn is 'Yes') so it can be correlated.
df['Churn_numeric'] = df['Churn'].eq('Yes').astype(int)

# Numerical features plus the encoded target, copied out of the main frame
numeric_df = df[[*numerical_cols, 'Churn_numeric']].copy()

plt.figure(figsize=(10, 8))
correlation_matrix = numeric_df.corr()

# Boolean mask that is True on and above the diagonal: those cells are hidden,
# leaving only the lower triangle of the symmetric matrix visible.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    square=True,
    linewidths=0.5,
)
plt.title('Correlation Matrix - Numerical Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(FIGURES_DIR / 'correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Correlation of every column with the encoded target, strongest positive first
churn_correlations = correlation_matrix['Churn_numeric'].sort_values(ascending=False)
print("\nCorrelation with Churn:")
print(churn_correlations)

In [None]:
# Monthly vs. total charges, one scatter layer per churn status so that each
# status gets its own colour and legend entry.
plt.figure(figsize=(10, 6))

status_colors = {'No': '#2ecc71', 'Yes': '#e74c3c'}
for churn_status, color in status_colors.items():
    subset = df.loc[df['Churn'] == churn_status]
    plt.scatter(
        subset['MonthlyCharges'],
        subset['TotalCharges'],
        alpha=0.5,
        c=color,
        label=f'Churn: {churn_status}',
    )

plt.xlabel('Monthly Charges ($)', fontsize=12)
plt.ylabel('Total Charges ($)', fontsize=12)
plt.title('Monthly Charges vs Total Charges by Churn Status', fontsize=14, fontweight='bold')
plt.legend()
plt.tight_layout()
plt.savefig(FIGURES_DIR / 'charges_scatter.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Service Analysis

In [None]:
# Service columns covered by this section
service_cols = [
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]

# Churn rate (%) per level of each service column: the mean of the 0/1 churn
# flag within each level, scaled to percent.
service_churn = {
    col: df.groupby(col)['Churn_numeric'].mean() * 100
    for col in service_cols
}

# One panel per service in a 3x3 grid
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.ravel()

for ax, col in zip(axes, service_cols):
    data = service_churn[col]
    # Bars with a churn rate under 30% are blue, all others red.
    colors = ['#3498db' if val < 30 else '#e74c3c' for val in data.values]
    ax.bar(data.index.astype(str), data.values, color=colors)
    ax.set_title(f'{col}', fontsize=11, fontweight='bold')
    ax.set_ylabel('Churn Rate (%)')
    ax.tick_params(axis='x', rotation=45)

    # Percentage label one unit above each bar
    for pos, val in enumerate(data.values):
        ax.text(pos, val + 1, f'{val:.1f}%', ha='center', fontsize=8)

plt.suptitle('Churn Rate by Service Type', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig(FIGURES_DIR / 'service_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Number of services vs Churn.
# A service column counts as subscribed when its value is one of these; the
# per-customer total is the row-wise sum of the resulting boolean table.
subscribed_values = ['Yes', 'DSL', 'Fiber optic']
df['NumServices'] = df[service_cols].isin(subscribed_values).sum(axis=1)

plt.figure(figsize=(10, 6))

# Customer counts per (number of services, churn label); absent pairs become 0.
service_counts = df.groupby(['NumServices', 'Churn']).size().unstack(fill_value=0)
n_churned = service_counts['Yes']
n_retained = service_counts['No']
churn_rate_by_services = n_churned / (n_churned + n_retained) * 100

plt.bar(churn_rate_by_services.index, churn_rate_by_services.values, color='#9b59b6')
plt.xlabel('Number of Services', fontsize=12)
plt.ylabel('Churn Rate (%)', fontsize=12)
plt.title('Churn Rate by Number of Services Subscribed', fontsize=14, fontweight='bold')

# Percentage label one unit above each bar; x is the service count itself.
for n_services, val in zip(churn_rate_by_services.index, churn_rate_by_services.values):
    plt.text(n_services, val + 1, f'{val:.1f}%', ha='center', fontsize=10)

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'num_services_churn.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Contract & Payment Analysis

In [None]:
# Contract and payment method: churned vs. retained share within each level,
# drawn side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Row-normalised cross-tabulations, scaled to percent
contract_churn = pd.crosstab(df['Contract'], df['Churn'], normalize='index') * 100
payment_churn = pd.crosstab(df['PaymentMethod'], df['Churn'], normalize='index') * 100

# Both panels are drawn the same way; only the table and the title differ.
panels = (
    (axes[0], contract_churn, 'Churn Rate by Contract Type'),
    (axes[1], payment_churn, 'Churn Rate by Payment Method'),
)
for ax, table, title in panels:
    table.plot(kind='bar', ax=ax, color=['#2ecc71', '#e74c3c'], edgecolor='black')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_ylabel('Percentage (%)')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
    ax.legend(title='Churn')

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'contract_payment_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Key Insights Summary

In [None]:
# Summary of key insights: churn rates for a handful of customer segments,
# followed by a fixed list of modelling recommendations.
# Requires the 0/1 `Churn_numeric` column to exist on `df`.


def churn_rate_pct(mask):
    """Return the churn rate in percent among the rows of ``df`` selected by ``mask``.

    ``mask`` is a boolean Series aligned with ``df``. The result is the mean of
    ``Churn_numeric`` over the selected rows times 100 (NaN if no row matches).
    """
    return df.loc[mask, 'Churn_numeric'].mean() * 100


print("="*70)
print("KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS")
print("="*70)

# Overall churn rate
overall_churn = df['Churn_numeric'].mean() * 100
print(f"\n1. OVERALL CHURN RATE: {overall_churn:.1f}%")

# Contract impact
month_to_month_churn = churn_rate_pct(df['Contract'] == 'Month-to-month')
two_year_churn = churn_rate_pct(df['Contract'] == 'Two year')
print(f"\n2. CONTRACT IMPACT:")
print(f"   - Month-to-month customers churn rate: {month_to_month_churn:.1f}%")
print(f"   - Two-year contract customers churn rate: {two_year_churn:.1f}%")

# Tenure impact
short_tenure = churn_rate_pct(df['tenure'] <= 12)
long_tenure = churn_rate_pct(df['tenure'] > 48)
print(f"\n3. TENURE IMPACT:")
print(f"   - New customers (≤12 months) churn rate: {short_tenure:.1f}%")
print(f"   - Long-term customers (>48 months) churn rate: {long_tenure:.1f}%")

# Internet service impact
fiber_churn = churn_rate_pct(df['InternetService'] == 'Fiber optic')
no_internet_churn = churn_rate_pct(df['InternetService'] == 'No')
print(f"\n4. INTERNET SERVICE IMPACT:")
print(f"   - Fiber optic customers churn rate: {fiber_churn:.1f}%")
print(f"   - No internet customers churn rate: {no_internet_churn:.1f}%")

# Payment method impact
electronic_check = churn_rate_pct(df['PaymentMethod'] == 'Electronic check')
credit_card = churn_rate_pct(df['PaymentMethod'] == 'Credit card (automatic)')
print(f"\n5. PAYMENT METHOD IMPACT:")
print(f"   - Electronic check churn rate: {electronic_check:.1f}%")
print(f"   - Credit card (automatic) churn rate: {credit_card:.1f}%")

# Monthly charges: split at the median, computed once. Customers exactly at
# the median fall in the "low" group, so the label says "at or below".
median_monthly_charges = df['MonthlyCharges'].median()
high_charges = churn_rate_pct(df['MonthlyCharges'] > median_monthly_charges)
low_charges = churn_rate_pct(df['MonthlyCharges'] <= median_monthly_charges)
print(f"\n6. MONTHLY CHARGES IMPACT:")
print(f"   - High monthly charges (above median) churn rate: {high_charges:.1f}%")
print(f"   - Low monthly charges (at or below median) churn rate: {low_charges:.1f}%")

# The recommendations below are static text; they are not derived from the
# numbers printed above.
print("\n" + "="*70)
print("RECOMMENDATIONS FOR MODEL BUILDING:")
print("="*70)
print("\n1. Focus on contract type, tenure, and internet service as key predictors")
print("2. Payment method (especially electronic check) is a strong indicator")
print("3. Consider creating interaction features (e.g., tenure × contract)")
print("4. Address class imbalance using techniques like SMOTE or class weights")
print("5. Use feature engineering to create 'risk scores' based on these insights")
print("="*70)

In [None]:
# Wrap-up: report where the figures were written and list the PNG files there.
# Nothing is saved by this cell. The listing covers every PNG currently in
# FIGURES_DIR, which may include files left over from earlier runs.
print("\nEDA Complete! All visualizations saved to:", FIGURES_DIR)
print("\nFigures generated:")
# glob() yields entries in filesystem order; sort for a stable listing.
for fig_file in sorted(FIGURES_DIR.glob('*.png')):
    print(f"  - {fig_file.name}")