# Telco Customer Churn: Exploratory Data Analysis

This notebook performs a comprehensive exploratory data analysis (EDA) on the Telco Customer Churn dataset. The goal is to understand the data, identify patterns, and extract insights that can inform feature engineering and model building.

## 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import yaml

# Make the project root (the parent of the current working directory) importable
# so the `src` package resolves. Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data_loader import TelcoDataLoader

# Load config. The path is relative to the working directory, like the sys.path
# entry above. An explicit encoding keeps parsing independent of the platform
# default.
with open('../config/config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

TARGET_COLUMN = config['target']
NUMERICAL_FEATURES = config['features']['numerical']
CATEGORICAL_FEATURES = config['features']['categorical']

# Set plot style (set_theme is the current name of the legacy sns.set alias)
sns.set_theme(style='whitegrid')

# Load data. The loaded frame is kept untouched as `df_raw` so the cleaning
# step below always starts from the same input.
loader = TelcoDataLoader()
df_raw = loader.load_raw_data()

# Initial data preparation: coerce TotalCharges to numeric (entries that cannot
# be parsed become NaN) and drop those rows. The number of dropped rows is
# reported so the data loss is visible instead of silent.
total_charges = pd.to_numeric(df_raw['TotalCharges'], errors='coerce')
n_dropped = int(total_charges.isna().sum())

# Build the cleaned frame without mutating df_raw; the explicit copy makes sure
# later column assignments on `df` act on an independent frame.
df = (
    df_raw
    .assign(TotalCharges=total_charges)
    .dropna(subset=['TotalCharges'])
    .copy()
)
print(f'Dropped {n_dropped} of {len(df_raw)} rows with missing or non-numeric TotalCharges.')

## 2. Data Quality Assessment

In [None]:
# Structure: column dtypes and non-null counts.
print('--- DataFrame Info ---')
df.info()

# Completeness: number of null entries per column.
missing_counts = df.isnull().sum()
print('\n--- Missing Values ---')
print(missing_counts)

# Uniqueness: rows that exactly repeat an earlier row.
duplicate_count = df.duplicated().sum()
print('\n--- Duplicate Rows ---')
print(f'Number of duplicate rows: {duplicate_count}')

# Descriptive statistics as produced by DataFrame.describe().
summary_stats = df.describe()
print('\n--- Summary Statistics ---')
print(summary_stats)

## 3. Univariate Analysis

### Target Variable

In [None]:
# Bar chart of the raw class counts for the target column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=TARGET_COLUMN, data=df, ax=ax)
ax.set_title('Distribution of Customer Churn')
plt.show()

# Same information as proportions, to quantify the class balance.
class_shares = df[TARGET_COLUMN].value_counts(normalize=True)
print(class_shares)

### Numerical Features

In [None]:
# One histogram with a KDE overlay per numerical feature listed in the config.
for feature in NUMERICAL_FEATURES:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.show()

### Categorical Features

In [None]:
# One horizontal count plot per categorical feature, with the levels ordered
# from most to least frequent.
for feature in CATEGORICAL_FEATURES:
    levels_by_frequency = df[feature].value_counts().index
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(y=feature, data=df, order=levels_by_frequency, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.show()

## 4. Bivariate Analysis

### Numerical Features vs. Churn

In [None]:
# Box plot of each numerical feature, split by the target classes.
for feature in NUMERICAL_FEATURES:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(x=TARGET_COLUMN, y=feature, data=df, ax=ax)
    ax.set_title(f'{feature} vs. Churn')
    plt.show()

### Categorical Features vs. Churn

In [None]:
# Skip the target itself in case it is listed among the categorical features.
churn_predictors = [name for name in CATEGORICAL_FEATURES if name != TARGET_COLUMN]

# Stacked bar chart of target-class counts within each level of the feature.
for feature in churn_predictors:
    counts_by_level = pd.crosstab(df[feature], df[TARGET_COLUMN])
    ax = counts_by_level.plot(kind='bar', stacked=True, figsize=(10, 6))
    ax.set_title(f'{feature} vs. Churn')
    plt.xticks(rotation=45)
    plt.show()

## 5. Multivariate Analysis

In [None]:
# Pairwise correlations between the numerical features (DataFrame.corr defaults).
correlation_matrix = df[NUMERICAL_FEATURES].corr()

# Annotated heatmap; the diverging colormap separates positive from negative values.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

## 6. Customer Segmentation (Example)

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Segment customers on two features only.
cluster_features = ['tenure', 'MonthlyCharges']
df_cluster = df.loc[:, cluster_features].copy()

# Standardize the inputs before clustering so both features are on the same scale.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_cluster)

# Fixed random_state keeps the segment assignment reproducible across runs.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
segment_labels = kmeans.fit_predict(df_scaled)
df['Segment'] = segment_labels

# Plot the segments in the original (unscaled) feature space.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x='tenure',
    y='MonthlyCharges',
    hue='Segment',
    data=df,
    palette='viridis',
    ax=ax,
)
ax.set_title('Customer Segments (Tenure vs. Monthly Charges)')
plt.show()

## 7. Business Insights

In [None]:
# Encode the target as 0/1 so that its mean within a group is the churn rate.
is_churned = df[TARGET_COLUMN] == 'Yes'
df['Churn_numeric'] = is_churned.astype(int)

# Per-segment averages; named aggregation yields the final column names directly.
segment_analysis = df.groupby('Segment').agg(
    tenure=('tenure', 'mean'),
    MonthlyCharges=('MonthlyCharges', 'mean'),
    ChurnRate=('Churn_numeric', 'mean'),
)

print('--- Segment Analysis ---')
print(segment_analysis)

# One plain-language summary line per segment.
print('\n--- Insights ---')
for i, row in segment_analysis.iterrows():
    print(f'- Segment {i}: Average tenure of {row.tenure:.1f} months, average monthly charge of ${row.MonthlyCharges:.2f}, and a churn rate of {row.ChurnRate:.2%}.')