# Telco Customer Churn Prediction — Data Cleaning & EDA

### Step 1: Load Dataset
We load the Telco Customer Churn dataset and preview the structure.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw Telco churn CSV (path is relative to the notebook's folder)
csv_path = '../data/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)

# Report the (rows, columns) tuple, then let the notebook render the
# first rows as the cell output
print(f"Dataset shape: {df.shape}")
df.head()

### Step 2: Inspect Data Structure
We check data types, missing values, and column structure.

In [None]:
# Print a per-column summary (dtype and non-null count) plus memory usage,
# which is the quickest way to spot columns loaded with an unexpected dtype
df.info()

### Step 3: Data Cleaning
Convert `TotalCharges` to numeric (entries that cannot be parsed become `NaN`) and impute the resulting missing values with the column mean.

In [None]:
# Convert 'TotalCharges' from object to numeric. errors='coerce' turns any
# entry that cannot be parsed as a number into NaN instead of raising.
# NOTE(review): presumably the unparseable entries are blank strings in the
# raw CSV -- confirm against the data before relying on mean imputation.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Check missing values
print("Missing values before imputation:\n", df.isnull().sum())

# Impute missing TotalCharges values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df['TotalCharges']: an in-place call on a selected
# column is chained assignment, which emits a FutureWarning in pandas 2.x
# and no longer updates `df` under Copy-on-Write (pandas 3).
total_charges_mean = df['TotalCharges'].mean()
df['TotalCharges'] = df['TotalCharges'].fillna(total_charges_mean)

# Confirm no missing values remain
print("Missing values after imputation:\n", df.isnull().sum())

### Step 4: Check for Duplicates

In [None]:
# Flag every row that is an exact repeat of an earlier row, then count them
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicates}")

### Step 5: Churn Distribution

In [None]:
# Class balance: relative frequency of each Churn value
print(df['Churn'].value_counts(normalize=True))

# Colour the bars through `hue` on the same column: passing `palette`
# without `hue` is deprecated since seaborn 0.13 and slated for removal.
# dodge=False keeps full-width bars on seaborn versions that would
# otherwise split each category by hue.
ax = sns.countplot(x='Churn', hue='Churn', data=df, palette='coolwarm', dodge=False)

# Older seaborn versions draw a legend for the redundant hue; it only
# repeats the x-axis labels, so drop it when present.
if ax.get_legend() is not None:
    ax.get_legend().remove()

plt.title("Churn Distribution")
plt.show()

### Step 6: Churn by Contract Type

In [None]:
# Grouped bar chart: customer counts per contract type, split by churn status
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Contract', hue='Churn', palette='coolwarm', ax=ax)
ax.set_title("Churn by Contract Type")
plt.show()

### Step 7: Churn by Payment Method

In [None]:
# Grouped bar chart: customer counts per payment method, split by churn status
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='PaymentMethod', hue='Churn', palette='coolwarm', ax=ax)
ax.set_title("Churn by Payment Method")

# Payment-method names are long; tilt the tick labels so they do not overlap
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)

plt.show()

### Step 8: Tenure Distribution by Churn

In [None]:
# Overlaid tenure histograms (30 bins) with a KDE curve per churn group
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    data=df,
    x='tenure',
    hue='Churn',
    bins=30,
    kde=True,
    palette='coolwarm',
    ax=ax,
)
ax.set_title("Tenure Distribution by Churn")
plt.show()

### Step 9: Monthly Charges vs. Churn Boxplot

In [None]:
# Compare the MonthlyCharges distribution of churned vs. retained customers
plt.figure(figsize=(8, 5))

# Colour the boxes through `hue` on the same column: passing `palette`
# without `hue` is deprecated since seaborn 0.13 and slated for removal.
# dodge=False keeps each box centred on its category on seaborn versions
# that would otherwise offset boxes by hue level.
ax = sns.boxplot(
    x='Churn',
    y='MonthlyCharges',
    hue='Churn',
    data=df,
    palette='coolwarm',
    dodge=False,
)

# Older seaborn versions draw a legend for the redundant hue; it only
# repeats the x-axis labels, so drop it when present.
if ax.get_legend() is not None:
    ax.get_legend().remove()

plt.title("Monthly Charges vs Churn")
plt.show()

### Step 10: Correlation Heatmap

In [None]:
# Map churn to binary (Yes -> 1, No -> 0) so it takes part in the correlation.
# The column is overwritten in place, so guard on dtype: re-running this cell
# would otherwise map the already-numeric 1/0 values to NaN and wipe out the
# target column.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Select numeric features (now including the binary Churn column)
numeric_features = df.select_dtypes(include=[np.number])

# Pairwise correlation matrix of the numeric columns
corr = numeric_features.corr()

# Annotated heatmap; coefficients are shown to two decimals
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()