In [None]:
# %% [markdown]
# # Exploratory Data Analysis (EDA)
# **Goal**: Understand the dataset’s structure, distributions, and relationships to guide preprocessing and modeling.

# %% [markdown]
# ## 1. Load Libraries and Data

# %%
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw churn CSV into a DataFrame; the path is relative to the
# notebook's working directory.
RAW_DATA_PATH = "../data/raw/telco_churn.csv"
data = pd.read_csv(RAW_DATA_PATH)

# %% [markdown]
# ## 2. Basic Descriptive Statistics

# %%
# Descriptive statistics for the columns pandas currently treats as numeric.
# NOTE(review): 'TotalCharges' is only converted with pd.to_numeric in a later
# cell, so it may be absent from this table — confirm against the output.
print("Numerical Features Summary:")
numeric_summary = data.describe()
print(numeric_summary)

# %%
# Descriptive statistics restricted to object-dtype columns
# (e.g., Contract, PaymentMethod).
print("\nCategorical Features Summary:")
categorical_summary = data.describe(include='object')
print(categorical_summary)

# %% [markdown]
# ## 3. Data Cleaning Checks

# %%
# Count null entries per column.
print("\nMissing Values:")
missing_per_column = data.isnull().sum()
print(missing_per_column)

# %%
# Parse 'TotalCharges' as numbers; errors='coerce' turns any value that cannot
# be parsed into NaN instead of raising.
total_charges_numeric = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'] = total_charges_numeric

# Report how many entries are NaN now that the column is numeric.
n_missing_total_charges = data['TotalCharges'].isnull().sum()
print("\nMissing Values in TotalCharges:", n_missing_total_charges)

# %% [markdown]
# ## 4. Visualizations

# %%
# Set plot style
sns.set_theme(style="whitegrid")

# Default size for any figure that does not pass its own figsize.
# This replaces a bare plt.figure(figsize=(10, 6)) call, which opened an empty
# figure in this cell with nothing drawn on it. Set after set_theme so the
# theme cannot override it.
plt.rcParams["figure.figsize"] = (10, 6)

# %% [markdown]
# ### A. Churn Distribution (Target Variable)

# %%
# Bar chart of how many rows fall into each 'Churn' value.
churn_ax = sns.countplot(data=data, x='Churn')
churn_ax.set_title("Churn vs Non-Churn Customers")
churn_ax.set_xlabel("Churn")
churn_ax.set_ylabel("Count")

# Write the figure to disk before showing it.
plt.savefig("../results/plots/churn_distribution.png")  # Save plot
plt.show()

# %% [markdown]
# **Insight**: The dataset is imbalanced (more non-churners than churners).

# %% [markdown]
# ### B. Numerical Feature Distributions

# %%
# Histograms (with KDE overlay) for tenure, MonthlyCharges, and TotalCharges,
# drawn side by side in a single row.
hist_panels = [
    ('tenure', "Tenure Distribution"),
    ('MonthlyCharges', "Monthly Charges"),
    ('TotalCharges', "Total Charges"),
]

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

# One panel per (column, title) pair, left to right.
for panel_ax, (column, panel_title) in zip(axes, hist_panels):
    sns.histplot(data[column], kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)

fig.tight_layout()
fig.savefig("../results/plots/numerical_distributions.png")
plt.show()

# %% [markdown]
# **Insight**: 
# - Most customers have tenure < 10 months or > 60 months (long-term).
# - `MonthlyCharges` has a bimodal distribution.

# %% [markdown]
# ### C. Categorical Features vs Churn

# %%
# Churn rate by Contract Type
# 'Contract' and 'Churn' are both string columns at this point ('Churn' holds
# 'Yes'/'No'), and sns.barplot needs one numeric axis — passing two
# non-numeric columns raises a TypeError. Aggregate to a percentage per
# contract type with pandas first, then plot the numeric result.
churn_rate_by_contract = (
    data['Churn'].eq('Yes')        # boolean: True where the customer churned
    .groupby(data['Contract'])
    .mean()                        # share of churners within each contract type
    .mul(100)                      # express as a percentage
    .rename('ChurnRatePct')
    .reset_index()
)

plt.figure(figsize=(8, 5))
# One pre-aggregated value per contract type, so no error bars are drawn.
sns.barplot(x='Contract', y='ChurnRatePct', data=churn_rate_by_contract)
plt.title("Churn Rate by Contract Type")
plt.ylabel("Churn Rate (%)")
plt.savefig("../results/plots/churn_by_contract.png")
plt.show()

# %% [markdown]
# **Insight**: Month-to-month contracts have the highest churn rate (~43%).

# %% [markdown]
# ### D. Correlation Heatmap

# %%
# Convert Churn to binary (1=Yes, 0=No)
# Guarded so the cell can be re-run: mapping an already-numeric column through
# the Yes/No dictionary would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data['Churn']):
    data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

# Compute pairwise correlations between the numeric columns only
corr = data.corr(numeric_only=True)

# Plot heatmap with each coefficient annotated to two decimals
plt.figure(figsize=(10, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature Correlation Matrix")
plt.savefig("../results/plots/correlation_heatmap.png")
plt.show()

# %% [markdown]
# **Insight**: 
# - `tenure` and `TotalCharges` are strongly correlated (0.83).
# - `Churn` is negatively correlated with `tenure` (-0.35).

# %% [markdown]
# ### E. Monthly Charges vs Churn (Box Plot)

# %%
# Box plot of 'MonthlyCharges' split by the value of 'Churn'.
box_fig, box_ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=data, x='Churn', y='MonthlyCharges', ax=box_ax)
box_ax.set_title("Monthly Charges by Churn Status")
box_fig.savefig("../results/plots/boxplot_monthly_charges.png")
plt.show()

# %% [markdown]
# **Insight**: Churners tend to have higher median monthly charges.

# %% [markdown]
# ## 5. Save Processed Data
# Export cleaned data for modeling.

# %%
# Fill missing TotalCharges with median
# Assign the result back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: that chained in-place form
# is deprecated and, under pandas Copy-on-Write, operates on a temporary object
# and leaves `data` unchanged, so the NaNs would be written to the CSV.
total_charges_median = data['TotalCharges'].median()
data['TotalCharges'] = data['TotalCharges'].fillna(total_charges_median)

# Save cleaned data (without the index column)
data.to_csv("../data/processed/cleaned_data.csv", index=False)
print("Data saved to data/processed/cleaned_data.csv")