In [None]:
# Reload Raw Data
from pathlib import Path

import numpy as np
import pandas as pd

# Read the raw churn CSV into `df` and preview the first rows as the cell output.
raw_csv_path = "../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(raw_csv_path)
df.head()

In [None]:
# Drop Non-Informative Columns
# customerID is removed before the encoding and scaling cells below.
# Reassigning with errors='ignore' (rather than an in-place drop) keeps this
# cell safe to re-run: once the column is gone a second run is a no-op instead
# of a KeyError.
df = df.drop(columns=['customerID'], errors='ignore')

In [None]:
# Fix Incorrect Data Types
# Parse TotalCharges as numeric. errors='coerce' turns every entry that cannot
# be parsed into NaN; the cell output is the number of NaNs in the column.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges
total_charges.isna().sum()

In [None]:
# Handle Missing Values
# Every row with tenure == 0 gets TotalCharges set to 0 (whatever value the row
# held before, missing or not). The per-column null counts are then shown as
# the cell output.
# NOTE(review): NaNs on rows with tenure != 0 are not touched here — check that
# the counts displayed below are all zero.
zero_tenure = df['tenure'].eq(0)
df.loc[zero_tenure, 'TotalCharges'] = 0
df.isna().sum()

In [None]:
# Encode Target Variable
# 'Yes' -> 1, 'No' -> 0.
# Series.map with a plain dict turns every value that is not a key into NaN,
# so re-running the cell on the already-encoded column (0/1) would silently
# replace the whole target with NaN. Mapping through .get(label, label) leaves
# values that are not keys unchanged, which makes the cell idempotent.
churn_codes = {'Yes': 1, 'No': 0}
df['Churn'] = df['Churn'].map(lambda label: churn_codes.get(label, label))

# Fail loudly on anything that is not a clean 0/1 label (unexpected spelling,
# missing value) instead of carrying it into the saved dataset.
assert df['Churn'].isin([0, 1]).all(), "Churn contains values other than 'Yes'/'No'"

In [None]:
# Normalize Binary Categorical Columns
# 'Yes' -> 1, 'No' -> 0 for each listed column.
# Series.map with a plain dict turns every value that is not a key into NaN,
# so re-running the cell on already-encoded columns would silently blank them.
# Mapping through .get(label, label) leaves values that are not keys unchanged,
# which makes the cell idempotent.
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
yes_no_codes = {'Yes': 1, 'No': 0}

for col in binary_cols:
    df[col] = df[col].map(lambda label: yes_no_codes.get(label, label))

# Fail loudly if any listed column still holds something other than 0/1
# (unexpected label or missing value).
assert df[binary_cols].isin([0, 1]).all().all(), \
    "binary columns contain values other than 'Yes'/'No'"

In [None]:
# One-Hot Encode Multi-Class Categoricals
# Each listed column is replaced by indicator columns; drop_first=True omits
# the indicator for the first category of every column.
# NOTE(review): only these four columns are encoded here — any other
# non-numeric column passes through unchanged; confirm that is intended.
multiclass_cols = ['Contract', 'PaymentMethod', 'InternetService', 'MultipleLines']
df = pd.get_dummies(df, columns=multiclass_cols, drop_first=True)

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Standardize the three numeric columns and write the result back into df.
# The scaler is fitted on every row of df.
# NOTE(review): if a train/test split happens downstream, fitting here lets the
# held-out rows influence the scaling statistics — confirm this is acceptable.
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

In [None]:
# Final Dataset Check
# df.info() prints its summary itself, so it can sit anywhere in the cell.
# df.head() only renders when it is the cell's last expression (with the
# default notebook display setting a bare expression mid-cell is evaluated and
# discarded), so it goes last and both outputs are shown.
df.info()
df.head()

In [None]:
# Save Clean Dataset
# Write the processed frame without the index column.
# The output directory is created first (no-op when it already exists) so a
# fresh checkout without ../data/processed does not fail at the very last step.
clean_csv_path = Path("../data/processed/clean_telco_churn.csv")
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(clean_csv_path, index=False)