# Telco Customer Churn Prediction — Data Preprocessing

### Step 1: Load Cleaned Data
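Load the Telco churn dataset examined in the previous EDA step, make sure `TotalCharges` is numeric (imputing unparseable entries with the column mean), and map `Churn` to a binary 0/1 target.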

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load the Telco churn CSV (path is relative to the notebook's working directory)
df = pd.read_csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Ensure TotalCharges is numeric. If the column was not parsed as a number
# (e.g. it was read as text), coerce unparseable entries to NaN and impute
# them with the column mean. Checking "is not numeric" rather than
# "dtype == 'object'" also covers dedicated string dtypes.
if not pd.api.types.is_numeric_dtype(df['TotalCharges']):
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the column selection: the in-place form acts on an intermediate
    # object and does not update df when Copy-on-Write is enabled.
    df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())

# Map churn to binary: 'Yes' -> 1, 'No' -> 0 (any other value becomes NaN)
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

### Step 2: Separate Features
Separate the categorical and numerical feature columns, excluding the `customerID` identifier and the `Churn` target.

In [None]:
# Categorical features: every object-dtype column (iterating a DataFrame
# yields its column labels) ...
categorical_cols = list(df.select_dtypes(include=['object']))
# ... except customerID, which is dropped from the list here
categorical_cols.remove('customerID')

# Numerical features: every int64/float64 column except the target.
# Churn was already mapped to 0/1, so it is selected here and must be removed.
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']))
numerical_cols.remove('Churn')

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

### Step 3: Encode Categorical Features

In [None]:
# Number of distinct values per categorical column, computed once up front.
# Label encoding keeps a column's cardinality, so these counts remain valid
# for picking the one-hot columns after the binary columns are encoded.
level_counts = {name: df[name].nunique() for name in categorical_cols}

# Two-level columns are label encoded in place to 0/1
binary_cols = [name for name, k in level_counts.items() if k == 2]
le = LabelEncoder()
for col in binary_cols:
    encoded = le.fit_transform(df[col])  # refit per column; le keeps only the last fit
    df[col] = encoded

# Columns with more than two levels are expanded into dummy indicators;
# drop_first=True omits one level per column to avoid redundant indicators
multi_cat_cols = [name for name, k in level_counts.items() if k > 2]
df = pd.get_dummies(df, columns=multi_cat_cols, drop_first=True)

df.head()

### Step 4: Scale Numerical Features

In [None]:
# Standardize the numerical features using statistics from the training rows
# only. Fitting the scaler on the full dataset would let test-set means and
# variances leak into preprocessing.
#
# The training rows are identified with the same split settings used in
# Step 5 (test_size=0.2, random_state=42, stratified on Churn). For a fixed
# number of rows, stratify labels and random_state, train_test_split selects
# the same row positions, so these are exactly the rows that end up in X_train.
# Keep these settings in sync with Step 5.
train_pos, _ = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=42,
    stratify=df['Churn'],
)

scaler = StandardScaler()
scaler.fit(df.iloc[train_pos][numerical_cols])

# Apply the train-fitted transformation to every row (train and test alike)
df[numerical_cols] = scaler.transform(df[numerical_cols])
df.head()

### Step 5: Split into Training and Test Sets

In [None]:
# Target vector, and the feature matrix with the target and the customerID
# column removed
y = df['Churn']
X = df.drop(columns=['customerID', 'Churn'])

# Hold out 20% of the rows for testing; stratifying on y keeps the churn
# ratio the same in both partitions, and the fixed seed makes the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")