# Telco Customer Churn Prediction — Feature Engineering & Baseline Modeling

### Step 1: Load and Clean the Raw Data

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the Telco churn CSV (path is relative to the notebook's working directory).
df = pd.read_csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Parse TotalCharges leniently: any value that is not a number becomes NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Impute the NaNs with the column mean. The result is assigned back instead of
# calling fillna(inplace=True) on the df[...] selection: that chained in-place
# form is deprecated and leaves df unmodified under pandas Copy-on-Write.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())

# Encode the target as 1 for 'Yes' and 0 for 'No'; any other label maps to NaN.
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})


### Step 2: Feature Engineering — Tenure Groups & Average Monthly Charge

In [None]:

def tenure_group(tenure):
    """Return the tenure-bucket label for a tenure expressed in months.

    Buckets are closed on the right: values up to and including 12 map to
    '0-12 months', up to 24 to '12-24 months', up to 48 to '24-48 months',
    and anything that matches none of those bounds maps to '48+ months'.
    """
    upper_bounds = (
        (12, '0-12 months'),
        (24, '12-24 months'),
        (48, '24-48 months'),
    )
    # Bounds are checked in ascending order, so the first match is the tightest bucket.
    for limit, label in upper_bounds:
        if tenure <= limit:
            return label
    return '48+ months'

# Zero tenures are replaced by 1 in the divisor only, to avoid dividing by zero.
tenure_divisor = df['tenure'].replace(0, 1)

# Add both engineered columns: the tenure bucket label and the average charge per month.
df = df.assign(
    tenure_group=df['tenure'].map(tenure_group),
    avg_monthly_charge=df['TotalCharges'] / tenure_divisor,
)

# Expand tenure_group into indicator columns; drop_first omits the first category's column.
df = pd.get_dummies(df, columns=['tenure_group'], drop_first=True)
df.head()


### Step 3: Separate Features and Target, then Split into Train and Test Sets

In [None]:

# customerID is an identifier and Churn is the target, so neither is used as a feature.
# Any text-valued (object/string/category) columns still present are one-hot encoded
# so that every feature is numeric, which the scikit-learn estimators fitted on
# X_train require. get_dummies leaves numeric and boolean columns untouched,
# so this is a no-op when the frame is already fully numeric.
X = pd.get_dummies(df.drop(['customerID', 'Churn'], axis=1), drop_first=True)
y = df['Churn']

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training size: {X_train.shape}, Testing size: {X_test.shape}")


### Step 4: Baseline Model — Logistic Regression

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Fit the logistic-regression baseline on the training split.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Hard class labels feed the threshold-based metrics (accuracy, precision, recall, F1).
y_pred_lr = lr.predict(X_test)
# ROC-AUC measures ranking quality, so it is scored on the predicted probability
# of the positive class (column 1) rather than on the hard labels, which would
# collapse the ROC curve to a single operating point.
y_proba_lr = lr.predict_proba(X_test)[:, 1]

print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_lr):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_lr):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_lr):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba_lr):.4f}")


### Step 5: Baseline Model — Decision Tree Classifier

In [None]:

from sklearn.tree import DecisionTreeClassifier

# Fit a depth-limited decision tree on the training split; random_state makes it repeatable.
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train, y_train)

# Hard class labels feed the threshold-based metrics (accuracy, precision, recall, F1).
y_pred_dt = dt.predict(X_test)
# ROC-AUC measures ranking quality, so it is scored on the predicted probability
# of the positive class (column 1) rather than on the hard labels, which would
# collapse the ROC curve to a single operating point.
y_proba_dt = dt.predict_proba(X_test)[:, 1]

print("Decision Tree Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_dt):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_dt):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_dt):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba_dt):.4f}")


### Step 6: Feature Importance from Decision Tree

In [None]:

import matplotlib.pyplot as plt

# Label each importance score from the fitted tree with its training column name.
feature_importances = pd.Series(data=dt.feature_importances_, index=X_train.columns)

# Keep the ten highest-scoring features, largest first.
top_features = feature_importances.sort_values(ascending=False).iloc[:10]

# Horizontal bars are drawn bottom-up, so the y-axis is flipped to put the
# most important feature at the top of the chart.
importance_ax = top_features.plot.barh()
importance_ax.set_xlabel('Feature Importance Score')
importance_ax.set_title('Top 10 Important Features (Decision Tree)')
importance_ax.invert_yaxis()
plt.show()
