In [None]:
# Load Data & Imports
import json
import os

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Read the processed churn table (path is relative to the notebook's
# working directory).
df = pd.read_csv("../data/processed/clean_telco_churn.csv")

# 'Churn' is the target; every remaining column is used as a feature.
y = df['Churn']
X = df.drop(columns='Churn')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Train Decision Tree
# Depth is capped at 5 and a node needs at least 50 samples before it may
# split. class_weight='balanced' reweights the classes inversely to their
# frequency in y_train.
dt_model = DecisionTreeClassifier(
    random_state=42,
    class_weight='balanced',
    min_samples_split=50,
    max_depth=5,
).fit(X_train, y_train)  # fit() returns the estimator itself

# Show the fitted estimator as the cell output.
dt_model

In [None]:
# Evaluate Decision Tree
# Column 1 of predict_proba is the score for the second entry of
# dt_model.classes_; ROC AUC is computed from that score, while precision,
# recall and F1 are computed from the hard label predictions.
dt_prob = dt_model.predict_proba(X_test)[:, 1]
dt_pred = dt_model.predict(X_test)

# Key order is kept as-is because this dict is later serialised to JSON.
dt_results = dict(
    model_name="Decision Tree",
    roc_auc=roc_auc_score(y_test, dt_prob),
    precision=precision_score(y_test, dt_pred),
    recall=recall_score(y_test, dt_pred),
    f1_score=f1_score(y_test, dt_pred),
)

dt_results

In [None]:
# Save Decision Tree Results
# json and os are imported in the notebook's top import cell, so this cell
# (and the later Random Forest save cell) no longer depends on a
# mid-notebook import.

# Create the output directory on first run; exist_ok makes re-runs a no-op.
os.makedirs("../results", exist_ok=True)

# Write the metrics dict as pretty-printed JSON, overwriting any previous run.
with open("../results/decision_tree.json", "w") as f:
    json.dump(dt_results, f, indent=4)

In [None]:
# Train Random Forest
# 200 trees, each limited to depth 10 with at least 50 samples required to
# split a node. class_weight='balanced' reweights the classes inversely to
# their frequency; n_jobs=-1 trains on all available CPU cores.
rf_model = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',
    n_estimators=200,
    min_samples_split=50,
    max_depth=10,
).fit(X_train, y_train)  # fit() returns the estimator itself

# Show the fitted estimator as the cell output.
rf_model

In [None]:
# Evaluate Random Forest
# Column 1 of predict_proba is the score for the second entry of
# rf_model.classes_; ROC AUC is computed from that score, while precision,
# recall and F1 are computed from the hard label predictions.
rf_prob = rf_model.predict_proba(X_test)[:, 1]
rf_pred = rf_model.predict(X_test)

# Key order is kept as-is because this dict is later serialised to JSON.
rf_results = dict(
    model_name="Random Forest",
    roc_auc=roc_auc_score(y_test, rf_prob),
    precision=precision_score(y_test, rf_pred),
    recall=recall_score(y_test, rf_pred),
    f1_score=f1_score(y_test, rf_pred),
)

rf_results

In [None]:
# Save Random Forest Results
# Ensure the output directory exists here as well, so this cell does not
# rely on the Decision Tree save cell having been run first.
os.makedirs("../results", exist_ok=True)

# Write the metrics dict as pretty-printed JSON, overwriting any previous run.
with open("../results/random_forest.json", "w") as f:
    json.dump(rf_results, f, indent=4)

In [None]:
# Feature Importance
# Both models were fitted on X_train, which was split from X, so X.columns
# supplies the feature names in the same order as feature_importances_.

# Decision Tree
dt_importance = pd.DataFrame({
    "feature": X.columns,
    "importance": dt_model.feature_importances_
}).sort_values(by="importance", ascending=False)

# A notebook cell only renders its last expression automatically, so this
# table must be displayed explicitly or it is silently dropped.
# `display` is provided by the IPython kernel namespace.
display(dt_importance.head(10))

# Random Forest
rf_importance = pd.DataFrame({
    "feature": X.columns,
    "importance": rf_model.feature_importances_
}).sort_values(by="importance", ascending=False)

# Last expression of the cell: rendered as the cell output.
rf_importance.head(10)

## 📌 Model Interpretation

- Tree-based models confirm that contract type, tenure, and monthly charges are strong predictors of churn.
- Random Forest outperforms Decision Tree due to better generalization.
- Feature importance aligns with business intuition observed during EDA.