In [None]:
# Only if you haven't installed already
!pip install scikit-learn


# 1. Imports
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, roc_curve
)
import matplotlib.pyplot as plt

# 2. Load the cleaned data
df = pd.read_csv("data/cleaned/telco_churn_cleaned.csv")

# 3. Define features (X) and target (y)
# "customerid" is dropped together with the target so it is not used as a
# predictor (presumably a row identifier -- confirm against the dataset).
X = df.drop(columns=["customerid", "churn"])
y = df["churn"]

# 4. Split into train & test sets (80/20 stratified)
# stratify=y keeps the class proportions of y the same in both splits;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Work on explicit copies: the frames returned by train_test_split are
# selections of X, and assigning columns into them below can otherwise raise
# pandas' SettingWithCopyWarning (and leaves it ambiguous whether X is touched).
X_train = X_train.copy()
X_test = X_test.copy()

# 5. Scale numeric columns
num_cols = ["tenure", "monthlycharges", "totalcharges"]
scaler = StandardScaler()

# Fit the scaler on the training rows only, then reuse the learned mean/std on
# the test rows so no test-set statistics leak into preprocessing.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols]  = scaler.transform(X_test[num_cols])

# 6. Hyperparameter search space for the random forest
# Every combination of the values below is evaluated by the grid search.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Untuned forest used as the template estimator; random_state fixes its seed.
base_rf = RandomForestClassifier(random_state=42)

# 7. Exhaustive grid search: 5-fold cross-validation scored by ROC AUC,
# using all available cores (n_jobs=-1). fit() returns the fitted search
# object itself, so the result can be bound directly.
grid_rf = GridSearchCV(
    base_rf,
    param_grid_rf,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# 8. Review cross-validation results: best mean CV score and the
# parameter combination that achieved it.
print("Best CV AUC:       ", grid_rf.best_score_)
print("Best hyperparameters:", grid_rf.best_params_)


# 9. Evaluate the best model from the grid search on the hold-out test set
best_rf = grid_rf.best_estimator_

# Column 1 of predict_proba is the probability of the second entry in
# best_rf.classes_ (presumably the churn-positive class -- confirm encoding).
y_proba_rf = best_rf.predict_proba(X_test)[:, 1]
y_pred_rf  = best_rf.predict(X_test)

# Threshold-based metrics use the hard predictions; ROC AUC uses the scores.
print("Test Accuracy :", accuracy_score(y_true=y_test, y_pred=y_pred_rf))
print("Test Precision:", precision_score(y_true=y_test, y_pred=y_pred_rf))
print("Test Recall   :", recall_score(y_true=y_test, y_pred=y_pred_rf))
print("Test ROC AUC  :", roc_auc_score(y_true=y_test, y_score=y_proba_rf))
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred_rf))


# 10. Inspect feature importances
# Label each importance with its feature name and rank from most to least
# important.
importances = pd.Series(
    best_rf.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

# View the 10 highest-ranked features
print(importances.head(10))
