In [None]:
# If you haven’t already:
!pip install xgboost scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, roc_curve
)
import matplotlib.pyplot as plt

# Load the cleaned churn dataset; `df.head()` previews the first rows when it
# is the last expression of a notebook cell.
DATA_PATH = "data/cleaned/telco_churn_cleaned.csv"
df = pd.read_csv(DATA_PATH)
df.head()

# Separate the target from the predictors. The customer identifier is dropped
# along with the target so that it is not fed to the model as a feature.
TARGET_COL = "churn"
y = df[TARGET_COL]
X = df.drop(columns=["customerid", TARGET_COL])

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# churn class proportions the same in both partitions, and the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale numeric features.
# train_test_split returns row subsets of X. Take explicit copies before
# assigning columns so the writes below land on independent frames; without
# this, pandas may emit SettingWithCopyWarning for the assignments.
X_train = X_train.copy()
X_test = X_test.copy()

num_cols = ["tenure", "monthlycharges", "totalcharges"]
scaler = StandardScaler()

# Fit on the training rows only, then reuse those statistics for the test rows
# so that nothing learned from the test set influences the transformation.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Hyperparameter tuning: exhaustive grid search around a baseline XGBoost
# classifier, scored by ROC AUC.
base_xgb = XGBClassifier(eval_metric="logloss", random_state=42)

# 3 * 3 * 2 * 3 = 54 candidate settings, each evaluated with 5-fold CV.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1],
    subsample=[0.6, 0.8, 1.0],
)

# n_jobs=-1 uses every available core; fit() returns the fitted search object,
# so it can be chained directly onto the constructor.
grid_cv = GridSearchCV(
    base_xgb,
    param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Report the best mean cross-validated score and the settings that produced it.
print("Best CV AUC:       ", grid_cv.best_score_)
print("Best hyperparams:   ", grid_cv.best_params_)

# Evaluate the tuned model on the held-out test set.
# With GridSearchCV's default refit=True, best_estimator_ has already been
# refit on all of the data passed to fit() using the winning hyperparameters.
best_xgb = grid_cv.best_estimator_

# Column 1 of predict_proba holds the probability of the second class in
# best_xgb.classes_; it feeds the threshold-free ROC AUC metric.
y_proba = best_xgb.predict_proba(X_test)[:, 1]
y_pred = best_xgb.predict(X_test)

# Metrics computed from the hard class predictions.
for label, metric in (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
):
    print(label, metric(y_test, y_pred))

# ROC AUC is computed from the predicted probabilities rather than the labels.
print("ROC AUC  :", roc_auc_score(y_test, y_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Plot the ROC curve of the tuned model on the test set.
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc_label = f'XGB (AUC = {roc_auc_score(y_test, y_proba):.3f})'

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=auc_label)
ax.plot([0, 1], [0, 1], '--')  # diagonal reference line (FPR == TPR)
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("XGBoost ROC Curve")
ax.legend(loc="lower right")
plt.show()

# Rank the features by the importance score the tuned model assigns to each,
# highest first, and show the ten largest.
importances = pd.Series(
    best_xgb.feature_importances_, index=X.columns
).sort_values(ascending=False)
importances.head(10)
