In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score 

In [4]:
# Load the churn dataset from a CSV file in the current working directory.
data = pd.read_csv('churn_data_cleaned.csv')

# Features are every column except the label; the label is the TARGET column.
X = data.drop(columns='TARGET')
y = data['TARGET']

# Hold out 20% of the rows as a test set. The split is stratified on the label
# so train and test keep the same class ratio, and it is done BEFORE any SMOTE
# resampling so the test set contains only original (non-synthetic) rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# Model pipeline, in order: standardize features -> oversample with SMOTE ->
# XGBoost classifier. Both stochastic steps use a fixed seed for repeatability.
# NOTE: this is imblearn's Pipeline, which (per the imbalanced-learn docs) runs
# sampler steps such as SMOTE only during fit, not during predict.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('xgb', XGBClassifier(eval_metric='logloss', random_state=42)),
])

# Fit every step on the training split only.
pipeline.fit(X_train, y_train)

# Held-out predictions: hard class labels, plus the probability in column 1
# (the second class in the classifier's class ordering -- assumed to be the
# "Churn" label; confirm against the TARGET encoding).
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

# Display names for the two classes, in label order.
target_names = ['Not Churn', 'Churn']

# Report per-class precision/recall/F1, then the ROC AUC computed from the
# predicted probabilities.
print(
    classification_report(
        y_test,
        y_pred,
        target_names=target_names,
    )
)
print(
    'ROC AUC score:',
    roc_auc_score(y_test, y_proba),
)

              precision    recall  f1-score   support

   Not Churn       0.94      0.92      0.93     65253
       Churn       0.24      0.29      0.26      5785

    accuracy                           0.87     71038
   macro avg       0.59      0.60      0.59     71038
weighted avg       0.88      0.87      0.87     71038

ROC AUC score: 0.7507643495622869
