In [5]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [6]:
# Load the cleaned churn dataset (path is relative to the notebook's working directory).
data = pd.read_csv('churn_data_cleaned.csv')

# Separate the label column from the feature matrix.
y = data['TARGET']
X = data.drop(columns='TARGET')

# Hold out 20% as the test set *before* any resampling, so the SMOTE step used
# later in this notebook never sees (or synthesises from) evaluation rows.
# `stratify=y` asks for the same class proportions in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [7]:
# Logistic-regression baseline: standardise features -> SMOTE oversampling ->
# logistic regression. `Pipeline` here is imblearn's (see the import cell),
# which is what allows a sampler such as SMOTE to be used as a step.
log_steps = [
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('logreg', LogisticRegression(solver='liblinear', random_state=42)),
]
pipeline_log = Pipeline(steps=log_steps)

# Fit on the training split only; the test split stays untouched until scoring.
pipeline_log.fit(X_train, y_train)

# Hard class predictions, plus the probability column at index 1 for ROC AUC.
y_pred_log = pipeline_log.predict(X_test)
y_proba_log = pipeline_log.predict_proba(X_test)[:, 1]

# Display names for the report rows.
# NOTE(review): assumes TARGET is encoded 0 = not churn, 1 = churn — confirm.
target_names = ['Not Churn', 'Churn']

# Per-class precision / recall / F1 on the held-out test set.
report_log = classification_report(y_test, y_pred_log, target_names=target_names)
print(report_log)

# Threshold-independent ranking quality on the held-out test set.
auc_log = roc_auc_score(y_test, y_proba_log)
print('ROC AUC score:', auc_log)

              precision    recall  f1-score   support

   Not Churn       0.96      0.60      0.74     65253
       Churn       0.14      0.71      0.23      5785

    accuracy                           0.61     71038
   macro avg       0.55      0.65      0.48     71038
weighted avg       0.89      0.61      0.69     71038

ROC AUC score: 0.710443577495538
