In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib
from xgboost import XGBClassifier

# Load dataset
data = pd.read_csv("Telco-Customer-Churn.csv")

# TotalCharges may contain blank strings, so it is parsed explicitly:
# unparseable entries become NaN and those rows are dropped.
data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors="coerce")
data = data.dropna(subset=["TotalCharges"])

# Features & target
X = data.drop(["customerID", "Churn"], axis=1)
y = data["Churn"].map({"Yes": 1, "No": 0})

# Series.map turns any label other than "Yes"/"No" into NaN without warning;
# fail loudly here instead of training on corrupted targets.
assert y.notna().all(), "Unexpected values found in the 'Churn' column"

# Split train/test (stratified so both parts keep the same churn ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Identify numerical and categorical columns.
# Numeric columns are selected positively and everything else is treated as
# categorical. Selecting "everything that is not object" as numeric would send
# string-dtype or category columns to StandardScaler, which cannot handle them.
numerical_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = [col for col in X.columns if col not in numerical_cols]

# Preprocessing: scale numeric features, one-hot encode the rest.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros
# instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)




In [None]:
# Logistic Regression.
# Note: the saga solver does not perform early stopping; it iterates until the
# convergence tolerance is met or max_iter is reached, hence the generous
# max_iter.
log_reg = LogisticRegression(
    solver="saga",
    max_iter=5000,
    penalty="l2",
    class_weight="balanced",
    n_jobs=-1,
    verbose=1
)

# XGBoost Classifier with GPU
# NOTE(review): use_label_encoder, tree_method="gpu_hist" and predictor are
# options from older XGBoost releases -- confirm them against the installed
# version before upgrading the library.
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    tree_method="gpu_hist",   # GPU acceleration
    predictor="gpu_predictor",
    random_state=42,
    verbosity=1
)

# Pipelines: the preprocessing is fitted inside each pipeline, so during
# cross-validation it only ever sees the training folds.
pipe_lr = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", log_reg)])
pipe_xgb = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", xgb)])

# Grid Search Parameters ("classifier__" routes each value to the final step)
param_grid_lr = {
    "classifier__C": [0.01, 0.1, 1, 10],
}

param_grid_xgb = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [3, 6, 10],
    "classifier__learning_rate": [0.01, 0.1, 0.2],
    "classifier__subsample": [0.8, 1],
}

# GridSearch
# CPU-bound search: evaluate candidates in parallel on all cores.
grid_lr = GridSearchCV(
    pipe_lr,
    param_grid=param_grid_lr,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# GPU-bound search: candidates are fitted one at a time. With n_jobs=-1 every
# CPU core would launch its own XGBoost training on the GPU simultaneously,
# making the workers contend for device memory instead of speeding things up.
grid_xgb = GridSearchCV(
    pipe_xgb,
    param_grid=param_grid_xgb,
    scoring="accuracy",
    cv=3,
    n_jobs=1,
    verbose=2,
)


In [None]:

# Train Logistic Regression
print("Training Logistic Regression...")
grid_lr.fit(X_train, y_train)

# Train XGBoost.
# No eval_set / early-stopping fit parameters are forwarded to the classifier:
#   * fit parameters bypass the pipeline's preprocessor, so an eval_set built
#     from the raw X_test would reach XGBoost un-encoded and not match the
#     transformed training features;
#   * early stopping on (X_test, y_test) would leak the hold-out set into
#     every cross-validation fit.
# The number of boosting rounds is tuned through the n_estimators grid instead.
print("Training XGBoost (GPU)...")
grid_xgb.fit(X_train, y_train)

# Best pipelines, refitted on the full training set by GridSearchCV
best_lr = grid_lr.best_estimator_
best_xgb = grid_xgb.best_estimator_

# Hold-out evaluation (reporting only -- not used for model selection)
y_pred_lr = best_lr.predict(X_test)
y_pred_xgb = best_xgb.predict(X_test)

acc_lr = accuracy_score(y_test, y_pred_lr)
acc_xgb = accuracy_score(y_test, y_pred_xgb)

print("\nBest Logistic Regression Results:")
print("Mean CV accuracy:", grid_lr.best_score_)
print("Accuracy:", acc_lr)
print(classification_report(y_test, y_pred_lr))

print("\nBest XGBoost Results:")
print("Mean CV accuracy:", grid_xgb.best_score_)
print("Accuracy:", acc_xgb)
print(classification_report(y_test, y_pred_xgb))

# Select best model overall.
# The choice is based on the mean cross-validated accuracy of each search.
# Picking the winner by test-set accuracy would make the reported test metrics
# optimistically biased. Ties go to XGBoost, as before.
if grid_lr.best_score_ > grid_xgb.best_score_:
    final_model = best_lr
    print("\nSelected Model: Logistic Regression")
else:
    final_model = best_xgb
    print("\nSelected Model: XGBoost (GPU)")

# Save the full pipeline (preprocessing + classifier) so it can be applied
# directly to raw feature rows later.
joblib.dump(final_model, "churn_model_pipeline.pkl")
print("\n✅ Model pipeline saved as churn_model_pipeline.pkl")