In [52]:
import pandas as pd
import numpy as np

In [53]:
X_train = pd.read_csv("X_train.csv")
X_test  = pd.read_csv("X_test.csv")

y_train = pd.read_csv("y_train.csv")
y_test  = pd.read_csv("y_test.csv")

In [54]:
X_train = X_train.drop(columns=["tenure_by_year"])
X_test  = X_test.drop(columns=["tenure_by_year"])

In [55]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

print("\nTrain churn rate:", y_train.mean(), 5)
print("Test churn rate:", y_test.mean(), 5)

X_train shape: (5634, 23)
X_test shape: (1409, 23)
y_train shape: (5634, 1)
y_test shape: (1409, 1)

Train churn rate: Churn    0.265353
dtype: float64 5
Test churn rate: Churn    0.265436
dtype: float64 5


In [56]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [57]:
numeric_features = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X_train.select_dtypes(include=["object", "bool", "category"]).columns.tolist()

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

Numeric features: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'new_customer', 'high_monthly_charge', 'multiple_support', 'streaming_bundle']
Categorical features: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [58]:
preprocessor = ColumnTransformer(transformers=[
                                ("num", StandardScaler(), numeric_features),
                                ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features)])

In [59]:
log_reg = LogisticRegression(max_iter=1000, solver="lbfgs")

In [60]:
baseline_model = Pipeline(steps=[("preprocessor", preprocessor),("model", log_reg)])

In [61]:
baseline_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [62]:
from sklearn.metrics import (roc_auc_score, classification_report, confusion_matrix)

In [63]:
y_pred = baseline_model.predict(X_test)
y_prob = baseline_model.predict_proba(X_test)[:, 1]

In [64]:
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", round(roc_auc, 4))

ROC-AUC: 0.857


In [65]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1035
           1       0.70      0.55      0.62       374

    accuracy                           0.82      1409
   macro avg       0.77      0.73      0.75      1409
weighted avg       0.81      0.82      0.81      1409



In [66]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[946  89]
 [168 206]]


In [67]:
threshold = 0.4
y_pred_custom = (y_prob >= threshold).astype(int)

In [68]:
print(classification_report(y_test, y_pred_custom))

print(confusion_matrix(y_test, y_pred_custom))

              precision    recall  f1-score   support

           0       0.88      0.83      0.85      1035
           1       0.59      0.67      0.63       374

    accuracy                           0.79      1409
   macro avg       0.74      0.75      0.74      1409
weighted avg       0.80      0.79      0.80      1409

[[864 171]
 [123 251]]


In [69]:
preprocessor = baseline_model.named_steps["preprocessor"]

num_features = numeric_features

cat_encoder = preprocessor.named_transformers_["cat"]
cat_features = cat_encoder.get_feature_names_out(categorical_features)

all_features = list(num_features) + list(cat_features)

In [70]:
log_model = baseline_model.named_steps["model"]

coefficients = log_model.coef_[0]

coef_df = pd.DataFrame({"feature": all_features,"coefficient": coefficients})

coef_df = coef_df.sort_values(by="coefficient", ascending=False)

coef_df.head(10)


Unnamed: 0,feature,coefficient
14,InternetService_Fiber optic,1.14191
3,TotalCharges,0.381106
13,MultipleLines_Yes,0.359021
32,PaymentMethod_Electronic check,0.317871
30,PaperlessBilling_Yes,0.316193
27,StreamingMovies_Yes,0.233913
4,new_customer,0.227289
25,StreamingTV_Yes,0.224047
7,streaming_bundle,0.123197
0,SeniorCitizen,0.117252


In [71]:
coef_df.tail(10)

Unnamed: 0,feature,coefficient
22,TechSupport_No internet service,-0.188109
20,DeviceProtection_No internet service,-0.188109
10,Dependents_Yes,-0.21259
23,TechSupport_Yes,-0.291721
17,OnlineSecurity_Yes,-0.357166
2,MonthlyCharges,-0.453389
11,PhoneService_Yes,-0.473831
28,Contract_One year,-0.668976
1,tenure,-0.923708
29,Contract_Two year,-1.286717


In [72]:
from sklearn.ensemble import GradientBoostingClassifier

In [73]:
from sklearn.model_selection import GridSearchCV

pm_gridsearch = {
    "model__n_estimators": [150, 200, 300],
    "model__learning_rate": [.01 , 0.03, 0.05],
    "model__max_depth": [1, 2, 3, 4]}

In [74]:
gb_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", GradientBoostingClassifier(random_state=17))
])

grid_search = GridSearchCV(
    estimator=gb_pipeline,
    param_grid=pm_gridsearch,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)

In [75]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


  y = column_or_1d(y, warn=True)


In [76]:
print("Best Parameters:", grid_search.best_params_)

print("CV ROC-AUC:", round(grid_search.best_score_, 4))

Best Parameters: {'model__learning_rate': 0.05, 'model__max_depth': 1, 'model__n_estimators': 300}
CV ROC-AUC: 0.8436


In [77]:
best_gb = grid_search.best_estimator_
y_pred_gb = best_gb.predict(X_test)
y_proba_gb = best_gb.predict_proba(X_test)[:, 1]

In [78]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

roc_auc_gb = roc_auc_score(y_test, y_proba_gb)
print("Test ROC-AUC:", round(roc_auc_gb, 4))

Test ROC-AUC: 0.8622


In [79]:
threshold = 0.4
y_pred_gb_04 = (y_proba_gb >= threshold).astype(int)

print("Classification Report (0.4 threshold)")
print(classification_report(y_test, y_pred_gb_04))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb_04))

Classification Report (0.4 threshold)
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      1035
           1       0.65      0.66      0.65       374

    accuracy                           0.81      1409
   macro avg       0.76      0.76      0.76      1409
weighted avg       0.81      0.81      0.81      1409

Confusion Matrix:
[[902 133]
 [129 245]]


In [80]:
from sklearn.ensemble import RandomForestClassifier

In [81]:
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=17,
    n_jobs=-1)

In [82]:
rf_pipeline = Pipeline(steps=[("preprocessor", preprocessor),("model", rf_model)])
rf_pipeline.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [83]:
y_proba_rf = rf_pipeline.predict_proba(X_test)[:, 1]

In [84]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

print("Random Forest ROC-AUC:", round(roc_auc_score(y_test, y_proba_rf), 4))

threshold = 0.4
y_pred_rf = (y_proba_rf >= threshold).astype(int)

print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))

Random Forest ROC-AUC: 0.8371
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      1035
           1       0.59      0.63      0.61       374

    accuracy                           0.79      1409
   macro avg       0.73      0.74      0.73      1409
weighted avg       0.79      0.79      0.79      1409

[[874 161]
 [138 236]]


In [85]:
best_gb = grid_search.best_estimator_

y_proba_final = best_gb.predict_proba(X_test)[:, 1]

In [86]:
results = X_test.copy()

results["actual_churn"] = y_test.values
results["churn_probability"] = y_proba_final

results.sample(10)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,new_customer,high_monthly_charge,multiple_support,streaming_bundle,actual_churn,churn_probability
342,Female,1,Yes,No,8,Yes,Yes,Fiber optic,No,No,...,Yes,Electronic check,84.0,613.4,1,0,0,0,1,0.661714
948,Male,0,No,Yes,5,Yes,No,DSL,No,No,...,Yes,Bank transfer (automatic),48.65,235.2,1,0,1,0,0,0.310365
676,Female,0,Yes,No,33,No,No phone service,DSL,Yes,No,...,No,Bank transfer (automatic),59.55,2016.3,0,0,3,1,0,0.049199
53,Male,1,Yes,No,70,Yes,Yes,DSL,Yes,Yes,...,No,Credit card (automatic),90.05,6333.4,0,1,3,1,0,0.037049
1384,Female,1,No,No,66,Yes,Yes,Fiber optic,No,No,...,Yes,Credit card (automatic),102.85,6976.75,0,1,2,1,0,0.21888
223,Male,0,Yes,Yes,61,No,No phone service,DSL,Yes,No,...,No,Bank transfer (automatic),33.6,2117.2,0,0,2,0,0,0.07702
257,Male,1,No,No,5,Yes,No,Fiber optic,No,No,...,Yes,Mailed check,71.45,371.6,1,0,0,0,0,0.625141
388,Female,0,Yes,Yes,10,Yes,Yes,Fiber optic,Yes,No,...,Yes,Electronic check,100.25,1064.65,1,1,1,1,1,0.582324
678,Female,0,No,No,1,Yes,No,DSL,No,No,...,Yes,Mailed check,50.45,50.45,1,0,1,0,1,0.535451
208,Female,1,No,No,52,No,No phone service,DSL,No,Yes,...,Yes,Electronic check,50.5,2566.3,0,0,2,0,0,0.271786


In [87]:
def risk_tier(prob):
    if prob >= 0.6:
        return "High Risk"
    elif prob >= 0.4:
        return "Medium Risk"
    else:
        return "Low Risk"

results["risk_segment"] = results["churn_probability"].apply(risk_tier)

In [88]:
results["risk_segment"].value_counts()

Unnamed: 0_level_0,count
risk_segment,Unnamed: 1_level_1
Low Risk,1031
Medium Risk,226
High Risk,152


In [89]:
results.groupby("risk_segment")["actual_churn"].mean()

Unnamed: 0_level_0,actual_churn
risk_segment,Unnamed: 1_level_1
High Risk,0.736842
Low Risk,0.125121
Medium Risk,0.588496


In [90]:
def retention_strategy(row):
    if row["risk_segment"] == "High Risk":
        return "Contract Discount or Price Incentive"
    elif row["risk_segment"] == "Medium Risk":
        return "Targeted Engagement Email"
    else:
        return "Monitor and upsell"

results["retention_action"] = results.apply(retention_strategy, axis=1)

results[["churn_probability", "risk_segment", "retention_action"]].sample(5)

Unnamed: 0,churn_probability,risk_segment,retention_action
957,0.027636,Low Risk,Monitor and upsell
160,0.101483,Low Risk,Monitor and upsell
1317,0.045263,Low Risk,Monitor and upsell
25,0.306065,Low Risk,Monitor and upsell
581,0.111314,Low Risk,Monitor and upsell


## Findings/Conclusion

Overall, the tuned Gradient Boosting model achieved a test ROC-AUC of 0.862, outperforming the logistic regression baseline (ROC-AUC ≈ 0.857), and at a business-aligned threshold of 0.4 it achieved 65% precision and 66% recall for churners, correctly identifying 258 of 374 actual churners while limiting false positives to 155 customers. Risk segmentation showed strong separation, the High Risk group (152 customers) had a realized churn rate of 73.7%, the Medium Risk group (226 customers) churned at 58.8%, and the Low Risk group (1,031 customers) churned at only 12.5%, demonstrating strong ranking power. Key drivers included fiber optic service (coefficient ≈ +1.14), electronic check payments (+0.32), new customers (+0.23), and streaming add-ons, while two-year contracts (−1.29), tenure (−0.92), one-year contracts (−0.67), and support services reduced churn risk. Together, these results show that contract structure, pricing sensitivity, and service engagement impact churn, and that targeted interventions toward the 152 high-risk customers could meaningfully reduce overall churn while avoiding unnecessary incentives for the 1,031 low-risk customers.