In [43]:
# 1. Imports
from pathlib import Path

import numpy as np
import pandas as pd

# 2. Load data
# Single place to point the notebook at the folder holding the two churn CSVs.
# "/content" is the location the notebook has always read from; change it here
# when the files live somewhere else.
DATA_DIR = Path("/content")

churn_80 = pd.read_csv(DATA_DIR / "churn-bigml-80.csv")  # split used for fitting
churn_20 = pd.read_csv(DATA_DIR / "churn-bigml-20.csv")  # split held out for evaluation

# 3. Inspect
churn_80.head()


Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [44]:

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score


In [45]:
# churn_80 and churn_20 are already loaded by the data-loading cell at the top
# of the notebook, so the CSVs are not read a second time here; this cell only
# reports the dimensions of the two splits.
print(churn_80.shape)
print(churn_20.shape)

# Both splits must expose the same columns, otherwise the feature frames built
# from them downstream would not line up.
assert set(churn_80.columns) == set(churn_20.columns), "train/test column mismatch"


(2666, 20)
(667, 20)


In [46]:
target_col = "Churn"

# Features are every column except the label; the label is kept as its own
# Series. churn_80 feeds the training pair, churn_20 the evaluation pair.
X_train, y_train = churn_80.drop(columns=target_col), churn_80[target_col]
X_test, y_test = churn_20.drop(columns=target_col), churn_20[target_col]

In [47]:
# Partition the feature columns by dtype so that every column lands in exactly
# one group: numeric columns in num_features, everything else in cat_features.
# Selecting on "number" and its complement (instead of only int64/float64 and
# object) prevents columns of any other dtype from falling into neither list.
# NOTE(review): "Area code" is integer-typed, so it ends up in num_features and
# is treated as a quantity; it looks like a nominal code — confirm whether it
# should be handled as categorical instead.
num_features = X_train.select_dtypes(include="number").columns
cat_features = X_train.select_dtypes(exclude="number").columns

print("Numerical:", len(num_features))
print("Categorical:", len(cat_features))


Numerical: 16
Categorical: 3


In [48]:
# Preprocessing pipeline: standardise the numeric columns and one-hot encode
# the categorical ones (categories not seen during fit are ignored rather than
# raising at transform time).
numeric_branch = ("num", StandardScaler(), num_features)
categorical_branch = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)

preprocessor = ColumnTransformer(transformers=[numeric_branch, categorical_branch])

In [49]:
# Full pipeline: the preprocessing step followed by a logistic-regression
# classifier, so the same transformations are applied at fit and predict time.
pipeline_steps = [
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
]
model = Pipeline(steps=pipeline_steps)

In [50]:
# Fit the whole pipeline (preprocessing + classifier) on the training split.
model.fit(X_train, y_train)  # X_train / y_train are built from churn_80


In [51]:
# Score the held-out churn_20 split: hard class labels for the report, and
# column 1 of predict_proba as the churn probability for the ranking metric.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)

print(test_report)
print("ROC-AUC:", test_auc)


              precision    recall  f1-score   support

       False       0.88      0.96      0.92       572
        True       0.50      0.24      0.33        95

    accuracy                           0.86       667
   macro avg       0.69      0.60      0.62       667
weighted avg       0.83      0.86      0.84       667

ROC-AUC: 0.8147773279352226


In [52]:
# Applying intervention logic
def intervention_logic(prob, high_threshold=0.75, medium_threshold=0.50):
    """Map a churn probability to the retention action recommended for it.

    The cut-offs are business rules, so they are exposed as keyword arguments
    instead of being buried in the comparisons; the defaults reproduce the
    original 0.75 / 0.50 behaviour.

    Args:
        prob: Predicted probability that the customer churns.
        high_threshold: Probabilities at or above this value receive the
            strongest intervention.
        medium_threshold: Probabilities at or above this value (but below
            ``high_threshold``) receive the lighter-touch intervention.

    Returns:
        One of ``"Retention offer + agent call"``, ``"Personalized email/SMS"``
        or ``"No action"``.

    Raises:
        ValueError: If ``medium_threshold`` is greater than ``high_threshold``,
            which would make the medium band unreachable.
    """
    if medium_threshold > high_threshold:
        raise ValueError(
            "medium_threshold must not exceed high_threshold "
            f"(got {medium_threshold} > {high_threshold})"
        )

    # Checked from the highest band down, so each customer gets the strongest
    # action they qualify for.
    if prob >= high_threshold:
        return "Retention offer + agent call"
    if prob >= medium_threshold:
        return "Personalized email/SMS"
    return "No action"


In [53]:
# Attach the predicted churn probability and the action it maps to onto a copy
# of the held-out features (assign returns a new frame; X_test is untouched).
results = X_test.assign(
    churn_probability=y_proba,
    recommended_action=lambda frame: frame["churn_probability"].apply(intervention_logic),
)

results.head()


Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,...,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,churn_probability,recommended_action
0,LA,117,408,No,No,0,184.5,97,31.37,351.6,...,29.89,215.8,90,9.71,8.7,4,2.35,1,0.145791,No action
1,IN,65,415,No,No,0,129.1,137,21.95,228.5,...,19.42,208.8,111,9.4,12.7,6,3.43,4,0.160389,No action
2,NY,161,415,No,No,0,332.9,67,56.59,317.8,...,27.01,160.6,128,7.23,5.4,9,1.46,4,0.718376,Personalized email/SMS
3,SC,111,415,No,No,0,110.4,103,18.77,137.3,...,11.67,189.6,105,8.53,7.7,6,2.08,2,0.053123,No action
4,HI,49,510,No,No,0,119.3,117,20.28,215.1,...,18.28,178.7,90,8.04,11.1,1,3.0,1,0.018819,No action


In [54]:
# NOTE(review): this cell repeats the predictions already computed in the
# earlier evaluation cell; the values are the same as long as `model` and
# `X_test` have not changed in between.

# Probability of churn for each churn_20 customer (column 1 of predict_proba);
# these scores drive the risk segmentation further down.
y_proba = model.predict_proba(X_test)[:, 1]

# Hard class predictions on churn_20.
y_pred = model.predict(X_test)


In [55]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the hard predictions on the held-out split.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       False       0.88      0.96      0.92       572
        True       0.50      0.24      0.33        95

    accuracy                           0.86       667
   macro avg       0.69      0.60      0.62       667
weighted avg       0.83      0.86      0.84       667



In [56]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve, computed from the predicted churn probabilities
# rather than the hard labels.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_proba)
print("ROC-AUC Score:", roc_auc)


ROC-AUC Score: 0.8147773279352226


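The baseline Logistic Regression achieved an ROC-AUC of 0.81 on the held-out split, indicating a good ability to rank customers by churn risk and making it a reasonable basis for probability-based intervention strategies. Note, however, that at the default 0.50 decision threshold it identifies only 24% of actual churners (recall 0.24), so the thresholds below reach a small share of the customers who churn.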

High Risk    : churn_probability ≥ 0.75
Medium Risk  : 0.50 ≤ churn_probability < 0.75
Low Risk     : churn_probability < 0.50
These thresholds are business rules, not ML rules.
    


In [57]:
# NOTE(review): intervention_logic is also defined in an earlier cell of this
# notebook; this later definition is the one in effect from here on. Keep a
# single definition once the notebook is tidied up.
def intervention_logic(prob, high_threshold=0.75, medium_threshold=0.50):
    """Map a churn probability to the retention action recommended for it.

    The cut-offs are business rules, so they are exposed as keyword arguments
    instead of being buried in the comparisons; the defaults reproduce the
    original 0.75 / 0.50 behaviour.

    Args:
        prob: Predicted probability that the customer churns.
        high_threshold: Probabilities at or above this value receive the
            strongest intervention.
        medium_threshold: Probabilities at or above this value (but below
            ``high_threshold``) receive the lighter-touch intervention.

    Returns:
        One of ``"Retention offer + agent call"``, ``"Personalized email/SMS"``
        or ``"No action"``.

    Raises:
        ValueError: If ``medium_threshold`` is greater than ``high_threshold``,
            which would make the medium band unreachable.
    """
    if medium_threshold > high_threshold:
        raise ValueError(
            "medium_threshold must not exceed high_threshold "
            f"(got {medium_threshold} > {high_threshold})"
        )

    # Checked from the highest band down, so each customer gets the strongest
    # action they qualify for.
    if prob >= high_threshold:
        return "Retention offer + agent call"
    if prob >= medium_threshold:
        return "Personalized email/SMS"
    return "No action"


In [58]:
# Applying intervention logic: attach the score, its risk band and the
# recommended action to a copy of the held-out features.
# NOTE(review): the 0.75 / 0.50 cut-offs used for the risk band repeat the ones
# inside intervention_logic — keep the two in sync.
results = X_test.assign(
    churn_probability=y_proba,
    # Conditions are evaluated in order, so "High" wins over "Medium" and
    # anything matching neither falls back to "Low".
    risk_segment=lambda frame: np.select(
        [frame["churn_probability"] >= 0.75, frame["churn_probability"] >= 0.50],
        ["High", "Medium"],
        default="Low",
    ),
    recommended_action=lambda frame: frame["churn_probability"].apply(intervention_logic),
)

results.head()


Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,...,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,churn_probability,risk_segment,recommended_action
0,LA,117,408,No,No,0,184.5,97,31.37,351.6,...,215.8,90,9.71,8.7,4,2.35,1,0.145791,Low,No action
1,IN,65,415,No,No,0,129.1,137,21.95,228.5,...,208.8,111,9.4,12.7,6,3.43,4,0.160389,Low,No action
2,NY,161,415,No,No,0,332.9,67,56.59,317.8,...,160.6,128,7.23,5.4,9,1.46,4,0.718376,Medium,Personalized email/SMS
3,SC,111,415,No,No,0,110.4,103,18.77,137.3,...,189.6,105,8.53,7.7,6,2.08,2,0.053123,Low,No action
4,HI,49,510,No,No,0,119.3,117,20.28,215.1,...,178.7,90,8.04,11.1,1,3.0,1,0.018819,Low,No action


In [59]:
# Business summary: how many held-out customers fall into each risk segment.
results["risk_segment"].value_counts()


Unnamed: 0_level_0,count
risk_segment,Unnamed: 1_level_1
Low,621
Medium,38
High,8
