## Quickscan

In [1]:
import os
import sys
sys.path.append('..')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, classification_report


In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("blastchar/telco-customer-churn")

#print("Path to dataset files:", path)

data_path = os.path.join(path, "WA_Fn-UseC_-Telco-Customer-Churn.csv")

df = pd.read_csv(data_path)

df.columns = df.columns.str.strip()
df["Churn"] = df["Churn"].str.strip()

# TotalCharges fix
if "TotalCharges" in df.columns:
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")


In [3]:
missing = df.isna().sum().sort_values(ascending=False)
missing[(missing > 0)]


TotalCharges    11
dtype: int64

In [4]:
df = df[~df['TotalCharges'].isna()]

In [5]:
target_col = "Churn"

# Drop CustomerID-like column
id_cols = [c for c in df.columns if "customer" in c.lower() or "id" in c.lower()]

X = df.drop(columns=[target_col] + id_cols)
y = df[target_col].map({"No": 0, "Yes": 1})  # binary target

numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

numeric_cols, categorical_cols


(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'],
 ['gender',
  'Partner',
  'Dependents',
  'PhoneService',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod'])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [7]:
from sklearn.impute import SimpleImputer

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),          # or "mean"
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),   # fills NaN with mode
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])


In [8]:
clf.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [9]:
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print(f"Accuracy: {acc:.3f}")
print(f"Recall (Churn=1): {rec:.3f}")
print(f"AUC: {auc:.3f}")
print("\nClassification report:\n")
print(classification_report(y_test, y_pred))


Accuracy: 0.804
Recall (Churn=1): 0.572
AUC: 0.836

Classification report:

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1033
           1       0.65      0.57      0.61       374

    accuracy                           0.80      1407
   macro avg       0.75      0.73      0.74      1407
weighted avg       0.80      0.80      0.80      1407



In [10]:
EXPECTED_REMAINING_MONTHS = 12

assert "MonthlyCharges" in df.columns, "MonthlyCharges not found"

X_test_reset = X_test.copy()
X_test_reset["true_churn"] = y_test.values
X_test_reset["churn_proba"] = y_proba

X_test_reset["expected_loss"] = (
    X_test_reset["churn_proba"] * X_test_reset["MonthlyCharges"] * EXPECTED_REMAINING_MONTHS
)

X_test_reset.head()


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,true_churn,churn_proba,expected_loss
974,Female,0,Yes,Yes,59,Yes,No,DSL,No,Yes,...,Yes,Yes,Two year,Yes,Credit card (automatic),75.95,4542.35,0,0.017513,15.961373
619,Female,0,No,No,7,Yes,Yes,Fiber optic,No,Yes,...,No,No,Month-to-month,Yes,Bank transfer (automatic),78.55,522.95,0,0.591921,557.945023
4289,Female,0,No,No,54,Yes,No,No,No internet service,No internet service,...,No internet service,No internet service,Two year,No,Mailed check,20.1,1079.45,0,0.004821,1.162832
3721,Female,0,No,No,2,Yes,No,No,No internet service,No internet service,...,No internet service,No internet service,Month-to-month,No,Mailed check,20.65,38.7,1,0.201652,49.969284
4533,Female,0,Yes,No,71,Yes,Yes,Fiber optic,No,Yes,...,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.15,7555.0,0,0.101501,128.074464


In [11]:
top_n = 20
top_risk = X_test_reset.sort_values("expected_loss", ascending=False).head(top_n)

top_risk[["MonthlyCharges", "churn_proba", "expected_loss"]].round(2)


Unnamed: 0,MonthlyCharges,churn_proba,expected_loss
2797,100.95,0.82,988.48
2631,99.25,0.83,984.6
2448,104.85,0.78,979.49
3956,105.3,0.77,969.49
3380,95.1,0.85,967.13
3159,94.85,0.85,962.7
6839,100.75,0.79,950.6
3727,96.6,0.82,945.91
2294,106.7,0.74,945.61
4701,104.35,0.74,931.33


In [12]:
if id_cols:
    # Index for merging
    df_indexed = df.set_index(df.index)
    X_test_reset_with_id = X_test_reset.copy()
    X_test_reset_with_id = pd.concat(
        [df[id_cols].loc[X_test_reset.index].reset_index(drop=True),
         X_test_reset.reset_index(drop=True)],
        axis=1
    )

    top_risk = X_test_reset_with_id.sort_values("expected_loss", ascending=False).head(top_n)
    top_risk[[*id_cols, "MonthlyCharges", "churn_proba", "expected_loss"]].round(2)


In [13]:
total_expected_loss_per_month = (
    X_test_reset["churn_proba"] * X_test_reset["MonthlyCharges"]
).sum()

total_expected_loss_total = X_test_reset["expected_loss"].sum()

print(f"Expected monthly volume risked (test-set, proxy): €{total_expected_loss_per_month:,.2f}")
print(f"Expected total volume in {EXPECTED_REMAINING_MONTHS} months (test-set, proxy): €{total_expected_loss_total:,.2f}")


Expected monthly volume risked (test-set, proxy): €28,304.83
Expected total volume in 12 months (test-set, proxy): €339,657.93


In [14]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score

dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X, y)
y_dummy = dummy.predict(X)

print("Dummy AUC:", roc_auc_score(y, y_dummy))


Dummy AUC: 0.5


## Proof of Concept

In [15]:
import sys
sys.path.append('..')

from pathlib import Path
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score

from xgboost import XGBClassifier

from src.data.preprocess import load_raw, split_features_target, build_preprocessor


In [16]:
df, DATA_DIR = load_raw()
MODELS_DIR = DATA_DIR.parent / "models"
MODELS_DIR.mkdir(parents=True, exist_ok=True)

from src.data.feature_engineering import apply_feature_engineering
df = apply_feature_engineering(df)

X, y, id_cols = split_features_target(df)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [17]:
preprocessor = build_preprocessor(X_train)

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
)

clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", xgb),
    ]
)


In [18]:
clf.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [24]:
model_path = MODELS_DIR / "model.pkl"
saved_model = joblib.dump(clf, model_path)


In [20]:
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print(f"Accuracy: {acc:.3f}")
print(f"Recall:   {rec:.3f}")
print(f"AUC:      {auc:.3f}")


Accuracy: 0.805
Recall:   0.516
AUC:      0.844


In [21]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = cross_val_score(clf, X, y, cv=cv, scoring="roc_auc")

print("CV AUC mean:", auc_scores.mean())
print("CV AUC std:", auc_scores.std())


CV AUC mean: 0.8425613608384113
CV AUC std: 0.0103711563250955


In [22]:
import numpy as np

def eval_top_k(y_true, y_proba, top_frac=0.2):
    n = int(len(y_true) * top_frac)
    idx = np.argsort(-y_proba)[:n]
    recall = y_true.iloc[idx].mean()
    return recall

print("Recall @ top 20%:", eval_top_k(y_test, y_proba, 0.2))


Recall @ top 20%: 0.6725978647686833


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score
import pandas as pd


models = {
    "dummy": DummyClassifier(strategy="most_frequent"),
    "logistic": LogisticRegression(max_iter=1000, n_jobs=-1),
    "xgboost": clf,
}

results = []

for name, clf_compare in models.items():
    print("Evaluating "+name)
    # train, except xgb which has already been trained
    if not hasattr(clf_compare, "classes_"):
        clf_compare =Pipeline(
            steps=[
                ("preprocessor", preprocessor),
                ("model", clf_compare),
            ])
        clf_compare.fit(X_train, y_train)

    # predict proba + labels
    if hasattr(clf_compare, "predict_proba"):
        y_proba = clf_compare.predict_proba(X_test)[:, 1]
    else:
        # fallback: some models only have decision_function
        from sklearn.metrics import roc_curve
        scores = clf_compare.decision_function(X_test)
        # normalise to 0–1 for roc_auc
        y_proba = (scores - scores.min()) / (scores.max() - scores.min())

    y_pred = clf_compare.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    results.append(
        {
            "model": name,
            "accuracy": acc,
            "recall": rec,
            "auc": auc,
        }
    )

results_df = pd.DataFrame(results).sort_values("auc", ascending=False)
results_df


Evaluating dummy
Evaluating logistic
Evaluating xgboost


Unnamed: 0,model,accuracy,recall,auc
2,xgboost,0.804826,0.516043,0.843712
1,logistic,0.799148,0.52139,0.842161
0,dummy,0.734564,0.0,0.5
