In [285]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)


In [286]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [287]:
df.shape


(7043, 21)

In [288]:
df = df.drop(columns=["customerID"])


In [289]:
numeric_cols = ["TotalCharges",  "MonthlyCharges"]

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")  # nieprawidłowe wartości -> NaN

print(df[numeric_cols].isna().sum())

for col in numeric_cols:
    df[col].fillna(df[col].median(), inplace=True)


TotalCharges      11
MonthlyCharges     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [290]:
df["Churn"] = (df["Churn"] == "Yes").astype(int)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [291]:
replace_map = {
    "No internet service": "No",
    "No phone service": "No"
}

for col in [
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies", "MultipleLines"
]:
    df[col] = df[col].replace(replace_map)


In [292]:
numeric_features = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges"
]

binary_features = [
    "SeniorCitizen"
]

categorical_features = [
    "gender", "Partner", "Dependents",
    "PhoneService", "MultipleLines",
    "InternetService",
    "OnlineSecurity", "OnlineBackup",
    "DeviceProtection", "TechSupport",
    "StreamingTV", "StreamingMovies",
    "Contract", "PaperlessBilling",
    "PaymentMethod"
]


In [293]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


In [294]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features)
    ],
    remainder="passthrough"
)


In [295]:
df["tenure_bucket"] = pd.cut(
    df["tenure"],
    bins=[0, 6, 18, 100],
    labels=["0-6", "7-18", "19+"],
    right=True
)


In [296]:
y = df["Churn"]

In [297]:
X = df.drop(columns=["Churn", "tenure_bucket"])

In [298]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, bucket_train, bucket_test = train_test_split(
    X, y, df["tenure_bucket"],
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [299]:
#najprostszy baseline
df["baseline_pred"] = (
    (df["tenure"] < 6) &
    (df["MonthlyCharges"] > 50)
).astype(int)


In [300]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)

y_true = df["Churn"]
y_pred = df["baseline_pred"]

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred))
print("Recall:", recall_score(y_true, y_pred))
print("F1:", f1_score(y_true, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))


Accuracy: 0.7711202612523073
Precision: 0.6679738562091503
Recall: 0.27340823970037453
F1: 0.3880030372057707
Confusion matrix:
 [[4920  254]
 [1358  511]]


In [301]:
baseline_bucket = (
    df
    .groupby("tenure_bucket")
    .apply(
        lambda x: pd.Series({
            "recall": recall_score(x["Churn"], x["baseline_pred"], zero_division=0),
            "precision": precision_score(x["Churn"], x["baseline_pred"], zero_division=0),
            "churn_rate": x["Churn"].mean(),
            "n": len(x)
        })
    )
    .reset_index()
)

baseline_bucket


  .groupby("tenure_bucket")
  .apply(


Unnamed: 0,tenure_bucket,recall,precision,churn_rate,n
0,0-6,0.651786,0.672368,0.533333,1470.0
1,7-18,0.0,0.0,0.343176,1253.0
2,19+,0.0,0.0,0.152007,4309.0


In [302]:
def expected_remaining_months(tenure):
    if tenure < 6:
        return 24
    elif tenure < 12:
        return 14
    elif tenure < 24:
        return 12
    elif tenure < 48:
        return 10
    else:
        return 6


In [303]:
df["expected_remaining_months"] = df["tenure"].apply(expected_remaining_months)
df["remaining_LTV"] = df["expected_remaining_months"] * df["MonthlyCharges"]


In [304]:
#biznesowa funkcja błędu
def business_loss(y_true, y_pred, remaining_LTV, monthly_fee):
    cost_retention = 20 + 3 * monthly_fee #koszt obsługi potencjalnego churnu w $
    
    loss = 0
    
    if y_true == 1 and y_pred == 0:  # FN
        loss = remaining_LTV
    elif y_true == 1 and y_pred == 1:  # TP
        loss = cost_retention
    elif y_true == 0 and y_pred == 1:  # FP
        loss = cost_retention
    
    return loss


In [305]:

df["loss_baseline"] = df.apply(
    lambda row: business_loss(
        y_true=row["Churn"],
        y_pred=row["baseline_pred"],
        remaining_LTV=row["remaining_LTV"],
        monthly_fee=row["MonthlyCharges"]
    ),
    axis=1
)

total_loss_baseline = df["loss_baseline"].sum()

df["loss_no_action"] = df["Churn"] * df["remaining_LTV"]
total_loss_no_action = df["loss_no_action"].sum()

# wynik
print(f"Total loss – no action: ${total_loss_no_action:,.0f}")
print(f"Total loss – baseline: ${total_loss_baseline:,.0f}")
print(f"Savings vs no action: ${(total_loss_no_action - total_loss_baseline):,.0f}")


Total loss – no action: $2,090,674
Total loss – baseline: $1,336,614
Savings vs no action: $754,059


In [306]:
df.groupby(["Churn", "baseline_pred"]).size()


Churn  baseline_pred
0      0                4920
       1                 254
1      0                1358
       1                 511
dtype: int64

In [307]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


features = ["tenure", "MonthlyCharges"]  # baseline + możesz dorzucić inne
X = df[features]
y = df["Churn"]

X_train, X_test, y_train, y_test, df_train, df_test = train_test_split(
    X, y, df, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [308]:
clf = LogisticRegression()
clf.fit(X_train_scaled, y_train)

y_proba = clf.predict_proba(X_test_scaled)[:,1]


In [309]:
import numpy as np

def total_business_loss_for_threshold(threshold):
    y_pred_local = (y_proba >= threshold).astype(int)
    
    df_test_reset = df_test.reset_index(drop=True)
    
    loss = df_test_reset.apply(
        lambda row: business_loss(
            y_true=row["Churn"],
            y_pred=y_pred_local[row.name],  
            remaining_LTV=row["remaining_LTV"],
            monthly_fee=row["MonthlyCharges"]
        ), axis=1
    )
    return loss.sum()


thresholds = np.linspace(0, 1, 101)
losses = [total_business_loss_for_threshold(t) for t in thresholds]

best_idx = np.argmin(losses)  
best_threshold = thresholds[best_idx]

print(f"Best threshold (minimal loss): {best_threshold:.2f}")
print(f"Expected loss at this threshold: ${losses[best_idx]:,.0f}")


Best threshold (minimal loss): 0.21
Expected loss at this threshold: $194,743


In [310]:
def bucket_losses_for_threshold(threshold):
    y_pred_local = (y_proba >= threshold).astype(int)
    
    df_test_reset = df_test.reset_index(drop=True)
    
    buckets = df_test_reset["tenure_bucket"].unique()
    
    results = []
    for b in buckets:
        bucket_df = df_test_reset[df_test_reset["tenure_bucket"] == b]
        loss = bucket_df.apply(
            lambda row: business_loss(
                y_true=row["Churn"],
                y_pred=y_pred_local[row.name],
                remaining_LTV=row["remaining_LTV"],
                monthly_fee=row["MonthlyCharges"]
            ),
            axis=1
        ).sum()
        results.append({
            "bucket": b,
            "loss": loss
        })
        
    return pd.DataFrame(results)

bucket_losses_df = bucket_losses_for_threshold(best_threshold)
print(bucket_losses_df)

total_loss = bucket_losses_df["loss"].sum()
print(f"\nTotal expected loss (all buckets): ${total_loss:,.0f}")


  bucket      loss
0    19+  92907.75
1   7-18  44189.75
2    0-6  57223.10
3    NaN      0.00

Total expected loss (all buckets): $194,321


In [311]:
def loss_for_bucket(bucket_df, threshold):
    bucket_df_reset = bucket_df.reset_index(drop=True)
    y_pred_local = (y_proba[bucket_df_reset.index] >= threshold).astype(int)  # teraz row.index = 0…len(bucket_df)-1
    loss = bucket_df_reset.apply(
        lambda row: business_loss(
            y_true=row["Churn"],
            y_pred=y_pred_local[row.name],
            remaining_LTV=row["remaining_LTV"],
            monthly_fee=row["MonthlyCharges"]
        ), axis=1
    )
    return loss.sum()

buckets = df_test["tenure_bucket"].unique()

results = []
total_loss_global = 0

for b in df_test["tenure_bucket"].unique():
    bucket_df = df_test[df_test["tenure_bucket"] == b]
    thresholds = np.linspace(0, 1, 101)
    losses = [loss_for_bucket(bucket_df, t) for t in thresholds]
    best_idx = np.argmin(losses)
    best_threshold = thresholds[best_idx]
    best_loss = losses[best_idx]
    
    results.append({
        "bucket": b,
        "best_threshold": best_threshold,
        "expected_loss": best_loss
    })
    
    total_loss_global += best_loss

results_df = pd.DataFrame(results)
print(results_df)
print(f"\nTotal expected loss (all buckets): ${total_loss_global:,.0f}")


  bucket  best_threshold  expected_loss
0    19+            0.83       93533.80
1   7-18            0.00       46721.90
2    0-6            0.00       57016.45
3    NaN            0.00           0.00

Total expected loss (all buckets): $197,272


In [312]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df.drop(columns=["Churn", "baseline_pred", "remaining_LTV", "expected_remaining_months"])
X = pd.get_dummies(X, drop_first=True)
y = df["Churn"]

X_train, X_test, y_train, y_test, df_train, df_test = train_test_split(
    X, y, df, test_size=0.3, random_state=42
)

rf = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42)
rf.fit(X_train, y_train)

y_proba = rf.predict_proba(X_test)[:,1]

def total_business_loss_for_threshold_rf(threshold):
    y_pred_local = (y_proba >= threshold).astype(int)
    df_test_reset = df_test.reset_index(drop=True)
    
    loss = df_test_reset.apply(
        lambda row: business_loss(
            y_true=row["Churn"],
            y_pred=y_pred_local[row.name],
            remaining_LTV=row["remaining_LTV"],
            monthly_fee=row["MonthlyCharges"]
        ),
        axis=1
    )
    return loss.sum()

import numpy as np
thresholds = np.linspace(0, 1, 101)
losses = [total_business_loss_for_threshold_rf(t) for t in thresholds]

best_idx = np.argmin(losses)
best_threshold = thresholds[best_idx]

print(f"Best threshold RF (minimal loss): {best_threshold:.2f}")
print(f"Expected loss RF at this threshold: ${losses[best_idx]:,.0f}")


Best threshold RF (minimal loss): 0.23
Expected loss RF at this threshold: $140,188


In [313]:
def loss_for_bucket_rf(bucket_df, threshold):
    bucket_df_reset = bucket_df.reset_index(drop=True)
    y_pred_local = (y_proba[bucket_df_reset.index] >= threshold).astype(int)  # teraz index pasuje
    
    loss = bucket_df_reset.apply(
        lambda row: business_loss(
            y_true=row["Churn"],
            y_pred=y_pred_local[row.name],
            remaining_LTV=row["remaining_LTV"],
            monthly_fee=row["MonthlyCharges"]
        ),
        axis=1
    ).sum()
    
    return loss


buckets = df_test["tenure_bucket"].unique()
results = []

for b in buckets:
    bucket_df = df_test[df_test["tenure_bucket"] == b]
    thresholds = np.linspace(0, 1, 101)
    
    losses = [loss_for_bucket_rf(bucket_df, t) for t in thresholds]
    
    best_idx = np.argmin(losses)
    best_threshold = thresholds[best_idx]
    
    results.append({
        "bucket": b,
        "best_threshold": best_threshold,
        "expected_loss": losses[best_idx]
    })

results_df = pd.DataFrame(results)
print(results_df)

total_loss_all_buckets = results_df["expected_loss"].sum()
print(f"\nTotal expected loss (all buckets): ${total_loss_all_buckets:,.0f}")


  bucket  best_threshold  expected_loss
0    0-6             0.0        85615.4
1    19+             1.0       152313.4
2   7-18             0.0        72844.5
3    NaN             0.0            0.0

Total expected loss (all buckets): $310,773


In [314]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd


X = df.drop(columns=["Churn", "baseline_pred", "expected_remaining_months", "remaining_LTV"])
y = df["Churn"]

X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test, df_train, df_test = train_test_split(
    X, y, df, test_size=0.33, random_state=42, stratify=y
)

scaler = StandardScaler()
numeric_cols = X_train.select_dtypes(include=np.number).columns
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])


gb_clf = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    random_state=42
)
gb_clf.fit(X_train_scaled, y_train)

y_proba = gb_clf.predict_proba(X_test_scaled)[:,1]

def total_business_loss_for_threshold(threshold):
    y_pred_local = (y_proba >= threshold).astype(int)
    df_test_reset = df_test.reset_index(drop=True)
    
    loss = df_test_reset.apply(
        lambda row: business_loss(
            y_true=row["Churn"],
            y_pred=y_pred_local[row.name],
            remaining_LTV=row["remaining_LTV"],
            monthly_fee=row["MonthlyCharges"]
        ), axis=1
    )
    return loss.sum()


thresholds = np.linspace(0, 1, 101)
losses = [total_business_loss_for_threshold(t) for t in thresholds]

best_idx = np.argmin(losses)
best_threshold = thresholds[best_idx]

print(f"Best threshold GBM (minimal loss): {best_threshold:.2f}")
print(f"Expected loss GBM at this threshold: ${losses[best_idx]:,.0f}")


Best threshold GBM (minimal loss): 0.01
Expected loss GBM at this threshold: $147,431


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
import pandas as pd


gb_clf = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    random_state=42
)
gb_clf.fit(X_train_scaled, y_train)

y_proba = gb_clf.predict_proba(X_test_scaled)[:,1]

def loss_for_bucket(bucket_df, threshold, y_proba_local):
    bucket_df_reset = bucket_df.reset_index(drop=True)
    y_pred_local = (y_proba_local >= threshold).astype(int)
    
    loss = bucket_df_reset.apply(
        lambda row: business_loss(
            y_true=row["Churn"],
            y_pred=y_pred_local[row.name],
            remaining_LTV=row["remaining_LTV"],
            monthly_fee=row["MonthlyCharges"]
        ), axis=1
    )
    return loss.sum()


buckets = df_test["tenure_bucket"].dropna().unique()
results = []

for b in buckets:
    bucket_df = df_test[df_test["tenure_bucket"] == b].reset_index(drop=True)
    y_proba_local = y_proba[bucket_df.index] if len(bucket_df) == len(y_proba) else y_proba[bucket_df.index[:len(bucket_df)]]
    
    thresholds = np.linspace(0, 1, 101)
    losses = [loss_for_bucket(bucket_df, t, y_proba_local) for t in thresholds]
    best_idx = np.argmin(losses)
    
    results.append({
        "bucket": b,
        "best_threshold": thresholds[best_idx],
        "expected_loss": losses[best_idx]
    })

results_df = pd.DataFrame(results)
total_expected_loss = results_df["expected_loss"].sum()

print("Per-bucket thresholds and losses:")
print(results_df.sort_values("bucket"))
print(f"\nTotal expected loss (all buckets): ${total_expected_loss:,.0f}")


Per-bucket thresholds and losses:
  bucket  best_threshold  expected_loss
2    0-6             0.0       89678.90
0    19+             1.0      156044.20
1   7-18             0.0       79409.15

Total expected loss (all buckets): $325,132
