In [68]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score


In [31]:
df_churn = pd.read_csv('../data/cleaned_telecom_churn_data.csv')
df_churn.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,1,0,0,0,34,1,No,DSL,Yes,No,Yes,No,No,No,One year,0,Mailed check,56.95,1889.5,0
2,1,0,0,0,2,1,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
3,1,0,0,0,45,0,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,0,0,0,0,2,1,No,Fiber optic,No,No,No,No,No,No,Month-to-month,1,Electronic check,70.7,151.65,1


In [None]:

service_cols = [
    'OnlineBackup', 'OnlineSecurity', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies'
]

for col in service_cols:
    df_churn[col] = df_churn[col].replace({'Yes': 1, 'No': 0, 'No internet service': 0})

df_churn['services_count'] = df_churn[service_cols].sum(axis=1)

  df_churn[col] = df_churn[col].replace({'Yes': 1, 'No': 0, 'No internet service': 0})


In [33]:
df_churn.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,services_count
0,0,0,1,0,1,0,No phone service,DSL,0,1,...,0,0,0,Month-to-month,1,Electronic check,29.85,29.85,0,1
1,1,0,0,0,34,1,No,DSL,1,0,...,0,0,0,One year,0,Mailed check,56.95,1889.5,0,2
2,1,0,0,0,2,1,No,DSL,1,1,...,0,0,0,Month-to-month,1,Mailed check,53.85,108.15,1,2
3,1,0,0,0,45,0,No phone service,DSL,1,0,...,1,0,0,One year,0,Bank transfer (automatic),42.3,1840.75,0,3
4,0,0,0,0,2,1,No,Fiber optic,0,0,...,0,0,0,Month-to-month,1,Electronic check,70.7,151.65,1,0


In [34]:
bins = [0, 6, 12, 24, 48, 72]
labels = ['0-6', '6-12', '12-24', '24-48', '48-72']

df_churn['tenure_group'] = pd.cut(
    df_churn['tenure'],
    bins=bins,
    labels=labels,
    include_lowest=True
)

In [35]:
df_churn['TotalCharges'] = pd.to_numeric(df_churn['TotalCharges'], errors='coerce')
df_churn['TotalCharges'] = df_churn['TotalCharges'].fillna(0)
df_churn['payment_ratio'] = df_churn['MonthlyCharges'] / (df_churn['TotalCharges'] / df_churn['tenure'].replace(0, 1))
df_churn['payment_ratio'].replace([np.inf, -np.inf], 0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_churn['payment_ratio'].replace([np.inf, -np.inf], 0, inplace=True)


In [36]:
df_churn['payment_ratio'].replace([np.inf, -np.inf], 0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_churn['payment_ratio'].replace([np.inf, -np.inf], 0, inplace=True)


In [37]:
df_churn['is_new_client'] = (df_churn['tenure'] < 3).astype(int)

In [38]:
df_churn.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,services_count,tenure_group,payment_ratio,is_new_client
0,0,0,1,0,1,0,No phone service,DSL,0,1,...,Month-to-month,1,Electronic check,29.85,29.85,0,1,0-6,1.0,1
1,1,0,0,0,34,1,No,DSL,1,0,...,One year,0,Mailed check,56.95,1889.5,0,2,24-48,1.024768,0
2,1,0,0,0,2,1,No,DSL,1,1,...,Month-to-month,1,Mailed check,53.85,108.15,1,2,0-6,0.995839,1
3,1,0,0,0,45,0,No phone service,DSL,1,0,...,One year,0,Bank transfer (automatic),42.3,1840.75,0,3,24-48,1.034089,0
4,0,0,0,0,2,1,No,Fiber optic,0,0,...,Month-to-month,1,Electronic check,70.7,151.65,1,0,0-6,0.93241,1


In [39]:
df_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   gender            7032 non-null   int64   
 1   SeniorCitizen     7032 non-null   int64   
 2   Partner           7032 non-null   int64   
 3   Dependents        7032 non-null   int64   
 4   tenure            7032 non-null   int64   
 5   PhoneService      7032 non-null   int64   
 6   MultipleLines     7032 non-null   object  
 7   InternetService   7032 non-null   object  
 8   OnlineSecurity    7032 non-null   int64   
 9   OnlineBackup      7032 non-null   int64   
 10  DeviceProtection  7032 non-null   int64   
 11  TechSupport       7032 non-null   int64   
 12  StreamingTV       7032 non-null   int64   
 13  StreamingMovies   7032 non-null   int64   
 14  Contract          7032 non-null   object  
 15  PaperlessBilling  7032 non-null   int64   
 16  PaymentMethod     7032 n

In [None]:
df = pd.get_dummies(df_churn, drop_first=True)

X = df.drop("Churn", axis=1)
y = df["Churn"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [41]:
df = df_churn.copy()

cat_cols = ['MultipleLines', 'InternetService', 'Contract', 'PaymentMethod', 'tenure_group']
num_cols = [col for col in df_churn.columns 
            if df_churn[col].dtype in ['int64', 'float64'] and col != 'Churn']


X = df.drop("Churn", axis=1)
y = df["Churn"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols)
    ]
)

baseline_model = Pipeline([
    ("prep", preprocess),
    ("clf", LogisticRegression(max_iter=200))
])

baseline_model.fit(X_train, y_train)

y_pred = baseline_model.predict(X_test)
y_prob = baseline_model.predict_proba(X_test)[:,1]

print("BASELINE RESULTS")
print("----------------------")
print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("F1:", round(f1_score(y_test, y_pred), 4))
print("ROC-AUC:", round(roc_auc_score(y_test, y_prob), 4))

BASELINE RESULTS
----------------------
Accuracy: 0.7982
F1: 0.586
ROC-AUC: 0.8404


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=200).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols)
    ]
)

rf_pipeline = Pipeline([
    ('prep', preprocess),
    ('clf', RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42,
        n_jobs=-1
    ))
])

rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)
y_proba = rf_pipeline.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("RANDOM FOREST BASELINE RESULTS")
print("------------------------------")
print(f"Accuracy: {acc:.4f}")
print(f"F1:       {f1:.4f}")
print(f"ROC-AUC:  {auc:.4f}")

RANDOM FOREST BASELINE RESULTS
------------------------------
Accuracy: 0.7953
F1:       0.5802
ROC-AUC:  0.8205


In [None]:
from catboost import CatBoostClassifier, Pool

class_weights = [1, 2.7683]

cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    eval_metric='AUC',
    random_seed=42,
    class_weights=class_weights,
    verbose=100
)

target = 'Churn'

cat_features = ['MultipleLines', 'InternetService', 'Contract', 'PaymentMethod', 'tenure_group']
num_features = [col for col in df.columns if col not in cat_features + [target, 'customerID']]

X = df[cat_features + num_features]
y = df[target]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

cat_model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50)

y_pred = cat_model.predict(X_test)
y_proba = cat_model.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("CATBOOST BASELINE RESULTS")
print("-------------------------")
print(f"Accuracy: {acc:.4f}")
print(f"F1:       {f1:.4f}")
print(f"ROC-AUC:  {auc:.4f}")

0:	test: 0.7533041	best: 0.7533041 (0)	total: 4.04ms	remaining: 4.03s
100:	test: 0.8421476	best: 0.8421476 (100)	total: 387ms	remaining: 3.44s
200:	test: 0.8417658	best: 0.8432257 (153)	total: 765ms	remaining: 3.04s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8432256912
bestIteration = 153

Shrink model to first 154 iterations.
CATBOOST BASELINE RESULTS
-------------------------
Accuracy: 0.7392
F1:       0.6228
ROC-AUC:  0.8432


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score

param_dist = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'iterations': [200, 500, 1000],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'border_count': [32, 64, 128, 254],
    'bagging_temperature': [0, 1, 2, 5],
    'random_strength': [1, 5, 10]
}

cat_model = CatBoostClassifier(
    eval_metric='F1',
    random_seed=42,
    class_weights=[1, 2.7683],
    verbose=0
)

f1_scorer = make_scorer(f1_score)

search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=param_dist,
    n_iter=30,             
    scoring=f1_scorer,
    cv=3,                 
    verbose=2,
    n_jobs=-1,
    random_state=42
)


search.fit(X_train, y_train, cat_features=cat_features)

print("Best parameters:", search.best_params_)
best_model = search.best_estimator_

y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:,1]

from sklearn.metrics import accuracy_score, roc_auc_score

print("TUNED CATBOOST RESULTS")
print("----------------------")
print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("F1:", round(f1_score(y_test, y_pred), 4))
print("ROC-AUC:", round(roc_auc_score(y_test, y_proba), 4))

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END bagging_temperature=2, border_count=128, depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.1, random_strength=5; total time=   3.6s
[CV] END bagging_temperature=2, border_count=128, depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.1, random_strength=5; total time=   3.7s
[CV] END bagging_temperature=2, border_count=128, depth=4, iterations=500, l2_leaf_reg=1, learning_rate=0.1, random_strength=5; total time=   3.9s
[CV] END bagging_temperature=0, border_count=64, depth=4, iterations=1000, l2_leaf_reg=3, learning_rate=0.05, random_strength=10; total time=   7.6s
[CV] END bagging_temperature=0, border_count=64, depth=4, iterations=1000, l2_leaf_reg=3, learning_rate=0.05, random_strength=10; total time=   7.8s
[CV] END bagging_temperature=0, border_count=64, depth=4, iterations=1000, l2_leaf_reg=3, learning_rate=0.05, random_strength=10; total time=   7.8s
[CV] END bagging_temperature=1, border_count=254, 

In [None]:

df_churn['avg_charges'] = df_churn['TotalCharges'] / df_churn['tenure']
df_churn['avg_charges'].replace([np.inf, -np.inf], 0, inplace=True)
df_churn['avg_charges'].fillna(0, inplace=True)

df_churn['contract_payment'] = df_churn['Contract'].astype(str) + "_" + df_churn['PaymentMethod'].astype(str)

target = 'Churn'

cat_features = ['MultipleLines', 'InternetService', 'Contract', 'PaymentMethod', 'tenure_group', 'contract_payment']
num_features = [col for col in df_churn.columns if col not in cat_features + [target, 'customerID']]

X = df_churn[cat_features + num_features]
y = df_churn[target]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

class_weights = [1, 2.7683]

cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    eval_metric='F1',
    random_seed=42,
    class_weights=class_weights,
    verbose=100
)

train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

cat_model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50)

y_pred = cat_model.predict(X_test)
y_proba = cat_model.predict_proba(X_test)[:,1]

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("CATBOOST WITH ENGINEERED FEATURES")
print("---------------------------------")
print(f"Accuracy: {acc:.4f}")
print(f"F1:       {f1:.4f}")
print(f"ROC-AUC:  {auc:.4f}")

0:	learn: 0.7678138	test: 0.7514881	best: 0.7514881 (0)	total: 3.22ms	remaining: 3.22s


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_churn['avg_charges'].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_churn['avg_charges'].fillna(0, inplace=True)


100:	learn: 0.7992713	test: 0.7712563	best: 0.7731732 (94)	total: 409ms	remaining: 3.64s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7742401561
bestIteration = 126

Shrink model to first 127 iterations.
CATBOOST WITH ENGINEERED FEATURES
---------------------------------
Accuracy: 0.7392
F1:       0.6236
ROC-AUC:  0.8390


In [None]:
from sklearn.model_selection import train_test_split

target = 'Churn'

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



class_weights = [1, 2.7683]
f1_scorer = make_scorer(f1_score)

param_dist = {
    'depth': [4,6,8,10],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'iterations': [200, 500, 1000],
    'l2_leaf_reg': [1,3,5,7,9],
    'bagging_temperature': [0,1,2,5],
    'border_count': [32,64,128,254],
}

cat_model = CatBoostClassifier(
    eval_metric='F1',
    random_seed=42,
    class_weights=class_weights,
    verbose=0
)

search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=param_dist,
    n_iter=50,
    scoring=f1_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

search.fit(X_train, y_train, cat_features=cat_features)
best_model = search.best_estimator_
print("Best parameters:", search.best_params_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END bagging_temperature=0, border_count=254, depth=4, iterations=500, l2_leaf_reg=3, learning_rate=0.05; total time=   5.0s
[CV] END bagging_temperature=0, border_count=254, depth=4, iterations=500, l2_leaf_reg=3, learning_rate=0.05; total time=   5.1s
[CV] END bagging_temperature=0, border_count=254, depth=4, iterations=500, l2_leaf_reg=3, learning_rate=0.05; total time=   5.1s
[CV] END bagging_temperature=0, border_count=254, depth=4, iterations=500, l2_leaf_reg=3, learning_rate=0.05; total time=   5.5s
[CV] END bagging_temperature=0, border_count=254, depth=4, iterations=500, l2_leaf_reg=3, learning_rate=0.05; total time=   5.6s
[CV] END bagging_temperature=5, border_count=64, depth=6, iterations=500, l2_leaf_reg=9, learning_rate=0.05; total time=   7.3s
[CV] END bagging_temperature=5, border_count=64, depth=6, iterations=500, l2_leaf_reg=9, learning_rate=0.05; total time=   8.1s
[CV] END bagging_temperature=5, borde

In [47]:
best_model

<catboost.core.CatBoostClassifier at 0x132f22850>

In [48]:
from sklearn.metrics import precision_recall_curve

y_proba = best_model.predict_proba(X_test)[:,1]
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = 2 * (precision * recall) / (precision + recall)
best_threshold = thresholds[np.argmax(f1_scores)]

print("Best threshold for F1:", best_threshold)

Best threshold for F1: 0.5768876842997213


In [67]:
y_pred = (y_proba >= best_threshold).astype(int)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("FINAL CATBOOST RESULTS")
print("----------------------")
print(f"Accuracy:  {acc:.4f}")
print(f"F1:        {f1:.4f}")
print(f"ROC-AUC:   {auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")

FINAL CATBOOST RESULTS
----------------------
Accuracy:  0.7441
F1:        0.6211
ROC-AUC:   0.8302
Precision: 0.5122
Recall:    0.7888
