# Bank Customer Churn Prediction (part 3 - Decision Tree)
EDA and previous experiments can be found in the notebooks below:

https://github.com/Maxstef/data-loves-ml-for-people-course/blob/main/notebooks/2_2_logistic_regression/0_4_bank_customer.ipynb

https://github.com/Maxstef/data-loves-ml-for-people-course/blob/main/notebooks/2_2_logistic_regression/0_7_bank_customer_numeric_binner.ipynb

In the current notebook we test the performance of decision tree models on this dataset.

## Intro & imports

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from math import inf

import pandas as pd
import numpy as np


# Name of the label column, and the raw training data loaded from disk
# (the first CSV column is used as the DataFrame index).
target_col = "Exited"
raw_df = pd.read_csv("downloads/train.csv", index_col=0)

In [2]:
from mlpeople.optimization.experiments.polynomial_pipeline_experiments import run_experiment_poly, run_experiments_poly

## Train model with best found combination

In [3]:
# Keyword arguments for a single `run_experiment_poly` run
# (presumably the best combination found in the earlier notebooks — verify).
fixed_params = dict(
    stratify_col=target_col,
    drop_cols=["Surname", "CustomerId", "Tenure", "HasCrCard"],
    scale_mode="standard",
    encode_drop=None,
    separate_binary_numeric=False,
    polynomial_degree=4,
    polynomial_after_scale=True,
    top_n_cat_values=None,
    binary_cat_flag_cols={
        "Geography": [
            {"flag_name": "IsGermany", "value": "Germany", "drop_original": True},
        ],
    },
    binary_num_flag_cols=None,
    polynomial_interaction_only=False,
    model=LogisticRegression(C=0.5, solver="liblinear"),
    num_bin_cols={
        "EstimatedSalary": dict(
            bins=[0, 25000, 100000, inf],
            labels=["low", "medium", "high"],
            new_col="SalaryScore",
            drop_original=True,
        ),
        "CreditScore": dict(
            bins=[0, 675, inf],
            labels=["low", "high"],
            new_col="CreditScoreScore",
            drop_original=True,
        ),
    },
)

# Grid form of the same configuration for `run_experiments_poly`: every
# "<name>_options" entry is a one-element list holding the fixed value, so an
# experiment cell can replace only the entries it wants to vary.
fixed_params_options = {
    "test_size_options": [0.2],
    **{
        f"{name}_options": [fixed_params[name]]
        for name in (
            "stratify_col",
            "drop_cols",
            "scale_mode",
            "encode_drop",
            "model",
            "separate_binary_numeric",
            "polynomial_degree",
            "polynomial_after_scale",
            "top_n_cat_values",
            "binary_cat_flag_cols",
            "binary_num_flag_cols",
            "polynomial_interaction_only",
            "num_bin_cols",
        )
    },
}

In [4]:
# Fit the pipeline once with the fixed configuration and keep its scores
# and the fitted pipeline object for later use.
result = run_experiment_poly(raw_df, target_col, **fixed_params)

roc_auc_train, roc_auc_val = result["roc_auc_train"], result["roc_auc_val"]
poly_model_pipeline_optimal_v3 = result["pipeline"]

# Displayed as the cell output: (train ROC AUC, validation ROC AUC).
(roc_auc_train, roc_auc_val)

(0.9381483220937081, 0.9395325468139104)

In [10]:
# Baseline validation ROC AUC that the experiment cells compare against when
# filling their "IsBetter" columns.
# NOTE(review): the execution counter (In [10]) shows this cell was run out of
# file order, so the value captured in the recorded run may differ from the
# `roc_auc_val` assigned by the cell directly above — confirm the intended baseline.
roc_auc_val_best = roc_auc_val

## Decision Tree max_depth=5

In [6]:
# Re-run the fixed configuration with a depth-limited decision tree in place of
# the logistic regression; every other pipeline setting is left untouched.
filtered_params = {k: v for k, v in fixed_params.items() if k != "model"}

result = run_experiment_poly(
    raw_df,
    target_col,
    model=DecisionTreeClassifier(random_state=42, max_depth=5),
    **filtered_params
)

roc_auc_train = result["roc_auc_train"]
roc_auc_val = result["roc_auc_val"]

# Use this decision-tree score as the baseline for the "IsBetter" flags.
# The recorded experiment outputs below are consistent with this threshold
# (0.912911 is flagged False, 0.913874 True), so setting it here makes a
# top-to-bottom "Restart & Run All" reproduce them instead of depending on
# the order in which cells happened to be executed.
roc_auc_val_best = roc_auc_val

roc_auc_train, roc_auc_val

(0.9224089052243478, 0.9129106934632005)

In [7]:
# From here on the fixed configuration uses the depth-5 decision tree as its model;
# the grid form is kept in sync by pointing at the same estimator instance.
fixed_params.update(model=DecisionTreeClassifier(max_depth=5, random_state=42))
fixed_params_options.update(model_options=[fixed_params["model"]])

## Experiments

### Drop Cols options

In [None]:
%%time

# Candidate sets of columns to drop before fitting, from nothing up to the
# set used in the fixed configuration.
drop_cols_options = [
    [],
    ["Surname"],
    ["CustomerId"],
    ["Surname", "CustomerId"],
    ["Surname", "CustomerId", "Tenure"],
    ["Surname", "CustomerId", "Tenure", "HasCrCard"],
]

# All other options keep their single fixed value; only drop_cols is varied.
filtered_params_options = dict(fixed_params_options)
filtered_params_options.pop("drop_cols_options", None)

results_drop_cols = run_experiments_poly(
    raw_df,
    target_col,
    drop_cols_options=drop_cols_options,
    **filtered_params_options,
)

# Flag runs whose validation ROC AUC beats the stored baseline, then show the best ones.
results_drop_cols["IsBetter"] = results_drop_cols["roc_auc_val"].gt(roc_auc_val_best)
(
    results_drop_cols[["drop_cols", "roc_auc_train", "roc_auc_val", "IsBetter"]]
    .sort_values("roc_auc_val", ascending=False)
    .head(10)
)

CPU times: user 3.89 s, sys: 161 ms, total: 4.05 s
Wall time: 4.17 s


Unnamed: 0,drop_cols,roc_auc_train,roc_auc_val,IsBetter
5,"(Surname, CustomerId, Tenure, HasCrCard, Exited)",0.922409,0.912911,False
4,"(Surname, CustomerId, Tenure, Exited)",0.92259,0.911598,False
2,"(CustomerId, Exited)",0.922307,0.911563,False
3,"(Surname, CustomerId, Exited)",0.922646,0.910355,False
0,"(Exited,)",0.922887,0.910089,False
1,"(Surname, Exited)",0.922887,0.908456,False


### Reset all params & Drop Cols options

no polynomial features

no custom transformers

In [None]:
%%time

# Start over from a plain feature set: polynomial degree 1 and no custom
# flag/binning transformers, while re-testing which columns to drop.
num_bin_cols_options = [None]
binary_cat_flag_cols_options = [None]
polynomial_degree_options = [1]
drop_cols_options = [
    [],
    ["Surname"],
    ["CustomerId"],
    ["Surname", "CustomerId"],
    ["Surname", "CustomerId", "Tenure"],
    ["Surname", "CustomerId", "Tenure", "HasCrCard"],
]

# Keep the fixed value for every option that is not overridden above.
filtered_params_options = {
    name: options
    for name, options in fixed_params_options.items()
    if name
    not in {
        "drop_cols_options",
        "polynomial_degree_options",
        "binary_cat_flag_cols_options",
        "num_bin_cols_options",
    }
}

results_drop_cols = run_experiments_poly(
    raw_df,
    target_col,
    drop_cols_options=drop_cols_options,
    polynomial_degree_options=polynomial_degree_options,
    binary_cat_flag_cols_options=binary_cat_flag_cols_options,
    num_bin_cols_options=num_bin_cols_options,
    **filtered_params_options,
)

# Flag runs that beat the stored baseline and list the top results.
results_drop_cols["IsBetter"] = results_drop_cols["roc_auc_val"].gt(roc_auc_val_best)
(
    results_drop_cols[["drop_cols", "roc_auc_train", "roc_auc_val", "IsBetter"]]
    .sort_values("roc_auc_val", ascending=False)
    .head(10)
)

CPU times: user 397 ms, sys: 78.7 ms, total: 476 ms
Wall time: 502 ms


Unnamed: 0,drop_cols,roc_auc_train,roc_auc_val,IsBetter
3,"(Surname, CustomerId, Exited)",0.925694,0.921927,True
4,"(Surname, CustomerId, Tenure, Exited)",0.925694,0.921927,True
5,"(Surname, CustomerId, Tenure, HasCrCard, Exited)",0.925694,0.921927,True
1,"(Surname, Exited)",0.925766,0.921864,True
2,"(CustomerId, Exited)",0.924003,0.918712,True
0,"(Exited,)",0.924075,0.917015,True


In [14]:
# Adopt the smallest of the top-scoring drop sets as the new fixed value,
# and mirror it into the grid form.
fixed_params.update(drop_cols=["Surname", "CustomerId"])
fixed_params_options.update(drop_cols_options=[fixed_params["drop_cols"]])

### Polynomial degree and max depth

In [None]:
%%time

# Grid over polynomial degree (1-4) and tree depth (5-10), still without the
# custom flag/binning transformers.
polynomial_degree_options = [1, 2, 3, 4]
binary_cat_flag_cols_options = [None]
num_bin_cols_options = [None]
model_options = [
    DecisionTreeClassifier(random_state=42, max_depth=depth)
    for depth in range(5, 11)
]

# Options that are varied or overridden in this cell are removed from the fixed set.
filtered_params_options = dict(fixed_params_options)
filtered_params_options.pop("model_options", None)
filtered_params_options.pop("polynomial_degree_options", None)
filtered_params_options.pop("binary_cat_flag_cols_options", None)
filtered_params_options.pop("num_bin_cols_options", None)

results_poly_depth = run_experiments_poly(
    raw_df,
    target_col,
    model_options=model_options,
    polynomial_degree_options=polynomial_degree_options,
    binary_cat_flag_cols_options=binary_cat_flag_cols_options,
    num_bin_cols_options=num_bin_cols_options,
    **filtered_params_options,
)

# Flag runs that beat the stored baseline and show the ten best by validation ROC AUC.
results_poly_depth["IsBetter"] = results_poly_depth["roc_auc_val"].gt(roc_auc_val_best)
(
    results_poly_depth[
        ["model", "polynomial_degree", "roc_auc_train", "roc_auc_val", "IsBetter"]
    ]
    .sort_values("roc_auc_val", ascending=False)
    .head(10)
)

CPU times: user 15.3 s, sys: 264 ms, total: 15.6 s
Wall time: 15.7 s


Unnamed: 0,model,polynomial_degree,roc_auc_train,roc_auc_val,IsBetter
0,"DecisionTreeClassifier(max_depth=5, random_sta...",1,0.925694,0.921927,True
1,"DecisionTreeClassifier(max_depth=6, random_sta...",1,0.933778,0.921111,True
13,"DecisionTreeClassifier(max_depth=6, random_sta...",3,0.934525,0.918782,True
6,"DecisionTreeClassifier(max_depth=5, random_sta...",2,0.919803,0.915133,True
7,"DecisionTreeClassifier(max_depth=6, random_sta...",2,0.931477,0.91499,True
12,"DecisionTreeClassifier(max_depth=5, random_sta...",3,0.923137,0.914517,True
2,"DecisionTreeClassifier(max_depth=7, random_sta...",1,0.941535,0.914449,True
18,"DecisionTreeClassifier(max_depth=5, random_sta...",4,0.922249,0.913874,True
19,"DecisionTreeClassifier(max_depth=6, random_sta...",4,0.929079,0.912304,False
8,"DecisionTreeClassifier(max_depth=7, random_sta...",2,0.939812,0.909669,False


### Scale

In [None]:
%%time

# Compare the feature-scaling modes for the depth-5 tree, with polynomial
# degree 1 and no custom flag/binning transformers.
polynomial_degree_options = [1]
binary_cat_flag_cols_options = [None]
num_bin_cols_options = [None]
model_options = [
    DecisionTreeClassifier(random_state=42, max_depth=5),
]
# Only the modes the pipeline accepts are listed. A `None` entry is rejected
# with 'scale_mode must be "minmax" or "standard"', so it only produced a
# failed run and never a result row.
scale_mode_options = ["minmax", "standard"]

# Keep the fixed value for every option that is not overridden above.
filtered_params_options = {
    k: v
    for k, v in fixed_params_options.items()
    if k not in [
        "scale_mode_options",
        "model_options",
        "polynomial_degree_options",
        "binary_cat_flag_cols_options",
        "num_bin_cols_options",
    ]
}

results_scale = run_experiments_poly(
    raw_df,
    target_col,
    scale_mode_options=scale_mode_options,
    model_options=model_options,
    polynomial_degree_options=polynomial_degree_options,
    binary_cat_flag_cols_options=binary_cat_flag_cols_options,
    num_bin_cols_options=num_bin_cols_options,
    **filtered_params_options,
)

# Flag runs that beat the stored baseline and list them by validation ROC AUC.
results_scale["IsBetter"] = results_scale["roc_auc_val"] > roc_auc_val_best
results_scale[["scale_mode", "roc_auc_train", "roc_auc_val", "IsBetter"]].sort_values("roc_auc_val", ascending=False).head(10)

FAILED: scale_mode must be "minmax" or "standard"
CPU times: user 58.6 ms, sys: 15.8 ms, total: 74.3 ms
Wall time: 80.7 ms


Unnamed: 0,scale_mode,roc_auc_train,roc_auc_val,IsBetter
0,minmax,0.925694,0.921927,True
1,standard,0.925694,0.921927,True


### Encoder drop

In [17]:
%%time

# Try each `encode_drop` setting with trees of depth 5-7, using polynomial
# degree 1 and no custom flag/binning transformers.
encode_drop_options = [None, "if_binary", "first"]
model_options = [
    DecisionTreeClassifier(random_state=42, max_depth=depth)
    for depth in (5, 6, 7)
]
polynomial_degree_options = [1]
binary_cat_flag_cols_options = [None]
num_bin_cols_options = [None]

# Keep the fixed value for every option that is not overridden above.
filtered_params_options = {
    name: options
    for name, options in fixed_params_options.items()
    if name
    not in (
        "encode_drop_options",
        "model_options",
        "polynomial_degree_options",
        "binary_cat_flag_cols_options",
        "num_bin_cols_options",
    )
}

results_encode_drop = run_experiments_poly(
    raw_df,
    target_col,
    encode_drop_options=encode_drop_options,
    model_options=model_options,
    polynomial_degree_options=polynomial_degree_options,
    binary_cat_flag_cols_options=binary_cat_flag_cols_options,
    num_bin_cols_options=num_bin_cols_options,
    **filtered_params_options
)

# Flag runs that beat the stored baseline and show the ten best by validation ROC AUC.
results_encode_drop["IsBetter"] = results_encode_drop["roc_auc_val"].gt(roc_auc_val_best)
(
    results_encode_drop[
        ["encode_drop", "model", "roc_auc_train", "roc_auc_val", "IsBetter"]
    ]
    .sort_values("roc_auc_val", ascending=False)
    .head(10)
)

CPU times: user 250 ms, sys: 19.2 ms, total: 269 ms
Wall time: 274 ms


Unnamed: 0,encode_drop,model,roc_auc_train,roc_auc_val,IsBetter
0,,"DecisionTreeClassifier(max_depth=5, random_sta...",0.925694,0.921927,True
3,if_binary,"DecisionTreeClassifier(max_depth=5, random_sta...",0.925694,0.921927,True
6,first,"DecisionTreeClassifier(max_depth=5, random_sta...",0.925694,0.921927,True
1,,"DecisionTreeClassifier(max_depth=6, random_sta...",0.933778,0.921111,True
7,first,"DecisionTreeClassifier(max_depth=6, random_sta...",0.933687,0.920216,True
4,if_binary,"DecisionTreeClassifier(max_depth=6, random_sta...",0.933778,0.919789,True
2,,"DecisionTreeClassifier(max_depth=7, random_sta...",0.941535,0.914449,True
5,if_binary,"DecisionTreeClassifier(max_depth=7, random_sta...",0.941535,0.914449,True
8,first,"DecisionTreeClassifier(max_depth=7, random_sta...",0.941508,0.914359,True


### Max leaf

In [18]:
%%time

# Limit tree size through `max_leaf_nodes` instead of `max_depth`, from large
# to small, using polynomial degree 1 and no custom flag/binning transformers.
polynomial_degree_options = [1]
binary_cat_flag_cols_options = [None]
num_bin_cols_options = [None]
model_options = [
    DecisionTreeClassifier(random_state=42, max_leaf_nodes=leaf_limit)
    for leaf_limit in (1024, 856, 512, 256, 128, 64, 32, 16)
]

# Options that are overridden in this cell are removed from the fixed set.
filtered_params_options = dict(fixed_params_options)
filtered_params_options.pop("model_options", None)
filtered_params_options.pop("polynomial_degree_options", None)
filtered_params_options.pop("binary_cat_flag_cols_options", None)
filtered_params_options.pop("num_bin_cols_options", None)

results_max_leaf = run_experiments_poly(
    raw_df,
    target_col,
    model_options=model_options,
    polynomial_degree_options=polynomial_degree_options,
    binary_cat_flag_cols_options=binary_cat_flag_cols_options,
    num_bin_cols_options=num_bin_cols_options,
    **filtered_params_options,
)

# Flag runs that beat the stored baseline and list them by validation ROC AUC.
results_max_leaf["IsBetter"] = results_max_leaf["roc_auc_val"].gt(roc_auc_val_best)
(
    results_max_leaf[["model", "roc_auc_train", "roc_auc_val", "IsBetter"]]
    .sort_values("roc_auc_val", ascending=False)
    .head(10)
)

CPU times: user 240 ms, sys: 23.6 ms, total: 263 ms
Wall time: 272 ms


Unnamed: 0,model,roc_auc_train,roc_auc_val,IsBetter
5,"DecisionTreeClassifier(max_leaf_nodes=64, rand...",0.93306,0.919112,True
6,"DecisionTreeClassifier(max_leaf_nodes=32, rand...",0.922074,0.918667,True
7,"DecisionTreeClassifier(max_leaf_nodes=16, rand...",0.914511,0.913572,True
4,"DecisionTreeClassifier(max_leaf_nodes=128, ran...",0.940133,0.905906,False
3,"DecisionTreeClassifier(max_leaf_nodes=256, ran...",0.950648,0.873603,False
2,"DecisionTreeClassifier(max_leaf_nodes=512, ran...",0.964208,0.813832,False
1,"DecisionTreeClassifier(max_leaf_nodes=856, ran...",0.980825,0.729314,False
0,"DecisionTreeClassifier(max_leaf_nodes=1024, ra...",0.986788,0.696163,False
