# Bank Customer Churn Prediction
EDA and previous experiments can be found in the notebook below:

https://github.com/Maxstef/data-loves-ml-for-people-course/blob/main/notebooks/2_2_logistic_regression/0_4_bank_customer.ipynb

## Intro & imports

In [15]:
from sklearn.linear_model import LogisticRegression

import pandas as pd
import numpy as np

# Label column that every experiment in this notebook predicts.
target_col = "Exited"

# Load the training data; the first CSV column becomes the DataFrame index.
raw_df = pd.read_csv("downloads/train.csv", index_col=0)

In [17]:
from mlpeople.optimization.experiments.polynomial_pipeline_experiments import run_experiment_poly, run_experiments_poly

In [18]:
# Previous best result: the baseline every experiment below is compared against.
result_1 = run_experiment_poly(
    raw_df,
    target_col,
    stratify_col=target_col,
    drop_cols=["Surname", "CustomerId", "Tenure", "EstimatedSalary", "HasCrCard", "CreditScore"],
    scale_mode="standard",
    encode_drop=None,
    separate_binary_numeric=False,
    polynomial_degree=4,
    polynomial_after_scale=True,
    top_n_cat_values=None,
    binary_cat_flag_cols={
        'Geography': [{'flag_name': 'IsGermany',
            'value': 'Germany',
            'drop_original': True
        }]
    },
    binary_num_flag_cols=None,
    polynomial_interaction_only=False,
    model=LogisticRegression(C=0.5, solver='liblinear')
)

# Read the metrics and the fitted pipeline by key (the same keys the final
# training cell uses) so that a change in the order of the returned dict can
# never silently bind the wrong value to one of these names.
roc_auc_train = result_1["roc_auc_train"]
roc_auc_val = result_1["roc_auc_val"]
poly_model_pipeline_optimal = result_1["pipeline"]

# The key names of the two probability arrays are not visible in this notebook,
# so they are still taken positionally (first two values of the result).
# TODO: switch to key access once the key names are confirmed.
train_pred_proba, val_pred_proba = list(result_1.values())[:2]

roc_auc_train, roc_auc_val

(0.9373648601520542, 0.9386991563207353)

In [19]:
# Baseline to beat: validation ROC AUC of the previous best configuration.
# Kept under its own name because `roc_auc_val` is reassigned later in the notebook.
roc_auc_val_max = roc_auc_val

In [20]:
# Settings of the previous best run, keyed by `run_experiment_poly` keyword name.
fixed_params = {
    "stratify_col": target_col,
    "drop_cols": ["Surname", "CustomerId", "Tenure", "EstimatedSalary", "HasCrCard", "CreditScore"],
    "scale_mode": "standard",
    "encode_drop": None,
    "separate_binary_numeric": False,
    "polynomial_degree": 4,
    "polynomial_after_scale": True,
    "top_n_cat_values": None,
    "binary_cat_flag_cols": {
        'Geography': [{'flag_name': 'IsGermany',
            'value': 'Germany',
            'drop_original': True
        }]
    },
    "binary_num_flag_cols": None,
    "polynomial_interaction_only": False,
    "model": LogisticRegression(C=0.5, solver='liblinear')
}

# Single-value search grid for `run_experiments_poly`: every fixed setting is
# wrapped in a one-element list under "<name>_options". The entries are derived
# from `fixed_params` so the two dicts cannot drift apart; `test_size_options`
# is the only entry without a counterpart in `fixed_params`.
fixed_params_options = {
    "test_size_options": [0.2],
    **{f"{name}_options": [value] for name, value in fixed_params.items()},
}

## NumericBinner

Implemented a new custom transformer, `NumericBinner`, and experimented with it on columns from the dataset.

### EstimatedSalary col

In [None]:
%%time


# "EstimatedSalary" is deliberately absent from the dropped columns so it can be binned.
drop_cols_options = [["Surname", "CustomerId", "Tenure", "HasCrCard", "CreditScore"]]

# One three-bin layout per (low/medium, medium/high) pair of salary cut points.
salary_bins = [
    [0, low_cut, high_cut, float("inf")]
    for low_cut, high_cut in (
        (50000, 120000),
        (40000, 100000),
        (30000, 90000),
        (30000, 100000),
    )
]

labels = ["low", "medium", "high"]

# `None` is the no-binning reference run (listed as `()` in the results table);
# every bin layout is then tried both with and without the original column.
num_bin_cols_options = [
    None,
    *(
        {
            "EstimatedSalary": dict(
                bins=bins,
                labels=labels,
                new_col="SalaryScore",
                drop_original=drop,
            )
        }
        for bins in salary_bins
        for drop in (True, False)
    ),
]

# All settings stay at their fixed values except the dropped columns and the binning grid.
results_est_salary = run_experiments_poly(
    raw_df,
    target_col,
    **{**fixed_params_options, "drop_cols_options": drop_cols_options},
    num_bin_cols_options=num_bin_cols_options,
)

(
    results_est_salary
    .sort_values(by="roc_auc_val", ascending=False)
    .loc[:, ["num_bin_cols", "roc_auc_train", "roc_auc_val"]]
    .head(10)
)

CPU times: user 18.8 s, sys: 557 ms, total: 19.4 s
Wall time: 21 s


Unnamed: 0,num_bin_cols,roc_auc_train,roc_auc_val
7,"((SalaryScore, True, [0, 30000, 100000, inf]),)",0.937598,0.938898
5,"((SalaryScore, True, [0, 30000, 90000, inf]),)",0.937564,0.938879
3,"((SalaryScore, True, [0, 40000, 100000, inf]),)",0.937531,0.938819
1,"((SalaryScore, True, [0, 50000, 120000, inf]),)",0.93741,0.938744
6,"((SalaryScore, False, [0, 30000, 90000, inf]),)",0.939143,0.93624
8,"((SalaryScore, False, [0, 30000, 100000, inf]),)",0.939158,0.936221
4,"((SalaryScore, False, [0, 40000, 100000, inf]),)",0.939112,0.93616
0,(),0.939092,0.936139
2,"((SalaryScore, False, [0, 50000, 120000, inf]),)",0.939194,0.936073


In [24]:
# Flag every configuration whose validation ROC AUC beats the previous best.
results_est_salary["IsBetter"] = results_est_salary["roc_auc_val"].gt(roc_auc_val_max)
(
    results_est_salary
    .sort_values(by="roc_auc_val", ascending=False)
    .loc[:, ["num_bin_cols", "roc_auc_train", "roc_auc_val", "IsBetter"]]
    .head(10)
)

Unnamed: 0,num_bin_cols,roc_auc_train,roc_auc_val,IsBetter
7,"((SalaryScore, True, [0, 30000, 100000, inf]),)",0.937598,0.938898,True
5,"((SalaryScore, True, [0, 30000, 90000, inf]),)",0.937564,0.938879,True
3,"((SalaryScore, True, [0, 40000, 100000, inf]),)",0.937531,0.938819,True
1,"((SalaryScore, True, [0, 50000, 120000, inf]),)",0.93741,0.938744,True
6,"((SalaryScore, False, [0, 30000, 90000, inf]),)",0.939143,0.93624,False
8,"((SalaryScore, False, [0, 30000, 100000, inf]),)",0.939158,0.936221,False
4,"((SalaryScore, False, [0, 40000, 100000, inf]),)",0.939112,0.93616,False
0,(),0.939092,0.936139,False
2,"((SalaryScore, False, [0, 50000, 120000, inf]),)",0.939194,0.936073,False


In [26]:
%%time


# "EstimatedSalary" is deliberately absent from the dropped columns so it can be binned.
drop_cols_options = [["Surname", "CustomerId", "Tenure", "HasCrCard", "CreditScore"]]

# Finer grid of (low/medium, medium/high) salary cut points around the best
# layouts of the first salary experiment.
salary_bins = [
    [0, low_cut, high_cut, float("inf")]
    for low_cut, high_cut in (
        (50000, 120000),
        (40000, 100000),
        (40000, 120000),
        (30000, 80000),
        (30000, 90000),
        (30000, 100000),
        (30000, 110000),
        (30000, 120000),
        (25000, 80000),
        (25000, 90000),
        (25000, 100000),
        (25000, 110000),
        (25000, 120000),
    )
]

labels = ["low", "medium", "high"]

# `None` is the no-binning reference run; every other option replaces the
# original column with its binned version.
num_bin_cols_options = [
    None,
    *(
        {
            "EstimatedSalary": dict(
                bins=bins,
                labels=labels,
                new_col="SalaryScore",
                drop_original=True,
            )
        }
        for bins in salary_bins
    ),
]

# All settings stay at their fixed values except the dropped columns and the binning grid.
results_est_salary_v2 = run_experiments_poly(
    raw_df,
    target_col,
    **{**fixed_params_options, "drop_cols_options": drop_cols_options},
    num_bin_cols_options=num_bin_cols_options,
)

(
    results_est_salary_v2
    .sort_values(by="roc_auc_val", ascending=False)
    .loc[:, ["num_bin_cols", "roc_auc_train", "roc_auc_val"]]
    .head(10)
)

CPU times: user 18.4 s, sys: 478 ms, total: 18.9 s
Wall time: 19.9 s


Unnamed: 0,num_bin_cols,roc_auc_train,roc_auc_val
11,"((SalaryScore, True, [0, 25000, 100000, inf]),)",0.937508,0.939007
10,"((SalaryScore, True, [0, 25000, 90000, inf]),)",0.937471,0.938982
13,"((SalaryScore, True, [0, 25000, 120000, inf]),)",0.937447,0.938916
6,"((SalaryScore, True, [0, 30000, 100000, inf]),)",0.937598,0.938898
12,"((SalaryScore, True, [0, 25000, 110000, inf]),)",0.93747,0.938895
5,"((SalaryScore, True, [0, 30000, 90000, inf]),)",0.937564,0.938879
9,"((SalaryScore, True, [0, 25000, 80000, inf]),)",0.93743,0.938821
2,"((SalaryScore, True, [0, 40000, 100000, inf]),)",0.937531,0.938819
7,"((SalaryScore, True, [0, 30000, 110000, inf]),)",0.937555,0.938761
1,"((SalaryScore, True, [0, 50000, 120000, inf]),)",0.93741,0.938744


In [27]:
# Flag every configuration whose validation ROC AUC beats the previous best.
results_est_salary_v2["IsBetter"] = results_est_salary_v2["roc_auc_val"].gt(roc_auc_val_max)
(
    results_est_salary_v2
    .sort_values(by="roc_auc_val", ascending=False)
    .loc[:, ["num_bin_cols", "roc_auc_train", "roc_auc_val", "IsBetter"]]
    .head(10)
)

Unnamed: 0,num_bin_cols,roc_auc_train,roc_auc_val,IsBetter
11,"((SalaryScore, True, [0, 25000, 100000, inf]),)",0.937508,0.939007,True
10,"((SalaryScore, True, [0, 25000, 90000, inf]),)",0.937471,0.938982,True
13,"((SalaryScore, True, [0, 25000, 120000, inf]),)",0.937447,0.938916,True
6,"((SalaryScore, True, [0, 30000, 100000, inf]),)",0.937598,0.938898,True
12,"((SalaryScore, True, [0, 25000, 110000, inf]),)",0.93747,0.938895,True
5,"((SalaryScore, True, [0, 30000, 90000, inf]),)",0.937564,0.938879,True
9,"((SalaryScore, True, [0, 25000, 80000, inf]),)",0.93743,0.938821,True
2,"((SalaryScore, True, [0, 40000, 100000, inf]),)",0.937531,0.938819,True
7,"((SalaryScore, True, [0, 30000, 110000, inf]),)",0.937555,0.938761,True
1,"((SalaryScore, True, [0, 50000, 120000, inf]),)",0.93741,0.938744,True


### CreditScore col

In [28]:
%%time

# "CreditScore" is deliberately absent from the dropped columns so it can be binned.
drop_cols_options = [["Surname", "CustomerId", "Tenure", "HasCrCard", "EstimatedSalary"]]

# One two-bin layout per low/high credit-score threshold.
creditscore_bins = [
    [0, threshold, float("inf")]
    for threshold in (500, 550, 600, 650, 700)
]

labels = ["low", "high"]

# `None` is the no-binning reference run (listed as `()` in the results table);
# every threshold is then tried both with and without the original column.
num_bin_cols_options = [
    None,
    *(
        {
            "CreditScore": dict(
                bins=bins,
                labels=labels,
                new_col="CreditScoreScore",
                drop_original=drop,
            )
        }
        for bins in creditscore_bins
        for drop in (True, False)
    ),
]

# All settings stay at their fixed values except the dropped columns and the binning grid.
results_credit_score = run_experiments_poly(
    raw_df,
    target_col,
    **{**fixed_params_options, "drop_cols_options": drop_cols_options},
    num_bin_cols_options=num_bin_cols_options,
)

(
    results_credit_score
    .sort_values(by="roc_auc_val", ascending=False)
    .loc[:, ["num_bin_cols", "roc_auc_train", "roc_auc_val"]]
    .head(10)
)

CPU times: user 25.7 s, sys: 820 ms, total: 26.6 s
Wall time: 29.3 s


Unnamed: 0,num_bin_cols,roc_auc_train,roc_auc_val
7,"((CreditScoreScore, True, [0, 650, inf]),)",0.938022,0.939165
3,"((CreditScoreScore, True, [0, 550, inf]),)",0.937519,0.939136
5,"((CreditScoreScore, True, [0, 600, inf]),)",0.937637,0.938879
1,"((CreditScoreScore, True, [0, 500, inf]),)",0.937419,0.938833
9,"((CreditScoreScore, True, [0, 700, inf]),)",0.93767,0.938543
8,"((CreditScoreScore, False, [0, 650, inf]),)",0.939503,0.935886
6,"((CreditScoreScore, False, [0, 600, inf]),)",0.939472,0.935819
10,"((CreditScoreScore, False, [0, 700, inf]),)",0.939662,0.935744
4,"((CreditScoreScore, False, [0, 550, inf]),)",0.939452,0.935722
0,(),0.939455,0.935701


In [29]:
# Flag every configuration whose validation ROC AUC beats the previous best.
results_credit_score["IsBetter"] = results_credit_score["roc_auc_val"].gt(roc_auc_val_max)
(
    results_credit_score
    .sort_values(by="roc_auc_val", ascending=False)
    .loc[:, ["num_bin_cols", "roc_auc_train", "roc_auc_val", "IsBetter"]]
    .head(10)
)

Unnamed: 0,num_bin_cols,roc_auc_train,roc_auc_val,IsBetter
7,"((CreditScoreScore, True, [0, 650, inf]),)",0.938022,0.939165,True
3,"((CreditScoreScore, True, [0, 550, inf]),)",0.937519,0.939136,True
5,"((CreditScoreScore, True, [0, 600, inf]),)",0.937637,0.938879,True
1,"((CreditScoreScore, True, [0, 500, inf]),)",0.937419,0.938833,True
9,"((CreditScoreScore, True, [0, 700, inf]),)",0.93767,0.938543,False
8,"((CreditScoreScore, False, [0, 650, inf]),)",0.939503,0.935886,False
6,"((CreditScoreScore, False, [0, 600, inf]),)",0.939472,0.935819,False
10,"((CreditScoreScore, False, [0, 700, inf]),)",0.939662,0.935744,False
4,"((CreditScoreScore, False, [0, 550, inf]),)",0.939452,0.935722,False
0,(),0.939455,0.935701,False


In [30]:
%%time

# "CreditScore" is deliberately absent from the dropped columns so it can be binned.
drop_cols_options = [["Surname", "CustomerId", "Tenure", "HasCrCard", "EstimatedSalary"]]

# Finer threshold grid (steps of 25) between 550 and 675.
creditscore_bins = [
    [0, threshold, float("inf")]
    for threshold in (550, 575, 600, 625, 650, 675)
]

labels = ["low", "high"]

# `None` is the no-binning reference run; every other option replaces the
# original column with its binned version.
num_bin_cols_options = [
    None,
    *(
        {
            "CreditScore": dict(
                bins=bins,
                labels=labels,
                new_col="CreditScoreScore",
                drop_original=True,
            )
        }
        for bins in creditscore_bins
    ),
]

# All settings stay at their fixed values except the dropped columns and the binning grid.
results_credit_score_v2 = run_experiments_poly(
    raw_df,
    target_col,
    **{**fixed_params_options, "drop_cols_options": drop_cols_options},
    num_bin_cols_options=num_bin_cols_options,
)

(
    results_credit_score_v2
    .sort_values(by="roc_auc_val", ascending=False)
    .loc[:, ["num_bin_cols", "roc_auc_train", "roc_auc_val"]]
    .head(10)
)

CPU times: user 9.45 s, sys: 208 ms, total: 9.66 s
Wall time: 9.92 s


Unnamed: 0,num_bin_cols,roc_auc_train,roc_auc_val
6,"((CreditScoreScore, True, [0, 675, inf]),)",0.938058,0.939239
5,"((CreditScoreScore, True, [0, 650, inf]),)",0.938022,0.939165
1,"((CreditScoreScore, True, [0, 550, inf]),)",0.937519,0.939136
2,"((CreditScoreScore, True, [0, 575, inf]),)",0.937483,0.93907
4,"((CreditScoreScore, True, [0, 625, inf]),)",0.937715,0.938972
3,"((CreditScoreScore, True, [0, 600, inf]),)",0.937637,0.938879
0,(),0.939455,0.935701


In [31]:
# Flag every configuration whose validation ROC AUC beats the previous best.
results_credit_score_v2["IsBetter"] = results_credit_score_v2["roc_auc_val"].gt(roc_auc_val_max)
(
    results_credit_score_v2
    .sort_values(by="roc_auc_val", ascending=False)
    .loc[:, ["num_bin_cols", "roc_auc_train", "roc_auc_val", "IsBetter"]]
    .head(10)
)

Unnamed: 0,num_bin_cols,roc_auc_train,roc_auc_val,IsBetter
6,"((CreditScoreScore, True, [0, 675, inf]),)",0.938058,0.939239,True
5,"((CreditScoreScore, True, [0, 650, inf]),)",0.938022,0.939165,True
1,"((CreditScoreScore, True, [0, 550, inf]),)",0.937519,0.939136,True
2,"((CreditScoreScore, True, [0, 575, inf]),)",0.937483,0.93907,True
4,"((CreditScoreScore, True, [0, 625, inf]),)",0.937715,0.938972,True
3,"((CreditScoreScore, True, [0, 600, inf]),)",0.937637,0.938879,True
0,(),0.939455,0.935701,False


### Combine CreditScore and EstimatedSalary

In [37]:
%%time

from math import inf

# Neither "CreditScore" nor "EstimatedSalary" is dropped up front: both may be binned.
drop_cols_options = [["Surname", "CustomerId", "Tenure", "HasCrCard"]]

# Three credit-score thresholds taken from the experiments above, plus `None`
# (no credit-score binning).
creditscore_labels = ["low", "high"]
creditscore_bins = [[0, threshold, inf] for threshold in (675, 650, 550)] + [None]

# Three salary layouts sharing the 25000 lower cut, plus `None` (no salary binning).
salary_labels = ["low", "medium", "high"]
salary_bins = [[0, 25000, high_cut, inf] for high_cut in (100000, 90000, 120000)] + [None]


def build_mapping(s_bins, c_bins, s_labels=None, c_labels=None):
    """Build one ``num_bin_cols`` option for the salary / credit-score grid.

    Parameters
    ----------
    s_bins : list or None
        Bin edges for ``EstimatedSalary``. ``None`` adds no entry for that column.
    c_bins : list or None
        Bin edges for ``CreditScore``. ``None`` adds no entry for that column.
    s_labels : list, optional
        Labels for the salary bins. Defaults to the notebook-level
        ``salary_labels``, which is what the function always used before.
    c_labels : list, optional
        Labels for the credit-score bins. Defaults to the notebook-level
        ``creditscore_labels``.

    Returns
    -------
    dict or None
        Mapping from source column to its binning spec (``bins``, ``labels``,
        ``new_col``, ``drop_original=True``), or ``None`` when both bin
        arguments are ``None``.
    """
    mapping = {}

    if s_bins is not None:
        mapping["EstimatedSalary"] = {
            "bins": s_bins,
            # The global is looked up only when no explicit labels are given, so
            # the function also works outside the cell that defines it.
            "labels": salary_labels if s_labels is None else s_labels,
            "new_col": "SalaryScore",
            "drop_original": True
        }

    if c_bins is not None:
        mapping["CreditScore"] = {
            "bins": c_bins,
            "labels": creditscore_labels if c_labels is None else c_labels,
            "new_col": "CreditScoreScore",
            "drop_original": True
        }

    # An empty mapping is reported as None, the value the other experiment
    # cells use for their no-binning option.
    return mapping or None


# Every salary layout paired with every credit-score layout (salary varies
# slowest). The `None` entries in both lists also yield the single-column
# variants and one run without any binning.
num_bin_cols_options = [
    build_mapping(salary_option, credit_option)
    for salary_option in salary_bins
    for credit_option in creditscore_bins
]

# All settings stay at their fixed values except the dropped columns and the binning grid.
results_combine = run_experiments_poly(
    raw_df,
    target_col,
    **{**fixed_params_options, "drop_cols_options": drop_cols_options},
    num_bin_cols_options=num_bin_cols_options,
)

(
    results_combine
    .sort_values(by="roc_auc_val", ascending=False)
    .loc[:, ["num_bin_cols", "roc_auc_train", "roc_auc_val"]]
    .head(20)
)


CPU times: user 28.2 s, sys: 336 ms, total: 28.5 s
Wall time: 28.8 s


Unnamed: 0,num_bin_cols,roc_auc_train,roc_auc_val
0,"((SalaryScore, True, [0, 25000, 100000, inf]),...",0.938148,0.939533
1,"((SalaryScore, True, [0, 25000, 100000, inf]),...",0.938113,0.939502
4,"((SalaryScore, True, [0, 25000, 90000, inf]), ...",0.938128,0.939499
2,"((SalaryScore, True, [0, 25000, 100000, inf]),...",0.937634,0.939459
5,"((SalaryScore, True, [0, 25000, 90000, inf]), ...",0.938083,0.939456
6,"((SalaryScore, True, [0, 25000, 90000, inf]), ...",0.937613,0.939402
8,"((SalaryScore, True, [0, 25000, 120000, inf]),...",0.938112,0.939397
10,"((SalaryScore, True, [0, 25000, 120000, inf]),...",0.937579,0.939328
9,"((SalaryScore, True, [0, 25000, 120000, inf]),...",0.938071,0.939319
14,"((CreditScoreScore, True, [0, 550, inf]),)",0.939228,0.936594


In [41]:
# Flag every combination whose validation ROC AUC beats the previous best.
results_combine["IsBetter"] = results_combine["roc_auc_val"].gt(roc_auc_val_max)

top_combined = (
    results_combine
    .sort_values(by="roc_auc_val", ascending=False)
    .loc[:, ["num_bin_cols", "roc_auc_train", "roc_auc_val", "IsBetter"]]
    .head(10)
)

# Lift the column-width limit so the full binning spec is readable.
with pd.option_context("display.max_colwidth", None):
    display(top_combined)

Unnamed: 0,num_bin_cols,roc_auc_train,roc_auc_val,IsBetter
0,"((SalaryScore, True, [0, 25000, 100000, inf]), (CreditScoreScore, True, [0, 675, inf]))",0.938148,0.939533,True
1,"((SalaryScore, True, [0, 25000, 100000, inf]), (CreditScoreScore, True, [0, 650, inf]))",0.938113,0.939502,True
4,"((SalaryScore, True, [0, 25000, 90000, inf]), (CreditScoreScore, True, [0, 675, inf]))",0.938128,0.939499,True
2,"((SalaryScore, True, [0, 25000, 100000, inf]), (CreditScoreScore, True, [0, 550, inf]))",0.937634,0.939459,True
5,"((SalaryScore, True, [0, 25000, 90000, inf]), (CreditScoreScore, True, [0, 650, inf]))",0.938083,0.939456,True
6,"((SalaryScore, True, [0, 25000, 90000, inf]), (CreditScoreScore, True, [0, 550, inf]))",0.937613,0.939402,True
8,"((SalaryScore, True, [0, 25000, 120000, inf]), (CreditScoreScore, True, [0, 675, inf]))",0.938112,0.939397,True
10,"((SalaryScore, True, [0, 25000, 120000, inf]), (CreditScoreScore, True, [0, 550, inf]))",0.937579,0.939328,True
9,"((SalaryScore, True, [0, 25000, 120000, inf]), (CreditScoreScore, True, [0, 650, inf]))",0.938071,0.939319,True
14,"((CreditScoreScore, True, [0, 550, inf]),)",0.939228,0.936594,False


## Train model with best found combination

In [46]:
from math import inf

# Geography is reduced to a single "is Germany" flag, as in the previous best run.
geography_flags = {
    "Geography": [dict(flag_name="IsGermany", value="Germany", drop_original=True)]
}

# Best binning found in the combined experiment (top row of its results table).
best_num_bin_cols = {
    "EstimatedSalary": dict(
        bins=[0, 25000, 100000, inf],
        labels=["low", "medium", "high"],
        new_col="SalaryScore",
        drop_original=True,
    ),
    "CreditScore": dict(
        bins=[0, 675, inf],
        labels=["low", "high"],
        new_col="CreditScoreScore",
        drop_original=True,
    ),
}

result_2 = run_experiment_poly(
    raw_df,
    target_col,
    stratify_col=target_col,
    drop_cols=["Surname", "CustomerId", "Tenure", "HasCrCard"],
    scale_mode="standard",
    encode_drop=None,
    separate_binary_numeric=False,
    polynomial_degree=4,
    polynomial_after_scale=True,
    top_n_cat_values=None,
    binary_cat_flag_cols=geography_flags,
    binary_num_flag_cols=None,
    polynomial_interaction_only=False,
    model=LogisticRegression(C=0.5, solver="liblinear"),
    num_bin_cols=best_num_bin_cols,
)

# Pull the metrics and the fitted pipeline out of the result by key.
roc_auc_train, roc_auc_val = result_2["roc_auc_train"], result_2["roc_auc_val"]
poly_model_pipeline_optimal_v3 = result_2["pipeline"]

roc_auc_train, roc_auc_val

(0.9381483220937081, 0.9395325468139104)

### Test data prediction

In [47]:
# Load the unlabeled test set. No `index_col` is passed here (unlike the training
# read), so `id` stays a regular column and the frame gets a default index.
test_raw_df = pd.read_csv("downloads/test.csv")
test_raw_df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15000,15594796.0,Chu,584.0,Germany,Male,30.0,2.0,146053.66,1.0,1.0,1.0,157891.86
1,15001,15642821.0,Mazzi,551.0,France,Male,39.0,5.0,0.0,2.0,1.0,1.0,67431.28
2,15002,15716284.0,Onyekachi,706.0,France,Male,43.0,8.0,0.0,2.0,1.0,0.0,156768.45
3,15003,15785078.0,Martin,717.0,Spain,Male,45.0,3.0,0.0,1.0,1.0,1.0,166909.87
4,15004,15662955.0,Kenechukwu,592.0,Spain,Male,43.0,8.0,0.0,2.0,1.0,1.0,143681.97


In [48]:
# Column 1 of predict_proba is the probability of the second class
# (presumably Exited == 1, assuming 0/1 labels).
test_pred_proba = poly_model_pipeline_optimal_v3.predict_proba(test_raw_df)[:, 1]

# Keep the unrounded probabilities: this column is what gets submitted, and
# rounding to two decimals creates ties between rows, which throws away
# ranking information that a ROC AUC score depends on.
test_raw_df["Exited"] = test_pred_proba

# Round for display only.
test_raw_df["Exited"].round(2).head()

0    0.09
1    0.01
2    0.05
3    0.42
4    0.03
Name: Exited, dtype: float64

In [49]:
sample_submission_df = pd.read_csv("downloads/sample_submission.csv")

# Fail loudly on a row-count mismatch instead of writing a submission with
# missing or truncated predictions.
if len(sample_submission_df) != len(test_raw_df):
    raise ValueError(
        f"sample_submission has {len(sample_submission_df)} rows "
        f"but the test set has {len(test_raw_df)}"
    )

# Copy by position rather than by index label, so the result does not depend on
# how either frame happens to be indexed.
# NOTE(review): assumes both files list the rows in the same order - confirm.
sample_submission_df["Exited"] = test_raw_df["Exited"].to_numpy()
sample_submission_df.to_csv("downloads/submission_poly_reg_optimal_v3.csv", index=False)