In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

from catboost import CatBoostClassifier

from xgboost import XGBClassifier
import xgboost as xgb
import lightgbm as lgb

from imblearn.over_sampling import RandomOverSampler, ADASYN

In [2]:
df = pd.read_csv('Data/df.csv')

df = df.drop(['CustomerId', 'Surname','id'], axis=1)

df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,668,1,33.0,3,0.0,2,1.0,0.0,181449.97,0,1,0,0
1,627,1,33.0,1,0.0,2,1.0,1.0,49503.5,0,1,0,0
2,678,1,40.0,10,0.0,2,1.0,0.0,184866.69,0,1,0,0
3,581,1,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,0,0
4,716,1,33.0,5,0.0,2,1.0,1.0,15068.83,0,0,0,1


# OverSampling No Standarization



In [3]:

X = df.drop('Exited', axis=1)
y = df['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

smote = ADASYN(sampling_strategy='auto', random_state=42)
X_train_over, y_train_over= smote.fit_resample(X_train, y_train)


## Complex Base Models no parameter tunning

### CatBoost


In [5]:
cat_model = CatBoostClassifier()
cat_model.fit(X_train_over, y_train_over)


y_pred_cat = cat_model.predict_proba(X_test)[:, 1]

roc_auc_cat = roc_auc_score(y_test, y_pred_cat)
print(f"CatBoost ROC AUC Score: {roc_auc_cat}")


Learning rate set to 0.095489
0:	learn: 0.5856193	total: 176ms	remaining: 2m 55s
1:	learn: 0.5177478	total: 205ms	remaining: 1m 42s
2:	learn: 0.4695376	total: 238ms	remaining: 1m 18s
3:	learn: 0.4358991	total: 267ms	remaining: 1m 6s
4:	learn: 0.4054885	total: 299ms	remaining: 59.5s
5:	learn: 0.3871569	total: 328ms	remaining: 54.3s
6:	learn: 0.3710494	total: 350ms	remaining: 49.6s
7:	learn: 0.3587727	total: 382ms	remaining: 47.4s
8:	learn: 0.3342898	total: 416ms	remaining: 45.8s
9:	learn: 0.3162360	total: 452ms	remaining: 44.7s
10:	learn: 0.3081960	total: 482ms	remaining: 43.3s
11:	learn: 0.2979176	total: 515ms	remaining: 42.4s
12:	learn: 0.2932277	total: 547ms	remaining: 41.5s
13:	learn: 0.2838768	total: 588ms	remaining: 41.4s
14:	learn: 0.2776717	total: 612ms	remaining: 40.2s
15:	learn: 0.2747379	total: 644ms	remaining: 39.6s
16:	learn: 0.2722947	total: 676ms	remaining: 39.1s
17:	learn: 0.2702850	total: 708ms	remaining: 38.6s
18:	learn: 0.2682530	total: 748ms	remaining: 38.6s
19:	lear

### XGBoost

In [6]:

model = XGBClassifier()
model.fit(X_train_over, y_train_over)

y_prob = model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_prob)

print(f'AUC Score: {auc_score:.4f}')

AUC Score: 0.8845


### LightGBM

In [7]:
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train_over, y_train_over)


lgb_y_prob = lgb_model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, lgb_y_prob)
print(f'AUC Score: {auc_score:.4f}')

[LightGBM] [Info] Number of positive: 92864, number of negative: 91078
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001433 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 183942, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504855 -> initscore=0.019420
[LightGBM] [Info] Start training from score 0.019420
AUC Score: 0.8873


## Complex Model with simple parameter tunning 

### GridSearch XGBoost

In [14]:
"""
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 0.1, 1],
    'min_child_weight': [1, 5, 10],
    'scale_pos_weight': [1, 2, 5] #
}
"""

param_grid = {
    'learning_rate': [0.01],
    'n_estimators': [200, 250 , 300],
    'max_depth': [ 10],
    'subsample': [0.9],
    'colsample_bytree': [ 0.7],
    'gamma': [0.01] ,
    'min_child_weight': [1],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 0.1, 1],
    'scale_pos_weight': [1, 2]
}


best_auc_score = -1
best_params = None

for n_param, params in enumerate(ParameterGrid(param_grid)):
    print(f"\nIteration {n_param + 1} of {len(ParameterGrid(param_grid))} ({np.round((n_param + 1) * 100 / len(ParameterGrid(param_grid)), 2)}%)")

    xgb_model = xgb.XGBClassifier(**params, random_state=42, objective='binary:logistic', eval_metric='auc')

    xgb_model.fit(X_train_over, y_train_over)

    xgb_y_prob = xgb_model.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, xgb_y_prob)
    print(f"AUC Score: {auc_score:.4f}")

    if auc_score > best_auc_score:
        print(f"Best actual AUC Score:\t {auc_score}")

        best_auc_score = auc_score
        best_params = params
        print(f"Best params: {best_params}")

print(f"Best parameters found: {best_params}")
print(f"Best ROC AUC Score: {best_auc_score}")



Iteration 1 of 108 (0.93%)
AUC Score: 0.8838
Best actual AUC Score:	 0.8837610501007676
Best params: {'colsample_bytree': 0.7, 'gamma': 0.01, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 0, 'scale_pos_weight': 1, 'subsample': 0.9}

Iteration 2 of 108 (1.85%)
AUC Score: 0.8838
Best actual AUC Score:	 0.8838379616730131
Best params: {'colsample_bytree': 0.7, 'gamma': 0.01, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 0, 'scale_pos_weight': 2, 'subsample': 0.9}

Iteration 3 of 108 (2.78%)
AUC Score: 0.8838

Iteration 4 of 108 (3.7%)
AUC Score: 0.8838
Best actual AUC Score:	 0.8838407482089996
Best params: {'colsample_bytree': 0.7, 'gamma': 0.01, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 0.1, 'scale_pos_weight': 2, 'subsample': 0.9}

Iteration 5 of 108 (4.63%)
AUC Score: 0.8838

Iterat

### GridSearch LightGBM

In [6]:
param_grid = {
    'num_leaves': [31],
    'learning_rate': [0.1],
    'n_estimators': [150],
    'subsample': [0.7],
    'colsample_bytree': [0.7],
    'objective': ['binary'],
    'boosting_type': ['gbdt'],
    'reg_alpha': [1.2],
    'reg_lambda': [1.5 , 1.7],
    'max_depth': [10, 15],     
}
 
#This param grid is reduced 

best_auc_score = -1
best_params = None

for n_param, params in enumerate(ParameterGrid(param_grid)):
    print(f"\nIteration {n_param + 1} of {len(ParameterGrid(param_grid))} ({round((n_param + 1) * 100 / len(ParameterGrid(param_grid)), 2)}%)")

    lgb_model = lgb.LGBMClassifier(**params, random_state=42, metric='auc')

    lgb_model.fit(X_train_over, y_train_over)

    lgb_y_prob = lgb_model.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, lgb_y_prob)
    print(f"AUC Score: {auc_score:.4f}")

    if auc_score > best_auc_score:
        print(f"Best actual AUC Score:\t {auc_score}")

        best_auc_score = auc_score
        best_params = params
        print(f"Best params: {best_params}")

print(f"Best parameters found: {best_params}")
print(f"Best ROC AUC Score: {best_auc_score}")


Iteration 1 of 18 (5.56%)
[LightGBM] [Info] Number of positive: 92864, number of negative: 91078
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001454 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 183942, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504855 -> initscore=0.019420
[LightGBM] [Info] Start training from score 0.019420
AUC Score: 0.8873
Best actual AUC Score:	 0.8873186495725216
Best params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150, 'num_leaves': 31, 'objective': 'binary', 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.7}

Iteration 2 of 18 (11.11%)
[LightGBM] [Info] Number of positive: 92864, number of negative: 91078
[LightGBM] [Info] Auto-choosing row-wise 

### CAT


In [11]:

param_grid = {
    'learning_rate': [0.01, 0.001, 0.1],
    'iterations': [50, 100, 150],
    'depth': [5, 3],
    'subsample': [0.9, 0.95],
    'colsample_bylevel': [0.6, 0.7, 0.8],
    'l2_leaf_reg': [0, 0.5]
}

best_auc_score = -1
best_params = None

for n_param, params in enumerate(ParameterGrid(param_grid)):
    print(f"\nIteration {n_param + 1} of {len(ParameterGrid(param_grid))} ({round((n_param + 1) * 100 / len(ParameterGrid(param_grid)), 2)}%)")

    cat_model = CatBoostClassifier(loss_function='Logloss', verbose=0, **params)

    cat_model.fit(X_train_over, y_train_over)

    y_pred_cat = cat_model.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, y_pred_cat)
    print(f"AUC Score: {auc_score:.4f}")

    if auc_score > best_auc_score:
        print(f"Best actual AUC Score:\t {auc_score}")
        best_auc_score = auc_score
        best_params = params
        print(f"Best params: {best_params}")
print(f"\nBest parameters found: {best_params}")
print(f"Best ROC AUC Score: {best_auc_score}")


Iteration 1 of 216 (0.46%)
AUC Score: 0.8600
Best actual AUC Score:	 0.8599748573511216
Best params: {'colsample_bylevel': 0.6, 'depth': 5, 'iterations': 50, 'l2_leaf_reg': 0, 'learning_rate': 0.01, 'subsample': 0.9}

Iteration 2 of 216 (0.93%)
AUC Score: 0.8591

Iteration 3 of 216 (1.39%)
AUC Score: 0.8549

Iteration 4 of 216 (1.85%)
AUC Score: 0.8549

Iteration 5 of 216 (2.31%)
AUC Score: 0.8820
Best actual AUC Score:	 0.8820403794129793
Best params: {'colsample_bylevel': 0.6, 'depth': 5, 'iterations': 50, 'l2_leaf_reg': 0, 'learning_rate': 0.1, 'subsample': 0.9}

Iteration 6 of 216 (2.78%)
AUC Score: 0.8820

Iteration 7 of 216 (3.24%)
AUC Score: 0.8600

Iteration 8 of 216 (3.7%)
AUC Score: 0.8591

Iteration 9 of 216 (4.17%)
AUC Score: 0.8549

Iteration 10 of 216 (4.63%)
AUC Score: 0.8549

Iteration 11 of 216 (5.09%)
AUC Score: 0.8821
Best actual AUC Score:	 0.8820655034767997
Best params: {'colsample_bylevel': 0.6, 'depth': 5, 'iterations': 50, 'l2_leaf_reg': 0.5, 'learning_rate': 