In [9]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid


from xgboost import XGBClassifier
import xgboost as xgb
import lightgbm as lgb

from imblearn.over_sampling import RandomOverSampler, SMOTE

In [10]:
df = pd.read_csv('Data/df.csv')

df = df.drop(['CustomerId', 'Surname','id'], axis=1)

df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,668,1,33.0,3,0.0,2,1.0,0.0,181449.97,0,1,0,0
1,627,1,33.0,1,0.0,2,1.0,1.0,49503.5,0,1,0,0
2,678,1,40.0,10,0.0,2,1.0,0.0,184866.69,0,1,0,0
3,581,1,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,0,0
4,716,1,33.0,5,0.0,2,1.0,1.0,15068.83,0,0,0,1


# OverSampling No Standarization



In [11]:

X = df.drop('Exited', axis=1)
y = df['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_over, y_train_over= smote.fit_resample(X_train, y_train)


## Complex Base Models no parameter tunning

### Gradient Boosting 


In [12]:

model = GradientBoostingClassifier()

model.fit(X_train_over, y_train_over)

y_prob = model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_prob)

print(f'AUC Score: {auc_score:.4f}')


AUC Score: 0.8839


### XGBoost

In [13]:

model = XGBClassifier()
model.fit(X_train_over, y_train_over)

y_prob = model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_prob)

print(f'AUC Score: {auc_score:.4f}')

AUC Score: 0.8842


### LightGBM

In [14]:
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train_over, y_train_over)


lgb_y_prob = lgb_model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, lgb_y_prob)
print(f'AUC Score: {auc_score:.4f}')

[LightGBM] [Info] Number of positive: 91078, number of negative: 91078
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001330 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 182156, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
AUC Score: 0.8874


## Complex Model with simple parameter tunning 

### GridSearch XGBoost

In [15]:
"""
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 0.1, 1],
    'min_child_weight': [1, 5, 10],
    'scale_pos_weight': [1, 2, 5] #
}
"""

param_grid = {
    'learning_rate': [0.01, 0.001],
    'n_estimators': [50, 100,  150],
    'max_depth': [ 5],
    'subsample': [0.9,0.95],
    'colsample_bytree': [0.6,0.7,0.8],
    'gamma': [0,0.5]  
}


best_auc_score = -1
best_params = None

for n_param, params in enumerate(ParameterGrid(param_grid)):
    print(f"\nIteration {n_param + 1} of {len(ParameterGrid(param_grid))} ({np.round((n_param + 1) * 100 / len(ParameterGrid(param_grid)), 2)}%)")

    xgb_model = xgb.XGBClassifier(**params, random_state=42, objective='binary:logistic', eval_metric='logloss')

    xgb_model.fit(X_train_over, y_train_over)

    xgb_y_prob = xgb_model.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, xgb_y_prob)
    print(f"AUC Score: {auc_score:.4f}")

    if auc_score > best_auc_score:
        print(f"Best actual AUC Score:\t {auc_score}")

        best_auc_score = auc_score
        best_params = params
        print(f"Best params: {best_params}")

print(f"Best parameters found: {best_params}")
print(f"Best ROC AUC Score: {best_auc_score}")



Iteration 1 of 72 (1.39%)
AUC Score: 0.8742
Best actual AUC Score:	 0.8742017167409262
Best params: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.9}

Iteration 2 of 72 (2.78%)
AUC Score: 0.8741

Iteration 3 of 72 (4.17%)
AUC Score: 0.8767
Best actual AUC Score:	 0.8767375121738243
Best params: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.9}

Iteration 4 of 72 (5.56%)
AUC Score: 0.8767

Iteration 5 of 72 (6.94%)
AUC Score: 0.8787
Best actual AUC Score:	 0.878701283978071
Best params: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}

Iteration 6 of 72 (8.33%)
AUC Score: 0.8787
Best actual AUC Score:	 0.878744485678819
Best params: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.95}

Iteration 7 of 72 (9.72%)
AUC Score: 0.8728

### GridSearch LightGBM

In [16]:
param_grid = {
    'num_leaves': [15, 31, 50],
    'learning_rate': [0.01, 0.001, 0.1],
    'n_estimators': [50, 100, 75],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'objective': ['binary'],
    'boosting_type': ['gbdt', 'dart'],
}
 
best_auc_score = -1
best_params = None

for n_param, params in enumerate(ParameterGrid(param_grid)):
    print(f"\nIteration {n_param + 1} of {len(ParameterGrid(param_grid))} ({round((n_param + 1) * 100 / len(ParameterGrid(param_grid)), 2)}%)")

    lgb_model = lgb.LGBMClassifier(**params, random_state=42, metric='binary_logloss')

    lgb_model.fit(X_train_over, y_train_over)

    lgb_y_prob = lgb_model.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, lgb_y_prob)
    print(f"AUC Score: {auc_score:.4f}")

    if auc_score > best_auc_score:
        print(f"Best actual AUC Score:\t {auc_score}")

        best_auc_score = auc_score
        best_params = params
        print(f"Best params: {best_params}")

print(f"Best parameters found: {best_params}")
print(f"Best ROC AUC Score: {best_auc_score}")


Iteration 1 of 486 (0.21%)
[LightGBM] [Info] Number of positive: 91078, number of negative: 91078
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002005 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 182156, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
AUC Score: 0.8729
Best actual AUC Score:	 0.87288720537609
Best params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'learning_rate': 0.01, 'n_estimators': 50, 'num_leaves': 15, 'objective': 'binary', 'subsample': 0.8}

Iteration 2 of 486 (0.41%)
[LightGBM] [Info] Number of positive: 91078, number of negative: 91078
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007368 seconds.
You can set `force_col_wise=true` to rem