In [6]:
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

In [3]:
df = pd.read_csv('Data/df.csv')

df = df.drop(['CustomerId', 'Surname','id'], axis=1)

df.head()


X = df.drop('Exited', axis=1)
y = df['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

under= RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_under, y_train_under = under.fit_resample(X_train, y_train)

In [4]:
# Best parameters found for each model
params_lgb = {'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 15, 'subsample': 0.8}
params_xgboost = {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.95}
params_gradient_boosting = {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100, 'subsample': 0.8}

lgb_model = lgb.LGBMClassifier(**params_lgb)
xgboost_model = XGBClassifier(**params_xgboost)
gradient_boosting_model = GradientBoostingClassifier(**params_gradient_boosting)

models = [('lgb', lgb_model), ('xgboost', xgboost_model), ('gradient_boosting', gradient_boosting_model)]




In [5]:

ensemble_model = VotingClassifier(estimators=models, voting='soft')  # 'soft' for averaging probabilities

ensemble_model.fit(X_train, y_train)

y_ensemble_probabilities = ensemble_model.predict_proba(X_test)[:, 1]

roc_auc_ensemble = roc_auc_score(y_test, y_ensemble_probabilities)

print(f"Ensemble ROC AUC Score: {roc_auc_ensemble}")

[LightGBM] [Info] Number of positive: 24445, number of negative: 91078
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003985 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 860
[LightGBM] [Info] Number of data points in the train set: 115523, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211603 -> initscore=-1.315291
[LightGBM] [Info] Start training from score -1.315291
Ensemble ROC AUC Score: 0.8881266936551053


In [8]:

ensemble_model = StackingClassifier(estimators=models)  

ensemble_model.fit(X_train, y_train)

y_ensemble_probabilities = ensemble_model.predict_proba(X_test)[:, 1]

roc_auc_ensemble = roc_auc_score(y_test, y_ensemble_probabilities)

print(f"Ensemble ROC AUC Score: {roc_auc_ensemble}")

[LightGBM] [Info] Number of positive: 24445, number of negative: 91078
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001203 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 860
[LightGBM] [Info] Number of data points in the train set: 115523, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211603 -> initscore=-1.315291
[LightGBM] [Info] Start training from score -1.315291
[LightGBM] [Info] Number of positive: 19556, number of negative: 72862
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001309 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 859
[LightGBM] [Info] Number of data points in the train set: 92418, number of used features: 12
[LightGBM] [Info] [