In [14]:
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score

from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np

import logging

logging.getLogger('lightgbm').setLevel(logging.CRITICAL)

In [2]:
df = pd.read_csv('Data/df.csv')

df = df.drop(['CustomerId', 'Surname','id'], axis=1)

df.head()




Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,668,1,33.0,3,0.0,2,1.0,0.0,181449.97,0,1,0,0
1,627,1,33.0,1,0.0,2,1.0,1.0,49503.5,0,1,0,0
2,678,1,40.0,10,0.0,2,1.0,0.0,184866.69,0,1,0,0
3,581,1,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,0,0
4,716,1,33.0,5,0.0,2,1.0,1.0,15068.83,0,0,0,1


In [3]:
num_folds = 15
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)


In [6]:
X = df.drop('Exited', axis=1)
y = df['Exited']

In [10]:
# Best parameters found for each model
params_lgb = {'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 15, 'subsample': 0.8}
params_xgboost = {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.95}
params_gradient_boosting = {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100, 'subsample': 0.8}

lgb_model = lgb.LGBMClassifier(**params_lgb)
xgboost_model = XGBClassifier(**params_xgboost)
gradient_boosting_model = GradientBoostingClassifier(**params_gradient_boosting)

models = [('lgb', lgb_model), ('xgboost', xgboost_model), ('gradient_boosting', gradient_boosting_model)]




### 15 K-Folds, Ensemble Models (LGBM, XGBoost, Gradient Boosting) 

In [15]:

ensemble_model = VotingClassifier(estimators=models, voting='soft')
roc_auc_scores = []

for train_index, test_index in skf.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    under = RandomUnderSampler(sampling_strategy='auto', random_state=42)
    X_train_under, y_train_under = under.fit_resample(X_train_fold, y_train_fold)

    ensemble_model.fit(X_train_under, y_train_under)

    y_ensemble_probabilities = ensemble_model.predict_proba(X_test_fold)[:, 1]

    roc_auc_fold = roc_auc_score(y_test_fold, y_ensemble_probabilities)
    roc_auc_scores.append(roc_auc_fold)

average_roc_auc = np.mean(roc_auc_scores)

print(f"Average Ensemble ROC-AUC across {num_folds}-fold Cross-validation: {average_roc_auc}")

[LightGBM] [Info] Number of positive: 32593, number of negative: 32593
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 858
[LightGBM] [Info] Number of data points in the train set: 65186, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32593, number of negative: 32593
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000652 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 859
[LightGBM] [Info] Number of data points in the train set: 65186, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32593, number of negat

In [16]:

ensemble_model = StackingClassifier(estimators=models)  


roc_auc_scores = []

for train_index, test_index in skf.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    under = RandomUnderSampler(sampling_strategy='auto', random_state=42)
    X_train_under, y_train_under = under.fit_resample(X_train_fold, y_train_fold)

    ensemble_model.fit(X_train_under, y_train_under)

    y_ensemble_probabilities = ensemble_model.predict_proba(X_test_fold)[:, 1]

    roc_auc_fold = roc_auc_score(y_test_fold, y_ensemble_probabilities)
    roc_auc_scores.append(roc_auc_fold)

average_roc_auc = np.mean(roc_auc_scores)

print(f"Average Ensemble ROC-AUC across {num_folds}-fold Cross-validation: {average_roc_auc}")

[LightGBM] [Info] Number of positive: 32593, number of negative: 32593
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003812 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 858
[LightGBM] [Info] Number of data points in the train set: 65186, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 26074, number of negative: 26074
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000595 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 857
[LightGBM] [Info] Number of data points in the train set: 52148, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000

In [27]:
df_test = pd.read_csv('Data/test.csv')

test_id = df_test["id"]

df_test = df_test.drop(['CustomerId', 'Surname','id'], axis=1)

df_test.head()


df_test['Gender'] = df_test['Gender'].map({'Male': 1, 'Female': 0})
geography_dummies = pd.get_dummies(df_test['Geography'], prefix='Geography').astype(int)
df_test = pd.concat([df_test, geography_dummies], axis=1)
df_test = df_test.drop('Geography', axis=1)

In [28]:
y_test_pred = ensemble_model.predict_proba(df_test) 

sample_submission = pd.DataFrame({
    'id': test_id,
    'Exited': y_test_pred[:, 1],  # Select the probabilities for the positive class
})

print(sample_submission)

sample_submission.to_csv("submission.csv",index=False)

            id    Exited
0       165034  0.077665
1       165035  0.921388
2       165036  0.095691
3       165037  0.596061
4       165038  0.769258
...        ...       ...
110018  275052  0.124138
110019  275053  0.262107
110020  275054  0.079280
110021  275055  0.349195
110022  275056  0.417520

[110023 rows x 2 columns]
