In [1]:
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score

from imblearn.over_sampling import RandomOverSampler, SMOTE
import pandas as pd
import numpy as np

import logging

logging.getLogger('lightgbm').setLevel(logging.CRITICAL)

In [2]:
df = pd.read_csv('Data/df.csv')

df = df.drop(['CustomerId', 'Surname','id'], axis=1)

df.head()




Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,668,1,33.0,3,0.0,2,1.0,0.0,181449.97,0,1,0,0
1,627,1,33.0,1,0.0,2,1.0,1.0,49503.5,0,1,0,0
2,678,1,40.0,10,0.0,2,1.0,0.0,184866.69,0,1,0,0
3,581,1,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,0,0
4,716,1,33.0,5,0.0,2,1.0,1.0,15068.83,0,0,0,1


In [3]:
num_folds = 15
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)


In [4]:
X = df.drop('Exited', axis=1)
y = df['Exited']

In [5]:
# Best parameters found for each model
params_lgb = {'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31, 'objective': 'binary', 'subsample': 0.8}



lgb_model = lgb.LGBMClassifier(**params_lgb)
xgboost_model = XGBClassifier() #Better ROC AUC without params rather than in the GridSearch
gradient_boosting_model = GradientBoostingClassifier() #Same scenario as XGBoost

models = [('lgb', lgb_model), ('xgboost', xgboost_model), ('gradient_boosting', gradient_boosting_model)]




### 15 K-Folds, Ensemble Models (LGBM, XGBoost, Gradient Boosting) 

In [6]:
ensemble_model_1 = VotingClassifier(estimators=models, voting='soft')

roc_auc_scores = []

for train_index, test_index in skf.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    over = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_over, y_train_over = over.fit_resample(X_train_fold, y_train_fold)

    ensemble_model_1.fit(X_train_over, y_train_over)

    y_ensemble_probabilities = ensemble_model_1.predict_proba(X_test_fold)[:, 1]

    roc_auc_fold = roc_auc_score(y_test_fold, y_ensemble_probabilities)
    roc_auc_scores.append(roc_auc_fold)

average_roc_auc = np.mean(roc_auc_scores)

print(f"Average Ensemble ROC-AUC across {num_folds}-fold Cross-validation: {average_roc_auc}")

[LightGBM] [Info] Number of positive: 121438, number of negative: 121438
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010034 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 242876, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 121438, number of negative: 121438
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009507 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1553
[LightGBM] [Info] Number of data points in the train set: 242876, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 121438, number of negative: 121438
[LightGBM] [Info] Auto-choosing col-wise mu

In [7]:

ensemble_model_2 = StackingClassifier(estimators=models)  


roc_auc_scores = []

for train_index, test_index in skf.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    under = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_under, y_train_under = under.fit_resample(X_train_fold, y_train_fold)

    ensemble_model_2.fit(X_train_under, y_train_under)

    y_ensemble_probabilities = ensemble_model_2.predict_proba(X_test_fold)[:, 1]

    roc_auc_fold = roc_auc_score(y_test_fold, y_ensemble_probabilities)
    roc_auc_scores.append(roc_auc_fold)

average_roc_auc = np.mean(roc_auc_scores)

print(f"Average Ensemble ROC-AUC across {num_folds}-fold Cross-validation: {average_roc_auc}")

[LightGBM] [Info] Number of positive: 121438, number of negative: 121438
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008623 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 242876, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 97150, number of negative: 97150
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 194300, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 97151, number of negative: 97150
[LightGBM] [Info] Auto-choosing col-wise multi-

In [8]:
df_test = pd.read_csv('Data/test.csv')

test_id = df_test["id"]

df_test = df_test.drop(['CustomerId', 'Surname','id'], axis=1)

df_test.head()


df_test['Gender'] = df_test['Gender'].map({'Male': 1, 'Female': 0})
geography_dummies = pd.get_dummies(df_test['Geography'], prefix='Geography').astype(int)
df_test = pd.concat([df_test, geography_dummies], axis=1)
df_test = df_test.drop('Geography', axis=1)

In [11]:
y_test_pred = ensemble_model_1.predict_proba(df_test) 

sample_submission = pd.DataFrame({
    'id': test_id,
    'Exited': y_test_pred[:, 1],  # Select the probabilities for the positive class
})

print(sample_submission)

sample_submission.to_csv("submission.csv",index=False)

            id    Exited
0       165034  0.037065
1       165035  0.890958
2       165036  0.050024
3       165037  0.283598
4       165038  0.369587
...        ...       ...
110018  275052  0.044426
110019  275053  0.163333
110020  275054  0.029494
110021  275055  0.178880
110022  275056  0.307457

[110023 rows x 2 columns]
