In [1]:
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score

from catboost import CatBoostClassifier

from imblearn.over_sampling import RandomOverSampler, SMOTE
import pandas as pd
import numpy as np

import logging

logging.getLogger('lightgbm').setLevel(logging.CRITICAL)

In [2]:
df = pd.read_csv('Data/df.csv')

df = df.drop(['CustomerId', 'Surname','id'], axis=1)

df.head()




Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,668,1,33.0,3,0.0,2,1.0,0.0,181449.97,0,1,0,0
1,627,1,33.0,1,0.0,2,1.0,1.0,49503.5,0,1,0,0
2,678,1,40.0,10,0.0,2,1.0,0.0,184866.69,0,1,0,0
3,581,1,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,0,0
4,716,1,33.0,5,0.0,2,1.0,1.0,15068.83,0,0,0,1


In [3]:
num_folds = 15
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)


In [4]:
X = df.drop('Exited', axis=1)
y = df['Exited']

In [5]:
# Best pa# Best parameters found for each model
params_lgb = {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 150, 'num_leaves': 31, 'objective': 'binary', 'reg_alpha': 1.2, 'reg_lambda': 1.5, 'subsample': 0.7}
params_xgboost = {'colsample_bytree': 0.7, 'gamma': 0.01, 'learning_rate': 0.01, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 300, 'reg_alpha': 1, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 0.9}
params_CAT=  {'colsample_bylevel': 0.7, 'depth': 5, 'iterations': 150, 'l2_leaf_reg': 0, 'learning_rate': 0.1, 'subsample': 0.9}

lgb_model = lgb.LGBMClassifier(**params_lgb)
xgboost_model = XGBClassifier(**params_xgboost)
cat_model = CatBoostClassifier(**params_CAT)

models = [('lgb', lgb_model), ('xgboost', xgboost_model), ('cat', cat_model)]

### 15 K-Folds, Ensemble Models (LGBM, XGBoost, Gradient Boosting) 

In [6]:
ensemble_model_1 = VotingClassifier(estimators=models, voting='soft')

roc_auc_scores = []

for train_index, test_index in skf.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    over = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_over, y_train_over = over.fit_resample(X_train_fold, y_train_fold)

    ensemble_model_1.fit(X_train_over, y_train_over)

    y_ensemble_probabilities = ensemble_model_1.predict_proba(X_test_fold)[:, 1]

    roc_auc_fold = roc_auc_score(y_test_fold, y_ensemble_probabilities)
    roc_auc_scores.append(roc_auc_fold)

average_roc_auc = np.mean(roc_auc_scores)

print(f"Average Ensemble ROC-AUC across {num_folds}-fold Cross-validation: {average_roc_auc}")

[LightGBM] [Info] Number of positive: 121438, number of negative: 121438
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002978 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 242876, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	learn: 0.6413818	total: 164ms	remaining: 24.4s
1:	learn: 0.5992592	total: 191ms	remaining: 14.1s
2:	learn: 0.5640307	total: 213ms	remaining: 10.4s
3:	learn: 0.5348311	total: 229ms	remaining: 8.37s
4:	learn: 0.5094687	total: 245ms	remaining: 7.09s
5:	learn: 0.4876305	total: 259ms	remaining: 6.21s
6:	learn: 0.4687673	total: 274ms	remaining: 5.59s
7:	learn: 0.4531172	total: 289ms	remaining: 5.13s
8:	learn: 0.4386840	total: 306ms	remaining: 4.8s
9:	learn: 0.4253764	total: 321ms	remaining: 4

In [7]:

ensemble_model_2 = StackingClassifier(estimators=models)  


roc_auc_scores = []

for train_index, test_index in skf.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    under = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_under, y_train_under = under.fit_resample(X_train_fold, y_train_fold)

    ensemble_model_2.fit(X_train_under, y_train_under)

    y_ensemble_probabilities = ensemble_model_2.predict_proba(X_test_fold)[:, 1]

    roc_auc_fold = roc_auc_score(y_test_fold, y_ensemble_probabilities)
    roc_auc_scores.append(roc_auc_fold)

average_roc_auc = np.mean(roc_auc_scores)

print(f"Average Ensemble ROC-AUC across {num_folds}-fold Cross-validation: {average_roc_auc}")

[LightGBM] [Info] Number of positive: 121438, number of negative: 121438
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009795 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 242876, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	learn: 0.6413818	total: 26.3ms	remaining: 3.92s
1:	learn: 0.5992592	total: 47.1ms	remaining: 3.48s
2:	learn: 0.5640307	total: 65.3ms	remaining: 3.2s
3:	learn: 0.5348311	total: 94.5ms	remaining: 3.45s
4:	learn: 0.5094687	total: 111ms	remaining: 3.21s
5:	learn: 0.4876305	total: 129ms	remaining: 3.1s
6:	learn: 0.4687673	total: 151ms	remaining: 3.09s
7:	learn: 0.4531172	total: 172ms	remaining: 3.05s
8:	learn: 0.4386840	total: 192ms	remaining: 3s
9:	learn: 0.4253764	total: 210ms	remaining: 2.94s
10:	learn: 0.4139053	total: 230ms	remaining: 2.91s
11:	lear

In [11]:
df_test = pd.read_csv('Data/test.csv')

test_id = df_test["id"]

df_test = df_test.drop(['CustomerId', 'Surname','id'], axis=1)
 
df_test.head()


df_test['Gender'] = df_test['Gender'].map({'Male': 1, 'Female': 0})
geography_dummies = pd.get_dummies(df_test['Geography'], prefix='Geography').astype(int)
df_test = pd.concat([df_test, geography_dummies], axis=1)
df_test = df_test.drop('Geography', axis=1)

In [13]:
y_test_pred = ensemble_model_1.predict_proba(df_test) 

sample_submission = pd.DataFrame({
    'id': test_id,
    'Exited': y_test_pred[:, 1],  # Select the probabilities for the positive class
})

print(sample_submission)

sample_submission.to_csv("submission.csv",index=False)

            id    Exited
0       165034  0.040818
1       165035  0.896429
2       165036  0.057320
3       165037  0.287343
4       165038  0.374022
...        ...       ...
110018  275052  0.058487
110019  275053  0.174431
110020  275054  0.037827
110021  275055  0.210112
110022  275056  0.320959

[110023 rows x 2 columns]
