In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

from catboost import CatBoostClassifier

from xgboost import XGBClassifier
import xgboost as xgb
import lightgbm as lgb

from itertools import product

from sklearn.ensemble import VotingClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from imblearn.over_sampling import RandomOverSampler, ADASYN

In [2]:
df = pd.read_csv('Data/df.csv')

df = df.drop(['CustomerId', 'Surname','id'], axis=1)

df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,668,1,33.0,3,0.0,2,1.0,0.0,181449.97,0,1,0,0
1,627,1,33.0,1,0.0,2,1.0,1.0,49503.5,0,1,0,0
2,678,1,40.0,10,0.0,2,1.0,0.0,184866.69,0,1,0,0
3,581,1,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,0,0
4,716,1,33.0,5,0.0,2,1.0,1.0,15068.83,0,0,0,1


# OverSampling No Standarization



In [3]:

X = df.drop('Exited', axis=1)
y = df['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
over = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_over, y_train_over= over.fit_resample(X_train, y_train)


## Complex Base Models no parameter tunning

### CatBoost


In [4]:
cat_model = CatBoostClassifier()
cat_model.fit(X_train_over, y_train_over)


y_pred_cat = cat_model.predict_proba(X_test)[:, 1]

roc_auc_cat = roc_auc_score(y_test, y_pred_cat)
print(f"CatBoost ROC AUC Score: {roc_auc_cat}")


Learning rate set to 0.095092
0:	learn: 0.6370936	total: 174ms	remaining: 2m 53s
1:	learn: 0.5933440	total: 205ms	remaining: 1m 42s
2:	learn: 0.5604634	total: 231ms	remaining: 1m 16s
3:	learn: 0.5348530	total: 256ms	remaining: 1m 3s
4:	learn: 0.5150059	total: 278ms	remaining: 55.3s
5:	learn: 0.4985766	total: 302ms	remaining: 50s
6:	learn: 0.4850816	total: 322ms	remaining: 45.7s
7:	learn: 0.4748520	total: 344ms	remaining: 42.6s
8:	learn: 0.4663992	total: 366ms	remaining: 40.3s
9:	learn: 0.4596255	total: 391ms	remaining: 38.7s
10:	learn: 0.4539595	total: 422ms	remaining: 37.9s
11:	learn: 0.4494292	total: 451ms	remaining: 37.1s
12:	learn: 0.4453976	total: 470ms	remaining: 35.7s
13:	learn: 0.4422361	total: 490ms	remaining: 34.5s
14:	learn: 0.4393443	total: 514ms	remaining: 33.7s
15:	learn: 0.4373031	total: 538ms	remaining: 33.1s
16:	learn: 0.4354590	total: 564ms	remaining: 32.6s
17:	learn: 0.4337053	total: 586ms	remaining: 32s
18:	learn: 0.4324187	total: 605ms	remaining: 31.2s
19:	learn: 0

### XGBoost

In [5]:

model = XGBClassifier()
model.fit(X_train_over, y_train_over)

y_prob = model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_prob)

print(f'AUC Score: {auc_score:.4f}')

AUC Score: 0.8826


### LightGBM

In [6]:
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train_over, y_train_over)


lgb_y_prob = lgb_model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, lgb_y_prob)
print(f'AUC Score: {auc_score:.4f}')

[LightGBM] [Info] Number of positive: 91078, number of negative: 91078
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 861
[LightGBM] [Info] Number of data points in the train set: 182156, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
AUC Score: 0.8882


## Complex Model with simple parameter tunning 

### GridSearch XGBoost

In [7]:


param_grid = {
    'learning_rate': [0.01],
    'n_estimators': [200, 250 , 300],
    'max_depth': [ 10],
    'subsample': [0.9],
    'colsample_bytree': [ 0.7],
    'gamma': [0.01] ,
    'min_child_weight': [1],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 0.1, 1],
    'scale_pos_weight': [1, 2]
}


best_auc_score = -1
best_params = None

for n_param, params in enumerate(ParameterGrid(param_grid)):
    print(f"\nIteration {n_param + 1} of {len(ParameterGrid(param_grid))} ({np.round((n_param + 1) * 100 / len(ParameterGrid(param_grid)), 2)}%)")

    xgb_model = xgb.XGBClassifier(**params, random_state=42, objective='binary:logistic', eval_metric='auc')

    xgb_model.fit(X_train_over, y_train_over)

    xgb_y_prob = xgb_model.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, xgb_y_prob)
    print(f"AUC Score: {auc_score:.4f}")

    if auc_score > best_auc_score:
        print(f"Best actual AUC Score:\t {auc_score}")

        best_auc_score = auc_score
        best_params = params
        print(f"Best params: {best_params}")

print(f"Best parameters found: {best_params}")
print(f"Best ROC AUC Score: {best_auc_score}")



Iteration 1 of 54 (1.85%)
AUC Score: 0.8863
Best actual AUC Score:	 0.8862627798561253
Best params: {'colsample_bytree': 0.7, 'gamma': 0.01, 'learning_rate': 0.01, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 0, 'scale_pos_weight': 1, 'subsample': 0.9}

Iteration 2 of 54 (3.7%)
AUC Score: 0.8853

Iteration 3 of 54 (5.56%)
AUC Score: 0.8863
Best actual AUC Score:	 0.8862892770133695
Best params: {'colsample_bytree': 0.7, 'gamma': 0.01, 'learning_rate': 0.01, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 0.1, 'scale_pos_weight': 1, 'subsample': 0.9}

Iteration 4 of 54 (7.41%)
AUC Score: 0.8854

Iteration 5 of 54 (9.26%)
AUC Score: 0.8866
Best actual AUC Score:	 0.8865863102561202
Best params: {'colsample_bytree': 0.7, 'gamma': 0.01, 'learning_rate': 0.01, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 0.9}

Iteratio

### GridSearch LightGBM

In [8]:
param_grid = {
    'num_leaves': [31],
    'learning_rate': [0.1],
    'n_estimators': [150],
    'subsample': [0.7],
    'colsample_bytree': [0.7],
    'objective': ['binary'],
    'boosting_type': ['gbdt'],
    'reg_alpha': [1.2],
    'reg_lambda': [1.5 , 1.7],
    'max_depth': [10, 15],     
}
 
#This param grid is reduced 

best_auc_score = -1
best_params = None

for n_param, params in enumerate(ParameterGrid(param_grid)):
    print(f"\nIteration {n_param + 1} of {len(ParameterGrid(param_grid))} ({round((n_param + 1) * 100 / len(ParameterGrid(param_grid)), 2)}%)")

    lgb_model = lgb.LGBMClassifier(**params, random_state=42, metric='auc')

    lgb_model.fit(X_train_over, y_train_over)

    lgb_y_prob = lgb_model.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, lgb_y_prob)
    print(f"AUC Score: {auc_score:.4f}")

    if auc_score > best_auc_score:
        print(f"Best actual AUC Score:\t {auc_score}")

        best_auc_score = auc_score
        best_params = params
        print(f"Best params: {best_params}")

print(f"Best parameters found: {best_params}")
print(f"Best ROC AUC Score: {best_auc_score}")


Iteration 1 of 4 (25.0%)
[LightGBM] [Info] Number of positive: 91078, number of negative: 91078
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001225 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 861
[LightGBM] [Info] Number of data points in the train set: 182156, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
AUC Score: 0.8880
Best actual AUC Score:	 0.8880080867499639
Best params: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 150, 'num_leaves': 31, 'objective': 'binary', 'reg_alpha': 1.2, 'reg_lambda': 1.5, 'subsample': 0.7}

Iteration 2 of 4 (50.0%)
[LightGBM] [Info] Number of positive: 91078, number of negative: 91078
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.00775

### CAT


In [9]:

param_grid = {
    'learning_rate': [0.01,  0.1],
    'iterations': [150, 200, 300],
    'depth': [5, 3],
    'subsample': [0.9, 0.95],
    'colsample_bylevel': [ 0.7],
    'l2_leaf_reg': [0, 0.5]
}

best_auc_score = -1
best_params = None

for n_param, params in enumerate(ParameterGrid(param_grid)):
    print(f"\nIteration {n_param + 1} of {len(ParameterGrid(param_grid))} ({round((n_param + 1) * 100 / len(ParameterGrid(param_grid)), 2)}%)")

    cat_model = CatBoostClassifier(loss_function='Logloss', verbose=0, **params)

    cat_model.fit(X_train_over, y_train_over)

    y_pred_cat = cat_model.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, y_pred_cat)
    print(f"AUC Score: {auc_score:.4f}")

    if auc_score > best_auc_score:
        print(f"Best actual AUC Score:\t {auc_score}")
        best_auc_score = auc_score
        best_params = params
        print(f"Best params: {best_params}")
print(f"\nBest parameters found: {best_params}")
print(f"Best ROC AUC Score: {best_auc_score}")


Iteration 1 of 48 (2.08%)


AUC Score: 0.8809
Best actual AUC Score:	 0.8808771357471705
Best params: {'colsample_bylevel': 0.7, 'depth': 5, 'iterations': 150, 'l2_leaf_reg': 0, 'learning_rate': 0.01, 'subsample': 0.9}

Iteration 2 of 48 (4.17%)
AUC Score: 0.8809
Best actual AUC Score:	 0.8809294979251494
Best params: {'colsample_bylevel': 0.7, 'depth': 5, 'iterations': 150, 'l2_leaf_reg': 0, 'learning_rate': 0.01, 'subsample': 0.95}

Iteration 3 of 48 (6.25%)
AUC Score: 0.8884
Best actual AUC Score:	 0.8884241291665439
Best params: {'colsample_bylevel': 0.7, 'depth': 5, 'iterations': 150, 'l2_leaf_reg': 0, 'learning_rate': 0.1, 'subsample': 0.9}

Iteration 4 of 48 (8.33%)
AUC Score: 0.8884

Iteration 5 of 48 (10.42%)
AUC Score: 0.8809

Iteration 6 of 48 (12.5%)
AUC Score: 0.8809

Iteration 7 of 48 (14.58%)
AUC Score: 0.8884

Iteration 8 of 48 (16.67%)
AUC Score: 0.8883

Iteration 9 of 48 (18.75%)
AUC Score: 0.8838

Iteration 10 of 48 (20.83%)
AUC Score: 0.8838

Iteration 11 of 48 (22.92%)
AUC Score: 0.8879

Iter

### Gradient Boosting Classifier

In [10]:

large_param_grid = {
    'n_estimators': [ 100],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [5],
    'subsample': [0.6 ,0.8, 1.0],
    'min_samples_split': [10, 12],
    'min_samples_leaf': [4, 6],
}



param_grid_to_use = large_param_grid

best_roc_auc = -1
best_params = None

for n_param, params in enumerate(ParameterGrid(param_grid_to_use)):
    print(f"\nIteration {n_param + 1} of {len(ParameterGrid(param_grid_to_use))} ({np.round((n_param + 1) * 100 / len(ParameterGrid(param_grid_to_use)), 2)}%)")

    gb_model = GradientBoostingClassifier(random_state=42, **params)

    gb_model.fit(X_train_over, y_train_over)

    gb_y_prob = gb_model.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, gb_y_prob)
    print(f"ROC AUC Score: {roc_auc:.4f}")

    if roc_auc > best_roc_auc:
        print(f"Best actual ROC AUC Score:\t {roc_auc}")

        best_roc_auc = roc_auc
        best_params = params
        print(f"Best params: {best_params}")

print(f"Best parameters found: {best_params}")
print(f"Best ROC AUC Score: {best_roc_auc}")


Iteration 1 of 36 (2.78%)
ROC AUC Score: 0.8879
Best actual ROC AUC Score:	 0.8878896852586207
Best params: {'learning_rate': 0.05, 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100, 'subsample': 0.6}

Iteration 2 of 36 (5.56%)
ROC AUC Score: 0.8880
Best actual ROC AUC Score:	 0.8879766975653036
Best params: {'learning_rate': 0.05, 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100, 'subsample': 0.8}

Iteration 3 of 36 (8.33%)
ROC AUC Score: 0.8878

Iteration 4 of 36 (11.11%)
ROC AUC Score: 0.8878

Iteration 5 of 36 (13.89%)
ROC AUC Score: 0.8880

Iteration 6 of 36 (16.67%)
ROC AUC Score: 0.8878

Iteration 7 of 36 (19.44%)
ROC AUC Score: 0.8878

Iteration 8 of 36 (22.22%)
ROC AUC Score: 0.8879

Iteration 9 of 36 (25.0%)
ROC AUC Score: 0.8879

Iteration 10 of 36 (27.78%)
ROC AUC Score: 0.8878

Iteration 11 of 36 (30.56%)
ROC AUC Score: 0.8879

Iteration 12 of 36 (33.33%)
ROC AUC Score: 0.8879

Iteration 13 of 36 (36.11%

In [13]:
# Best parameters found for each model
params_lgb = {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 150, 'num_leaves': 31, 'objective': 'binary', 'reg_alpha': 1.2, 'reg_lambda': 1.5, 'subsample': 0.7}
params_xgboost = {'colsample_bytree': 0.7, 'gamma': 0.01, 'learning_rate': 0.01, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 300, 'reg_alpha': 1, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 0.9}
params_gradient_boosting =  {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 6, 'min_samples_split': 10, 'n_estimators': 100, 'subsample': 0.8}
params_CAT = {'colsample_bylevel': 0.7, 'depth': 5, 'iterations': 150, 'l2_leaf_reg': 0, 'learning_rate': 0.1, 'subsample': 0.9}


lgb_model = lgb.LGBMClassifier(**params_lgb)
xgboost_model = XGBClassifier(**params_xgboost)
gradient_boosting_model = GradientBoostingClassifier(**params_gradient_boosting)
cat_model = CatBoostClassifier(**params_CAT)

models = [('lgb', lgb_model), ('xgboost', xgboost_model), ('gradient_boosting', gradient_boosting_model), ('cat', cat_model)]

ensemble_model = VotingClassifier(estimators=models, voting='soft')


In [15]:

weights_combinations = list(product(np.arange(0, 1.1, 0.1), repeat=len(models)))
weights_combinations = [list(w) for w in weights_combinations if sum(w) == 1]  # Ensure the sum of weights is 1


param_grid = {
    'weights': weights_combinations
}
print(param_grid)
best_roc_auc = -1
best_params = None

for n_param, params in enumerate(ParameterGrid(param_grid)):
    print(f"\nIteration {n_param + 1} of {len(ParameterGrid(param_grid))} ({np.round((n_param + 1) * 100 / len(ParameterGrid(param_grid)), 2)}%)")

    ensemble_model.set_params(weights=params['weights'])

    ensemble_model.fit(X_train_over, y_train_over)

    ensemble_y_prob = ensemble_model.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, ensemble_y_prob)
    print(f"ROC AUC Score: {roc_auc:.4f}")

    if roc_auc > best_roc_auc:
        print(f"Best actual ROC AUC Score:\t {roc_auc}")

        best_roc_auc = roc_auc
        best_params = params
        print(f"Best params: {best_params}")

print(f"Best parameters found: {best_params}")
print(f"Best ROC AUC Score: {best_roc_auc}")


{'weights': [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.1, 0.9], [0.0, 0.0, 0.2, 0.8], [0.0, 0.0, 0.30000000000000004, 0.7000000000000001], [0.0, 0.0, 0.4, 0.6000000000000001], [0.0, 0.0, 0.5, 0.5], [0.0, 0.0, 0.6000000000000001, 0.4], [0.0, 0.0, 0.7000000000000001, 0.30000000000000004], [0.0, 0.0, 0.8, 0.2], [0.0, 0.0, 0.9, 0.1], [0.0, 0.0, 1.0, 0.0], [0.0, 0.1, 0.0, 0.9], [0.0, 0.1, 0.1, 0.8], [0.0, 0.1, 0.2, 0.7000000000000001], [0.0, 0.1, 0.30000000000000004, 0.6000000000000001], [0.0, 0.1, 0.4, 0.5], [0.0, 0.1, 0.5, 0.4], [0.0, 0.1, 0.6000000000000001, 0.30000000000000004], [0.0, 0.1, 0.7000000000000001, 0.2], [0.0, 0.1, 0.8, 0.1], [0.0, 0.1, 0.9, 0.0], [0.0, 0.2, 0.0, 0.8], [0.0, 0.2, 0.1, 0.7000000000000001], [0.0, 0.2, 0.2, 0.6000000000000001], [0.0, 0.2, 0.30000000000000004, 0.5], [0.0, 0.2, 0.4, 0.4], [0.0, 0.2, 0.5, 0.30000000000000004], [0.0, 0.2, 0.6000000000000001, 0.2], [0.0, 0.2, 0.8, 0.0], [0.0, 0.30000000000000004, 0.0, 0.7000000000000001], [0.0, 0.30000000000000004, 0.1, 0.6

In [16]:
df_test = pd.read_csv('Data/test.csv')

test_id = df_test["id"]

df_test = df_test.drop(['CustomerId', 'Surname','id'], axis=1)
 
df_test.head()


df_test['Gender'] = df_test['Gender'].map({'Male': 1, 'Female': 0})
geography_dummies = pd.get_dummies(df_test['Geography'], prefix='Geography').astype(int)
df_test = pd.concat([df_test, geography_dummies], axis=1)
df_test = df_test.drop('Geography', axis=1)

In [17]:
best_weights = best_params['weights']

ensemble_model = VotingClassifier(estimators=models, voting='soft', weights=best_weights)

ensemble_model.fit(X_train_over, y_train_over)



y_test_pred = ensemble_model.predict_proba(df_test) 

sample_submission = pd.DataFrame({
    'id': test_id,
    'Exited': y_test_pred[:, 1],  # Select the probabilities for the positive class
})

print(sample_submission)

sample_submission.to_csv("submission.csv",index=False)

[LightGBM] [Info] Number of positive: 91078, number of negative: 91078
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006966 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 861
[LightGBM] [Info] Number of data points in the train set: 182156, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	learn: 0.6581145	total: 25.1ms	remaining: 3.73s
1:	learn: 0.6262403	total: 43.2ms	remaining: 3.19s
2:	learn: 0.5994024	total: 58.4ms	remaining: 2.86s
3:	learn: 0.5766865	total: 75.2ms	remaining: 2.74s
4:	learn: 0.5576934	total: 91.6ms	remaining: 2.66s
5:	learn: 0.5417309	total: 108ms	remaining: 2.59s
6:	learn: 0.5271276	total: 122ms	remaining: 2.48s
7:	learn: 0.5153212	total: 134ms	remaining: 2.37s
8:	learn: 0.5044064	total: 150ms	remaining: 2.34s
9:	learn: 0.4957394	total: 166ms	remaining: 2.33s
10:	learn: 0.4873716	total: 190ms	remaining: 2.4s
11:	le