In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid


from xgboost import XGBClassifier
import xgboost as xgb


from imblearn.under_sampling import RandomUnderSampler

In [4]:
df = pd.read_csv('df.csv')

df = df.drop(['CustomerId', 'Surname','id'], axis=1)

df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,668,1,33.0,3,0.0,2,1.0,0.0,181449.97,0,1,0,0
1,627,1,33.0,1,0.0,2,1.0,1.0,49503.5,0,1,0,0
2,678,1,40.0,10,0.0,2,1.0,0.0,184866.69,0,1,0,0
3,581,1,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,0,0
4,716,1,33.0,5,0.0,2,1.0,1.0,15068.83,0,0,0,1


# UnderSampling No Standarization



In [5]:

X = df.drop('Exited', axis=1)
y = df['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

under= RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_under, y_train_under = under.fit_resample(X_train, y_train)


## Basic Models no parameter tunning

### Logistic Regression

In [4]:
model = LogisticRegression()
model.fit(X_train_under, y_train_under)

y_prob = model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_prob)

print(f'AUC Score: {auc_score:.4f}')

AUC Score: 0.7348


### Decision Tree

In [5]:
model = DecisionTreeClassifier()
model.fit(X_train_under, y_train_under)

y_prob = model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_prob)

print(f'AUC Score: {auc_score:.4f}')


AUC Score: 0.7284


### Random Forest

In [6]:
model = RandomForestClassifier()
model.fit(X_train_under, y_train_under)

y_prob = model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_prob)

print(f'AUC Score: {auc_score:.4f}')

AUC Score: 0.8746


### K-Nearest Neighbors (KNN)

In [7]:
model = KNeighborsClassifier()
model.fit(X_train_under, y_train_under)

y_prob = model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_prob)

print(f'AUC Score: {auc_score:.4f}')

AUC Score: 0.5613


### Naive Bayes
Will try GNB with normalize data as it assumes normal distribution


In [8]:

gaussian_model = GaussianNB()
gaussian_model.fit(X_train_under, y_train_under)
gaussian_prob = gaussian_model.predict_proba(X_test)[:, 1]
gaussian_auc = roc_auc_score(y_test, gaussian_prob)
print(f'Gaussian Naive Bayes AUC Score: {gaussian_auc:.4f}')


bernoulli_model = BernoulliNB()
bernoulli_model.fit(X_train_under, y_train_under)
bernoulli_prob = bernoulli_model.predict_proba(X_test)[:, 1]
bernoulli_auc = roc_auc_score(y_test, bernoulli_prob)
print(f'Bernoulli Naive Bayes AUC Score: {bernoulli_auc:.4f}')


Gaussian Naive Bayes AUC Score: 0.7702
Bernoulli Naive Bayes AUC Score: 0.7068


### Super Vector Machine (SVM) is sensible to scale of data. Will be tried in the section where data is scaled

## Analyzing Results

I will try and add simple parameter tunning to the best two performing models (SVM and Random Forest) and the most basic one (Logistic Regression) to find how they improve and then apply a grid search.

- Random Forest, ROC AUC:
- Naive Bayes GNB, ROC AUC;
- Logistic Regression, ROC AUC:

## Basic Model with simple parameter tunning

### Simple GridSearch Random Forest (only model worth doing a gridsearch in this scenario)

In [21]:
param_grid = {
    'n_estimators': [10, 25, 50],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', None],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced'],
}



best_roc_auc = -1
best_params = None

for n_param, params in enumerate(ParameterGrid(param_grid)):
    print(f"\nIteration {n_param + 1} of {len(ParameterGrid(param_grid))} ({np.round((n_param + 1) * 100 / len(ParameterGrid(param_grid)), 2)}%)")

    rf_model = RandomForestClassifier(random_state=42, **params)

    rf_model.fit(X_train, y_train)

    rf_y_prob = rf_model.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, rf_y_prob)
    print(f"ROC AUC Score: {roc_auc:.4f}")

    # Check if it's the best iteration
    if roc_auc > best_roc_auc:
        print(f"Best previous ROC AUC Score:\t {best_roc_auc}")
        print(f"Best actual ROC AUC Score:\t {roc_auc}")

        best_roc_auc = roc_auc
        best_params = params
        print(f"Best params: {best_params}")

print(f"Best parameters found: {best_params}")
print(f"Best ROC AUC Score: {best_roc_auc}")





Iteration 1 of 864 (0.12%)
AUC Score: 0.8687
Best actual AUC Score:	 0.8687450031259578
Best params: {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}

Iteration 2 of 864 (0.23%)
AUC Score: 0.8770
Best actual AUC Score:	 0.8770405097529248
Best params: {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 25}

Iteration 3 of 864 (0.35%)
AUC Score: 0.8782
Best actual AUC Score:	 0.8781606042941363
Best params: {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}

Iteration 4 of 864 (0.46%)
AUC Score: 0.8687

Iteration 5 of 864 (0.58%)
AUC Score: 0.8770

Iteration 6 of 864 (0.69%)
AUC Score: 0.8782

Iteration 7 of 864 (0.81%)
AUC Score: 0.8687


KeyboardInterrupt: 

## Complex Base Models no parameter tunning

### Gradient Boosting 


In [6]:

model = GradientBoostingClassifier()

model.fit(X_train_under, y_train_under)

y_prob = model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_prob)

print(f'AUC Score: {auc_score:.4f}')


AUC Score: 0.8875


### XGBoost

In [10]:

model = XGBClassifier()
model.fit(X_train_under, y_train_under)

y_prob = model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_prob)

print(f'AUC Score: {auc_score:.4f}')

AUC Score: 0.8821


## Complex Model with simple parameter tunning 

### GridSearch Gradient Boosting

In [28]:


large_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}



param_grid_to_use = small_param_grid

best_roc_auc = -1
best_params = None

for n_param, params in enumerate(ParameterGrid(param_grid_to_use)):
    print(f"\nIteration {n_param + 1} of {len(ParameterGrid(param_grid_to_use))} ({np.round((n_param + 1) * 100 / len(ParameterGrid(param_grid_to_use)), 2)}%)")

    gb_model = GradientBoostingClassifier(random_state=42, **params)

    gb_model.fit(X_train_under, y_train_under)

    gb_y_prob = gb_model.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, gb_y_prob)
    print(f"ROC AUC Score: {roc_auc:.4f}")

    if roc_auc > best_roc_auc:
        print(f"Best actual ROC AUC Score:\t {roc_auc}")

        best_roc_auc = roc_auc
        best_params = params
        print(f"Best params: {best_params}")

print(f"Best parameters found: {best_params}")
print(f"Best ROC AUC Score: {best_roc_auc}")



Iteration 1 of 16 (6.25%)
ROC AUC Score: 0.8542
Best actual ROC AUC Score:	 0.8542106588437268
Best params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}

Iteration 2 of 16 (12.5%)
ROC AUC Score: 0.8546
Best actual ROC AUC Score:	 0.8546447238756811
Best params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}

Iteration 3 of 16 (18.75%)


KeyboardInterrupt: 

### GridSearch XGBoost

In [27]:

""""
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 0.1, 1],
    'min_child_weight': [1, 5, 10],
    'scale_pos_weight': [1, 2, 5] #
}
"""

param_grid = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'max_depth': [ 5, 10],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'gamma': [0, 1, 5]  
}

best_auc_score = -1
best_params = None

for n_param, params in enumerate(ParameterGrid(param_grid)):
    print(f"\nIteration {n_param + 1} of {len(ParameterGrid(param_grid))} ({np.round((n_param + 1) * 100 / len(ParameterGrid(param_grid)), 2)}%)")

    xgb_model = xgb.XGBClassifier(**params, random_state=42, objective='binary:logistic', eval_metric='logloss')

    xgb_model.fit(X_train_under, y_train_under)

    xgb_y_prob = xgb_model.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, xgb_y_prob)
    print(f"AUC Score: {auc_score:.4f}")

    if auc_score > best_auc_score:
        print(f"Best actual AUC Score:\t {auc_score}")

        best_auc_score = auc_score
        best_params = params
        print(f"Best params: {best_params}")

print(f"Best parameters found: {best_params}")
print(f"Best ROC AUC Score: {best_auc_score}")



Iteration 1 of 96 (1.04%)
AUC Score: 0.8843
Best actual AUC Score:	 0.8843053281942714
Best params: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}

Iteration 2 of 96 (2.08%)
AUC Score: 0.8843

Iteration 3 of 96 (3.12%)
AUC Score: 0.8859
Best actual AUC Score:	 0.8859261665535179
Best params: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}

Iteration 4 of 96 (4.17%)
AUC Score: 0.8859

Iteration 5 of 96 (5.21%)
AUC Score: 0.8857

Iteration 6 of 96 (6.25%)
AUC Score: 0.8855

Iteration 7 of 96 (7.29%)
AUC Score: 0.8863
Best actual AUC Score:	 0.8863223461894493
Best params: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 200, 'subsample': 0.8}

Iteration 8 of 96 (8.33%)
AUC Score: 0.8861

Iteration 9 of 96 (9.38%)
AUC Score: 0.8880
Best actual AUC Score:	 0.8879762268253499
Best params: {'colsample_bytree': 0

KeyboardInterrupt: 