### Libraries

In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#!pip install fast_ml --quiet
from fast_ml.model_development import train_valid_test_split


from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier,GradientBoostingClassifier

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn import metrics,svm
from sklearn.metrics import confusion_matrix

import xgboost as xgb

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

## Data Handling

In [86]:
df = pd.read_csv("./data/churn.csv")

# Remove irrelevant columns
df.drop(columns=["CustomerId",'RowNumber',"Surname"],inplace=True)

# Rename Exited column as: target
df = df.rename(columns={'Exited': 'target'})


df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,target
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [87]:
X_train, y_train, X_val, y_val, X_test, y_test = train_valid_test_split(df,
                                                                           target="target",
                                                                           train_size=0.70,
                                                                           valid_size=0.15,
                                                                           test_size=0.15)

In [88]:
df = pd.concat([X_train, y_train],ignore_index=False,axis=1,sort=False)
df_val = pd.concat([X_val, y_val],ignore_index=False,axis=1,sort=False)
df_test = pd.concat([X_test, y_test],ignore_index=False,axis=1,sort=False)

## Outliers Removed

In [89]:
Q1 = df["Age"].quantile(0.25)
Q3 = df["Age"].quantile(0.75)

IQR = Q3 - Q1

lower_lim = Q1  - 1.5 * IQR
upper_lim = Q3  + 1.5 * IQR

print("Age lower limit:",lower_lim)
print("Age upper limit:",upper_lim)


# Mask 
lower_outliers =(df["Age"]<lower_lim)
upper_outliers =(df["Age"]>upper_lim)

print("Number of outliers removed by Age:",len(df["Age"][(lower_outliers|upper_outliers)]))

df = df[~(lower_outliers|upper_outliers)]

Age lower limit: 14.0
Age upper limit: 62.0
Number of outliers removed by Age: 255


In [90]:
Q1 = df["CreditScore"].quantile(0.25)
Q3 = df["CreditScore"].quantile(0.75)

IQR = Q3 - Q1

lower_lim = Q1  - 1.5 * IQR
upper_lim = Q3  + 1.5 * IQR

print("CreditScore lower limit:",lower_lim)
print("CreditScore upper limit:",upper_lim)


# Mask 
lower_outliers =(df["CreditScore"]<lower_lim)
upper_outliers =(df["CreditScore"]>upper_lim)

print("Number of outliers removed by CreditScore:",len(df["CreditScore"][(lower_outliers|upper_outliers)]))

df = df[~(lower_outliers|upper_outliers)]
len(df)

CreditScore lower limit: 380.5
CreditScore upper limit: 920.5
Number of outliers removed by CreditScore: 9


6736

In [91]:
Q1 = df["NumOfProducts"].quantile(0.25)
Q3 = df["NumOfProducts"].quantile(0.75)

IQR = Q3 - Q1

lower_lim = Q1  - 1.5 * IQR
upper_lim = Q3  + 1.5 * IQR

print("NumOfProducts lower limit:",lower_lim)
print("NumOfProducts upper limit:",upper_lim)


# Mask 
lower_outliers =(df["NumOfProducts"]<lower_lim)
upper_outliers =(df["NumOfProducts"]>upper_lim)

print("Number of outliers removed by NumOfProducts:",len(df["NumOfProducts"][(lower_outliers|upper_outliers)]))

df = df[~(lower_outliers|upper_outliers)]
print("Outliers removed: ",6000-len(df),"out of 6,000")

NumOfProducts lower limit: -0.5
NumOfProducts upper limit: 3.5
Number of outliers removed by NumOfProducts: 37
Outliers removed:  -699 out of 6,000


## Undersampling

In [92]:
rus = RandomUnderSampler(random_state=42)

X_train, y_train = rus.fit_resample(df.iloc[:, 0:10], df['target'])
df = pd.concat([X_train, y_train],ignore_index=False,axis=1,sort=False)
X_train 

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,670,France,Female,50,8,138340.06,1,0,1,3159.15
1,761,France,Male,31,7,0.00,3,1,1,166698.18
2,709,Spain,Male,23,10,0.00,2,0,0,129590.18
3,690,France,Male,21,8,0.00,2,1,1,155782.89
4,758,Spain,Male,61,2,0.00,2,1,1,43982.41
...,...,...,...,...,...,...,...,...,...,...
2623,685,Spain,Female,30,2,0.00,3,1,1,172576.43
2624,598,Germany,Female,46,7,131769.04,1,0,0,184980.23
2625,483,France,Female,44,5,136836.49,1,1,0,192359.90
2626,564,Spain,Male,31,5,121461.87,1,1,1,20432.09


In [93]:
scaler = StandardScaler()

# Scaling train
scl_columns = ['CreditScore', 'Age', 'Balance',"EstimatedSalary"]
df[scl_columns] = scaler.fit_transform(df[scl_columns])

# Scaling val
scl_columns = ['CreditScore', 'Age', 'Balance',"EstimatedSalary"]
df_val[scl_columns] = scaler.fit_transform(df_val[scl_columns])

# Scaling test
scl_columns = ['CreditScore', 'Age', 'Balance',"EstimatedSalary"]
df_test[scl_columns] = scaler.fit_transform(df_test[scl_columns])

In [94]:
# Encoding train
transformer = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),["Geography","Gender"]),remainder='passthrough')
transformed = transformer.fit_transform(df)
df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())

# Encoding val
transformer = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),["Geography","Gender"]),remainder='passthrough')
transformed = transformer.fit_transform(df_val)
df_val = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())

# Encoding test
transformer = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),["Geography","Gender"]),remainder='passthrough')
transformed = transformer.fit_transform(df_test)
df_test = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())

In [95]:
X_train = df.drop(['remainder__target'], axis = 1)
y_train = df['remainder__target']

X_val = df_val.drop(['remainder__target'], axis = 1)
y_val = df_val['remainder__target']

X_test = df_test.drop(['remainder__target'], axis = 1)
y_test = df_test['remainder__target']

## Machine Learning

In [96]:
# 1. Logistic Regression (8.1s)

lr = LogisticRegression(random_state=42)

param_grid = {
    'max_iter': [100],
    'penalty': ['l1', 'l2'],
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10,1000],
    'solver': ['lbfgs', 'liblinear']
}

gs_log_reg = GridSearchCV(estimator=lr,
                      param_grid=param_grid,
                      scoring="accuracy",
                      cv=5, # K-fold cross-validation.
                      verbose=False,
                      n_jobs=-1)



# 2. Decision Tree Classifier (1.5s)

dtc = DecisionTreeClassifier(random_state=42)

param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
              'ccp_alpha': [0.1, .01, .001],
              'max_depth' : [5, 6, 7, 8, 9],
              'criterion' :['gini', 'entropy']
             }

gs_dtc = GridSearchCV(estimator=dtc,
                     param_grid=param_grid,
                    cv= 5)



# 3. Random Forest Classifier (2m 30s)

rfc = RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

gs_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)


# 4. kNN (8.2s)

knn = KNeighborsClassifier(n_neighbors=1)

param_grid = { 
    'n_neighbors': [1, 5,10,20,30],
    'weights': ['uniform', 'distance'],
    'leaf_size' : [1,3,5,10,20],
    'p' :[10,20,30]
}

gs_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv= 5)

# 5. Naive Bayes (0.5s)

gau = GaussianNB()

param_grid = { 
    'var_smoothing': [0.00000001, 0.000000001, 0.00000001],
}

gs_naive_bayes = GridSearchCV(estimator=gau, param_grid=param_grid, cv= 5)


# 6. Gradient Boosting (11.7s)

gbc = GradientBoostingClassifier()

param_grid = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.075, 0.2],
    # "min_samples_split": np.linspace(0.1, 0.5, 12),
    # "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[5],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.85, 1.0],
    "n_estimators":[5]
    }

gs_gradient_boosting = GridSearchCV(estimator=gbc, param_grid=param_grid, cv= 5,verbose=1)


# 7. XGBoost (5.6s)
import xgboost as xgb
xgb = xgb.XGBClassifier()

parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [1337]}

gs_XGBoost = GridSearchCV(estimator=xgb, param_grid=param_grid, cv= 5,verbose=1)



grids = {
    "Logistic Regression": gs_log_reg,
    "Decission Tree": gs_dtc,
    "Random Forest": gs_rfc,
    "kNN": gs_knn,
    "Naive Bayes": gs_naive_bayes,
    "Gradient Boosting": gs_gradient_boosting,
    "XGBoost": gs_XGBoost,
}

for nombre, grid_search in grids.items():
    grid_search.fit(X_train, y_train)

best_grids = [(i, j.best_score_) for i, j in grids.items()]

best_grids = pd.DataFrame(best_grids, columns = ["Grid", "Best score"])
best_grids.sort_values(by = "Best score", ascending = False)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Param

Unnamed: 0,Grid,Best score
2,Random Forest,0.77055
6,XGBoost,0.768276
5,Gradient Boosting,0.753803
0,Logistic Regression,0.724125
4,Naive Bayes,0.71652
1,Decission Tree,0.712719
3,kNN,0.698257


### Validation

In [97]:
for nombre, grid_search in grids.items():
    grid_search.fit(X_val, y_val)

best_grids_val = [(i, j.best_score_) for i, j in grids.items()]

best_grids_val = pd.DataFrame(best_grids_val, columns = ["Grid", "Best score val"])
best_grids_val.sort_values(by = "Best score val", ascending = False)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Param

Unnamed: 0,Grid,Best score val
6,XGBoost,0.856
2,Random Forest,0.854
1,Decission Tree,0.838
5,Gradient Boosting,0.834
4,Naive Bayes,0.815333
0,Logistic Regression,0.794667
3,kNN,0.794


### Test

In [98]:
for nombre, grid_search in grids.items():
    grid_search.fit(X_test, y_test)

best_grids_test = [(i, j.best_score_) for i, j in grids.items()]

best_grids_test = pd.DataFrame(best_grids_test, columns = ["Grid", "Best score test"])
best_grids_test.sort_values(by = "Best score test", ascending = False)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Parameters: { "criterion", "loss", "max_features" } are not used.

Param

Unnamed: 0,Grid,Best score test
2,Random Forest,0.856667
6,XGBoost,0.849333
5,Gradient Boosting,0.829333
1,Decission Tree,0.827333
4,Naive Bayes,0.807333
0,Logistic Regression,0.804
3,kNN,0.791333


In [99]:
### Save to Excel
best_grids.to_excel(r'./results/70_15_15_train.xlsx',sheet_name='train', index=False)
best_grids_val.to_excel(r'./results/70_15_15_val.xlsx',sheet_name='val', index=False)
best_grids_test.to_excel(r'./results/70_15_15_test.xlsx',sheet_name='test', index=False)