# Table of Contents
1. [Imports](#imports)
1. [Data_preperation](#Data_preperation)
1. [Feature_Engeneering](#Feature_Engeneering)
1. [Model_Training](#Model_Training)<br>
    4.1[LR](#LR)<br>
    4.2[RF](#RF)<br>
    4.3[LGBM](#LGBM)<br>
    4.4[XGB](#XGB)<br>
    4.5[CATBoost](#CATBoost)<br>
1. [Model_Selection](#Model_Selection)<br>
1. [Model_Selection_Conclusion:](#Model_Selection_Conclusion:)



# Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from IPython.display import display
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

import time
from collections import defaultdict

In [2]:
df_contract = pd.read_csv('contract.csv')
df_personal = pd.read_csv('personal.csv')
df_internet = pd.read_csv('internet.csv')
df_phone = pd.read_csv('phone.csv')

# Data_preperation

In [3]:
# Joimimg the data together
from functools import reduce

dfs = [df_contract, df_internet, df_personal, df_phone]
df = reduce(lambda left,right: pd.merge(left,right,on='customerID', how='outer'), dfs)
df.head()

Unnamed: 0,customerID,BeginDate,EndDate,Type,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,gender,SeniorCitizen,Partner,Dependents,MultipleLines
0,7590-VHVEG,2020-01-01,No,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,Yes,No,No,No,No,Female,0,Yes,No,
1,5575-GNVDE,2017-04-01,No,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No
2,3668-QPYBK,2019-10-01,2019-12-01 00:00:00,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,Yes,No,No,No,No,Male,0,No,No,No
3,7795-CFOCW,2016-05-01,No,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,No,Yes,Yes,No,No,Male,0,No,No,
4,9237-HQITU,2019-09-01,2019-11-01 00:00:00,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,No,No,No,No,No,Female,0,No,No,No


In [4]:
#Fixing the total charges to float
space_index = list(df[df['TotalCharges'].str.isspace()].index)
df.loc[space_index, 'TotalCharges'] = 0
df['TotalCharges'] = df['TotalCharges'].astype('float')

In [5]:
#The convertion to float might add zeros in empty places witch wont make sence if the monthly charge is bigger than yearly charge
df[df['TotalCharges']==0 ].head()

Unnamed: 0,customerID,BeginDate,EndDate,Type,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,gender,SeniorCitizen,Partner,Dependents,MultipleLines
488,4472-LVYGI,2020-02-01,No,Two year,Yes,Bank transfer (automatic),52.55,0.0,DSL,Yes,No,Yes,Yes,Yes,No,Female,0,Yes,Yes,
753,3115-CZMZD,2020-02-01,No,Two year,No,Mailed check,20.25,0.0,,,,,,,,Male,0,No,Yes,No
936,5709-LVOEQ,2020-02-01,No,Two year,No,Mailed check,80.85,0.0,DSL,Yes,Yes,Yes,No,Yes,Yes,Female,0,Yes,Yes,No
1082,4367-NUYAO,2020-02-01,No,Two year,No,Mailed check,25.75,0.0,,,,,,,,Male,0,Yes,Yes,Yes
1340,1371-DWPAZ,2020-02-01,No,Two year,No,Credit card (automatic),56.05,0.0,DSL,Yes,Yes,Yes,Yes,Yes,No,Female,0,Yes,Yes,


In [6]:
# indeed, will be fixed

In [7]:
# Changing dates to datetyme
df['BeginDate'] = pd.to_datetime(df['BeginDate'], format='%Y-%m-%d')
df['EndDate'] = pd.to_datetime(df['EndDate'], format='%Y-%m-%d', errors='coerce')

# Feature_Engeneering

In [8]:
#The begining year and month might influence the decision to leave
df['SubscriptionYear'] = df['BeginDate'].dt.year
df['SubscriptionMonth'] = df['BeginDate'].dt.month

In [9]:
#Since there is an autocorelation between the additional internet services I will add the column that counts them

internet_extras = ['OnlineSecurity','OnlineBackup', 'DeviceProtection',
                 'TechSupport', 'StreamingTV', 'StreamingMovies']
df[internet_extras] = df[internet_extras].applymap(lambda x: 1 if x=='Yes' else 0)
df['InternetExtras'] = df[internet_extras].sum(axis=1)

In [10]:
#Getting categorical features
categorical_features = ['Type', 'PaperlessBilling', 'PaymentMethod','gender', 'Partner', 'Dependents',
                        'InternetService', 'MultipleLines']
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

In [11]:
#Generating the rarget column
df['Target'] = df['EndDate'].apply(lambda x: 0 if pd.isna(x) else 1) 

In [12]:
# Removing irelevant columns
df.drop(['customerID', 'BeginDate', 'EndDate','OnlineSecurity','OnlineBackup', 'DeviceProtection',
                 'TechSupport', 'StreamingTV', 'StreamingMovies'], axis=1, inplace=True)

In [13]:
# last check that the data is fine
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 18 columns):
MonthlyCharges                           7043 non-null float64
TotalCharges                             7043 non-null float64
SeniorCitizen                            7043 non-null int64
SubscriptionYear                         7043 non-null int64
SubscriptionMonth                        7043 non-null int64
InternetExtras                           7043 non-null int64
Type_One year                            7043 non-null uint8
Type_Two year                            7043 non-null uint8
PaperlessBilling_Yes                     7043 non-null uint8
PaymentMethod_Credit card (automatic)    7043 non-null uint8
PaymentMethod_Electronic check           7043 non-null uint8
PaymentMethod_Mailed check               7043 non-null uint8
gender_Male                              7043 non-null uint8
Partner_Yes                              7043 non-null uint8
Dependents_Yes                     

In [14]:
#Everything seems fine, lets look at the target and see if it is nalanced
df['Target'].value_counts()

0    5174
1    1869
Name: Target, dtype: int64

The data is not balanced so I will balance the train set only for not making the model examination biased

In [15]:
# Splitting the data
train,test = train_test_split(df,test_size=0.20, random_state=1234)
train, valid = train_test_split(train,test_size=0.20, random_state=1234)
train['Target'].value_counts()

0    3308
1    1199
Name: Target, dtype: int64

In [16]:
train_upsampled = train.loc[train['Target'] == 1].sample(n=1150)
train = pd.concat([train_upsampled, train])
train['Target'].value_counts()

0    3308
1    2349
Name: Target, dtype: int64

In [17]:
train_upsampled = train.loc[train['Target'] == 1].sample(n=(3308-2349))
train = pd.concat([train_upsampled, train])
train['Target'].value_counts()

1    3308
0    3308
Name: Target, dtype: int64

In [18]:
x_train =  train.drop('Target', axis=1)
x_valid =  valid.drop('Target', axis=1)
x_test =  test.drop('Target', axis=1)
y_train = train['Target']
y_valid = valid['Target']
y_test = test['Target']

#one last check before model training
print(x_train.shape)
print(y_train.shape)
print(x_valid.shape)
print(y_valid.shape)
print(x_test.shape)
print(y_test.shape)


(6616, 17)
(6616,)
(1127, 17)
(1127,)
(1409, 17)
(1409,)


# Model_Training

In [19]:
def eval_model(model, x_train,y_train,x_test,y_test):
    print(model)
    model.fit(x_train, y_train)
    proba = model.predict_proba(x_test)
    proba = proba[:, 1]
    return roc_auc_score(y_test, proba)   

### LR

In [20]:
lr_roc_auc = eval_model(LogisticRegression(), x_train, y_train, x_valid, y_valid)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


In [21]:
print('ROC_AUC for LR is:' ,lr_roc_auc)

ROC_AUC for LR is: 0.8390521967108923


### RF

In [22]:
rf_results = defaultdict(list)

for depth in range(1,10):
    for estim in range(20,100,10):
        model = RandomForestClassifier(n_estimators=estim, max_depth=depth,random_state=12345)
        
        rf_results['roc_auc'].append(eval_model(model, x_train, y_train, x_valid, y_valid))
        rf_results['depth'].append(depth)
        rf_results['estim'].append(estim)

        


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=1, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=12345,
                       verbose=0, warm_start=False)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=1, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=12345,
                       verbose=0, warm_start=False)
RandomForestClassifier(b

                       verbose=0, warm_start=False)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=3, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=12345,
                       verbose=0, warm_start=False)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=3, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=60,
                       n_jobs=None, oob_score=False, random_state=12345,
                       v

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=60,
                       n_jobs=None, oob_score=False, random_state=12345,
                       verbose=0, warm_start=False)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=70,
                       n_jobs=None, oob_score=False, random_state=12345,
                       verbose=0, warm_start=False)
RandomForestClassifier(b

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=70,
                       n_jobs=None, oob_score=False, random_state=12345,
                       verbose=0, warm_start=False)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=80,
                       n_jobs=None, oob_score=False, random_state=12345,
                       verbose=0, warm_start=False)
RandomForestClassifier(b

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=9, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=80,
                       n_jobs=None, oob_score=False, random_state=12345,
                       verbose=0, warm_start=False)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=9, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=90,
                       n_jobs=None, oob_score=False, random_state=12345,
                       verbose=0, warm_start=False)


In [23]:
        
pd.DataFrame(rf_results).sort_values(by='roc_auc', ascending = False).head() 

Unnamed: 0,roc_auc,depth,estim
71,0.880968,9,90
70,0.880615,9,80
69,0.87957,9,70
63,0.878589,8,90
67,0.878545,9,50


### LGBM

In [24]:
lgbm_results = defaultdict(list)

learningRate = [0.00000000001,0.1,0.05]
numIterations = [500,10000]
for ni in numIterations:
    for lr in learningRate:
        model = lgb.LGBMClassifier(learningRate = lr,numIterations = ni,min_data_in_leaf=5,silent=True )
        
        lgbm_results['roc_auc'].append(eval_model(model, x_train, y_train, x_valid, y_valid))
        lgbm_results['numIterations'].append(ni)
        lgbm_results['learningRate'].append(lr)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learningRate=1e-11, learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_data_in_leaf=5, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, numIterations=500, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learningRate=0.1, learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_data_in_leaf=5, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, numIterations=500, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
          

In [25]:
pd.DataFrame(lgbm_results).sort_values(by='roc_auc', ascending = False).head() 


Unnamed: 0,roc_auc,numIterations,learningRate
0,0.923624,500,1e-11
1,0.923624,500,0.1
2,0.923624,500,0.05
3,0.923624,10000,1e-11
4,0.923624,10000,0.1


### XGB

In [26]:
xgb_results = defaultdict(list)
learningRate = [0.00000000001,0.1,0.2,0.3,0.4,0.05]


for lr in learningRate:
    model = XGBClassifier(learning_rate = lr,silent=True )
    xgb_results['roc_auc'].append(eval_model(model,x_train, y_train, x_valid, y_valid))
    xgb_results['learningRate'].append(lr)


XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=1e-11, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='binary:logistic', random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, silent=True,
              subsample=None, tree_method=None, validate_parameters=None,
              verbosity=None)
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


XGBClassifier(base_score=None

In [27]:
pd.DataFrame(xgb_results).sort_values(by='roc_auc', ascending = False).head() 

Unnamed: 0,roc_auc,learningRate
4,0.930432,0.4
3,0.929332,0.3
2,0.922865,0.2
1,0.91529,0.1
5,0.904256,0.05


### CATBoost

In [28]:
cat_roc_auc = eval_model(CatBoostClassifier(),x_train, y_train, x_valid, y_valid)

<catboost.core.CatBoostClassifier object at 0x000002505941EA88>
Learning rate set to 0.023085
0:	learn: 0.6789439	total: 88.1ms	remaining: 1m 28s
1:	learn: 0.6683862	total: 93.3ms	remaining: 46.6s
2:	learn: 0.6562185	total: 106ms	remaining: 35.1s
3:	learn: 0.6455336	total: 112ms	remaining: 27.9s
4:	learn: 0.6334746	total: 119ms	remaining: 23.6s
5:	learn: 0.6242007	total: 124ms	remaining: 20.5s
6:	learn: 0.6156162	total: 131ms	remaining: 18.6s
7:	learn: 0.6075210	total: 138ms	remaining: 17.1s
8:	learn: 0.5987621	total: 144ms	remaining: 15.8s
9:	learn: 0.5899817	total: 151ms	remaining: 14.9s
10:	learn: 0.5834748	total: 158ms	remaining: 14.2s
11:	learn: 0.5779480	total: 165ms	remaining: 13.6s
12:	learn: 0.5733683	total: 171ms	remaining: 13s
13:	learn: 0.5656913	total: 177ms	remaining: 12.5s
14:	learn: 0.5604671	total: 183ms	remaining: 12s
15:	learn: 0.5550616	total: 190ms	remaining: 11.7s
16:	learn: 0.5499000	total: 201ms	remaining: 11.6s
17:	learn: 0.5442061	total: 208ms	remaining: 11.3s

160:	learn: 0.3830146	total: 1.28s	remaining: 6.68s
161:	learn: 0.3826351	total: 1.29s	remaining: 6.67s
162:	learn: 0.3824271	total: 1.3s	remaining: 6.66s
163:	learn: 0.3820237	total: 1.3s	remaining: 6.65s
164:	learn: 0.3811187	total: 1.31s	remaining: 6.63s
165:	learn: 0.3802963	total: 1.32s	remaining: 6.63s
166:	learn: 0.3799437	total: 1.33s	remaining: 6.62s
167:	learn: 0.3793127	total: 1.33s	remaining: 6.61s
168:	learn: 0.3787917	total: 1.34s	remaining: 6.59s
169:	learn: 0.3784761	total: 1.35s	remaining: 6.59s
170:	learn: 0.3782294	total: 1.35s	remaining: 6.57s
171:	learn: 0.3778924	total: 1.36s	remaining: 6.56s
172:	learn: 0.3775576	total: 1.37s	remaining: 6.55s
173:	learn: 0.3771228	total: 1.38s	remaining: 6.54s
174:	learn: 0.3766853	total: 1.38s	remaining: 6.53s
175:	learn: 0.3760572	total: 1.39s	remaining: 6.52s
176:	learn: 0.3757693	total: 1.4s	remaining: 6.5s
177:	learn: 0.3754411	total: 1.41s	remaining: 6.51s
178:	learn: 0.3749814	total: 1.42s	remaining: 6.5s
179:	learn: 0.374

325:	learn: 0.3265123	total: 2.46s	remaining: 5.08s
326:	learn: 0.3262409	total: 2.46s	remaining: 5.07s
327:	learn: 0.3259557	total: 2.47s	remaining: 5.06s
328:	learn: 0.3256057	total: 2.47s	remaining: 5.04s
329:	learn: 0.3253283	total: 2.48s	remaining: 5.04s
330:	learn: 0.3251328	total: 2.49s	remaining: 5.03s
331:	learn: 0.3249228	total: 2.49s	remaining: 5.01s
332:	learn: 0.3246637	total: 2.5s	remaining: 5s
333:	learn: 0.3242738	total: 2.5s	remaining: 4.99s
334:	learn: 0.3238449	total: 2.51s	remaining: 4.98s
335:	learn: 0.3234979	total: 2.52s	remaining: 4.98s
336:	learn: 0.3230688	total: 2.53s	remaining: 4.97s
337:	learn: 0.3226429	total: 2.53s	remaining: 4.96s
338:	learn: 0.3222199	total: 2.54s	remaining: 4.95s
339:	learn: 0.3219116	total: 2.54s	remaining: 4.94s
340:	learn: 0.3217050	total: 2.55s	remaining: 4.93s
341:	learn: 0.3214057	total: 2.56s	remaining: 4.92s
342:	learn: 0.3210594	total: 2.56s	remaining: 4.91s
343:	learn: 0.3208216	total: 2.57s	remaining: 4.9s
344:	learn: 0.3205

509:	learn: 0.2770859	total: 3.65s	remaining: 3.51s
510:	learn: 0.2768844	total: 3.66s	remaining: 3.5s
511:	learn: 0.2766019	total: 3.67s	remaining: 3.49s
512:	learn: 0.2763146	total: 3.67s	remaining: 3.48s
513:	learn: 0.2759839	total: 3.68s	remaining: 3.48s
514:	learn: 0.2758464	total: 3.69s	remaining: 3.47s
515:	learn: 0.2756676	total: 3.69s	remaining: 3.46s
516:	learn: 0.2753357	total: 3.7s	remaining: 3.45s
517:	learn: 0.2750384	total: 3.71s	remaining: 3.45s
518:	learn: 0.2748013	total: 3.71s	remaining: 3.44s
519:	learn: 0.2746406	total: 3.72s	remaining: 3.43s
520:	learn: 0.2744481	total: 3.72s	remaining: 3.42s
521:	learn: 0.2741260	total: 3.73s	remaining: 3.42s
522:	learn: 0.2739234	total: 3.74s	remaining: 3.41s
523:	learn: 0.2736229	total: 3.74s	remaining: 3.4s
524:	learn: 0.2732893	total: 3.75s	remaining: 3.39s
525:	learn: 0.2730676	total: 3.75s	remaining: 3.38s
526:	learn: 0.2728749	total: 3.76s	remaining: 3.38s
527:	learn: 0.2726662	total: 3.77s	remaining: 3.37s
528:	learn: 0.2

673:	learn: 0.2408636	total: 4.84s	remaining: 2.34s
674:	learn: 0.2407467	total: 4.85s	remaining: 2.34s
675:	learn: 0.2405420	total: 4.86s	remaining: 2.33s
676:	learn: 0.2404340	total: 4.87s	remaining: 2.32s
677:	learn: 0.2402146	total: 4.87s	remaining: 2.31s
678:	learn: 0.2399943	total: 4.88s	remaining: 2.31s
679:	learn: 0.2397437	total: 4.89s	remaining: 2.3s
680:	learn: 0.2394602	total: 4.89s	remaining: 2.29s
681:	learn: 0.2393431	total: 4.9s	remaining: 2.29s
682:	learn: 0.2391892	total: 4.92s	remaining: 2.28s
683:	learn: 0.2389908	total: 4.93s	remaining: 2.28s
684:	learn: 0.2387533	total: 4.94s	remaining: 2.27s
685:	learn: 0.2385149	total: 4.95s	remaining: 2.27s
686:	learn: 0.2383824	total: 4.96s	remaining: 2.26s
687:	learn: 0.2381086	total: 4.96s	remaining: 2.25s
688:	learn: 0.2379038	total: 4.97s	remaining: 2.24s
689:	learn: 0.2376861	total: 4.98s	remaining: 2.24s
690:	learn: 0.2376289	total: 4.99s	remaining: 2.23s
691:	learn: 0.2374453	total: 5s	remaining: 2.22s
692:	learn: 0.237

841:	learn: 0.2127154	total: 6.19s	remaining: 1.16s
842:	learn: 0.2126427	total: 6.2s	remaining: 1.15s
843:	learn: 0.2124125	total: 6.21s	remaining: 1.15s
844:	learn: 0.2122186	total: 6.21s	remaining: 1.14s
845:	learn: 0.2120440	total: 6.22s	remaining: 1.13s
846:	learn: 0.2118917	total: 6.23s	remaining: 1.13s
847:	learn: 0.2117729	total: 6.24s	remaining: 1.12s
848:	learn: 0.2116904	total: 6.25s	remaining: 1.11s
849:	learn: 0.2115569	total: 6.25s	remaining: 1.1s
850:	learn: 0.2114891	total: 6.26s	remaining: 1.09s
851:	learn: 0.2113053	total: 6.26s	remaining: 1.09s
852:	learn: 0.2111469	total: 6.27s	remaining: 1.08s
853:	learn: 0.2110886	total: 6.28s	remaining: 1.07s
854:	learn: 0.2109518	total: 6.29s	remaining: 1.06s
855:	learn: 0.2107418	total: 6.29s	remaining: 1.06s
856:	learn: 0.2106452	total: 6.3s	remaining: 1.05s
857:	learn: 0.2105670	total: 6.31s	remaining: 1.04s
858:	learn: 0.2104334	total: 6.31s	remaining: 1.04s
859:	learn: 0.2103429	total: 6.32s	remaining: 1.03s
860:	learn: 0.2

In [29]:
print('ROC_AUC for CatBoost is:' ,cat_roc_auc)

ROC_AUC for CatBoost is: 0.9256296178596966


# Model_Selection

In [30]:
print('ROC_AUC for LR is:' ,lr_roc_auc)
display('ROC_AUC for RF and the best params are are:'
        ,pd.DataFrame(rf_results).sort_values(by='roc_auc', ascending = False).head(1) )
display('ROC_AUC for LGBM and the best params are are:'
        ,pd.DataFrame(lgbm_results).sort_values(by='roc_auc', ascending = False).head(1) )
display('ROC_AUC for XGB and the best params are are:'
        ,pd.DataFrame(xgb_results).sort_values(by='roc_auc', ascending = False).head(1) )

print('ROC_AUC for CatBoost is:' ,cat_roc_auc)

ROC_AUC for LR is: 0.8390521967108923


'ROC_AUC for RF and the best params are are:'

Unnamed: 0,roc_auc,depth,estim
71,0.880968,9,90


'ROC_AUC for LGBM and the best params are are:'

Unnamed: 0,roc_auc,numIterations,learningRate
0,0.923624,500,1e-11


'ROC_AUC for XGB and the best params are are:'

Unnamed: 0,roc_auc,learningRate
4,0.930432,0.4


ROC_AUC for CatBoost is: 0.9256296178596966


All models have roc_auc score above 80, The most promosing models seems to be LGBM, XGB and CatBoostsince their roc_auc score is above 90, lets check them on the test set

In [31]:
models = [lgb.LGBMClassifier(learningRate = 1.000000e-11,numIterations = 500,min_data_in_leaf=5 ),
         XGBClassifier(learning_rate = 0.4),
          CatBoostClassifier()]
models_names = ['LGBM','XGBoost','CATBoost']



In [32]:
model_comparison = defaultdict(list)
for i in range(len(models)):
    model=models[i]
    model_comparison['roc_auc'].append(eval_model(model,x_train, y_train, x_test, y_test)) 
    #model_comparison['Models name'].append(models_names[i])
    #model_comparison['Models name'].append(models_names[i])

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learningRate=1e-11, learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_data_in_leaf=5, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, numIterations=500, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.4, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='bina

139:	learn: 0.3920612	total: 1.3s	remaining: 7.97s
140:	learn: 0.3916694	total: 1.3s	remaining: 7.95s
141:	learn: 0.3912141	total: 1.31s	remaining: 7.93s
142:	learn: 0.3908670	total: 1.32s	remaining: 7.92s
143:	learn: 0.3899593	total: 1.33s	remaining: 7.9s
144:	learn: 0.3893729	total: 1.34s	remaining: 7.89s
145:	learn: 0.3889887	total: 1.34s	remaining: 7.87s
146:	learn: 0.3886165	total: 1.35s	remaining: 7.85s
147:	learn: 0.3879587	total: 1.36s	remaining: 7.82s
148:	learn: 0.3873359	total: 1.36s	remaining: 7.8s
149:	learn: 0.3871249	total: 1.37s	remaining: 7.77s
150:	learn: 0.3868407	total: 1.39s	remaining: 7.82s
151:	learn: 0.3865535	total: 1.4s	remaining: 7.8s
152:	learn: 0.3862241	total: 1.4s	remaining: 7.78s
153:	learn: 0.3860068	total: 1.41s	remaining: 7.75s
154:	learn: 0.3855045	total: 1.42s	remaining: 7.74s
155:	learn: 0.3849322	total: 1.43s	remaining: 7.72s
156:	learn: 0.3845984	total: 1.43s	remaining: 7.7s
157:	learn: 0.3841227	total: 1.44s	remaining: 7.68s
158:	learn: 0.383773

319:	learn: 0.3284005	total: 2.8s	remaining: 5.95s
320:	learn: 0.3279203	total: 2.81s	remaining: 5.94s
321:	learn: 0.3275390	total: 2.81s	remaining: 5.93s
322:	learn: 0.3272168	total: 2.83s	remaining: 5.92s
323:	learn: 0.3270513	total: 2.83s	remaining: 5.91s
324:	learn: 0.3268405	total: 2.84s	remaining: 5.89s
325:	learn: 0.3265123	total: 2.85s	remaining: 5.88s
326:	learn: 0.3262409	total: 2.85s	remaining: 5.87s
327:	learn: 0.3259557	total: 2.86s	remaining: 5.86s
328:	learn: 0.3256057	total: 2.87s	remaining: 5.84s
329:	learn: 0.3253283	total: 2.87s	remaining: 5.83s
330:	learn: 0.3251328	total: 2.88s	remaining: 5.82s
331:	learn: 0.3249228	total: 2.89s	remaining: 5.82s
332:	learn: 0.3246637	total: 2.9s	remaining: 5.8s
333:	learn: 0.3242738	total: 2.9s	remaining: 5.79s
334:	learn: 0.3238449	total: 2.91s	remaining: 5.78s
335:	learn: 0.3234979	total: 2.92s	remaining: 5.77s
336:	learn: 0.3230688	total: 2.93s	remaining: 5.76s
337:	learn: 0.3226429	total: 2.93s	remaining: 5.75s
338:	learn: 0.32

479:	learn: 0.2844249	total: 4.15s	remaining: 4.5s
480:	learn: 0.2841380	total: 4.16s	remaining: 4.49s
481:	learn: 0.2839591	total: 4.17s	remaining: 4.48s
482:	learn: 0.2837633	total: 4.17s	remaining: 4.47s
483:	learn: 0.2834680	total: 4.18s	remaining: 4.46s
484:	learn: 0.2833337	total: 4.19s	remaining: 4.45s
485:	learn: 0.2831556	total: 4.2s	remaining: 4.44s
486:	learn: 0.2829133	total: 4.2s	remaining: 4.43s
487:	learn: 0.2826587	total: 4.21s	remaining: 4.42s
488:	learn: 0.2824932	total: 4.22s	remaining: 4.41s
489:	learn: 0.2821221	total: 4.22s	remaining: 4.39s
490:	learn: 0.2819746	total: 4.23s	remaining: 4.39s
491:	learn: 0.2816695	total: 4.24s	remaining: 4.38s
492:	learn: 0.2813192	total: 4.24s	remaining: 4.37s
493:	learn: 0.2810073	total: 4.25s	remaining: 4.35s
494:	learn: 0.2808289	total: 4.26s	remaining: 4.34s
495:	learn: 0.2806473	total: 4.26s	remaining: 4.33s
496:	learn: 0.2804524	total: 4.28s	remaining: 4.33s
497:	learn: 0.2802502	total: 4.28s	remaining: 4.32s
498:	learn: 0.2

657:	learn: 0.2441617	total: 5.5s	remaining: 2.86s
658:	learn: 0.2439920	total: 5.51s	remaining: 2.85s
659:	learn: 0.2437383	total: 5.51s	remaining: 2.84s
660:	learn: 0.2434847	total: 5.52s	remaining: 2.83s
661:	learn: 0.2433894	total: 5.53s	remaining: 2.82s
662:	learn: 0.2431849	total: 5.53s	remaining: 2.81s
663:	learn: 0.2429803	total: 5.54s	remaining: 2.8s
664:	learn: 0.2428092	total: 5.55s	remaining: 2.79s
665:	learn: 0.2426702	total: 5.55s	remaining: 2.79s
666:	learn: 0.2424675	total: 5.56s	remaining: 2.78s
667:	learn: 0.2422862	total: 5.57s	remaining: 2.77s
668:	learn: 0.2420380	total: 5.57s	remaining: 2.76s
669:	learn: 0.2416716	total: 5.58s	remaining: 2.75s
670:	learn: 0.2415709	total: 5.59s	remaining: 2.74s
671:	learn: 0.2413429	total: 5.59s	remaining: 2.73s
672:	learn: 0.2411522	total: 5.6s	remaining: 2.72s
673:	learn: 0.2408636	total: 5.61s	remaining: 2.71s
674:	learn: 0.2407467	total: 5.61s	remaining: 2.7s
675:	learn: 0.2405420	total: 5.62s	remaining: 2.69s
676:	learn: 0.24

828:	learn: 0.2146309	total: 6.7s	remaining: 1.38s
829:	learn: 0.2145171	total: 6.71s	remaining: 1.37s
830:	learn: 0.2144176	total: 6.71s	remaining: 1.36s
831:	learn: 0.2142778	total: 6.72s	remaining: 1.36s
832:	learn: 0.2140921	total: 6.73s	remaining: 1.35s
833:	learn: 0.2139173	total: 6.74s	remaining: 1.34s
834:	learn: 0.2137535	total: 6.75s	remaining: 1.33s
835:	learn: 0.2136032	total: 6.75s	remaining: 1.32s
836:	learn: 0.2134281	total: 6.76s	remaining: 1.32s
837:	learn: 0.2132861	total: 6.77s	remaining: 1.31s
838:	learn: 0.2131210	total: 6.77s	remaining: 1.3s
839:	learn: 0.2129594	total: 6.78s	remaining: 1.29s
840:	learn: 0.2128078	total: 6.79s	remaining: 1.28s
841:	learn: 0.2127154	total: 6.79s	remaining: 1.27s
842:	learn: 0.2126427	total: 6.8s	remaining: 1.27s
843:	learn: 0.2124125	total: 6.8s	remaining: 1.26s
844:	learn: 0.2122186	total: 6.81s	remaining: 1.25s
845:	learn: 0.2120440	total: 6.82s	remaining: 1.24s
846:	learn: 0.2118917	total: 6.82s	remaining: 1.23s
847:	learn: 0.21

998:	learn: 0.1916644	total: 7.9s	remaining: 7.91ms
999:	learn: 0.1915356	total: 7.9s	remaining: 0us


In [33]:
#final model comparison
models_compare = pd.DataFrame(model_comparison)
models_compare['Models name'] = ['LGBM','XGBoost','CATBoost']
models_compare.sort_values(by='roc_auc', ascending = False)

Unnamed: 0,roc_auc,Models name
1,0.902017,XGBoost
0,0.901256,LGBM
2,0.898098,CATBoost


### Model_Selection_Conclusion:
 XGBoost and LGBM gives the best preformances. 