In [16]:
import pandas as pd

DATA_PATH = 'data/bank.xlsx'

df = pd.read_excel(DATA_PATH)
df.head(4)

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard
0,1,25,1,49,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,1,2.7,2,0,0,0,0,0,0


In [17]:
df.dtypes

ID                      int64
Age                     int64
Experience              int64
Income                  int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage                int64
Personal_Loan           int64
Securities_Account      int64
CD_Account              int64
Online                  int64
CreditCard              int64
dtype: object

In [18]:
X = df.drop(columns=['Personal_Loan', 'ID']).copy()
y = df['Personal_Loan'].copy()

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

tree  = DecisionTreeClassifier()
tree.fit(X_train, y_train)

In [21]:
from sklearn.metrics import accuracy_score

# 1.0 -> Overfitting
print(f'Train accuracy -> {accuracy_score(y_train, tree.predict(X_train))}\n\
Test  accuracy -> {accuracy_score(y_test, tree.predict(X_test))}')

Train accuracy -> 1.0
Test  accuracy -> 0.98


In [22]:
param = {
  'max_depth': [i for i in range(1, 11)]
}

gs = GridSearchCV(tree, param, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(X_train, y_train)

In [23]:
tree_rank = pd.DataFrame(gs.cv_results_).sort_values(by='rank_test_score')
tree_rank.head(4)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.012151,0.005906,0.007045,0.004044,10,{'max_depth': 10},0.98375,0.98375,0.985,0.9925,0.99125,0.98725,0.003824,1
8,0.017593,0.010972,0.003499,0.001314,9,{'max_depth': 9},0.985,0.98375,0.98625,0.98875,0.98875,0.9865,0.002,2
7,0.01575,0.010237,0.004953,0.002379,8,{'max_depth': 8},0.985,0.98625,0.985,0.9875,0.9875,0.98625,0.001118,3
6,0.021294,0.015539,0.003872,0.001767,7,{'max_depth': 7},0.98125,0.9825,0.9825,0.98375,0.99,0.984,0.003102,4


In [24]:
gs.best_estimator_.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [25]:
tunned_tree = DecisionTreeClassifier(ccp_alpha=0.0,
 class_weight=None,
 criterion='gini',
 max_depth=5,
 max_features=None,
 max_leaf_nodes=None,
 min_impurity_decrease=0.0,
 min_samples_leaf=1,
 min_samples_split=2,
 min_weight_fraction_leaf=0.0,
 random_state=None,
 splitter='best')
tunned_tree.fit(X_train, y_train)

In [26]:
print(f'Train accuracy -> {accuracy_score(y_train, tunned_tree.predict(X_train))}\n'
      f'Test  accuracy -> {accuracy_score(y_test, tunned_tree.predict(X_test))}')

Train accuracy -> 0.989
Test  accuracy -> 0.986


In [27]:
from sklearn.ensemble import RandomForestClassifier

rfc   = RandomForestClassifier()
param = {
  'max_depth': [i for i in range(1, 11)],
  'n_estimators': [100, 300, 500]
}
rfc.fit(X_train, y_train)

In [28]:
print(f'Train accuracy -> {accuracy_score(y_train, rfc.predict(X_train))}\n'
      f'Test  accuracy -> {accuracy_score(y_test, rfc.predict(X_test))}')

Train accuracy -> 1.0
Test  accuracy -> 0.98


In [29]:
gs = GridSearchCV(rfc, param, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(X_train, y_train)

In [30]:
rfc_rank = pd.DataFrame(gs.cv_results_).sort_values(by='rank_test_score')
rfc_rank.head(4)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
28,1.646616,0.091091,0.077141,0.00438,10,300,"{'max_depth': 10, 'n_estimators': 300}",0.9825,0.99375,0.98375,0.98875,0.99125,0.988,0.004301,1
25,1.774998,0.107388,0.086113,0.022426,9,300,"{'max_depth': 9, 'n_estimators': 300}",0.985,0.99125,0.985,0.98625,0.99125,0.98775,0.002894,2
26,3.189676,0.163403,0.147208,0.015233,9,500,"{'max_depth': 9, 'n_estimators': 500}",0.98375,0.9925,0.98125,0.99,0.99,0.9875,0.004257,3
29,2.661196,0.117871,0.096503,0.006888,10,500,"{'max_depth': 10, 'n_estimators': 500}",0.9825,0.99375,0.9825,0.98625,0.99,0.987,0.004373,4


In [31]:
gs.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 300,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [32]:
tunned_rfc = RandomForestClassifier(bootstrap=True,
 ccp_alpha=0.0,
 class_weight=None,
 criterion='gini',
 max_depth=10,
 max_features='sqrt',
 max_leaf_nodes=None,
 max_samples=None,
 min_impurity_decrease=0.0,
 min_samples_leaf=1,
 min_samples_split=2,
 min_weight_fraction_leaf=0.0,
 n_estimators=300,
 n_jobs=None,
 oob_score=False,
 random_state=None,
 verbose=0,
 warm_start=False)
tunned_rfc.fit(X_train, y_train)

In [33]:
print(f'Train accuracy -> {accuracy_score(y_train, tunned_rfc.predict(X_train))}\n'
      f'Test  accuracy -> {accuracy_score(y_test, tunned_rfc.predict(X_test))}')

Train accuracy -> 0.997
Test  accuracy -> 0.982


In [34]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(n_jobs=-1)

In [35]:
from sklearn.model_selection import RandomizedSearchCV

parameters = {'learning_rate': [0.001, 0.01],
              'num_leaves': [2, 128],
              'min_child_samples': [1, 100],
              'subsample': [0.05, 1],
              'colsample_bytree': [0.1, 1.0]}

rs = RandomizedSearchCV(lgbm, parameters, scoring='accuracy', cv=5, n_iter=5, n_jobs=-1)
rs.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 384, number of negative: 3616
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000394 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 612
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.096000 -> initscore=-2.242481
[LightGBM] [Info] Start training from score -2.242481


In [36]:
pd.DataFrame(rs.cv_results_).sort_values(by='rank_test_score').head(4)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_num_leaves,param_min_child_samples,param_learning_rate,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.487299,0.088219,0.004616,0.000809,0.05,128,100,0.001,1.0,"{'subsample': 0.05, 'num_leaves': 128, 'min_ch...",0.905,0.90375,0.90375,0.90375,0.90375,0.904,0.0005,1
1,0.137824,0.052295,0.005138,0.001093,1.0,2,1,0.001,0.1,"{'subsample': 1, 'num_leaves': 2, 'min_child_s...",0.905,0.90375,0.90375,0.90375,0.90375,0.904,0.0005,1
2,0.118354,0.01054,0.004059,0.000355,1.0,2,100,0.001,0.1,"{'subsample': 1, 'num_leaves': 2, 'min_child_s...",0.905,0.90375,0.90375,0.90375,0.90375,0.904,0.0005,1
3,0.117215,0.01068,0.003878,0.000415,0.05,2,1,0.001,1.0,"{'subsample': 0.05, 'num_leaves': 2, 'min_chil...",0.905,0.90375,0.90375,0.90375,0.90375,0.904,0.0005,1
