In [3]:
import pandas as pd

In [4]:
dados = pd.read_excel('Bank_Personal_Loan_Modelling.xlsx')

In [5]:
dados.head()

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard
0,1,25,1,49,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,4,1.0,2,0,0,0,0,0,1


In [6]:
dados = dados.dropna()

In [7]:
X = dados[["Age", "Experience", "Income", "Family", "CCAvg", "Education", "Mortgage", "CreditCard", "Securities_Account", "CD_Account", "Online"]]

y = dados['Personal_Loan'].copy()
y = pd.DataFrame(y)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [9]:
from sklearn.tree import DecisionTreeClassifier

arvore_decisao = DecisionTreeClassifier()

In [10]:
arvore_decisao.fit(X_train, y_train)

In [11]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

print(f"Acurácia (Treino): {accuracy_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Acurácia (Teste): {accuracy_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"Acurácia Balanceada (Treino): {balanced_accuracy_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Acurácia Balanceada (Teste): {balanced_accuracy_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"Precision (Treino): {precision_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Precision (Teste): {precision_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"Recall (Treino): {recall_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Recall (Teste): {recall_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"F1-Score (Treino): {f1_score(y_train, arvore_decisao.predict(X_train))}")
print(f"F1-Score (Teste): {f1_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"ROCAUC (Treino): {roc_auc_score(y_train, arvore_decisao.predict_proba(X_train)[:,1])}")
print(f"ROCAUC (Teste): {roc_auc_score(y_test, arvore_decisao.predict_proba(X_test)[:,1])}")

Acurácia (Treino): 1.0
Acurácia (Teste): 0.985
Acurácia Balanceada (Treino): 1.0
Acurácia Balanceada (Teste): 0.9498064159292036
Precision (Treino): 1.0
Precision (Teste): 0.9354838709677419
Recall (Treino): 1.0
Recall (Teste): 0.90625
F1-Score (Treino): 1.0
F1-Score (Teste): 0.9206349206349206
ROCAUC (Treino): 1.0
ROCAUC (Teste): 0.9498064159292036


In [12]:
from sklearn.model_selection import GridSearchCV

parameters = {
  'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

grid_search = GridSearchCV(arvore_decisao, parameters, scoring='roc_auc', cv=5, n_jobs=-1)

In [13]:
grid_search.fit(X_train, y_train)

In [14]:
grid_search.best_estimator_.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 4,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [15]:
arvore_decisao.fit(X_train, y_train)

In [16]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

print(f"Acurácia (Treino): {accuracy_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Acurácia (Teste): {accuracy_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"Acurácia Balanceada (Treino): {balanced_accuracy_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Acurácia Balanceada (Teste): {balanced_accuracy_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"Precision (Treino): {precision_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Precision (Teste): {precision_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"Recall (Treino): {recall_score(y_train, arvore_decisao.predict(X_train))}")
print(f"Recall (Teste): {recall_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"F1-Score (Treino): {f1_score(y_train, arvore_decisao.predict(X_train))}")
print(f"F1-Score (Teste): {f1_score(y_test, arvore_decisao.predict(X_test))}")
print("====================================")
print(f"ROCAUC (Treino): {roc_auc_score(y_train, arvore_decisao.predict_proba(X_train)[:,1])}")
print(f"ROCAUC (Teste): {roc_auc_score(y_test, arvore_decisao.predict_proba(X_test)[:,1])}")

Acurácia (Treino): 1.0
Acurácia (Teste): 0.986
Acurácia Balanceada (Treino): 1.0
Acurácia Balanceada (Teste): 0.9457042772861357
Precision (Treino): 1.0
Precision (Teste): 0.9555555555555556
Recall (Treino): 1.0
Recall (Teste): 0.8958333333333334
F1-Score (Treino): 1.0
F1-Score (Teste): 0.9247311827956989
ROCAUC (Treino): 1.0
ROCAUC (Teste): 0.9457042772861358


In [17]:
results = pd.DataFrame(grid_search.cv_results_)

In [18]:
results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.009786,0.000517,0.009377,0.000626,4,{'max_depth': 4},0.986688,0.994979,0.996039,0.996228,0.995689,0.993925,0.003644,1
2,0.009986,0.000759,0.008939,0.000253,3,{'max_depth': 3},0.987024,0.994063,0.994656,0.992869,0.995222,0.992767,0.002976,2
4,0.010618,0.001394,0.009149,0.000413,5,{'max_depth': 5},0.953293,0.994818,0.991099,0.99718,0.981301,0.983538,0.016064,3
5,0.009783,0.000298,0.008772,0.000426,6,{'max_depth': 6},0.926396,0.979783,0.984804,0.964093,0.975409,0.966097,0.020996,4
1,0.009101,0.000863,0.010068,0.002212,2,{'max_depth': 2},0.948304,0.954258,0.96385,0.971251,0.958704,0.959273,0.007877,5
6,0.009611,0.000626,0.010325,0.002955,7,{'max_depth': 7},0.921425,0.975382,0.978095,0.943274,0.970398,0.957715,0.021972,6
7,0.01223,0.005459,0.010593,0.001016,8,{'max_depth': 8},0.922079,0.968808,0.971556,0.949408,0.969966,0.956364,0.018945,7
8,0.010348,0.000329,0.009666,0.002231,9,{'max_depth': 9},0.902179,0.967946,0.952255,0.92462,0.969589,0.943318,0.026148,8
9,0.010153,0.000397,0.008828,0.000132,10,{'max_depth': 10},0.890139,0.96182,0.958219,0.916438,0.969643,0.939252,0.030742,9
0,0.008383,0.00056,0.009268,0.000498,1,{'max_depth': 1},0.804195,0.83275,0.865334,0.839243,0.87321,0.842946,0.024643,10


In [34]:
from sklearn.ensemble import RandomForestClassifier

rf =  RandomForestClassifier(max_features='sqrt')

In [35]:
parameters = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'n_estimators': [100, 300, 500]}

grid_search = GridSearchCV(rf, parameters, scoring='roc_auc', cv=5, n_jobs=-1)

In [36]:
grid_search.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,
  return fit_method(estimator, *args, **kwargs)


In [37]:
results = pd.DataFrame(grid_search.cv_results_)
results.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
29,3.274803,0.79495,0.094672,0.035946,10,500,"{'max_depth': 10, 'n_estimators': 500}",0.995039,0.998581,0.99912,0.995635,0.99885,0.997445,0.00174,1
28,1.451758,0.015848,0.049021,0.000653,10,300,"{'max_depth': 10, 'n_estimators': 300}",0.994784,0.998347,0.999138,0.996192,0.998707,0.997434,0.001669,2
27,0.488313,0.014682,0.025647,0.005907,10,100,"{'max_depth': 10, 'n_estimators': 100}",0.995602,0.998437,0.998689,0.994809,0.999066,0.997321,0.001757,3
26,3.083307,0.873548,0.094906,0.0393,9,500,"{'max_depth': 9, 'n_estimators': 500}",0.995093,0.998365,0.99912,0.995599,0.998329,0.997301,0.001629,4
24,0.483578,0.010834,0.023329,0.000346,9,100,"{'max_depth': 9, 'n_estimators': 100}",0.99482,0.998563,0.998994,0.994881,0.99894,0.99724,0.001956,5


In [38]:
grid_search.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 500,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [46]:
rf =  RandomForestClassifier(bootstrap = True,
 ccp_alpha= 0.0,
 class_weight= None,
 criterion= 'gini',
 max_depth = 9,
 max_features= 'sqrt',
 max_leaf_nodes = None,
 max_samples = None,
 min_impurity_decrease = 0.0,
 min_samples_leaf = 1,
 min_samples_split = 2,
 min_weight_fraction_leaf = 0.0,
 n_estimators = 500,
 n_jobs = None,
 oob_score = False,
 random_state = None,
 verbose = 0,
 warm_start = False)

In [47]:
rf.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [48]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

print(f"Acurácia (Treino): {accuracy_score(y_train, rf.predict(X_train))}")
print(f"Acurácia (Teste): {accuracy_score(y_test, rf.predict(X_test))}")
print("====================================")
print(f"Acurácia Balanceada (Treino): {balanced_accuracy_score(y_train, rf.predict(X_train))}")
print(f"Acurácia Balanceada (Teste): {balanced_accuracy_score(y_test, rf.predict(X_test))}")
print("====================================")
print(f"Precision (Treino): {precision_score(y_train, rf.predict(X_train))}")
print(f"Precision (Teste): {precision_score(y_test, rf.predict(X_test))}")
print("====================================")
print(f"Recall (Treino): {recall_score(y_train, rf.predict(X_train))}")
print(f"Recall (Teste): {recall_score(y_test, rf.predict(X_test))}")
print("====================================")
print(f"F1-Score (Treino): {f1_score(y_train, rf.predict(X_train))}")
print(f"F1-Score (Teste): {f1_score(y_test, rf.predict(X_test))}")
print("====================================")
print(f"ROCAUC (Treino): {roc_auc_score(y_train, rf.predict_proba(X_train)[:,1])}")
print(f"ROCAUC (Teste): {roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])}")

Acurácia (Treino): 0.99575
Acurácia (Teste): 0.993
Acurácia Balanceada (Treino): 0.9778645833333333
Acurácia Balanceada (Teste): 0.9635416666666667
Precision (Treino): 1.0
Precision (Teste): 1.0
Recall (Treino): 0.9557291666666666
Recall (Teste): 0.9270833333333334
F1-Score (Treino): 0.9773635153129161
F1-Score (Teste): 0.9621621621621622
ROCAUC (Treino): 0.9999841560656342
ROCAUC (Teste): 0.9984328908554572
