# Encontrando hiperparâmetros ótimos para uma árvore de decisão

In [None]:
import numpy as np
import  pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the dataset from the CSV file.
df = pd.read_csv('df_2.csv')
# Bare expression: in a notebook it is only displayed when it is the last
# statement of its cell.
df.columns

# Feature matrix: the predictor columns fed to the models.
x = df.loc[:, [
    'LIMIT_BAL', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
    'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
    'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]]

# Target vector.
y = df['default payment next month']

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
x_treino, x_teste, y_treino, y_teste = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Candidate values of max_depth to evaluate.
params = {'max_depth': [1, 2, 4, 6, 8, 10, 12]}

# Instantiate the decision tree. random_state is fixed (same seed as the
# train/test split) so that tie-breaking between equally good splits does not
# make the scores change from run to run.
dt = DecisionTreeClassifier(random_state=1)
# NOTE: GridSearchCV fits clones of `dt`, so this fit has no influence on the
# search below; it is kept only so that `dt` itself remains a fitted estimator.
dt.fit(x_treino, y_treino)

# Instantiate the grid search: 4-fold cross-validation scored by ROC AUC, with
# training scores recorded so train and validation curves can be compared.
# pre_dispatch is left at its default: the search runs sequentially
# (n_jobs=None), so it has no effect, and passing None is rejected by the
# parameter validation of recent scikit-learn releases.
cv = GridSearchCV(
    dt,
    param_grid=params,
    scoring='roc_auc',
    n_jobs=None,
    refit=True,
    cv=4,
    verbose=1,
    error_score=np.nan,
    return_train_score=True,
)
cv.fit(x_treino, y_treino)

# Collect the cross-validation results in a DataFrame for inspection.
cv_results_df = pd.DataFrame(cv.cv_results_)
cv_results_df

In [None]:
# Plot the mean cross-validation scores against max_depth, using the standard
# deviation across folds as the error bars.
ax = plt.axes()
for mean_col, std_col, curve_label in (
    ('mean_train_score', 'std_train_score', 'train score'),
    ('mean_test_score', 'std_test_score', 'testing score'),
):
    ax.errorbar(
        cv_results_df['param_max_depth'],
        cv_results_df[mean_col],
        yerr=cv_results_df[std_col],
        label=curve_label,
    )
ax.legend()
ax.set_xlabel('max_depth')
ax.set_ylabel('ROC AUC')

# Ajustando uma floresta aleatória

In [None]:
# Instantiate the random forest.
# max_features='sqrt' replaces the former 'auto' value, which meant the same
# thing for classifiers but was deprecated and later removed from scikit-learn.
rf = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=1,
    verbose=0,
    warm_start=False,
    class_weight=None,
)

# Numbers of trees to evaluate: 10, 20, ..., 100.
rf_params = {'n_estimators': list(range(10, 110, 10))}

# Cross-validated grid search over the forest size (4 folds, ROC AUC).
# pre_dispatch is left at its default: the search runs sequentially, so it has
# no effect, and passing None is rejected by the parameter validation of recent
# scikit-learn releases.
cv_rf_ex = GridSearchCV(
    rf,
    param_grid=rf_params,
    scoring='roc_auc',
    cv=4,
    verbose=1,
    error_score=np.nan,
    return_train_score=True,
)
cv_rf_ex.fit(x_treino, y_treino)

# Collect the cross-validation results in a DataFrame for inspection.
cv_results_rf = pd.DataFrame(cv_rf_ex.cv_results_)

# Best parameter combination found by the search.
cv_rf_ex.best_params_

In [None]:
# Build a DataFrame of the feature importances of the best (refit) forest.
# Each importance is paired with the name of the training column it belongs to,
# so the ranking stays readable after sorting. 'Importance' remains the first
# column and the integer index still holds the original column position.
feat_imp_df = pd.DataFrame({
    'Importance': cv_rf_ex.best_estimator_.feature_importances_,
    'Feature': x_treino.columns,
})
# Most important features first.
feat_imp_df = feat_imp_df.sort_values('Importance', ascending=False)