# Otimização dos Parâmetros com Randomized Search

## Extremely Randomized Forest

In [1]:
# Open a prompt or terminal and run the command below (answer yes when asked), then restart the Jupyter Notebook kernel
# conda update scikit-learn

In [2]:
# Display the installed scikit-learn version (the bare last expression becomes the cell output)
import sklearn as sl
sl.__version__

'0.24.2'

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
#from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_score
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Read the credit dataset from the Excel workbook, skipping its first row
data = pd.read_excel('data/credit.xls', skiprows = 1)

# Name of the target column and the predictor columns (everything except ID and the target)
target = 'default payment next month'
features = data.columns.drop(['ID', target])

# Predictors and target as NumPy arrays
X = np.asarray(data[features])
y = np.asarray(data[target])

# Train/test split: 30% of the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.30,
    random_state = 99,
)

# Extremely randomized trees classifier with 50 estimators and a fixed seed
clf = ExtraTreesClassifier(n_estimators = 50, random_state = 99)

# Fit the model on the training portion
clf.fit(X_train, y_train)

# 3-fold cross-validated accuracy on the training portion, using all available cores
scores = cross_val_score(
    clf,
    X_train,
    y_train,
    scoring = 'accuracy',
    cv = 3,
    n_jobs = -1,
)

# Report mean and standard deviation of the fold accuracies
print("ExtraTreesClassifier -> Acurácia: Média = %0.3f Desvio Padrão = %0.3f" % (scores.mean(), scores.std()))

# Predict the held-out rows with the fitted model
y_pred = clf.predict(X_test)

# Confusion matrix of true vs. predicted labels on the test set
confusionMatrix = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(confusionMatrix)

# Test-set accuracy (displayed as the cell output)
accuracy_score(y_true = y_test, y_pred = y_pred)


ExtraTreesClassifier -> Acurácia: Média = 0.808 Desvio Padrão = 0.003
[[6539  439]
 [1306  716]]


0.8061111111111111

## Otimização dos Parâmetros com Randomized Search

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

O Randomized Search gera amostras dos parâmetros dos algoritmos a partir de uma distribuição aleatória (uniforme, quando os valores são informados como listas) para um número fixo de iterações. Um modelo é construído e testado para cada combinação de parâmetros amostrada.

In [4]:
# Import
from sklearn.model_selection import RandomizedSearchCV

In [5]:
# Candidate values for each hyperparameter of the classifier
param_dist = {"max_depth": [1, 3, 7, 8, 12, None],
              "max_features": [8, 9, 10, 11, 16, 22],
              "min_samples_split": [8, 10, 11, 14, 16, 19],
              "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7],
              "bootstrap": [True, False]}

# Randomized search over the classifier created in the previous cell: 25 parameter
# combinations are sampled from param_dist. random_state fixes which combinations
# are drawn, so the cell gives the same result on every run (same seed value the
# notebook uses for the train/test split and the classifier).
rsearch = RandomizedSearchCV(clf, param_distributions = param_dist, n_iter = 25, random_state = 99)

# Run the search on the training data
rsearch.fit(X_train, y_train)

# Best estimator found by the search
bestclf = rsearch.best_estimator_
print(bestclf)

# Use the best estimator to predict the held-out test set
y_pred = bestclf.predict(X_test)

# Confusion matrix of true vs. predicted labels on the test set
confusionMatrix = confusion_matrix(y_test, y_pred)
print(confusionMatrix)

# Test-set accuracy
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

ExtraTreesClassifier(bootstrap=True, max_depth=7, max_features=16,
                     min_samples_leaf=6, min_samples_split=16, n_estimators=50,
                     random_state=99)
[[6640  338]
 [1277  745]]
0.8205555555555556


In [6]:
# Display the full cross-validation results of the randomized search
# (timings, parameters and scores for every sampled parameter combination)
rsearch.cv_results_

{'mean_fit_time': array([0.3894033 , 0.4258224 , 1.11215353, 0.13680091, 0.38179512,
        0.22700562, 0.16315932, 0.20385022, 0.19268661, 0.4444171 ,
        0.37219863, 0.19267831, 0.58722939, 0.14440856, 0.47911201,
        0.74720688, 0.52461371, 0.72127104, 0.39274931, 0.96882529,
        0.7081069 , 0.97220492, 0.32154384, 0.33710403, 0.63570533]),
 'std_fit_time': array([0.02118026, 0.0103942 , 0.03179246, 0.00230589, 0.00653754,
        0.02228859, 0.03661781, 0.0391953 , 0.03456227, 0.06095281,
        0.01841421, 0.00291576, 0.04832508, 0.00298436, 0.00421068,
        0.12311374, 0.00581091, 0.00651121, 0.00375881, 0.01180122,
        0.00334605, 0.00896866, 0.0036807 , 0.00432402, 0.00559155]),
 'mean_score_time': array([0.02120328, 0.02095594, 0.03849049, 0.0119575 , 0.01794744,
        0.01495442, 0.01536317, 0.01656036, 0.01296334, 0.02055001,
        0.01935973, 0.01196809, 0.0271277 , 0.0117784 , 0.01856279,
        0.02692895, 0.02591376, 0.020152  , 0.02014632, 0.04

## Grid Search x Randomized Search para Estimação dos Hiperparâmetros

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

O Grid Search avalia metodicamente todas as combinações dos valores informados para cada parâmetro do algoritmo, criando um grid.

In [7]:
import numpy as np
from time import time
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits

# Load the digits dataset.
# NOTE: X, y (and clf below) rebind the names used earlier for the credit dataset
# and the ExtraTreesClassifier; every cell from here on works on the digits data.
digits = load_digits()
X, y = digits.data, digits.target

# Random forest with 20 trees. random_state is fixed (same seed value used for the
# earlier classifier) so the searches that reuse this estimator are reproducible.
clf = RandomForestClassifier(n_estimators = 20, random_state = 99)

In [8]:
# Randomized Search

# Parameter values to be tested: lists are sampled from directly, while
# sp_randint(1, 11) draws integers from 1 to 10 (the upper bound is exclusive)
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Run the randomized search. random_state fixes the sampled candidates so the
# same 20 parameter combinations are evaluated on every run.
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions = param_dist, n_iter = n_iter_search,
                                   random_state = 99)

# Time the whole search (fit of every candidate on every fold)
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV executou em %.2f segundos para %d candidatos a parâmetros do modelo." 
      % ((time() - start), n_iter_search))

# Display the parameter combinations and their respective cross-validation results
random_search.cv_results_

RandomizedSearchCV executou em 4.08 segundos para 20 candidatos a parâmetros do modelo.


{'mean_fit_time': array([0.04388328, 0.0325141 , 0.03242402, 0.03610945, 0.03798733,
        0.02732177, 0.02830849, 0.06343026, 0.02354255, 0.02832966,
        0.05009956, 0.03018017, 0.02358856, 0.02274861, 0.02533269,
        0.06614728, 0.02951488, 0.03510666, 0.03449764, 0.05416093]),
 'std_fit_time': array([0.01780769, 0.00232606, 0.00131198, 0.00114859, 0.00254211,
        0.00150212, 0.00135641, 0.01817964, 0.00118128, 0.00162246,
        0.00095896, 0.0012527 , 0.0007864 , 0.00172212, 0.0010328 ,
        0.00161734, 0.00134838, 0.00073927, 0.00079972, 0.00191742]),
 'mean_score_time': array([0.00398898, 0.00279126, 0.00319028, 0.00298581, 0.00319681,
        0.00339637, 0.00281377, 0.00338521, 0.00278587, 0.00298643,
        0.00428109, 0.00318489, 0.00279112, 0.00298409, 0.00319066,
        0.00279245, 0.00239377, 0.00259762, 0.00220423, 0.00278864]),
 'std_score_time': array([8.92016775e-04, 7.45868640e-04, 7.38809129e-04, 1.09367361e-05,
        7.46096309e-04, 1.02079910e-

In [9]:
# Grid Search

# Full grid of values: 2 * 3 * 3 * 2 * 2 = 72 parameter combinations
param_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

# Exhaustive search over the grid, using the classifier built earlier
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid)

# Time the whole search
start = time()
grid_search.fit(X, y)
print(
    "GridSearchCV executou em %.2f segundos para todas as combinações de candidatos a parâmetros do modelo."
    % (time() - start)
)

# Display the cross-validation results for every combination
grid_search.cv_results_

GridSearchCV executou em 15.99 segundos para todas as combinações de candidatos a parâmetros do modelo.


{'mean_fit_time': array([0.03040919, 0.02721882, 0.0252636 , 0.02832952, 0.02791991,
        0.03091803, 0.03690114, 0.0367012 , 0.03580613, 0.03630929,
        0.03121371, 0.02793107, 0.04088602, 0.03709588, 0.03531785,
        0.06522007, 0.06094489, 0.05485721, 0.02813492, 0.02792001,
        0.02932258, 0.03151646, 0.03052444, 0.02812467, 0.0427825 ,
        0.04895153, 0.04208722, 0.04408827, 0.03769965, 0.03031893,
        0.06462269, 0.04399896, 0.03879938, 0.08237906, 0.07799406,
        0.07456398, 0.02214174, 0.02154222, 0.02373114, 0.02613597,
        0.02393708, 0.02413568, 0.03630357, 0.03621707, 0.03621697,
        0.03889604, 0.028021  , 0.02464256, 0.04326544, 0.03948884,
        0.03531146, 0.07780294, 0.07045884, 0.06147165, 0.02072783,
        0.02134857, 0.02094507, 0.02493353, 0.0247333 , 0.02504077,
        0.04040575, 0.04088707, 0.03938994, 0.04986424, 0.03351173,
        0.02552719, 0.05524726, 0.05097089, 0.03930202, 0.09346085,
        0.08607092, 0.08198051]

In [10]:
# Display the grid search cross-validation results again (same attribute shown at the end of the previous cell)
grid_search.cv_results_

{'mean_fit_time': array([0.03040919, 0.02721882, 0.0252636 , 0.02832952, 0.02791991,
        0.03091803, 0.03690114, 0.0367012 , 0.03580613, 0.03630929,
        0.03121371, 0.02793107, 0.04088602, 0.03709588, 0.03531785,
        0.06522007, 0.06094489, 0.05485721, 0.02813492, 0.02792001,
        0.02932258, 0.03151646, 0.03052444, 0.02812467, 0.0427825 ,
        0.04895153, 0.04208722, 0.04408827, 0.03769965, 0.03031893,
        0.06462269, 0.04399896, 0.03879938, 0.08237906, 0.07799406,
        0.07456398, 0.02214174, 0.02154222, 0.02373114, 0.02613597,
        0.02393708, 0.02413568, 0.03630357, 0.03621707, 0.03621697,
        0.03889604, 0.028021  , 0.02464256, 0.04326544, 0.03948884,
        0.03531146, 0.07780294, 0.07045884, 0.06147165, 0.02072783,
        0.02134857, 0.02094507, 0.02493353, 0.0247333 , 0.02504077,
        0.04040575, 0.04088707, 0.03938994, 0.04986424, 0.03351173,
        0.02552719, 0.05524726, 0.05097089, 0.03930202, 0.09346085,
        0.08607092, 0.08198051]