# 4.1.4 Grid SearchCV

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

df = pd.read_csv("diabetes.csv")
X, y = df.drop("Outcome", axis = 1), df["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.7, shuffle = True, random_state = 42)
print("X_train.shape = {}, y_train.shape = {}".format(X_train.shape, y_train.shape))

X_train.shape = (537, 8), y_train.shape = (537,)


# Grid Search

--> 가장 우수한 성능을 보이는 모델의 하이퍼 파라미터를 찾기 위함

In [52]:
from sklearn.model_selection import GridSearchCV

model = DecisionTreeClassifier(random_state=42)
param_grid = {"max_depth": range(3,12),
              "max_features":[0.3, 0.5, 0.7, 0.9, 1]} # 이건 뭘 의미하는거지? 의사결정나무에 사용하는 feature 개수 

# cv = 조각
clf = GridSearchCV(model, param_grid=param_grid, n_jobs = -1, cv= 5,verbose = 1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


0.7359307359307359


[Parallel(n_jobs=-1)]: Done 190 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed:    0.7s finished


In [23]:
# 완전 이해했다.
pd.DataFrame(clf.cv_results_).sort_values(by = "rank_test_score").head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
12,0.007147,0.003466,0.002714,0.000382,5,0.7,"{'max_depth': 5, 'max_features': 0.7}",0.833333,0.703704,0.813084,0.766355,0.738318,0.770959,0.047487,1
41,0.007709,0.001873,0.002962,0.000398,11,0.5,"{'max_depth': 11, 'max_features': 0.5}",0.759259,0.722222,0.775701,0.738318,0.757009,0.750502,0.018451,2
13,0.011874,0.007382,0.002577,0.000161,5,0.9,"{'max_depth': 5, 'max_features': 0.9}",0.796296,0.722222,0.794393,0.682243,0.738318,0.746694,0.043723,3
2,0.005536,0.001769,0.003347,0.001031,3,0.7,"{'max_depth': 3, 'max_features': 0.7}",0.787037,0.731481,0.785047,0.682243,0.738318,0.744825,0.038824,4
3,0.031346,0.016842,0.013443,0.011481,3,0.9,"{'max_depth': 3, 'max_features': 0.9}",0.768519,0.722222,0.757009,0.728972,0.738318,0.743008,0.017308,5


# 4.5 RandomSearchCV 사용하기

- 위에 있는 Grid Search CV와의 차이점은 Grid CV는 개발자가 입력한 모든 파라미터를 다 Search함에 반해서, RandomSearchCV는 지정한 범위에서 몇개만 Search하면서 최적의 파라미터를 찾아준다.

- 시간효율에서 강하고, 심지어 성능도 더 좋다는 논문이 나왔다.

In [41]:
max_depth = np.random.randint(1,20,10)
max_depth

max_features =  np.random.uniform(0.4, 1.0, 100)
max_features

array([0.5370542 , 0.85761858, 0.89162282, 0.79715988, 0.67635147,
       0.93774713, 0.43448438, 0.68208478, 0.58763146, 0.81600601,
       0.90979837, 0.47496774, 0.52118595, 0.40829956, 0.50310176,
       0.59471687, 0.88457785, 0.98175683, 0.73854226, 0.40222497,
       0.76149095, 0.42642225, 0.44895301, 0.92664345, 0.5724593 ,
       0.89146034, 0.90524689, 0.77731385, 0.46343441, 0.91626165,
       0.61028052, 0.9528096 , 0.74782716, 0.55017858, 0.75027508,
       0.86948171, 0.48703803, 0.42847334, 0.68408179, 0.5636356 ,
       0.68617238, 0.87423904, 0.70434844, 0.50480339, 0.62706863,
       0.83173661, 0.90263009, 0.43602728, 0.84411198, 0.49734445,
       0.8593461 , 0.95685755, 0.78154735, 0.84719462, 0.40944695,
       0.74570149, 0.88874247, 0.46051959, 0.53136742, 0.81649624,
       0.75173161, 0.81266811, 0.78385138, 0.70295609, 0.95164423,
       0.9956191 , 0.40706808, 0.81245325, 0.74511317, 0.57846895,
       0.8321494 , 0.9575412 , 0.6544653 , 0.83787831, 0.43849

In [56]:
from sklearn.model_selection import RandomizedSearchCV



param_distributions = {"max_depth": max_depth,
                      "max_features":max_features,
                      "min_samples_split": list(range(2,7))
                      }
clf = RandomizedSearchCV(model,
                  param_distributions,
                  n_iter =200,
                  scoring = "accuracy",
                  n_jobs= -1,
                  cv = 5,
                  random_state = 42
                  )
clf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=200, n_jobs=-1,
                   param_distributions={'max_depth': array([17, 14, 18, 16, 14, 16,  5, 19, 14,  9]),
                                        'max_features': array([0.5370542 , 0.85761858, 0.89162282, 0.79715988, 0.67635147,
       0.93774713, 0.43448438, 0.68208478, 0.58763146, 0.81600601,
       0.90979837, 0.47496774, 0.52118595, 0.40829956, 0.50310176...
       0.8321494 , 0.9575412 , 0.6544653 , 0.83787831, 0.43849336,
       0.46767451, 0.77106967, 0.58276475, 0.45235126, 0.72333214,
       0.43013007, 0.75232221, 0.41902069, 0.78830863, 0.40884074,
       0.88696191, 0.64115161, 0.64943285, 0.82911036, 0.46227302,
       0.70515798, 0.97165795, 0.83382173, 0.8251153 , 0.83709886,
       0.72976357, 0.46549444, 0.69527673, 0.74490276, 0.4633457 ]),
                                        'min_samples_split': [2, 3, 4, 5, 6]},
                   random_state=42, sc

In [57]:
clf.best_params_

{'min_samples_split': 2, 'max_features': 0.745113167347061, 'max_depth': 5}

In [58]:
clf.score(X_test, y_test)

0.7359307359307359

In [55]:
pd.DataFrame(clf.cv_results_).sort_values(by = "rank_test_score").head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
12,0.005934,0.001156,0.004594,0.003274,5,0.7,"{'max_depth': 5, 'max_features': 0.7}",0.833333,0.703704,0.813084,0.766355,0.738318,0.770959,0.047487,1
41,0.006828,0.002846,0.002419,8.8e-05,11,0.5,"{'max_depth': 11, 'max_features': 0.5}",0.759259,0.722222,0.775701,0.738318,0.757009,0.750502,0.018451,2
13,0.005941,0.000696,0.002474,0.000678,5,0.9,"{'max_depth': 5, 'max_features': 0.9}",0.796296,0.722222,0.794393,0.682243,0.738318,0.746694,0.043723,3
2,0.006726,0.003081,0.003945,0.002002,3,0.7,"{'max_depth': 3, 'max_features': 0.7}",0.787037,0.731481,0.785047,0.682243,0.738318,0.744825,0.038824,4
3,0.005481,0.001587,0.002341,0.000382,3,0.9,"{'max_depth': 3, 'max_features': 0.9}",0.768519,0.722222,0.757009,0.728972,0.738318,0.743008,0.017308,5


# 번외: geopandas와 folium 실행 잘되는지 확인

In [None]:
import geopandas as gpd
import numpy as np
data1 = gpd.read_file('geosample.geojson')
data1

In [None]:
import folium

center = (37.64726, 126.82557)
zoom = 11

m = folium.Map(location = center, 
               zoom_start = zoom,
               tiles = 'http://api.vworld.kr/req/wmts/1.0.0/YOUR_API_KEY/Base/{z}/{y}/{x}.png',
               attr = '고양시'
              )
folium.GeoJson(data1).add_to(m)
m