# GridSearchCV
= 최고의 하이퍼파라미터를 찾기위해 사용된다.

- sklearn.model_selection.GridSearchCV
>- estimator : 학습 모델  
>- param_grid : 실행해볼 하이퍼파라미터의 목록 (dict객체 형태)  
>- scoring = None  
>- n_jobs = None : 코어의 사용  
>- refit = True  
>- cv = None : CrossValication에 사용할 나누는 개수 (default = 5)  
>- verbose = 0  (0: 메세지 출력 안함, 1: 간단한 메세지, 2: 하이퍼파라미터별 메세지 출력)
>- pre_dispatch = '2nd_jobs'  
>- error_score = nan  
>- return_train_score = False  

In [51]:
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.model_selection import GridSearchCV

help(GridSearchCV)

Help on class GridSearchCV in module sklearn.model_selection._search:

class GridSearchCV(BaseSearchCV)
 |  GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)
 |  
 |  Exhaustive search over specified parameter values for an estimator.
 |  
 |  Important members are fit, predict.
 |  
 |  GridSearchCV implements a "fit" and a "score" method.
 |  It also implements "score_samples", "predict", "predict_proba",
 |  "decision_function", "transform" and "inverse_transform" if they are
 |  implemented in the estimator used.
 |  
 |  The parameters of the estimator used to apply these methods are optimized
 |  by cross-validated grid-search over a parameter grid.
 |  
 |  Read more in the :ref:`User Guide <grid_search>`.
 |  
 |  Parameters
 |  ----------
 |  estimator : estimator object
 |      This is assumed to implement the scikit-learn estimator interface.
 |      Either est

# iris dataset

In [5]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
iris = load_iris()

iris_df = pd.DataFrame(data = iris.data, columns = iris.feature_names)
iris_df["target"] = iris.target

iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [4]:
iris_df["target"].to_numpy()

# 일부 데이터셋의 경우 target의 class가 다음과 같은 형태로 존재할 수도 있다.
# 이처럼 존재할 경우 교차검증을위해 나눌때 한 class로 몰리게될 수 있기 때문에
# 섞어주는 과정이 필요하다.

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
iris_df = iris_df.sample(frac = 1, random_state = 0)

In [25]:
X = iris_df.iloc[:,:-1]
y = iris_df["target"]

In [7]:
iris_df["target"].to_numpy()

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,
       0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1,
       1, 2, 0, 0, 2, 1, 0, 0, 1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 0,
       0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 2, 1, 2, 1, 0, 2, 0, 2, 0, 0, 2, 0, 2, 1, 1, 1, 2, 2,
       1, 1, 0, 1, 2, 2, 0, 1, 1, 1, 1, 0, 0, 0, 2, 1, 2, 0])

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size = 0.2)

In [16]:
print("data shape : ", iris_df.shape)
print("train shape : ", X_train.shape)
print("test shape : ",X_test.shape)

data shape :  (150, 5)
train shape :  (120, 4)
test shape :  (30, 4)


In [None]:
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_sclaed = scaler.fit_transform(X_train)
X_test_sclaed = scaler.transform(X_test)

# GridSearchCV (X)

In [21]:
model = KNeighborsClassifier().fit(X_train, y_train)

print("train score : {:.3f}".format(model.score(X_train, y_train)))
print("test score : {:.3f}".format(model.score(X_test, y_test)))

train score : 0.975
test score : 0.967


# GridSearchCV (O)

##  모델의 파라미터 조회

In [22]:
help(KNeighborsClassifier)

Help on class KNeighborsClassifier in module sklearn.neighbors._classification:

class KNeighborsClassifier(sklearn.neighbors._base.KNeighborsMixin, sklearn.base.ClassifierMixin, sklearn.neighbors._base.NeighborsBase)
 |  KNeighborsClassifier(n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
 |  
 |  Classifier implementing the k-nearest neighbors vote.
 |  
 |  Read more in the :ref:`User Guide <classification>`.
 |  
 |  Parameters
 |  ----------
 |  n_neighbors : int, default=5
 |      Number of neighbors to use by default for :meth:`kneighbors` queries.
 |  
 |  weights : {'uniform', 'distance'} or callable, default='uniform'
 |      Weight function used in prediction.  Possible values:
 |  
 |      - 'uniform' : uniform weights.  All points in each neighborhood
 |        are weighted equally.
 |      - 'distance' : weight points by the inverse of their distance.
 |        in this case, closer neighbors of

### 시도해볼 parameter 설정하기

In [53]:
params = {"n_neighbors" : range(3, 10),
         "p" : [1, 2]}

### gs 객체 만들기

In [32]:
gs = GridSearchCV(estimator = KNeighborsClassifier(),
                            param_grid = params,
                            cv = 5, n_jobs = -1,
                            verbose = 1)

### gs 학습하기

In [33]:
gs.fit(X, y)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': range(3, 10), 'p': [1, 2]}, verbose=1)

### gs의 주요 attribute 확인하기

In [34]:
print(dir(gs))

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_feature_names', '_check_n_features', '_check_refit_for_multimetric', '_estimator_type', '_format_results', '_get_param_names', '_get_tags', '_more_tags', '_pairwise', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_run_search', '_select_best_index', '_validate_data', 'best_estimator_', 'best_index_', 'best_params_', 'best_score_', 'classes_', 'cv', 'cv_results_', 'decision_function', 'error_score', 'estimator', 'feature_names_in_', 'fit', 'get_params', 'inverse_transform', 'multimetric_', 'n_features_in_', 'n_jobs', 'n_splits_', 'param_gri

### cv별 결과 확인하기

In [38]:
result = pd.DataFrame(gs.cv_results_)
result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004388,0.001017,0.00738,0.00249176,3,1,"{'n_neighbors': 3, 'p': 1}",1.0,0.866667,1.0,1.0,0.933333,0.96,0.053333,7
1,0.004188,0.000977,0.004987,4.623108e-07,3,2,"{'n_neighbors': 3, 'p': 2}",1.0,0.9,1.0,1.0,0.933333,0.966667,0.042164,3
2,0.008577,0.009241,0.006183,0.001162688,4,1,"{'n_neighbors': 4, 'p': 1}",0.966667,0.866667,1.0,1.0,0.933333,0.953333,0.049889,10
3,0.006583,0.006693,0.006383,0.001352705,4,2,"{'n_neighbors': 4, 'p': 2}",1.0,0.866667,1.0,1.0,0.966667,0.966667,0.05164,3
4,0.00738,0.003969,0.00758,0.002239276,5,1,"{'n_neighbors': 5, 'p': 1}",1.0,0.866667,1.0,1.0,0.933333,0.96,0.053333,7
5,0.004389,0.001352,0.007579,0.004259111,5,2,"{'n_neighbors': 5, 'p': 2}",1.0,0.933333,1.0,1.0,0.933333,0.973333,0.03266,1
6,0.005585,0.00371,0.005186,0.0003991131,6,1,"{'n_neighbors': 6, 'p': 1}",0.966667,0.866667,1.0,1.0,0.933333,0.953333,0.049889,10
7,0.003591,0.000489,0.00718,0.002221142,6,2,"{'n_neighbors': 6, 'p': 2}",1.0,0.866667,1.0,1.0,0.966667,0.966667,0.05164,3
8,0.003791,0.000746,0.005185,0.0009771066,7,1,"{'n_neighbors': 7, 'p': 1}",0.966667,0.866667,1.0,1.0,0.933333,0.953333,0.049889,10
9,0.00379,0.001163,0.005784,0.0009775837,7,2,"{'n_neighbors': 7, 'p': 2}",1.0,0.933333,1.0,1.0,0.933333,0.973333,0.03266,1


### 최적의 모델 만들기

In [50]:
model = gs.best_estimator_

### 최적의 파라미터와, 해당 파라미터 사용시 모델의 점수 확인하기

In [52]:
print("best parameter :", gs.best_params_)
print("model score :" ,model.score(X_test, y_test))

best parameter : {'n_neighbors': 5, 'p': 2}
model score : 0.9666666666666667
