### 12.1 완전 탐색을 사용해 최선의 모델 선택하기

In [1]:
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV #모델 선택도와주는 library
import warnings
warnings.filterwarnings("ignore")

In [2]:
#데이터 로드
iris = datasets.load_iris()
features = iris.data
target = iris.target

In [3]:
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [4]:
#선형모델 중에 로지스틱 사용(정균분포와 비슷)
logistic = linear_model.LogisticRegression()

#하이퍼 파라미터조정(사용자가 정의하는 변수값)
penalty = ['l1', 'l2']#엘원, 엘투

In [5]:
np.logspace(0, 4, 10)

array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04])

In [6]:
logistic.fit(features, target)
logistic.score(features, target)

0.9733333333333334

In [7]:
C = np.logspace(0, 4, 10)#(start, stop, num) 규제 하이퍼 파라미터 값의 후보 범위를 만든다.

In [8]:
hyperparameters = dict(C=C, penalty=penalty)#하이퍼 파라미터 후보로 딕셔너리 만들기
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)#0 침묵 1: = 2: 1/10
best_model = gridsearch.fit(features, target)
print('가장 좋은 페널티:', best_model.best_estimator_.get_params()['penalty'])
print('가장 좋은 C 값:', best_model.best_estimator_.get_params()['C'])

가장 좋은 페널티: l2
가장 좋은 C 값: 7.742636826811269


In [9]:
best_model.score(features, target)

0.9866666666666667

In [10]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### 12.2 랜덤 탐색을 사용해 최선의 모델

In [11]:
from scipy.stats import uniform
from sklearn import linear_model, datasets
from sklearn.model_selection import RandomizedSearchCV

In [12]:
iris = datasets.load_iris()
features = iris.data
target = iris.target

In [13]:
logistic = linear_model.LogisticRegression()
penalty = ['l1', 'l2']
C = uniform(loc=0, scale=4)#0에서 4사이로 주기 
hyperparameters = dict(C=C, penalty = penalty)

In [14]:
randomizedsearch = RandomizedSearchCV(logistic, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1)

In [15]:
best_model = randomizedsearch.fit(features, target)

In [16]:
print('가장 좋은 페널티:', best_model.best_estimator_.get_params()['penalty'])
print('가장 좋은 C 값:', best_model.best_estimator_.get_params()['C'])

가장 좋은 페널티: l2
가장 좋은 C 값: 3.730229437354635


In [17]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### 12.3 여러 학습 알고리즘에서 최선의 모델 선택하기


In [18]:
import numpy as np
from sklearn import datasets 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [19]:
np.random.seed(0)

In [20]:
iris = datasets.load_iris()
features = iris.data
target = iris.target

In [21]:
pipe = Pipeline([("classifier", RandomForestClassifier())])

In [22]:
search_space = [{"classifier": [LogisticRegression()],
                  "classifier__penalty": ['l1', 'l2'],
                 "classifier__C": np.logspace(0, 4, 10)},
                {"classifier": [RandomForestClassifier()],
                "classifier__n_estimators" : [10, 100, 1000],
                 "classifier__max_features": [1, 2, 3]}]

In [23]:
gridsearch = GridSearchCV(pipe, search_space, cv=5, verbose=0)

In [24]:
best_model = gridsearch.fit(features, target)

In [25]:
best_model.best_estimator_.get_params()["classifier"]

LogisticRegression(C=7.742636826811269)

In [26]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### 12.4 전처리와 함께 최선의 모델 선택하기 

In [27]:
import numpy as np 
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA #principal componets analysis 
from sklearn.preprocessing import StandardScaler

In [28]:
np.random.seed(0)

In [29]:
iris = datasets.load_iris()
features = iris.data
target = iris.target

In [30]:
#전처리 객체 만들기
preprocess = FeatureUnion([("std", StandardScaler()), ("pca", PCA())])
pipe = Pipeline([("preprocess", preprocess), ("classifier", LogisticRegression())])

In [31]:
#후보값을 정의
search_space = [{"preprocess__pca__n_components": [1, 2, 3],
                "classifier__penalty": ["l1", "l2"],
                "classifier__C": np.logspace(0, 4, 10)}]
#그리드 서치 객체 만들기
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)

In [32]:
#그리드 서치를 수행
best_model = clf.fit(features, target)

In [33]:
best_model.best_estimator_.get_params()['preprocess__pca__n_components']

2

In [34]:
clf.best_score_

0.9800000000000001

In [35]:
clf.best_estimator_.named_steps["preprocess"].transform(features[0:1])

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 , -2.68412563,
         0.31939725]])

In [36]:
pipe = Pipeline([("std", StandardScaler()),
                ("pca", PCA()),
                ("classifier", LogisticRegression())],
               memory = 'cache')

In [37]:
search_space = [{"pca__n_components": [1, 2, 3],
                "classifier__penalty": ["l1", "l2"],
                "classifier__C": np.logspace(0, 4, 10)}]
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)
best_model = clf.fit(features, target)
clf.best_score_

0.9733333333333334

In [38]:
clf.best_estimator_.get_params()['pca__n_components']

3

In [39]:
clf.best_estimator_.named_steps["pca"].transform(features[0:1])

array([[ 2.64026976,  5.2040413 , -2.48862071]])

### 12.5 병렬화로 모델 선택 속도 높이기

In [40]:
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV



In [41]:
iris = datasets.load_iris()
features = iris.data
target = iris.target
logistic = linear_model.LogisticRegression()
penalty = ["l1", "l2"]
C = np.logspace(0, 4, 1000)
hyperparameters = dict(C=C, penalty = penalty)

In [42]:
iris = datasets.load_iris()
features = iris.data
target = iris.target

In [43]:
logistic = linear_model.LogisticRegression()
penalty = ["l1", "l2"]
C = np.logspace(0, 4, 1000)

In [44]:
hyperparameters = dict(C=C, penalty = penalty)

In [45]:
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, n_jobs= -1, verbose =1)

In [46]:
best_model = gridsearch.fit(features, target)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 728 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 1728 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done 3128 tasks      | elapsed:   43.5s
[Parallel(n_jobs=-1)]: Done 4928 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 7128 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 9728 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 9993 out of 10000 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:  2.3min finished


### 12.6 알고리즘에 특화된 기법을 사용하여 모델 선택 수행 속도 높이기 

In [47]:
from sklearn import linear_model, datasets
iris = datasets.load_iris()
features = iris.data
target = iris.target

In [48]:
logit = linear_model.LogisticRegressionCV(Cs=100)
logit.fit(features, target)

LogisticRegressionCV(Cs=100)

### 모델 선택 후 성능 평가하기 

In [49]:
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV, cross_val_score 

In [50]:
iris = datasets.load_iris()
features = iris.data
target = iris.target

In [51]:
logistic = linear_model.LogisticRegression(solver ='liblinear', multi_class = 'auto')
C = np.logspace(0, 4, 20)
hyperparameters = dict(C=C)
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, n_jobs=-1, verbose =0, iid=False)

In [52]:
cross_val_score(gridsearch, features, target, cv=3).mean()

0.9733333333333333

In [53]:
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=1, iid = False)

In [54]:
best_model=gridsearch.fit(features, target)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


In [55]:
scores = cross_val_score(gridsearch, features, target, cv=3)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished
