# Parameter (매개변수) 튜닝을 통한 모델 성능 개선


- 다음과 같은 주로 사용하는 방법들이 있음.
    1. GridSearchCV: 파라미터 후보군을 미리 정의해서 반복문을 통해 최적값을 찾음.
    2. RandomizedSearchCV: 파라미터 분포를 주고 난수를 생성시키면서 반복문을 통해 최적값을 찾음.

## GridSearchCV
- scikit-learn에서 제공
- 이를 사용하기 위해서는 먼저 dictionary 형태로 파라미터 후보군 (검색 대상 매개변수)을 지정해야한다.
- GridSearchCv의 객체 생성


In [1]:
# 객체명.score(X_test, Y_test): 모델 정확도 점수
# 객체명.best_params_: 최적 parameter 값
# 객체명.best_score_: 최고 정확도 값

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn import metrics

%matplotlib inline

In [3]:
# Read the data
data = pd.read_csv('data/bikeshare.csv')

# Year와 Month를 추출
datetime = pd.DatetimeIndex(data['datetime'])
data['year'] = datetime.year
data['month'] = datetime.month
data['hour'] = datetime.hour

# "count" is a method, so it's best to name that column something else
data.rename(columns={'count':'total'}, inplace=True)

# Handling 'season' variable
season_dummies = pd.get_dummies(data.season, prefix='season')
season_dummies.drop(season_dummies.columns[0], axis=1, inplace=True)
data = pd.concat([data, season_dummies], axis=1)

# Add derivative variable "daytime"
data['daytime'] = ((data.hour > 6) & (data.hour < 21)).astype(int)

# Handling 'hour' variable
hour_dummies = pd.get_dummies(data.hour, prefix='hour')
hour_dummies.drop(hour_dummies.columns[0], axis=1, inplace=True)
data = pd.concat([data, hour_dummies], axis=1)

In [4]:
data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,...,0,0,0,0,0,0,0,0,0,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,...,0,0,0,0,0,0,0,0,0,0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,...,0,0,0,0,0,0,0,0,0,0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,...,0,0,0,0,0,0,0,0,0,0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,...,0,0,0,0,0,0,0,0,0,0


### GridSearchCV의 기본 원리

In [5]:
X = data.drop(['datetime','casual','registered','total'], axis = 1)
Y = data.total

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=123)

best_score=0

for alpha in [0.1, 0.5, 1.0, 10.0, 100.0, 1000.0]:
    # 각 파라미터 후보군에 대해 Ridge 모델 훈련
    model = Ridge(alpha=alpha)
    model.fit(X_train, Y_train)
    # 테스트 세트로 model 평가
    rsquared_test = model.score(X_test, Y_test)
    # 점수가 더 높으면 파라미터와 함께 기록
    if rsquared_test > best_score:
        best_score = rsquared_test
        best_parameter = {'alpha': alpha}
    
    print('%f: %f' % (alpha, rsquared_test))

print("best score: ", best_score)
print("best parameter: ", best_parameter)

0.100000: 0.677364
0.500000: 0.677383
1.000000: 0.677406
10.000000: 0.677690
100.000000: 0.673554
1000.000000: 0.590988
best score:  0.677689829252621
best parameter:  {'alpha': 10.0}


### 교차 검증을 사용한 GridSearchCV

In [6]:
# 파라미터 후보군 정의
param_grid = {'alpha': [0.1, 0.5, 1.0, 10.0, 100.0, 1000.0]}

# 클래스로부터 객체 생성
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(Ridge(), param_grid, cv=5)

In [7]:
# 데이터 세트 나누기 (훈련/테스트)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=123)

# 훈련 데이터 fit
grid_search.fit(X_train, Y_train)


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [0.1, 0.5, 1.0, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [8]:
grid_search.best_params_

{'alpha': 1.0}

In [9]:
grid_search.score(X_test, Y_test)

0.6774056546703342

In [10]:
grid_search.cv_results_



{'mean_fit_time': array([0.01700001, 0.00520005, 0.00520005, 0.00500011, 0.00580006,
        0.00560002]),
 'std_fit_time': array([1.95345773e-02, 3.99994889e-04, 3.99994889e-04, 2.13248060e-07,
        3.99971037e-04, 4.89920871e-04]),
 'mean_score_time': array([0.00159993, 0.00099998, 0.00120001, 0.00099998, 0.00119996,
        0.00099998]),
 'std_score_time': array([7.99906259e-04, 6.32409699e-04, 4.00042545e-04, 9.53674316e-08,
        4.00066376e-04, 9.53674316e-08]),
 'param_alpha': masked_array(data=[0.1, 0.5, 1.0, 10.0, 100.0, 1000.0],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 0.1},
  {'alpha': 0.5},
  {'alpha': 1.0},
  {'alpha': 10.0},
  {'alpha': 100.0},
  {'alpha': 1000.0}],
 'split0_test_score': array([0.68209406, 0.6820692 , 0.68203728, 0.68132333, 0.66796847,
        0.56148803]),
 'split1_test_score': array([0.67570686, 0.67572498, 0.67574676, 0.67599063, 0.66987066,
        0.

# RandomizedSearchCV

In [12]:
from sklearn.model_selection import RandomizedSearchCV
# GridSearch와의 차이는 몇 번 후보군을 찾겠는지를 포함하고 있다는 것(n_iter)

In [13]:
params = {'alpha':[10**i for i in range(-10, 10)]} # 얘도 아직까지는 격자형태
# from scipy.stats import expon
# param = {'alpha': expon.ppf([0.001, 0.5, 0.999])}  # 얘는 분포로 하는 것

In [14]:
rand_search = RandomizedSearchCV(Ridge(), params, n_iter = 10, cv = 5)

In [16]:
rand_search.fit(X_train, Y_train)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number2.106689e-17
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.918493e-17
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.925633e-17
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.701906e-17
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number2.124389e-17
  overwrite_a=True).T


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'alpha': [1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [17]:
rand_search.score(X_test, Y_test)

0.6773637071541074