## [範例重點]
了解 sklearn 中，GridSearchCV 的使用方法與原理

In [1]:
import warnings

warnings.simplefilter('ignore')

# Datasets
from sklearn import datasets

# Preprocessing
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

# Model
from sklearn.ensemble import GradientBoostingRegressor

# Evaluation
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [2]:
# 讀取波士頓房價資料集
boston = datasets.load_boston()

# 切分訓練集/測試集
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.25, random_state=0)

# 建立 GradientBoostingRegressor 模型
gdb_reg = GradientBoostingRegressor(random_state=0)

# 訓練模型
gdb_reg.fit(X_train, y_train)

# 預測測試集
y_pred = gdb_reg.predict(X_test)

# 回歸問題的衡量採用 MSE，預設參數得到 15.3 MSE
mse = mean_squared_error(y_test, y_pred)
print('MSE: ', mse)

MSE:  15.300237820682744


In [3]:
# 設定要訓練的超參數組合
gdb_reg_param_grid = {'n_estimators': [100, 200, 300], 
                      'max_depth': [1, 3, 5]}

# 建立搜尋物件，放入模型及參數組合字典
gsgdb_reg = GridSearchCV(gdb_reg, gdb_reg_param_grid, cv=3, scoring="neg_mean_squared_error", n_jobs=2, verbose=1)

# 開始搜尋最佳參數
gsgdb_reg.fit(X_train, y_train)

# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (gsgdb_reg.best_score_, gsgdb_reg.best_params_))

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Best Accuracy: -11.430351 using {'max_depth': 3, 'n_estimators': 200}


[Parallel(n_jobs=2)]: Done  27 out of  27 | elapsed:    5.6s finished


In [5]:
# 最佳參數模型
gdb_reg_best = gsgdb_reg.best_estimator_

# 預測測試集
y_pred = gsgdb_reg.predict(X_test)

# 調整參數後約可降至 14.53 的 MSE
print(mean_squared_error(y_test, y_pred))

14.53003833297418
