### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [10]:
import numpy as np
from sklearn import datasets, metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split,GridSearchCV

In [3]:
diabetes = datasets.load_diabetes()
x_train, x_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, test_size=0.1, random_state=42)
x_train.shape

(397, 10)

In [4]:
y_train[:10]

array([ 52., 200.,  87.,  90., 258., 136., 158.,  69.,  72., 171.])

In [6]:
model = GradientBoostingRegressor()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE: ", mse)

MSE:  2681.0216117395807


### Grid Search

In [8]:
n_estimators = [100, 200, 300, 400, 500]
learning_rate = [0.001, 0.01, 0.1, 1, 10]
max_depth = [1, 3, 5, 7, 9]
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

# n_job=-1 means using all processors
grid_search = GridSearchCV(model, param_grid, scoring="neg_mean_squared_error", cv=5, 
                           n_jobs=-1, verbose=1)
grid_result = grid_search.fit(x_train, y_train)

print("Best MSE: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 5 folds for each of 125 candidates, totalling 625 fits


 -4.87624418e+003 -5.53918396e+003 -5.16560385e+003 -4.86149234e+003
 -4.61729297e+003 -4.40678921e+003 -5.48536234e+003 -5.10142399e+003
 -4.80789621e+003 -4.57654078e+003 -4.40006656e+003 -5.51468906e+003
 -5.14608364e+003 -4.86103355e+003 -4.64000395e+003 -4.46293959e+003
 -5.51208384e+003 -5.17239511e+003 -4.90561090e+003 -4.71813744e+003
 -4.58102773e+003 -4.33647626e+003 -3.76653234e+003 -3.50710011e+003
 -3.39053801e+003 -3.33746753e+003 -3.78462630e+003 -3.47657951e+003
 -3.48347882e+003 -3.49147669e+003 -3.53513557e+003 -3.87314520e+003
 -3.67760978e+003 -3.70506206e+003 -3.75777502e+003 -3.78455889e+003
 -4.09156974e+003 -4.09215434e+003 -4.12229543e+003 -4.14414703e+003
 -4.15107276e+003 -4.29886377e+003 -4.43162988e+003 -4.58766515e+003
 -4.63968024e+003 -4.65688433e+003 -3.28649850e+003 -3.30882069e+003
 -3.35114568e+003 -3.39180869e+003 -3.43832908e+003 -3.73076406e+003
 -3.99942021e+003 -4.13481272e+003 -4.21054930e+003 -4.26225634e+003
 -4.06010836e+003 -4.11468377e+003

Best MSE: -3286.498505 using {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 100}


In [9]:
model_bestparam = GradientBoostingRegressor(max_depth=grid_result.best_params_['max_depth'],
                                           n_estimators=grid_result.best_params_['n_estimators'],
                                           learning_rate=grid_result.best_params_['learning_rate'])
model_bestparam.fit(x_train, y_train)
y_pred = model_bestparam.predict(x_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE: ", mse)

MSE:  2573.3901834613707


### Random Search

In [14]:
num_sample = 100
learning_rate_sample_list = np.random.uniform(low=0.0, high=1.0, size=num_sample)
n_estimators_sample_list = np.random.randint(low=100, high=500, size=num_sample)
max_depth_sample_list = np.random.randint(low=1, high=10, size=num_sample)

best_mse = 5000

for i in range(num_sample):
    model = GradientBoostingRegressor(max_depth=max_depth_sample_list[i],
                                     n_estimators=n_estimators_sample_list[i],
                                     learning_rate=learning_rate_sample_list[i])
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    mse = metrics.mean_squared_error(y_test, y_pred)
    if mse < best_mse:
        best_mse = mse
        best_param = dict(n_estimators=n_estimators_sample_list[i],
                          learning_rate=learning_rate_sample_list[i], 
                          max_depth=max_depth_sample_list[i])   
        
print("Best MSE: %f using %s" % (best_mse, best_param))

Best MSE: 2581.621072 using {'n_estimators': 166, 'learning_rate': 0.07013824464071816, 'max_depth': 1}
