### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [17]:
import numpy as np
from sklearn import datasets, metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [18]:
diabetes = datasets.load_diabetes()
x_train, x_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, test_size=0.1, random_state=42)
x_train.shape

(397, 10)

In [19]:
y_train[:10]

array([ 52., 200.,  87.,  90., 258., 136., 158.,  69.,  72., 171.])

In [20]:
model = GradientBoostingRegressor()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE: ", mse)

MSE:  2658.762720320672


### Grid Search

In [21]:
n_estimators = [100, 200, 300, 400, 500]
learning_rate = [0.001, 0.01, 0.1, 1, 10]
max_depth = [1, 3, 5, 7, 9]
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

# n_job=-1 means using all processors
grid_search = GridSearchCV(model, param_grid, scoring="neg_mean_squared_error", cv=5, 
                           n_jobs=-1, verbose=1)
grid_result = grid_search.fit(x_train, y_train)

print("Best MSE: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 5 folds for each of 125 candidates, totalling 625 fits
Best MSE: -3286.498505 using {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 100}


 -4.87624418e+003 -5.53943044e+003 -5.16556651e+003 -4.86119096e+003
 -4.61747376e+003 -4.40643664e+003 -5.48483598e+003 -5.10158166e+003
 -4.80745283e+003 -4.57598014e+003 -4.39904343e+003 -5.51504820e+003
 -5.14746669e+003 -4.85911837e+003 -4.64087094e+003 -4.46386405e+003
 -5.51155007e+003 -5.16771639e+003 -4.90402368e+003 -4.71692361e+003
 -4.57605476e+003 -4.33647626e+003 -3.76653234e+003 -3.50710011e+003
 -3.39053801e+003 -3.33746753e+003 -3.78287118e+003 -3.47788059e+003
 -3.48588535e+003 -3.49280495e+003 -3.53748915e+003 -3.86943580e+003
 -3.67746199e+003 -3.70808944e+003 -3.75926252e+003 -3.79687545e+003
 -4.09070089e+003 -4.10214715e+003 -4.10997660e+003 -4.14084662e+003
 -4.15037229e+003 -4.29890923e+003 -4.44483346e+003 -4.57627375e+003
 -4.65469992e+003 -4.64602805e+003 -3.28649850e+003 -3.30882069e+003
 -3.34788036e+003 -3.39633145e+003 -3.43468675e+003 -3.71178321e+003
 -3.97824427e+003 -4.11449258e+003 -4.18360396e+003 -4.24867776e+003
 -4.06274646e+003 -4.08962538e+003

In [22]:
model_bestparam = GradientBoostingRegressor(max_depth=grid_result.best_params_['max_depth'],
                                           n_estimators=grid_result.best_params_['n_estimators'],
                                           learning_rate=grid_result.best_params_['learning_rate'])
model_bestparam.fit(x_train, y_train)
y_pred = model_bestparam.predict(x_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE: ", mse)

MSE:  2573.3901834613694


### Random Search

In [23]:
num_sample = 125
learning_rate_sample_list = np.random.uniform(low=0.0, high=1.0, size=num_sample)
n_estimators_sample_list = np.random.randint(low=100, high=500, size=num_sample)
max_depth_sample_list = np.random.randint(low=1, high=10, size=num_sample)

best_score = -5000

for i in range(num_sample):
    model = GradientBoostingRegressor(max_depth=max_depth_sample_list[i],
                                     n_estimators=n_estimators_sample_list[i],
                                     learning_rate=learning_rate_sample_list[i])
    score = np.mean(cross_val_score(model, x_train, y_train, scoring="neg_mean_squared_error",
                                    cv=5, n_jobs=-1))
    if score > best_score:
        best_mse = -score
        best_param = dict(n_estimators=n_estimators_sample_list[i],
                          learning_rate=learning_rate_sample_list[i], 
                          max_depth=max_depth_sample_list[i])   
        
print("Best MSE: %f using %s" % (best_mse, best_param))

Best MSE: 3893.365110 using {'n_estimators': 276, 'learning_rate': 0.6011804060254833, 'max_depth': 1}


In [24]:
model_bestparam = GradientBoostingRegressor(max_depth=best_param['max_depth'],
                                           n_estimators=best_param['n_estimators'],
                                           learning_rate=best_param['learning_rate'])
model_bestparam.fit(x_train, y_train)
y_pred = model_bestparam.predict(x_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE: ", mse)

MSE:  3062.867841450141
