# 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [1]:
from sklearn import datasets, metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score

In [2]:
# Load the Breast Cancer Wisconsin dataset that ships with scikit-learn.
breast_cancer = datasets.load_breast_cancer()
# Show the feature-matrix shape first, then the feature names.
for summary in (breast_cancer.data.shape, breast_cancer.feature_names):
    print(summary)

(569, 30)
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


### 先觀察未調參的 GradientBoostingClassifier 的結果

In [3]:
# Hold out 10% of the samples as the test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    breast_cancer.data,
    breast_cancer.target,
    test_size=0.1,
    random_state=42,
)

# Baseline: gradient boosting with default hyper-parameters, fitted on the training split.
# (fit() returns the estimator itself, so `clf` is the fitted model.)
clf = GradientBoostingClassifier(random_state=42).fit(x_train, y_train)

# Predict the held-out samples and report the accuracy on them.
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.9649122807017544


### 觀察調參後的 GradientBoostingClassifier 的結果

In [4]:
# Search space: 100..1000 trees in steps of 100, and tree depths 1..7.
n_estimators = list(range(100, 1001, 100))
max_depth = list(range(1, 8))
param_grid = {"n_estimators": n_estimators, "max_depth": max_depth}

# 10-fold cross-validation splitter used by the search.
cv = KFold(10)

# Grid search over `param_grid`, starting from the baseline estimator `clf`;
# n_jobs=-1 requests parallel execution on all available processors.
clf_GSCV = GridSearchCV(clf, param_grid, cv=cv, n_jobs=-1)
clf_GSCV.fit(x_train, y_train)
print(f'The best parameters: {clf_GSCV.best_params_}\nvalidation score: {clf_GSCV.best_score_: .2f}')

# Predict the held-out test set through the fitted search object and report accuracy.
# NOTE: `acc` is reassigned here, replacing the baseline accuracy computed earlier.
y_pred_GSCV = clf_GSCV.predict(x_test)
acc = accuracy_score(y_test, y_pred_GSCV)
print("Accuracy: ", acc)

The best parameters: {'max_depth': 1, 'n_estimators': 800}
validation score:  0.98
Accuracy:  0.9649122807017544


### 結論：調參後的測試集準確率與未調參時相同（皆為 0.9649），推測是因為此資料集本身就容易分類，預設超參數已能取得很好的表現