# Chapter04 模型选择与评估

## 4.2 超参数优化方法

### 什么是超参数？

- 参数
    - 可以根据数据集估计得到的模型参数
- 超参数
    - 无法从数据集中直接估计得到的参数，通常需要在训练前人工设定

### 超参数优化方法

- GridSearch 网格搜索（遍历参数网格中的所有组合）

- RandomizedSearch 随机搜索（从给定的参数分布中随机采样）


In [4]:
# GridSearchCV

from sklearn.datasets import load_iris
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Load the iris dataset; its `data` / `target` attributes are used for fitting.
iris = load_iris()

# Parameter grid (dict): each key is an SVC hyperparameter name and each value
# lists the candidate settings to try for it.
parameters = {
    'kernel': ('rbf', 'linear'),
    'C': [1, 5, 10],
}

# Base estimator handed to the grid search.
svr = svm.SVC()

# Search over the grid and fit on the full dataset.
clf = GridSearchCV(estimator=svr, param_grid=parameters)
clf.fit(iris.data, iris.target)

# Show the estimator configured with the best parameters found.
print(clf.best_estimator_)


SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [10]:
# RandomizedSearchCV

import numpy as np
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# Helper that prints the best results of a hyperparameter search.
def report(results, n_top=3):
    """Print the top-ranked candidates of a hyperparameter search.

    Args:
        results: Mapping with the keys ``'rank_test_score'``,
            ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``,
            each indexable by candidate position. ``'rank_test_score'`` is
            compared element-wise against each rank, so it is expected to be
            a NumPy array (e.g. the ``cv_results_`` of a fitted search).
        n_top: Highest rank to report; ranks 1 through ``n_top`` are printed.

    Returns:
        None. The summary is written to standard output.

    Candidates that share a rank are all printed, so more than ``n_top``
    entries may appear.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate holding exactly this rank.
        has_rank = results['rank_test_score'] == rank
        for idx in np.flatnonzero(has_rank):
            print('Model with rank:{0}'.format(rank))

            mean_score = results['mean_test_score'][idx]
            score_std = results['std_test_score'][idx]
            print("Mean validation score:{0:.3f} (std: {1:.3f})".format(
                mean_score, score_std))

            chosen_params = results['params'][idx]
            print("Parameters:{0}".format(chosen_params))

            # Blank line separates consecutive entries.
            print()

# Dataset: the digits data, split into a feature matrix and label vector.
digits = load_digits()
X = digits.data
y = digits.target

# Classifier whose hyperparameters are being tuned.
clf = RandomForestClassifier(n_estimators=20)

# Hyperparameters and the values / distributions they are sampled from.
# Lists are sampled from directly; sp_randint(low, high) supplies a discrete
# integer distribution for the search to draw from.
param_dist = {
    "max_depth": [3, None],
    "max_features": sp_randint(1, 11),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ['gini', 'entropy'],
}

# Randomized search: evaluate n_iter_search sampled parameter settings.
n_iter_search = 20
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
)

# Time the fit and report how long the whole search took.
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

# Print the best-ranked candidates from the search results.
report(random_search.cv_results_)


RandomizedSearchCV took 2.76 seconds for 20 candidates parameter settings.
Model with rank:1
Mean validation score:0.928 (std: 0.009)
Parameters:{'criterion': 'entropy', 'max_features': 3, 'min_samples_split': 6, 'bootstrap': False, 'min_samples_leaf': 2, 'max_depth': None}

Model with rank:2
Mean validation score:0.921 (std: 0.015)
Parameters:{'criterion': 'entropy', 'max_features': 9, 'min_samples_split': 8, 'bootstrap': False, 'min_samples_leaf': 3, 'max_depth': None}

Model with rank:3
Mean validation score:0.916 (std: 0.013)
Parameters:{'criterion': 'gini', 'max_features': 3, 'min_samples_split': 8, 'bootstrap': True, 'min_samples_leaf': 1, 'max_depth': None}

Model with rank:3
Mean validation score:0.916 (std: 0.011)
Parameters:{'criterion': 'gini', 'max_features': 7, 'min_samples_split': 9, 'bootstrap': True, 'min_samples_leaf': 2, 'max_depth': None}

