# 使用GridSearch进行参数调优

首先定义下面几组变量:

- 基础模型.
- 模型参数字典, 每个超参所有取值的列表.
- 实验参数.


In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets

# 用iris花数据
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import multiprocessing

# find cpu core number
n_cpu = multiprocessing.cpu_count()

# parameter matrix
param_grid = dict(
    max_depth=[1, 2, 3, 4],
    max_features=[0.5, 0.8, 1.0],
    n_estimators=[5, 10, 20, 50],
)

# base estimator
estimator = RandomForestClassifier()

# experiment setting
gs = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=n_cpu,
)

gs.fit(X, y)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [5, 10, 20, 50], 'max_features': [0.5, 0.8, 1.0], 'max_depth': [1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [48]:
"""
``GridSearchCV.cv_results_`` 是一个字典结构, 里面测试了所有可能的参数组合. 
"""

assert len(gs.cv_results_["params"]) == \
    len(param_grid["max_depth"]) * len(param_grid["max_features"]) * len(param_grid["n_estimators"])

from collections import OrderedDict

# order by mean_test_score
df = pd.DataFrame(OrderedDict(sorted(gs.cv_results_.items(), key=lambda x: x[0])))
df = df.sort_values("mean_test_score", ascending=False)
df.head(3)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_depth,param_max_features,param_n_estimators,params,rank_test_score,split0_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
24,0.016611,0.001705,0.966667,0.971667,3,0.5,5,"{'max_features': 0.5, 'n_estimators': 5, 'max_...",1,0.966667,...,0.966667,0.975,0.933333,0.975,1.0,0.958333,0.001944,0.00104,0.021082,0.006667
37,0.02303,0.001683,0.966667,0.991667,4,0.5,10,"{'max_features': 0.5, 'n_estimators': 10, 'max...",1,0.966667,...,0.933333,0.991667,0.966667,0.991667,1.0,0.983333,0.001076,0.000108,0.021082,0.00527
29,0.030062,0.002046,0.966667,0.966667,3,0.8,10,"{'max_features': 0.8, 'n_estimators': 10, 'max...",1,0.966667,...,0.933333,0.975,0.966667,0.966667,1.0,0.966667,0.002365,0.000401,0.021082,0.00527


In [49]:
# best mean_test_score parameter
df.head(1)["params"].values[0]

{'max_depth': 3, 'max_features': 0.5, 'n_estimators': 5}

In [50]:
gs.best_params_

{'max_depth': 2, 'max_features': 0.5, 'n_estimators': 5}

In [51]:
gs.best_score_

0.9666666666666667