In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

# 1. 获取数据

In [None]:
# Import from the public `sklearn.datasets` namespace. The underscore-prefixed
# `sklearn.datasets._california_housing` module is a private implementation
# detail and may be renamed or moved between scikit-learn releases.
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset as a Bunch (data, target, feature_names,
# DESCR, ...) and print its bundled description.
housing = fetch_california_housing()
print(housing.DESCR)

In [3]:
# Peek at the regression target array (NumPy elides the middle of long arrays).
housing["target"]

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [None]:
# Wrap the raw feature matrix in a DataFrame labelled with the dataset's
# feature names, then preview the first rows.
house = pd.DataFrame(
    housing.data,
    columns=housing.feature_names,
)
house.head()

# 2. 使用决策树

In [4]:
from sklearn import tree

# Regression tree kept shallow (max_depth=2) and trained on only two feature
# columns (indices 6 and 7), so the fitted tree stays small enough to draw.
feature_idx = [6, 7]
dtr = tree.DecisionTreeRegressor(max_depth=2)
dtr.fit(housing.data[:, feature_idx], housing.target)

DecisionTreeRegressor(max_depth=2)

# 3. 可视化决策树

In [None]:
# Export the fitted tree as Graphviz DOT source. With out_file=None the DOT
# text is returned as a string instead of being written to a file.
export_options = dict(
    out_file=None,
    feature_names=housing.feature_names[6:8],  # labels for the two training columns
    filled=True,
    impurity=False,
    rounded=True,
)
dot_data = tree.export_graphviz(dtr, **export_options)

In [None]:
import pydotplus
from IPython.display import Image

# Parse the DOT source into a pydotplus graph object.
# NOTE(review): rendering presumably requires the Graphviz binaries to be installed — confirm.
graph = pydotplus.graph_from_dot_data(dot_data)

# Display the rendered tree inline as a PNG image.
png_bytes = graph.create_png()
Image(png_bytes)

In [None]:
# Save the rendered tree image to disk (path is relative to the working directory).
png_path = "dtr.png"
graph.write_png(png_path)

# 4. Grid Search调参

In [5]:
from sklearn.model_selection import train_test_split

# Split into training and test sets: 10% held out, fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    housing.data,
    housing.target,
    test_size=0.1,
    random_state=0,
)

# Decision-tree regressor baseline on all features; the cell displays its
# score (R^2 for scikit-learn regressors) on the held-out set.
dtr = tree.DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
dtr.score(X_test, y_test)


0.5779699824126867

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters and a fixed seed, scored on the
# same held-out set as the single decision tree for comparison.
rfr = RandomForestRegressor(random_state=0).fit(X_train, y_train)
rfr.score(X_test, y_test)


0.816253235008409

In [7]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameters to tune: 3 x 3 = 9 candidate combinations.
param = {'min_samples_split': [3, 6, 9], 'n_estimators': [10, 50, 100]}

# cv: number of folds for k-fold cross-validation.
# The forest is seeded (random_state=0), like the other estimators in this
# notebook, so the selected parameters and scores are reproducible across runs.
grid = GridSearchCV(RandomForestRegressor(random_state=0), param_grid=param, cv=5)
grid.fit(X_train, y_train)

# Best parameter combination, its mean cross-validated score, and the full
# per-candidate results dictionary.
grid.best_params_, grid.best_score_, grid.cv_results_


({'min_samples_split': 3, 'n_estimators': 100},
 0.803829272805188,
 {'mean_fit_time': array([0.67402945, 3.11756001, 6.21531587, 0.60066237, 2.90367374,
         5.88041406, 0.55204663, 2.78018999, 5.54557767]),
  'std_fit_time': array([0.0473454 , 0.01705672, 0.03974842, 0.01395715, 0.03097749,
         0.16549294, 0.01181388, 0.05575615, 0.06099078]),
  'mean_score_time': array([0.00860043, 0.03520021, 0.0696003 , 0.006001  , 0.02898455,
         0.05540013, 0.00519958, 0.02520037, 0.05298114]),
  'std_score_time': array([1.01916033e-03, 1.46849742e-03, 8.00228133e-04, 5.22348936e-07,
         1.20037944e-03, 3.32278474e-03, 3.99876181e-04, 2.03945569e-03,
         7.97195963e-03]),
  'param_min_samples_split': masked_array(data=[3, 3, 3, 6, 6, 6, 9, 9, 9],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'param_n_estimators': masked_array(data=[10, 50, 100, 10, 50, 100, 