## Decision Tree 决策树模型
- 节点node
- 分支split
- 剪枝：预剪枝：在创建树的同时剪枝。后剪枝：树建好了再修剪
- 树模型的优势：自动处理大量变量，树模型会在所有自变量中选出最重要的自变量对样本进行切分；对数据没有正态性、独立性、方差齐性等要求，应用范围更广

### 决策树分类

In [60]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV   #网格搜索
from sklearn.model_selection import cross_val_score #交叉验证
from sklearn.model_selection import cross_validate#交叉验证
import pandas as pd
import math
import joblib
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import numpy as np

In [70]:

# Load the iris data set and hold out 30% of the samples for testing.
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

DTC = DecisionTreeClassifier(criterion='entropy')

# cross_val_score fits clones of the estimator, so DTC itself is still
# unfitted after this call.
cv_score = cross_val_score(
    DTC,                 # estimator instance
    X,                   # complete feature matrix
    y,                   # complete target vector
    cv=5,                # number of folds
    scoring="accuracy",  # for regressors the default score is R²; pass an MSE scorer when MSE is wanted
)

print(cv_score)

# Fit DTC itself on the training split: later cells read
# DTC.feature_importances_ and call DTC.predict(X_test), which both require a
# fitted estimator. DTC.score(X_test, y_test) then gives the hold-out accuracy.
DTC = DTC.fit(X_train, y_train)


[0.96666667 0.96666667 0.9        0.96666667 1.        ]


### DTC可视化

In [2]:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# .Data:.2019/12/27
# -*- coding: utf-8 -*-
 
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pydotplus
 
import warnings
warnings.filterwarnings("ignore")
 
# Load the data.
iris = datasets.load_iris()

# Build and fit the model.
fls = DecisionTreeClassifier()
fls = fls.fit(iris.data, iris.target)

# Export the fitted tree as Graphviz DOT text.
# The return value is deliberately not assigned: rebinding `f` inside the
# `with` block would only replace the file handle with None.
with open('iris.dot', 'w') as f:
    tree.export_graphviz(fls, out_file=f)

# Draw the tree and save it to a PDF file.

# Rendering options: label nodes with feature/class names and colour them.
dot_data = tree.export_graphviz(fls, out_file=None,
                                feature_names=iris.feature_names,
                                class_names=iris.target_names,
                                filled=True, rounded=True, special_characters=True)

graph = pydotplus.graph_from_dot_data(dot_data)

# Write the image to a PDF file.
graph.write_pdf('iris.pdf')


True

In [2]:
# Print the iris feature names, then display DTC's importance for each
# feature (same order as the names). DTC must have been fitted beforehand.
print(iris.feature_names)
DTC.feature_importances_

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


array([0.01906837, 0.02764914, 0.40706919, 0.5462133 ])

In [4]:
# Print per-class precision / recall / F1 for DTC's predictions on X_test.
print(classification_report(y_test, DTC.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.89      1.00      0.94        17
           2       1.00      0.87      0.93        15

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45



In [None]:
# Take a look at the tree.
from sklearn.tree import export_graphviz

export_graphviz(DTC, out_file="classify_tree.dot", feature_names=iris.feature_names, class_names=iris.target_names)

# Read the DOT source back for graphviz (the .dot file can also be opened
# directly with the Graphviz tools).
with open('classify_tree.dot') as f:
    # read() must be called: the bare attribute is a bound method, not the text.
    dot_graph = f.read()

import graphviz
graph = graphviz.Source(dot_graph)
graph.render('classify_tree')

### 决策树回归
- 分类与回归的区别：分类问题中的因变量是分类变量，回归中的因变量是连续变量
- 分类决策树中，用信息熵表示节点的混乱程度
- 回归决策树中，改用均方差来表示混乱程度
- 分类决策树中，叶子结点的众数就是输出结果
- 回归决策树中，改用叶子节点的平均数作为结果

In [2]:


# NOTE(review): scikit-learn discourages load_boston and dropped it in
# release 1.2; on newer versions this line fails and another data set
# (e.g. fetch_california_housing) has to be substituted.
boston = datasets.load_boston()
X, y = boston.data, boston.target

# Depth-3 regression tree; the printed value is the score on the same data the
# tree was fitted on, not a hold-out estimate.
DTR = DecisionTreeRegressor(max_depth=3)
DTR.fit(X, y)
print(DTR.score(X, y))


0.8177924678036443



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [4]:
# Print the Boston feature names, then display DTR's importance for each
# feature (same order as the names).
print(boston.feature_names)
DTR.feature_importances_

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


array([0.03254249, 0.        , 0.        , 0.        , 0.0288244 ,
       0.64124098, 0.        , 0.0721473 , 0.        , 0.        ,
       0.01593447, 0.        , 0.20931036])

In [None]:
# Export the regression tree to DOT and render it with graphviz.
# export_graphviz and graphviz are imported by the classification-tree
# rendering cell earlier in this notebook.
export_graphviz(DTR, out_file='regression_tree.dot', feature_names=boston.feature_names)

with open('regression_tree.dot') as f:
    # read() must be called: the bare attribute is a bound method, not the text.
    dot_graph = f.read()

graph = graphviz.Source(dot_graph)
graph.render('regression_tree')

## part1、 DT_Regression

In [6]:
# Load the data set; the first 62 columns are used as features and every
# remaining column as a regression target.
df = pd.read_csv("./data/ads_3.csv")
all_columns = df.columns

X = df[all_columns[:62]]
Y = df[all_columns[62:]]

In [9]:

# Train one depth-4 regression tree per target column and evaluate it on a
# 30% hold-out split. Results are collected in the lists below, one entry per
# target, in the order of Y.columns.
MSE = []
RMSE = []
R_squared = []
feature_importance = []

# Iterate over the target columns themselves instead of a hard-coded count so
# the result lists always line up with Y.columns.
for i, target_name in enumerate(Y.columns):
    y = Y[target_name]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

    DT_regression = DecisionTreeRegressor(max_depth=4)
    DT_regression.fit(X_train, y_train)

    # Decision-tree models belong in the DT_regression directory; the previous
    # "SVM_regression" path was a copy-paste leftover that would overwrite
    # unrelated model files.
    joblib.dump(DT_regression, "model/DT_regression/model{}.pkl".format(i+1))

    y_pred = DT_regression.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
    MSE.append(mse)
    RMSE.append(math.sqrt(mse))
    R_squared.append(metrics.r2_score(y_test, y_pred))

    feature_importance.append(list(DT_regression.feature_importances_))

    

In [10]:
# Metrics table: one row per target column.
result_dic = dict(MSE=MSE, RMSE=RMSE, R_squared=R_squared)
result_df = pd.DataFrame(result_dic, index=Y.columns)
result_df.to_csv("result/DT_regression.csv")

# Feature-importance table: rows are targets, columns are input features.
feature_importance_df = pd.DataFrame(
    feature_importance,
    index=Y.columns,
    columns=X.columns,
)
feature_importance_df.to_csv("result/DTR_feature_importance.csv")

In [21]:
# Train one depth-6 regression tree per target column on the complete data
# set. The metrics below are computed on the same data the tree was fitted on,
# so they describe the fit, not generalisation.
MSE = []
RMSE = []
R_squared = []
feature_importance = []

# Iterate over the target columns themselves instead of a hard-coded count so
# the result lists always line up with Y.columns.
for i, target_name in enumerate(Y.columns):
    y = Y[target_name]

    DT_regression = DecisionTreeRegressor(max_depth=6)
    DT_regression.fit(X, y)

    joblib.dump(DT_regression, "model/DT_regression/model{}.pkl".format(i+1))

    # Predict once and reuse the result for all three metrics.
    fitted = DT_regression.predict(X)
    mse = metrics.mean_squared_error(y, fitted)
    MSE.append(mse)
    RMSE.append(math.sqrt(mse))
    R_squared.append(metrics.r2_score(y, fitted))

    feature_importance.append(list(DT_regression.feature_importances_))

In [19]:
# Write the per-target metrics (rows follow Y.columns) to CSV.
result_dic = {
    "MSE": MSE,
    "RMSE": RMSE,
    "R_squared": R_squared,
}
result_df = pd.DataFrame(data=result_dic, index=Y.columns)
result_df.to_csv("result/DT_regression.csv")

# Write the per-target feature importances (one column per input feature) to CSV.
feature_importance_df = pd.DataFrame(data=feature_importance, index=Y.columns, columns=X.columns)
feature_importance_df.to_csv("result/DTR_feature_importance.csv")

## part2、 DT_Classification

In [41]:
# Reload the data set; the first 62 columns are features, the rest targets.
df = pd.read_csv("./data/ads_3.csv")
all_columns = df.columns

X = df[all_columns[:62]]
# Multiply the targets by 10 and round to integers so they can be used as
# discrete class labels.
Y = (df[all_columns[62:]] * 10).round().astype(int)


In [44]:
# Train one depth-6 classification tree per target column and record its
# accuracy on a 30% hold-out split.
score_list = []
feature_importance = []

# Iterate over the target columns themselves instead of a hard-coded count so
# the result lists always line up with Y.columns.
for i, target_name in enumerate(Y.columns):
    y = Y[target_name]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

    DT_classification = DecisionTreeClassifier(max_depth=6)
    DT_classification.fit(X_train, y_train)

    joblib.dump(DT_classification, "model/DT_classification/model{}.pkl".format(i+1))

    # score() predicts on X_test internally, so no separate predict call is needed.
    score_list.append(DT_classification.score(X_test, y_test))

    feature_importance.append(list(DT_classification.feature_importances_))

In [45]:

# Accuracy table: one row per target column, single "ACC" column.
result_df = pd.DataFrame(
    data=score_list,
    columns=['ACC'],
    index=Y.columns,
)
result_df.to_csv("./result/DT_classification.csv")

# Feature-importance table: rows are targets, columns are input features.
feature_importance_df = pd.DataFrame(
    data=feature_importance,
    index=Y.columns,
    columns=X.columns,
)
feature_importance_df.to_csv("result/DTC_feature_importance.csv")

## part3、Optimize DT_Regression

```
sklearn.tree.DecisionTreeRegressor (criterion='mse'
                                    , splitter='best'
                                    , max_depth=None
                                    ,min_samples_split=2
                                    , min_samples_leaf=1
                                    , min_weight_fraction_leaf=0.0
                                    , max_features=None
                                    ,random_state=None
                                    , max_leaf_nodes=None
                                    , min_impurity_decrease=0.0
                                    , min_impurity_split=None
                                    , presort=False
                                    )

```

In [142]:
# Reload the data set and split it column-wise: the first 62 columns are the
# features, all remaining columns are regression targets.
df = pd.read_csv("./data/ads_3.csv")
feature_columns = df.columns[:62]
target_columns = df.columns[62:]

X = df[feature_columns]
Y = df[target_columns]


In [143]:
# For every target column: grid-search the tree hyper-parameters on the
# training split, cross-validate the winning configuration, refit it on the
# training split, score it on the test split and persist the model.
MSE = []
RMSE = []
R_squared_validation = []
R_squared_test = []
feature_importance = []

# The search grid is the same for every target, so it is built once.
params = {
    "max_depth": [2, 3, 4, 5],
    'splitter': ["random", "best"],
    'min_samples_leaf': [1, 2, 3, 4],
    'min_samples_split': [2, 3, 4, 5],
}

# Iterate over the target columns themselves instead of a hard-coded count so
# the result lists always line up with Y.columns.
for i, target_name in enumerate(Y.columns):
    y = Y[target_name]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

    # Seed the searched estimator exactly like the final models. Without a
    # seed, candidates with splitter="random" are ranked on trees that differ
    # from the ones built below, and the selection changes from run to run.
    model = GridSearchCV(DecisionTreeRegressor(random_state=0), param_grid=params, cv=5)
    model.fit(X_train, y_train)
    best_params = model.best_params_

    # 5-fold cross-validation of the tuned configuration on the complete data.
    cv_score = cross_validate(
        DecisionTreeRegressor(random_state=0, **best_params),
        X,
        y,
        cv=5,
        scoring=["neg_mean_squared_error", "neg_root_mean_squared_error", "r2"],
    )

    # The "neg_*" scorers yield negated errors; they are stored unchanged here
    # and the sign is flipped when the results table is built.
    MSE.append(cv_score["test_neg_mean_squared_error"].mean())
    RMSE.append(cv_score["test_neg_root_mean_squared_error"].mean())
    R_squared_validation.append(cv_score["test_r2"].mean())

    # Refit on the training split only and evaluate on the test split.
    DT_regression = DecisionTreeRegressor(random_state=0, **best_params)
    DT_regression.fit(X_train, y_train)
    R_squared_test.append(DT_regression.score(X_test, y_test))
    joblib.dump(DT_regression, "model/DT_optimized_regression/model{}.pkl".format(i+1))

    feature_importance.append(list(DT_regression.feature_importances_))


In [140]:
# MSE and RMSE hold the negated errors returned by the "neg_*" scorers; flip
# the sign for reporting. New names are used instead of rebinding MSE / RMSE
# so that running this cell twice does not flip the sign back.
mse_values = -np.asarray(MSE)
rmse_values = -np.asarray(RMSE)
result_dic = {"MSE": mse_values, "RMSE": rmse_values, "R_squared_validation": R_squared_validation, "R_squared_test": R_squared_test}
result_df = pd.DataFrame(result_dic, index=Y.columns)
result_df.to_csv("result/DT_optimized_regression.csv")

# Feature-importance table: rows are targets, columns are input features.
feature_importance_df = pd.DataFrame(feature_importance, columns=X.columns, index=Y.columns)
feature_importance_df.to_csv("result/DTR_optimized_feature_importance.csv")

## part4、Optimize DT_Classification

```
sklearn.tree.DecisionTreeClassifier (criterion='gini'
					, splitter='best'
					,max_depth=None
					,min_samples_split=2
					, min_samples_leaf=1
					, min_weight_fraction_leaf=0.0
					, max_features=None
					,random_state=None
					, max_leaf_nodes=None
					, min_impurity_decrease=0.0
					, min_impurity_split=None
					,class_weight=None
					, presort=False
					)
```

In [134]:
# Reload the data set: the first 62 columns are features, the rest targets.
df = pd.read_csv("./data/ads_3.csv")
feature_columns = df.columns[:62]
target_columns = df.columns[62:]

X = df[feature_columns]
# Multiply the targets by 10 and round to integers to obtain class labels.
Y = (df[target_columns] * 10).round().astype(int)

In [135]:
# For every target column: grid-search the tree hyper-parameters on the
# training split, cross-validate the winning configuration, refit it on the
# training split, score it on the test split and persist the model.
recall = []
f1_score = []
acc_validation = []
acc_test = []
feature_importance = []

# The search grid is the same for every target, so it is built once.
params = {
    "max_depth": [2, 3, 4, 5, 6],
    'criterion': ["gini", "entropy"],
    'splitter': ["random", "best"],
    'min_samples_leaf': [1, 2, 3, 4],
    'min_samples_split': [2, 3, 4, 5],
}

# Iterate over the target columns themselves instead of a hard-coded count so
# the result lists always line up with Y.columns.
for i, target_name in enumerate(Y.columns):
    y = Y[target_name]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

    # Seed the searched estimator exactly like the final models. Without a
    # seed, candidates with splitter="random" are ranked on trees that differ
    # from the ones built below, and the selection changes from run to run.
    model = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=params, cv=5)
    model.fit(X_train, y_train)
    best_params = model.best_params_

    # 10-fold cross-validation of the tuned configuration on the complete data.
    cv_score = cross_validate(
        DecisionTreeClassifier(random_state=0, **best_params),
        X,
        y,
        cv=10,
        scoring=["accuracy", "recall_macro", "f1_macro"],
    )
    recall.append(cv_score["test_recall_macro"].mean())
    f1_score.append(cv_score["test_f1_macro"].mean())
    acc_validation.append(cv_score["test_accuracy"].mean())

    # Refit on the training split only and evaluate on the test split.
    DT_classification = DecisionTreeClassifier(random_state=0, **best_params)
    DT_classification.fit(X_train, y_train)
    acc_test.append(DT_classification.score(X_test, y_test))

    joblib.dump(DT_classification, "model/DT_optimized_classification/model{}.pkl".format(i+1))

    feature_importance.append(list(DT_classification.feature_importances_))


In [136]:
# Metrics table for the tuned classifiers: one row per target column.
result_dic = dict(
    recall=recall,
    f1_score=f1_score,
    acc_validation=acc_validation,
    acc_test=acc_test,
)
result_df = pd.DataFrame(data=result_dic, index=Y.columns)
result_df.to_csv("result/DT_optimized_classification.csv")

# Feature-importance table: rows are targets, columns are input features.
feature_importance_df = pd.DataFrame(
    data=feature_importance,
    index=Y.columns,
    columns=X.columns,
)
feature_importance_df.to_csv("result/DTC_optimized_feature_importance.csv")