In [None]:
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import pandas as pd
import graphviz
import matplotlib.pyplot as plt

# Load the bundled wine dataset and show features and target side by side.
# The columns are named explicitly: concatenating two unnamed frames would
# label both the first feature and the target column as 0.
wine = load_wine()
pd.concat(
    [
        pd.DataFrame(wine.data, columns=wine.feature_names),
        pd.Series(wine.target, name="target"),
    ],
    axis=1,
)


In [None]:
# Hold out 30% of the wine samples for validation. The split is seeded so the
# partition, and every score computed from it, is the same on each run.
x_train, x_valid, y_train, y_valid = train_test_split(
    wine.data, wine.target, test_size=0.3, random_state=30
)

# Baseline tree that uses entropy as the split criterion.
# NOTE(review): the classifier itself is left unseeded, as in the original, so
# its score may still vary between runs; pass random_state to pin it as well.
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf.fit(x_train, y_train)
score = clf.score(x_valid, y_valid)
# Only the last expression of a cell is displayed, so print the score here.
print(score)

# Chinese display names for the 13 features (presumably in dataset column order).
feature_name = ['酒精', '苹果酸', '灰', '灰的碱性', '镁', '总酚', '类黄素', '非黄烷类分类', '花青素', '颜色强度', '色调', '稀释葡萄酒', '脯氨酸']
dot_data = tree.export_graphviz(
    clf,
    feature_names=feature_name,
    class_names=["琴酒", "雪梨", "贝尔摩德"],
    filled=True,
    rounded=True,
)
# Render the fitted tree; as the cell's last expression it is shown inline.
graph = graphviz.Source(dot_data)
graph


In [None]:
# Pair each feature name with its importance in the fitted tree.
# The standalone `clf.feature_importances_` expression was dropped: it was not
# the last expression of the cell, so its value was computed and thrown away,
# and the pairs below already contain the same numbers.
list(zip(feature_name, clf.feature_importances_))

In [None]:
# Seeded, unpruned tree: the reference score on the validation split.
clf = tree.DecisionTreeClassifier(criterion="entropy", random_state=30)
clf = clf.fit(x_train, y_train)
score = clf.score(x_valid, y_valid)
# A bare `score` in the middle of a cell is never displayed, so print it;
# otherwise there is nothing to compare the pruned tree against.
print(score)

# Pruned tree: random splitter, limited depth, and minimum sample counts for
# both leaves and splits.
clf = tree.DecisionTreeClassifier(
    criterion="entropy",
    random_state=30,
    splitter="random",
    max_depth=3,
    min_samples_leaf=10,
    min_samples_split=10,
)
clf = clf.fit(x_train, y_train)
score = clf.score(x_valid, y_valid)
print(score)

# Export the pruned tree to DOT and render it inline as the cell output.
dot_data = tree.export_graphviz(
    clf,
    feature_names=feature_name,
    class_names=["琴酒", "雪梨", "贝尔摩德"],
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)
graph



In [None]:
# Learning curve over max_depth: fit one entropy tree per depth from 1 to 10
# (random splitter, fixed seed) and record its accuracy on the validation split.
depths = range(1, 11)
test = []
for depth in depths:
    clf = tree.DecisionTreeClassifier(
        criterion="entropy",
        splitter="random",
        random_state=30,
        max_depth=depth,
    )
    clf = clf.fit(x_train, y_train)
    score = clf.score(x_valid, y_valid)
    test.append(score)

# Validation accuracy as a function of tree depth.
plt.plot(depths, test, color="red", label="max_depth")
plt.legend()
plt.show()

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, where the
# import raises ImportError. Keep using it where it still exists, otherwise
# fall back to the California housing data, which exposes the same
# .data / .target attributes (it is downloaded on first use).
try:
    from sklearn.datasets import load_boston
except ImportError:
    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()
else:
    housing = load_boston()
# Alias kept so that anything still referring to `boston` continues to work.
boston = housing

regressor = DecisionTreeRegressor(random_state=0)
# 10-fold cross-validation. Without `scoring`, a regressor is evaluated with
# R^2; here negated mean squared error is requested instead, so the returned
# values are <= 0 and closer to 0 is better.
cross_val_score(
    regressor,
    housing.data,
    housing.target,
    cv=10,
    scoring="neg_mean_squared_error",
)


In [None]:
# Example: regression on one-dimensional data
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

# A seeded generator keeps the "random" data identical on every run.
rng = np.random.RandomState(1)

# rng.rand(80, 1) draws uniform values in [0, 1) as an 80x1 matrix; they are
# scaled to [0, 5) and sorted along the sample axis. The feature array is kept
# two-dimensional (samples x features).
X = 5 * rng.rand(80, 1)
X.sort(axis=0)

# The target must be one-dimensional, so take the sine of the single column.
y = np.sin(X[:, 0])

# Real data is never perfectly clean: perturb every fifth target with uniform
# noise in (-1.5, 1.5]. The number of draws equals the number of selected
# targets (16 of the 80).
every_fifth = slice(None, None, 5)
y[every_fifth] += 3 * (0.5 - rng.rand(y[every_fifth].size))

# Scatter plot of the noisy samples.
plt.figure()
plt.scatter(X, y, s=20, edgecolors="black", c="darkorange", label="data")
plt.legend()

In [None]:
# Two regression trees that differ only in their maximum depth, both fitted on
# the noisy sine samples.
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=5)
for model in (regr_1, regr_2):
    model.fit(X, y)

# Dense evaluation grid over [0, 5) in steps of 0.01. reshape(-1, 1) turns the
# flat (n,) array into the (n, 1) column the estimator expects, the same result
# as indexing with [:, np.newaxis].
X_test = np.arange(0.0, 5.0, 0.01).reshape(-1, 1)

# Predictions of each tree on the grid.
y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)

# Plot the samples and both fitted curves.
plt.figure()
# c: marker fill colour, edgecolors: marker outline colour, s: marker size
plt.scatter(X, y, s=20, edgecolors="black", c="darkorange", label="data")
curves = (
    (y_1, "cornflowerblue", "max_depth=2"),
    (y_2, "yellowgreen", "max_depth=5"),
)
for predictions, colour, depth_label in curves:
    plt.plot(X_test, predictions, color=colour, label=depth_label, linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
# Show the legend
plt.legend()
plt.show()
# Author's observation: max_depth=5 overfits somewhat.

In [None]:
#数据在ai studio
# 分析泰坦尼克号
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

# Load the Titanic training data.
data = pd.read_csv('data/data/train.csv')

# ---- Feature selection ----
# Name, cabin and ticket number are judged to have little bearing on survival
# (and some have many missing values), so those columns are removed. A new
# frame is returned instead of mutating in place, which keeps the cell safe to
# re-run.
data = data.drop(columns=["Cabin", "Name", "Ticket"])

# ---- Missing values ----
# Age has many gaps (the original notes mention about 200 rows): fill them
# with the column mean.
data["Age"] = data["Age"].fillna(data["Age"].mean())
# Drop every row that still contains a NaN (the original notes attribute these
# to two missing Embarked values). The explicit copy guarantees that the column
# assignments below write to an independent frame.
data = data.dropna(axis=0).copy()
data.info()

# ---- Encoding ----
# Replace each Embarked value by its position in the list of distinct values.
# The values are treated as unordered categories, so any consistent integer
# code will do.
labels = data["Embarked"].unique().tolist()
data["Embarked"] = data["Embarked"].map(
    {label: code for code, label in enumerate(labels)}
)
# Boolean -> int: male becomes 1, everything else 0.
data["Sex"] = (data["Sex"] == "male").astype("int")

# ---- Features / target ----
# x: every column except Survived (including any identifier columns the CSV
#    may carry; NOTE(review): confirm those are wanted as features).
# y: the Survived column as a one-dimensional Series. A one-column DataFrame
#    makes scikit-learn warn about a column-vector target on every fit.
x = data.drop(columns="Survived")
y = data["Survived"]

# ---- Train / validation split and baseline model ----
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 30% of the rows. The split is seeded so the partition and the
# scores derived from it are reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.3, random_state=25
)

# The split shuffles rows, so each part keeps scattered labels from the
# original frame; renumber every part from 0.
x_train, x_valid, y_train, y_valid = (
    part.reset_index(drop=True)
    for part in (x_train, x_valid, y_train, y_valid)
)

# Baseline: default tree with a fixed seed, scored on the held-out rows.
clf = DecisionTreeClassifier(random_state=25)
clf = clf.fit(x_train, y_train)
score = clf.score(x_valid, y_valid)
# A bare `score` in the middle of a cell is never displayed, so print it.
print(score)
# Author's notes, recorded with an unseeded split: about 0.71 for this
# hold-out score, and about 0.75 for the mean of a 10-fold
# cross_val_score(clf, x, y, cv=10), which returns one score per fold.

# ---- Hyper-parameter tuning: max_depth ----
# For each depth from 1 to 10, fit an entropy tree with a fixed seed and record
#   tr: accuracy on the training split
#   te: mean 10-fold cross-validation accuracy on the full x, y
tr = []
te = []
depth_grid = range(1, 11)
for depth in depth_grid:
    clf = DecisionTreeClassifier(
        criterion="entropy",
        max_depth=depth,
        random_state=25,
    )
    clf = clf.fit(x_train, y_train)
    score_tr = clf.score(x_train, y_train)
    score_te = cross_val_score(clf, x, y, cv=10).mean()
    tr.append(score_tr)
    te.append(score_te)

# Best cross-validated accuracy over the sweep.
print(max(te))

# Training curve against cross-validated curve; a widening gap indicates
# overfitting.
for curve, colour, name in ((tr, "red", "train"), (te, "blue", "test")):
    plt.plot(depth_grid, curve, color=colour, label=name)
plt.xticks(depth_grid)
plt.legend()
plt.show()
# Author's observation: the overfitting is severe.

# ---- Grid search ----
# Author's note: try switching the criterion to entropy first.
#
# Grid search tunes several parameters at once by exhaustively trying every
# combination, so it is expensive and is not guaranteed to beat a hand-tuned
# model. np.linspace(a, b, 50) yields 50 evenly spaced (not random) values;
# the author's notes suggest thresholds in [0, 0.5] for gini and [0, 1] for
# entropy.
import numpy as np  # imported here so the cell does not rely on an earlier cell

# Candidate values for each parameter. min_impurity_decrease is the hardest one
# to choose by hand. The grid has 2 * 2 * 9 * 10 * 50 = 18,000 combinations.
parameters = {
    "criterion": ("gini", "entropy"),
    "splitter": ("best", "random"),
    "max_depth": [*range(1, 10)],
    "min_samples_leaf": [*range(1, 50, 5)],
    "min_impurity_decrease": [*np.linspace(0, 0.5, 50)],
}

clf = DecisionTreeClassifier(random_state=25)
# GridSearchCV combines fitting, scoring and cross-validation. With cv=10 this
# is 180,000 fits, so n_jobs=-1 spreads them over all cores; it only changes
# how the work is scheduled, not which parameters are selected.
GS = GridSearchCV(clf, parameters, cv=10, n_jobs=-1)
# np.ravel hands the estimator a 1-D target whether y_train is a Series or a
# one-column DataFrame.
GS = GS.fit(x_train, np.ravel(y_train))

# Best combination found among the candidates above. Printed explicitly,
# because only the last expression of a cell is displayed.
print(GS.best_params_)
# Mean cross-validated score of that best combination.
GS.best_score_