In [1]:
import numpy as np
import pandas as pd
import pydot
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz

# Fix the random seed so that the run is reproducible.
# `seed` is also passed as random_state to train_test_split further down.
seed = 200
np.random.seed(seed)


In [16]:
# Load the processing data from the 'all' sheet; header=2 makes the third
# spreadsheet row the column index. Then keep only the columns from '锭位号'
# through 'CV%最小[%]' and discard the ones that are not needed.
raw_data = (
    pd.read_excel("./ProcessingData.xlsx", sheet_name='all', header=2)
    .loc[:, '锭位号':'CV%最小[%]']
    .drop(columns=['落纱原因', '卷绕开始时刻', '卷绕结束时刻', '丝筒重量[Kg]', '卷绕时间'])
)
raw_data

Unnamed: 0,锭位号,丝筒号码,等级,直径[毫米],卷密度[g/cm3],平均张力[cN],最大张力[cN],最小张力[cN],CV%平均[%],CV%最大[%],CV%最小[%]
0,193,86,A,223.42,0.725,33.8,38.5,30.2,0.78,3.69,0.41
1,194,86,AAA,220.33,0.748,27.9,29.5,25.5,1.09,2.15,0.56
2,195,84,AAA,218.80,0.760,29.5,31.0,27.2,0.79,1.60,0.37
3,196,88,AAA,219.45,0.755,31.9,33.5,29.4,1.18,1.86,0.52
4,197,81,A,219.93,0.751,33.7,42.3,22.3,0.76,8.49,0.40
...,...,...,...,...,...,...,...,...,...,...,...
4621,188,97,AAA,221.21,0.741,31.2,32.5,29.1,0.79,1.75,0.37
4622,189,97,A,221.19,0.742,33.6,40.8,26.6,0.75,4.82,0.37
4623,190,98,AAA,218.66,0.761,31.2,33.2,28.6,0.77,2.03,0.39
4624,191,102,A,224.35,0.717,33.2,40.2,25.1,0.88,7.99,0.43


In [17]:
# Show the rows that contain at least one missing (null/NaN) value
has_missing = raw_data.isnull().any(axis=1)
raw_data.loc[has_missing]

Unnamed: 0,锭位号,丝筒号码,等级,直径[毫米],卷密度[g/cm3],平均张力[cN],最大张力[cN],最小张力[cN],CV%平均[%],CV%最大[%],CV%最小[%]


In [18]:
# Drop every row that has any blank/NaN value, then drop duplicated rows
raw_data = raw_data.dropna(axis=0, how='any').drop_duplicates()
raw_data


Unnamed: 0,锭位号,丝筒号码,等级,直径[毫米],卷密度[g/cm3],平均张力[cN],最大张力[cN],最小张力[cN],CV%平均[%],CV%最大[%],CV%最小[%]
0,193,86,A,223.42,0.725,33.8,38.5,30.2,0.78,3.69,0.41
1,194,86,AAA,220.33,0.748,27.9,29.5,25.5,1.09,2.15,0.56
2,195,84,AAA,218.80,0.760,29.5,31.0,27.2,0.79,1.60,0.37
3,196,88,AAA,219.45,0.755,31.9,33.5,29.4,1.18,1.86,0.52
4,197,81,A,219.93,0.751,33.7,42.3,22.3,0.76,8.49,0.40
...,...,...,...,...,...,...,...,...,...,...,...
4621,188,97,AAA,221.21,0.741,31.2,32.5,29.1,0.79,1.75,0.37
4622,189,97,A,221.19,0.742,33.6,40.8,26.6,0.75,4.82,0.37
4623,190,98,AAA,218.66,0.761,31.2,33.2,28.6,0.77,2.03,0.39
4624,191,102,A,224.35,0.717,33.2,40.2,25.1,0.88,7.99,0.43


In [22]:
# Re-code the grade column ('等级') from letter grades to integers.
# The table is applied in insertion order: AAA -> 2, A -> 1, B -> 0.
grade_codes = {'AAA': 2, 'A': 1, 'B': 0}
for grade_label, grade_code in grade_codes.items():
    raw_data.loc[raw_data['等级'] == grade_label, '等级'] = grade_code
raw_data


Unnamed: 0,锭位号,丝筒号码,等级,直径[毫米],卷密度[g/cm3],平均张力[cN],最大张力[cN],最小张力[cN],CV%平均[%],CV%最大[%],CV%最小[%],2,1
0,193,86,1,223.42,0.725,33.8,38.5,30.2,0.78,3.69,0.41,,
1,194,86,2,220.33,0.748,27.9,29.5,25.5,1.09,2.15,0.56,2.0,2.0
2,195,84,2,218.80,0.760,29.5,31.0,27.2,0.79,1.60,0.37,2.0,2.0
3,196,88,2,219.45,0.755,31.9,33.5,29.4,1.18,1.86,0.52,2.0,2.0
4,197,81,1,219.93,0.751,33.7,42.3,22.3,0.76,8.49,0.40,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4621,188,97,2,221.21,0.741,31.2,32.5,29.1,0.79,1.75,0.37,2.0,2.0
4622,189,97,1,221.19,0.742,33.6,40.8,26.6,0.75,4.82,0.37,,
4623,190,98,2,218.66,0.761,31.2,33.2,28.6,0.77,2.03,0.39,2.0,2.0
4624,191,102,1,224.35,0.717,33.2,40.2,25.1,0.88,7.99,0.43,,


In [23]:
# Extract the feature matrix (columns '直径[毫米]' through 'CV%最小[%]') and the
# target vector ('等级'), both converted to float arrays.
# Despite their names, X_train / y_train hold every row; the train/test split
# is done in a later cell.
feature_columns = raw_data.loc[:, '直径[毫米]':'CV%最小[%]']
X_train = feature_columns.values.astype(float)
y_train = raw_data['等级'].values.astype(float)
# Create a min-max scaler, fit it on the features and rescale them.
# NOTE(review): the scaler is fitted before the train/test split, so rows that
# later land in the test set also contribute to the per-feature min and max.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train)
X_train_scale = min_max_scaler.transform(X_train)
X_train_scale


array([[0.45682268, 0.51818182, 0.73809524, ..., 0.32835821, 0.06990553,
        0.54054054],
       [0.25313118, 0.72727273, 0.26984127, ..., 0.79104478, 0.02834008,
        0.94594595],
       [0.15227423, 0.83636364, 0.3968254 , ..., 0.34328358, 0.01349528,
        0.43243243],
       ...,
       [0.14304548, 0.84545455, 0.53174603, ..., 0.31343284, 0.02510121,
        0.48648649],
       [0.51812788, 0.44545455, 0.69047619, ..., 0.47761194, 0.18596491,
        0.59459459],
       [0.19314436, 0.79090909, 0.6984127 , ..., 0.43283582, 0.02348178,
        0.51351351]])

In [24]:
# Initialise the train / test data sets.
# The test set takes 10% of the samples (test_size=0.1); random_state=seed
# makes the shuffle repeatable.
inputs_train, inputs_test, labels_train, labels_test = train_test_split(
    X_train_scale, y_train, test_size=0.1, random_state=seed)
print('The length of train data: ' , len(inputs_train))
print('The length of test data: ' , len(inputs_test))

The length of train data:  4163
The length of test data:  463


In [25]:
# Create the decision-tree classifier (model initialisation); depth and number
# of leaves are left unlimited.
# max_features='sqrt' replaces the former 'auto' alias: for classifiers the two
# meant the same thing, but 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3.
# random_state=seed pins the random feature sub-sampling used at each split, so
# the fitted tree does not depend on how often or in which order cells were run.
clf = DecisionTreeClassifier(
    max_depth=None,
    criterion='gini',
    splitter='best',
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_leaf_nodes=None,
    random_state=seed
    )


In [26]:
# Fit the model on the training split (model learning)
clf.fit(inputs_train, labels_train)
# Measure the prediction accuracy on both splits, then report it
train_accuracy = clf.score(inputs_train, labels_train)
test_accuracy = clf.score(inputs_test, labels_test)
print('模型在训练集上预测的准确率为：', train_accuracy)
print('模型在测试集上预测的准确率为：', test_accuracy)


模型在训练集上预测的准确率为： 1.0
模型在测试集上预测的准确率为： 0.9697624190064795


In [15]:
# Predict once per split; the predictions feed both the reports and the
# confusion matrices below.
predicted_train = clf.predict(inputs_train)
predicted_test = clf.predict(inputs_test)
# Classifier performance report (per-class precision / recall / F1)
print('训练集上的性能：')
print(metrics.classification_report(labels_train, predicted_train))
print('测试集上的性能：')
print(metrics.classification_report(labels_test, predicted_test))
# Classifier confusion matrices.
# The header of the second matrix used to repeat the test-report title; it now
# names the test-set confusion matrix that is actually printed beneath it.
print('训练集混淆矩阵：')
print(metrics.confusion_matrix(labels_train, predicted_train))
print('测试集混淆矩阵：')
print(metrics.confusion_matrix(labels_test, predicted_test))


训练集上的性能：
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        63
           1       1.00      1.00      1.00      1400
           2       1.00      1.00      1.00      2700

    accuracy                           1.00      4163
   macro avg       1.00      1.00      1.00      4163
weighted avg       1.00      1.00      1.00      4163

测试集上的性能：
              precision    recall  f1-score   support

           0       0.38      0.38      0.38         8
           1       0.97      0.97      0.97       162
           2       1.00      1.00      1.00       293

    accuracy                           0.98       463
   macro avg       0.78      0.78      0.78       463
weighted avg       0.98      0.98      0.98       463

训练集混淆矩阵：
[[  63    0    0]
 [   0 1400    0]
 [   0    0 2700]]
测试集上的性能：
[[  3   5   0]
 [  5 157   0]
 [  0   0 293]]


In [25]:
# Re-train on the complete data set (this rebinds `clf`).
# max_features='sqrt' replaces the former 'auto' alias: for classifiers the two
# meant the same thing, but 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3.
# random_state=seed pins the random feature sub-sampling so the fitted tree is
# the same on every run.
clf = DecisionTreeClassifier(
    max_depth=None,
    criterion='gini',
    splitter='best',
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_leaf_nodes=None,
    random_state=seed
)
clf.fit(X_train_scale, y_train)
# The model is scored on the very rows it was fitted on, so the figures below
# are training scores, not an estimate of performance on unseen data.
print('模型在完整数据集上预测的准确率为：', clf.score(X_train_scale, y_train))
predicted_y = clf.predict(X_train_scale)
print('完整数据集上的性能：')
print(metrics.classification_report(y_train, predicted_y))
print('完整数据集的混淆矩阵：')
print(metrics.confusion_matrix(y_train, predicted_y))


模型在完整数据集上预测的准确率为： 1.0
完整数据集上的性能：
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        71
           1       1.00      1.00      1.00      1562
           2       1.00      1.00      1.00      2993

    accuracy                           1.00      4626
   macro avg       1.00      1.00      1.00      4626
weighted avg       1.00      1.00      1.00      4626

完整数据集的混淆矩阵：
[[  71    0    0]
 [   0 1562    0]
 [   0    0 2993]]


In [26]:
# Export the decision tree held in `clf` (per the original note, the one
# trained on the complete data set) to Graphviz DOT, then render it as a PNG.
feature_labels = ['Diameter', 'Roll density', 'Ave force', 'Max force',
                  'Min force', 'Ave CV%', 'Max CV%', 'Min CV%']
# Class names are listed in ascending order of the numeric grade (0, 1, 2)
class_labels = ['Not qualified', 'Qualified', 'GOOD']
export_graphviz(
    clf,
    out_file='tree.dot',
    feature_names=feature_labels,
    class_names=class_labels,
    label='all',
    filled=True,
    leaves_parallel=True,
    impurity=True,
    node_ids=True,
    proportion=True,
    rounded=True,
    precision=3,
)
# The DOT file must hold exactly one graph; the unpacking fails otherwise
[graph] = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')
