In [36]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [37]:
# 使label取值在0到num_class -1 之间的范围内
data = pd.read_csv('.\\data\\seeds_dataset.txt', header=None, sep='\s+', converters={7: lambda x:int(x) - 1})

In [38]:
# 将最后一列字段名设置为label
data.rename(columns={7:'label'}, inplace=True)

In [39]:
data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,label
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,0
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,0
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,0
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,0
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,0
5,14.38,14.21,0.8951,5.386,3.312,2.462,4.956,0
6,14.69,14.49,0.8799,5.563,3.259,3.586,5.219,0
7,14.11,14.1,0.8911,5.42,3.302,2.7,5.0,0
8,16.63,15.46,0.8747,6.053,3.465,2.04,5.877,0
9,16.44,15.25,0.888,5.884,3.505,1.969,5.533,0


In [40]:
# 生成一个随机数并选择小于0.8的数据
mask = np.random.rand(len(data)) < 0.8

In [41]:
train = data[mask]
test = data[~mask]

In [42]:
# 生成DMatrix
xgb_train = xgb.DMatrix(train.iloc[:,:6], label=train.label)
xgb_test = xgb.DMatrix(test.iloc[:,:6], label=test.label)

In [43]:
# 通过softmax进行多分类
params = {
    'objective':'multi:softmax',
    'eta':0.1,
    'max_depth':5,
    'num_class':3
}
watch_list = [(xgb_train,'train'),(xgb_test, 'test')]
num_round = 60
bst = xgb.train(params, xgb_train, num_round, watch_list)

[0]	train-merror:0.02500	test-merror:0.16000
[1]	train-merror:0.01250	test-merror:0.12000
[2]	train-merror:0.01250	test-merror:0.12000
[3]	train-merror:0.01250	test-merror:0.12000
[4]	train-merror:0.00625	test-merror:0.12000
[5]	train-merror:0.00625	test-merror:0.12000
[6]	train-merror:0.00625	test-merror:0.12000
[7]	train-merror:0.00000	test-merror:0.12000
[8]	train-merror:0.00000	test-merror:0.12000
[9]	train-merror:0.00000	test-merror:0.12000
[10]	train-merror:0.00000	test-merror:0.12000
[11]	train-merror:0.00000	test-merror:0.12000
[12]	train-merror:0.00000	test-merror:0.12000
[13]	train-merror:0.00000	test-merror:0.12000
[14]	train-merror:0.00000	test-merror:0.12000
[15]	train-merror:0.00000	test-merror:0.12000
[16]	train-merror:0.00000	test-merror:0.12000
[17]	train-merror:0.00000	test-merror:0.12000
[18]	train-merror:0.00000	test-merror:0.12000
[19]	train-merror:0.00000	test-merror:0.12000
[20]	train-merror:0.00000	test-merror:0.12000
[21]	train-merror:0.00000	test-merror:0.1200

In [44]:
# 模型预测
pred = bst.predict(xgb_test)
error_rate = np.sum(pred != test.label)/test.shape[0]
print('测试集错误率(softmax):{}'.format(error_rate))

测试集错误率(softmax):0.12


In [45]:
# 重新训练模型，输出概率值
params['objective'] = 'multi:softprob'

In [46]:
bst = xgb.train(params,xgb_train,num_round,watch_list)

[0]	train-merror:0.02500	test-merror:0.16000
[1]	train-merror:0.01250	test-merror:0.12000
[2]	train-merror:0.01250	test-merror:0.12000
[3]	train-merror:0.01250	test-merror:0.12000
[4]	train-merror:0.00625	test-merror:0.12000
[5]	train-merror:0.00625	test-merror:0.12000
[6]	train-merror:0.00625	test-merror:0.12000
[7]	train-merror:0.00000	test-merror:0.12000
[8]	train-merror:0.00000	test-merror:0.12000
[9]	train-merror:0.00000	test-merror:0.12000
[10]	train-merror:0.00000	test-merror:0.12000
[11]	train-merror:0.00000	test-merror:0.12000
[12]	train-merror:0.00000	test-merror:0.12000
[13]	train-merror:0.00000	test-merror:0.12000
[14]	train-merror:0.00000	test-merror:0.12000
[15]	train-merror:0.00000	test-merror:0.12000
[16]	train-merror:0.00000	test-merror:0.12000
[17]	train-merror:0.00000	test-merror:0.12000
[18]	train-merror:0.00000	test-merror:0.12000
[19]	train-merror:0.00000	test-merror:0.12000
[20]	train-merror:0.00000	test-merror:0.12000
[21]	train-merror:0.00000	test-merror:0.1200

In [47]:
# 模型预测
pred_prob = bst.predict(xgb_test)
print(pred_prob)

[[0.99314183 0.00332915 0.00352908]
 [0.991872   0.00394551 0.00418246]
 [0.8182265  0.17272165 0.0090518 ]
 [0.99264824 0.00356872 0.00378304]
 [0.9830935  0.00857816 0.0083283 ]
 [0.9930722  0.00336289 0.00356485]
 [0.9532523  0.03885902 0.00788868]
 [0.87496346 0.11201561 0.01302097]
 [0.9918902  0.0039367  0.00417312]
 [0.9920202  0.00387362 0.00410625]
 [0.9920202  0.00387362 0.00410625]
 [0.8879479  0.00808774 0.10396435]
 [0.01301679 0.9821827  0.00480055]
 [0.9796275  0.01671542 0.00365707]
 [0.99173963 0.00400976 0.00425056]
 [0.10333674 0.8865497  0.01011356]
 [0.9920202  0.00387362 0.00410625]
 [0.01119421 0.0030668  0.98573893]
 [0.00651125 0.98991466 0.00357416]
 [0.00588603 0.9905376  0.0035764 ]
 [0.00672877 0.9901821  0.00308914]
 [0.04487158 0.9448663  0.01026213]
 [0.00539573 0.9915109  0.00309329]
 [0.04422632 0.9456591  0.01011456]
 [0.00563388 0.9912736  0.00309255]
 [0.00746105 0.9889682  0.00357074]
 [0.04487158 0.9448663  0.01026213]
 [0.00506865 0.9918371  0.00

In [48]:
# 取向量中预测值最大的分类作为预测类别
pred_label = np.argmax(pred_prob, axis=1)
pred_label

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 0, 2, 2, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 2], dtype=int64)

In [49]:
# 计算测试集错误率
error_rate = np.sum(pred_label != test.label)/test.shape[0]
print('测试集错误率(softprob):{}'.format(error_rate))

测试集错误率(softprob):0.12


In [51]:
bst.save_model('.\\model\\seeds_multi_prob.model')