# 原生形式LightGBM(import lightgbm as lgb)

In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score

In [3]:
iris = datasets.load_iris()
data = iris.data
label = iris.target

In [4]:
pd.DataFrame(data).head()
pd.DataFrame(label).head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [5]:
data1 = pd.DataFrame(data)

In [6]:
# 给列进行命名花萼长度和花萼宽度

In [7]:
data1.columns = ['sepql_l', 'sepal_w', 'petal_l', 'petal_w']

In [8]:
data1.head()

Unnamed: 0,sepql_l,sepal_w,petal_l,petal_w
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [9]:
label1 = pd.DataFrame(label)
label1.columns = ['label']
label1.head()

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0


In [10]:
data1.shape

(150, 4)

In [11]:
label1.label.value_counts()

2    50
1    50
0    50
Name: label, dtype: int64

In [12]:
# 划分训练集和测试集

In [13]:
train_x, test_x, train_y, test_y = train_test_split(data1, label1, test_size=0.3, random_state=42)
print('训练集长度：', len(train_x))
print('测试集长度：', len(test_x))

训练集长度： 105
测试集长度： 45


In [14]:
# 需要将数据转化为DMatrix格式

In [15]:
test_data = xgb.DMatrix(data=test_x, label=test_y); test_data

<xgboost.core.DMatrix at 0x179bd8fb8d0>

In [16]:
# 设置参数

In [17]:
# multi:softmax是使用softmax后产生的分类结果，而multi:softprob是输出的概率矩阵

In [18]:
xgb_params = {
    'eta': '0.3',  # 学习率 
    'silent': 'True',  # 输出运行讯息
    'objective': 'multi:softprob',
    'num_class': 3,  # 共有几个类别
    'max_depth': 3}
num_round = 20

In [19]:
# 模型训练

In [20]:
model = xgb.train(params=xgb_params, dtrain=xgb.DMatrix(data=train_x, label=train_y), num_boost_round=num_round)

In [21]:
# 模型预测

In [22]:
test_pre = model.predict(test_data)

In [23]:
test_pre[:5]

array([[0.00650657, 0.96226174, 0.03123166],
       [0.970643  , 0.02533227, 0.00402478],
       [0.0033913 , 0.00692109, 0.9896876 ],
       [0.00654362, 0.9677424 , 0.02571394],
       [0.00615641, 0.9104776 , 0.08336602]], dtype=float32)

In [24]:
# 选择表示最高概率分列

In [25]:
test_pre_1 = np.asarray([np.argmax(row) for row in test_pre])
print('test的预测结果：', test_pre_1)

test的预测结果： [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]


In [26]:
# 模型评估

In [27]:
print('验证集精准率：', precision_score(test_y, test_pre_1, average='macro'))
print('验证集召回率：', recall_score(test_y, test_pre_1, average='macro'))

验证集精准率： 1.0
验证集召回率： 1.0


# SkLearn接口形式使用LightGBM

In [28]:
from xgboost import XGBClassifier

In [29]:
model = XGBClassifier(learning_rate=0.01, n_estimators=3000, max_depth=4, objective='binary:logistic', seed=27)

In [30]:
model.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=3000, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=27, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, seed=27, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [31]:
# 预测

In [32]:
test_pre2 = model.predict(test_x); test_pre2

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0])

In [33]:
# 模型评估

In [34]:
print('验证集精确率：', precision_score(test_y, test_pre2, average='macro'))
print('验证集召回率：', recall_score(test_y, test_pre2, average='macro'))

验证集精确率： 1.0
验证集召回率： 1.0
