In [1]:
# 原生形式LightGBM(import lightgbm as lgb)

In [3]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score

In [5]:
# Load scikit-learn's bundled iris dataset and unpack features and class labels.
iris = datasets.load_iris()
data, label = iris.data, iris.target

In [6]:
# Preview the first five rows of the raw feature matrix.
pd.DataFrame(data).head(n=5)

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [7]:
# Preview the first five class labels.
pd.DataFrame(label).head(n=5)

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [8]:
# Wrap the raw feature matrix in a DataFrame so the columns can be named.
data1 = pd.DataFrame(data=data)

In [11]:
# Name the columns: sepal length/width and petal length/width

In [12]:
# Name the four feature columns: sepal length/width, then petal length/width.
data1.columns = ['sepal_l', 'sepal_w', 'petal_l', 'petal_w']

In [13]:
# Show the first five rows of the feature frame.
data1.head(n=5)

Unnamed: 0,sepql_l,sepal_w,petal_l,petal_w
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [14]:
# Wrap the class labels in a single-column DataFrame.
label1 = pd.DataFrame(data=label)

In [16]:
# Give the single label column an explicit name.
label1.columns = ['label']

In [17]:
# Show the first five rows of the label frame.
label1.head(n=5)

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0


In [18]:
# 划分训练集和测试集

In [20]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# identical across runs.
train_x, test_x, train_y, test_y = train_test_split(
    data1,
    label1,
    test_size=0.3,
    random_state=42,
)
# Report the number of rows in each partition.
print('训练集长度：', train_x.shape[0])
print('测试集长度：', test_x.shape[0])

训练集长度： 105
测试集长度： 45


# LightGBM原生态接口

In [22]:
# Convert the data to LightGBM's Dataset format (DMatrix is XGBoost's structure, not LightGBM's)

In [55]:
# Wrap the training features and labels in a LightGBM Dataset.
train_data = lgb.Dataset(data=train_x, label=train_y)
train_data  # last expression: lets the notebook display the object's repr


<lightgbm.basic.Dataset at 0x1a7af23f518>

In [56]:
# Wrap the hold-out features and labels in a LightGBM Dataset.
# LightGBM's documentation asks for validation Datasets to be created with the
# training Dataset as `reference`, so the link is stated explicitly here.
test_data = lgb.Dataset(data=test_x, label=test_y, reference=train_data)
test_data  # last expression: lets the notebook display the object's repr

<lightgbm.basic.Dataset at 0x1a7af23f588>

In [57]:
# 设置参数

In [58]:
# Parameters for the native LightGBM training API.
lgb_params = {
    'boosting_type': 'gbdt',      # gradient-boosted decision trees
    'objective': 'multiclass',    # multi-class objective; requires num_class
    'metric': 'multi_error',      # evaluation metric reported during training
    'verbose': 1,
    'num_class': 3,               # iris has three target classes
}

In [59]:
# 模型训练

In [60]:
# Train for 10 boosting rounds, evaluating on both the training set and the
# hold-out set and logging the metric after every round.
# NOTE(review): `verbose_eval` is reportedly deprecated/removed in newer LightGBM
# releases in favour of `callbacks=[lgb.log_evaluation(1)]` - confirm against the
# installed version before upgrading.
clf = lgb.train(
    params=lgb_params,
    train_set=train_data,
    num_boost_round=10,
    valid_sets=[train_data, test_data],
    verbose_eval=1,
)

[1]	training's multi_error: 0.133333	valid_1's multi_error: 0.155556
[2]	training's multi_error: 0.0571429	valid_1's multi_error: 0.0444444
[3]	training's multi_error: 0.0666667	valid_1's multi_error: 0
[4]	training's multi_error: 0.0666667	valid_1's multi_error: 0
[5]	training's multi_error: 0.0666667	valid_1's multi_error: 0
[6]	training's multi_error: 0.0666667	valid_1's multi_error: 0
[7]	training's multi_error: 0.0761905	valid_1's multi_error: 0
[8]	training's multi_error: 0.0761905	valid_1's multi_error: 0
[9]	training's multi_error: 0.0761905	valid_1's multi_error: 0
[10]	training's multi_error: 0.0666667	valid_1's multi_error: 0


In [61]:
# 模型预测

In [63]:
# Predict on the test features; each output row holds one score per class.
# NOTE(review): no early stopping is configured in lgb.train, so
# `clf.best_iteration` presumably falls back to using all trained rounds - confirm.
test_pre = clf.predict(data=test_x, num_iteration=clf.best_iteration)
test_pre[:5]  # display only the first five rows

array([[0.13683286, 0.63500393, 0.22816321],
       [0.69436834, 0.15467706, 0.15095461],
       [0.12934308, 0.16125127, 0.70940565],
       [0.14172417, 0.62195656, 0.23631927],
       [0.13683286, 0.63500393, 0.22816321]])

In [64]:
# 选择表示最高概率的列

In [65]:
# Pick, for every row, the column index with the highest predicted score.
# A single vectorised argmax along axis 1 replaces the per-row Python loop.
test_pre_1 = np.argmax(test_pre, axis=1)
print('test的预测结果：', test_pre_1)

test的预测结果： [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]


In [66]:
# 模型评估

In [67]:
# Macro-averaged precision and recall of the native-API predictions on the
# held-out split.
print(
    '验证集精准率：',
    precision_score(y_true=test_y, y_pred=test_pre_1, average='macro'),
)
print(
    '验证集召回率：',
    recall_score(y_true=test_y, y_pred=test_pre_1, average='macro'),
)

验证集精准率： 1.0
验证集召回率： 1.0


# SkLearn接口形式使用LightGBM

In [68]:
import lightgbm as lgb

In [69]:
# Hyper-parameters for the scikit-learn style LGBMClassifier.
# Note: this rebinds `lgb_params`, replacing the dict used by the native API above.
lgb_params = {
    'learning_rate': 0.1,
    'max_bin': 150,
    'num_leaves': 32,
    'max_depth': 11,
    'objective': 'multiclass',
    'n_estimators': 300,
}

In [70]:
# Build the scikit-learn style classifier from the parameter dict defined above.
model = lgb.LGBMClassifier(**lgb_params)

In [71]:
# Fit the classifier on the training split.
# train_y is a one-column DataFrame; scikit-learn style estimators expect a 1-D
# label vector of shape (n_samples,), so flatten it before fitting.
model.fit(train_x, train_y.values.ravel())

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_bin=150,
               max_depth=11, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=300, n_jobs=-1, num_leaves=32,
               objective='multiclass', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [72]:
# 预测

In [74]:
# Predict class labels for the test features with the sklearn-style model.
test_pre2 = model.predict(test_x)
test_pre2  # last expression: display the predicted labels

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0])

In [75]:
# 模型评估

In [76]:
# Macro-averaged precision and recall of the sklearn-API predictions on the
# held-out split.
print(
    '验证集精确率：',
    precision_score(y_true=test_y, y_pred=test_pre2, average='macro'),
)
print(
    '验证集召回率：',
    recall_score(y_true=test_y, y_pred=test_pre2, average='macro'),
)

验证集精确率： 1.0
验证集召回率： 1.0
