In [50]:
import json
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_curve

In [2]:
# Directory holding the tab-separated train/test files used below.
data_path = Path("LightGBM_examples") / "regression"

In [3]:
# Read the tab-separated train/test files; they have no header row,
# so pandas labels the columns 0, 1, 2, ...
print('Load data...')
train_file = data_path / 'regression.train'
test_file = data_path / 'regression.test'
df_train = pd.read_csv(train_file, sep='\t', header=None)
df_test = pd.read_csv(test_file, sep='\t', header=None)

Load data...


In [12]:
# Column 0 is the target; every remaining column is a feature.
y_train = df_train[0].to_numpy()
y_test = df_test[0].to_numpy()
x_train = df_train.drop(columns=[0]).to_numpy()
x_test = df_test.drop(columns=[0]).to_numpy()

In [13]:
# Wrap the arrays as LightGBM datasets; the validation set references the
# training set so both share the same feature binning.
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_eval = lgb.Dataset(data=x_test, label=y_test, reference=lgb_train)

In [22]:
# Leaf count per tree. Defined once and shared by the model configuration and
# by the later leaf-index one-hot encoding, which reserves `num_leaf` columns
# per tree; the two values must agree or the encoding would be misaligned.
num_leaf = 64

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': num_leaf,
    'num_trees': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('Start training...')

# Train the boosted model, reporting binary log-loss on the held-out set.
gbm = lgb.train(params=params,
                train_set=lgb_train,
                valid_sets=[lgb_eval])

print('Start predicting...')

# With pred_leaf=True each row holds, for every tree, the index of the leaf
# that the sample falls into (one column per tree).
y_pred = gbm.predict(x_train, pred_leaf=True)
# Default prediction: the model's score for each training sample.
y_pred_prob = gbm.predict(x_train)

Start training...
[1]	valid_0's binary_logloss: 0.687436
[2]	valid_0's binary_logloss: 0.685819
[3]	valid_0's binary_logloss: 0.684125
[4]	valid_0's binary_logloss: 0.68213
[5]	valid_0's binary_logloss: 0.679867
[6]	valid_0's binary_logloss: 0.67851
[7]	valid_0's binary_logloss: 0.677043
[8]	valid_0's binary_logloss: 0.675195
[9]	valid_0's binary_logloss: 0.673341
[10]	valid_0's binary_logloss: 0.671472
[11]	valid_0's binary_logloss: 0.669595
[12]	valid_0's binary_logloss: 0.667739
[13]	valid_0's binary_logloss: 0.665983
[14]	valid_0's binary_logloss: 0.664583
[15]	valid_0's binary_logloss: 0.66293
[16]	valid_0's binary_logloss: 0.661583
[17]	valid_0's binary_logloss: 0.659852
[18]	valid_0's binary_logloss: 0.658628
[19]	valid_0's binary_logloss: 0.657021
[20]	valid_0's binary_logloss: 0.655344
[21]	valid_0's binary_logloss: 0.653583
[22]	valid_0's binary_logloss: 0.651843
[23]	valid_0's binary_logloss: 0.650253
[24]	valid_0's binary_logloss: 0.648701
[25]	valid_0's binary_logloss: 0.6

In [24]:
# Turn predicted scores into hard 0/1 labels: 1 when strictly above the threshold.
threshold = 0.5
result = [1 if pred > threshold else 0 for pred in y_pred_prob]
print('result:', result[:5])

result: [1, 1, 1, 1, 0]


In [33]:
# Sanity-check the leaf-index matrix (samples x trees) against the raw
# feature matrix (samples x features), and recall the per-tree leaf count.
print(type(y_pred))
print(y_pred.shape, "表示7000个样本，100个树")
print(x_train.shape, "表示7000个样本，28个特征变量")
print('每个树产生了', num_leaf, '个叶子。')

<class 'numpy.ndarray'>
(7000, 100) 表示7000个样本，100个树
(7000, 28) 表示7000个样本，28个特征变量
每个树产生了 64 个叶子。


In [35]:
print('Writing transformed training data')
# One row per sample and one column per (tree, leaf) pair, initialised to zero.
n_samples, n_trees = y_pred.shape[0], y_pred.shape[1]
transformed_training_matrix = np.zeros((n_samples, n_trees * num_leaf), dtype=np.int64)

Writing transformed training data


In [36]:
# One-hot encode the leaf each training sample falls into, tree by tree.
# Tree t owns columns [t * num_leaf, (t + 1) * num_leaf), so the starting
# offsets are 0, 64, 128, ..., 6336 for 100 trees of 64 leaves.
leaf_offsets = np.arange(y_pred.shape[1]) * num_leaf
for i in range(y_pred.shape[0]):
    # Column index = tree offset + leaf index within that tree.
    # The assignment must stay inside the loop; otherwise only the last
    # sample's row is ever filled.
    transformed_training_matrix[i, leaf_offsets + y_pred[i]] += 1

In [38]:
# Leaf index of every test sample in every tree (one column per tree).
y_pred = gbm.predict(x_test, pred_leaf=True)
print('Writing transformed testing data')
# Take both dimensions from .shape instead of indexing a particular row, so
# the code also works when the test set has a single sample.
n_test_samples, n_test_trees = y_pred.shape
transformed_testing_matrix = np.zeros([n_test_samples, n_test_trees * num_leaf], dtype=np.int64)
# Starting column of each tree's block of `num_leaf` one-hot columns.
leaf_offsets = np.arange(n_test_trees) * num_leaf
for i in range(n_test_samples):
    # Build the one-hot test row: column index = tree offset + leaf index.
    transformed_testing_matrix[i, leaf_offsets + y_pred[i]] += 1

Writing transformed testing data


In [47]:
# Confirm the type and shape of the leaf-encoded training matrix.
print(type(transformed_training_matrix))
print(transformed_training_matrix.shape, '表示我们最终得到了我们的特征变量，下面就是检验 LR 和 LightGBM 的差异了')

<class 'numpy.ndarray'>
(7000, 6400) 表示我们最终得到了我们的特征变量，下面就是检验 LR 和 LightGBM 的差异了


In [51]:
# Score the held-out set with the boosted model on its own.
print('Start predicting...')
y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)

# KS statistic: the largest gap between true-positive rate and
# false-positive rate over all ROC thresholds.
fpr, tpr, thres = roc_curve(y_test, y_pred)
ks = tpr - fpr
ks_max = ks.max()
print(ks_max)

# RMSE between the labels and the predicted scores.
# mean_squared_error is provided by sklearn.metrics; it must be imported in
# the notebook's import cell for this cell to run on a fresh kernel.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', rmse)

Start predicting...
0.5023864809081527
The rmse of prediction is: 0.441034323269683


In [69]:
from sklearn.linear_model import LogisticRegression

参考 https://riptutorial.com/scikit-learn/example/27960/classification-using-logistic-regression

In [53]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the raw training features.
# NOTE(review): this uses x_train, not transformed_training_matrix, so the
# leaf-encoded features built earlier are not part of this model -- confirm
# that this is the intended comparison.
lr = LogisticRegression()
lr.fit(x_train, y_train)



In [67]:
# Score the held-out set with the logistic-regression model.
print('Start predicting...')
# Column 1 of predict_proba is the probability of the positive class.
y_pred = lr.predict_proba(x_test)[:, 1]

# KS statistic: the largest gap between true-positive rate and
# false-positive rate over all ROC thresholds.
fpr, tpr, thres = roc_curve(y_test, y_pred)
ks = tpr - fpr
ks_max = ks.max()
print(ks_max)

# RMSE between the labels and the predicted probabilities.
# mean_squared_error is provided by sklearn.metrics; it must be imported in
# the notebook's import cell for this cell to run on a fresh kernel.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', rmse)

Start predicting...
0.3188854489164087
The rmse of prediction is: 0.46944242847539996


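结论：结果没有变好。逻辑回归在测试集上的表现不如 LightGBM（KS：0.319 对 0.502；RMSE：0.469 对 0.441）。需要注意的是，这里的 LR 是用原始特征 x_train 训练的，并没有使用上面构造的叶子节点 one-hot 特征（transformed_training_matrix / transformed_testing_matrix）。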