In [50]:
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from pathlib import Path

In [51]:
# Directory that holds the LightGBM regression example files read by this notebook.
data_path = Path("LightGBM_examples") / "regression"

In [52]:
# Read the tab-separated, header-less train and test files into DataFrames.
print('Load data...')
df_train, df_test = (
    pd.read_csv(data_path / file_name, header=None, sep='\t')
    for file_name in ('regression.train', 'regression.test')
)

Load data...


In [53]:
# Column 0 of each frame is used as the target; every other column is a feature.
label_column = 0
y_train, y_test = (frame[label_column].values for frame in (df_train, df_test))
x_train, x_test = (
    frame.drop(columns=label_column).values for frame in (df_train, df_test)
)

In [54]:
# Wrap the arrays as LightGBM datasets; the evaluation set is created with
# the training set as its reference.
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_eval = lgb.Dataset(data=x_test, label=y_test, reference=lgb_train)

In [55]:
# Booster configuration handed to lgb.train.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'auc'},
    num_leaves=64,
    num_trees=100,
    learning_rate=0.01,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
    max_depth=3,
)

# Number of leaf slots reserved per tree in the one-hot feature transformation
# further down; kept equal to the configured num_leaves (64).
num_leaf = params['num_leaves']

print('Start training...')

# Train the booster, evaluating on both the training and the evaluation set;
# verbose_eval=2 logs the metric every second iteration.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=2,
)

print('Start predicting...')

# With pred_leaf=True, y_pred holds for every training sample the index of the
# leaf it lands in on each tree; y_pred_prob holds the ordinary model output.
y_pred = gbm.predict(x_train, pred_leaf=True)
y_pred_prob = gbm.predict(x_train)

Start training...
[2]	training's auc: 0.703772	valid_1's auc: 0.690902
[4]	training's auc: 0.738445	valid_1's auc: 0.73606
[6]	training's auc: 0.73935	valid_1's auc: 0.746638
[8]	training's auc: 0.74507	valid_1's auc: 0.754628
[10]	training's auc: 0.742922	valid_1's auc: 0.749097
[12]	training's auc: 0.742826	valid_1's auc: 0.738132
[14]	training's auc: 0.740134	valid_1's auc: 0.733633
[16]	training's auc: 0.738513	valid_1's auc: 0.732255
[18]	training's auc: 0.743956	valid_1's auc: 0.739986
[20]	training's auc: 0.744627	valid_1's auc: 0.741414
[22]	training's auc: 0.745646	valid_1's auc: 0.74351
[24]	training's auc: 0.745929	valid_1's auc: 0.744203
[26]	training's auc: 0.749257	valid_1's auc: 0.746573
[28]	training's auc: 0.748335	valid_1's auc: 0.745074
[30]	training's auc: 0.747207	valid_1's auc: 0.741663
[32]	training's auc: 0.746398	valid_1's auc: 0.740607
[34]	training's auc: 0.745564	valid_1's auc: 0.740317
[36]	training's auc: 0.745323	valid_1's auc: 0.738003
[38]	training's au




[74]	training's auc: 0.757335	valid_1's auc: 0.759554
[76]	training's auc: 0.757712	valid_1's auc: 0.759747
[78]	training's auc: 0.759006	valid_1's auc: 0.76294
[80]	training's auc: 0.760164	valid_1's auc: 0.765988
[82]	training's auc: 0.760281	valid_1's auc: 0.765988
[84]	training's auc: 0.760801	valid_1's auc: 0.766996
[86]	training's auc: 0.76132	valid_1's auc: 0.767431
[88]	training's auc: 0.761753	valid_1's auc: 0.768068
[90]	training's auc: 0.762556	valid_1's auc: 0.770051
[92]	training's auc: 0.763102	valid_1's auc: 0.771099
[94]	training's auc: 0.763067	valid_1's auc: 0.771148
[96]	training's auc: 0.763695	valid_1's auc: 0.771494
[98]	training's auc: 0.764601	valid_1's auc: 0.772704
[100]	training's auc: 0.765142	valid_1's auc: 0.773599
Start predicting...


In [56]:
# Turn predicted scores into hard 0/1 labels: 1 when the score exceeds the threshold.
threshold = 0.5
result = [int(pred > threshold) for pred in y_pred_prob]
print('result:', result[:5])

result: [1, 1, 1, 1, 1]


In [57]:
# Show the type of the leaf-index predictions and the shapes of the leaf-index
# array and of the raw feature matrix.
print(type(y_pred))
for shape, note in (
    (y_pred.shape, "表示7000个样本，100个树"),
    (x_train.shape, "表示7000个样本，28个特征变量"),
):
    print(shape, note)
# NOTE(review): num_leaf is the configured num_leaves value, not a measured
# leaf count; with max_depth=3 the trees presumably have far fewer leaves — confirm.
print('每个树产生了', num_leaf, '个叶子。')

<class 'numpy.ndarray'>
(7000, 100) 表示7000个样本，100个树
(7000, 28) 表示7000个样本，28个特征变量
每个树产生了 64 个叶子。


In [58]:
print('Writing transformed training data')
# One row per sample and one block of num_leaf columns per tree, all zeros to start.
n_train_rows = y_pred.shape[0]
n_train_cols = y_pred.shape[1] * num_leaf
transformed_training_matrix = np.zeros((n_train_rows, n_train_cols), dtype=np.int64)

Writing transformed training data


In [59]:
# One-hot encode the leaf index of every training sample.
# Tree t owns the column range [t * num_leaf, (t + 1) * num_leaf); the column
# that is switched on for a sample is that tree's offset plus the predicted leaf
# index (offsets are 0, 64, 128, ..., 6336 for 100 trees with num_leaf = 64).
tree_offsets = np.arange(y_pred.shape[1]) * num_leaf  # loop-invariant, computed once
for i in range(y_pred.shape[0]):
    # temp holds, for sample i, one flat column index per tree.
    temp = tree_offsets + np.asarray(y_pred[i])
    # This assignment must stay inside the loop so every row is filled (not
    # only the last one). Plain assignment instead of "+= 1" keeps the cell
    # idempotent when it is re-run on the same matrix.
    transformed_training_matrix[i, temp] = 1

In [60]:
# Leaf index of every test sample on every tree.
y_pred = gbm.predict(x_test, pred_leaf=True)
print('Writing transformed testing data')
# Size the matrix from the array shape rather than from len(y_pred[1]), which
# needs at least two rows just to read the tree count.
n_test_rows, n_test_trees = y_pred.shape[0], y_pred.shape[1]
transformed_testing_matrix = np.zeros((n_test_rows, n_test_trees * num_leaf), dtype=np.int64)
# Tree t owns the column range [t * num_leaf, (t + 1) * num_leaf).
tree_offsets = np.arange(n_test_trees) * num_leaf  # loop-invariant, computed once
for i in range(n_test_rows):
    temp = tree_offsets + np.asarray(y_pred[i])
    # Build the one-hot test rows; plain assignment keeps the cell idempotent.
    transformed_testing_matrix[i, temp] = 1

Writing transformed testing data


In [61]:
# Show the container type and the final shape of the leaf-encoded training features.
encoded_train = transformed_training_matrix
print(type(encoded_train))
print(encoded_train.shape, '表示我们最终得到了我们的特征变量，下面就是检验 LR 和 LightGBM 的差异了')

<class 'numpy.ndarray'>
(7000, 6400) 表示我们最终得到了我们的特征变量，下面就是检验 LR 和 LightGBM 的差异了


In [62]:
# KS statistic of the LightGBM scores: the maximum of (TPR - FPR) over the ROC
# thresholds, computed first on the training set and then on the test set.
print('Start predicting...')
for ks_label, features, labels in (
    ('训练集 KS 值: ', x_train, y_train),
    ('测试集 KS 值: ', x_test, y_test),
):
    y_pred = gbm.predict(features, num_iteration=gbm.best_iteration)
    fpr, tpr, thres = roc_curve(labels, y_pred)
    ks = tpr - fpr
    ks_max = np.max(ks)
    print(ks_label, ks_max)

Start predicting...
训练集 KS 值:  0.39735731451969236
测试集 KS 值:  0.43369453044375644


In [63]:
from sklearn.linear_model import LogisticRegression

参考 https://riptutorial.com/scikit-learn/example/27960/classification-using-logistic-regression

In [64]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C=0.1.
# NOTE(review): the model is fit on the raw features x_train; the leaf-encoded
# transformed_training_matrix built earlier is not used here — confirm intended.
lr = LogisticRegression(C=0.1)
lr.fit(x_train, y_train)



In [65]:
# KS statistic of the logistic-regression scores (second predict_proba column):
# the maximum of (TPR - FPR) over the ROC thresholds, for train then test.
print('Start predicting...')
for ks_label, features, labels in (
    ('训练集 KS 值: ', x_train, y_train),
    ('测试集 KS 值: ', x_test, y_test),
):
    y_pred = lr.predict_proba(features)[:, 1]
    fpr, tpr, thres = roc_curve(labels, y_pred)
    ks = tpr - fpr
    ks_max = np.max(ks)
    print(ks_label, ks_max)

Start predicting...
训练集 KS 值:  0.2610369747833053
测试集 KS 值:  0.3097910216718266


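结果没有变好：上面的 LR 是直接在原始特征 x_train 上训练的，训练集/测试集 KS 约为 0.26/0.31，低于 LightGBM 的 0.40/0.43；前面构造的 transformed_training_matrix 和 transformed_testing_matrix 并没有用于 LR 的训练和预测。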