In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score, classification
from sklearn.model_selection import cross_validate, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
import pandas as pd

from gbdtlr import GBDTLR

In [2]:
# Load the breast-cancer dataset and split it into a feature frame and a target.
data = load_breast_cancer()
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])  # named feature columns
y = data["target"]

In [14]:
# Baseline: plain logistic regression using the liblinear solver.
lr = LogisticRegression(solver='liblinear')

# Candidate: GBDTLR from the local `gbdtlr` package, configured with a
# 'lightgbm' backend. NOTE(review): presumably tree-derived features feed a
# logistic regression, with the raw columns kept alongside them because
# include_original_feature=True — confirm against the gbdtlr implementation.
gbdt_lr = GBDTLR(
    model_type='lightgbm',
    n_estimators=10,
    num_leaves=8,
    include_original_feature=True,
)

In [15]:
# Cross-validated performance of the plain logistic regression baseline.
# The splitter is seeded, and the same `cv` object is reused for the GBDT+LR run below.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=1024)

# No `scoring` is passed, so cross_validate falls back to its default scoring.
lr_result = pd.DataFrame(
    data=cross_validate(estimator=lr, X=X, y=y, cv=cv, return_train_score=True)
)
lr_result.mean(axis=0)  # per-column average over the 5 splits

fit_time       0.010601
score_time     0.002200
test_score     0.957895
train_score    0.955779
dtype: float64

In [16]:
# Cross-validated performance of the GBDT+LR model, evaluated with the same
# `cv` splitter object that was used for the plain logistic regression.
gbdt_lr_result = pd.DataFrame(
    data=cross_validate(estimator=gbdt_lr, X=X, y=y, cv=cv, return_train_score=True)
)
gbdt_lr_result.mean(axis=0)  # per-column average over the splits

fit_time       0.060006
score_time     0.014401
test_score     0.974269
train_score    0.995980
dtype: float64