## Credit cards

In [3]:
import pandas as pd
from sklearn import metrics
from sklearn import model_selection

# Read the transactions; the 'Class' column is the target, every other
# column is used as a feature.
df = pd.read_csv('creditcard.csv')
y = df['Class']
X = df.drop('Class', axis='columns')

# First cut: hold out a test set. No split size is passed, so
# scikit-learn's default proportion applies.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=42)

# Second cut (same seed): divide the remaining training rows into the part
# the model is fitted on and a validation part.
X_fit, X_val, y_fit, y_val = model_selection.train_test_split(X_train, y_train, random_state=42)

In [4]:
import lightgbm

# Wrap the fit / validation splits as LightGBM datasets; the validation
# dataset is linked to the fit dataset through `reference`.
fit = lightgbm.Dataset(data=X_fit, label=y_fit)
val = lightgbm.Dataset(data=X_val, label=y_val, reference=fit)

# Baseline run with LightGBM's built-in 'binary' objective. Both datasets are
# monitored, progress is logged every 100 rounds, and training stops once the
# validation score has not improved for 20 rounds.
model = lightgbm.train(
    {'learning_rate': 0.01, 'objective': 'binary'},
    fit,
    num_boost_round=10000,
    valid_sets=(fit, val),
    valid_names=('fit', 'val'),
    early_stopping_rounds=20,
    verbose_eval=100,
)

# Score the held-out test split.
y_pred = model.predict(X_test)
test_auc = metrics.roc_auc_score(y_test, y_pred)
test_logloss = metrics.log_loss(y_test, y_pred)

print()
print(f"Test's ROC AUC: {test_auc:.5f}")
print(f"Test's logloss: {test_logloss:.5f}")


Training until validation scores don't improve for 20 rounds
[100]	fit's binary_logloss: 0.00191083	val's binary_logloss: 0.00358371
[200]	fit's binary_logloss: 0.000825181	val's binary_logloss: 0.00286873
[300]	fit's binary_logloss: 0.000403679	val's binary_logloss: 0.00262094
Early stopping, best iteration is:
[355]	fit's binary_logloss: 0.000282887	val's binary_logloss: 0.00257033

Test's ROC AUC: 0.97721
Test's logloss: 0.00233


In [5]:
import lightgbm
import numpy as np
from scipy import special

def logloss_init_score(y):
    """Return the log-odds of the mean of ``y``.

    The mean is clipped to ``[1e-15, 1 - 1e-15]`` before the log-odds are
    taken, so an all-zero or all-one ``y`` still produces a finite value.

    Args:
        y: Labels exposing a ``.mean()`` method (the notebook passes a
            pandas Series).

    Returns:
        ``log(p / (1 - p))`` where ``p`` is the clipped mean of ``y``.
    """
    eps = 1e-15
    base_rate = np.clip(y.mean(), eps, 1 - eps)
    return np.log(base_rate / (1 - base_rate))

def logloss_objective(preds, train_data):
    """Gradient and hessian of the binary log loss with respect to raw scores.

    Args:
        preds: Raw scores; they are passed through the sigmoid here to
            obtain probabilities.
        train_data: Object whose ``get_label()`` returns the labels.

    Returns:
        Tuple ``(grad, hess)`` with ``grad = sigmoid(preds) - labels`` and
        ``hess = sigmoid(preds) * (1 - sigmoid(preds))``.
    """
    labels = train_data.get_label()
    prob = special.expit(preds)
    return prob - labels, prob * (1 - prob)

def logloss_metric(preds, train_data):
    """Evaluation metric: binary log loss computed from raw scores.

    Args:
        preds: Raw scores; they are passed through the sigmoid here to
            obtain probabilities.
        train_data: Object whose ``get_label()`` returns the labels
            (expected to be 0/1, matching ``logloss_objective``).

    Returns:
        Tuple ``('logloss', value, False)``; the trailing ``False`` marks the
        metric as lower-is-better.
    """
    y = train_data.get_label()
    p = special.expit(preds)
    # Pass the label set explicitly: without it, log_loss raises when the
    # evaluated data happens to contain a single class, which can occur on
    # heavily imbalanced data. The result is unchanged when both classes are
    # present.
    loss = metrics.log_loss(y, p, labels=[0, 1])
    is_higher_better = False
    return 'logloss', loss, is_higher_better

# Constant starting score derived from the fit labels. It is computed once
# and reused for both datasets and again at prediction time.
base_score = logloss_init_score(y_fit)

fit = lightgbm.Dataset(
    data=X_fit,
    label=y_fit,
    init_score=np.full_like(y_fit, base_score, dtype=float),
)

# The validation set receives the same fit-derived constant.
val = lightgbm.Dataset(
    data=X_val,
    label=y_val,
    init_score=np.full_like(y_val, base_score, dtype=float),
    reference=fit,
)

# Same training setup as the built-in run, but with the hand-written
# objective and metric plugged in instead of 'objective': 'binary'.
model = lightgbm.train(
    {'learning_rate': 0.01},
    fit,
    num_boost_round=10000,
    valid_sets=(fit, val),
    valid_names=('fit', 'val'),
    early_stopping_rounds=20,
    verbose_eval=100,
    fobj=logloss_objective,
    feval=logloss_metric,
)

# The starting score is added back to the model output by hand, and the sum
# is squashed through the sigmoid to obtain probabilities.
y_pred = special.expit(base_score + model.predict(X_test))
test_auc = metrics.roc_auc_score(y_test, y_pred)
test_logloss = metrics.log_loss(y_test, y_pred)

print()
print(f"Test's ROC AUC: {test_auc:.5f}")
print(f"Test's logloss: {test_logloss:.5f}")


Training until validation scores don't improve for 20 rounds
[100]	fit's logloss: 0.00191083	val's logloss: 0.00358371
[200]	fit's logloss: 0.000825181	val's logloss: 0.00286873
[300]	fit's logloss: 0.000403679	val's logloss: 0.00262094
Early stopping, best iteration is:
[355]	fit's logloss: 0.000282887	val's logloss: 0.00257033

Test's ROC AUC: 0.97721
Test's logloss: 0.00233


In [7]:
# Evenly spaced grid 0.0, 0.1, ..., 1.0 (eleven points).
p = np.arange(11) / 10
p

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [8]:
# 3 raised to (1 + p) for every grid point: runs from 3 at p=0 to 9 at p=1.
3 ** (p + 1)

array([3.        , 3.34836952, 3.73719282, 4.17116751, 4.65553672,
       5.19615242, 5.79954613, 6.47300784, 7.22467406, 8.06362614,
       9.        ])

array([1.        , 1.        , 1.00000004, 1.00000155, 1.0000186 ,
       1.0001201 , 1.00052603, 1.00176972, 1.00492263, 1.01186757,
       1.02560086])

In [12]:
# Grid scaled by ten: 0, 1, ..., 10.
p * 10

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])