In [1]:
%load_ext nb_black
%load_ext autoreload

%autoreload 2

<IPython.core.display.Javascript object>

In [2]:
import os
from pathlib import Path

from requests import get
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import EarlyStopping

import logging

logging.basicConfig(level=logging.WARN)

<IPython.core.display.Javascript object>

In [3]:
from thc_net.explainable_model.input_utils import preproc_dataset
from thc_net.explainable_model.model import build_model
from thc_net.explainable_model.random_utils import setup_seed, SEED
from sklearn.model_selection import StratifiedShuffleSplit

import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

%matplotlib inline

<IPython.core.display.Javascript object>

In [4]:
from tqdm.auto import tqdm

<IPython.core.display.Javascript object>

In [5]:
setup_seed()

<IPython.core.display.Javascript object>

In [6]:
def download(url, out, force=False, verify=True):
    out.parent.mkdir(parents=True, exist_ok=True)
    if force and out.exists():
        print(f"Removing file at {str(out)}")
        out.unlink()

    if out.exists():
        print("File already exists.")
        return
    print(f"Downloading {url} at {str(out)} ...")
    # open in binary mode
    with out.open(mode="wb") as file:
        # get request
        response = get(url, verify=verify)
        for chunk in response.iter_content(100000):
            # write to file
            file.write(chunk)


<IPython.core.display.Javascript object>

In [7]:
def plot_history(history):
    loss_list = [s for s in history.history.keys() if "loss" in s and "val" not in s]
    val_loss_list = [s for s in history.history.keys() if "loss" in s and "val" in s]
    acc_list = [s for s in history.history.keys() if "AUC" in s and "val" not in s]
    val_acc_list = [s for s in history.history.keys() if "AUC" in s and "val" in s]

    if len(loss_list) == 0:
        print("Loss is missing in history")
        return

    ## As loss always exists
    epochs = range(1, len(history.history[loss_list[0]]) + 1)

    ## Loss
    plt.figure(1)
    for l in loss_list:
        plt.plot(
            epochs,
            history.history[l],
            "b",
            label="Training loss ("
            + str(str(format(history.history[l][-1], ".5f")) + ")"),
        )
    for l in val_loss_list:
        plt.plot(
            epochs,
            history.history[l],
            "g",
            label="Validation loss ("
            + str(str(format(history.history[l][-1], ".5f")) + ")"),
        )

    plt.title("Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()

    plt.show()

<IPython.core.display.Javascript object>

In [8]:
dataset_name = "portoseguro"
filename = "train_bench.csv"
target = "target"
ids = []

<IPython.core.display.Javascript object>

In [9]:
dataset_name = "road-safety"
filename = "train_bench.csv"
target = "Sex_of_Driver_df_res"
ids = []

<IPython.core.display.Javascript object>

In [10]:
dataset_name = "give-me-some-credit"
filename = "train_bench.csv"
target = "SeriousDlqin2yrs"
ids = ["Unamed", "age"]

<IPython.core.display.Javascript object>

In [11]:
dataset_name = "albert"
filename = "train_bench.csv"
target = "target"
ids = []

<IPython.core.display.Javascript object>

In [12]:
dataset_name = "cat-in-the-dat-ii"
filename = "train_bench.csv"
target = "target"
ids = ["id"]

<IPython.core.display.Javascript object>

In [13]:
dataset_name = "bank-marketing"
filename = "train_bench.csv"
target = "y"
ids = []

<IPython.core.display.Javascript object>

In [14]:
dataset_name = "open-payments"
filename = "train_bench.csv"
target = "status"
ids = []

<IPython.core.display.Javascript object>

In [15]:
dataset_name = "census-income"
filename = "train_bench.csv"
target = "taxable income amount"
ids = []

<IPython.core.display.Javascript object>

In [16]:
dataset_name = "homesite-quote-conversion"
filename = "train_bench.csv"
target = "QuoteConversion_Flag"
ids = []

<IPython.core.display.Javascript object>

In [17]:
dataset_name = "bnp-cardif"
filename = "train_bench.csv"
target = "target"
ids = []

<IPython.core.display.Javascript object>

In [18]:
dataset_name = "springleaf-marketing-response"
filename = "train_bench.csv"
target = "target"
ids = []

<IPython.core.display.Javascript object>

In [19]:
out = Path(os.getcwd()) / "data" / dataset_name / filename

<IPython.core.display.Javascript object>

In [20]:
train = pd.read_csv(out, low_memory=False)
train.shape

(145231, 1935)

<IPython.core.display.Javascript object>

In [21]:
train.columns

Index(['ID', 'VAR_0001', 'VAR_0002', 'VAR_0003', 'VAR_0004', 'VAR_0005',
       'VAR_0006', 'VAR_0007', 'VAR_0008', 'VAR_0009',
       ...
       'VAR_1927', 'VAR_1928', 'VAR_1929', 'VAR_1930', 'VAR_1931', 'VAR_1932',
       'VAR_1933', 'VAR_1934', 'target', 'Set'],
      dtype='object', length=1935)

<IPython.core.display.Javascript object>

In [22]:
if "Set" not in train.columns:
    print("Building tailored column")
    train_valid_index, test_index = next(
        StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=SEED).split(
            range(train[target].shape[0]), train[target].values
        )
    )
    train_index, valid_index = next(
        StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=SEED).split(
            train_valid_index, train[target].values[train_valid_index]
        )
    )
    train["Set"] = "train"
    train["Set"][valid_index] = "valid"
    train["Set"][test_index] = "test"
    # train.to_csv((out.parent / "train_bench.csv").as_posix(), index=False)

<IPython.core.display.Javascript object>

In [23]:
train_indices = train[train.Set == "train"].index
valid_indices = train[train.Set == "valid"].index
test_indices = train[train.Set == "test"].index

<IPython.core.display.Javascript object>

In [24]:
train

Unnamed: 0,ID,VAR_0001,VAR_0002,VAR_0003,VAR_0004,VAR_0005,VAR_0006,VAR_0007,VAR_0008,VAR_0009,...,VAR_1927,VAR_1928,VAR_1929,VAR_1930,VAR_1931,VAR_1932,VAR_1933,VAR_1934,target,Set
0,106581,H,108,19,3200,C,0.0,0.0,False,False,...,98,998,999999998,998,998,9998,9998,IAPS,0,train
1,97406,R,43,0,3406,C,0.0,0.0,False,False,...,1,0,999999996,400,400,9996,130,BRANCH,1,train
2,242499,R,52,90,1720,C,1.0,1.0,False,False,...,98,998,999999998,998,998,9998,9998,IAPS,0,train
3,62287,H,221,65,3500,C,1.0,1.0,False,False,...,98,998,999999998,998,998,9998,9998,BRANCH,0,train
4,17042,R,0,20,1410,B,5.0,4.0,False,False,...,98,998,999999998,998,998,9998,9998,BRANCH,1,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145226,193130,H,490,325,5600,N,1.0,3.0,False,False,...,98,998,999999998,998,998,9998,9998,IAPS,0,test
145227,71632,R,225,249,2695,B,0.0,0.0,False,False,...,98,998,999999998,998,998,9998,9998,BRANCH,1,test
145228,144695,R,5,67,1937,B,0.0,0.0,False,False,...,98,998,999999998,998,998,9998,9998,BRANCH,0,test
145229,95760,H,27,46,38000,B,0.0,0.0,False,False,...,98,998,999999998,998,998,9998,9998,IAPS,0,test


<IPython.core.display.Javascript object>

In [25]:
n_unique = train.nunique()
to_ignore = []
to_ignore.append(target)
to_ignore.extend(ids)

<IPython.core.display.Javascript object>

In [26]:
constant_cols = train.columns[n_unique <= 1]
constant_cols = list(set(constant_cols.tolist()) - set(to_ignore))
constant_cols

['VAR_0031',
 'VAR_0221',
 'VAR_0222',
 'VAR_0026',
 'VAR_0011',
 'VAR_0021',
 'VAR_0246',
 'VAR_0022',
 'VAR_0027',
 'VAR_0041',
 'VAR_0189',
 'VAR_0038',
 'VAR_0223',
 'VAR_0446',
 'VAR_0196',
 'VAR_0032',
 'VAR_0207',
 'VAR_0030',
 'VAR_0527',
 'VAR_0847',
 'VAR_0024',
 'VAR_0197',
 'VAR_0528',
 'VAR_0019',
 'VAR_0039',
 'VAR_1428',
 'VAR_0199',
 'VAR_0040',
 'VAR_0190',
 'VAR_0202',
 'VAR_0438',
 'VAR_0028',
 'VAR_0239',
 'VAR_0044',
 'VAR_0025',
 'VAR_0188',
 'VAR_0530',
 'VAR_0029',
 'VAR_0009',
 'VAR_0043',
 'VAR_0042',
 'VAR_0216',
 'VAR_0394',
 'VAR_0229',
 'VAR_0012',
 'VAR_0840',
 'VAR_0023',
 'VAR_0213',
 'VAR_0008',
 'VAR_0203',
 'VAR_0018',
 'VAR_0010',
 'VAR_0020',
 'VAR_0215']

<IPython.core.display.Javascript object>

In [27]:
ids

[]

<IPython.core.display.Javascript object>

In [28]:
bool_cols = train.columns[n_unique == 2]
bool_cols = list(set(bool_cols.tolist()) - set(to_ignore))
bool_cols

['VAR_0566',
 'VAR_0181',
 'VAR_0740',
 'VAR_0563',
 'VAR_0138',
 'VAR_0741',
 'VAR_0344',
 'VAR_0180',
 'VAR_1162',
 'VAR_0130',
 'VAR_0383',
 'VAR_0503',
 'VAR_0362',
 'VAR_0529',
 'VAR_0567',
 'VAR_1163',
 'VAR_0736',
 'VAR_0114',
 'VAR_0244',
 'VAR_0098',
 'VAR_0182',
 'VAR_0505',
 'VAR_0490',
 'VAR_0230',
 'VAR_0459',
 'VAR_0502',
 'VAR_0466',
 'VAR_0739',
 'VAR_0106',
 'VAR_0733',
 'VAR_0236',
 'VAR_1427',
 'VAR_0191',
 'VAR_0924',
 'VAR_0504',
 'VAR_0737',
 'VAR_1165',
 'VAR_0395',
 'VAR_0526',
 'VAR_0226',
 'VAR_0232',
 'VAR_0247',
 'VAR_0392',
 'VAR_0384',
 'VAR_0411',
 'VAR_0732',
 'VAR_1164',
 'VAR_0468']

<IPython.core.display.Javascript object>

In [29]:
num_cols = list(
    set(
        train.columns[
            (n_unique > 2)
            # & ((n_unique / train.shape[0]) > 0.05)
            & (train.dtypes != "object")
        ].tolist()
    )
    - set(to_ignore)
)
num_cols

['VAR_0777',
 'VAR_1542',
 'VAR_0473',
 'VAR_0273',
 'VAR_0990',
 'VAR_1096',
 'VAR_0694',
 'VAR_1871',
 'VAR_0994',
 'VAR_0691',
 'VAR_0858',
 'VAR_0129',
 'VAR_1590',
 'VAR_0537',
 'VAR_1217',
 'VAR_1425',
 'VAR_1271',
 'VAR_1541',
 'VAR_1626',
 'VAR_1435',
 'VAR_1352',
 'VAR_1252',
 'VAR_1565',
 'VAR_1849',
 'VAR_0894',
 'VAR_0829',
 'VAR_1445',
 'VAR_0624',
 'VAR_1497',
 'VAR_1393',
 'VAR_1727',
 'VAR_0276',
 'VAR_0393',
 'VAR_1456',
 'VAR_1776',
 'VAR_0150',
 'VAR_0079',
 'VAR_0690',
 'VAR_0805',
 'VAR_0507',
 'VAR_1785',
 'VAR_1740',
 'VAR_1800',
 'VAR_0594',
 'VAR_0254',
 'VAR_1654',
 'VAR_1099',
 'VAR_1430',
 'VAR_1317',
 'VAR_0341',
 'VAR_0700',
 'VAR_1743',
 'VAR_1583',
 'VAR_1047',
 'VAR_0861',
 'VAR_1642',
 'VAR_1523',
 'VAR_1638',
 'VAR_1750',
 'VAR_0425',
 'VAR_0888',
 'VAR_0317',
 'VAR_0823',
 'VAR_0662',
 'VAR_1911',
 'VAR_0487',
 'VAR_1864',
 'VAR_1621',
 'VAR_1318',
 'VAR_0927',
 'VAR_0164',
 'VAR_1931',
 'VAR_0878',
 'VAR_0675',
 'VAR_1912',
 'VAR_1304',
 'VAR_0896',

<IPython.core.display.Javascript object>

In [30]:
cat_cols = list(
    set(train.columns.tolist())
    - set(num_cols)
    - set(bool_cols)
    - set(constant_cols)
    - set(to_ignore)
    - set(["Set"])
)
cat_cols

['VAR_0179',
 'VAR_0342',
 'VAR_0217',
 'VAR_0493',
 'VAR_0169',
 'VAR_0178',
 'VAR_0274',
 'VAR_1934',
 'VAR_0214',
 'VAR_0305',
 'VAR_0354',
 'VAR_0168',
 'VAR_0467',
 'VAR_0156',
 'VAR_0005',
 'VAR_0200',
 'VAR_0353',
 'VAR_0325',
 'VAR_0204',
 'VAR_0167',
 'VAR_0159',
 'VAR_0157',
 'VAR_0075',
 'VAR_0176',
 'VAR_0073',
 'VAR_0166',
 'VAR_0177',
 'VAR_0404',
 'VAR_0352',
 'VAR_0158',
 'VAR_0001',
 'VAR_0283',
 'VAR_0237']

<IPython.core.display.Javascript object>

In [31]:
config = {}

<IPython.core.display.Javascript object>

In [32]:
from thc_net.safe_label_encoder import SafeLabelEncoder

<IPython.core.display.Javascript object>

In [33]:
for col in bool_cols:
    config[col] = SafeLabelEncoder()
    config[col].fit(train[col].values[train_indices].astype("str").reshape(-1))

<IPython.core.display.Javascript object>

In [34]:
for col in cat_cols:
    config[col] = SafeLabelEncoder()
    config[col].fit(train[col].values[train_indices].astype("str").reshape(-1))

<IPython.core.display.Javascript object>

In [35]:
from sklearn.preprocessing import QuantileTransformer, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer, MissingIndicator

<IPython.core.display.Javascript object>

In [36]:
for col in tqdm(num_cols):
    config[col] = FeatureUnion(
                [
                    (
                        "fillna",
                        Pipeline(
                            [
                                (
                                    "fillna",
                                    SimpleImputer(
                                        strategy="median"
                                    ),
                                ),
                                ("scaler", StandardScaler()),
                            ]
                        ),
                    ),
                    ("indicator", MissingIndicator(features="all")),
                    (
                        "quantile",
                        Pipeline(
                            [
                                ("fillna", SimpleImputer(strategy="median"),),
                                ("quantile", QuantileTransformer()),
                            ]
                        ),
                    ),
                    (
                        "minmax",
                        Pipeline(
                            [
                                ("fillna", SimpleImputer(strategy="median"),),
                                ("minmax", MinMaxScaler()),
                            ]
                        ),
                    ),
                ]
            )
    config[col].fit(train[col].values[train_indices].reshape(-1, 1))    


HBox(children=(FloatProgress(value=0.0, max=1798.0), HTML(value='')))




<IPython.core.display.Javascript object>

In [37]:
config[col]

FeatureUnion(transformer_list=[('fillna',
                                Pipeline(steps=[('fillna',
                                                 SimpleImputer(strategy='median')),
                                                ('scaler', StandardScaler())])),
                               ('indicator', MissingIndicator(features='all')),
                               ('quantile',
                                Pipeline(steps=[('fillna',
                                                 SimpleImputer(strategy='median')),
                                                ('quantile',
                                                 QuantileTransformer())])),
                               ('minmax',
                                Pipeline(steps=[('fillna',
                                                 SimpleImputer(strategy='median')),
                                                ('minmax', MinMaxScaler())]))])

<IPython.core.display.Javascript object>

In [38]:
from sklearn.preprocessing import LabelEncoder

config[target] = LabelEncoder()
config[target].fit(train[target].values[train_indices].reshape(-1))

LabelEncoder()

<IPython.core.display.Javascript object>

In [39]:
Y = config[target].transform(train[target].values.reshape(-1))
Y.shape

(145231,)

<IPython.core.display.Javascript object>

In [40]:
X_list = []

<IPython.core.display.Javascript object>

In [41]:
for col in tqdm(bool_cols):
    X_list.append(config[col].transform(train[col].values.reshape(-1)).reshape(-1, 1))

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




<IPython.core.display.Javascript object>

In [42]:
for col in tqdm(cat_cols):
    X_list.append(config[col].transform(train[col].values.reshape(-1)).reshape(-1, 1))

HBox(children=(FloatProgress(value=0.0, max=33.0), HTML(value='')))




<IPython.core.display.Javascript object>

In [43]:
for col in tqdm(num_cols):
    X_list.append(config[col].transform(train[col].values.reshape(-1, 1)))

HBox(children=(FloatProgress(value=0.0, max=1798.0), HTML(value='')))




<IPython.core.display.Javascript object>

In [44]:
!pip install lightgbm

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


<IPython.core.display.Javascript object>

In [45]:
from lightgbm import LGBMClassifier

<IPython.core.display.Javascript object>

In [46]:
from sklearn.metrics import roc_auc_score, log_loss

<IPython.core.display.Javascript object>

In [47]:
full_X = np.hstack(X_list)
full_X.shape

(145231, 7273)

<IPython.core.display.Javascript object>

In [48]:
clf = LGBMClassifier(n_estimators=1000, importance_type="gain", first_metric_only=True)
clf.fit(
    full_X[train_indices][:10000],
    Y[train_indices][:10000],
    eval_set=[
        (full_X[valid_indices], Y[valid_indices]),
        # (X_list[0][test_indices], Y[test_indices]),
    ],
    eval_names=[
        "valid",
        # "test"
    ],
    early_stopping_rounds=20,
    eval_metric="auc",
    # categorical_feature=[],
)
model_auc_valid = roc_auc_score(
    y_true=Y[valid_indices],
    y_score=clf.predict_proba(full_X[valid_indices])[:, 1].reshape(-1),
)
model_auc_test = roc_auc_score(
    y_true=Y[test_indices],
    y_score=clf.predict_proba(full_X[test_indices])[:, 1].reshape(-1),
)

[1]	valid's auc: 0.691565	valid's binary_logloss: 0.530336
Training until validation scores don't improve for 20 rounds
[2]	valid's auc: 0.71907	valid's binary_logloss: 0.521061
[3]	valid's auc: 0.725526	valid's binary_logloss: 0.51418
[4]	valid's auc: 0.731793	valid's binary_logloss: 0.50767
[5]	valid's auc: 0.735445	valid's binary_logloss: 0.502374
[6]	valid's auc: 0.738278	valid's binary_logloss: 0.497785
[7]	valid's auc: 0.740939	valid's binary_logloss: 0.493509
[8]	valid's auc: 0.74362	valid's binary_logloss: 0.489911
[9]	valid's auc: 0.746377	valid's binary_logloss: 0.486828
[10]	valid's auc: 0.748596	valid's binary_logloss: 0.483699
[11]	valid's auc: 0.749122	valid's binary_logloss: 0.481513
[12]	valid's auc: 0.749152	valid's binary_logloss: 0.479774
[13]	valid's auc: 0.749965	valid's binary_logloss: 0.478116
[14]	valid's auc: 0.750082	valid's binary_logloss: 0.476839
[15]	valid's auc: 0.750808	valid's binary_logloss: 0.475475
[16]	valid's auc: 0.751809	valid's binary_logloss: 0

<IPython.core.display.Javascript object>

In [49]:
total = clf.feature_importances_.sum()

<IPython.core.display.Javascript object>

In [50]:
relative = clf.feature_importances_ / total

<IPython.core.display.Javascript object>

In [51]:
sorted_idx = np.argsort(-clf.feature_importances_)
selected = []
importance = 0
i = 0
while importance < 0.9999:
    importance += relative[sorted_idx[i]]
    i += 1
    selected.append(sorted_idx[i])

<IPython.core.display.Javascript object>

In [52]:
len(clf.feature_importances_)

7273

<IPython.core.display.Javascript object>

In [53]:
len(selected)

827

<IPython.core.display.Javascript object>

In [54]:
len(bool_cols) + len(cat_cols) + len(num_cols)

1879

<IPython.core.display.Javascript object>

In [55]:
X_list

[array([[1],
        [0],
        [0],
        ...,
        [0],
        [1],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[1],
        [0],
        [0],
        ...,
        [0],
        [1],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[0],
        [1],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[1],
        [0],
        [0],
        ...,
        [0],
        [1],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[1],
        [0],
        [0],
        ...,
        [0],
     

<IPython.core.display.Javascript object>

In [56]:
selected_X = full_X[:, selected]
selected_X.shape

(145231, 827)

<IPython.core.display.Javascript object>

In [57]:
clf = LGBMClassifier(n_estimators=1000, importance_type="gain", first_metric_only=True)
clf.fit(
    selected_X[train_indices],
    Y[train_indices],
    eval_set=[
        (selected_X[valid_indices], Y[valid_indices]),
        # (X_list[0][test_indices], Y[test_indices]),
    ],
    eval_names=[
        "valid",
        # "test"
    ],
    early_stopping_rounds=20,
    eval_metric="auc",
    # categorical_feature=[],
)
model_auc_valid = roc_auc_score(
    y_true=Y[valid_indices],
    y_score=clf.predict_proba(selected_X[valid_indices])[:, 1].reshape(-1),
)
model_auc_test = roc_auc_score(
    y_true=Y[test_indices],
    y_score=clf.predict_proba(selected_X[test_indices])[:, 1].reshape(-1),
)

[1]	valid's auc: 0.721335	valid's binary_logloss: 0.529529
Training until validation scores don't improve for 20 rounds
[2]	valid's auc: 0.737881	valid's binary_logloss: 0.520434
[3]	valid's auc: 0.742016	valid's binary_logloss: 0.512814
[4]	valid's auc: 0.744003	valid's binary_logloss: 0.506538
[5]	valid's auc: 0.744915	valid's binary_logloss: 0.501238
[6]	valid's auc: 0.745928	valid's binary_logloss: 0.496763
[7]	valid's auc: 0.747554	valid's binary_logloss: 0.492863
[8]	valid's auc: 0.749697	valid's binary_logloss: 0.489368
[9]	valid's auc: 0.751785	valid's binary_logloss: 0.486063
[10]	valid's auc: 0.753238	valid's binary_logloss: 0.483307
[11]	valid's auc: 0.754642	valid's binary_logloss: 0.480719
[12]	valid's auc: 0.755394	valid's binary_logloss: 0.478613
[13]	valid's auc: 0.756685	valid's binary_logloss: 0.476598
[14]	valid's auc: 0.757341	valid's binary_logloss: 0.474812
[15]	valid's auc: 0.75838	valid's binary_logloss: 0.473112
[16]	valid's auc: 0.759399	valid's binary_logloss

<IPython.core.display.Javascript object>

In [58]:
model_auc_valid

0.7810328202893912

<IPython.core.display.Javascript object>

In [59]:
# 0.7483815427455826

<IPython.core.display.Javascript object>

In [60]:
model_auc_test

0.7811517885524315

<IPython.core.display.Javascript object>

In [61]:
# 0.7562194674153414

<IPython.core.display.Javascript object>

In [62]:
clf = LGBMClassifier(n_estimators=1000, importance_type="gain", first_metric_only=True)
clf.fit(
    full_X[train_indices],
    Y[train_indices],
    eval_set=[
        (full_X[valid_indices], Y[valid_indices]),
        # (X_list[0][test_indices], Y[test_indices]),
    ],
    eval_names=[
        "valid",
        # "test"
    ],
    early_stopping_rounds=20,
    eval_metric="auc",
    # categorical_feature=[],
)
model_auc_valid = roc_auc_score(
    y_true=Y[valid_indices],
    y_score=clf.predict_proba(full_X[valid_indices])[:, 1].reshape(-1),
)
model_auc_test = roc_auc_score(
    y_true=Y[test_indices],
    y_score=clf.predict_proba(full_X[test_indices])[:, 1].reshape(-1),
)

[1]	valid's auc: 0.724288	valid's binary_logloss: 0.529247
Training until validation scores don't improve for 20 rounds
[2]	valid's auc: 0.732919	valid's binary_logloss: 0.520319
[3]	valid's auc: 0.738345	valid's binary_logloss: 0.512805
[4]	valid's auc: 0.741954	valid's binary_logloss: 0.506356
[5]	valid's auc: 0.743747	valid's binary_logloss: 0.500988
[6]	valid's auc: 0.748095	valid's binary_logloss: 0.495918
[7]	valid's auc: 0.749324	valid's binary_logloss: 0.492145
[8]	valid's auc: 0.751081	valid's binary_logloss: 0.48872
[9]	valid's auc: 0.752665	valid's binary_logloss: 0.485445
[10]	valid's auc: 0.754197	valid's binary_logloss: 0.48263
[11]	valid's auc: 0.755545	valid's binary_logloss: 0.480014
[12]	valid's auc: 0.756201	valid's binary_logloss: 0.477852
[13]	valid's auc: 0.756887	valid's binary_logloss: 0.476022
[14]	valid's auc: 0.758753	valid's binary_logloss: 0.473913
[15]	valid's auc: 0.759961	valid's binary_logloss: 0.47212
[16]	valid's auc: 0.761002	valid's binary_logloss: 

<IPython.core.display.Javascript object>

In [63]:
model_auc_valid

0.7815419133059875

<IPython.core.display.Javascript object>

In [64]:
model_auc_test

0.7819902283613407

<IPython.core.display.Javascript object>