In [194]:

#%% get libraries
import os
import numpy as np
import pandas as pd
# Widen the pandas display limits so long/wide frames are shown in full:
# up to 300 rows and 100 columns before truncation.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 100)

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

import holoviews as hv
hv.extension('bokeh')
from holoviews.operation.datashader import datashade, shade, dynspread, rasterize
from bokeh.models import HoverTool


#%% get data and peek
# Read the raw application tables, indexed by SK_ID_CURR.
train = pd.read_csv('./input/raw/application_train.csv', index_col='SK_ID_CURR')
test = pd.read_csv('./input/raw/application_test.csv', index_col='SK_ID_CURR')

# Tag the test rows with the sentinel TARGET == 2 so they can be told apart
# from the training rows once both tables are stacked.
test = test.assign(TARGET=2)

# Stack the two tables without re-sorting the columns, then order rows by index.
traintest = (
    pd.concat([train, test], sort=False)
    .sort_index()
)

# Peek at the first five rows, transposed so each column reads as a line.
traintest.head(5).T

SK_ID_CURR,100001,100002,100003,100004,100005
TARGET,2,1,0,0,2
NAME_CONTRACT_TYPE,Cash loans,Cash loans,Cash loans,Revolving loans,Cash loans
CODE_GENDER,F,M,F,M,M
FLAG_OWN_CAR,N,N,N,Y,N
FLAG_OWN_REALTY,Y,Y,N,Y,Y
CNT_CHILDREN,0,0,0,0,0
AMT_INCOME_TOTAL,135000,202500,270000,67500,99000
AMT_CREDIT,568800,406598,1.2935e+06,135000,222768
AMT_ANNUITY,20560.5,24700.5,35698.5,6750,17370
AMT_GOODS_PRICE,450000,351000,1.1295e+06,135000,180000


In [2]:
#%% prep for model
# Swap every object-dtype column for its integer category codes.
# Note: pandas encodes missing values as code -1.
text_cols = [col for col in traintest.columns if traintest[col].dtype == 'object']
for col in text_cols:
    traintest[col] = traintest[col].astype('category').cat.codes

# Undo the stacking: rows carrying the sentinel TARGET == 2 are the test set,
# everything else is training data.
is_test_row = traintest.TARGET == 2
train = traintest[~is_test_row]
test = traintest[is_test_row]

#%% split data and run model
# Label is TARGET; features are every other column.
y = train['TARGET']
X = train.drop(columns='TARGET')

# Hold out 20% of the rows for validation, stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Gradient-boosted tree classifier; hyper-parameters gathered in one place.
lgb_params = dict(
    boosting_type='gbdt',
    objective='binary',
    num_leaves=31,
    max_depth=4,
    learning_rate=0.05,
    n_estimators=1000,
    min_child_samples=10,
    subsample=1.0,
    subsample_freq=0,
    subsample_for_bin=200000,
    colsample_bytree=0.9,
    class_weight=None,
    silent=False,
)
lmod = lgb.LGBMClassifier(**lgb_params)


In [3]:
# Fit on the training split while tracking AUC on the validation split;
# training stops once the validation score has not improved for 50 rounds.
validation_sets = [(X_val, y_val)]
lmod.fit(
    X_train,
    y_train,
    eval_set=validation_sets,
    eval_metric='auc',
    early_stopping_rounds=50,
    verbose=20,
)


[1]	valid_0's auc: 0.70311
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.705653
[3]	valid_0's auc: 0.708474
[4]	valid_0's auc: 0.713462
[5]	valid_0's auc: 0.718221
[6]	valid_0's auc: 0.718785
[7]	valid_0's auc: 0.719713
[8]	valid_0's auc: 0.72002
[9]	valid_0's auc: 0.720202
[10]	valid_0's auc: 0.720587
[11]	valid_0's auc: 0.721812
[12]	valid_0's auc: 0.722981
[13]	valid_0's auc: 0.723393
[14]	valid_0's auc: 0.724333
[15]	valid_0's auc: 0.724687
[16]	valid_0's auc: 0.725354
[17]	valid_0's auc: 0.726014
[18]	valid_0's auc: 0.726466
[19]	valid_0's auc: 0.726854
[20]	valid_0's auc: 0.727264
[21]	valid_0's auc: 0.727577
[22]	valid_0's auc: 0.728038
[23]	valid_0's auc: 0.728463
[24]	valid_0's auc: 0.72946
[25]	valid_0's auc: 0.72962
[26]	valid_0's auc: 0.729852
[27]	valid_0's auc: 0.730407
[28]	valid_0's auc: 0.73091
[29]	valid_0's auc: 0.731501
[30]	valid_0's auc: 0.731929
[31]	valid_0's auc: 0.732201
[32]	valid_0's auc: 0.732645
[33]	valid_0's auc: 0.73

[283]	valid_0's auc: 0.759912
[284]	valid_0's auc: 0.759911
[285]	valid_0's auc: 0.759928
[286]	valid_0's auc: 0.75993
[287]	valid_0's auc: 0.759909
[288]	valid_0's auc: 0.759917
[289]	valid_0's auc: 0.759928
[290]	valid_0's auc: 0.759973
[291]	valid_0's auc: 0.759964
[292]	valid_0's auc: 0.759989
[293]	valid_0's auc: 0.760016
[294]	valid_0's auc: 0.760013
[295]	valid_0's auc: 0.760035
[296]	valid_0's auc: 0.760017
[297]	valid_0's auc: 0.760033
[298]	valid_0's auc: 0.76007
[299]	valid_0's auc: 0.760095
[300]	valid_0's auc: 0.760116
[301]	valid_0's auc: 0.760195
[302]	valid_0's auc: 0.760172
[303]	valid_0's auc: 0.76017
[304]	valid_0's auc: 0.760202
[305]	valid_0's auc: 0.760218
[306]	valid_0's auc: 0.760224
[307]	valid_0's auc: 0.760236
[308]	valid_0's auc: 0.760267
[309]	valid_0's auc: 0.760253
[310]	valid_0's auc: 0.760264
[311]	valid_0's auc: 0.760258
[312]	valid_0's auc: 0.760272
[313]	valid_0's auc: 0.76026
[314]	valid_0's auc: 0.760282
[315]	valid_0's auc: 0.760367
[316]	valid_0'

[557]	valid_0's auc: 0.762312
[558]	valid_0's auc: 0.76231
[559]	valid_0's auc: 0.762331
[560]	valid_0's auc: 0.762288
[561]	valid_0's auc: 0.762263
[562]	valid_0's auc: 0.762291
[563]	valid_0's auc: 0.762256
[564]	valid_0's auc: 0.762238
[565]	valid_0's auc: 0.762223
[566]	valid_0's auc: 0.762231
[567]	valid_0's auc: 0.762243
[568]	valid_0's auc: 0.762281
[569]	valid_0's auc: 0.762271
[570]	valid_0's auc: 0.762264
[571]	valid_0's auc: 0.762256
[572]	valid_0's auc: 0.762259
[573]	valid_0's auc: 0.762265
[574]	valid_0's auc: 0.762254
[575]	valid_0's auc: 0.762289
[576]	valid_0's auc: 0.762283
[577]	valid_0's auc: 0.762313
[578]	valid_0's auc: 0.762309
[579]	valid_0's auc: 0.762326
[580]	valid_0's auc: 0.762337
[581]	valid_0's auc: 0.762339
[582]	valid_0's auc: 0.762344
[583]	valid_0's auc: 0.762339
[584]	valid_0's auc: 0.762316
[585]	valid_0's auc: 0.762321
[586]	valid_0's auc: 0.762339
[587]	valid_0's auc: 0.762318
[588]	valid_0's auc: 0.762321
[589]	valid_0's auc: 0.762322
[590]	valid

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
        learning_rate=0.05, max_depth=4, min_child_samples=10,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=1000,
        n_jobs=-1, num_leaves=31, objective='binary', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=False, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [134]:

#%% get info
# Best validation score recorded while fitting.
print(lmod.best_score_)

# Feature importances, most important first. The sorted frame is kept
# (previously the sort result was computed and thrown away).
featmat = (
    pd.DataFrame({'feat': X.columns, 'imp': lmod.feature_importances_})
    .sort_values('imp', ascending=False)
)

# Second column of predict_proba, i.e. the score for the second class.
pred = lmod.predict_proba(X_val)[:, 1]

# Report the validation AUC instead of silently discarding it mid-cell.
print('validation AUC:', roc_auc_score(y_val, pred))

# One row per validation sample: true label and predicted score.
# Fix: this used the undefined name `preds`, which raised NameError on a
# fresh kernel; the predictions live in `pred`.
evalmat = pd.DataFrame({'label': y_val, 'pred': pred})

# Rank samples by predicted score and order the frame by that rank.
evalmat['rank'] = evalmat.pred.rank()
evalmat = evalmat.sort_values('rank')

# Vertical offset for the points plot: 1 plus standard-normal noise.
# A dedicated seeded generator keeps the plot reproducible across runs
# without touching the global NumPy random state.
yax = np.ones_like(evalmat.label.values)
jitter = np.random.RandomState(42).randn(len(yax))
evalmat['yax'] = yax + jitter # for points plot
evalmat.head(8)

defaultdict(<class 'dict'>, {'valid_0': {'auc': 0.7623545796200415}})


Unnamed: 0_level_0,label,pred,rank,yax
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
118093,0,0.000767,1.0,0.880036
245690,0,0.001196,2.0,2.191404
172286,0,0.001361,3.0,2.200578
452985,0,0.001433,4.0,1.555798
375383,0,0.001443,5.0,1.018678
226412,0,0.001584,6.0,0.362179
339432,0,0.001721,7.0,0.848296
114122,0,0.001726,8.0,2.49714


In [179]:
%%opts Histogram [width=600 height=400]

evalmat0 = evalmat[evalmat.label == 0]
evalmat1 = evalmat[evalmat.label == 1]

hist0 = np.histogram(evalmat0.pred.values, density=False, bins=64)
histplot0 = hv.Histogram(hist0)
hist1 = np.histogram(evalmat1.pred.values, density=False, bins=64)
histplot1 = hv.Histogram(hist1)

# dynspread(datashade(points0)) * dynspread(datashade(points1))
histplot0 * histplot1

In [212]:
%%opts Points [width=600 height=400 jitter=0.7 yaxis=None] 
%%opts Points(fill_alpha=0.3, line_alpha=0.6)

#plot preds with jitter v rank, color by label
points0 = hv.Points(evalmat0, kdims=['pred', 'yax']).opts(plot=dict(fill_alpha=0.3))
points1 = hv.Points(evalmat1, kdims=['pred', 'yax'])                   
points0.relabel('TGT=0') * points1.relabel('TGT=1')