In [3]:

#%% get libraries
import os
import numpy as np
import pandas as pd
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

import holoviews as hv
hv.extension('bokeh')
from holoviews.operation.datashader import datashade, shade, dynspread, rasterize
from bokeh.models import HoverTool


#%% get data and peek
# Both application tables are indexed by the SK_ID_CURR column.
train = pd.read_csv('./input/raw/application_train.csv', index_col='SK_ID_CURR')
# Test rows receive the placeholder TARGET value 2 so they stay identifiable once stacked.
test = pd.read_csv(
    './input/raw/application_test.csv', index_col='SK_ID_CURR'
).assign(TARGET=2)

# Stack the two tables (column order preserved) and order the rows by index.
traintest = pd.concat([train, test], sort=False)
traintest = traintest.sort_index()

# Transposed preview: one column per row of the combined frame.
traintest.head().T

SK_ID_CURR,100001,100002,100003,100004,100005
TARGET,2,1,0,0,2
NAME_CONTRACT_TYPE,Cash loans,Cash loans,Cash loans,Revolving loans,Cash loans
CODE_GENDER,F,M,F,M,M
FLAG_OWN_CAR,N,N,N,Y,N
FLAG_OWN_REALTY,Y,Y,N,Y,Y
CNT_CHILDREN,0,0,0,0,0
AMT_INCOME_TOTAL,135000,202500,270000,67500,99000
AMT_CREDIT,568800,406598,1.2935e+06,135000,222768
AMT_ANNUITY,20560.5,24700.5,35698.5,6750,17370
AMT_GOODS_PRICE,450000,351000,1.1295e+06,135000,180000


In [None]:
#%% prep for model
# Replace every object-dtype column with the integer codes of its categorical form.
object_cols = [col for col in traintest.columns if traintest[col].dtype == 'object']
for col in object_cols:
    traintest[col] = traintest[col].astype('category').cat.codes

# Separate the stacked frame again using the placeholder TARGET value 2.
is_placeholder = traintest.TARGET == 2
train = traintest[~is_placeholder]
test = traintest[is_placeholder]

#%% split data and run model
# Target vector and feature matrix (everything except TARGET).
y = train['TARGET']
X = train.drop(columns='TARGET')

# Hold out 20% of the rows for validation, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# NOTE(review): max_depth=4 may cap trees below num_leaves=31 - confirm this is intended.
lmod = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    num_leaves=31,
    max_depth=4,
    learning_rate=0.05,
    n_estimators=1000,
    subsample_for_bin=200000,
    class_weight=None,
    min_child_samples=10,
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=0.9,
    silent=False,
)


In [None]:
# Train on the training split, scoring AUC on the validation split;
# early stopping is set to 50 rounds and the verbosity argument to 20.
lmod.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    early_stopping_rounds=50,
    verbose=20,
)


In [None]:
#%% get info
# Best evaluation score recorded by the fitted model.
print(lmod.best_score_)

# One row per feature column, listed from highest to lowest importance.
featmat = pd.DataFrame(dict(feat=X.columns, imp=lmod.feature_importances_))
featmat.sort_values(by='imp', ascending=False)


In [6]:
# Validation predictions are cached in a CSV. When the file is missing they are
# recomputed from the fitted model and saved, so the cell does not depend on a
# file that no other cell in this notebook produces.
VALPREDS_CSV = './valpreds.csv'

if os.path.exists(VALPREDS_CSV):
    # The first CSV column is the saved row index; restore it as the index
    # instead of loading it as an 'Unnamed: 0' data column.
    evalmat = pd.read_csv(VALPREDS_CSV, index_col=0)
else:
    preds = lmod.predict_proba(X_val)[:, 1]
    evalmat = pd.DataFrame({'label': y_val, 'pred': preds})
    evalmat.to_csv(VALPREDS_CSV)

print(roc_auc_score(evalmat.label, evalmat.pred))

# Rank rows by predicted score and order the frame by that rank.
evalmat['rank'] = evalmat.pred.rank()
evalmat = evalmat.sort_values('rank')
evalmat.head(8)

0.790644798360814


Unnamed: 0,label,pred,rank
302186,0.0,0.001545,1.0
285049,0.0,0.001579,2.0
206567,0.0,0.00169,3.0
188082,0.0,0.001706,4.0
120594,0.0,0.001713,5.0
49225,0.0,0.00178,6.0
170853,0.0,0.00179,7.0
104366,0.0,0.001809,8.0


In [7]:
%%opts Histogram [width=600 height=400]

evalmat0 = evalmat[evalmat.label == 0].sample(frac=0.2)
yax = np.ones_like(evalmat0.label.values)
evalmat0['yax'] = yax + np.random.randn(len(yax)) # for points plot
evalmat1 = evalmat[evalmat.label == 1].sample(frac=0.2)
yax = np.ones_like(evalmat1.label.values)
evalmat1['yax'] = yax + np.random.randn(len(yax))/4 # for points plot

hist0 = np.histogram(evalmat0.pred.values, density=False, bins=64)
histplot0 = hv.Histogram(hist0, label="TGT=0").redim.label(x="Predicted Probability")
hist1 = np.histogram(evalmat1.pred.values, density=False, bins=64)
histplot1 = hv.Histogram(hist1, label="TGT=1").redim.label(x="Predicted Probability")

histplot0 * histplot1

In [8]:
%%opts Distribution [width=600 height=400]

aliases = hv.util.Aliases(x='Some long label')

dist0 = hv.Distribution(evalmat0['pred'].values, label="TGT=0")
dist1 = hv.Distribution(evalmat1.pred, label="TGT=1")
dist0 * dist1

  zip(columns, data)])
  zip(columns, data)])


In [11]:
%%opts Overlay [width=600 height=400 yaxis=None] 
%%opts Points [width=600 height=400 yaxis=None] 
%%opts Points(fill_alpha=0.3, line_alpha=0.6)

#plot preds with jitter v rank, color by label
points0 = hv.Points(evalmat0, kdims=['pred', 'yax']).opts(plot=dict(fill_alpha=0.2, jitter=0.7))
points1 = hv.Points(evalmat1, kdims=['pred', 'yax']).opts(plot=dict(fill_alpha=0.1, jitter=0.7))
(points0.relabel('TGT=0')) * (points1.relabel('TGT=1'))

In [None]:
# Write the evaluation frame to a CSV in the working directory.
evalmat.to_csv(path_or_buf='./evalmat.csv')


In [None]:
import seaborn as sns

# Only a small subsample of the rows is drawn; the seed is fixed so the figure
# is the same on every run.
sns.swarmplot(
    x="label",
    y="pred",
    hue="label",
    data=evalmat.sample(frac=0.003, random_state=42),
)

In [15]:
# Preview a bounded number of rows instead of dumping the whole sampled frame.
evalmat0.head(10)

Unnamed: 0,label,pred,rank,yax
169578,0.0,0.006502,4105.0,-0.554449
153042,0.0,0.039208,137132.0,-0.157954
302676,0.0,0.056713,179219.0,2.053982
230483,0.0,0.050406,166020.0,1.892763
239258,0.0,0.034026,120719.0,1.639219
292725,0.0,0.021670,70241.0,-0.230986
248943,0.0,0.023742,79687.0,0.123378
227206,0.0,0.074563,207426.0,2.181339
33956,0.0,0.010128,15629.0,0.067046
260379,0.0,0.030778,109069.0,0.950598
