In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  from pandas._libs.tslibs.strptime import array_strptime
  from pandas._libs.tslibs.frequencies import (  # noqa
  from pandas._libs.period import Period
  import pandas._libs.sparse as splib
  import pandas._libs.window as _window
  from pandas._libs import lib, groupby as libgroupby, Timestamp, NaT, iNaT
  from pandas._libs import algos as _algos, reshape as _reshape
  import pandas._libs.parsers as parsers
  from . import _csparsetools
  from ._shortest_path import shortest_path, floyd_warshall, dijkstra,\
  from ._tools import csgraph_to_dense, csgraph_from_dense,\
  from ._traversal import breadth_first_order, depth_first_order, \
  from ._min_spanning_tree import minimum_spanning_tree
  from ._reordering impo

In [2]:

# Read the training and the submission application tables from the local data/ folder.
data = pd.read_csv('data/application_train.csv')
data_submission = pd.read_csv('data/application_test.csv')
# Keep the submission ids aside: SK_ID_CURR is dropped from the frames before modelling
# and is needed again when the prediction file is written.
submission_id = data_submission['SK_ID_CURR']

def process_data(data):
    """Turn a raw application table into an all-numeric feature frame.

    Steps, in order:
      1. add one ``<col>/(AMT_CREDIT)`` ratio column for every numeric column
         except TARGET and the EXT_SOURCE_* scores;
      2. treat the placeholder strings 'XNA' (anywhere) and 'Unknown'
         (NAME_FAMILY_STATUS) as missing values;
      3. encode the four two-valued columns as 0/1;
      4. one-hot encode the remaining categorical columns;
      5. fill the remaining missing values with the column median.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw application rows. Must contain AMT_CREDIT, the four binary columns
        and the twelve categorical columns listed below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched.

    Raises
    ------
    AssertionError
        If one of the binary columns does not have exactly two distinct
        non-missing values.
    """
    categorical_columns = [
        'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
        'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE',
        'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
    ]
    # Cash/Revolving loans, gender, and the two Y/N ownership flags.
    binary_columns = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
    ratio_excluded = ('TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3')

    def replace_binary_categorical_var(df, column_name):
        """Encode a two-valued column as 0/1, keeping missing values as NaN."""
        # Sorting makes the mapping depend on the category values only. Using the
        # order of first appearance would let the training and the submission
        # tables end up with opposite encodings.
        categories = sorted(df[column_name].dropna().unique())
        assert len(categories) == 2, (
            '%s must have exactly two categories, found %r' % (column_name, categories))
        df[column_name] = df[column_name].map({categories[0]: 0, categories[1]: 1})

    # Work on a copy so the caller's frame is not modified as a side effect.
    data = data.copy()

    # select_dtypes(np.number) excludes object and bool columns, like the
    # np.issubdtype(dtype, np.number) test it replaces.
    for col in data.select_dtypes(include=[np.number]).columns:
        if col in ratio_excluded:
            continue
        ratio = data[col] / data['AMT_CREDIT']
        # A zero AMT_CREDIT yields +/-inf, which fillna() would not touch.
        data[col + '/(AMT_CREDIT)'] = ratio.replace([np.inf, -np.inf], np.nan)

    # Placeholder strings stand for missing values.
    data = data.replace('XNA', np.nan)
    data['NAME_FAMILY_STATUS'] = data['NAME_FAMILY_STATUS'].replace('Unknown', np.nan)

    for column in binary_columns:
        replace_binary_categorical_var(data, column)

    data = pd.get_dummies(data, columns=categorical_columns)

    # numeric_only guards against any leftover non-numeric column.
    data = data.fillna(data.median(numeric_only=True))

    return data

# Run the same feature processing on the training and on the submission table.
data = process_data(data)
data_submission = process_data(data_submission)


In [3]:
# Load the raw previous-application table (aggregated per SK_ID_CURR below).
previous_data = pd.read_csv('data/previous_application.csv')
def process_previous_application_data(previous_data):
    """Aggregate the previous-application rows into one row per SK_ID_CURR.

    NAME_CONTRACT_STATUS is one-hot encoded, "accepted" variants of the down
    payment share, credit amount and annuity are derived (NaN for every row
    whose application was not approved), several amounts are expressed as a
    share of AMT_CREDIT, and everything is aggregated per SK_ID_CURR.

    Returns a frame with flat column names of the form ``<column>_<aggregate>``
    plus the SK_ID_CURR key column.
    """
    frame = pd.get_dummies(previous_data, columns=['NAME_CONTRACT_STATUS', ])

    # True for every application that was not approved.
    not_approved = frame['NAME_CONTRACT_STATUS_Approved'] == 0

    down_share = frame['AMT_DOWN_PAYMENT'] / frame['AMT_CREDIT']
    frame['percentage_down'] = down_share.mask(not_approved)
    frame['AMT_CREDIT_accepted'] = frame['AMT_CREDIT'].mask(not_approved)
    frame['AMT_ANNUITY_accepted'] = frame['AMT_ANNUITY'].mask(not_approved)

    ratio_sources = ('AMT_ANNUITY', 'AMT_ANNUITY_accepted', 'AMT_APPLICATION', 'AMT_GOODS_PRICE')
    for source in ratio_sources:
        frame[source + '/AMT_CREDIT'] = frame[source] / frame['AMT_CREDIT']

    # Divisions by a zero credit amount produce infinities; treat them as missing.
    infinities = [float('inf'), -float('inf')]
    frame = frame.replace(infinities, np.nan)

    count_mean = ('count', 'mean')
    mean_max = ('mean', 'max')
    aggregations = {
        'SK_ID_PREV': 'count',
        'NAME_CONTRACT_STATUS_Refused': count_mean,
        'NAME_CONTRACT_STATUS_Approved': count_mean,
        'AMT_CREDIT': 'mean',
        'AMT_CREDIT_accepted': mean_max,
        'percentage_down': 'mean',
        'CNT_PAYMENT': mean_max,
        'NFLAG_INSURED_ON_APPROVAL': 'mean',
        'AMT_ANNUITY': mean_max,
        'AMT_ANNUITY_accepted': mean_max,
        'AMT_ANNUITY/AMT_CREDIT': mean_max,
        'AMT_ANNUITY_accepted/AMT_CREDIT': mean_max,
        'AMT_APPLICATION/AMT_CREDIT': mean_max,
        'AMT_GOODS_PRICE/AMT_CREDIT': mean_max,
    }
    per_client = frame.groupby('SK_ID_CURR', as_index=False).agg(aggregations)

    # Flatten the two-level column index; the key column comes out as 'SK_ID_CURR_'.
    per_client.columns = ['_'.join(levels) for levels in per_client.columns.values]
    return per_client.rename(columns={'SK_ID_CURR_': 'SK_ID_CURR'})

previous_data = process_previous_application_data(previous_data)
# The former `previous_data.fillna(0)` statement was removed: its result was
# discarded, so it never changed the frame and only copied it. Missing
# aggregates therefore stay NaN here (exactly as before) and are imputed after
# the merge with the application tables.
previous_data.head()

Unnamed: 0,SK_ID_CURR,AMT_CREDIT_accepted_mean,AMT_CREDIT_accepted_max,SK_ID_PREV_count,AMT_GOODS_PRICE/AMT_CREDIT_mean,AMT_GOODS_PRICE/AMT_CREDIT_max,AMT_ANNUITY_accepted/AMT_CREDIT_mean,AMT_ANNUITY_accepted/AMT_CREDIT_max,CNT_PAYMENT_mean,CNT_PAYMENT_max,...,AMT_ANNUITY_max,percentage_down_mean,AMT_ANNUITY_accepted_mean,AMT_ANNUITY_accepted_max,NFLAG_INSURED_ON_APPROVAL_mean,AMT_CREDIT_mean,NAME_CONTRACT_STATUS_Refused_count,NAME_CONTRACT_STATUS_Refused_mean,AMT_APPLICATION/AMT_CREDIT_mean,AMT_APPLICATION/AMT_CREDIT_max
0,100001,23787.0,23787.0,1,1.044079,1.044079,0.166099,0.166099,8.0,8.0,...,3951.0,0.10594,3951.0,3951.0,0.0,23787.0,1,0.0,1.044079,1.044079
1,100002,179055.0,179055.0,1,1.0,1.0,0.05167,0.05167,24.0,24.0,...,9251.775,0.0,9251.775,9251.775,0.0,179055.0,1,0.0,1.0,1.0
2,100003,484191.0,1035882.0,3,0.949329,1.011109,0.126383,0.1852,10.0,12.0,...,98356.995,0.050585,56553.99,98356.995,0.666667,484191.0,3,0.0,0.949329,1.011109
3,100004,20106.0,20106.0,1,1.207699,1.207699,0.26645,0.26645,4.0,4.0,...,5357.25,0.241719,5357.25,5357.25,0.0,20106.0,1,0.0,1.207699,1.207699
4,100005,40153.5,40153.5,2,1.111173,1.111173,0.11987,0.11987,12.0,12.0,...,4813.2,0.111173,4813.2,4813.2,0.0,20076.75,2,0.0,1.111173,1.111173


In [4]:
# Load the bureau table and its monthly balance history (joined on SK_ID_BUREAU below).
data_bureau = pd.read_csv('data/bureau.csv')
bureau_balance = pd.read_csv('data/bureau_balance.csv')

def process_bureau(data_bureau,bureau_balance):
    
    bureau_balance = pd.get_dummies(bureau_balance,columns = ['STATUS'])

    list_replacement_0 = [-i for i in range(12)]
    list_replacement_1 = [-i for i in range(12,97)]

    bureau_balance['MONTHS_BALANCE'] = bureau_balance['MONTHS_BALANCE'].replace(list_replacement_0,0)
    bureau_balance['MONTHS_BALANCE'] = bureau_balance['MONTHS_BALANCE'].replace(list_replacement_1,1)

    bureau_balance = bureau_balance.groupby('SK_ID_BUREAU', as_index = False).agg({'MONTHS_BALANCE':'mean','STATUS_0':'mean','STATUS_1':'mean','STATUS_2':'mean','STATUS_3':'mean','STATUS_4':'mean','STATUS_5':'mean','STATUS_C':'mean','STATUS_X':'mean'})
   
    data_bureau = data_bureau.merge(bureau_balance, on = 'SK_ID_BUREAU',how = 'left')
    
    data_bureau = data_bureau.replace(['Mobile operator loan','Interbank credit','Loan for purchase of shares (margin lending)','Loan for purchase of equipment','Another type of loan','Unknown type of loan'],'Other')

    data_bureau = pd.get_dummies(data_bureau,columns = ['CREDIT_ACTIVE','CREDIT_CURRENCY','CREDIT_TYPE'])
    data_bureau = data_bureau.drop(['DAYS_CREDIT_UPDATE','CREDIT_CURRENCY_currency 1','CREDIT_CURRENCY_currency 2','CREDIT_CURRENCY_currency 3','CREDIT_CURRENCY_currency 4'],axis=1)
    data_bureau = data_bureau.fillna(data_bureau.median())
    for column in data_bureau.columns:
        if column.startswith('AMT_'):
            data_bureau[column + '/' + 'AMT_CREDIT_SUM'] = data_bureau[column] / data_bureau['AMT_CREDIT_SUM']
            
    data_bureau = data_bureau.replace([float('inf'), -float('inf')], np.nan)
    data_bureau = data_bureau.groupby('SK_ID_CURR', as_index = False).agg(
        {'SK_ID_BUREAU':'count','DAYS_CREDIT':'min',
         'CREDIT_DAY_OVERDUE':'max',
         'DAYS_CREDIT_ENDDATE':'max',
         'DAYS_ENDDATE_FACT':'min',
         'AMT_CREDIT_MAX_OVERDUE':['max','mean'],
         'AMT_CREDIT_MAX_OVERDUE/AMT_CREDIT_SUM':['max','mean'],
         'CNT_CREDIT_PROLONG':['max','median'],
         'AMT_CREDIT_SUM':['max','median'],
         'AMT_CREDIT_SUM_DEBT':'sum',
         'AMT_CREDIT_SUM_DEBT/AMT_CREDIT_SUM':['max', 'mean'],
         'AMT_CREDIT_SUM_LIMIT':'max',
         'AMT_CREDIT_SUM_LIMIT/AMT_CREDIT_SUM':['max', 'mean'],
         'AMT_CREDIT_SUM_OVERDUE':'sum',
         'AMT_CREDIT_SUM_OVERDUE/AMT_CREDIT_SUM':['max', 'mean'],
         'CREDIT_ACTIVE_Active':['count', 'mean'],
         'CREDIT_ACTIVE_Bad debt':['count', 'mean'],
         'CREDIT_ACTIVE_Closed':['count', 'mean'],
         'CREDIT_ACTIVE_Sold':['count', 'mean'],
         'CREDIT_TYPE_Car loan':'mean',
         'CREDIT_TYPE_Cash loan (non-earmarked)':'mean',
         'CREDIT_TYPE_Consumer credit':'mean',
         'CREDIT_TYPE_Credit card':'mean',
         'CREDIT_TYPE_Loan for business development':'mean',
         'CREDIT_TYPE_Loan for the purchase of equipment':'mean',
         'CREDIT_TYPE_Loan for working capital replenishment':'mean',
         'CREDIT_TYPE_Microloan':'mean',
         'CREDIT_TYPE_Mortgage':'mean',
         'CREDIT_TYPE_Other':'mean',
         'CREDIT_TYPE_Real estate loan':'mean',
         'AMT_ANNUITY':['max', 'mean'],
         'AMT_ANNUITY/AMT_CREDIT_SUM':['max', 'mean']})

    data_bureau.columns = list(map('_'.join, data_bureau.columns.values))
    data_bureau = data_bureau.rename(columns={'SK_ID_CURR_': 'SK_ID_CURR'})
    
    
    return data_bureau

# Replace the raw bureau table by its per-SK_ID_CURR aggregate and preview it.
data_bureau = process_bureau(data_bureau,bureau_balance)
data_bureau.head()

Unnamed: 0,SK_ID_CURR,CREDIT_TYPE_Microloan_mean,CREDIT_ACTIVE_Closed_count,CREDIT_ACTIVE_Closed_mean,CREDIT_ACTIVE_Sold_count,CREDIT_ACTIVE_Sold_mean,DAYS_CREDIT_ENDDATE_max,CREDIT_TYPE_Loan for the purchase of equipment_mean,CREDIT_TYPE_Mortgage_mean,CREDIT_ACTIVE_Active_count,...,CREDIT_TYPE_Credit card_mean,AMT_CREDIT_MAX_OVERDUE/AMT_CREDIT_SUM_max,AMT_CREDIT_MAX_OVERDUE/AMT_CREDIT_SUM_mean,DAYS_ENDDATE_FACT_min,AMT_ANNUITY/AMT_CREDIT_SUM_max,AMT_ANNUITY/AMT_CREDIT_SUM_mean,CREDIT_TYPE_Other_mean,CREDIT_TYPE_Loan for business development_mean,AMT_CREDIT_SUM_max,AMT_CREDIT_SUM_median
0,100001,0.0,7,0.571429,7,0.0,1778.0,0.0,0.0,7,...,0.0,0.0,0.0,-1328.0,0.055627,0.013996,0.0,0.0,378000.0,168345.0
1,100002,0.0,8,0.75,8,0.0,780.0,0.0,0.0,8,...,0.5,0.174139,0.031026,-1185.0,0.0,0.0,0.0,0.0,450000.0,54130.5
2,100003,0.0,4,0.75,4,0.0,1216.0,0.0,0.0,4,...,0.5,0.0,0.0,-2131.0,0.0,0.0,0.0,0.0,810000.0,92576.25
3,100004,0.0,2,1.0,2,0.0,-382.0,0.0,0.0,2,...,0.0,0.0,0.0,-683.0,0.0,0.0,0.0,0.0,94537.8,94518.9
4,100005,0.0,3,0.333333,3,0.0,1324.0,0.0,0.0,3,...,0.333333,0.0,0.0,-897.0,0.142879,0.047626,0.0,0.0,568800.0,58500.0


In [5]:
# Load the raw POS/cash balance table (aggregated per SK_ID_CURR below).
data_pos_cash = pd.read_csv('data/POS_CASH_balance.csv')
def process_poscash(data_pos_cash):
    """Aggregate the POS/cash balance rows into one row per SK_ID_CURR.

    For every (SK_ID_CURR, SK_ID_PREV) pair the days-past-due columns are
    summed and maximised, and MONTHS_LEFT_TO_PAY is the largest
    CNT_INSTALMENT_FUTURE among the rows whose MONTHS_BALANCE is 0 or -1
    (0 when there is no such row). Those per-loan values are then reduced to
    their mean and max per SK_ID_CURR.

    Parameters
    ----------
    data_pos_cash : pandas.DataFrame
        Needs SK_ID_CURR, SK_ID_PREV, MONTHS_BALANCE, CNT_INSTALMENT_FUTURE,
        SK_DPD and SK_DPD_DEF. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns named ``<column>_<per-loan aggregate>_<per-client aggregate>``
        followed by an SK_ID_CURR column. The index holds the same SK_ID_CURR
        values but is unnamed, so merging on the 'SK_ID_CURR' column is not
        ambiguous.
    """
    # The original code rewrote MONTHS_BALANCE -1 as 0 and then kept the rows
    # equal to 0; testing for membership in {-1, 0} selects the same rows
    # without touching the caller's frame.
    is_latest = data_pos_cash['MONTHS_BALANCE'].isin([-1, 0])
    months_left = data_pos_cash['CNT_INSTALMENT_FUTURE'].where(is_latest)

    key_and_dpd = ['SK_ID_CURR', 'SK_ID_PREV', 'SK_DPD_DEF', 'SK_DPD']
    pos_cash = data_pos_cash[key_and_dpd].assign(MONTHS_LEFT_TO_PAY=months_left)

    # The keys must be a list: a tuple is treated as ONE key by current pandas.
    per_loan = pos_cash.groupby(['SK_ID_CURR', 'SK_ID_PREV']).agg({
        'SK_DPD_DEF': ['sum', 'max'],
        'SK_DPD': ['sum', 'max'],
        'MONTHS_LEFT_TO_PAY': 'max',
    }).fillna(0)

    aggregated = per_loan.groupby(level='SK_ID_CURR').agg(['mean', 'max'])

    aggregated.columns = ['_'.join(levels) for levels in aggregated.columns.values]
    aggregated['SK_ID_CURR'] = aggregated.index
    # Having 'SK_ID_CURR' as both the index name and a column label makes
    # merge(on='SK_ID_CURR') raise an ambiguity error, so drop the index name.
    aggregated.index.name = None
    return aggregated

# Replace the raw POS/cash table by its per-SK_ID_CURR aggregate and preview it.
data_pos_cash = process_poscash(data_pos_cash)
data_pos_cash.head()

Unnamed: 0_level_0,MONTHS_LEFT_TO_PAY_max_mean,MONTHS_LEFT_TO_PAY_max_max,SK_DPD_DEF_sum_mean,SK_DPD_DEF_sum_max,SK_DPD_DEF_max_mean,SK_DPD_DEF_max_max,SK_DPD_sum_mean,SK_DPD_sum_max,SK_DPD_max_mean,SK_DPD_max_max,SK_ID_CURR
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100001,0.0,0.0,3.5,7,3.5,7,3.5,7,3.5,7,100001
100002,6.0,6.0,0.0,0,0.0,0,0.0,0,0.0,0,100002
100003,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0,100003
100004,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0,100004
100005,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0,100005


In [6]:
# Explicit fill values for selected previous-application aggregates; every
# other missing value falls back to the training-set median further down.
default_values = {
    'AMT_CREDIT_mean': 0,
    'AMT_CREDIT_accepted_max': 0,
    'AMT_CREDIT_accepted_mean': 0,
    'NAME_CONTRACT_STATUS_Approved_mean': 1.0,
    'NAME_CONTRACT_STATUS_Refused_mean': 0.0,
    'SK_ID_PREV_count': 0
}


def _add_auxiliary_features(frame):
    """Left-join the three aggregated side tables onto `frame`, drop the key,
    add the hand-made ratio features and apply `default_values`.

    The training and the submission frame both go through this one function so
    they cannot drift apart (the copy-pasted version merged the POS/cash table
    with suffix '_bureau' for one frame and '_poscash' for the other).
    """
    frame = frame.merge(previous_data, on='SK_ID_CURR', how='left', suffixes=('', '_previous'))
    frame = frame.merge(data_bureau, on='SK_ID_CURR', how='left', suffixes=('', '_bureau'))
    # reset_index: the POS/cash aggregate may carry SK_ID_CURR as its index name
    # as well as a column, which would make the merge key ambiguous.
    frame = frame.merge(data_pos_cash.reset_index(drop=True), on='SK_ID_CURR', how='left',
                        suffixes=('', '_poscash'))

    frame = frame.drop('SK_ID_CURR', axis=1)

    # manual features (computed before imputation, so they stay NaN when the
    # previous-application aggregate is missing)
    frame['AMT_PREVIOUS_CREDIT/AMT_CREDIT'] = frame['AMT_CREDIT_accepted_mean'] / frame['AMT_CREDIT']
    frame['AMT_PREVIOUS_MAXCREDIT/AMT_CREDIT'] = frame['AMT_CREDIT_accepted_max'] / frame['AMT_CREDIT']
    frame['AMT_PREVIOUS_ANNUITY/AMT_ANNUITY'] = frame['AMT_ANNUITY_accepted_mean'] / frame['AMT_ANNUITY']

    return frame.fillna(default_values)


data = _add_auxiliary_features(data)
data_submission = _add_auxiliary_features(data_submission)

# Impute both frames with the medians of the TRAINING frame, computed once.
# (Filling NaNs with the median does not move the median, so this matches the
# previous code, which recomputed it after the training frame had been filled.)
train_medians = data.median()
data = data.fillna(train_medians)
data_submission = data_submission.fillna(train_medians)

del previous_data, data_bureau, data_pos_cash

In [7]:
# Columns of the training frame that the submission frame lacks.
print(set(data.columns) - set(data_submission.columns))

# Align the submission frame with the training features: same columns, same
# ORDER. Dummy columns that only exist in the training data are added as 0.
# Appending the missing column at the end (as before) left the two frames with
# different column orders, and they are converted to positional arrays next.
data_submission = data_submission.reindex(columns=data.columns.drop('TARGET'), fill_value=0)

set(['NAME_INCOME_TYPE_Maternity leave', 'TARGET'])


In [8]:
# `.values` replaces DataFrame.as_matrix(), which no longer exists in current pandas.
data_x = data.drop('TARGET', axis=1).values
data_y = data['TARGET'].values
data_submission_x = data_submission.values

# Hold out 5% of the rows for evaluation; the fixed seed makes the split (and
# therefore every score below) reproducible from run to run.
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.05, random_state=546789)
del data_x, data_y

In [9]:
# Rescale the features with a MinMaxScaler (default settings). The scaler is
# fitted on the training split only and then reused, unchanged, for the
# hold-out split and the submission matrix.
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)
data_submission_x = scaler.transform(data_submission_x)

In [10]:
# Baseline model: logistic regression with default settings.
# (LogisticRegression is imported in the first cell with the other dependencies.)
lr = LogisticRegression()
lr.fit(train_x, train_y)

  from ..utils.seq_dataset import ArrayDataset, CSRDataset
  from ..utils import arrayfuncs, as_float_array, check_X_y, deprecated
  from . import cd_fast
  from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber
  from .sag_fast import sag
  from . import libsvm, liblinear
  from . import libsvm_sparse
  from .ball_tree import BallTree
  from .kd_tree import KDTree


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
# ROC AUC of the logistic-regression baseline on the hold-out split, computed
# from the second column of predict_proba.
roc_auc_score(test_y, lr.predict_proba(test_x)[:,1])

0.766039693769571

In [None]:
# Gradient-boosted trees with default settings, fitted on the same training split.
xgb = XGBClassifier()
xgb.fit(train_x, train_y)

In [None]:
# ROC AUC on the training split and on the hold-out split; the gap between the
# two shows how much the model overfits.
print('train_score:', roc_auc_score(train_y, xgb.predict_proba(train_x)[:,1]))
print('test_score:', roc_auc_score(test_y, xgb.predict_proba(test_x)[:,1]))

In [None]:
print('TOP 20 Importances')
# The model was trained on data.drop('TARGET', axis=1), so take the feature
# names from exactly that column set instead of assuming TARGET is column 0.
feature_names = data.columns.drop('TARGET')
assert len(feature_names) == len(xgb.feature_importances_), 'feature names do not match the fitted model'
for importance, col in sorted(zip(list(xgb.feature_importances_), list(feature_names)), reverse=True)[:20]:
    print(str(importance) + '\t' + col)

In [None]:
# Score the submission matrix with the XGBoost model (second predict_proba column).
predicted = xgb.predict_proba(data_submission_x)[:, 1]

# Pair each prediction with the submission id saved before SK_ID_CURR was dropped.
results = pd.DataFrame({'SK_ID_CURR': submission_id, 'TARGET': predicted})

# Written without the index column.
results.to_csv('predictions.csv', index=False)

In [None]:
# Free the arrays of the single-split experiments; the K-fold run below works
# on the DataFrames directly.
del train_x, train_y, test_x, test_y, data_submission_x
# train_model lives in the local `script` module, which is not part of this notebook.
from script import train_model
# 5 shuffled folds with a fixed seed.
folds = KFold(n_splits=5, shuffle=True, random_state=546789)
# Train model and get oof and test predictions
# NOTE(review): judging by the names it is unpacked into, train_model returns
# out-of-fold predictions, test predictions and feature importances - confirm in script.py.
oof_preds, test_preds, importances = train_model(data.drop('TARGET', axis=1), data_submission, data['TARGET'], folds)

  from .linbin import fast_linbin
  from .linbin import fast_linbin
  from ._smoothers_lowess import lowess as _lowess
  from ._smoothers_lowess import lowess as _lowess
  from . import _vq
  from . import _hierarchy, _optimal_leaf_ordering


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.781082	valid_1's auc: 0.763879
[200]	training's auc: 0.801366	valid_1's auc: 0.774885
[300]	training's auc: 0.814966	valid_1's auc: 0.77896
[400]	training's auc: 0.826446	valid_1's auc: 0.78054
[500]	training's auc: 0.836191	valid_1's auc: 0.781297
[600]	training's auc: 0.845255	valid_1's auc: 0.781734
[700]	training's auc: 0.853304	valid_1's auc: 0.782044
[800]	training's auc: 0.860835	valid_1's auc: 0.781951
Early stopping, best iteration is:
[745]	training's auc: 0.85652	valid_1's auc: 0.78212
Fold  1 AUC : 0.782120
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.781367	valid_1's auc: 0.766811
