In [40]:
import gc, os
from tqdm import tqdm
import pandas as pd
import numpy as np
import sys
sys.path.append(f'/home/{os.environ.get("USER")}/PythonLibrary')
import lgbextension as ex
import lightgbm as lgb
from matplotlib import pyplot as plt
from multiprocessing import cpu_count, Pool
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from glob import glob

In [41]:
# Number of cross-validation folds; also used as the modulus when bucketing
# SK_ID_CURR into the groups handed to GroupKFold.
NFOLD = 5

In [2]:
# Load the current-application table. In this notebook it is used to find the
# columns it shares with `prev` and to contribute values when fitting the label encoders.
train = pd.read_csv('../input/application_train.csv.zip')

In [3]:
# Load the previous-applications table; features and candidate targets are both taken from it.
prev = pd.read_csv('../input/previous_application.csv.zip')

In [4]:
# Align on the column axis with an inner join: only columns present in both tables survive.
# X_train holds those columns taken from `prev`, X_test the same columns taken from `train`;
# rows are not matched or filtered by this call.
X_train, X_test = prev.align(train, join='inner', axis=1)

In [30]:
# Remove the applicant identifier from the feature matrix.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# makes the cell idempotent: running it again after the column is already gone
# no longer raises KeyError.
X_train = X_train.drop(columns='SK_ID_CURR', errors='ignore')
X_train.head()

Unnamed: 0,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,NAME_TYPE_SUITE
0,1,1730.43,17145.0,17145.0,2,15,7
1,0,25188.615,679671.0,607500.0,4,11,6
2,0,15060.735,136444.5,112500.0,5,11,5
3,0,47041.335,470790.0,450000.0,1,7,7
4,0,31924.395,404055.0,337500.0,4,9,7


In [6]:
# Preview the shared columns as stored in `prev`, plus NAME_SELLER_INDUSTRY for comparison.
prev[X_test.columns.tolist()+['NAME_SELLER_INDUSTRY']].head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,NAME_TYPE_SUITE,NAME_SELLER_INDUSTRY
0,271877,Consumer loans,1730.43,17145.0,17145.0,SATURDAY,15,,Connectivity
1,108129,Cash loans,25188.615,679671.0,607500.0,THURSDAY,11,Unaccompanied,XNA
2,122040,Cash loans,15060.735,136444.5,112500.0,TUESDAY,11,"Spouse, partner",XNA
3,176158,Cash loans,47041.335,470790.0,450000.0,MONDAY,7,,XNA
4,202054,Cash loans,31924.395,404055.0,337500.0,THURSDAY,9,,XNA


In [8]:
# Names of the object-dtype feature columns, in frame order; these are the
# columns that get label-encoded further down.
col_cat = [name for name, dtype in X_train.dtypes.items() if dtype == object]

In [9]:
# Display the detected categorical column names.
col_cat

['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'NAME_TYPE_SUITE']

In [11]:
# Label-encode every categorical column. The encoder is fitted on the values of
# X_train and X_test together, so both frames share one integer code per category.
le = LabelEncoder()
for c in col_cat:
    # Assign the filled column back to the frame. Calling fillna(inplace=True) on a
    # column selection is chained in-place mutation: it is not guaranteed to reach
    # the parent frame and is a no-op under pandas Copy-on-Write.
    X_train[c] = X_train[c].fillna('na dayo')
    X_test[c] = X_test[c].fillna('na dayo')
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    # fit() discards any previous state, so reusing `le` across columns is safe.
    le.fit(pd.concat([X_train[c], X_test[c]]))
    X_train[c] = le.transform(X_train[c])
    X_test[c] = le.transform(X_test[c])

In [13]:
# Candidate target columns: everything in `prev` that is not one of the columns
# shared with `train`. The shared set is read from X_test, which always keeps
# SK_ID_CURR, so the identifier never leaks into the target list regardless of
# whether the cell that drops it from X_train has already been executed.
y_names = prev.columns.difference(X_test.columns).tolist()
y_names

['AMT_APPLICATION',
 'AMT_DOWN_PAYMENT',
 'CHANNEL_TYPE',
 'CNT_PAYMENT',
 'CODE_REJECT_REASON',
 'DAYS_DECISION',
 'DAYS_FIRST_DRAWING',
 'DAYS_FIRST_DUE',
 'DAYS_LAST_DUE',
 'DAYS_LAST_DUE_1ST_VERSION',
 'DAYS_TERMINATION',
 'FLAG_LAST_APPL_PER_CONTRACT',
 'NAME_CASH_LOAN_PURPOSE',
 'NAME_CLIENT_TYPE',
 'NAME_CONTRACT_STATUS',
 'NAME_GOODS_CATEGORY',
 'NAME_PAYMENT_TYPE',
 'NAME_PORTFOLIO',
 'NAME_PRODUCT_TYPE',
 'NAME_SELLER_INDUSTRY',
 'NAME_YIELD_GROUP',
 'NFLAG_INSURED_ON_APPROVAL',
 'NFLAG_LAST_APPL_IN_DAY',
 'PRODUCT_COMBINATION',
 'RATE_DOWN_PAYMENT',
 'RATE_INTEREST_PRIMARY',
 'RATE_INTEREST_PRIVILEGED',
 'SELLERPLACE_AREA',
 'SK_ID_PREV']

In [21]:
# Column dtypes of the candidate targets.
prev[y_names].dtypes

AMT_APPLICATION                float64
AMT_DOWN_PAYMENT               float64
CHANNEL_TYPE                    object
CNT_PAYMENT                    float64
CODE_REJECT_REASON              object
DAYS_DECISION                    int64
DAYS_FIRST_DRAWING             float64
DAYS_FIRST_DUE                 float64
DAYS_LAST_DUE                  float64
DAYS_LAST_DUE_1ST_VERSION      float64
DAYS_TERMINATION               float64
FLAG_LAST_APPL_PER_CONTRACT     object
NAME_CASH_LOAN_PURPOSE          object
NAME_CLIENT_TYPE                object
NAME_CONTRACT_STATUS            object
NAME_GOODS_CATEGORY             object
NAME_PAYMENT_TYPE               object
NAME_PORTFOLIO                  object
NAME_PRODUCT_TYPE               object
NAME_SELLER_INDUSTRY            object
NAME_YIELD_GROUP                object
NFLAG_INSURED_ON_APPROVAL      float64
NFLAG_LAST_APPL_IN_DAY           int64
PRODUCT_COMBINATION             object
RATE_DOWN_PAYMENT              float64
RATE_INTEREST_PRIMARY    

In [22]:
# Number of missing values in each candidate target column.
prev[y_names].isnull().sum()

AMT_APPLICATION                      0
AMT_DOWN_PAYMENT                895844
CHANNEL_TYPE                         0
CNT_PAYMENT                     372230
CODE_REJECT_REASON                   0
DAYS_DECISION                        0
DAYS_FIRST_DRAWING              673065
DAYS_FIRST_DUE                  673065
DAYS_LAST_DUE                   673065
DAYS_LAST_DUE_1ST_VERSION       673065
DAYS_TERMINATION                673065
FLAG_LAST_APPL_PER_CONTRACT          0
NAME_CASH_LOAN_PURPOSE               0
NAME_CLIENT_TYPE                     0
NAME_CONTRACT_STATUS                 0
NAME_GOODS_CATEGORY                  0
NAME_PAYMENT_TYPE                    0
NAME_PORTFOLIO                       0
NAME_PRODUCT_TYPE                    0
NAME_SELLER_INDUSTRY                 0
NAME_YIELD_GROUP                     0
NFLAG_INSURED_ON_APPROVAL       673065
NFLAG_LAST_APPL_IN_DAY               0
PRODUCT_COMBINATION                346
RATE_DOWN_PAYMENT               895844
RATE_INTEREST_PRIMARY    

In [44]:
# First rows of the candidate target columns.
prev[y_names].head()

Unnamed: 0,AMT_APPLICATION,AMT_DOWN_PAYMENT,CHANNEL_TYPE,CNT_PAYMENT,CODE_REJECT_REASON,DAYS_DECISION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE,DAYS_LAST_DUE_1ST_VERSION,...,NAME_SELLER_INDUSTRY,NAME_YIELD_GROUP,NFLAG_INSURED_ON_APPROVAL,NFLAG_LAST_APPL_IN_DAY,PRODUCT_COMBINATION,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,SELLERPLACE_AREA,SK_ID_PREV
0,17145.0,0.0,Country-wide,12.0,XAP,-73,365243.0,-42.0,-42.0,300.0,...,Connectivity,middle,0.0,1,POS mobile with interest,0.0,0.182832,0.867336,35,2030495
1,607500.0,,Contact center,36.0,XAP,-164,365243.0,-134.0,365243.0,916.0,...,XNA,low_action,1.0,1,Cash X-Sell: low,,,,-1,2802425
2,112500.0,,Credit and cash offices,12.0,XAP,-301,365243.0,-271.0,365243.0,59.0,...,XNA,high,1.0,1,Cash X-Sell: high,,,,-1,2523466
3,450000.0,,Credit and cash offices,12.0,XAP,-512,365243.0,-482.0,-182.0,-152.0,...,XNA,middle,1.0,1,Cash X-Sell: middle,,,,-1,2819243
4,337500.0,,Credit and cash offices,24.0,HC,-781,,,,,...,XNA,high,,1,Cash Street: high,,,,-1,1784265


In [35]:
SEED = 71

# LightGBM settings shared by the binary and the regression configuration.
# They used to be spelled out twice; keeping a single copy prevents the two
# configurations from drifting apart when a value is tuned.
_param_common = {
    'learning_rate': 0.01,

    'max_depth': 6,
    'num_leaves': 63,
    'max_bin': 255,

    'min_child_weight': 10,
    'min_data_in_leaf': 150,
    'reg_lambda': 0.5,  # L2 regularization term on weights.
    'reg_alpha': 0.5,  # L1 regularization term on weights.

    'colsample_bytree': 0.9,
    'subsample': 0.9,
    'nthread': cpu_count(),  # use every available core
    'bagging_freq': 1,
    'verbose': -1,
    'seed': SEED,
}

# Each configuration is an independent dict: objective/metric first, then the
# shared settings, i.e. the same keys in the same order as before.
param_bin = {'objective': 'binary', 'metric': 'auc', **_param_common}

param_reg = {'objective': 'regression', 'metric': 'rmse', **_param_common}


In [42]:
group_kfold = GroupKFold(n_splits=NFOLD)

# One CV group label per row of `prev`: the applicant id modulo NFOLD. Rows that
# share an SK_ID_CURR get the same label, so GroupKFold never splits one
# applicant's rows between the training and validation side of a fold.
# .assign builds a new frame, which avoids the SettingWithCopyWarning raised when
# a column is added to the prev[['SK_ID_CURR']] selection.
sub_train = prev[['SK_ID_CURR']].assign(g=prev['SK_ID_CURR'] % NFOLD)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [23]:
# Work on the first candidate target only.
y_name = y_names[0]

In [29]:
# Target column as a Series, row-aligned with X_train (both come from `prev`).
y = prev[y_name]

In [38]:
# The model is trained on log1p(target), so the RMSE reported below is on the
# log1p scale. np.log1p is applied to the whole Series in one vectorised call
# rather than element by element through Series.map.
dtrain = lgb.Dataset(X_train, np.log1p(y), categorical_feature=col_cat)
gc.collect()

# Cross-validate with the precomputed applicant groups; early stopping ends the
# run once the CV metric has not improved for 100 rounds.
# NOTE(review): stock lightgbm.cv returns a single dict of evaluation history;
# the two-value unpacking presumably relies on a customised cv that also returns
# the per-fold models -- confirm against the installed LightGBM.
ret, models = lgb.cv(param_reg, dtrain, 99999, stratified=False,
                     folds=group_kfold.split(X_train, y, sub_train['g']),
                     early_stopping_rounds=100, verbose_eval=50,
                     seed=111)




[50]	cv_agg's rmse: 3.07185 + 0.0027036
[100]	cv_agg's rmse: 1.86975 + 0.00164737
[150]	cv_agg's rmse: 1.1401 + 0.00104986
[200]	cv_agg's rmse: 0.694443 + 0.000834006
[250]	cv_agg's rmse: 0.424417 + 0.00116463
[300]	cv_agg's rmse: 0.261803 + 0.00196494
[350]	cv_agg's rmse: 0.165172 + 0.00318776
[400]	cv_agg's rmse: 0.107636 + 0.00497619
[450]	cv_agg's rmse: 0.0756826 + 0.00716195
[500]	cv_agg's rmse: 0.0583175 + 0.00938678
[550]	cv_agg's rmse: 0.0497306 + 0.0111414
[600]	cv_agg's rmse: 0.0457236 + 0.0122381
[650]	cv_agg's rmse: 0.0437599 + 0.012881
[700]	cv_agg's rmse: 0.0427608 + 0.0132542
[750]	cv_agg's rmse: 0.0422582 + 0.0134538
[800]	cv_agg's rmse: 0.0420215 + 0.0135494
[850]	cv_agg's rmse: 0.0418701 + 0.0136045
[900]	cv_agg's rmse: 0.0417752 + 0.0136433
[950]	cv_agg's rmse: 0.0416709 + 0.0136799
[1000]	cv_agg's rmse: 0.0415476 + 0.0137071
[1050]	cv_agg's rmse: 0.0414483 + 0.0137284
[1100]	cv_agg's rmse: 0.0413735 + 0.0137448
[1150]	cv_agg's rmse: 0.0413096 + 0.0137523
[1200]	cv_a

KeyboardInterrupt: 

In [27]:
# Preview the encoded feature matrix.
# NOTE(review): the saved output still lists SK_ID_CURR, so it was produced
# before the cell that drops that column from X_train was executed.
X_train.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,NAME_TYPE_SUITE
0,271877,1,1730.43,17145.0,17145.0,2,15,7
1,108129,0,25188.615,679671.0,607500.0,4,11,6
2,122040,0,15060.735,136444.5,112500.0,5,11,5
3,176158,0,47041.335,470790.0,450000.0,1,7,7
4,202054,0,31924.395,404055.0,337500.0,4,9,7


In [28]:
# Display the target.
# NOTE(review): the saved output is a multi-column frame, so it predates the
# cell that sets y = prev[y_name] (a single column); re-run to refresh it.
y

Unnamed: 0,AMT_APPLICATION,AMT_DOWN_PAYMENT,CHANNEL_TYPE,CNT_PAYMENT,CODE_REJECT_REASON,DAYS_DECISION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE,DAYS_LAST_DUE_1ST_VERSION,...,NAME_SELLER_INDUSTRY,NAME_YIELD_GROUP,NFLAG_INSURED_ON_APPROVAL,NFLAG_LAST_APPL_IN_DAY,PRODUCT_COMBINATION,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,SELLERPLACE_AREA,SK_ID_PREV
0,17145.0,0.0,Country-wide,12.0,XAP,-73,365243.0,-42.0,-42.0,300.0,...,Connectivity,middle,0.0,1,POS mobile with interest,0.000000,0.182832,0.867336,35,2030495
1,607500.0,,Contact center,36.0,XAP,-164,365243.0,-134.0,365243.0,916.0,...,XNA,low_action,1.0,1,Cash X-Sell: low,,,,-1,2802425
2,112500.0,,Credit and cash offices,12.0,XAP,-301,365243.0,-271.0,365243.0,59.0,...,XNA,high,1.0,1,Cash X-Sell: high,,,,-1,2523466
3,450000.0,,Credit and cash offices,12.0,XAP,-512,365243.0,-482.0,-182.0,-152.0,...,XNA,middle,1.0,1,Cash X-Sell: middle,,,,-1,2819243
4,337500.0,,Credit and cash offices,24.0,HC,-781,,,,,...,XNA,high,,1,Cash Street: high,,,,-1,1784265
5,315000.0,,Credit and cash offices,18.0,XAP,-684,365243.0,-654.0,-144.0,-144.0,...,XNA,low_normal,1.0,1,Cash X-Sell: low,,,,-1,1383531
6,0.0,,Credit and cash offices,,XAP,-14,,,,,...,XNA,XNA,,1,Cash,,,,-1,2315218
7,0.0,,Credit and cash offices,,XAP,-21,,,,,...,XNA,XNA,,1,Cash,,,,-1,1656711
8,0.0,,Credit and cash offices,,XAP,-386,,,,,...,XNA,XNA,,1,Cash,,,,-1,2367563
9,0.0,,Credit and cash offices,,XAP,-57,,,,,...,XNA,XNA,,1,Cash,,,,-1,2579447
