In [40]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMClassifier

In [57]:
data_pos_cash = pd.read_csv('data/POS_CASH_balance.csv')

In [59]:
data_pos_cash['NAME_CONTRACT_STATUS'] = data_pos_cash['NAME_CONTRACT_STATUS'].replace(['Signed','Demand','Returned to the store','Approved','XNA'],0)

data_pos_cash['NAME_CONTRACT_STATUS'] = data_pos_cash['NAME_CONTRACT_STATUS'].replace({'Amortized debt': 1,'Active':3,'Canceled':2,'Completed':4})



In [60]:
data_pos_cash.loc[(data_pos_cash['NAME_CONTRACT_STATUS'].isin((1, 3))) & (data_pos_cash['CNT_INSTALMENT_FUTURE'] == 0), 'NAME_CONTRACT_STATUS'] = 4
data_pos_cash.loc[(data_pos_cash['NAME_CONTRACT_STATUS'].isin((1, 3))) & (data_pos_cash['CNT_INSTALMENT_FUTURE'] != 0), 'NAME_CONTRACT_STATUS'] = 2

In [None]:
data_pos_cash['MONTHS_BALANCE'] = data_pos_cash['MONTHS_BALANCE'].replace(-1, 0)

data_pos_cash['MONTHS_LEFT_TO_PAY'] = data_pos_cash['CNT_INSTALMENT_FUTURE']
data_pos_cash.loc[data_pos_cash['MONTHS_BALANCE'] != 0, 'MONTHS_LEFT_TO_PAY'] = np.nan

data_pos_cash = data_pos_cash.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])
data_pos_cash['CNT_INSTALMENT_DIFF'] = data_pos_cash['CNT_INSTALMENT'].diff()

data_pos_cash.loc[data_pos_cash['SK_ID_CURR'].diff() != 0, 'CNT_INSTALMENT_DIFF'] = np.nan

aggregated = data_pos_cash.groupby(('SK_ID_CURR', 'SK_ID_PREV')).agg({
    'SK_DPD_DEF': ('sum', 'max'),
    'SK_DPD': ('sum', 'max'),
    'MONTHS_LEFT_TO_PAY': ('max'),
    'NAME_CONTRACT_STATUS': 'max'
}).fillna(0)

agg_dict = {col: ('mean', 'max') for col in aggregated.columns}
agg_dict[('NAME_CONTRACT_STATUS', 'max')] = ('min')

aggregated = aggregated.groupby('SK_ID_CURR').agg(agg_dict)

aggregated.columns = list(map('_'.join, aggregated.columns.values))
aggregated['SK_ID_CURR'] = aggregated.index


In [62]:
data_pos_cash.head(7)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,MONTHS_LEFT_TO_PAY
0,1803195,182943,-31,48.0,45.0,2,0,0,
1,1715348,367990,-33,36.0,35.0,2,0,0,
2,1784872,397406,-32,12.0,9.0,2,0,0,
3,1903291,269225,-35,48.0,42.0,2,0,0,
4,2341044,334279,-35,36.0,35.0,2,0,0,
5,2207092,342166,-32,12.0,12.0,2,0,0,
6,1110516,204376,-38,48.0,43.0,2,0,0,


In [65]:
aggregated['NAME_CONTRACT_STATUS_max_min'] = aggregated['NAME_CONTRACT_STATUS_max_min'].replace({4: 1,2:0})

aggregated

Unnamed: 0_level_0,SK_DPD_max_mean,SK_DPD_max_max,SK_DPD_sum_mean,SK_DPD_sum_max,MONTHS_LEFT_TO_PAY_max_mean,MONTHS_LEFT_TO_PAY_max_max,SK_DPD_DEF_max_mean,SK_DPD_DEF_max_max,SK_DPD_DEF_sum_mean,SK_DPD_DEF_sum_max,NAME_CONTRACT_STATUS_max_min,SK_ID_CURR
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
100001,3.500000,7,3.500000,7,0.000000,0.0,3.500000,7,3.500000,7,1,100001
100002,0.000000,0,0.000000,0,6.000000,6.0,0.000000,0,0.000000,0,0,100002
100003,0.000000,0,0.000000,0,0.000000,0.0,0.000000,0,0.000000,0,0,100003
100004,0.000000,0,0.000000,0,0.000000,0.0,0.000000,0,0.000000,0,1,100004
100005,0.000000,0,0.000000,0,0.000000,0.0,0.000000,0,0.000000,0,1,100005
100006,0.000000,0,0.000000,0,1.000000,3.0,0.000000,0,0.000000,0,0,100006
100007,0.000000,0,0.000000,0,2.600000,13.0,0.000000,0,0.000000,0,0,100007
100008,323.500000,1294,7035.500000,28142,0.000000,0.0,0.000000,0,0.000000,0,1,100008
100009,0.000000,0,0.000000,0,1.125000,9.0,0.000000,0,0.000000,0,0,100009
100010,0.000000,0,0.000000,0,0.000000,0.0,0.000000,0,0.000000,0,1,100010


In [64]:
aggregated['NAME_CONTRACT_STATUS_max_min'].value_counts()

4    172076
2    163865
0      1311
Name: NAME_CONTRACT_STATUS_max_min, dtype: int64

In [16]:
data = data.merge(aggregated, on='SK_ID_CURR', how='left', suffixes=('', '_previous'))
data_submission = data_submission.merge(aggregated, on='SK_ID_CURR', how='left', suffixes=('', '_previous'))

data = data.drop('SK_ID_CURR', axis=1)
data_submission = data_submission.drop('SK_ID_CURR', axis=1)

data = data.fillna(data.median())

data_submission = data_submission.fillna(data.median())

In [17]:
data_submission['NAME_INCOME_TYPE_Maternity leave'] = 0

In [21]:
data_x = data.drop('TARGET', axis=1).as_matrix()
data_y = data['TARGET'].as_matrix()
data_submission = data_submission.as_matrix()

train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.05)
del data_x, data_y

In [22]:
lgbm = XGBClassifier()
lgbm.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [23]:
print('train_score:', roc_auc_score(train_y, lgbm.predict_proba(train_x)[:,1]))
print('test_score:', roc_auc_score(test_y, lgbm.predict_proba(test_x)[:,1]))

('train_score:', 0.7670202864020771)
('test_score:', 0.7661513683299861)


In [24]:
predicted = lgbm.predict_proba(data_submission)[:, 1]

results = pd.DataFrame({'SK_ID_CURR': submission_id, 'TARGET': predicted})

results.to_csv('predictions.csv', index=False)

In [27]:
print('TOP 50 Importances')
for importance, col in sorted(zip(list(lgbm.feature_importances_), list(data.columns[1:])), reverse=True)[:50]:
    print(str(importance) + '\t' + col)

TOP 50 Importances
0.16428572	EXT_SOURCE_3
0.12	EXT_SOURCE_2
0.09714286	EXT_SOURCE_1
0.08142857	AMT_ANNUITY/(AMT_CREDIT)
0.052857142	AMT_GOODS_PRICE/(AMT_CREDIT)
0.037142858	DAYS_BIRTH
0.034285713	CODE_GENDER
0.027142856	MONTHS_LEFT_TO_PAY_max_mean
0.025714286	SK_DPD_DEF_max_mean
0.024285715	DAYS_EMPLOYED
0.018571429	AMT_ANNUITY
0.015714286	FLAG_DOCUMENT_3/(AMT_CREDIT)
0.015714286	DAYS_ID_PUBLISH
0.012857143	SK_DPD_DEF_sum_mean
0.012857143	MONTHS_LEFT_TO_PAY_max_max
0.012857143	DAYS_EMPLOYED/(AMT_CREDIT)
0.011428571	AMT_INCOME_TOTAL/(AMT_CREDIT)
0.01	REGION_RATING_CLIENT_W_CITY
0.01	NAME_FAMILY_STATUS_Married
0.01	NAME_EDUCATION_TYPE_Higher education
0.01	AMT_GOODS_PRICE
0.008571428	SK_DPD_sum_mean
0.008571428	SK_DPD_max_mean
0.008571428	OWN_CAR_AGE
0.008571428	FLAG_WORK_PHONE/(AMT_CREDIT)
0.008571428	DEF_30_CNT_SOCIAL_CIRCLE/(AMT_CREDIT)
0.008571428	DAYS_LAST_PHONE_CHANGE/(AMT_CREDIT)
0.007142857	NAME_INCOME_TYPE_Working
0.007142857	NAME_EDUCATION_TYPE_Secondary / secondary special
0.