In [103]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import MinMaxScaler

In [104]:
data = pd.read_csv('data/application_train.csv')
data_submission = pd.read_csv('data/application_test.csv')
submission_id = data_submission['SK_ID_CURR']

print(data.shape)
print(data_submission.shape)

def process_data(data):
    Null_dict = dict()

    for column in ['NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE']:
        Null_dict[column] = 0


    for column in ['NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE']:
        if data[column].isnull().sum()>0:
            Null_dict[column] = data[column].isnull().sum()

        if (data[column]=='XNA').sum()>0:
            Null_dict[column] = (data[column]=='XNA').sum()

    for (key,value) in Null_dict.items():
        Null_dict[key] = float(value)/data.shape[0]

    def replace_binary_categorical_var(df, column_name):
        categories = list(df[column_name].unique())
        if np.nan in categories:
            categories.remove(np.nan)
        assert(len(categories) == 2)
        df.loc[df[column_name] == categories[0], column_name] = 0
        df.loc[df[column_name] == categories[1], column_name] = 1    

    for element in data['FONDKAPREMONT_MODE']:
        if element == 'XNA':
            element = np.nan
    # Name contract type is either Cash loans or Revolving loans
    replace_binary_categorical_var(data, 'NAME_CONTRACT_TYPE')

    # Gender is either male, female or N/A. We'll consider it binary
    data['CODE_GENDER'] = data['CODE_GENDER'].replace('XNA', np.nan)
    replace_binary_categorical_var(data, 'CODE_GENDER')

    # FLAG_OWN_CAR and FLAG_OWN_REALTY are flags, either Y or N
    replace_binary_categorical_var(data, 'FLAG_OWN_CAR')
    replace_binary_categorical_var(data, 'FLAG_OWN_REALTY')

    # We'll consider unknown to be N/A
    data['NAME_FAMILY_STATUS'] = data['NAME_FAMILY_STATUS'].replace('Unknown', np.nan)

    # All these are categorical
    data = pd.get_dummies(data, columns=['NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE'])

    data = data.fillna(data.median())
    return data

data = process_data(data)
data_submission = process_data(data_submission)

(307511, 122)
(48744, 121)


In [105]:
set(data.columns) - set(data_submission.columns)
data_submission['NAME_INCOME_TYPE_Maternity leave'] = 0

Bureau dataset 

In [106]:
data_bureau = pd.read_csv('data/bureau.csv')

In [107]:
data_bureau.dtypes

SK_ID_CURR                  int64
SK_ID_BUREAU                int64
CREDIT_ACTIVE              object
CREDIT_CURRENCY            object
DAYS_CREDIT                 int64
CREDIT_DAY_OVERDUE          int64
DAYS_CREDIT_ENDDATE       float64
DAYS_ENDDATE_FACT         float64
AMT_CREDIT_MAX_OVERDUE    float64
CNT_CREDIT_PROLONG          int64
AMT_CREDIT_SUM            float64
AMT_CREDIT_SUM_DEBT       float64
AMT_CREDIT_SUM_LIMIT      float64
AMT_CREDIT_SUM_OVERDUE    float64
CREDIT_TYPE                object
DAYS_CREDIT_UPDATE          int64
AMT_ANNUITY               float64
dtype: object

In [108]:
object_columns = ['CREDIT_ACTIVE','CREDIT_CURRENCY','CREDIT_TYPE']
dic = dict()

for column in lista:
    dic[column] = data_bureau[column].unique()

print(dic)

{'CREDIT_CURRENCY': array(['currency 1', 'currency 2', 'currency 4', 'currency 3'],
      dtype=object), 'CREDIT_TYPE': array(['Consumer credit', 'Credit card', 'Mortgage', 'Car loan',
       'Microloan', 'Loan for working capital replenishment',
       'Loan for business development', 'Real estate loan',
       'Unknown type of loan', 'Another type of loan',
       'Cash loan (non-earmarked)', 'Loan for the purchase of equipment',
       'Mobile operator loan', 'Interbank credit',
       'Loan for purchase of shares (margin lending)'], dtype=object), 'CREDIT_ACTIVE': array(['Closed', 'Active', 'Sold', 'Bad debt'], dtype=object)}


In [109]:
data_bureau['CREDIT_TYPE'].value_counts()

Consumer credit                                 1251615
Credit card                                      402195
Car loan                                          27690
Mortgage                                          18391
Microloan                                         12413
Loan for business development                      1975
Another type of loan                               1017
Unknown type of loan                                555
Loan for working capital replenishment              469
Cash loan (non-earmarked)                            56
Real estate loan                                     27
Loan for the purchase of equipment                   19
Loan for purchase of shares (margin lending)          4
Interbank credit                                      1
Mobile operator loan                                  1
Name: CREDIT_TYPE, dtype: int64

In [110]:
data_bureau = data_bureau.replace(['Mobile operator loan','Interbank credit','Loan for purchase of shares (margin lending)','Loan for purchase of equipment','Another type of loan','Unknown type of loan'],'Other')

In [111]:
data_bureau = pd.get_dummies(data_bureau,columns = ['CREDIT_ACTIVE','CREDIT_CURRENCY','CREDIT_TYPE'])

In [112]:
Null_dict = dict()

for column in data_bureau.columns:
    Null_dict[column] = 0

for column in data_bureau.columns:
    if data_bureau[column].isnull().sum()>0:
        Null_dict[column] = data_bureau[column].isnull().sum()

for (key,value) in Null_dict.items():
    Null_dict[key] = float(value)/data_bureau.shape[0]

print(Null_dict)

{'CREDIT_ACTIVE_Closed': 0.0, 'CREDIT_ACTIVE_Sold': 0.0, 'CREDIT_TYPE_Car loan': 0.0, 'DAYS_CREDIT': 0.0, 'CREDIT_CURRENCY_currency 4': 0.0, 'CREDIT_CURRENCY_currency 1': 0.0, 'AMT_CREDIT_SUM_OVERDUE': 0.0, 'CREDIT_CURRENCY_currency 3': 0.0, 'CNT_CREDIT_PROLONG': 0.0, 'DAYS_CREDIT_UPDATE': 0.0, 'CREDIT_TYPE_Credit card': 0.0, 'CREDIT_TYPE_Other': 0.0, 'CREDIT_TYPE_Mortgage': 0.0, 'CREDIT_TYPE_Loan for business development': 0.0, 'AMT_CREDIT_SUM': 7.573868522303295e-06, 'CREDIT_TYPE_Microloan': 0.0, 'CREDIT_TYPE_Loan for working capital replenishment': 0.0, 'DAYS_CREDIT_ENDDATE': 0.06149573416420613, 'CREDIT_TYPE_Loan for the purchase of equipment': 0.0, 'CREDIT_ACTIVE_Active': 0.0, 'CREDIT_TYPE_Cash loan (non-earmarked)': 0.0, 'AMT_CREDIT_SUM_LIMIT': 0.3447741472406649, 'SK_ID_CURR': 0.0, 'AMT_CREDIT_SUM_DEBT': 0.15011931755948982, 'SK_ID_BUREAU': 0.0, 'AMT_ANNUITY': 0.714734902949614, 'CREDIT_DAY_OVERDUE': 0.0, 'CREDIT_ACTIVE_Bad debt': 0.0, 'CREDIT_TYPE_Real estate loan': 0.0, 'CREDI

In [113]:
data_bureau = data_bureau.drop(['AMT_ANNUITY','DAYS_CREDIT_UPDATE','CREDIT_CURRENCY_currency 1','CREDIT_CURRENCY_currency 2','CREDIT_CURRENCY_currency 3','CREDIT_CURRENCY_currency 4'],axis=1)

In [114]:
data_bureau = data_bureau.fillna(data_bureau.median())

In [115]:
data_bureau.columns

Index([u'SK_ID_CURR', u'SK_ID_BUREAU', u'DAYS_CREDIT', u'CREDIT_DAY_OVERDUE',
       u'DAYS_CREDIT_ENDDATE', u'DAYS_ENDDATE_FACT', u'AMT_CREDIT_MAX_OVERDUE',
       u'CNT_CREDIT_PROLONG', u'AMT_CREDIT_SUM', u'AMT_CREDIT_SUM_DEBT',
       u'AMT_CREDIT_SUM_LIMIT', u'AMT_CREDIT_SUM_OVERDUE',
       u'CREDIT_ACTIVE_Active', u'CREDIT_ACTIVE_Bad debt',
       u'CREDIT_ACTIVE_Closed', u'CREDIT_ACTIVE_Sold', u'CREDIT_TYPE_Car loan',
       u'CREDIT_TYPE_Cash loan (non-earmarked)',
       u'CREDIT_TYPE_Consumer credit', u'CREDIT_TYPE_Credit card',
       u'CREDIT_TYPE_Loan for business development',
       u'CREDIT_TYPE_Loan for the purchase of equipment',
       u'CREDIT_TYPE_Loan for working capital replenishment',
       u'CREDIT_TYPE_Microloan', u'CREDIT_TYPE_Mortgage', u'CREDIT_TYPE_Other',
       u'CREDIT_TYPE_Real estate loan'],
      dtype='object')

In [125]:
data_bureau = data_bureau.groupby('SK_ID_CURR', as_index = False).agg({'SK_ID_BUREAU':'count','DAYS_CREDIT':'min','CREDIT_DAY_OVERDUE':'max','DAYS_CREDIT_ENDDATE':'max','DAYS_ENDDATE_FACT':'min','AMT_CREDIT_MAX_OVERDUE':['max','mean'],'CNT_CREDIT_PROLONG':['max','median'],'AMT_CREDIT_SUM':['max','median'],'AMT_CREDIT_SUM_DEBT':'sum','AMT_CREDIT_SUM_LIMIT':'max','AMT_CREDIT_SUM_OVERDUE':'sum','CREDIT_ACTIVE_Active':'count','CREDIT_ACTIVE_Bad debt':'count','CREDIT_ACTIVE_Closed':'count','CREDIT_ACTIVE_Sold':'count','CREDIT_TYPE_Car loan':'mean','CREDIT_TYPE_Cash loan (non-earmarked)':'mean','CREDIT_TYPE_Consumer credit':'mean','CREDIT_TYPE_Credit card':'mean','CREDIT_TYPE_Loan for business development':'mean','CREDIT_TYPE_Loan for the purchase of equipment':'mean','CREDIT_TYPE_Loan for working capital replenishment':'mean','CREDIT_TYPE_Microloan':'mean','CREDIT_TYPE_Mortgage':'mean','CREDIT_TYPE_Other':'mean','CREDIT_TYPE_Real estate loan':'mean'})
data_bureau.columns = list(map('_'.join, data_bureau.columns.values))
data_bureau = data_bureau.rename(columns={'SK_ID_CURR_': 'SK_ID_CURR'})

In [126]:
data_bureau.head()

Unnamed: 0,SK_ID_CURR,CREDIT_ACTIVE_Closed_count,CREDIT_ACTIVE_Sold_count,AMT_CREDIT_SUM_DEBT_sum,DAYS_CREDIT_min,AMT_CREDIT_SUM_OVERDUE_sum,CNT_CREDIT_PROLONG_max,CNT_CREDIT_PROLONG_median,CREDIT_TYPE_Credit card_mean,CREDIT_TYPE_Other_mean,...,AMT_CREDIT_SUM_LIMIT_max,CREDIT_TYPE_Car loan_mean,SK_ID_BUREAU_count,CREDIT_DAY_OVERDUE_max,CREDIT_ACTIVE_Bad debt_count,CREDIT_TYPE_Real estate loan_mean,CREDIT_TYPE_Consumer credit_mean,AMT_CREDIT_MAX_OVERDUE_max,AMT_CREDIT_MAX_OVERDUE_mean,DAYS_ENDDATE_FACT_min
0,100001,7,7,596686.5,-1572,0.0,0,0.0,0.0,0.0,...,0.0,0.0,7,0,7,0.0,1.0,0.0,0.0,-1328.0
1,100002,8,8,245781.0,-1437,0.0,0,0.0,0.5,0.0,...,31988.565,0.0,8,0,8,0.0,0.5,5043.645,1050.643125,-1185.0
2,100003,4,4,0.0,-2586,0.0,0,0.0,0.5,0.0,...,810000.0,0.0,4,0,4,0.0,0.5,0.0,0.0,-2131.0
3,100004,2,2,0.0,-1326,0.0,0,0.0,0.0,0.0,...,0.0,0.0,2,0,2,0.0,1.0,0.0,0.0,-683.0
4,100005,3,3,568408.5,-373,0.0,0,0.0,0.333333,0.0,...,0.0,0.0,3,0,3,0.0,0.666667,0.0,0.0,-897.0


In [127]:
data = data.merge(data_bureau, on= 'SK_ID_CURR',how='left')

In [128]:
data_submission = data_submission.merge(data_bureau, on= 'SK_ID_CURR',how='left')

In [129]:
data_bureau.columns

Index([u'SK_ID_CURR', u'CREDIT_ACTIVE_Closed_count',
       u'CREDIT_ACTIVE_Sold_count', u'AMT_CREDIT_SUM_DEBT_sum',
       u'DAYS_CREDIT_min', u'AMT_CREDIT_SUM_OVERDUE_sum',
       u'CNT_CREDIT_PROLONG_max', u'CNT_CREDIT_PROLONG_median',
       u'CREDIT_TYPE_Credit card_mean', u'CREDIT_TYPE_Other_mean',
       u'CREDIT_TYPE_Mortgage_mean',
       u'CREDIT_TYPE_Loan for business development_mean',
       u'AMT_CREDIT_SUM_max', u'AMT_CREDIT_SUM_median',
       u'CREDIT_TYPE_Microloan_mean',
       u'CREDIT_TYPE_Loan for working capital replenishment_mean',
       u'DAYS_CREDIT_ENDDATE_max',
       u'CREDIT_TYPE_Loan for the purchase of equipment_mean',
       u'CREDIT_ACTIVE_Active_count',
       u'CREDIT_TYPE_Cash loan (non-earmarked)_mean',
       u'AMT_CREDIT_SUM_LIMIT_max', u'CREDIT_TYPE_Car loan_mean',
       u'SK_ID_BUREAU_count', u'CREDIT_DAY_OVERDUE_max',
       u'CREDIT_ACTIVE_Bad debt_count', u'CREDIT_TYPE_Real estate loan_mean',
       u'CREDIT_TYPE_Consumer credit_mean', u'A

In [131]:
Null_dic = dict()

for column in ['DAYS_CREDIT_min','DAYS_ENDDATE_FACT_min','AMT_CREDIT_SUM_LIMIT_max','DAYS_CREDIT_ENDDATE_max']:
    Null_dic[column] = data_bureau[column].median()


data = data.fillna(Null_dic)
data = data.fillna(0)

data_submission= data_submission.fillna(Null_dic)
data_submission= data_submission.fillna(0)