In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import MinMaxScaler

In [139]:
data = pd.read_csv('data/application_train.csv')
data_submission = pd.read_csv('data/application_test.csv')
submission_id = data_submission['SK_ID_CURR']

print(data.shape)
print(data_submission.shape)

def process_data(data):
    Null_dict = dict()

    for column in ['NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE']:
        Null_dict[column] = 0


    for column in ['NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE']:
        if data[column].isnull().sum()>0:
            Null_dict[column] = data[column].isnull().sum()

        if (data[column]=='XNA').sum()>0:
            Null_dict[column] = (data[column]=='XNA').sum()

    for (key,value) in Null_dict.items():
        Null_dict[key] = float(value)/data.shape[0]

    def replace_binary_categorical_var(df, column_name):
        categories = list(df[column_name].unique())
        if np.nan in categories:
            categories.remove(np.nan)
        assert(len(categories) == 2)
        df.loc[df[column_name] == categories[0], column_name] = 0
        df.loc[df[column_name] == categories[1], column_name] = 1    

    for element in data['FONDKAPREMONT_MODE']:
        if element == 'XNA':
            element = np.nan
    # Name contract type is either Cash loans or Revolving loans
    replace_binary_categorical_var(data, 'NAME_CONTRACT_TYPE')

    # Gender is either male, female or N/A. We'll consider it binary
    data['CODE_GENDER'] = data['CODE_GENDER'].replace('XNA', np.nan)
    replace_binary_categorical_var(data, 'CODE_GENDER')

    # FLAG_OWN_CAR and FLAG_OWN_REALTY are flags, either Y or N
    replace_binary_categorical_var(data, 'FLAG_OWN_CAR')
    replace_binary_categorical_var(data, 'FLAG_OWN_REALTY')

    # We'll consider unknown to be N/A
    data['NAME_FAMILY_STATUS'] = data['NAME_FAMILY_STATUS'].replace('Unknown', np.nan)

    # All these are categorical
    data = pd.get_dummies(data, columns=['NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE'])

    data = data.fillna(data.median())
    return data

data = process_data(data)
data_submission = process_data(data_submission)

(307511, 122)
(48744, 121)


In [140]:
set(data.columns) - set(data_submission.columns)
data_submission['NAME_INCOME_TYPE_Maternity leave'] = 0

Bureau dataset 

In [2]:
data_bureau = pd.read_csv('data/bureau.csv')

In [7]:
data_bureau.head(10)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,
5,215354,5714467,Active,currency 1,-273,0,27460.0,,0.0,0,180000.0,71017.38,108982.62,0.0,Credit card,-31,
6,215354,5714468,Active,currency 1,-43,0,79.0,,0.0,0,42103.8,42103.8,0.0,0.0,Consumer credit,-22,
7,162297,5714469,Closed,currency 1,-1896,0,-1684.0,-1710.0,14985.0,0,76878.45,0.0,0.0,0.0,Consumer credit,-1710,
8,162297,5714470,Closed,currency 1,-1146,0,-811.0,-840.0,0.0,0,103007.7,0.0,0.0,0.0,Consumer credit,-840,
9,162297,5714471,Active,currency 1,-1146,0,-484.0,,0.0,0,4500.0,0.0,0.0,0.0,Credit card,-690,


In [145]:
data_bureau = data_bureau.replace(['Mobile operator loan','Interbank credit','Loan for purchase of shares (margin lending)','Loan for purchase of equipment','Another type of loan','Unknown type of loan'],'Other')

In [146]:
data_bureau = pd.get_dummies(data_bureau,columns = ['CREDIT_ACTIVE','CREDIT_CURRENCY','CREDIT_TYPE'])

In [147]:
Null_dict = dict()

for column in data_bureau.columns:
    Null_dict[column] = 0

for column in data_bureau.columns:
    if data_bureau[column].isnull().sum()>0:
        Null_dict[column] = data_bureau[column].isnull().sum()

for (key,value) in Null_dict.items():
    Null_dict[key] = float(value)/data_bureau.shape[0]

print(Null_dict)

{'CREDIT_ACTIVE_Closed': 0.0, 'CREDIT_ACTIVE_Sold': 0.0, 'CREDIT_TYPE_Car loan': 0.0, 'DAYS_CREDIT': 0.0, 'CREDIT_CURRENCY_currency 4': 0.0, 'CREDIT_CURRENCY_currency 1': 0.0, 'AMT_CREDIT_SUM_OVERDUE': 0.0, 'CREDIT_CURRENCY_currency 3': 0.0, 'CNT_CREDIT_PROLONG': 0.0, 'DAYS_CREDIT_UPDATE': 0.0, 'CREDIT_TYPE_Credit card': 0.0, 'CREDIT_TYPE_Other': 0.0, 'CREDIT_TYPE_Mortgage': 0.0, 'CREDIT_TYPE_Loan for business development': 0.0, 'AMT_CREDIT_SUM': 7.573868522303295e-06, 'CREDIT_TYPE_Microloan': 0.0, 'CREDIT_TYPE_Loan for working capital replenishment': 0.0, 'DAYS_CREDIT_ENDDATE': 0.06149573416420613, 'CREDIT_TYPE_Loan for the purchase of equipment': 0.0, 'CREDIT_ACTIVE_Active': 0.0, 'CREDIT_TYPE_Cash loan (non-earmarked)': 0.0, 'AMT_CREDIT_SUM_LIMIT': 0.3447741472406649, 'SK_ID_CURR': 0.0, 'AMT_CREDIT_SUM_DEBT': 0.15011931755948982, 'SK_ID_BUREAU': 0.0, 'AMT_ANNUITY': 0.714734902949614, 'CREDIT_DAY_OVERDUE': 0.0, 'CREDIT_ACTIVE_Bad debt': 0.0, 'CREDIT_TYPE_Real estate loan': 0.0, 'CREDI

In [148]:
data_bureau = data_bureau.drop(['DAYS_CREDIT_UPDATE','CREDIT_CURRENCY_currency 1','CREDIT_CURRENCY_currency 2','CREDIT_CURRENCY_currency 3','CREDIT_CURRENCY_currency 4'],axis=1)

In [149]:
data_bureau = data_bureau.fillna(data_bureau.median())

In [151]:
data_bureau = data_bureau.groupby('SK_ID_CURR', as_index = False).agg({'SK_ID_BUREAU':'count','DAYS_CREDIT':'min','CREDIT_DAY_OVERDUE':'max','DAYS_CREDIT_ENDDATE':'max','DAYS_ENDDATE_FACT':'min','AMT_CREDIT_MAX_OVERDUE':['max','mean'],'CNT_CREDIT_PROLONG':['max','median'],'AMT_CREDIT_SUM':['max','median'],'AMT_CREDIT_SUM_DEBT':'sum','AMT_CREDIT_SUM_LIMIT':'max','AMT_CREDIT_SUM_OVERDUE':'sum','CREDIT_ACTIVE_Active':'count','CREDIT_ACTIVE_Bad debt':'count','CREDIT_ACTIVE_Closed':'count','CREDIT_ACTIVE_Sold':'count','CREDIT_TYPE_Car loan':'mean','CREDIT_TYPE_Cash loan (non-earmarked)':'mean','CREDIT_TYPE_Consumer credit':'mean','CREDIT_TYPE_Credit card':'mean','CREDIT_TYPE_Loan for business development':'mean','CREDIT_TYPE_Loan for the purchase of equipment':'mean','CREDIT_TYPE_Loan for working capital replenishment':'mean','CREDIT_TYPE_Microloan':'mean','CREDIT_TYPE_Mortgage':'mean','CREDIT_TYPE_Other':'mean','CREDIT_TYPE_Real estate loan':'mean','AMT_ANNUITY':'mean'})
data_bureau.columns = list(map('_'.join, data_bureau.columns.values))
data_bureau = data_bureau.rename(columns={'SK_ID_CURR_': 'SK_ID_CURR'})

In [152]:
data_bureau.head()

Unnamed: 0,SK_ID_CURR,CREDIT_ACTIVE_Closed_count,CREDIT_ACTIVE_Sold_count,AMT_CREDIT_SUM_DEBT_sum,DAYS_CREDIT_min,AMT_CREDIT_SUM_OVERDUE_sum,CNT_CREDIT_PROLONG_max,CNT_CREDIT_PROLONG_median,CREDIT_TYPE_Credit card_mean,CREDIT_TYPE_Other_mean,...,CREDIT_TYPE_Car loan_mean,SK_ID_BUREAU_count,AMT_ANNUITY_mean,CREDIT_DAY_OVERDUE_max,CREDIT_ACTIVE_Bad debt_count,CREDIT_TYPE_Real estate loan_mean,CREDIT_TYPE_Consumer credit_mean,AMT_CREDIT_MAX_OVERDUE_max,AMT_CREDIT_MAX_OVERDUE_mean,DAYS_ENDDATE_FACT_min
0,100001,7,7,596686.5,-1572,0.0,0,0.0,0.0,0.0,...,0.0,7,3545.357143,0,7,0.0,1.0,0.0,0.0,-1328.0
1,100002,8,8,245781.0,-1437,0.0,0,0.0,0.5,0.0,...,0.0,8,0.0,0,8,0.0,0.5,5043.645,1050.643125,-1185.0
2,100003,4,4,0.0,-2586,0.0,0,0.0,0.5,0.0,...,0.0,4,0.0,0,4,0.0,0.5,0.0,0.0,-2131.0
3,100004,2,2,0.0,-1326,0.0,0,0.0,0.0,0.0,...,0.0,2,0.0,0,2,0.0,1.0,0.0,0.0,-683.0
4,100005,3,3,568408.5,-373,0.0,0,0.0,0.333333,0.0,...,0.0,3,1420.5,0,3,0.0,0.666667,0.0,0.0,-897.0


In [153]:
data = data.merge(data_bureau, on= 'SK_ID_CURR',how='left')

In [154]:
data_submission = data_submission.merge(data_bureau, on= 'SK_ID_CURR',how='left')

In [155]:
data_bureau.columns

Index([u'SK_ID_CURR', u'CREDIT_ACTIVE_Closed_count',
       u'CREDIT_ACTIVE_Sold_count', u'AMT_CREDIT_SUM_DEBT_sum',
       u'DAYS_CREDIT_min', u'AMT_CREDIT_SUM_OVERDUE_sum',
       u'CNT_CREDIT_PROLONG_max', u'CNT_CREDIT_PROLONG_median',
       u'CREDIT_TYPE_Credit card_mean', u'CREDIT_TYPE_Other_mean',
       u'CREDIT_TYPE_Mortgage_mean',
       u'CREDIT_TYPE_Loan for business development_mean',
       u'AMT_CREDIT_SUM_max', u'AMT_CREDIT_SUM_median',
       u'CREDIT_TYPE_Microloan_mean',
       u'CREDIT_TYPE_Loan for working capital replenishment_mean',
       u'DAYS_CREDIT_ENDDATE_max',
       u'CREDIT_TYPE_Loan for the purchase of equipment_mean',
       u'CREDIT_ACTIVE_Active_count',
       u'CREDIT_TYPE_Cash loan (non-earmarked)_mean',
       u'AMT_CREDIT_SUM_LIMIT_max', u'CREDIT_TYPE_Car loan_mean',
       u'SK_ID_BUREAU_count', u'AMT_ANNUITY_mean', u'CREDIT_DAY_OVERDUE_max',
       u'CREDIT_ACTIVE_Bad debt_count', u'CREDIT_TYPE_Real estate loan_mean',
       u'CREDIT_TYPE_Consu

In [156]:
Null_dic = dict()

for column in ['DAYS_CREDIT_min','DAYS_ENDDATE_FACT_min','AMT_CREDIT_SUM_LIMIT_max','DAYS_CREDIT_ENDDATE_max']:
    Null_dic[column] = data_bureau[column].median()


data = data.fillna(Null_dic)
data = data.fillna(0)

data_submission= data_submission.fillna(Null_dic)
data_submission= data_submission.fillna(0)

In [157]:
data_x = data.drop('TARGET', axis=1).as_matrix()
data_y = data['TARGET'].as_matrix()
data_submission = data_submission.as_matrix()

train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.05)
del data_x, data_y

In [158]:
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)
data_submission = scaler.transform(data_submission)

In [None]:
xgb = XGBClassifier()
xgb.fit(train_x, train_y)

In [None]:
roc_auc_score(test_y, xgb.predict_proba(test_x)[:,1])

In [None]:
predicted = xgb.predict_proba(data_submission)[:, 1]

predicted

results = pd.DataFrame({'SK_ID_CURR': submission_id, 'TARGET': predicted})

results.to_csv('predictions.csv', index=False)