In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import MinMaxScaler

  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  from pandas._libs.tslibs.strptime import array_strptime
  from pandas._libs.tslibs.frequencies import (  # noqa
  from pandas._libs.period import Period
  import pandas._libs.sparse as splib
  import pandas._libs.window as _window
  from pandas._libs import lib, groupby as libgroupby, Timestamp, NaT, iNaT
  from pandas._libs import algos as _algos, reshape as _reshape
  import pandas._libs.parsers as parsers
  from . import _csparsetools
  from ._shortest_path import shortest_path, floyd_warshall, dijkstra,\
  from ._tools import csgraph_to_dense, csgraph_from_dense,\
  from ._traversal import breadth_first_order, depth_first_order, \
  from ._min_spanning_tree import minimum_spanning_tree
  from ._reordering impo

In [2]:

data = pd.read_csv('data/application_train.csv')
data_submission = pd.read_csv('data/application_test.csv')
submission_id = data_submission['SK_ID_CURR']

previous_data = pd.read_csv('data/previous_application.csv')


def process_data(data):
    Null_dict = dict()

    for column in ['NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE']:
        Null_dict[column] = 0


    for column in ['NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE']:
        if data[column].isnull().sum()>0:
            Null_dict[column] = data[column].isnull().sum()

        if (data[column]=='XNA').sum()>0:
            Null_dict[column] = (data[column]=='XNA').sum()

    for (key,value) in Null_dict.items():
        Null_dict[key] = float(value)/data.shape[0]

    def replace_binary_categorical_var(df, column_name):
        categories = list(df[column_name].unique())
        if np.nan in categories:
            categories.remove(np.nan)
        assert(len(categories) == 2)
        df.loc[df[column_name] == categories[0], column_name] = 0
        df.loc[df[column_name] == categories[1], column_name] = 1    

    for element in data['FONDKAPREMONT_MODE']:
        if element == 'XNA':
            element = np.nan
    # Name contract type is either Cash loans or Revolving loans
    replace_binary_categorical_var(data, 'NAME_CONTRACT_TYPE')

    # Gender is either male, female or N/A. We'll consider it binary
    data['CODE_GENDER'] = data['CODE_GENDER'].replace('XNA', np.nan)
    replace_binary_categorical_var(data, 'CODE_GENDER')

    # FLAG_OWN_CAR and FLAG_OWN_REALTY are flags, either Y or N
    replace_binary_categorical_var(data, 'FLAG_OWN_CAR')
    replace_binary_categorical_var(data, 'FLAG_OWN_REALTY')

    # We'll consider unknown to be N/A
    data['NAME_FAMILY_STATUS'] = data['NAME_FAMILY_STATUS'].replace('Unknown', np.nan)

    # All these are categorical
    data = pd.get_dummies(data, columns=['NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE'])

    data = data.fillna(data.median())

    return data

data = process_data(data)
data_submission = process_data(data_submission)


In [3]:
def process_previous_application_data(previous_data):
    missing_percentage = previous_data.isnull().sum() / len(previous_data) 
    dropped = missing_percentage.loc[missing_percentage > 0.9]
    previous_data = previous_data.drop(dropped.index, axis=1)
    previous_data = pd.get_dummies(previous_data)

    previous_data['percentage_down'] = previous_data['AMT_DOWN_PAYMENT'] / previous_data['AMT_CREDIT']
    previous_data.loc[previous_data['NAME_CONTRACT_STATUS_Approved'] == 0, 'percentage_down'] = np.nan
    previous_data['AMT_CREDIT_accepted'] = previous_data['AMT_CREDIT']
    previous_data.loc[previous_data['NAME_CONTRACT_STATUS_Approved'] == 0, 'AMT_CREDIT_accepted'] = np.nan

    aggregated = previous_data.groupby('SK_ID_CURR', as_index=False).agg(
        {'SK_ID_PREV': 'count', 
         'NAME_CONTRACT_STATUS_Refused': 'mean', 
         'NAME_CONTRACT_STATUS_Approved': 'mean',
         'AMT_CREDIT': 'mean',
         'AMT_CREDIT_accepted': 'mean',
         'percentage_down': 'mean',
         'CNT_PAYMENT': ('mean', 'max'),
         'NFLAG_INSURED_ON_APPROVAL': 'mean'})
    aggregated.columns = list(map('_'.join, aggregated.columns.values))
    aggregated = aggregated.rename(columns={'SK_ID_CURR_': 'SK_ID_CURR'})
    aggregated.columns
    return aggregated

previous_data = process_previous_application_data(previous_data)
previous_data.fillna(0)

Unnamed: 0,SK_ID_CURR,AMT_CREDIT_mean,AMT_CREDIT_accepted_mean,NAME_CONTRACT_STATUS_Approved_mean,NAME_CONTRACT_STATUS_Refused_mean,CNT_PAYMENT_mean,CNT_PAYMENT_max,SK_ID_PREV_count,NFLAG_INSURED_ON_APPROVAL_mean,percentage_down_mean
0,100001,23787.000000,23787.000000,1.000000,0.000000,8.000000,8.0,1,0.000000,0.105940
1,100002,179055.000000,179055.000000,1.000000,0.000000,24.000000,24.0,1,0.000000,0.000000
2,100003,484191.000000,484191.000000,1.000000,0.000000,10.000000,12.0,3,0.666667,0.050585
3,100004,20106.000000,20106.000000,1.000000,0.000000,4.000000,4.0,1,0.000000,0.241719
4,100005,20076.750000,40153.500000,0.500000,0.000000,12.000000,12.0,2,0.000000,0.111173
5,100006,291695.500000,343728.900000,0.555556,0.111111,23.000000,48.0,9,0.000000,0.180612
6,100007,166638.750000,166638.750000,1.000000,0.000000,20.666667,48.0,6,0.600000,0.176401
7,100008,162767.700000,203459.625000,0.800000,0.000000,14.000000,30.0,5,0.250000,0.074579
8,100009,70137.642857,70137.642857,1.000000,0.000000,8.000000,12.0,7,0.000000,0.136969
9,100010,260811.000000,260811.000000,1.000000,0.000000,10.000000,10.0,1,0.000000,0.000000


In [4]:
data = data.merge(previous_data, on='SK_ID_CURR', how='left')
data_submission = data_submission.merge(previous_data, on='SK_ID_CURR', how='left')

data = data.drop('SK_ID_CURR', axis=1)
data_submission = data_submission.drop('SK_ID_CURR', axis=1)

default_values = {
    'AMT_CREDIT_mean': 0,
    'AMT_CREDIT_accepted_mean': 0,
    'NAME_CONTRACT_STATUS_Approved_mean': 1.0,
    'NAME_CONTRACT_STATUS_Refused_mean': 0.0,
    'SK_ID_PREV_count': 0}
data = data.fillna(default_values)
data = data.fillna(data.median())

data_submission = data_submission.fillna(default_values)
data_submission = data_submission.fillna(data.median())

In [5]:
data_submission['NAME_INCOME_TYPE_Maternity leave'] = 0

In [6]:
data_x = data.drop('TARGET', axis=1).as_matrix()
data_y = data['TARGET'].as_matrix()
# data_submission = data_submission.as_matrix()

train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.3)
del data_x, data_y

In [7]:
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)
data_submission = scaler.transform(data_submission)

In [8]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_x, train_y)

  from ..utils.seq_dataset import ArrayDataset, CSRDataset
  from ..utils import arrayfuncs, as_float_array, check_X_y, deprecated
  from . import cd_fast
  from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber
  from .sag_fast import sag
  from . import libsvm, liblinear
  from . import libsvm_sparse
  from .ball_tree import BallTree
  from .kd_tree import KDTree


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
roc_auc_score(test_y, lr.predict_proba(test_x)[:,1])

0.750184873291998

In [10]:
xgb = XGBClassifier()
xgb.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [11]:
roc_auc_score(test_y, xgb.predict_proba(test_x)[:,1])

0.7557661142462534

In [12]:
predicted = xgb.predict_proba(data_submission)[:, 1]

results = pd.DataFrame({'SK_ID_CURR': submission_id, 'TARGET': predicted})

results.to_csv('predictions.csv', index=False)

In [13]:
xgb.feature_importances_

array([0.01142857, 0.03428571, 0.00857143, 0.        , 0.        ,
       0.00285714, 0.02857143, 0.02571429, 0.02857143, 0.00285714,
       0.06142857, 0.03571429, 0.00857143, 0.02142857, 0.01285714,
       0.        , 0.        , 0.00142857, 0.        , 0.        ,
       0.        , 0.        , 0.00142857, 0.00714286, 0.00142857,
       0.        , 0.        , 0.        , 0.00285714, 0.        ,
       0.        , 0.10714286, 0.13857143, 0.16857143, 0.        ,
       0.00142857, 0.00142857, 0.        , 0.        , 0.        ,
       0.        , 0.00428571, 0.        , 0.        , 0.00285714,
       0.00142857, 0.        , 0.00142857, 0.00142857, 0.00285714,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00142857, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.00142857, 0.        , 0.00142857,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.00428571, 0.     

In [14]:
sorted(zip(list(xgb.feature_importances_), list(data.columns)[1:]), reverse=True)

[(0.16857143, 'EXT_SOURCE_3'),
 (0.13857143, 'EXT_SOURCE_2'),
 (0.10714286, 'EXT_SOURCE_1'),
 (0.061428573, 'DAYS_BIRTH'),
 (0.03857143, 'CNT_PAYMENT_mean'),
 (0.035714287, 'DAYS_EMPLOYED'),
 (0.034285713, 'CODE_GENDER'),
 (0.028571429, 'AMT_GOODS_PRICE'),
 (0.028571429, 'AMT_CREDIT'),
 (0.025714286, 'AMT_ANNUITY'),
 (0.021428572, 'DAYS_ID_PUBLISH'),
 (0.02, 'NAME_CONTRACT_STATUS_Approved_mean'),
 (0.02, 'CNT_PAYMENT_max'),
 (0.018571429, 'NAME_CONTRACT_STATUS_Refused_mean'),
 (0.018571429, 'AMT_CREDIT_accepted_mean'),
 (0.017142856, 'NAME_EDUCATION_TYPE_Higher education'),
 (0.012857143, 'percentage_down_mean'),
 (0.012857143, 'OWN_CAR_AGE'),
 (0.012857143, 'FLAG_DOCUMENT_3'),
 (0.012857143, 'DAYS_LAST_PHONE_CHANGE'),
 (0.011428571, 'NAME_CONTRACT_TYPE'),
 (0.01, 'NAME_EDUCATION_TYPE_Secondary / secondary special'),
 (0.01, 'DEF_60_CNT_SOCIAL_CIRCLE'),
 (0.008571428, 'FLAG_OWN_CAR'),
 (0.008571428, 'DAYS_REGISTRATION'),
 (0.007142857, 'REGION_RATING_CLIENT_W_CITY'),
 (0.007142857, 'OR

In [15]:
zip([1,2,3], [4,5,6])

[(1, 4), (2, 5), (3, 6)]

In [16]:
data.isnull().sum()

TARGET                                      0
NAME_CONTRACT_TYPE                          0
CODE_GENDER                                 0
FLAG_OWN_CAR                                0
FLAG_OWN_REALTY                             0
CNT_CHILDREN                                0
AMT_INCOME_TOTAL                            0
AMT_CREDIT                                  0
AMT_ANNUITY                                 0
AMT_GOODS_PRICE                             0
REGION_POPULATION_RELATIVE                  0
DAYS_BIRTH                                  0
DAYS_EMPLOYED                               0
DAYS_REGISTRATION                           0
DAYS_ID_PUBLISH                             0
OWN_CAR_AGE                                 0
FLAG_MOBIL                                  0
FLAG_EMP_PHONE                              0
FLAG_WORK_PHONE                             0
FLAG_CONT_MOBILE                            0
FLAG_PHONE                                  0
FLAG_EMAIL                        