In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMClassifier

  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  from pandas._libs.tslibs.strptime import array_strptime
  from pandas._libs.tslibs.frequencies import (  # noqa
  from pandas._libs.period import Period
  import pandas._libs.sparse as splib
  import pandas._libs.window as _window
  from pandas._libs import lib, groupby as libgroupby, Timestamp, NaT, iNaT
  from pandas._libs import algos as _algos, reshape as _reshape
  import pandas._libs.parsers as parsers
  from . import _csparsetools
  from ._shortest_path import shortest_path, floyd_warshall, dijkstra,\
  from ._tools import csgraph_to_dense, csgraph_from_dense,\
  from ._traversal import breadth_first_order, depth_first_order, \
  from ._min_spanning_tree import minimum_spanning_tree
  from ._reordering impo

In [2]:

data = pd.read_csv('data/application_train.csv')
data_submission = pd.read_csv('data/application_test.csv')
submission_id = data_submission['SK_ID_CURR']

previous_data = pd.read_csv('data/previous_application.csv')


def process_data(data):
    Null_dict = dict()

    for column in ['NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE']:
        Null_dict[column] = 0


    for column in ['NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE']:
        if data[column].isnull().sum()>0:
            Null_dict[column] = data[column].isnull().sum()

        if (data[column]=='XNA').sum()>0:
            Null_dict[column] = (data[column]=='XNA').sum()

    for (key,value) in Null_dict.items():
        Null_dict[key] = float(value)/data.shape[0]

    def replace_binary_categorical_var(df, column_name):
        categories = list(df[column_name].unique())
        if np.nan in categories:
            categories.remove(np.nan)
        assert(len(categories) == 2)
        df.loc[df[column_name] == categories[0], column_name] = 0
        df.loc[df[column_name] == categories[1], column_name] = 1    
            
    for col, dtype in data.dtypes.iteritems():
        if np.issubdtype(dtype, np.number) and col not in ('TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'):
            data[col + '/(AMT_CREDIT)'] = data[col] / data['AMT_CREDIT']
            
    data['FONDKAPREMONT_MODE'] = data['FONDKAPREMONT_MODE'].replace('XNA', np.nan)
    # Name contract type is either Cash loans or Revolving loans
    replace_binary_categorical_var(data, 'NAME_CONTRACT_TYPE')

    # Gender is either male, female or N/A. We'll consider it binary
    data['CODE_GENDER'] = data['CODE_GENDER'].replace('XNA', np.nan)
    replace_binary_categorical_var(data, 'CODE_GENDER')

    # FLAG_OWN_CAR and FLAG_OWN_REALTY are flags, either Y or N
    replace_binary_categorical_var(data, 'FLAG_OWN_CAR')
    replace_binary_categorical_var(data, 'FLAG_OWN_REALTY')

    # We'll consider unknown to be N/A
    data['NAME_FAMILY_STATUS'] = data['NAME_FAMILY_STATUS'].replace('Unknown', np.nan)

    # All these are categorical
    data = pd.get_dummies(data, columns=['NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE'])

    data = data.fillna(data.median())

    return data

data = process_data(data)
data_submission = process_data(data_submission)


In [3]:
def process_previous_application_data(previous_data):
    missing_percentage = previous_data.isnull().sum() / len(previous_data) 
    dropped = missing_percentage.loc[missing_percentage > 0.9]
    previous_data = previous_data.drop(dropped.index, axis=1)
    previous_data = pd.get_dummies(previous_data)

    previous_data['percentage_down'] = previous_data['AMT_DOWN_PAYMENT'] / previous_data['AMT_CREDIT']
    previous_data.loc[previous_data['NAME_CONTRACT_STATUS_Approved'] == 0, 'percentage_down'] = np.nan
    previous_data['AMT_CREDIT_accepted'] = previous_data['AMT_CREDIT']
    previous_data.loc[previous_data['NAME_CONTRACT_STATUS_Approved'] == 0, 'AMT_CREDIT_accepted'] = np.nan
    previous_data['AMT_ANNUITY_accepted'] = previous_data['AMT_ANNUITY']
    previous_data.loc[previous_data['NAME_CONTRACT_STATUS_Approved'] == 0, 'AMT_ANNUITY_accepted'] = np.nan

    for col in ('AMT_ANNUITY', 'AMT_ANNUITY_accepted', 'AMT_APPLICATION', 'AMT_GOODS_PRICE'):
        previous_data[col + '/AMT_CREDIT'] = previous_data[col] / previous_data['AMT_CREDIT']
        
    previous_data = previous_data.replace(float('inf'), np.nan)

    aggregated = previous_data.groupby('SK_ID_CURR', as_index=False).agg(
        {'SK_ID_PREV': 'count', 
         'NAME_CONTRACT_STATUS_Refused': 'mean', 
         'NAME_CONTRACT_STATUS_Approved': 'mean',
         'AMT_CREDIT': 'mean',
         'AMT_CREDIT_accepted': ('mean', 'max'),
         'percentage_down': 'mean',
         'CNT_PAYMENT': ('mean', 'max'),
         'NFLAG_INSURED_ON_APPROVAL': 'mean',
         'AMT_ANNUITY': ('mean', 'max'),
         'AMT_ANNUITY_accepted': ('mean', 'max'),
         'AMT_ANNUITY/AMT_CREDIT': ('mean', 'max'),
         'AMT_ANNUITY_accepted/AMT_CREDIT': ('mean', 'max'),
         'AMT_APPLICATION/AMT_CREDIT': ('mean', 'max'),
         'AMT_GOODS_PRICE/AMT_CREDIT': ('mean', 'max')})
    aggregated.columns = list(map('_'.join, aggregated.columns.values))
    aggregated = aggregated.rename(columns={'SK_ID_CURR_': 'SK_ID_CURR'})
    aggregated.columns
    return aggregated

previous_data = process_previous_application_data(previous_data)
previous_data.fillna(0)

Unnamed: 0,SK_ID_CURR,AMT_CREDIT_accepted_mean,AMT_CREDIT_accepted_max,SK_ID_PREV_count,AMT_GOODS_PRICE/AMT_CREDIT_mean,AMT_GOODS_PRICE/AMT_CREDIT_max,AMT_ANNUITY_accepted/AMT_CREDIT_mean,AMT_ANNUITY_accepted/AMT_CREDIT_max,CNT_PAYMENT_mean,CNT_PAYMENT_max,...,AMT_ANNUITY_mean,AMT_ANNUITY_max,percentage_down_mean,AMT_ANNUITY_accepted_mean,AMT_ANNUITY_accepted_max,NFLAG_INSURED_ON_APPROVAL_mean,AMT_CREDIT_mean,NAME_CONTRACT_STATUS_Refused_mean,AMT_APPLICATION/AMT_CREDIT_mean,AMT_APPLICATION/AMT_CREDIT_max
0,100001,23787.000000,23787.0,1,1.044079,1.044079,0.166099,0.166099,8.000000,8.0,...,3951.000000,3951.000,0.105940,3951.000000,3951.000,0.000000,23787.000000,0.000000,1.044079,1.044079
1,100002,179055.000000,179055.0,1,1.000000,1.000000,0.051670,0.051670,24.000000,24.0,...,9251.775000,9251.775,0.000000,9251.775000,9251.775,0.000000,179055.000000,0.000000,1.000000,1.000000
2,100003,484191.000000,1035882.0,3,0.949329,1.011109,0.126383,0.185200,10.000000,12.0,...,56553.990000,98356.995,0.050585,56553.990000,98356.995,0.666667,484191.000000,0.000000,0.949329,1.011109
3,100004,20106.000000,20106.0,1,1.207699,1.207699,0.266450,0.266450,4.000000,4.0,...,5357.250000,5357.250,0.241719,5357.250000,5357.250,0.000000,20106.000000,0.000000,1.207699,1.207699
4,100005,40153.500000,40153.5,2,1.111173,1.111173,0.119870,0.119870,12.000000,12.0,...,4813.200000,4813.200,0.111173,4813.200000,4813.200,0.000000,20076.750000,0.000000,1.111173,1.111173
5,100006,343728.900000,675000.0,9,1.010763,1.250017,0.075952,0.108340,23.000000,48.0,...,23651.175000,39954.510,0.180612,21842.190000,39954.510,0.000000,291695.500000,0.111111,1.010763,1.250017
6,100007,166638.750000,284400.0,6,0.969650,1.175185,0.090659,0.125499,20.666667,48.0,...,12278.805000,22678.785,0.176401,12278.805000,22678.785,0.600000,166638.750000,0.000000,0.969650,1.175185
7,100008,203459.625000,501975.0,5,1.030049,1.112625,0.118055,0.200701,14.000000,30.0,...,15839.696250,25309.575,0.074579,15839.696250,25309.575,0.250000,162767.700000,0.000000,1.030049,1.112625
8,100009,70137.642857,98239.5,7,1.095482,1.191067,0.155250,0.197370,8.000000,12.0,...,10051.412143,17341.605,0.136969,10051.412143,17341.605,0.000000,70137.642857,0.000000,1.095482,1.191067
9,100010,260811.000000,260811.0,1,0.947859,0.947859,0.105300,0.105300,10.000000,10.0,...,27463.410000,27463.410,0.000000,27463.410000,27463.410,0.000000,260811.000000,0.000000,0.947859,0.947859


In [4]:
data = data.merge(previous_data, on='SK_ID_CURR', how='left')
data_submission = data_submission.merge(previous_data, on='SK_ID_CURR', how='left')

data = data.drop('SK_ID_CURR', axis=1)
data_submission = data_submission.drop('SK_ID_CURR', axis=1)

default_values = {
    'AMT_CREDIT_mean': 0,
    'AMT_CREDIT_accepted_max': 0,
    'AMT_CREDIT_accepted_mean': 0,
    'NAME_CONTRACT_STATUS_Approved_mean': 1.0,
    'NAME_CONTRACT_STATUS_Refused_mean': 0.0,
    'SK_ID_PREV_count': 0}


# manual features
data['AMT_PREVIOUS_CREDIT/AMT_CREDIT'] = data['AMT_CREDIT_accepted_mean'] / data['AMT_CREDIT']
data_submission['AMT_PREVIOUS_CREDIT/AMT_CREDIT'] = data_submission['AMT_CREDIT_accepted_mean'] / data_submission['AMT_CREDIT']

data['AMT_PREVIOUS_MAXCREDIT/AMT_CREDIT'] = data['AMT_CREDIT_accepted_max'] / data['AMT_CREDIT']
data_submission['AMT_PREVIOUS_MAXCREDIT/AMT_CREDIT'] = data_submission['AMT_CREDIT_accepted_max'] / data_submission['AMT_CREDIT']

data['AMT_PREVIOUS_ANNUITY/AMT_ANNUITY'] = data['AMT_ANNUITY_accepted_mean'] / data['AMT_ANNUITY']
data_submission['AMT_PREVIOUS_ANNUITY/AMT_ANNUITY'] = data_submission['AMT_ANNUITY_accepted_mean'] / data_submission['AMT_ANNUITY']

# data['AMT_PREVIOUS_GOODS_PRICE/AMT_GOODS_PRICE'] = data['AMT_GOODS_PRICE/AMT_CREDIT_mean'] / data['AMT_GOODS_PRICE/(AMT_CREDIT)']
# data_submission['AMT_PREVIOUS_GOODS_PRICE/AMT_GOODS_PRICE'] = data_submission['AMT_GOODS_PRICE/AMT_CREDIT_mean'] / data_submission['AMT_GOODS_PRICE/(AMT_CREDIT)']

data = data.fillna(default_values)
data = data.fillna(data.median())

data_submission = data_submission.fillna(default_values)
data_submission = data_submission.fillna(data.median())

In [5]:
data_submission['NAME_INCOME_TYPE_Maternity leave'] = 0

In [6]:
data_x = data.drop('TARGET', axis=1).as_matrix()
data_y = data['TARGET'].as_matrix()
data_submission = data_submission.as_matrix()

train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.3, random_state=42)
del data_x, data_y

In [7]:
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)
data_submission = scaler.transform(data_submission)

In [39]:
to_remove = np.array([  3,   4,  15,  16,  17,  18,  19,  20,  22,  25,  26,  27,  28,
        29,  30,  39,  41,  46,  53,  55,  56,  60,  67,  68,  69,  70,
        74,  78,  80,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 111, 166, 169, 184, 186, 187, 189, 191, 192, 193, 194, 195,
       196, 197, 199, 201, 202, 203, 210, 211, 212, 213, 214, 215, 216,
       217, 218, 219, 220, 221, 222, 223, 225, 227, 228, 232, 234, 235,
       236, 237, 238, 239, 240, 241, 242, 243, 244, 246, 248, 250, 251,
       252, 253, 254, 256, 257, 258, 259, 261, 262, 263, 264, 266, 267,
       268, 269, 270, 272, 274, 275, 276, 277, 278, 279, 280, 281, 282,
       283, 284, 285, 286, 287, 288, 289, 290, 291, 293, 295, 296, 298,
       299, 300, 301, 302, 303, 304, 306, 307, 309, 310, 311, 312, 313,
       314, 315, 316, 317, 318, 319, 321, 322, 323, 324, 325, 326, 327,
       328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339])
train_x = np.delete(train_x, to_remove, axis=1)
test_x = np.delete(test_x, to_remove, axis=1)
data_submission = np.delete(data_submission, to_remove, axis=1)

In [45]:
lgbm = XGBClassifier()
lgbm.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

0.7650063583385889
0.7693727659193655
0.7700072132295341
0.7721334018303858

In [46]:
print('train_score:', roc_auc_score(train_y, lgbm.predict_proba(train_x)[:,1]))
print('test_score:', roc_auc_score(test_y, lgbm.predict_proba(test_x)[:,1]))

('train_score:', 0.7721235759368765)
('test_score:', 0.7632538769846675)


In [47]:
predicted = lgbm.predict_proba(data_submission)[:, 1]

results = pd.DataFrame({'SK_ID_CURR': submission_id, 'TARGET': predicted})

results.to_csv('predictions.csv', index=False)

In [48]:
print('TOP 50 Importances')
for importance, col in sorted(zip(list(lgbm.feature_importances_), list(data.columns.delete(to_remove))[1:]), reverse=True)[:50]:
    print(str(importance) + '\t' + col)

TOP 50 Importances
0.16309012	EXT_SOURCE_3
0.123032905	EXT_SOURCE_2
0.087267525	EXT_SOURCE_1
0.060085837	AMT_ANNUITY/(AMT_CREDIT)
0.047210302	AMT_GOODS_PRICE/(AMT_CREDIT)
0.04434907	AMT_PREVIOUS_ANNUITY/AMT_ANNUITY
0.037195995	DAYS_BIRTH
0.032904148	CODE_GENDER
0.022889843	NAME_CONTRACT_STATUS_Refused_mean
0.022889843	DAYS_EMPLOYED
0.020028612	CNT_PAYMENT_max
0.020028612	AMT_GOODS_PRICE/AMT_CREDIT_mean
0.017167382	CNT_PAYMENT_mean
0.015736766	DAYS_ID_PUBLISH/(AMT_CREDIT)
0.012875536	FLAG_DOCUMENT_5/(AMT_CREDIT)
0.012875536	AMT_GOODS_PRICE
0.011444922	AMT_ANNUITY
0.010014306	NAME_EDUCATION_TYPE_Lower secondary
0.010014306	NAME_CONTRACT_STATUS_Approved_mean
0.010014306	DEF_60_CNT_SOCIAL_CIRCLE/(AMT_CREDIT)
0.010014306	DEF_30_CNT_SOCIAL_CIRCLE/(AMT_CREDIT)
0.010014306	DAYS_ID_PUBLISH
0.010014306	CNT_CHILDREN
0.008583691	NAME_FAMILY_STATUS_Separated
0.008583691	FLAG_WORK_PHONE/(AMT_CREDIT)
0.008583691	AMT_ANNUITY_accepted/AMT_CREDIT_mean
0.0071530757	REG_CITY_NOT_LIVE_CITY/(AMT_CREDIT)
0.0

In [38]:
to_remove = np.where(lgbm.feature_importances_ < np.median(lgbm.feature_importances_))
print(to_remove)
print(list(data.columns[1:][to_remove]))

(array([  3,   4,  15,  16,  17,  18,  19,  20,  22,  25,  26,  27,  28,
        29,  30,  39,  41,  46,  53,  55,  56,  60,  67,  68,  69,  70,
        74,  78,  80,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 111, 166, 169, 184, 186, 187, 189, 191, 192, 193, 194, 195,
       196, 197, 199, 201, 202, 203, 210, 211, 212, 213, 214, 215, 216,
       217, 218, 219, 220, 221, 222, 223, 225, 227, 228, 232, 234, 235,
       236, 237, 238, 239, 240, 241, 242, 243, 244, 246, 248, 250, 251,
       252, 253, 254, 256, 257, 258, 259, 261, 262, 263, 264, 266, 267,
       268, 269, 270, 272, 274, 275, 276, 277, 278, 279, 280, 281, 282,
       283, 284, 285, 286, 287, 288, 289, 290, 291, 293, 295, 296, 298,
       299, 300, 301, 302, 303, 304, 306, 307, 309, 310, 311, 312, 313,
       314, 315, 316, 317, 318, 319, 321, 322, 323, 324, 325, 326, 327,
       328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 3