In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMClassifier

  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  from pandas._libs.tslibs.strptime import array_strptime
  from pandas._libs.tslibs.frequencies import (  # noqa
  from pandas._libs.period import Period
  import pandas._libs.sparse as splib
  import pandas._libs.window as _window
  from pandas._libs import lib, groupby as libgroupby, Timestamp, NaT, iNaT
  from pandas._libs import algos as _algos, reshape as _reshape
  import pandas._libs.parsers as parsers
  from . import _csparsetools
  from ._shortest_path import shortest_path, floyd_warshall, dijkstra,\
  from ._tools import csgraph_to_dense, csgraph_from_dense,\
  from ._traversal import breadth_first_order, depth_first_order, \
  from ._min_spanning_tree import minimum_spanning_tree
  from ._reordering impo

In [None]:

data = pd.read_csv('data/application_train.csv')
data_submission = pd.read_csv('data/application_test.csv')
submission_id = data_submission['SK_ID_CURR']

previous_data = pd.read_csv('data/previous_application.csv')
data_bureau = pd.read_csv('data/bureau.csv')

def process_data(data):
    Null_dict = dict()

    for column in ['NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE']:
        Null_dict[column] = 0


    for column in ['NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE']:
        if data[column].isnull().sum()>0:
            Null_dict[column] = data[column].isnull().sum()

        if (data[column]=='XNA').sum()>0:
            Null_dict[column] = (data[column]=='XNA').sum()

    for (key,value) in Null_dict.items():
        Null_dict[key] = float(value)/data.shape[0]

    def replace_binary_categorical_var(df, column_name):
        categories = list(df[column_name].unique())
        if np.nan in categories:
            categories.remove(np.nan)
        assert(len(categories) == 2)
        df.loc[df[column_name] == categories[0], column_name] = 0
        df.loc[df[column_name] == categories[1], column_name] = 1    
            
    for col, dtype in data.dtypes.iteritems():
        if np.issubdtype(dtype, np.number) and col not in ('TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'):
            data[col + '/(AMT_CREDIT)'] = data[col] / data['AMT_CREDIT']
            
    data['FONDKAPREMONT_MODE'] = data['FONDKAPREMONT_MODE'].replace('XNA', np.nan)
    # Name contract type is either Cash loans or Revolving loans
    replace_binary_categorical_var(data, 'NAME_CONTRACT_TYPE')

    # Gender is either male, female or N/A. We'll consider it binary
    data['CODE_GENDER'] = data['CODE_GENDER'].replace('XNA', np.nan)
    replace_binary_categorical_var(data, 'CODE_GENDER')

    # FLAG_OWN_CAR and FLAG_OWN_REALTY are flags, either Y or N
    replace_binary_categorical_var(data, 'FLAG_OWN_CAR')
    replace_binary_categorical_var(data, 'FLAG_OWN_REALTY')

    # We'll consider unknown to be N/A
    data['NAME_FAMILY_STATUS'] = data['NAME_FAMILY_STATUS'].replace('Unknown', np.nan)

    # All these are categorical
    data = pd.get_dummies(data, columns=['NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE'])

    data = data.fillna(data.median())

    return data

data = process_data(data)
data_submission = process_data(data_submission)


In [None]:
def process_previous_application_data(previous_data):
    missing_percentage = previous_data.isnull().sum() / len(previous_data) 
    dropped = missing_percentage.loc[missing_percentage > 0.9]
    previous_data = previous_data.drop(dropped.index, axis=1)
    previous_data = pd.get_dummies(previous_data)

    previous_data['percentage_down'] = previous_data['AMT_DOWN_PAYMENT'] / previous_data['AMT_CREDIT']
    previous_data.loc[previous_data['NAME_CONTRACT_STATUS_Approved'] == 0, 'percentage_down'] = np.nan
    previous_data['AMT_CREDIT_accepted'] = previous_data['AMT_CREDIT']
    previous_data.loc[previous_data['NAME_CONTRACT_STATUS_Approved'] == 0, 'AMT_CREDIT_accepted'] = np.nan
    previous_data['AMT_ANNUITY_accepted'] = previous_data['AMT_ANNUITY']
    previous_data.loc[previous_data['NAME_CONTRACT_STATUS_Approved'] == 0, 'AMT_ANNUITY_accepted'] = np.nan

    for col in ('AMT_ANNUITY', 'AMT_ANNUITY_accepted', 'AMT_APPLICATION', 'AMT_GOODS_PRICE'):
        previous_data[col + '/AMT_CREDIT'] = previous_data[col] / previous_data['AMT_CREDIT']
        
    previous_data = previous_data.replace(float('inf'), np.nan)

    aggregated = previous_data.groupby('SK_ID_CURR', as_index=False).agg(
        {'SK_ID_PREV': 'count', 
         'NAME_CONTRACT_STATUS_Refused': 'mean', 
         'NAME_CONTRACT_STATUS_Approved': 'mean',
         'AMT_CREDIT': 'mean',
         'AMT_CREDIT_accepted': ('mean', 'max'),
         'percentage_down': 'mean',
         'CNT_PAYMENT': ('mean', 'max'),
         'NFLAG_INSURED_ON_APPROVAL': 'mean',
         'AMT_ANNUITY': ('mean', 'max'),
         'AMT_ANNUITY_accepted': ('mean', 'max'),
         'AMT_ANNUITY/AMT_CREDIT': ('mean', 'max'),
         'AMT_ANNUITY_accepted/AMT_CREDIT': ('mean', 'max'),
         'AMT_APPLICATION/AMT_CREDIT': ('mean', 'max'),
         'AMT_GOODS_PRICE/AMT_CREDIT': ('mean', 'max')})
    aggregated.columns = list(map('_'.join, aggregated.columns.values))
    aggregated = aggregated.rename(columns={'SK_ID_CURR_': 'SK_ID_CURR'})
    aggregated.columns
    return aggregated

previous_data = process_previous_application_data(previous_data)
previous_data.fillna(0)
previous_data.head()

In [None]:
def process_bureau(data_bureau):
    data_bureau = data_bureau.replace(['Mobile operator loan','Interbank credit','Loan for purchase of shares (margin lending)','Loan for purchase of equipment','Another type of loan','Unknown type of loan'],'Other')

    data_bureau = pd.get_dummies(data_bureau,columns = ['CREDIT_ACTIVE','CREDIT_CURRENCY','CREDIT_TYPE'])
    data_bureau = data_bureau.drop(['DAYS_CREDIT_UPDATE','CREDIT_CURRENCY_currency 1','CREDIT_CURRENCY_currency 2','CREDIT_CURRENCY_currency 3','CREDIT_CURRENCY_currency 4'],axis=1)
    data_bureau = data_bureau.fillna(data_bureau.median())
    data_bureau = data_bureau.groupby('SK_ID_CURR', as_index = False).agg({'SK_ID_BUREAU':'count','DAYS_CREDIT':'min','CREDIT_DAY_OVERDUE':'max','DAYS_CREDIT_ENDDATE':'max','DAYS_ENDDATE_FACT':'min','AMT_CREDIT_MAX_OVERDUE':['max','mean'],'CNT_CREDIT_PROLONG':['max','median'],'AMT_CREDIT_SUM':['max','median'],'AMT_CREDIT_SUM_DEBT':'sum','AMT_CREDIT_SUM_LIMIT':'max','AMT_CREDIT_SUM_OVERDUE':'sum','CREDIT_ACTIVE_Active':'count','CREDIT_ACTIVE_Bad debt':'count','CREDIT_ACTIVE_Closed':'count','CREDIT_ACTIVE_Sold':'count','CREDIT_TYPE_Car loan':'mean','CREDIT_TYPE_Cash loan (non-earmarked)':'mean','CREDIT_TYPE_Consumer credit':'mean','CREDIT_TYPE_Credit card':'mean','CREDIT_TYPE_Loan for business development':'mean','CREDIT_TYPE_Loan for the purchase of equipment':'mean','CREDIT_TYPE_Loan for working capital replenishment':'mean','CREDIT_TYPE_Microloan':'mean','CREDIT_TYPE_Mortgage':'mean','CREDIT_TYPE_Other':'mean','CREDIT_TYPE_Real estate loan':'mean','AMT_ANNUITY':'mean'})
    data_bureau.columns = list(map('_'.join, data_bureau.columns.values))
    data_bureau = data_bureau.rename(columns={'SK_ID_CURR_': 'SK_ID_CURR'})
    
    return data_bureau

data_bureau = process_bureau(data_bureau)
data_bureau.head()

In [7]:
data = data.merge(previous_data, on='SK_ID_CURR', how='left')
data_submission = data_submission.merge(previous_data, on='SK_ID_CURR', how='left')
data = data.merge(data_bureau, on= 'SK_ID_CURR',how='left')
data_submission = data_submission.merge(data_bureau, on= 'SK_ID_CURR',how='left')

data = data.drop('SK_ID_CURR', axis=1)
data_submission = data_submission.drop('SK_ID_CURR', axis=1)

# manual features
data['AMT_PREVIOUS_CREDIT/AMT_CREDIT'] = data['AMT_CREDIT_accepted_mean'] / data['AMT_CREDIT']
data_submission['AMT_PREVIOUS_CREDIT/AMT_CREDIT'] = data_submission['AMT_CREDIT_accepted_mean'] / data_submission['AMT_CREDIT']

data['AMT_PREVIOUS_MAXCREDIT/AMT_CREDIT'] = data['AMT_CREDIT_accepted_max'] / data['AMT_CREDIT']
data_submission['AMT_PREVIOUS_MAXCREDIT/AMT_CREDIT'] = data_submission['AMT_CREDIT_accepted_max'] / data_submission['AMT_CREDIT']

data['AMT_PREVIOUS_ANNUITY/AMT_ANNUITY'] = data['AMT_ANNUITY_accepted_mean'] / data['AMT_ANNUITY']
data_submission['AMT_PREVIOUS_ANNUITY/AMT_ANNUITY'] = data_submission['AMT_ANNUITY_accepted_mean'] / data_submission['AMT_ANNUITY']

# data['AMT_PREVIOUS_GOODS_PRICE/AMT_GOODS_PRICE'] = data['AMT_GOODS_PRICE/AMT_CREDIT_mean'] / data['AMT_GOODS_PRICE/(AMT_CREDIT)']
# data_submission['AMT_PREVIOUS_GOODS_PRICE/AMT_GOODS_PRICE'] = data_submission['AMT_GOODS_PRICE/AMT_CREDIT_mean'] / data_submission['AMT_GOODS_PRICE/(AMT_CREDIT)']


## BUREAU FILLING NA
default_values = dict()

for column in ['DAYS_CREDIT_min','DAYS_ENDDATE_FACT_min','AMT_CREDIT_SUM_LIMIT_max','DAYS_CREDIT_ENDDATE_max']:
    default_values[column] = data_bureau[column].median()
    
data_bureau = data.fillna(default_values)
data_bureau = data.fillna(0)
data_submission= data_submission.fillna(default_values)
data_submission= data_submission.fillna(0)

## PREVIOUS APPLICATION FILLING NA

default_values = {
    'AMT_CREDIT_mean': 0,
    'AMT_CREDIT_accepted_max': 0,
    'AMT_CREDIT_accepted_mean': 0,
    'NAME_CONTRACT_STATUS_Approved_mean': 1.0,
    'NAME_CONTRACT_STATUS_Refused_mean': 0.0,
    'SK_ID_PREV_count': 0
}

data = data.fillna(default_values)
data = data.fillna(data.median())

data_submission = data_submission.fillna(default_values)
data_submission = data_submission.fillna(data.median())

KeyError: 'SK_ID_CURR'

In [5]:
print(set(data.columns) - set(data_submission.columns))
data_submission['NAME_INCOME_TYPE_Maternity leave'] = 0

set(['NAME_INCOME_TYPE_Maternity leave', 'TARGET'])


In [17]:
data_x = data.drop('TARGET', axis=1).as_matrix()
data_y = data['TARGET'].as_matrix()
data_submission = data_submission.as_matrix()

train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.05)
del data_x, data_y

In [18]:
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)
data_submission = scaler.transform(data_submission)

In [19]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_x, train_y)

  from ..utils.seq_dataset import ArrayDataset, CSRDataset
  from ..utils import arrayfuncs, as_float_array, check_X_y, deprecated
  from . import cd_fast
  from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber
  from .sag_fast import sag
  from . import libsvm, liblinear
  from . import libsvm_sparse
  from .ball_tree import BallTree
  from .kd_tree import KDTree


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
roc_auc_score(test_y, lr.predict_proba(test_x)[:,1])

0.7525504705149396

In [25]:
xgb = XGBClassifier()
xgb.fit(train_x, train_y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [26]:
roc_auc_score(test_y, xgb.predict_proba(test_x)[:,1])

0.7638441398538572

In [31]:
print('TOP 20 Importances')
for importance, col in sorted(zip(list(xgb.feature_importances_), list(data.columns)[1:]), reverse=True)[:20]:
    print(str(importance) + '\t' + col)

TOP 20 Importances
255	AMT_ANNUITY_/(AMT_CREDIT)
172	EXT_SOURCE_3
160	EXT_SOURCE_1
145	EXT_SOURCE_2
145	DAYS_BIRTH
106	AMT_GOODS_PRICE_/(AMT_CREDIT)
80	EXT_SOURCE_1_/(AMT_CREDIT)
74	DAYS_ID_PUBLISH
69	DAYS_EMPLOYED
53	AMT_ANNUITY
52	EXT_SOURCE_3_/(AMT_CREDIT)
51	DAYS_EMPLOYED_/(AMT_CREDIT)
47	DAYS_REGISTRATION_/(AMT_CREDIT)
45	DAYS_LAST_PHONE_CHANGE
42	AMT_GOODS_PRICE
41	DAYS_REGISTRATION
41	CODE_GENDER
40	AMT_INCOME_TOTAL_/(AMT_CREDIT)
38	FLAG_DOCUMENT_3_/(AMT_CREDIT)
38	AMT_REQ_CREDIT_BUREAU_YEAR_/(AMT_CREDIT)


In [32]:
confusion_matrix(test_y,lgbm.predict(test_x))

  if diff:


array([[14082,    17],
       [ 1251,    26]])

In [17]:
proba = cross_val_predict(xgb,train_x,train_y, method='predict_proba')

array([0, 0, 0, ..., 0, 0, 0])

In [33]:
predicted = lgbm.predict_proba(data_submission)[:, 1]

results = pd.DataFrame({'SK_ID_CURR': submission_id, 'TARGET': predicted})

results.to_csv('predictions.csv', index=False)