In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from lightgbm import LGBMClassifier
import lightgbm as lgb
import gc
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score



In [2]:
os.chdir('/home/luke/Desktop/kaggle/Home_Credit_Default_Risk')

#The Merged_df is a meta dataframe join all the tables together including both Train rows and Test rows.
#Shape of test data(48744, 121) 
#Shape of train data(307511, 122) 
#Shape of merged_df(356255, 298)

merged_df = pd.read_csv('processed_input_data.csv')


In [3]:
meta_cols = ['SK_ID_CURR']
meta_df = merged_df[meta_cols]
merged_df.drop(columns=meta_cols, inplace=True)

In [4]:
def process_dataframe(input_df, encoder_dict=None):
    """ Process a dataframe into a form useable by LightGBM """

    # Label encode categoricals
    print('Label encoding categorical features...')
    categorical_feats = input_df.columns[input_df.dtypes == 'object']
    for feat in categorical_feats:
        encoder = LabelEncoder()
        input_df[feat] = encoder.fit_transform(input_df[feat].fillna('NULL'))
    print('Label encoding complete.')

    return input_df, categorical_feats.tolist(), encoder_dict


In [5]:
merged_df, categorical_feats, encoder_dict = process_dataframe(input_df=merged_df)

Label encoding categorical features...
Label encoding complete.


In [6]:
non_obj_categoricals = [
    'FONDKAPREMONT_MODE', 'HOUR_APPR_PROCESS_START', 'HOUSETYPE_MODE',
    'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
    'NAME_INCOME_TYPE', 'NAME_TYPE_SUITE', 'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE', 'STATUS', 'NAME_CONTRACT_STATUS_CAVG',
    'WALLSMATERIAL_MODE', 'WEEKDAY_APPR_PROCESS_START', 'NAME_CONTRACT_TYPE_BAVG',
    'WEEKDAY_APPR_PROCESS_START_BAVG', 'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 
    'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE_BAVG', 
    'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 
    'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', 
    'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION', 'NAME_CONTRACT_STATUS_CCAVG' 
]
categorical_feats = categorical_feats + non_obj_categoricals

In [7]:
application_test = pd.read_csv('application_test.csv')
application_train = pd.read_csv('application_train.csv')
len_train = len(application_train)

In [20]:
train_df = merged_df[:len_train]
test_df = merged_df[len_train:]
#del merged_df, app_test_df, bureau_df, bureau_balance_df, credit_card_df, pos_cash_df, prev_app_df
gc.collect()

""" Train the model """
target = train_df.pop('TARGET')
test_df.drop(columns='TARGET', inplace=True)

lgbm_train = lgb.Dataset(data=train_df,
                          label=target,
                          categorical_feature=categorical_feats,
                          free_raw_data=False)

del train_df
gc.collect()


lgbm_params = {
    'boosting': 'dart',
    'application': 'binary',
    'learning_rate': 0.1,
    'min_data_in_leaf': 30,
    'num_leaves': 31,
    'max_depth': -1,
    'feature_fraction': 0.5,
    'scale_pos_weight': 2,
    'drop_rate': 0.02
}

cv_results = lgb.cv(train_set=lgbm_train,
                     params=lgbm_params,
                     nfold=5,
                     num_boost_round=600,
                     early_stopping_rounds=50,
                     verbose_eval=50,
                     metrics=['auc'])

optimum_boost_rounds = np.argmax(cv_results['auc-mean'])
print('Optimum boost rounds = {}'.format(optimum_boost_rounds))
print('Best CV result = {}'.format(np.max(cv_results['auc-mean'])))

clf = lgb.train(train_set=lgbm_train,
                 params=lgbm_params,
                 num_boost_round=optimum_boost_rounds)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


[50]	cv_agg's auc: 0.775093 + 0.00312303
[100]	cv_agg's auc: 0.781779 + 0.0024516
[150]	cv_agg's auc: 0.783134 + 0.00257406
[200]	cv_agg's auc: 0.784272 + 0.00259041
[250]	cv_agg's auc: 0.785275 + 0.00240448
[300]	cv_agg's auc: 0.786504 + 0.0023492
[350]	cv_agg's auc: 0.786826 + 0.00241311
[400]	cv_agg's auc: 0.786985 + 0.00244814
[450]	cv_agg's auc: 0.787372 + 0.00237858
Optimum boost rounds = 437
Best CV result = 0.787427749715952


In [13]:
y_pred = clf.predict(test_df)
out_df = pd.DataFrame({'SK_ID_CURR': meta_df['SK_ID_CURR'][len_train:], 'TARGET': y_pred})

In [14]:
out_df.to_csv('submission.csv', index=False)