In [30]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from xgboost import XGBClassifier
import os
import gc
import operator
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
import lightgbm as lgb
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import warnings
warnings.simplefilter('ignore')
# Directory layout; all paths are relative to the notebook's working directory.
# Only CleanDataDir is read by the cells shown in this notebook (via loadData);
# the other constants are not referenced in the visible cells.
OriginDataDir = '../../Data/OriginData'
CleanDataDir = '../../Data/CleanData'
TrainTestDataDir = '../../Data/TrainTestData'
ModelSaveDir = 'models'
ResultSaveDir = 'result'

In [2]:
def loadData(DataName, DataDir=None):
    """Read a CSV file into a DataFrame, using its first column as the index.

    Parameters
    ----------
    DataName : str
        File name of the CSV, relative to the data directory.
    DataDir : str, optional
        Directory that contains the file. When omitted, the module-level
        ``CleanDataDir`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` (after printing a message) when the
        path does not exist.
    """
    if DataDir is None:
        DataDir = CleanDataDir
    DataPath = os.path.join(DataDir, DataName)
    if not os.path.exists(DataPath):
        print('%s does not exist!' % DataPath)
        return None
    # Rows are returned in file order; a sample(frac=1) shuffle used to be
    # commented out at this point.
    return pd.read_csv(DataPath, index_col=0)

In [3]:
# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Draw a bar plot of the most important features and save it as a PNG.

    The frame must provide ``feature`` and ``importance`` columns. Features
    are ranked by their mean importance, the 50 highest-ranked ones are kept,
    and every row belonging to them is plotted. The figure is written to
    ``XGBOOST_ImportantFeats.png`` in the current working directory.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:50].index

    # Keep all rows of the selected features so the bar plot can aggregate them.
    is_top = feature_importance_df_.feature.isin(top_features)
    plot_data = feature_importance_df_.loc[is_top].sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_data)
    plt.title('XGBOOST Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('XGBOOST_ImportantFeats.png')

In [4]:
# Load application_open_master.csv from CleanDataDir. Rows with and without a
# TARGET value live in this one frame; they are split into train/test further down.
X = loadData('application_open_master.csv')

In [5]:
# Preview the first rows of the loaded frame.
X.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,OCCUPATION_TYPE_mean_AMT_ANNUITY,OCCUPATION_TYPE_mean_CNT_CHILDREN,OCCUPATION_TYPE_mean_CNT_FAM_MEMBERS,OCCUPATION_TYPE_mean_DAYS_BIRTH,OCCUPATION_TYPE_mean_DAYS_EMPLOYED,OCCUPATION_TYPE_mean_DAYS_ID_PUBLISH,OCCUPATION_TYPE_mean_DAYS_REGISTRATION,OCCUPATION_TYPE_mean_EXT_SOURCE_1,OCCUPATION_TYPE_mean_EXT_SOURCE_2,OCCUPATION_TYPE_mean_EXT_SOURCE_3
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,26698.670596,0.508122,2.274479,-14692.656271,-2435.869864,-2858.5017,-4701.896618,0.423305,0.498924,0.501227
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,27886.303533,0.552817,2.307945,-14105.971157,-2805.722621,-2755.924775,-4370.974006,0.511412,0.527237,0.491913
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,26698.670596,0.508122,2.274479,-14692.656271,-2435.869864,-2858.5017,-4701.896618,0.423305,0.498924,0.501227
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,26698.670596,0.508122,2.274479,-14692.656271,-2435.869864,-2858.5017,-4701.896618,0.423305,0.498924,0.501227
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,27886.303533,0.552817,2.307945,-14105.971157,-2805.722621,-2755.924775,-4370.974006,0.511412,0.527237,0.491913


In [6]:
# Fraction of missing values per column, largest first.
null_counts = X.isnull().sum()
X_null = null_counts.sort_values(ascending=False) / len(X)

In [7]:
# Show the ten columns with the highest missing fraction.
X_null.head(10)

last_1_instalment_paid_late_in_days_std_x    1.000000
last_1_instalment_paid_over_amount_std_x     1.000000
last_1_NUM_INSTALMENT_VERSION_std_x          1.000000
last_1_NUM_INSTALMENT_VERSION_std_y          1.000000
last_1_instalment_paid_late_in_days_std_y    1.000000
last_1_instalment_paid_over_amount_std_y     1.000000
credit_card_avg_loading_of_credit_limit      0.819711
credit_card_cash_card_ratio                  0.802770
SK_ID_CURR_var_AMT_PAYMENT_CURRENT           0.799015
SK_ID_CURR_var_AMT_DRAWINGS_OTHER_CURRENT    0.798762
dtype: float64

In [8]:
# Names of the columns whose missing fraction is exactly 1, i.e. columns
# without a single observed value.
fully_missing = X_null == 1
X_null_col = X_null[fully_missing].index.tolist()
X_null_col

['last_1_instalment_paid_late_in_days_std_x',
 'last_1_instalment_paid_over_amount_std_x',
 'last_1_NUM_INSTALMENT_VERSION_std_x',
 'last_1_NUM_INSTALMENT_VERSION_std_y',
 'last_1_instalment_paid_late_in_days_std_y',
 'last_1_instalment_paid_over_amount_std_y']

In [9]:
# Drop the columns listed in X_null_col (those that are entirely missing).
# X is rebound instead of mutated in place, and errors='ignore' skips columns
# that are already gone, so re-running this cell no longer raises a KeyError.
X = X.drop(columns=X_null_col, errors='ignore')

In [10]:
# Columns that the encoding cell treats as categorical. The list is maintained
# by hand; the automatic alternative would be
#   X.select_dtypes(include=['object']).columns.tolist()
# The order matters: it determines the order of the generated dummy columns.
categoryFeats = [
    'CODE_GENDER',
    'EMERGENCYSTATE_MODE',
    'FLAG_CONT_MOBILE',
    'FLAG_DOCUMENT_3',
    'FLAG_DOCUMENT_4',
    'FLAG_DOCUMENT_5',
    'FLAG_DOCUMENT_6',
    'FLAG_DOCUMENT_7',
    'FLAG_DOCUMENT_8',
    'FLAG_DOCUMENT_9',
    'FLAG_DOCUMENT_11',
    'FLAG_DOCUMENT_18',
    'FLAG_EMAIL',
    'FLAG_EMP_PHONE',
    'FLAG_MOBIL',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'FLAG_PHONE',
    'FLAG_WORK_PHONE',
    'FONDKAPREMONT_MODE',
    'HOUR_APPR_PROCESS_START',
    'HOUSETYPE_MODE',
    'LIVE_CITY_NOT_WORK_CITY',
    'LIVE_REGION_NOT_WORK_REGION',
    'NAME_CONTRACT_TYPE',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE',
    'REG_CITY_NOT_LIVE_CITY',
    'REG_CITY_NOT_WORK_CITY',
    'REG_REGION_NOT_LIVE_REGION',
    'REG_REGION_NOT_WORK_REGION',
    'WALLSMATERIAL_MODE',
    'WEEKDAY_APPR_PROCESS_START',
]

In [11]:
# One-hot encode every column named in categoryFeats.
# Each column is cast to str (so missing values become their own category),
# mapped to integer codes with LabelEncoder, and expanded into indicator
# columns named '<column>_<code>'.
# The dummy frames are collected first and concatenated ONCE: concatenating
# inside the loop would copy the whole wide frame for every categorical column.
# The resulting columns and their order match the per-column concat approach.
dummy_frames = []
for catname in categoryFeats:
    codes = LabelEncoder().fit_transform(X[catname].astype(str))
    dummy_frames.append(pd.get_dummies(pd.Series(codes, index=X.index), prefix=catname))
X = pd.concat([X.drop(columns=categoryFeats)] + dummy_frames, axis=1)
X.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,WALLSMATERIAL_MODE_5,WALLSMATERIAL_MODE_6,WALLSMATERIAL_MODE_7,WEEKDAY_APPR_PROCESS_START_0,WEEKDAY_APPR_PROCESS_START_1,WEEKDAY_APPR_PROCESS_START_2,WEEKDAY_APPR_PROCESS_START_3,WEEKDAY_APPR_PROCESS_START_4,WEEKDAY_APPR_PROCESS_START_5,WEEKDAY_APPR_PROCESS_START_6
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1,0,0,0,0,0,0,0,0,1
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,0,0,0
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,0,0,1,0,0,0,0,0,0,1
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,1,0,0


In [12]:
# Rows without a TARGET value form the test set; all other rows are training data.
# Both frames get a fresh 0..n-1 index.
is_test_row = X.TARGET.isnull()
TestData = X[is_test_row].reset_index(drop=True)
TestID = TestData.SK_ID_CURR.values  # ids written to the submission file
AllTrainData = X[~is_test_row].reset_index(drop=True)

In [13]:
# Preview the labelled (training) rows.
AllTrainData.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,WALLSMATERIAL_MODE_5,WALLSMATERIAL_MODE_6,WALLSMATERIAL_MODE_7,WEEKDAY_APPR_PROCESS_START_0,WEEKDAY_APPR_PROCESS_START_1,WEEKDAY_APPR_PROCESS_START_2,WEEKDAY_APPR_PROCESS_START_3,WEEKDAY_APPR_PROCESS_START_4,WEEKDAY_APPR_PROCESS_START_5,WEEKDAY_APPR_PROCESS_START_6
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1,0,0,0,0,0,0,0,0,1
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,0,0,0
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,0,0,1,0,0,0,0,0,0,1
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,1,0,0


In [14]:
# (rows, columns) of the training frame, including TARGET and SK_ID_CURR.
AllTrainData.shape

(307511, 1481)

In [15]:
# Preview the unlabelled (test) rows.
TestData.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,WALLSMATERIAL_MODE_5,WALLSMATERIAL_MODE_6,WALLSMATERIAL_MODE_7,WEEKDAY_APPR_PROCESS_START_0,WEEKDAY_APPR_PROCESS_START_1,WEEKDAY_APPR_PROCESS_START_2,WEEKDAY_APPR_PROCESS_START_3,WEEKDAY_APPR_PROCESS_START_4,WEEKDAY_APPR_PROCESS_START_5,WEEKDAY_APPR_PROCESS_START_6
0,20560.5,568800.0,450000.0,135000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,1,0
1,17370.0,222768.0,180000.0,99000.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0,0,1,1,0,0,0,0,0,0
2,69777.0,663264.0,630000.0,202500.0,0.0,0.0,0.0,1.0,0.0,4.0,...,0,0,1,0,1,0,0,0,0,0
3,49018.5,1575000.0,1575000.0,315000.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0,0,0,0,0,0,0,0,0,1
4,32067.0,625500.0,625500.0,180000.0,,,,,,,...,0,0,1,1,0,0,0,0,0,0


In [16]:
# Model inputs: every column except the label and the row identifier.
non_feature_cols = ('TARGET', 'SK_ID_CURR')
features = [col for col in AllTrainData.columns if col not in non_feature_cols]

In [17]:
# Number of model input columns.
len(features)

1479

In [48]:
# Settings read by the cross-validation cell that follows.
# df_train / df_test are plain aliases of the frames built above, not copies.
df_train = AllTrainData
df_test = TestData
# Number of CV folds; also the number of prediction columns in sub_preds.
num_folds = 5
# True -> StratifiedKFold, False -> KFold.
stratified = True
# Not read by any of the cells visible in this notebook.
debug= False

In [49]:
# K-fold cross-validated XGBoost training.
# For every fold a booster is trained with early stopping on the validation
# AUC; its out-of-fold predictions go to oof_preds, its test-set predictions
# to one column of sub_preds, and its feature scores to feature_importance_df.
print("Starting XGBoost. Train shape: {}, test shape: {}".format(df_train.shape, df_test.shape))
print("Num of Feature:",len(features))
# Cross validation model
if stratified:
    folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=90210)
else:
    folds = KFold(n_splits= num_folds, shuffle=True, random_state=90210)
# Create arrays and dataframes to store results
oof_preds = np.zeros(df_train.shape[0])
sub_preds = np.zeros([df_test.shape[0],num_folds])
roc_score_list = []
feature_importance_df = pd.DataFrame()

# Booster parameters and the test DMatrix do not depend on the fold, so they
# are built once instead of once per fold.
model_config = {  #'random_search_runs': 0,
                  'booster': 'gbtree', # gpu cpu
                  'tree_method':'gpu_hist',
                  'objective': 'binary:logistic',
                  'eval_metric': 'auc',
                  'eta': 0.01,
                  'max_bin': 255,
                  'max_depth': 16,
                  'max_leaves': 40,
                  'subsample': 0.5,
                  'colsample_bylevel': 1,
                  'colsample_bytree': 0.5,
                  'min_child_weight': 4,
                  'lambda': 0.001,
                  'alpha': 0.001,
                  'nthread':-1,
                  'scale_pos_weight': 1}
data_test = xgb.DMatrix(df_test[features],label=None)

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train[features], df_train['TARGET'])):
    train_x, train_y = df_train[features].iloc[train_idx], df_train['TARGET'].iloc[train_idx]
    valid_x, valid_y = df_train[features].iloc[valid_idx], df_train['TARGET'].iloc[valid_idx]

    data_train = xgb.DMatrix(train_x,label=train_y)
    data_valid = xgb.DMatrix(valid_x,label=valid_y)
    # 'valid' is listed last in evals, so it is the set used for early stopping.
    clf = xgb.train( params=model_config,
                     dtrain=data_train,
                     evals=[(data_train, 'train'), (data_valid, 'valid')],
                     num_boost_round=10000,
                     early_stopping_rounds=200,
                     verbose_eval=200)

    # Predict with the trees up to the best early-stopping iteration only.
    # NOTE(review): ntree_limit / best_ntree_limit are deprecated in newer
    # XGBoost releases in favour of iteration_range - confirm against the
    # installed version before upgrading.
    oof_preds[valid_idx] = clf.predict(data_valid,ntree_limit=clf.best_ntree_limit)
    sub_preds[:,n_fold]= clf.predict(data_test,ntree_limit=clf.best_ntree_limit)
    roc_curr = roc_auc_score(valid_y, oof_preds[valid_idx])
    roc_score_list.append(roc_curr)
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_curr ))

    # get_fscore() returns a {feature_name: score} dict. Passing that dict to
    # pd.DataFrame together with columns=['feature', 'importance'] yields an
    # EMPTY frame (the dict keys are looked up as column names), so the
    # (name, score) pairs are turned into rows explicitly.
    importance = clf.get_fscore()
    fold_importance_df = pd.DataFrame(list(importance.items()), columns=['feature', 'importance'])
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    del clf, train_x, train_y, valid_x, valid_y
    gc.collect()

Starting XGBoost. Train shape: (307511, 1481), test shape: (48744, 1481)
Num of Feature: 1479
[04:05:17] Allocated 780MB on [0] GeForce GTX 1080 Ti, 6808MB remaining.
[04:05:18] Allocated 7MB on [0] GeForce GTX 1080 Ti, 6802MB remaining.
[04:05:18] Allocated 224MB on [0] GeForce GTX 1080 Ti, 6576MB remaining.
[04:05:18] Allocated 610MB on [0] GeForce GTX 1080 Ti, 5960MB remaining.
[0]	train-auc:0.727306	valid-auc:0.72319
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 200 rounds.
[200]	train-auc:0.767508	valid-auc:0.753438
[400]	train-auc:0.788313	valid-auc:0.765387
[600]	train-auc:0.80754	valid-auc:0.774666
[800]	train-auc:0.821568	valid-auc:0.779183
[1000]	train-auc:0.833436	valid-auc:0.782006
[1200]	train-auc:0.843736	valid-auc:0.78391
[1400]	train-auc:0.852967	valid-auc:0.785087
[1600]	train-auc:0.861431	valid-auc:0.785764
[1800]	train-auc:0.869329	valid-auc:0.786512
[2000]	train-auc:0.876475	valid-

KeyboardInterrupt: 

In [35]:
# Settings read by the cross-validation cell that follows.
# df_train / df_test are plain aliases of the frames built above, not copies.
df_train = AllTrainData
df_test = TestData
# Number of CV folds; also the number of prediction columns in sub_preds.
num_folds = 5
# True -> StratifiedKFold, False -> KFold.
stratified = False
# Not read by any of the cells visible in this notebook.
debug= False

In [36]:
# K-fold cross-validated XGBoost training.
# For every fold a booster is trained with early stopping on the validation
# AUC; its out-of-fold predictions go to oof_preds, its test-set predictions
# to one column of sub_preds, and its feature scores to feature_importance_df.
print("Starting XGBoost. Train shape: {}, test shape: {}".format(df_train.shape, df_test.shape))
print("Num of Feature:",len(features))
# Cross validation model
if stratified:
    folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1054)
else:
    folds = KFold(n_splits= num_folds, shuffle=True, random_state=1054)
# Create arrays and dataframes to store results
oof_preds = np.zeros(df_train.shape[0])
sub_preds = np.zeros([df_test.shape[0],num_folds])
roc_score_list = []
feature_importance_df = pd.DataFrame()

# Booster parameters and the test DMatrix do not depend on the fold, so they
# are built once instead of once per fold.
model_config = {  #'random_search_runs': 0,
                  'booster': 'gbtree', # gpu cpu
                  'tree_method':'gpu_hist',
                  'objective': 'binary:logistic',
                  'eval_metric': 'auc',
                  'eta': 0.01,
                  'max_depth': 5,
                  'subsample': 0.8,
                  'colsample_bytree': 0.8,
                  'min_child_weight': 5,
                  'lambda': 1.2,
                  'nthread':-1,
                  'seed':27,
                  'scale_pos_weight': 1}
data_test = xgb.DMatrix(df_test[features],label=None)

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train[features], df_train['TARGET'])):
    train_x, train_y = df_train[features].iloc[train_idx], df_train['TARGET'].iloc[train_idx]
    valid_x, valid_y = df_train[features].iloc[valid_idx], df_train['TARGET'].iloc[valid_idx]

    data_train = xgb.DMatrix(train_x,label=train_y)
    data_valid = xgb.DMatrix(valid_x,label=valid_y)
    # 'valid' is listed last in evals, so it is the set used for early stopping.
    clf = xgb.train( params=model_config,
                     dtrain=data_train,
                     evals=[(data_train, 'train'), (data_valid, 'valid')],
                     num_boost_round=10000,
                     early_stopping_rounds=200,
                     verbose_eval=200)

    # Predict with the trees up to the best early-stopping iteration only.
    # NOTE(review): ntree_limit / best_ntree_limit are deprecated in newer
    # XGBoost releases in favour of iteration_range - confirm against the
    # installed version before upgrading.
    oof_preds[valid_idx] = clf.predict(data_valid,ntree_limit=clf.best_ntree_limit)
    sub_preds[:,n_fold]= clf.predict(data_test,ntree_limit=clf.best_ntree_limit)
    roc_curr = roc_auc_score(valid_y, oof_preds[valid_idx])
    roc_score_list.append(roc_curr)
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_curr ))

    # get_fscore() returns a {feature_name: score} dict. Passing that dict to
    # pd.DataFrame together with columns=['feature', 'importance'] yields an
    # EMPTY frame (the dict keys are looked up as column names), so the
    # (name, score) pairs are turned into rows explicitly.
    importance = clf.get_fscore()
    fold_importance_df = pd.DataFrame(list(importance.items()), columns=['feature', 'importance'])
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    del clf, train_x, train_y, valid_x, valid_y
    gc.collect()

Starting XGBoost. Train shape: (307511, 1481), test shape: (48744, 1481)
Num of Feature: 1479
[02:58:48] Allocated 780MB on [0] GeForce GTX 1080 Ti, 8444MB remaining.
[02:58:48] Allocated 7MB on [0] GeForce GTX 1080 Ti, 8438MB remaining.
[02:58:48] Allocated 177MB on [0] GeForce GTX 1080 Ti, 8260MB remaining.
[02:58:48] Allocated 610MB on [0] GeForce GTX 1080 Ti, 7644MB remaining.
[0]	train-auc:0.730271	valid-auc:0.724387
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 200 rounds.
[200]	train-auc:0.762787	valid-auc:0.751856
[400]	train-auc:0.78293	valid-auc:0.76532
[600]	train-auc:0.799914	valid-auc:0.774999
[800]	train-auc:0.811386	valid-auc:0.780002
[1000]	train-auc:0.820405	valid-auc:0.783051
[1200]	train-auc:0.828102	valid-auc:0.785081
[1400]	train-auc:0.835311	valid-auc:0.786662
[1600]	train-auc:0.841601	valid-auc:0.787814
[1800]	train-auc:0.847423	valid-auc:0.788626
[2000]	train-auc:0.852913	valid

In [42]:
def calculate_rank(predictions):
    """Map a Series of scores to normalized ranks.

    Each value is replaced by ``(rank + 1) / (n + 1)``, where ``rank`` is the
    pandas default rank (1-based, ties share their average rank) and ``n`` is
    the number of entries. The result is returned as a NumPy array in the
    original row order.
    """
    n_obs = predictions.shape[0]
    ranks = predictions.rank().values
    return (1 + ranks) / (n_obs + 1)

In [43]:
# One column of test-set predictions per fold, named fold0, fold1, ...
fold_columns = {}
for fold_idx in range(sub_preds.shape[1]):
    fold_columns['fold%d' % fold_idx] = sub_preds[:, fold_idx]
subtmp = pd.DataFrame(fold_columns)
subtmp.head()

Unnamed: 0,fold0,fold1,fold2,fold3,fold4
0,0.04224,0.036619,0.039411,0.024528,0.036408
1,0.185206,0.188405,0.21702,0.163534,0.208762
2,0.043668,0.029271,0.041216,0.034369,0.036053
3,0.041475,0.041187,0.044073,0.050859,0.040428
4,0.164684,0.179815,0.152733,0.158491,0.169584


In [44]:
# Convert each fold's test predictions into normalized ranks so the folds
# are averaged on a common scale.
pred = [calculate_rank(subtmp['fold%d' % fold_idx]) for fold_idx in range(subtmp.shape[1])]
pred

[array([0.48759873, 0.906534  , 0.50068725, ..., 0.05580059, 0.18968099,
        0.9168325 ]),
 array([0.44917427, 0.91069853, 0.36992512, ..., 0.06066263, 0.24804595,
        0.88809109]),
 array([0.45442609, 0.93096728, 0.47073546, ..., 0.07682839, 0.34101959,
        0.93221869]),
 array([0.30302595, 0.88378295, 0.41712996, ..., 0.04121448, 0.246733  ,
        0.91654529]),
 array([0.44821007, 0.92450508, 0.44472254, ..., 0.0489281 , 0.23311109,
        0.89352754])]

In [45]:
# Average the per-fold ranks and write the submission file.
# NOTE(review): the file name says "lgbm", but the predictions in this notebook
# come from XGBoost - confirm the name is intended.
mean_rank = np.mean(pred, axis=0)
submission = pd.DataFrame({'SK_ID_CURR': TestID, 'TARGET': mean_rank})
submission.to_csv("lgbm_open_lucky.csv", index=False)

In [46]:
# Preview the submission: one averaged rank score per SK_ID_CURR.
submission.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.428487
1,100005,0.911298
2,100013,0.44064
3,100028,0.501787
4,100038,0.885223
