In [28]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from skopt.space import Real, Categorical, Integer
import xgboost as xgb
from xgboost import XGBClassifier
import os
import gc
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
import lightgbm as lgb
from sklearn.svm import LinearSVC, SVC
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import warnings
warnings.simplefilter('ignore')
OriginDataDir = '../../Data/OriginData'
CleanDataDir = '../../Data/CleanData'
TrainTestDataDir = '../../Data/TrainTestData'
ModelSaveDir = 'models'
ResultSaveDir = 'result2'
ResultSaveDir_open = 'result_open'

In [29]:
def loadData(DataName):
    DataPath = os.path.join(CleanDataDir, DataName)
    if not os.path.exists(DataPath):
        print('%s does not exist!' % DataPath)
        return
    OriginData = pd.read_csv(DataPath, index_col=0)
    #OriginData = OriginData.sample(frac=1)  # 打乱顺序后返回
    return OriginData

In [30]:
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:50].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('XGBOOST Features (avg over folds)')
    plt.tight_layout()

## 1.加载保存好的数据，并做一些处理

In [52]:
X = loadData('Train_AddFeatures.csv')

In [53]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 1134 entries, AMT_ANNUITY to AMT_PAYMENT_minus_mean_mean_multiply_DAYS_BIRTH
dtypes: float64(1077), int64(41), object(16)
memory usage: 3.0+ GB


In [54]:
ObjectCol = X.select_dtypes(include=['object']).columns.tolist()
ObjectCol

['CODE_GENDER',
 'EMERGENCYSTATE_MODE',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'NAME_CONTRACT_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'NAME_INCOME_TYPE',
 'NAME_TYPE_SUITE',
 'OCCUPATION_TYPE',
 'ORGANIZATION_TYPE',
 'WALLSMATERIAL_MODE',
 'WEEKDAY_APPR_PROCESS_START']

In [55]:
X_null = X.isnull().sum().sort_values(ascending=False)/len(X)

In [56]:
X_null.head(10)

last_1_instalment_paid_over_amount_std_y     1.000000
last_1_instalment_paid_over_amount_std_x     1.000000
last_1_instalment_paid_late_in_days_std_x    1.000000
last_1_NUM_INSTALMENT_VERSION_std_x          1.000000
last_1_NUM_INSTALMENT_VERSION_std_y          1.000000
last_1_instalment_paid_late_in_days_std_y    1.000000
credit_card_avg_loading_of_credit_limit      0.819711
credit_card_cash_card_ratio                  0.802770
SK_ID_CURR_var_AMT_PAYMENT_CURRENT           0.799015
SK_ID_CURR_var_AMT_DRAWINGS_POS_CURRENT      0.798762
dtype: float64

In [57]:
X_null_col = list(X_null[X_null == 1].index)
X_null_col

['last_1_instalment_paid_over_amount_std_y',
 'last_1_instalment_paid_over_amount_std_x',
 'last_1_instalment_paid_late_in_days_std_x',
 'last_1_NUM_INSTALMENT_VERSION_std_x',
 'last_1_NUM_INSTALMENT_VERSION_std_y',
 'last_1_instalment_paid_late_in_days_std_y']

In [58]:
X.drop(X_null_col,axis=1,inplace=True)

In [59]:
for catname in ObjectCol:
    X[catname] = LabelEncoder().fit_transform(X[catname].astype(str))
    X = pd.concat([X,pd.get_dummies(X[catname],prefix=catname)],axis=1)
    X.drop(catname,inplace=True,axis=1)
X.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,WALLSMATERIAL_MODE_5,WALLSMATERIAL_MODE_6,WALLSMATERIAL_MODE_7,WEEKDAY_APPR_PROCESS_START_0,WEEKDAY_APPR_PROCESS_START_1,WEEKDAY_APPR_PROCESS_START_2,WEEKDAY_APPR_PROCESS_START_3,WEEKDAY_APPR_PROCESS_START_4,WEEKDAY_APPR_PROCESS_START_5,WEEKDAY_APPR_PROCESS_START_6
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1,0,0,0,0,0,0,0,0,1
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,0,0,0
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,0,0,1,0,0,0,0,0,0,1
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,1,0,0


In [60]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 1258 entries, AMT_ANNUITY to WEEKDAY_APPR_PROCESS_START_6
dtypes: float64(1071), int64(41), uint8(146)
memory usage: 3.0 GB


In [61]:
TestData = X[X.TARGET.isnull()]
TestData.reset_index(drop=True, inplace=True)
TestID = TestData.SK_ID_CURR.values
AllTrainData = X[~X.TARGET.isnull()]
AllTrainData.reset_index(drop=True, inplace=True)

In [62]:
#用于提交test的函数
def calculate_rank(predictions):
    rank = (1 + predictions.rank().values) / (predictions.shape[0] + 1)
    return rank
def subtest(sub_preds_):
    subtmp = pd.DataFrame({'fold%d'%i:sub_preds_[:,i] for i in range(sub_preds_.shape[1])})
    pred = []
    for i in range(subtmp.shape[1]):
        pred.append(calculate_rank(subtmp['fold%d'%i]))
    submission = pd.DataFrame({'SK_ID_CURR': TestID, 'TARGET': np.mean(pred,axis=0)})
    subtmp['SK_ID_CURR'] = TestID
    return subtmp,submission

In [63]:
#获取特征重要度的函数
def get_importances(feature_importance_df_):
    importances = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)
    return importances

In [64]:
df_train = AllTrainData
df_test = TestData
num_folds = 5
stratified = True
debug= False
del AllTrainData
del TestData
del X
del X_null
gc.collect()

802

In [65]:
features = [x for x in df_train.columns if x != 'TARGET' and x != 'SK_ID_CURR']

In [66]:
len(features)

1256

## 2. XGBoost

### 2.1 xgb1

In [68]:
# Divide in training/validation and test data
print("Starting XGBoost. Train shape: {}, test shape: {}".format(df_train.shape, df_test.shape))
print("Num of Feature:",len(features))
# Cross validation model
if stratified:
    folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=90210)
else:
    folds = KFold(n_splits= num_folds, shuffle=True, random_state=90210)
# Create arrays and dataframes to store results
oof_preds1 = np.zeros(df_train.shape[0])
sub_preds1 = np.zeros([df_test.shape[0],num_folds])
roc_score_list1 = []
feature_importance_df1 = pd.DataFrame()
    
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train[features], df_train['TARGET'])):
    train_x, train_y = df_train[features].iloc[train_idx], df_train['TARGET'].iloc[train_idx]
    valid_x, valid_y = df_train[features].iloc[valid_idx], df_train['TARGET'].iloc[valid_idx]

    model_config = {  #'random_search_runs': 0,
                      'booster': 'gbtree', # gpu cpu
                      'tree_method':'gpu_hist',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'eta': 0.02,
                      'max_leaves': 30,
                      'max_depth': 16,
                      'subsample': 0.5,
                      'colsample_bytree': 0.6,
                      'colsample_bylevel': 1,
                      'min_child_weight': 4,
                      'lambda': 100,
                      'alpha':0.0,
                      'nthread':-1,
                      #'seed':27,
                      'scale_pos_weight': 1}
    data_train = xgb.DMatrix(train_x,label=train_y)
    data_valid = xgb.DMatrix(valid_x,label=valid_y)
    data_test = xgb.DMatrix(df_test[features],label=None)
    clf = xgb.train( params=model_config,
                     dtrain=data_train,
                     evals=[(data_train, 'train'), (data_valid, 'valid')],
                     num_boost_round=10000,
                     early_stopping_rounds=100,
                     verbose_eval=200)
    
    oof_preds1[valid_idx] = clf.predict(data_valid,ntree_limit=clf.best_ntree_limit)
    sub_preds1[:,n_fold]= clf.predict(data_test,ntree_limit=clf.best_ntree_limit)
    roc_curr1 = roc_auc_score(valid_y, oof_preds1[valid_idx])
    roc_score_list1.append(roc_curr1)
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_curr1 ))
    
    importance1 = clf.get_fscore()
    fold_importance_df1 = pd.DataFrame(list(importance1.items()), columns=['feature','importance'])
    fold_importance_df1["fold"] = n_fold + 1
    feature_importance_df1 = pd.concat([feature_importance_df1, fold_importance_df1], axis=0)
    del clf, train_x, train_y, valid_x, valid_y
    gc.collect()

Starting XGBoost. Train shape: (307511, 1258), test shape: (48744, 1258)
Num of Feature: 1256
[01:34:01] Allocated 663MB on [0] GeForce GTX 1080 Ti, 7467MB remaining.
[01:34:02] Allocated 7MB on [0] GeForce GTX 1080 Ti, 7461MB remaining.
[01:34:02] Allocated 174MB on [0] GeForce GTX 1080 Ti, 7285MB remaining.
[01:34:02] Allocated 503MB on [0] GeForce GTX 1080 Ti, 6777MB remaining.
[0]	train-auc:0.633805	valid-auc:0.635462
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 100 rounds.
[200]	train-auc:0.770843	valid-auc:0.765807
[400]	train-auc:0.7952	valid-auc:0.782481
[600]	train-auc:0.808821	valid-auc:0.788262
[800]	train-auc:0.818723	valid-auc:0.791484
[1000]	train-auc:0.827363	valid-auc:0.793428
[1200]	train-auc:0.835013	valid-auc:0.794496
[1400]	train-auc:0.841797	valid-auc:0.795309
[1600]	train-auc:0.848188	valid-auc:0.795792
[1800]	train-auc:0.854297	valid-auc:0.796074
Stopping. Best iteration:
[1879

In [70]:
subtmp_xgb1,sub_xgb1 = subtest(sub_preds1)

In [74]:
sub_xgb1.to_csv("submission.csv", index=False)

In [79]:
pd.DataFrame(importance1.items(), columns=['feature', 'importance'])

ValueError: DataFrame constructor not properly called!

In [83]:
pd.DataFrame(list(importance1.items()), columns=['feature','importance'])

Unnamed: 0,feature,importance
0,OCCUPATION_TYPE_mean_CNT_CHILDREN,31
1,CODE_GENDER_0,6
2,SK_ID_CURR_var_DAYS_ENTRY_PAYMENT,107
3,last_20_instalment_paid_over_amount_max_x,1
4,NAME_FAMILY_STATUS_1,78
5,last_50_instalment_paid_over_amount_std_x,29
6,last_20_instalment_paid_over_amount_mean_y,8
7,FLAG_OWN_REALTY_0,12
8,CODE_GENDER_NAME_EDUCATION_TYPE_OCCUPATION_TYP...,112
9,last_100_instalment_paid_over_amount_skew_x,50
