In [15]:
import numpy as np
import featuretools as ft
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid")
import warnings
warnings.filterwarnings("ignore")
from sklearn import preprocessing
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost
from xgboost.sklearn import XGBClassifier

In [16]:
train_df = pd.read_csv('input/train.csv', na_values=['NAN','NA','NaN','na','nan'])
test_df = pd.read_csv('input/test.csv', na_values=['NAN','NA','NaN','na','nan'])
X_all = pd.read_csv('input/X_all.csv', na_values=['NAN','NA','NaN','na','nan'])
X = pd.read_csv('input/X.csv', na_values=['NAN','NA','NaN','na','nan'])
X_test = pd.read_csv('input/X_test.csv', na_values=['NAN','NA','NaN','na','nan'])
ft_importance_total = pd.read_csv('input/ft_importance_total.csv', na_values=['NAN','NA','NaN','na','nan'])

In [17]:
train_df['OC'][train_df['OC'] == 'open'] = 0
train_df['OC'][train_df['OC'] == ' close'] = 1
y = train_df['OC']
X_id = train_df['inst_id']
X_test_id = test_df['inst_id']
X.fillna(X.median(), inplace=True)
X_test.fillna(X_test.median(), inplace=True)
y.fillna(0, inplace=True)
imbalanced_pos_ratio = y.value_counts()[1]/y.value_counts()[0]
imbalanced_pos_ratio

0.05244755244755245

In [18]:
def modelfit(alg, X, y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgboost.DMatrix(X.values, label=y.values)
        cvresult = xgboost.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='logloss', early_stopping_rounds=early_stopping_rounds, stratified=True)
        alg.set_params(n_estimators=cvresult.shape[0])
    #print(cvresult)
    #Fit the algorithm on the data
    #alg.fit(dtrain[predictors], dtrain[target],eval_metric='auc')
        
    #Predict training set:
    #dtrain_predictions = alg.predict(dtrain[predictors])
    #dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    #print "\nModel Report"
    #print "Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions)
    #print "AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob)
                    
    #feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    #feat_imp.plot(kind='bar', title='Feature Importances')
    #plt.ylabel('Feature Importance Score')
    return cvresult.shape[0], cvresult.iloc[cvresult.shape[0]-1, 0]

In [19]:
xgb = XGBClassifier(
         learning_rate=0.1,
         n_estimators=500,
         objective= 'binary:logistic',
         tree_method='gpu_exact',
         predictor='gpu_predictor',
         scale_pos_weight=int(1/imbalanced_pos_ratio),
         seed=42)

In [22]:
min_logloss = 100000
xgb_tunned_params = dict()
xgb_percentile = 0
for i in range(1,15):
    xgb_x_cols = ft_importance_total[ft_importance_total.value >= np.percentile(ft_importance_total.value,100-i)].sort_values(by='value', ascending=False).feature
    X2 = X[xgb_x_cols] 
    y2 = y.astype('int')
    v_n_estimators, v_logloss = modelfit(xgb, X2, y2) 
    print('Iteration', i, 'estimators', v_n_estimators, 'percentile', 100-i, 'logloss', v_logloss)
    if v_logloss < min_logloss:
        xgb_tunned_params['n_estimators'] = v_n_estimators
        xgb_percentile = 100-i
        min_logloss = v_logloss
print('Best params', xgb_tunned_params, 'percentile', xgb_percentile, 'logloss', min_logloss)

Iteration 1 estimators 82 percentile 99 logloss 0.17515340000000001
Iteration 2 estimators 82 percentile 98 logloss 0.1669172
Iteration 3 estimators 82 percentile 97 logloss 0.16018939999999998
Iteration 4 estimators 82 percentile 96 logloss 0.18223859999999997
Iteration 5 estimators 82 percentile 95 logloss 0.19189099999999998
Iteration 6 estimators 82 percentile 94 logloss 0.2060902
Iteration 7 estimators 82 percentile 93 logloss 0.20623479999999997
Iteration 8 estimators 82 percentile 92 logloss 0.20999440000000003
Iteration 9 estimators 82 percentile 91 logloss 0.204365
Iteration 10 estimators 82 percentile 90 logloss 0.20602499999999999
Iteration 11 estimators 82 percentile 89 logloss 0.1996742
Iteration 12 estimators 82 percentile 88 logloss 0.20193339999999999
Iteration 13 estimators 82 percentile 87 logloss 0.2053778
Iteration 14 estimators 82 percentile 86 logloss 0.2051132
Best params {'n_estimators': 82} percentile 97 logloss 0.16018939999999998


In [23]:
xgb_x_cols = ft_importance_total[ft_importance_total.value >= np.percentile(ft_importance_total.value,xgb_percentile)].sort_values(by='value', ascending=False).feature
xgb_x_cols

933                       employee1_enc
934                       employee2_enc
930                            sido_enc
113     DIFF(shortLoan1 by ownerChange)
935                     ownerChange_enc
0                                  noi1
149        DIFF(profit2 by ownerChange)
154          DIFF(bedCount by instkind)
186           DIFF(noe1 by ownerChange)
93                      inventoryAsset2
131       DIFF(OnonCAsset2 by instkind)
91                   liquidLiabilities2
67                             bedCount
229            DIFF(sgg by ownerChange)
169       DIFF(bedCount by ownerChange)
152      DIFF(longLoan2 by ownerChange)
615        city.STD(clients.shortLoan1)
108         DIFF(tanAsset2 by instkind)
84                          OnonCAsset1
94     DIFF(OnonCAsset1 by ownerChange)
201       DIFF(OnonCAsset1 by instkind)
10                   liquidLiabilities1
92                                  sgg
83                                debt1
12                      inventoryAsset1


In [24]:
xgb = XGBClassifier(
         learning_rate=0.1,
         n_estimators=xgb_tunned_params['n_estimators'],
         objective= 'binary:logistic',
         tree_method='gpu_exact',
         predictor='gpu_predictor',
         scale_pos_weight=int(1/imbalanced_pos_ratio),
         seed=42)

In [25]:
xgb.fit(X, y.astype(int))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=82,
       n_jobs=1, nthread=None, objective='binary:logistic',
       predictor='gpu_predictor', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=19, seed=42, silent=True,
       subsample=1, tree_method='gpu_exact')

In [27]:
preds_class = xgb.predict(X_test)
preds_proba = xgb.predict_proba(X_test)

In [28]:
preds_class

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0])

In [29]:
preds_proba[:,1]

array([0.00477265, 0.29325062, 0.13723552, 0.06851161, 0.07501835,
       0.12442242, 0.13544735, 0.05867784, 0.02189175, 0.16566478,
       0.00542779, 0.26392436, 0.00970337, 0.02994373, 0.04968549,
       0.03270299, 0.01871032, 0.20814504, 0.68903273, 0.2746539 ,
       0.806976  , 0.1723162 , 0.7997966 , 0.20137466, 0.00743742,
       0.01850219, 0.01723975, 0.00914306, 0.04108263, 0.00601537,
       0.03596375, 0.00563089, 0.04006407, 0.00914992, 0.06582721,
       0.03412053, 0.00769332, 0.01263769, 0.05864076, 0.0403929 ,
       0.00989509, 0.05577864, 0.04142151, 0.10807236, 0.0045981 ,
       0.10073417, 0.38611004, 0.04901055, 0.49327233, 0.06152562,
       0.05308534, 0.00744429, 0.62181777, 0.28868556, 0.16535336,
       0.18401875, 0.01856942, 0.00710452, 0.00490293, 0.00374117,
       0.01051193, 0.12698258, 0.14709403, 0.75770503, 0.30449268,
       0.14817064, 0.04698721, 0.86748934, 0.07594793, 0.12365425,
       0.02432493, 0.30946264, 0.00785873, 0.0415432 , 0.00986