# Home Credit Default Risk 2018

In [1]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
import gc
import time
import warnings
from tqdm import tqdm
warnings.simplefilter(action = 'ignore', category = FutureWarning)
try:
    # Python 2 ships the fast C implementation as a separate module.
    import cPickle as pickle
except ImportError:
    # Only a missing module should trigger the fallback; any other error must surface.
    import pickle
# import os

In [2]:
# Single seed for the notebook: seeds NumPy's global RNG here and is reused below
# for the CV splitter (`seed = RANDOM_STATE`) and the per-fold LightGBM models.
RANDOM_STATE = 2042
np.random.seed(RANDOM_STATE)

In [3]:
# Directory prefix for the data files read and written below. Paths are built by
# plain string concatenation (f'{file_path}name'), so the trailing slash is required.
file_path = '../input/'

In [4]:
# Sample submission; `cv_scores` uses its SK_ID_CURR column as the default index
# and ID column of the saved test predictions.
submission_df = pd.read_csv(f'{file_path}sample_submission.csv')

In [5]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold

In [7]:
from lightgbm import LGBMClassifier

In [8]:
from scipy.stats import ranksums
from scipy.stats import gmean

In [9]:
from bayes_opt import BayesianOptimization

## Aggregating datasets

### Service functions

In [10]:
def convert_col_to_proper_int(df_col):
    """Downcast an integer column to the narrowest integer dtype that holds its range.

    Columns whose dtype name does not start with ``int`` or ``uint`` are returned
    unchanged. A column with a negative minimum gets the smallest signed type
    covering [min, max]; any other column gets the smallest unsigned type
    covering its maximum. If no candidate fits, the column is returned as is.
    """
    dtype_name = str(df_col.dtype)
    if not (dtype_name[:3] == 'int' or dtype_name[:4] == 'uint'):
        return df_col

    lowest = df_col.min()
    highest = df_col.max()

    if lowest < 0:
        # Signed candidates, narrowest first.
        for candidate in (np.int8, np.int16, np.int32, np.int64):
            limits = np.iinfo(candidate)
            if lowest >= limits.min and highest <= limits.max:
                return df_col.astype(candidate)
    else:
        # Non-negative data: only the upper bound matters.
        for candidate in (np.uint8, np.uint16, np.uint32, np.uint64):
            if highest <= np.iinfo(candidate).max:
                return df_col.astype(candidate)

    return df_col

def convert_col_to_proper_float(df_col):
    """Downcast a float column to float32 / float16 when that keeps its values apart.

    A narrower dtype is accepted only if
      * the number of distinct values is unchanged (no two values collapse), and
      * for float32, no finite value overflows to +/-inf (the original code had no
        range check on this step, so e.g. 1e300 silently became inf);
      * for float16, the column's min/max lie strictly inside the float16 range.

    Rounding of individual values is still accepted by design, as long as they
    remain distinct. Non-float columns are returned unchanged.

    Parameters
    ----------
    df_col : pandas.Series

    Returns
    -------
    pandas.Series
        The same data in the narrowest accepted float dtype.
    """
    if str(df_col.dtype)[:5] != 'float':
        return df_col

    unique_count = len(np.unique(df_col))
    finite_count = np.isfinite(df_col.values).sum()

    # The cast itself may overflow; that is detected right below, so silence the warning.
    with np.errstate(over='ignore'):
        df_col_temp = df_col.astype(np.float32)

    overflowed = np.isfinite(df_col_temp.values).sum() != finite_count
    if overflowed or len(np.unique(df_col_temp)) != unique_count:
        return df_col

    df_col = df_col_temp
    c_min = df_col.min()
    c_max = df_col.max()
    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
        df_col_temp = df_col.astype(np.float16)
        if len(np.unique(df_col_temp)) == unique_count:
            df_col = df_col_temp

    return df_col



def float_to_int(df):
    """Turn float columns that hold only whole numbers into compact integer columns.

    A float column is converted (in place, on `df`) only when every value is a
    whole number that fits into int64; it is then handed to
    `convert_col_to_proper_int` to pick the narrowest integer dtype. Columns
    containing NaN, infinities, fractional values, or whole values outside the
    int64 range are left untouched (casting the latter would overflow).

    Returns the same DataFrame object.
    """
    int64_bound = 2.0 ** 63  # int64 covers [-2**63, 2**63)
    for col in df.columns:
        column = df[col]
        if str(column.dtype)[:5] != 'float':
            continue
        # NaN and inf fail `% 1 == 0`, so they are rejected by the first term.
        convertible = ((column % 1 == 0)
                       & (column >= -int64_bound)
                       & (column < int64_bound)).all()
        if convertible:
            df[col] = convert_col_to_proper_int(column.astype(np.int64))

    return df

def float_reduced(df):
    """Shrink every float column of `df` in place via `convert_col_to_proper_float`.

    Columns whose dtype name does not start with ``float`` are not touched.
    Returns the same DataFrame object.
    """
    float_columns = [name for name in df.columns
                     if str(df[name].dtype)[:5] == 'float']
    for name in float_columns:
        df[name] = convert_col_to_proper_float(df[name])

    return df

def int_reduced(df):
    """Pass every column of `df` through `convert_col_to_proper_int`, in place.

    The helper itself leaves non-integer columns unchanged, so only integer
    columns can end up with a narrower dtype. Returns the same DataFrame object.
    """
    for name in df.columns:
        shrunk = convert_col_to_proper_int(df[name])
        df[name] = shrunk

    return df

## Thanks to Guillaume Martin for the awesome memory optimizer!
## https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(data, verbose = True):
    """Aggressively shrink the dtypes of every column of `data`, in place.

    Per column:
      * object, '<M8[ns]' datetime and category columns are cast to 'category';
      * signed and unsigned integer columns go through `convert_col_to_proper_int`
        (the original only matched the 'int' prefix, so uint columns were
        turned into floats);
      * every other column is treated as float and cast to the narrowest of
        float16 / float32 / float64 whose range contains the column's min and max.
        Only the range is checked, so values may be rounded -- use
        `gentle_reduce_mem_usage` when that matters.

    Parameters
    ----------
    data : pandas.DataFrame
        Modified in place.
    verbose : bool
        When true, print memory usage before and after and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object as `data`.
    """
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))

    for col in data.columns:
        col_type = data[col].dtype
        type_name = str(col_type)

        if (col_type != object) and (col_type != '<M8[ns]') and (col_type.name != 'category'):
            if type_name[:3] == 'int' or type_name[:4] == 'uint':
                data[col] = convert_col_to_proper_int(data[col])
            else:
                # min/max are only needed for the float range checks.
                c_min = data[col].min()
                c_max = data[col].max()
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    # Also reached for all-NaN columns, where every comparison is False.
                    data[col] = data[col].astype(np.float64)
        else:
            data[col] = data[col].astype('category')

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return data

def gentle_reduce_mem_usage(data, verbose = True):
    """Shrink the dtypes of every column of `data`, in place, without collapsing values.

    Per column:
      * object, '<M8[ns]' datetime and category columns are cast to 'category';
      * signed and unsigned integer columns go straight to
        `convert_col_to_proper_int`;
      * other columns whose values are all whole numbers inside the int64 range
        are cast to int64 and then narrowed by `convert_col_to_proper_int`;
      * remaining columns are narrowed by `convert_col_to_proper_float`, which
        only accepts a smaller float type if the distinct values stay distinct.

    Parameters
    ----------
    data : pandas.DataFrame
        Modified in place.
    verbose : bool
        When true, print memory usage before and after and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object as `data`.
    """
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))

    int64_bound = 2.0 ** 63  # int64 covers [-2**63, 2**63)

    for col in data.columns:
        col_type = data[col].dtype
        type_name = str(col_type)

        if (col_type != object) and (col_type != '<M8[ns]') and (col_type.name != 'category'):
            column = data[col]
            if type_name[:3] == 'int' or type_name[:4] == 'uint':
                data[col] = convert_col_to_proper_int(column)
            elif ((column % 1 == 0) & (column >= -int64_bound) & (column < int64_bound)).all():
                # Whole-valued and safely representable: NaN/inf fail `% 1 == 0`,
                # out-of-range values fail the bound checks instead of overflowing.
                data[col] = convert_col_to_proper_int(column.astype(np.int64))
            else:
                data[col] = convert_col_to_proper_float(column)
        else:
            data[col] = data[col].astype('category')

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return data

In [11]:
# [141289,
# 144669,
# 148605,
# 156685,
# 196708,
# 227123,
# 317181,
# 319880,
# 352877,
# 437628,
# 442305]

In [34]:
%%time
# Load the pre-built extended feature table. Pickles execute code on load, so
# only read files that come from a trusted source.
df_extended2 = pd.read_pickle(f'{file_path}df_extended_v2.pkl.zip')


Wall time: 8.14 s


In [41]:
# Shape, dtype mix and memory footprint of the base feature table.
df_extended2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 640 entries, SK_ID_CURR to nrm_app most popular AMT_GOODS_PRICE
dtypes: float16(215), float32(253), float64(171), uint32(1)
memory usage: 958.8 MB


In [40]:
%%time
# Table keyed by SK_ID_CURR with a TARGET column; presumably predictions of earlier
# models (merged column names look like 'leonid26_638feats_5CV_078196') -- TODO confirm.
models_df = pd.read_pickle(f'{file_path}models_df02.pkl.zip')

Wall time: 654 ms


In [42]:
# df_ext_mod = pd.concat([df_extended2, iv_extended_feat_test],
#                              ignore_index = True, axis=0, verify_integrity = True)
# Left-join the models_df columns onto the feature table by SK_ID_CURR. TARGET is
# dropped from models_df first so the merge does not produce TARGET_x / TARGET_y.
# NOTE: the later merges in the cells shown start from df_extended2, not df_ext_mod.
df_ext_mod = df_extended2.merge(models_df.drop(['TARGET'], axis=1), on='SK_ID_CURR', how='left')
df_ext_mod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 658 entries, SK_ID_CURR to leonid26_638feats_5CV_078196
dtypes: float16(215), float32(253), float64(189), uint32(1)
memory usage: 1007.7 MB


In [48]:
%%time
# Additional feature table, joined onto df_extended2 by SK_ID_CURR below.
gp123 = pd.read_pickle(f'{file_path}gp123.pkl')

Wall time: 13.4 s


In [50]:
%%time
# Second additional feature table (keyed by SK_ID_CURR); inspect its columns and null counts.
neptune_ml_features = pd.read_pickle(f'{file_path}neptune_ml_features.pkl.zip')
neptune_ml_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Data columns (total 54 columns):
SK_ID_CURR                                               356255 non-null int64
annuity_income_percentage                                356219 non-null float64
car_to_birth_ratio                                       121014 non-null float64
car_to_employ_ratio                                      121014 non-null float64
children_ratio                                           356253 non-null float64
credit_to_annuity_ratio                                  356219 non-null float64
credit_to_goods_ratio                                    355977 non-null float64
credit_to_income_ratio                                   356255 non-null float64
days_employed_percentage                                 356255 non-null float64
income_credit_percentage                                 356255 non-null float64
income_per_child                                         356255 non-null float64


In [51]:
%%time
# Build the wide training table: base features + gp123 + neptune features,
# all left-joined on SK_ID_CURR so the row set of df_extended2 is preserved.
df_ext_gp_nept = df_extended2.merge(gp123, on='SK_ID_CURR', how='left')
df_ext_gp_nept = df_ext_gp_nept.merge(neptune_ml_features, on='SK_ID_CURR', how='left')
# Lossless-by-distinctness dtype shrinking; this is the slow part (the recorded
# run of this cell took about 29 minutes).
df_ext_gp_nept = gentle_reduce_mem_usage(df_ext_gp_nept, verbose = False)
# Cache the result so the following cell can reload it instead of rebuilding.
df_ext_gp_nept.to_pickle(f'{file_path}df_ext_gp_nept.pkl.zip')
df_ext_gp_nept.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 2229 entries, SK_ID_CURR to prev_applications_prev_was_revolving_loan
dtypes: float16(746), float32(433), float64(1047), uint32(1), uint8(2)
memory usage: 3.9 GB
Wall time: 29min 15s


In [62]:
# Reload the cached wide table written by the cell above (skips the slow rebuild).
df_ext_gp_nept = pd.read_pickle(f'{file_path}df_ext_gp_nept.pkl.zip')
df_ext_gp_nept.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 2229 entries, SK_ID_CURR to prev_applications_prev_was_revolving_loan
dtypes: float16(746), float32(433), float64(1047), uint32(1), uint8(2)
memory usage: 3.9 GB


In [59]:
# Table keyed by SK_ID_CURR whose other columns replace same-named columns of
# df_ext_gp_nept in the next cell.
pca_cluster = pd.read_pickle(f'{file_path}iv_pca_cluster.pkl.zip')

In [63]:
# Replace the existing copies of the pca_cluster columns with the freshly loaded
# ones: drop every pca_cluster column except the join key, then re-attach by key.
pca_value_columns = pca_cluster.drop(['SK_ID_CURR'], axis=1).columns.tolist()
df_ext_gp_nept = df_ext_gp_nept.drop(pca_value_columns, axis=1)
df_ext_gp_nept = df_ext_gp_nept.merge(pca_cluster, on='SK_ID_CURR', how='left')

In [64]:
# Confirm the column count is unchanged after swapping in the pca_cluster columns.
df_ext_gp_nept.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 356254
Columns: 2229 entries, SK_ID_CURR to cluster
dtypes: float16(745), float32(433), float64(1048), uint32(1), uint8(2)
memory usage: 3.9 GB


In [66]:
# Keep only the ID, the target and the features with positive mean importance,
# ordered from most to least important.
# NOTE(review): `feature_importance` is assigned by the `cv_scores` call further
# down (In [79]); in the cells shown nothing defines it earlier, so this cell
# relies on a previous CV run in the same kernel and fails on Restart & Run All.
df_ext_gp_nept = df_ext_gp_nept[['SK_ID_CURR', 'TARGET']\
            + feature_importance[feature_importance['mean']>0].sort_values(by='mean', ascending = False).index.tolist()]

In [65]:
# Names of the features with positive mean importance, most important first
# (same expression as used for the column selection).
feature_importance[feature_importance['mean']>0].sort_values(by='mean', ascending = False).index.tolist()

['app DAYS_EMPLOYED / DAYS_BIRTH',
 'NEW_EMPLOY_TO_BIRTH_RATIO',
 'INSTAL_AMT_PAYMENT_MIN',
 'INSTAL_DBD_STD',
 'POS_SK_ID_PREV_MIN',
 'PREV_prev missing_VAR',
 'PREV_prev AMT_APPLICATION / AMT_CREDIT_VAR',
 'INS_ins DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT_VAR',
 'PREV_HOUR_APPR_PROCESS_START_VAR',
 'APPROVED_DAYS_DECISION_MAX',
 'INSTAL_DAYS_ENTRY_PAYMENT_MAX',
 'ACTIVE_bureau DAYS_CREDIT - CREDIT_DAY_OVERDUE_MAX',
 'PREV_AMT_ANNUITY_VAR',
 1455,
 'NEW_PHONE_TO_BIRTH_RATIO_EMPLOYER',
 'PREV_SELLERPLACE_AREA_MAX',
 'PREV_prev AMT_GOODS_PRICE - AMT_CREDIT_VAR',
 'POS_SK_ID_PREV_MAX',
 'PREV_SK_ID_PREV_MIN',
 'INS_ins DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT_STD',
 'external_sources_min',
 'PREV_prev AMT_APPLICATION - AMT_CREDIT_VAR',
 'PREV_SELLERPLACE_AREA_VAR',
 'INS_SK_ID_PREV_MIN',
 16,
 'INS_ins DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT_MEAN',
 'POS_pos CNT_INSTALMENT more CNT_INSTALMENT_FUTURE_STD',
 'INSTAL_DBD_MEAN',
 598,
 'external_sources_nanmedian',
 'PREV_SELLERPLACE_AREA_MEAN',
 'PREV

In [67]:
%%time
# NOTE: this overwrites the cached full table written earlier under the same file
# name with the reduced, importance-filtered column set.
df_ext_gp_nept.to_pickle(f'{file_path}df_ext_gp_nept.pkl.zip')

Wall time: 2min 58s


## Optimizing LGBM parameters

### Optimization and visualisation functions

In [68]:
def rank_average_df(data):
    """Blend prediction columns by averaging their ranks.

    Every column of `data` is ranked (ties get the average rank), the ranks are
    summed per row and divided by ``n_columns * n_rows``, which maps the result
    into (0, 1]. A row with a missing value in any column yields NaN.

    Unlike the previous implementation, `data` is not modified, so the function
    can be called repeatedly on the same frame with the same result.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per model / fold, one row per sample.

    Returns
    -------
    pandas.Series
        Named 'TARGET', indexed like `data`.
    """
    ranks = data.rank()
    # skipna=False reproduces plain `+` semantics: NaN in any column -> NaN.
    rank_sum = ranks.sum(axis = 1, skipna = False)
    blended = rank_sum / (ranks.shape[1] * data.shape[0])
    blended.name = 'TARGET'
    return blended

In [69]:
def cv_scores(df, num_folds, params, model_name, stratified = False, verbose = -1,
              early_stopping = 300,
              save_train_prediction = True, save_test_prediction = True, train_full_model=False,
              folder_to_save='../stack_it/', seed = 42, submission_sample = submission_df):
    """Cross-validate a LightGBM classifier and save out-of-fold / test predictions.

    Rows of `df` with a non-null TARGET are the training set, rows with a null
    TARGET are the test set. One model is trained per fold with early stopping on
    the fold's validation AUC; each model also predicts the whole test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'TARGET' and 'SK_ID_CURR'. Every column except TARGET,
        SK_ID_CURR, SK_ID_BUREAU, SK_ID_PREV and index is used as a feature.
    num_folds : int
        Number of CV folds.
    params : dict
        Keyword arguments for `LGBMClassifier`. Not modified by this function.
    model_name : str
        Prefix of the output file names.
    stratified : bool
        Use `StratifiedKFold` instead of `KFold`.
    verbose : int
        Passed to `fit` as the evaluation logging period.
    early_stopping : int
        Passed to `fit` as `early_stopping_rounds`.
    save_train_prediction, save_test_prediction : bool
        Write the out-of-fold / test prediction CSV files.
    train_full_model : bool
        Additionally fit one model on all training rows, using only features
        with positive mean importance and the largest best-iteration over folds.
    folder_to_save : str
        Output directory prefix (concatenated with the file name as is).
    seed : int
        Random state of the fold splitter.
    submission_sample : pandas.DataFrame
        Supplies the SK_ID_CURR values attached to test predictions.
        NOTE: predictions are attached positionally, so the test rows of `df`
        must be in the same order (and number) as `submission_sample`.

    Returns
    -------
    (pandas.DataFrame, float)
        Per-fold gain importances with a 'mean' column, and the AUC of the
        out-of-fold predictions over the whole training set.
    """
    # Silences all warnings for the rest of the session, not only inside this call.
    warnings.simplefilter('ignore')

    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = seed)
    else:
        folds = KFold(n_splits = num_folds, shuffle = True, random_state = seed)

    # Out-of-fold probabilities for the training rows, one test column per fold.
    valid_pred_proba = np.zeros(train_df.shape[0])
    test_prediction = pd.DataFrame(index = submission_sample['SK_ID_CURR'])

    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    df_feature_importance = pd.DataFrame(index = feats)

    # int32 instead of uint16 so a best iteration above 65535 cannot wrap around.
    iterations = np.zeros(num_folds, dtype=np.int32)
    train_scores = np.zeros(num_folds, dtype=np.float32)
    fold_scores = np.zeros(num_folds, dtype=np.float32)

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        # A different model seed per fold.
        clf = LGBMClassifier(importance_type = 'gain',
                             random_state = (RANDOM_STATE + n_fold), **params)
        print('Fold', n_fold, 'started at', time.ctime())
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # The last eval set (validation) drives early stopping; the train set is
        # evaluated as well so the train AUC at the best iteration can be reported.
        clf.fit(train_x, train_y,
                eval_set = [(train_x, train_y), (valid_x, valid_y)], eval_metric = 'auc',
                eval_names = [f'{n_fold}_train', f'{n_fold}_valid'],
                verbose = verbose, early_stopping_rounds = early_stopping)
        iterations[n_fold] = clf.best_iteration_

        valid_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]

        test_prediction[n_fold] = pd.Series(clf.predict_proba(test_df[feats], num_iteration = clf.best_iteration_)[:, 1],
                                            index = submission_sample['SK_ID_CURR'])

        df_feature_importance[n_fold] = pd.Series(clf.feature_importances_, index = feats)
        train_scores[n_fold] = clf.evals_result_[f'{n_fold}_train']['auc'][clf.best_iteration_-1]
        fold_scores[n_fold] = roc_auc_score(valid_y, valid_pred_proba[valid_idx])
        print('Fold %2d train AUC: %.6f, valid AUC: %.6f; best iteration %5d;' % (n_fold,
                                                                                  train_scores[n_fold],
                                                                                  fold_scores[n_fold],
                                                                                  iterations[n_fold]))
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    roc_auc_valid = roc_auc_score(train_df['TARGET'], valid_pred_proba)

    print('Full AUC score %.6f' % roc_auc_valid)
    print(f'AUC score across {num_folds} folds is mean {fold_scores.mean():.4f} with std {fold_scores.std():.4f}',
          f'(train score mean {train_scores.mean():.4f} with std {train_scores.std():.4f})')

    df_feature_importance.fillna(0, inplace = True)
    df_feature_importance['mean'] = df_feature_importance.mean(axis = 1)
    important_feats = df_feature_importance[df_feature_importance['mean']>0]\
                        .sort_values(by='mean', ascending = False).index.tolist()

    model_description = f'{model_name}_{len(feats)}feats_{num_folds}CV_{roc_auc_valid*1e5:0>6.0f}'
    # Write prediction files
    if save_train_prediction:
        df_prediction = pd.DataFrame({'SK_ID_CURR': train_df['SK_ID_CURR'].values,
                                      'TARGET': valid_pred_proba})
        train_prediction_file_name = f'{folder_to_save}{model_description}_oof_train.csv'
        print(f'Saving oof train predictions to {train_prediction_file_name}')
        df_prediction.to_csv(train_prediction_file_name, index = False)
        del df_prediction
        gc.collect()

    if save_test_prediction:
        test_predictions_file_name = f'{folder_to_save}{model_description}_{num_folds}_test_preds.csv'
        print(f'Saving {num_folds} folds test predictions to {test_predictions_file_name}')
        # NOTE: written without the SK_ID_CURR index, i.e. fold columns only.
        test_prediction.to_csv(test_predictions_file_name, index = False)

        df_prediction = pd.DataFrame({'SK_ID_CURR': submission_sample['SK_ID_CURR'].values,
                                      'TARGET': test_prediction.mean(axis = 1)})
        test_mean_prediction_file_name = f'{folder_to_save}{model_description}_test_mean.csv'
        print(f'Saving mean of {num_folds} test predictions to {test_mean_prediction_file_name}')
        df_prediction.to_csv(test_mean_prediction_file_name, index = False)

        df_prediction = pd.DataFrame({'SK_ID_CURR': submission_sample['SK_ID_CURR'].values,
                                      'TARGET': gmean(test_prediction, axis=1)})
        test_gmean_prediction_file_name = f'{folder_to_save}{model_description}_test_gmean.csv'
        print(f'Saving geometric mean of {num_folds} test predictions to {test_gmean_prediction_file_name}')
        df_prediction.to_csv(test_gmean_prediction_file_name, index = False)

        # Must stay the last use of test_prediction: rank_average_df may add columns to it.
        df_prediction = pd.DataFrame({'SK_ID_CURR': submission_sample['SK_ID_CURR'].values,
                                      'TARGET': rank_average_df(test_prediction)})
        test_ranked_prediction_file_name = f'{folder_to_save}{model_description}_test_rank_averaged.csv'
        print(f'Saving rank average of {num_folds} test predictions to {test_ranked_prediction_file_name}')
        df_prediction.to_csv(test_ranked_prediction_file_name, index = False)

        del df_prediction
        gc.collect()

    if train_full_model:
        print(f'Full train learning for {iterations.max()} iterations',
              f'on {len(important_feats)} features of original {len(feats)}',
              'started at', time.ctime())
        train_x, train_y = train_df[important_feats], train_df['TARGET']
        # Work on a copy: the caller's dict (typically a shared global) must keep
        # its own n_estimators for subsequent CV runs.
        full_params = dict(params)
        full_params['n_estimators'] = int(iterations.max())
        clf_full = LGBMClassifier(importance_type = 'gain',
                                  random_state = RANDOM_STATE, **full_params)
        clf_full.fit(train_x, train_y,
                     verbose = verbose)

        fulltrained_model_description = f'{model_name}_{len(important_feats)}feats_{num_folds}CV_{roc_auc_valid*1e5:0>6.0f}'
        if save_test_prediction:
            test_full_prediction_file_name = f'{folder_to_save}{fulltrained_model_description}_test_fulltrained.csv'
            print(f'Saving fulltrained test predictions to {test_full_prediction_file_name}')
            df_prediction = pd.DataFrame({'SK_ID_CURR': submission_sample['SK_ID_CURR'].values,
                                          'TARGET': clf_full.predict_proba(test_df[important_feats])[:, 1]})
            df_prediction.to_csv(test_full_prediction_file_name, index = False)

            del df_prediction
            gc.collect()

    return df_feature_importance, roc_auc_valid

### First scores with parameters from Tilii kernel

In [78]:
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
# The dict is unpacked into LGBMClassifier(**params) inside cv_scores.
# n_estimators is only an upper bound there, because every fold is fitted with
# early stopping on the validation AUC. Trailing `#value` comments appear to be
# earlier settings kept for reference.
lgbm_params = {
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'nthread': 4,
            'n_estimators': 10000,#10000
            'learning_rate': .015,
            'num_leaves': 600,#30
            'colsample_bytree': 0.27,#0.05
            'max_depth': -1,
           # 'max_bin': 100,#100
            'bagging_fraction': 0.37,
            'bagging_freq': 30,
#             'reg_alpha': .041545473,
            'reg_lambda': 300,
            'min_gain_to_split': 0.5,
#             'min_child_weight': 39.3259775,
            'min_data_in_leaf': 500,
            'silent': -1,
            'verbose': -1
}

# lgbm_params = {
#             'objective': 'binary',
#             'boosting_type': 'gbdt',
#             'nthread': 4,
#             'n_estimators': 10000,
#             'learning_rate': .02,
#             'num_leaves': 30,
#             'colsample_bytree': .05,
#             'subsample': .8,
#             'max_depth': -1,
#             'max_bin': 100,
#             'bagging_fraction': 0.4,
#             'bagging_freq': 2,
#             'min_child_samples': 70,
# #             'reg_alpha': .041545473,
#             'reg_lambda': 100,
#             'min_gain_to_split': 0.5,
# #             'min_child_weight': 39.3259775,
#             'min_data_in_leaf': 500,
#             'silent': -1,
#             'verbose': -1
# }

# lgbm_params = {
#             'objective': 'binary',
#             'boosting_type': 'gbdt',
#             'nthread': 4,
#             'n_estimators': 10000,
#             'learning_rate': .02,
#             'num_leaves': 300,
#             'colsample_bytree': .05,
#             'subsample': .8,
#             'max_depth': 16,
#             'bagging_fraction': 0.8,
#             'bagging_freq': 5,
#             'min_child_samples': 70,
# #             'reg_alpha': .041545473,
#             'reg_lambda': 100,
#             'min_gain_to_split': 0.5,
# #             'min_child_weight': 39.3259775,
#             'silent': -1,
#             'verbose': -1
# }


In [79]:
%%time
# 7-fold, non-stratified CV of the 'leonid34' model on the wide table: log every
# 200 iterations, stop a fold after 500 rounds without validation-AUC improvement.
# With the defaults left in place, out-of-fold and test predictions are written to
# cv_scores' default output folder and no full-data model is trained.
# The recorded log shows single folds running for a long time (thousands of iterations).
feature_importance, scor = cv_scores(df_ext_gp_nept,
                                     7, lgbm_params, model_name = 'leonid34',
#                                      save_train_prediction = False,
#                                      train_full_model= True,
                                     verbose = 200, early_stopping=500,
                                     stratified = False, seed = RANDOM_STATE)

Starting LightGBM. Train shape: (307511, 2153), test shape: (48744, 2153)
Fold 0 started at Wed Aug 29 15:09:40 2018
Training until validation scores don't improve for 500 rounds.
[200]	0_train's auc: 0.785923	0_valid's auc: 0.775087
[400]	0_train's auc: 0.805615	0_valid's auc: 0.787434
[600]	0_train's auc: 0.819111	0_valid's auc: 0.793349
[800]	0_train's auc: 0.830184	0_valid's auc: 0.796799
[1000]	0_train's auc: 0.84007	0_valid's auc: 0.798936
[1200]	0_train's auc: 0.848961	0_valid's auc: 0.800451
[1400]	0_train's auc: 0.857091	0_valid's auc: 0.801932
[1600]	0_train's auc: 0.864995	0_valid's auc: 0.802989
[1800]	0_train's auc: 0.872144	0_valid's auc: 0.803723
[2000]	0_train's auc: 0.878469	0_valid's auc: 0.804301
[2200]	0_train's auc: 0.885054	0_valid's auc: 0.804718
[2400]	0_train's auc: 0.89103	0_valid's auc: 0.804871
[2600]	0_train's auc: 0.896249	0_valid's auc: 0.804911
[2800]	0_train's auc: 0.90183	0_valid's auc: 0.805516
[3000]	0_train's auc: 0.907028	0_valid's auc: 0.805436
[3

[2200]	5_train's auc: 0.885361	5_valid's auc: 0.796738
[2400]	5_train's auc: 0.891085	5_valid's auc: 0.797216
[2600]	5_train's auc: 0.896451	5_valid's auc: 0.797609
[2800]	5_train's auc: 0.901855	5_valid's auc: 0.797709
[3000]	5_train's auc: 0.907021	5_valid's auc: 0.797739
[3200]	5_train's auc: 0.911795	5_valid's auc: 0.7978
[3400]	5_train's auc: 0.916891	5_valid's auc: 0.797872
[3600]	5_train's auc: 0.921036	5_valid's auc: 0.797971
[3800]	5_train's auc: 0.925069	5_valid's auc: 0.797833
[4000]	5_train's auc: 0.92905	5_valid's auc: 0.797823
Early stopping, best iteration is:
[3614]	5_train's auc: 0.921396	5_valid's auc: 0.79798
Fold  5 train AUC: 0.921396, valid AUC: 0.797980; best iteration  3614;
Fold 6 started at Wed Aug 29 17:34:01 2018
Training until validation scores don't improve for 500 rounds.
[200]	6_train's auc: 0.786605	6_valid's auc: 0.772655
[400]	6_train's auc: 0.806644	6_valid's auc: 0.784181
[600]	6_train's auc: 0.819887	6_valid's auc: 0.789567
[800]	6_train's auc: 0.8

In [80]:
# Persist the features with positive mean importance, most important first;
# the feature names are written as the CSV index column.
feature_importance[feature_importance['mean']>0].sort_values(by='mean', ascending = False)\
    .to_csv(f'{file_path}leonid34_fi_sorted.csv')

In [81]:
# Per-fold importances (columns 0..6) and their mean, sorted by the mean.
feature_importance.sort_values(by='mean', ascending = False)

Unnamed: 0,0,1,2,3,4,5,6,mean
1455,743,819,696,774,700,690,586,715.428571
598,714,861,670,710,580,625,505,666.428571
PREV_HOUR_APPR_PROCESS_START_VAR,722,745,582,740,569,571,447,625.142857
external_sources_min,671,764,579,650,580,622,470,619.428571
app DAYS_EMPLOYED / DAYS_BIRTH,708,784,582,702,550,544,436,615.142857
INSTAL_AMT_PAYMENT_MIN,635,805,616,624,634,490,475,611.285714
612,617,728,627,646,565,606,473,608.857143
1407,633,720,626,652,566,537,512,606.571429
5,642,745,579,637,556,552,490,600.142857
PREV_prev missing_VAR,611,742,620,642,540,553,429,591.000000
